gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "wide-int.h"
65 #include "context.h"
66 #include "pass_manager.h"
67
68 static rtx legitimize_dllimport_symbol (rtx, bool);
69 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
70 static rtx legitimize_pe_coff_symbol (rtx, bool);
71
72 #ifndef CHECK_STACK_LIMIT
73 #define CHECK_STACK_LIMIT (-1)
74 #endif
75
76 /* Return index of given mode in mult and division cost tables. */
77 #define MODE_INDEX(mode) \
78 ((mode) == QImode ? 0 \
79 : (mode) == HImode ? 1 \
80 : (mode) == SImode ? 2 \
81 : (mode) == DImode ? 3 \
82 : 4)
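/* A rough usage sketch (the "mult" / "divide" member names here are
   illustrative placeholders, not necessarily the exact struct fields):

     cost->mult[MODE_INDEX (SImode)]      -- slot 2, the SImode entry
     cost->divide[MODE_INDEX (TImode)]    -- slot 4, the "other" entry

   i.e. QI/HI/SI/DImode select slots 0-3 of the per-mode multiply and
   divide cost arrays in the processor_costs tables below, and every
   other mode falls back to the trailing "other" slot.  */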
83
84 /* Processor costs (relative to an add) */
85 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
86 #define COSTS_N_BYTES(N) ((N) * 2)
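/* Worked example: with COSTS_N_INSNS (N) == (N) * 4 as assumed above, a
   2-byte add gets COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), so on the
   size scale a 2-byte instruction weighs the same as one generic
   instruction does on the speed scale; a 3-byte movzx, at
   COSTS_N_BYTES (3) == 6, weighs one and a half times as much.  */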
87
88 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
89
90 static stringop_algs ix86_size_memcpy[2] = {
91 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
92 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
93 static stringop_algs ix86_size_memset[2] = {
94 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
95 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
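/* How these stringop tables are read (a sketch; the authoritative layout is
   the stringop_algs definition in the target headers): element [0] of each
   pair is used for 32-bit code and element [1] for 64-bit code, the leading
   algorithm covers operations of unknown size, and each {max, alg, noalign}
   entry covers known sizes up to "max", with max == -1 as the catch-all
   final entry.  When tuning for size everything above simply becomes
   rep movsb / rep stosb.  */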
96
97 const
98 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
99 COSTS_N_BYTES (2), /* cost of an add instruction */
100 COSTS_N_BYTES (3), /* cost of a lea instruction */
101 COSTS_N_BYTES (2), /* variable shift costs */
102 COSTS_N_BYTES (3), /* constant shift costs */
103 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
104 COSTS_N_BYTES (3), /* HI */
105 COSTS_N_BYTES (3), /* SI */
106 COSTS_N_BYTES (3), /* DI */
107 COSTS_N_BYTES (5)}, /* other */
108 0, /* cost of multiply per each bit set */
109 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
110 COSTS_N_BYTES (3), /* HI */
111 COSTS_N_BYTES (3), /* SI */
112 COSTS_N_BYTES (3), /* DI */
113 COSTS_N_BYTES (5)}, /* other */
114 COSTS_N_BYTES (3), /* cost of movsx */
115 COSTS_N_BYTES (3), /* cost of movzx */
116 0, /* "large" insn */
117 2, /* MOVE_RATIO */
118 2, /* cost for loading QImode using movzbl */
119 {2, 2, 2}, /* cost of loading integer registers
120 in QImode, HImode and SImode.
121 Relative to reg-reg move (2). */
122 {2, 2, 2}, /* cost of storing integer registers */
123 2, /* cost of reg,reg fld/fst */
124 {2, 2, 2}, /* cost of loading fp registers
125 in SFmode, DFmode and XFmode */
126 {2, 2, 2}, /* cost of storing fp registers
127 in SFmode, DFmode and XFmode */
128 3, /* cost of moving MMX register */
129 {3, 3}, /* cost of loading MMX registers
130 in SImode and DImode */
131 {3, 3}, /* cost of storing MMX registers
132 in SImode and DImode */
133 3, /* cost of moving SSE register */
134 {3, 3, 3}, /* cost of loading SSE registers
135 in SImode, DImode and TImode */
136 {3, 3, 3}, /* cost of storing SSE registers
137 in SImode, DImode and TImode */
138 3, /* MMX or SSE register to integer */
139 0, /* size of l1 cache */
140 0, /* size of l2 cache */
141 0, /* size of prefetch block */
142 0, /* number of parallel prefetches */
143 2, /* Branch cost */
144 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
145 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
146 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
147 COSTS_N_BYTES (2), /* cost of FABS instruction. */
148 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
149 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
150 ix86_size_memcpy,
151 ix86_size_memset,
152 1, /* scalar_stmt_cost. */
153 1, /* scalar load_cost. */
154 1, /* scalar_store_cost. */
155 1, /* vec_stmt_cost. */
156 1, /* vec_to_scalar_cost. */
157 1, /* scalar_to_vec_cost. */
158 1, /* vec_align_load_cost. */
159 1, /* vec_unalign_load_cost. */
160 1, /* vec_store_cost. */
161 1, /* cond_taken_branch_cost. */
162 1, /* cond_not_taken_branch_cost. */
163 };
164
165 /* Processor costs (relative to an add) */
166 static stringop_algs i386_memcpy[2] = {
167 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
168 DUMMY_STRINGOP_ALGS};
169 static stringop_algs i386_memset[2] = {
170 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
171 DUMMY_STRINGOP_ALGS};
172
173 static const
174 struct processor_costs i386_cost = { /* 386 specific costs */
175 COSTS_N_INSNS (1), /* cost of an add instruction */
176 COSTS_N_INSNS (1), /* cost of a lea instruction */
177 COSTS_N_INSNS (3), /* variable shift costs */
178 COSTS_N_INSNS (2), /* constant shift costs */
179 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
180 COSTS_N_INSNS (6), /* HI */
181 COSTS_N_INSNS (6), /* SI */
182 COSTS_N_INSNS (6), /* DI */
183 COSTS_N_INSNS (6)}, /* other */
184 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
185 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
186 COSTS_N_INSNS (23), /* HI */
187 COSTS_N_INSNS (23), /* SI */
188 COSTS_N_INSNS (23), /* DI */
189 COSTS_N_INSNS (23)}, /* other */
190 COSTS_N_INSNS (3), /* cost of movsx */
191 COSTS_N_INSNS (2), /* cost of movzx */
192 15, /* "large" insn */
193 3, /* MOVE_RATIO */
194 4, /* cost for loading QImode using movzbl */
195 {2, 4, 2}, /* cost of loading integer registers
196 in QImode, HImode and SImode.
197 Relative to reg-reg move (2). */
198 {2, 4, 2}, /* cost of storing integer registers */
199 2, /* cost of reg,reg fld/fst */
200 {8, 8, 8}, /* cost of loading fp registers
201 in SFmode, DFmode and XFmode */
202 {8, 8, 8}, /* cost of storing fp registers
203 in SFmode, DFmode and XFmode */
204 2, /* cost of moving MMX register */
205 {4, 8}, /* cost of loading MMX registers
206 in SImode and DImode */
207 {4, 8}, /* cost of storing MMX registers
208 in SImode and DImode */
209 2, /* cost of moving SSE register */
210 {4, 8, 16}, /* cost of loading SSE registers
211 in SImode, DImode and TImode */
212 {4, 8, 16}, /* cost of storing SSE registers
213 in SImode, DImode and TImode */
214 3, /* MMX or SSE register to integer */
215 0, /* size of l1 cache */
216 0, /* size of l2 cache */
217 0, /* size of prefetch block */
218 0, /* number of parallel prefetches */
219 1, /* Branch cost */
220 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
221 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
222 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
223 COSTS_N_INSNS (22), /* cost of FABS instruction. */
224 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
225 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
226 i386_memcpy,
227 i386_memset,
228 1, /* scalar_stmt_cost. */
229 1, /* scalar load_cost. */
230 1, /* scalar_store_cost. */
231 1, /* vec_stmt_cost. */
232 1, /* vec_to_scalar_cost. */
233 1, /* scalar_to_vec_cost. */
234 1, /* vec_align_load_cost. */
235 2, /* vec_unalign_load_cost. */
236 1, /* vec_store_cost. */
237 3, /* cond_taken_branch_cost. */
238 1, /* cond_not_taken_branch_cost. */
239 };
240
241 static stringop_algs i486_memcpy[2] = {
242 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
243 DUMMY_STRINGOP_ALGS};
244 static stringop_algs i486_memset[2] = {
245 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
246 DUMMY_STRINGOP_ALGS};
247
248 static const
249 struct processor_costs i486_cost = { /* 486 specific costs */
250 COSTS_N_INSNS (1), /* cost of an add instruction */
251 COSTS_N_INSNS (1), /* cost of a lea instruction */
252 COSTS_N_INSNS (3), /* variable shift costs */
253 COSTS_N_INSNS (2), /* constant shift costs */
254 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
255 COSTS_N_INSNS (12), /* HI */
256 COSTS_N_INSNS (12), /* SI */
257 COSTS_N_INSNS (12), /* DI */
258 COSTS_N_INSNS (12)}, /* other */
259 1, /* cost of multiply per each bit set */
260 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
261 COSTS_N_INSNS (40), /* HI */
262 COSTS_N_INSNS (40), /* SI */
263 COSTS_N_INSNS (40), /* DI */
264 COSTS_N_INSNS (40)}, /* other */
265 COSTS_N_INSNS (3), /* cost of movsx */
266 COSTS_N_INSNS (2), /* cost of movzx */
267 15, /* "large" insn */
268 3, /* MOVE_RATIO */
269 4, /* cost for loading QImode using movzbl */
270 {2, 4, 2}, /* cost of loading integer registers
271 in QImode, HImode and SImode.
272 Relative to reg-reg move (2). */
273 {2, 4, 2}, /* cost of storing integer registers */
274 2, /* cost of reg,reg fld/fst */
275 {8, 8, 8}, /* cost of loading fp registers
276 in SFmode, DFmode and XFmode */
277 {8, 8, 8}, /* cost of storing fp registers
278 in SFmode, DFmode and XFmode */
279 2, /* cost of moving MMX register */
280 {4, 8}, /* cost of loading MMX registers
281 in SImode and DImode */
282 {4, 8}, /* cost of storing MMX registers
283 in SImode and DImode */
284 2, /* cost of moving SSE register */
285 {4, 8, 16}, /* cost of loading SSE registers
286 in SImode, DImode and TImode */
287 {4, 8, 16}, /* cost of storing SSE registers
288 in SImode, DImode and TImode */
289 3, /* MMX or SSE register to integer */
290 4, /* size of l1 cache. 486 has 8kB cache
291 shared for code and data, so 4kB is
292 not really precise. */
293 4, /* size of l2 cache */
294 0, /* size of prefetch block */
295 0, /* number of parallel prefetches */
296 1, /* Branch cost */
297 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
298 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
299 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
300 COSTS_N_INSNS (3), /* cost of FABS instruction. */
301 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
302 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
303 i486_memcpy,
304 i486_memset,
305 1, /* scalar_stmt_cost. */
306 1, /* scalar load_cost. */
307 1, /* scalar_store_cost. */
308 1, /* vec_stmt_cost. */
309 1, /* vec_to_scalar_cost. */
310 1, /* scalar_to_vec_cost. */
311 1, /* vec_align_load_cost. */
312 2, /* vec_unalign_load_cost. */
313 1, /* vec_store_cost. */
314 3, /* cond_taken_branch_cost. */
315 1, /* cond_not_taken_branch_cost. */
316 };
317
318 static stringop_algs pentium_memcpy[2] = {
319 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
320 DUMMY_STRINGOP_ALGS};
321 static stringop_algs pentium_memset[2] = {
322 {libcall, {{-1, rep_prefix_4_byte, false}}},
323 DUMMY_STRINGOP_ALGS};
324
325 static const
326 struct processor_costs pentium_cost = {
327 COSTS_N_INSNS (1), /* cost of an add instruction */
328 COSTS_N_INSNS (1), /* cost of a lea instruction */
329 COSTS_N_INSNS (4), /* variable shift costs */
330 COSTS_N_INSNS (1), /* constant shift costs */
331 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
332 COSTS_N_INSNS (11), /* HI */
333 COSTS_N_INSNS (11), /* SI */
334 COSTS_N_INSNS (11), /* DI */
335 COSTS_N_INSNS (11)}, /* other */
336 0, /* cost of multiply per each bit set */
337 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
338 COSTS_N_INSNS (25), /* HI */
339 COSTS_N_INSNS (25), /* SI */
340 COSTS_N_INSNS (25), /* DI */
341 COSTS_N_INSNS (25)}, /* other */
342 COSTS_N_INSNS (3), /* cost of movsx */
343 COSTS_N_INSNS (2), /* cost of movzx */
344 8, /* "large" insn */
345 6, /* MOVE_RATIO */
346 6, /* cost for loading QImode using movzbl */
347 {2, 4, 2}, /* cost of loading integer registers
348 in QImode, HImode and SImode.
349 Relative to reg-reg move (2). */
350 {2, 4, 2}, /* cost of storing integer registers */
351 2, /* cost of reg,reg fld/fst */
352 {2, 2, 6}, /* cost of loading fp registers
353 in SFmode, DFmode and XFmode */
354 {4, 4, 6}, /* cost of storing fp registers
355 in SFmode, DFmode and XFmode */
356 8, /* cost of moving MMX register */
357 {8, 8}, /* cost of loading MMX registers
358 in SImode and DImode */
359 {8, 8}, /* cost of storing MMX registers
360 in SImode and DImode */
361 2, /* cost of moving SSE register */
362 {4, 8, 16}, /* cost of loading SSE registers
363 in SImode, DImode and TImode */
364 {4, 8, 16}, /* cost of storing SSE registers
365 in SImode, DImode and TImode */
366 3, /* MMX or SSE register to integer */
367 8, /* size of l1 cache. */
368 8, /* size of l2 cache */
369 0, /* size of prefetch block */
370 0, /* number of parallel prefetches */
371 2, /* Branch cost */
372 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
373 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
374 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
375 COSTS_N_INSNS (1), /* cost of FABS instruction. */
376 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
377 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
378 pentium_memcpy,
379 pentium_memset,
380 1, /* scalar_stmt_cost. */
381 1, /* scalar load_cost. */
382 1, /* scalar_store_cost. */
383 1, /* vec_stmt_cost. */
384 1, /* vec_to_scalar_cost. */
385 1, /* scalar_to_vec_cost. */
386 1, /* vec_align_load_cost. */
387 2, /* vec_unalign_load_cost. */
388 1, /* vec_store_cost. */
389 3, /* cond_taken_branch_cost. */
390 1, /* cond_not_taken_branch_cost. */
391 };
392
393 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
394    (we ensure the alignment).  For small blocks an inline loop is still a
395    noticeable win; for bigger blocks either rep movsl or rep movsb is the
396    way to go.  Rep movsb apparently has a more expensive startup time in the
397    CPU, but after 4K the difference is down in the noise. */
398 static stringop_algs pentiumpro_memcpy[2] = {
399 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
400 {8192, rep_prefix_4_byte, false},
401 {-1, rep_prefix_1_byte, false}}},
402 DUMMY_STRINGOP_ALGS};
403 static stringop_algs pentiumpro_memset[2] = {
404 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
405 {8192, rep_prefix_4_byte, false},
406 {-1, libcall, false}}},
407 DUMMY_STRINGOP_ALGS};
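/* Reading the 32-bit pentiumpro_memcpy entry above (a sketch): a known
   100-byte copy falls under {128, loop} and is emitted as an inline loop,
   a 512-byte copy under {1024, unrolled_loop}, a 4 kB copy under
   {8192, rep_prefix_4_byte} (rep movsl), anything larger hits the
   {-1, rep_prefix_1_byte} entry (rep movsb), and a copy of unknown size
   uses the leading rep_prefix_4_byte algorithm; the 64-bit slot is the
   dummy libcall entry.  */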
408 static const
409 struct processor_costs pentiumpro_cost = {
410 COSTS_N_INSNS (1), /* cost of an add instruction */
411 COSTS_N_INSNS (1), /* cost of a lea instruction */
412 COSTS_N_INSNS (1), /* variable shift costs */
413 COSTS_N_INSNS (1), /* constant shift costs */
414 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
415 COSTS_N_INSNS (4), /* HI */
416 COSTS_N_INSNS (4), /* SI */
417 COSTS_N_INSNS (4), /* DI */
418 COSTS_N_INSNS (4)}, /* other */
419 0, /* cost of multiply per each bit set */
420 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
421 COSTS_N_INSNS (17), /* HI */
422 COSTS_N_INSNS (17), /* SI */
423 COSTS_N_INSNS (17), /* DI */
424 COSTS_N_INSNS (17)}, /* other */
425 COSTS_N_INSNS (1), /* cost of movsx */
426 COSTS_N_INSNS (1), /* cost of movzx */
427 8, /* "large" insn */
428 6, /* MOVE_RATIO */
429 2, /* cost for loading QImode using movzbl */
430 {4, 4, 4}, /* cost of loading integer registers
431 in QImode, HImode and SImode.
432 Relative to reg-reg move (2). */
433 {2, 2, 2}, /* cost of storing integer registers */
434 2, /* cost of reg,reg fld/fst */
435 {2, 2, 6}, /* cost of loading fp registers
436 in SFmode, DFmode and XFmode */
437 {4, 4, 6}, /* cost of storing fp registers
438 in SFmode, DFmode and XFmode */
439 2, /* cost of moving MMX register */
440 {2, 2}, /* cost of loading MMX registers
441 in SImode and DImode */
442 {2, 2}, /* cost of storing MMX registers
443 in SImode and DImode */
444 2, /* cost of moving SSE register */
445 {2, 2, 8}, /* cost of loading SSE registers
446 in SImode, DImode and TImode */
447 {2, 2, 8}, /* cost of storing SSE registers
448 in SImode, DImode and TImode */
449 3, /* MMX or SSE register to integer */
450 8, /* size of l1 cache. */
451 256, /* size of l2 cache */
452 32, /* size of prefetch block */
453 6, /* number of parallel prefetches */
454 2, /* Branch cost */
455 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
456 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
457 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
458 COSTS_N_INSNS (2), /* cost of FABS instruction. */
459 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
460 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
461 pentiumpro_memcpy,
462 pentiumpro_memset,
463 1, /* scalar_stmt_cost. */
464 1, /* scalar load_cost. */
465 1, /* scalar_store_cost. */
466 1, /* vec_stmt_cost. */
467 1, /* vec_to_scalar_cost. */
468 1, /* scalar_to_vec_cost. */
469 1, /* vec_align_load_cost. */
470 2, /* vec_unalign_load_cost. */
471 1, /* vec_store_cost. */
472 3, /* cond_taken_branch_cost. */
473 1, /* cond_not_taken_branch_cost. */
474 };
475
476 static stringop_algs geode_memcpy[2] = {
477 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
478 DUMMY_STRINGOP_ALGS};
479 static stringop_algs geode_memset[2] = {
480 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
481 DUMMY_STRINGOP_ALGS};
482 static const
483 struct processor_costs geode_cost = {
484 COSTS_N_INSNS (1), /* cost of an add instruction */
485 COSTS_N_INSNS (1), /* cost of a lea instruction */
486 COSTS_N_INSNS (2), /* variable shift costs */
487 COSTS_N_INSNS (1), /* constant shift costs */
488 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
489 COSTS_N_INSNS (4), /* HI */
490 COSTS_N_INSNS (7), /* SI */
491 COSTS_N_INSNS (7), /* DI */
492 COSTS_N_INSNS (7)}, /* other */
493 0, /* cost of multiply per each bit set */
494 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
495 COSTS_N_INSNS (23), /* HI */
496 COSTS_N_INSNS (39), /* SI */
497 COSTS_N_INSNS (39), /* DI */
498 COSTS_N_INSNS (39)}, /* other */
499 COSTS_N_INSNS (1), /* cost of movsx */
500 COSTS_N_INSNS (1), /* cost of movzx */
501 8, /* "large" insn */
502 4, /* MOVE_RATIO */
503 1, /* cost for loading QImode using movzbl */
504 {1, 1, 1}, /* cost of loading integer registers
505 in QImode, HImode and SImode.
506 Relative to reg-reg move (2). */
507 {1, 1, 1}, /* cost of storing integer registers */
508 1, /* cost of reg,reg fld/fst */
509 {1, 1, 1}, /* cost of loading fp registers
510 in SFmode, DFmode and XFmode */
511 {4, 6, 6}, /* cost of storing fp registers
512 in SFmode, DFmode and XFmode */
513
514 1, /* cost of moving MMX register */
515 {1, 1}, /* cost of loading MMX registers
516 in SImode and DImode */
517 {1, 1}, /* cost of storing MMX registers
518 in SImode and DImode */
519 1, /* cost of moving SSE register */
520 {1, 1, 1}, /* cost of loading SSE registers
521 in SImode, DImode and TImode */
522 {1, 1, 1}, /* cost of storing SSE registers
523 in SImode, DImode and TImode */
524 1, /* MMX or SSE register to integer */
525 64, /* size of l1 cache. */
526 128, /* size of l2 cache. */
527 32, /* size of prefetch block */
528 1, /* number of parallel prefetches */
529 1, /* Branch cost */
530 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
531 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
532 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
533 COSTS_N_INSNS (1), /* cost of FABS instruction. */
534 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
535 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
536 geode_memcpy,
537 geode_memset,
538 1, /* scalar_stmt_cost. */
539 1, /* scalar load_cost. */
540 1, /* scalar_store_cost. */
541 1, /* vec_stmt_cost. */
542 1, /* vec_to_scalar_cost. */
543 1, /* scalar_to_vec_cost. */
544 1, /* vec_align_load_cost. */
545 2, /* vec_unalign_load_cost. */
546 1, /* vec_store_cost. */
547 3, /* cond_taken_branch_cost. */
548 1, /* cond_not_taken_branch_cost. */
549 };
550
551 static stringop_algs k6_memcpy[2] = {
552 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
553 DUMMY_STRINGOP_ALGS};
554 static stringop_algs k6_memset[2] = {
555 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
556 DUMMY_STRINGOP_ALGS};
557 static const
558 struct processor_costs k6_cost = {
559 COSTS_N_INSNS (1), /* cost of an add instruction */
560 COSTS_N_INSNS (2), /* cost of a lea instruction */
561 COSTS_N_INSNS (1), /* variable shift costs */
562 COSTS_N_INSNS (1), /* constant shift costs */
563 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
564 COSTS_N_INSNS (3), /* HI */
565 COSTS_N_INSNS (3), /* SI */
566 COSTS_N_INSNS (3), /* DI */
567 COSTS_N_INSNS (3)}, /* other */
568 0, /* cost of multiply per each bit set */
569 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
570 COSTS_N_INSNS (18), /* HI */
571 COSTS_N_INSNS (18), /* SI */
572 COSTS_N_INSNS (18), /* DI */
573 COSTS_N_INSNS (18)}, /* other */
574 COSTS_N_INSNS (2), /* cost of movsx */
575 COSTS_N_INSNS (2), /* cost of movzx */
576 8, /* "large" insn */
577 4, /* MOVE_RATIO */
578 3, /* cost for loading QImode using movzbl */
579 {4, 5, 4}, /* cost of loading integer registers
580 in QImode, HImode and SImode.
581 Relative to reg-reg move (2). */
582 {2, 3, 2}, /* cost of storing integer registers */
583 4, /* cost of reg,reg fld/fst */
584 {6, 6, 6}, /* cost of loading fp registers
585 in SFmode, DFmode and XFmode */
586 {4, 4, 4}, /* cost of storing fp registers
587 in SFmode, DFmode and XFmode */
588 2, /* cost of moving MMX register */
589 {2, 2}, /* cost of loading MMX registers
590 in SImode and DImode */
591 {2, 2}, /* cost of storing MMX registers
592 in SImode and DImode */
593 2, /* cost of moving SSE register */
594 {2, 2, 8}, /* cost of loading SSE registers
595 in SImode, DImode and TImode */
596 {2, 2, 8}, /* cost of storing SSE registers
597 in SImode, DImode and TImode */
598 6, /* MMX or SSE register to integer */
599 32, /* size of l1 cache. */
600 32, /* size of l2 cache. Some models
601 have integrated l2 cache, but
602 optimizing for k6 is not important
603 enough to worry about that. */
604 32, /* size of prefetch block */
605 1, /* number of parallel prefetches */
606 1, /* Branch cost */
607 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
608 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
609 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
610 COSTS_N_INSNS (2), /* cost of FABS instruction. */
611 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
612 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
613 k6_memcpy,
614 k6_memset,
615 1, /* scalar_stmt_cost. */
616 1, /* scalar load_cost. */
617 1, /* scalar_store_cost. */
618 1, /* vec_stmt_cost. */
619 1, /* vec_to_scalar_cost. */
620 1, /* scalar_to_vec_cost. */
621 1, /* vec_align_load_cost. */
622 2, /* vec_unalign_load_cost. */
623 1, /* vec_store_cost. */
624 3, /* cond_taken_branch_cost. */
625 1, /* cond_not_taken_branch_cost. */
626 };
627
628 /* For some reason, Athlon deals better with the REP prefix (relative to
629    loops) than K8 does.  Alignment becomes important after 8 bytes for
630    memcpy and 128 bytes for memset. */
631 static stringop_algs athlon_memcpy[2] = {
632 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
633 DUMMY_STRINGOP_ALGS};
634 static stringop_algs athlon_memset[2] = {
635 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
636 DUMMY_STRINGOP_ALGS};
637 static const
638 struct processor_costs athlon_cost = {
639 COSTS_N_INSNS (1), /* cost of an add instruction */
640 COSTS_N_INSNS (2), /* cost of a lea instruction */
641 COSTS_N_INSNS (1), /* variable shift costs */
642 COSTS_N_INSNS (1), /* constant shift costs */
643 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
644 COSTS_N_INSNS (5), /* HI */
645 COSTS_N_INSNS (5), /* SI */
646 COSTS_N_INSNS (5), /* DI */
647 COSTS_N_INSNS (5)}, /* other */
648 0, /* cost of multiply per each bit set */
649 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
650 COSTS_N_INSNS (26), /* HI */
651 COSTS_N_INSNS (42), /* SI */
652 COSTS_N_INSNS (74), /* DI */
653 COSTS_N_INSNS (74)}, /* other */
654 COSTS_N_INSNS (1), /* cost of movsx */
655 COSTS_N_INSNS (1), /* cost of movzx */
656 8, /* "large" insn */
657 9, /* MOVE_RATIO */
658 4, /* cost for loading QImode using movzbl */
659 {3, 4, 3}, /* cost of loading integer registers
660 in QImode, HImode and SImode.
661 Relative to reg-reg move (2). */
662 {3, 4, 3}, /* cost of storing integer registers */
663 4, /* cost of reg,reg fld/fst */
664 {4, 4, 12}, /* cost of loading fp registers
665 in SFmode, DFmode and XFmode */
666 {6, 6, 8}, /* cost of storing fp registers
667 in SFmode, DFmode and XFmode */
668 2, /* cost of moving MMX register */
669 {4, 4}, /* cost of loading MMX registers
670 in SImode and DImode */
671 {4, 4}, /* cost of storing MMX registers
672 in SImode and DImode */
673 2, /* cost of moving SSE register */
674 {4, 4, 6}, /* cost of loading SSE registers
675 in SImode, DImode and TImode */
676 {4, 4, 5}, /* cost of storing SSE registers
677 in SImode, DImode and TImode */
678 5, /* MMX or SSE register to integer */
679 64, /* size of l1 cache. */
680 256, /* size of l2 cache. */
681 64, /* size of prefetch block */
682 6, /* number of parallel prefetches */
683 5, /* Branch cost */
684 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
685 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
686 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
687 COSTS_N_INSNS (2), /* cost of FABS instruction. */
688 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
689 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
690 athlon_memcpy,
691 athlon_memset,
692 1, /* scalar_stmt_cost. */
693 1, /* scalar load_cost. */
694 1, /* scalar_store_cost. */
695 1, /* vec_stmt_cost. */
696 1, /* vec_to_scalar_cost. */
697 1, /* scalar_to_vec_cost. */
698 1, /* vec_align_load_cost. */
699 2, /* vec_unalign_load_cost. */
700 1, /* vec_store_cost. */
701 3, /* cond_taken_branch_cost. */
702 1, /* cond_not_taken_branch_cost. */
703 };
704
705 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
706    small blocks it is better to use a loop.  For large blocks, a libcall can
707    do non-temporal accesses and beat inline code considerably. */
708 static stringop_algs k8_memcpy[2] = {
709 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
710 {-1, rep_prefix_4_byte, false}}},
711 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
712 {-1, libcall, false}}}};
713 static stringop_algs k8_memset[2] = {
714 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
715 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
716 {libcall, {{48, unrolled_loop, false},
717 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
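/* For 64-bit code the second k8_memcpy initializer applies (a sketch):
   known copies up to 16 bytes use an inline loop, sizes up to 8 kB use
   rep movsq (rep_prefix_8_byte), and larger or unknown-size copies go
   through the memcpy libcall, where the library's non-temporal stores
   can beat the inline expansion.  */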
718 static const
719 struct processor_costs k8_cost = {
720 COSTS_N_INSNS (1), /* cost of an add instruction */
721 COSTS_N_INSNS (2), /* cost of a lea instruction */
722 COSTS_N_INSNS (1), /* variable shift costs */
723 COSTS_N_INSNS (1), /* constant shift costs */
724 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
725 COSTS_N_INSNS (4), /* HI */
726 COSTS_N_INSNS (3), /* SI */
727 COSTS_N_INSNS (4), /* DI */
728 COSTS_N_INSNS (5)}, /* other */
729 0, /* cost of multiply per each bit set */
730 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
731 COSTS_N_INSNS (26), /* HI */
732 COSTS_N_INSNS (42), /* SI */
733 COSTS_N_INSNS (74), /* DI */
734 COSTS_N_INSNS (74)}, /* other */
735 COSTS_N_INSNS (1), /* cost of movsx */
736 COSTS_N_INSNS (1), /* cost of movzx */
737 8, /* "large" insn */
738 9, /* MOVE_RATIO */
739 4, /* cost for loading QImode using movzbl */
740 {3, 4, 3}, /* cost of loading integer registers
741 in QImode, HImode and SImode.
742 Relative to reg-reg move (2). */
743 {3, 4, 3}, /* cost of storing integer registers */
744 4, /* cost of reg,reg fld/fst */
745 {4, 4, 12}, /* cost of loading fp registers
746 in SFmode, DFmode and XFmode */
747 {6, 6, 8}, /* cost of storing fp registers
748 in SFmode, DFmode and XFmode */
749 2, /* cost of moving MMX register */
750 {3, 3}, /* cost of loading MMX registers
751 in SImode and DImode */
752 {4, 4}, /* cost of storing MMX registers
753 in SImode and DImode */
754 2, /* cost of moving SSE register */
755 {4, 3, 6}, /* cost of loading SSE registers
756 in SImode, DImode and TImode */
757 {4, 4, 5}, /* cost of storing SSE registers
758 in SImode, DImode and TImode */
759 5, /* MMX or SSE register to integer */
760 64, /* size of l1 cache. */
761 512, /* size of l2 cache. */
762 64, /* size of prefetch block */
763   /* New AMD processors never drop prefetches; if they cannot be performed
764      immediately, they are queued.  We set the number of simultaneous
765      prefetches to a large constant to reflect this (it is probably not a
766      good idea to leave the number of prefetches completely unlimited, as
767      their execution also takes some time). */
768 100, /* number of parallel prefetches */
769 3, /* Branch cost */
770 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
771 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
772 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
773 COSTS_N_INSNS (2), /* cost of FABS instruction. */
774 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
775 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
776
777 k8_memcpy,
778 k8_memset,
779 4, /* scalar_stmt_cost. */
780 2, /* scalar load_cost. */
781 2, /* scalar_store_cost. */
782 5, /* vec_stmt_cost. */
783 0, /* vec_to_scalar_cost. */
784 2, /* scalar_to_vec_cost. */
785 2, /* vec_align_load_cost. */
786 3, /* vec_unalign_load_cost. */
787 3, /* vec_store_cost. */
788 3, /* cond_taken_branch_cost. */
789 2, /* cond_not_taken_branch_cost. */
790 };
791
792 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
793    very small blocks it is better to use a loop.  For large blocks, a libcall
794    can do non-temporal accesses and beat inline code considerably. */
795 static stringop_algs amdfam10_memcpy[2] = {
796 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
797 {-1, rep_prefix_4_byte, false}}},
798 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
799 {-1, libcall, false}}}};
800 static stringop_algs amdfam10_memset[2] = {
801 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
802 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
803 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
804 {-1, libcall, false}}}};
805 struct processor_costs amdfam10_cost = {
806 COSTS_N_INSNS (1), /* cost of an add instruction */
807 COSTS_N_INSNS (2), /* cost of a lea instruction */
808 COSTS_N_INSNS (1), /* variable shift costs */
809 COSTS_N_INSNS (1), /* constant shift costs */
810 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
811 COSTS_N_INSNS (4), /* HI */
812 COSTS_N_INSNS (3), /* SI */
813 COSTS_N_INSNS (4), /* DI */
814 COSTS_N_INSNS (5)}, /* other */
815 0, /* cost of multiply per each bit set */
816 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
817 COSTS_N_INSNS (35), /* HI */
818 COSTS_N_INSNS (51), /* SI */
819 COSTS_N_INSNS (83), /* DI */
820 COSTS_N_INSNS (83)}, /* other */
821 COSTS_N_INSNS (1), /* cost of movsx */
822 COSTS_N_INSNS (1), /* cost of movzx */
823 8, /* "large" insn */
824 9, /* MOVE_RATIO */
825 4, /* cost for loading QImode using movzbl */
826 {3, 4, 3}, /* cost of loading integer registers
827 in QImode, HImode and SImode.
828 Relative to reg-reg move (2). */
829 {3, 4, 3}, /* cost of storing integer registers */
830 4, /* cost of reg,reg fld/fst */
831 {4, 4, 12}, /* cost of loading fp registers
832 in SFmode, DFmode and XFmode */
833 {6, 6, 8}, /* cost of storing fp registers
834 in SFmode, DFmode and XFmode */
835 2, /* cost of moving MMX register */
836 {3, 3}, /* cost of loading MMX registers
837 in SImode and DImode */
838 {4, 4}, /* cost of storing MMX registers
839 in SImode and DImode */
840 2, /* cost of moving SSE register */
841 {4, 4, 3}, /* cost of loading SSE registers
842 in SImode, DImode and TImode */
843 {4, 4, 5}, /* cost of storing SSE registers
844 in SImode, DImode and TImode */
845 3, /* MMX or SSE register to integer */
846 /* On K8:
847 MOVD reg64, xmmreg Double FSTORE 4
848 MOVD reg32, xmmreg Double FSTORE 4
849 On AMDFAM10:
850 MOVD reg64, xmmreg Double FADD 3
851 1/1 1/1
852 MOVD reg32, xmmreg Double FADD 3
853 1/1 1/1 */
854 64, /* size of l1 cache. */
855 512, /* size of l2 cache. */
856 64, /* size of prefetch block */
857   /* New AMD processors never drop prefetches; if they cannot be performed
858      immediately, they are queued.  We set the number of simultaneous
859      prefetches to a large constant to reflect this (it is probably not a
860      good idea to leave the number of prefetches completely unlimited, as
861      their execution also takes some time). */
862 100, /* number of parallel prefetches */
863 2, /* Branch cost */
864 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
865 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
866 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
867 COSTS_N_INSNS (2), /* cost of FABS instruction. */
868 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
869 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
870
871 amdfam10_memcpy,
872 amdfam10_memset,
873 4, /* scalar_stmt_cost. */
874 2, /* scalar load_cost. */
875 2, /* scalar_store_cost. */
876 6, /* vec_stmt_cost. */
877 0, /* vec_to_scalar_cost. */
878 2, /* scalar_to_vec_cost. */
879 2, /* vec_align_load_cost. */
880 2, /* vec_unalign_load_cost. */
881 2, /* vec_store_cost. */
882 2, /* cond_taken_branch_cost. */
883 1, /* cond_not_taken_branch_cost. */
884 };
885
886 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
887    very small blocks it is better to use a loop.  For large blocks, a libcall
888    can do non-temporal accesses and beat inline code considerably. */
889 static stringop_algs bdver1_memcpy[2] = {
890 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
891 {-1, rep_prefix_4_byte, false}}},
892 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
893 {-1, libcall, false}}}};
894 static stringop_algs bdver1_memset[2] = {
895 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
896 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
897 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
898 {-1, libcall, false}}}};
899
900 const struct processor_costs bdver1_cost = {
901 COSTS_N_INSNS (1), /* cost of an add instruction */
902 COSTS_N_INSNS (1), /* cost of a lea instruction */
903 COSTS_N_INSNS (1), /* variable shift costs */
904 COSTS_N_INSNS (1), /* constant shift costs */
905 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
906 COSTS_N_INSNS (4), /* HI */
907 COSTS_N_INSNS (4), /* SI */
908 COSTS_N_INSNS (6), /* DI */
909 COSTS_N_INSNS (6)}, /* other */
910 0, /* cost of multiply per each bit set */
911 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
912 COSTS_N_INSNS (35), /* HI */
913 COSTS_N_INSNS (51), /* SI */
914 COSTS_N_INSNS (83), /* DI */
915 COSTS_N_INSNS (83)}, /* other */
916 COSTS_N_INSNS (1), /* cost of movsx */
917 COSTS_N_INSNS (1), /* cost of movzx */
918 8, /* "large" insn */
919 9, /* MOVE_RATIO */
920 4, /* cost for loading QImode using movzbl */
921 {5, 5, 4}, /* cost of loading integer registers
922 in QImode, HImode and SImode.
923 Relative to reg-reg move (2). */
924 {4, 4, 4}, /* cost of storing integer registers */
925 2, /* cost of reg,reg fld/fst */
926 {5, 5, 12}, /* cost of loading fp registers
927 in SFmode, DFmode and XFmode */
928 {4, 4, 8}, /* cost of storing fp registers
929 in SFmode, DFmode and XFmode */
930 2, /* cost of moving MMX register */
931 {4, 4}, /* cost of loading MMX registers
932 in SImode and DImode */
933 {4, 4}, /* cost of storing MMX registers
934 in SImode and DImode */
935 2, /* cost of moving SSE register */
936 {4, 4, 4}, /* cost of loading SSE registers
937 in SImode, DImode and TImode */
938 {4, 4, 4}, /* cost of storing SSE registers
939 in SImode, DImode and TImode */
940 2, /* MMX or SSE register to integer */
941 /* On K8:
942 MOVD reg64, xmmreg Double FSTORE 4
943 MOVD reg32, xmmreg Double FSTORE 4
944 On AMDFAM10:
945 MOVD reg64, xmmreg Double FADD 3
946 1/1 1/1
947 MOVD reg32, xmmreg Double FADD 3
948 1/1 1/1 */
949 16, /* size of l1 cache. */
950 2048, /* size of l2 cache. */
951 64, /* size of prefetch block */
952   /* New AMD processors never drop prefetches; if they cannot be performed
953      immediately, they are queued.  We set the number of simultaneous
954      prefetches to a large constant to reflect this (it is probably not a
955      good idea to leave the number of prefetches completely unlimited, as
956      their execution also takes some time). */
957 100, /* number of parallel prefetches */
958 2, /* Branch cost */
959 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
960 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
961 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
962 COSTS_N_INSNS (2), /* cost of FABS instruction. */
963 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
964 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
965
966 bdver1_memcpy,
967 bdver1_memset,
968 6, /* scalar_stmt_cost. */
969 4, /* scalar load_cost. */
970 4, /* scalar_store_cost. */
971 6, /* vec_stmt_cost. */
972 0, /* vec_to_scalar_cost. */
973 2, /* scalar_to_vec_cost. */
974 4, /* vec_align_load_cost. */
975 4, /* vec_unalign_load_cost. */
976 4, /* vec_store_cost. */
977 2, /* cond_taken_branch_cost. */
978 1, /* cond_not_taken_branch_cost. */
979 };
980
981 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
982    very small blocks it is better to use a loop.  For large blocks, a libcall
983    can do non-temporal accesses and beat inline code considerably. */
984
985 static stringop_algs bdver2_memcpy[2] = {
986 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
987 {-1, rep_prefix_4_byte, false}}},
988 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
989 {-1, libcall, false}}}};
990 static stringop_algs bdver2_memset[2] = {
991 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
992 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
993 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
994 {-1, libcall, false}}}};
995
996 const struct processor_costs bdver2_cost = {
997 COSTS_N_INSNS (1), /* cost of an add instruction */
998 COSTS_N_INSNS (1), /* cost of a lea instruction */
999 COSTS_N_INSNS (1), /* variable shift costs */
1000 COSTS_N_INSNS (1), /* constant shift costs */
1001 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1002 COSTS_N_INSNS (4), /* HI */
1003 COSTS_N_INSNS (4), /* SI */
1004 COSTS_N_INSNS (6), /* DI */
1005 COSTS_N_INSNS (6)}, /* other */
1006 0, /* cost of multiply per each bit set */
1007 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1008 COSTS_N_INSNS (35), /* HI */
1009 COSTS_N_INSNS (51), /* SI */
1010 COSTS_N_INSNS (83), /* DI */
1011 COSTS_N_INSNS (83)}, /* other */
1012 COSTS_N_INSNS (1), /* cost of movsx */
1013 COSTS_N_INSNS (1), /* cost of movzx */
1014 8, /* "large" insn */
1015 9, /* MOVE_RATIO */
1016 4, /* cost for loading QImode using movzbl */
1017 {5, 5, 4}, /* cost of loading integer registers
1018 in QImode, HImode and SImode.
1019 Relative to reg-reg move (2). */
1020 {4, 4, 4}, /* cost of storing integer registers */
1021 2, /* cost of reg,reg fld/fst */
1022 {5, 5, 12}, /* cost of loading fp registers
1023 in SFmode, DFmode and XFmode */
1024 {4, 4, 8}, /* cost of storing fp registers
1025 in SFmode, DFmode and XFmode */
1026 2, /* cost of moving MMX register */
1027 {4, 4}, /* cost of loading MMX registers
1028 in SImode and DImode */
1029 {4, 4}, /* cost of storing MMX registers
1030 in SImode and DImode */
1031 2, /* cost of moving SSE register */
1032 {4, 4, 4}, /* cost of loading SSE registers
1033 in SImode, DImode and TImode */
1034 {4, 4, 4}, /* cost of storing SSE registers
1035 in SImode, DImode and TImode */
1036 2, /* MMX or SSE register to integer */
1037 /* On K8:
1038 MOVD reg64, xmmreg Double FSTORE 4
1039 MOVD reg32, xmmreg Double FSTORE 4
1040 On AMDFAM10:
1041 MOVD reg64, xmmreg Double FADD 3
1042 1/1 1/1
1043 MOVD reg32, xmmreg Double FADD 3
1044 1/1 1/1 */
1045 16, /* size of l1 cache. */
1046 2048, /* size of l2 cache. */
1047 64, /* size of prefetch block */
1048   /* New AMD processors never drop prefetches; if they cannot be performed
1049      immediately, they are queued.  We set the number of simultaneous
1050      prefetches to a large constant to reflect this (it is probably not a
1051      good idea to leave the number of prefetches completely unlimited, as
1052      their execution also takes some time). */
1053 100, /* number of parallel prefetches */
1054 2, /* Branch cost */
1055 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1056 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1057 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1058 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1059 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1060 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1061
1062 bdver2_memcpy,
1063 bdver2_memset,
1064 6, /* scalar_stmt_cost. */
1065 4, /* scalar load_cost. */
1066 4, /* scalar_store_cost. */
1067 6, /* vec_stmt_cost. */
1068 0, /* vec_to_scalar_cost. */
1069 2, /* scalar_to_vec_cost. */
1070 4, /* vec_align_load_cost. */
1071 4, /* vec_unalign_load_cost. */
1072 4, /* vec_store_cost. */
1073 2, /* cond_taken_branch_cost. */
1074 1, /* cond_not_taken_branch_cost. */
1075 };
1076
1077
1078 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1079    very small blocks it is better to use a loop.  For large blocks, a libcall
1080    can do non-temporal accesses and beat inline code considerably. */
1081 static stringop_algs bdver3_memcpy[2] = {
1082 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1083 {-1, rep_prefix_4_byte, false}}},
1084 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}};
1086 static stringop_algs bdver3_memset[2] = {
1087 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1088 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1089 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1090 {-1, libcall, false}}}};
1091 struct processor_costs bdver3_cost = {
1092 COSTS_N_INSNS (1), /* cost of an add instruction */
1093 COSTS_N_INSNS (1), /* cost of a lea instruction */
1094 COSTS_N_INSNS (1), /* variable shift costs */
1095 COSTS_N_INSNS (1), /* constant shift costs */
1096 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1097 COSTS_N_INSNS (4), /* HI */
1098 COSTS_N_INSNS (4), /* SI */
1099 COSTS_N_INSNS (6), /* DI */
1100 COSTS_N_INSNS (6)}, /* other */
1101 0, /* cost of multiply per each bit set */
1102 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1103 COSTS_N_INSNS (35), /* HI */
1104 COSTS_N_INSNS (51), /* SI */
1105 COSTS_N_INSNS (83), /* DI */
1106 COSTS_N_INSNS (83)}, /* other */
1107 COSTS_N_INSNS (1), /* cost of movsx */
1108 COSTS_N_INSNS (1), /* cost of movzx */
1109 8, /* "large" insn */
1110 9, /* MOVE_RATIO */
1111 4, /* cost for loading QImode using movzbl */
1112 {5, 5, 4}, /* cost of loading integer registers
1113 in QImode, HImode and SImode.
1114 Relative to reg-reg move (2). */
1115 {4, 4, 4}, /* cost of storing integer registers */
1116 2, /* cost of reg,reg fld/fst */
1117 {5, 5, 12}, /* cost of loading fp registers
1118 in SFmode, DFmode and XFmode */
1119 {4, 4, 8}, /* cost of storing fp registers
1120 in SFmode, DFmode and XFmode */
1121 2, /* cost of moving MMX register */
1122 {4, 4}, /* cost of loading MMX registers
1123 in SImode and DImode */
1124 {4, 4}, /* cost of storing MMX registers
1125 in SImode and DImode */
1126 2, /* cost of moving SSE register */
1127 {4, 4, 4}, /* cost of loading SSE registers
1128 in SImode, DImode and TImode */
1129 {4, 4, 4}, /* cost of storing SSE registers
1130 in SImode, DImode and TImode */
1131 2, /* MMX or SSE register to integer */
1132 16, /* size of l1 cache. */
1133 2048, /* size of l2 cache. */
1134 64, /* size of prefetch block */
1135   /* New AMD processors never drop prefetches; if they cannot be performed
1136      immediately, they are queued.  We set the number of simultaneous
1137      prefetches to a large constant to reflect this (it is probably not a
1138      good idea to leave the number of prefetches completely unlimited, as
1139      their execution also takes some time). */
1140 100, /* number of parallel prefetches */
1141 2, /* Branch cost */
1142 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1143 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1144 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1145 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1146 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1147 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1148
1149 bdver3_memcpy,
1150 bdver3_memset,
1151 6, /* scalar_stmt_cost. */
1152 4, /* scalar load_cost. */
1153 4, /* scalar_store_cost. */
1154 6, /* vec_stmt_cost. */
1155 0, /* vec_to_scalar_cost. */
1156 2, /* scalar_to_vec_cost. */
1157 4, /* vec_align_load_cost. */
1158 4, /* vec_unalign_load_cost. */
1159 4, /* vec_store_cost. */
1160 2, /* cond_taken_branch_cost. */
1161 1, /* cond_not_taken_branch_cost. */
1162 };
1163
1164 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1165    very small blocks it is better to use a loop.  For large blocks, a libcall
1166    can do non-temporal accesses and beat inline code considerably. */
1167 static stringop_algs btver1_memcpy[2] = {
1168 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1169 {-1, rep_prefix_4_byte, false}}},
1170 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1171 {-1, libcall, false}}}};
1172 static stringop_algs btver1_memset[2] = {
1173 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1174 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1175 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1176 {-1, libcall, false}}}};
1177 const struct processor_costs btver1_cost = {
1178 COSTS_N_INSNS (1), /* cost of an add instruction */
1179 COSTS_N_INSNS (2), /* cost of a lea instruction */
1180 COSTS_N_INSNS (1), /* variable shift costs */
1181 COSTS_N_INSNS (1), /* constant shift costs */
1182 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1183 COSTS_N_INSNS (4), /* HI */
1184 COSTS_N_INSNS (3), /* SI */
1185 COSTS_N_INSNS (4), /* DI */
1186 COSTS_N_INSNS (5)}, /* other */
1187 0, /* cost of multiply per each bit set */
1188 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1189 COSTS_N_INSNS (35), /* HI */
1190 COSTS_N_INSNS (51), /* SI */
1191 COSTS_N_INSNS (83), /* DI */
1192 COSTS_N_INSNS (83)}, /* other */
1193 COSTS_N_INSNS (1), /* cost of movsx */
1194 COSTS_N_INSNS (1), /* cost of movzx */
1195 8, /* "large" insn */
1196 9, /* MOVE_RATIO */
1197 4, /* cost for loading QImode using movzbl */
1198 {3, 4, 3}, /* cost of loading integer registers
1199 in QImode, HImode and SImode.
1200 Relative to reg-reg move (2). */
1201 {3, 4, 3}, /* cost of storing integer registers */
1202 4, /* cost of reg,reg fld/fst */
1203 {4, 4, 12}, /* cost of loading fp registers
1204 in SFmode, DFmode and XFmode */
1205 {6, 6, 8}, /* cost of storing fp registers
1206 in SFmode, DFmode and XFmode */
1207 2, /* cost of moving MMX register */
1208 {3, 3}, /* cost of loading MMX registers
1209 in SImode and DImode */
1210 {4, 4}, /* cost of storing MMX registers
1211 in SImode and DImode */
1212 2, /* cost of moving SSE register */
1213 {4, 4, 3}, /* cost of loading SSE registers
1214 in SImode, DImode and TImode */
1215 {4, 4, 5}, /* cost of storing SSE registers
1216 in SImode, DImode and TImode */
1217 3, /* MMX or SSE register to integer */
1218 /* On K8:
1219 MOVD reg64, xmmreg Double FSTORE 4
1220 MOVD reg32, xmmreg Double FSTORE 4
1221 On AMDFAM10:
1222 MOVD reg64, xmmreg Double FADD 3
1223 1/1 1/1
1224 MOVD reg32, xmmreg Double FADD 3
1225 1/1 1/1 */
1226 32, /* size of l1 cache. */
1227 512, /* size of l2 cache. */
1228 64, /* size of prefetch block */
1229 100, /* number of parallel prefetches */
1230 2, /* Branch cost */
1231 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1232 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1233 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1234 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1235 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1236 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1237
1238 btver1_memcpy,
1239 btver1_memset,
1240 4, /* scalar_stmt_cost. */
1241 2, /* scalar load_cost. */
1242 2, /* scalar_store_cost. */
1243 6, /* vec_stmt_cost. */
1244 0, /* vec_to_scalar_cost. */
1245 2, /* scalar_to_vec_cost. */
1246 2, /* vec_align_load_cost. */
1247 2, /* vec_unalign_load_cost. */
1248 2, /* vec_store_cost. */
1249 2, /* cond_taken_branch_cost. */
1250 1, /* cond_not_taken_branch_cost. */
1251 };
1252
1253 static stringop_algs btver2_memcpy[2] = {
1254 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1255 {-1, rep_prefix_4_byte, false}}},
1256 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1257 {-1, libcall, false}}}};
1258 static stringop_algs btver2_memset[2] = {
1259 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1260 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1261 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1262 {-1, libcall, false}}}};
1263 const struct processor_costs btver2_cost = {
1264 COSTS_N_INSNS (1), /* cost of an add instruction */
1265 COSTS_N_INSNS (2), /* cost of a lea instruction */
1266 COSTS_N_INSNS (1), /* variable shift costs */
1267 COSTS_N_INSNS (1), /* constant shift costs */
1268 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1269 COSTS_N_INSNS (4), /* HI */
1270 COSTS_N_INSNS (3), /* SI */
1271 COSTS_N_INSNS (4), /* DI */
1272 COSTS_N_INSNS (5)}, /* other */
1273 0, /* cost of multiply per each bit set */
1274 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1275 COSTS_N_INSNS (35), /* HI */
1276 COSTS_N_INSNS (51), /* SI */
1277 COSTS_N_INSNS (83), /* DI */
1278 COSTS_N_INSNS (83)}, /* other */
1279 COSTS_N_INSNS (1), /* cost of movsx */
1280 COSTS_N_INSNS (1), /* cost of movzx */
1281 8, /* "large" insn */
1282 9, /* MOVE_RATIO */
1283 4, /* cost for loading QImode using movzbl */
1284 {3, 4, 3}, /* cost of loading integer registers
1285 in QImode, HImode and SImode.
1286 Relative to reg-reg move (2). */
1287 {3, 4, 3}, /* cost of storing integer registers */
1288 4, /* cost of reg,reg fld/fst */
1289 {4, 4, 12}, /* cost of loading fp registers
1290 in SFmode, DFmode and XFmode */
1291 {6, 6, 8}, /* cost of storing fp registers
1292 in SFmode, DFmode and XFmode */
1293 2, /* cost of moving MMX register */
1294 {3, 3}, /* cost of loading MMX registers
1295 in SImode and DImode */
1296 {4, 4}, /* cost of storing MMX registers
1297 in SImode and DImode */
1298 2, /* cost of moving SSE register */
1299 {4, 4, 3}, /* cost of loading SSE registers
1300 in SImode, DImode and TImode */
1301 {4, 4, 5}, /* cost of storing SSE registers
1302 in SImode, DImode and TImode */
1303 3, /* MMX or SSE register to integer */
1304 /* On K8:
1305 MOVD reg64, xmmreg Double FSTORE 4
1306 MOVD reg32, xmmreg Double FSTORE 4
1307 On AMDFAM10:
1308 MOVD reg64, xmmreg Double FADD 3
1309 1/1 1/1
1310 MOVD reg32, xmmreg Double FADD 3
1311 1/1 1/1 */
1312 32, /* size of l1 cache. */
1313 2048, /* size of l2 cache. */
1314 64, /* size of prefetch block */
1315 100, /* number of parallel prefetches */
1316 2, /* Branch cost */
1317 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1318 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1319 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1320 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1321 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1322 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1323 btver2_memcpy,
1324 btver2_memset,
1325 4, /* scalar_stmt_cost. */
1326 2, /* scalar load_cost. */
1327 2, /* scalar_store_cost. */
1328 6, /* vec_stmt_cost. */
1329 0, /* vec_to_scalar_cost. */
1330 2, /* scalar_to_vec_cost. */
1331 2, /* vec_align_load_cost. */
1332 2, /* vec_unalign_load_cost. */
1333 2, /* vec_store_cost. */
1334 2, /* cond_taken_branch_cost. */
1335 1, /* cond_not_taken_branch_cost. */
1336 };
1337
1338 static stringop_algs pentium4_memcpy[2] = {
1339 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1340 DUMMY_STRINGOP_ALGS};
1341 static stringop_algs pentium4_memset[2] = {
1342 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1343 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1344 DUMMY_STRINGOP_ALGS};
1345
1346 static const
1347 struct processor_costs pentium4_cost = {
1348 COSTS_N_INSNS (1), /* cost of an add instruction */
1349 COSTS_N_INSNS (3), /* cost of a lea instruction */
1350 COSTS_N_INSNS (4), /* variable shift costs */
1351 COSTS_N_INSNS (4), /* constant shift costs */
1352 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1353 COSTS_N_INSNS (15), /* HI */
1354 COSTS_N_INSNS (15), /* SI */
1355 COSTS_N_INSNS (15), /* DI */
1356 COSTS_N_INSNS (15)}, /* other */
1357 0, /* cost of multiply per each bit set */
1358 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1359 COSTS_N_INSNS (56), /* HI */
1360 COSTS_N_INSNS (56), /* SI */
1361 COSTS_N_INSNS (56), /* DI */
1362 COSTS_N_INSNS (56)}, /* other */
1363 COSTS_N_INSNS (1), /* cost of movsx */
1364 COSTS_N_INSNS (1), /* cost of movzx */
1365 16, /* "large" insn */
1366 6, /* MOVE_RATIO */
1367 2, /* cost for loading QImode using movzbl */
1368 {4, 5, 4}, /* cost of loading integer registers
1369 in QImode, HImode and SImode.
1370 Relative to reg-reg move (2). */
1371 {2, 3, 2}, /* cost of storing integer registers */
1372 2, /* cost of reg,reg fld/fst */
1373 {2, 2, 6}, /* cost of loading fp registers
1374 in SFmode, DFmode and XFmode */
1375 {4, 4, 6}, /* cost of storing fp registers
1376 in SFmode, DFmode and XFmode */
1377 2, /* cost of moving MMX register */
1378 {2, 2}, /* cost of loading MMX registers
1379 in SImode and DImode */
1380 {2, 2}, /* cost of storing MMX registers
1381 in SImode and DImode */
1382 12, /* cost of moving SSE register */
1383 {12, 12, 12}, /* cost of loading SSE registers
1384 in SImode, DImode and TImode */
1385 {2, 2, 8}, /* cost of storing SSE registers
1386 in SImode, DImode and TImode */
1387 10, /* MMX or SSE register to integer */
1388 8, /* size of l1 cache. */
1389 256, /* size of l2 cache. */
1390 64, /* size of prefetch block */
1391 6, /* number of parallel prefetches */
1392 2, /* Branch cost */
1393 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1394 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1395 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1396 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1397 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1398 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1399 pentium4_memcpy,
1400 pentium4_memset,
1401 1, /* scalar_stmt_cost. */
1402 1, /* scalar load_cost. */
1403 1, /* scalar_store_cost. */
1404 1, /* vec_stmt_cost. */
1405 1, /* vec_to_scalar_cost. */
1406 1, /* scalar_to_vec_cost. */
1407 1, /* vec_align_load_cost. */
1408 2, /* vec_unalign_load_cost. */
1409 1, /* vec_store_cost. */
1410 3, /* cond_taken_branch_cost. */
1411 1, /* cond_not_taken_branch_cost. */
1412 };
1413
1414 static stringop_algs nocona_memcpy[2] = {
1415 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1416 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1417 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1418
1419 static stringop_algs nocona_memset[2] = {
1420 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1421 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1422 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1423 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1424
1425 static const
1426 struct processor_costs nocona_cost = {
1427 COSTS_N_INSNS (1), /* cost of an add instruction */
1428 COSTS_N_INSNS (1), /* cost of a lea instruction */
1429 COSTS_N_INSNS (1), /* variable shift costs */
1430 COSTS_N_INSNS (1), /* constant shift costs */
1431 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1432 COSTS_N_INSNS (10), /* HI */
1433 COSTS_N_INSNS (10), /* SI */
1434 COSTS_N_INSNS (10), /* DI */
1435 COSTS_N_INSNS (10)}, /* other */
1436 0, /* cost of multiply per each bit set */
1437 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1438 COSTS_N_INSNS (66), /* HI */
1439 COSTS_N_INSNS (66), /* SI */
1440 COSTS_N_INSNS (66), /* DI */
1441 COSTS_N_INSNS (66)}, /* other */
1442 COSTS_N_INSNS (1), /* cost of movsx */
1443 COSTS_N_INSNS (1), /* cost of movzx */
1444 16, /* "large" insn */
1445 17, /* MOVE_RATIO */
1446 4, /* cost for loading QImode using movzbl */
1447 {4, 4, 4}, /* cost of loading integer registers
1448 in QImode, HImode and SImode.
1449 Relative to reg-reg move (2). */
1450 {4, 4, 4}, /* cost of storing integer registers */
1451 3, /* cost of reg,reg fld/fst */
1452 {12, 12, 12}, /* cost of loading fp registers
1453 in SFmode, DFmode and XFmode */
1454 {4, 4, 4}, /* cost of storing fp registers
1455 in SFmode, DFmode and XFmode */
1456 6, /* cost of moving MMX register */
1457 {12, 12}, /* cost of loading MMX registers
1458 in SImode and DImode */
1459 {12, 12}, /* cost of storing MMX registers
1460 in SImode and DImode */
1461 6, /* cost of moving SSE register */
1462 {12, 12, 12}, /* cost of loading SSE registers
1463 in SImode, DImode and TImode */
1464 {12, 12, 12}, /* cost of storing SSE registers
1465 in SImode, DImode and TImode */
1466 8, /* MMX or SSE register to integer */
1467 8, /* size of l1 cache. */
1468 1024, /* size of l2 cache. */
1469 128, /* size of prefetch block */
1470 8, /* number of parallel prefetches */
1471 1, /* Branch cost */
1472 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1473 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1474 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1475 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1476 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1477 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1478 nocona_memcpy,
1479 nocona_memset,
1480 1, /* scalar_stmt_cost. */
1481 1, /* scalar load_cost. */
1482 1, /* scalar_store_cost. */
1483 1, /* vec_stmt_cost. */
1484 1, /* vec_to_scalar_cost. */
1485 1, /* scalar_to_vec_cost. */
1486 1, /* vec_align_load_cost. */
1487 2, /* vec_unalign_load_cost. */
1488 1, /* vec_store_cost. */
1489 3, /* cond_taken_branch_cost. */
1490 1, /* cond_not_taken_branch_cost. */
1491 };
1492
1493 static stringop_algs atom_memcpy[2] = {
1494 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1495 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1496 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1497 static stringop_algs atom_memset[2] = {
1498 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1499 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1500 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1501 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1502 static const
1503 struct processor_costs atom_cost = {
1504 COSTS_N_INSNS (1), /* cost of an add instruction */
1505 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1506 COSTS_N_INSNS (1), /* variable shift costs */
1507 COSTS_N_INSNS (1), /* constant shift costs */
1508 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1509 COSTS_N_INSNS (4), /* HI */
1510 COSTS_N_INSNS (3), /* SI */
1511 COSTS_N_INSNS (4), /* DI */
1512 COSTS_N_INSNS (2)}, /* other */
1513 0, /* cost of multiply per each bit set */
1514 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1515 COSTS_N_INSNS (26), /* HI */
1516 COSTS_N_INSNS (42), /* SI */
1517 COSTS_N_INSNS (74), /* DI */
1518 COSTS_N_INSNS (74)}, /* other */
1519 COSTS_N_INSNS (1), /* cost of movsx */
1520 COSTS_N_INSNS (1), /* cost of movzx */
1521 8, /* "large" insn */
1522 17, /* MOVE_RATIO */
1523 4, /* cost for loading QImode using movzbl */
1524 {4, 4, 4}, /* cost of loading integer registers
1525 in QImode, HImode and SImode.
1526 Relative to reg-reg move (2). */
1527 {4, 4, 4}, /* cost of storing integer registers */
1528 4, /* cost of reg,reg fld/fst */
1529 {12, 12, 12}, /* cost of loading fp registers
1530 in SFmode, DFmode and XFmode */
1531 {6, 6, 8}, /* cost of storing fp registers
1532 in SFmode, DFmode and XFmode */
1533 2, /* cost of moving MMX register */
1534 {8, 8}, /* cost of loading MMX registers
1535 in SImode and DImode */
1536 {8, 8}, /* cost of storing MMX registers
1537 in SImode and DImode */
1538 2, /* cost of moving SSE register */
1539 {8, 8, 8}, /* cost of loading SSE registers
1540 in SImode, DImode and TImode */
1541 {8, 8, 8}, /* cost of storing SSE registers
1542 in SImode, DImode and TImode */
1543 5, /* MMX or SSE register to integer */
1544 32, /* size of l1 cache. */
1545 256, /* size of l2 cache. */
1546 64, /* size of prefetch block */
1547 6, /* number of parallel prefetches */
1548 3, /* Branch cost */
1549 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1550 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1551 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1552 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1553 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1554 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1555 atom_memcpy,
1556 atom_memset,
1557 1, /* scalar_stmt_cost. */
1558 1, /* scalar load_cost. */
1559 1, /* scalar_store_cost. */
1560 1, /* vec_stmt_cost. */
1561 1, /* vec_to_scalar_cost. */
1562 1, /* scalar_to_vec_cost. */
1563 1, /* vec_align_load_cost. */
1564 2, /* vec_unalign_load_cost. */
1565 1, /* vec_store_cost. */
1566 3, /* cond_taken_branch_cost. */
1567 1, /* cond_not_taken_branch_cost. */
1568 };
1569
1570 static stringop_algs slm_memcpy[2] = {
1571 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1572 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1573 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1574 static stringop_algs slm_memset[2] = {
1575 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1576 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1577 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1578 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1579 static const
1580 struct processor_costs slm_cost = {
1581 COSTS_N_INSNS (1), /* cost of an add instruction */
1582 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1583 COSTS_N_INSNS (1), /* variable shift costs */
1584 COSTS_N_INSNS (1), /* constant shift costs */
1585 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1586 COSTS_N_INSNS (4), /* HI */
1587 COSTS_N_INSNS (3), /* SI */
1588 COSTS_N_INSNS (4), /* DI */
1589 COSTS_N_INSNS (2)}, /* other */
1590 0, /* cost of multiply per each bit set */
1591 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1592 COSTS_N_INSNS (26), /* HI */
1593 COSTS_N_INSNS (42), /* SI */
1594 COSTS_N_INSNS (74), /* DI */
1595 COSTS_N_INSNS (74)}, /* other */
1596 COSTS_N_INSNS (1), /* cost of movsx */
1597 COSTS_N_INSNS (1), /* cost of movzx */
1598 8, /* "large" insn */
1599 17, /* MOVE_RATIO */
1600 4, /* cost for loading QImode using movzbl */
1601 {4, 4, 4}, /* cost of loading integer registers
1602 in QImode, HImode and SImode.
1603 Relative to reg-reg move (2). */
1604 {4, 4, 4}, /* cost of storing integer registers */
1605 4, /* cost of reg,reg fld/fst */
1606 {12, 12, 12}, /* cost of loading fp registers
1607 in SFmode, DFmode and XFmode */
1608 {6, 6, 8}, /* cost of storing fp registers
1609 in SFmode, DFmode and XFmode */
1610 2, /* cost of moving MMX register */
1611 {8, 8}, /* cost of loading MMX registers
1612 in SImode and DImode */
1613 {8, 8}, /* cost of storing MMX registers
1614 in SImode and DImode */
1615 2, /* cost of moving SSE register */
1616 {8, 8, 8}, /* cost of loading SSE registers
1617 in SImode, DImode and TImode */
1618 {8, 8, 8}, /* cost of storing SSE registers
1619 in SImode, DImode and TImode */
1620 5, /* MMX or SSE register to integer */
1621 32, /* size of l1 cache. */
1622 256, /* size of l2 cache. */
1623 64, /* size of prefetch block */
1624 6, /* number of parallel prefetches */
1625 3, /* Branch cost */
1626 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1627 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1628 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1629 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1630 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1631 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1632 slm_memcpy,
1633 slm_memset,
1634 1, /* scalar_stmt_cost. */
1635 1, /* scalar load_cost. */
1636 1, /* scalar_store_cost. */
1637 1, /* vec_stmt_cost. */
1638 1, /* vec_to_scalar_cost. */
1639 1, /* scalar_to_vec_cost. */
1640 1, /* vec_align_load_cost. */
1641 2, /* vec_unalign_load_cost. */
1642 1, /* vec_store_cost. */
1643 3, /* cond_taken_branch_cost. */
1644 1, /* cond_not_taken_branch_cost. */
1645 };
1646
1647 /* Generic should produce code tuned for Core-i7 (and newer chips)
1648 and btver1 (and newer chips). */
1649
1650 static stringop_algs generic_memcpy[2] = {
1651 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1652 {-1, libcall, false}}},
1653 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1654 {-1, libcall, false}}}};
1655 static stringop_algs generic_memset[2] = {
1656 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1657 {-1, libcall, false}}},
1658 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1659 {-1, libcall, false}}}};
1660 static const
1661 struct processor_costs generic_cost = {
1662 COSTS_N_INSNS (1), /* cost of an add instruction */
1663 /* On all chips taken into consideration, lea takes 2 cycles or more.
1664 Modelling that cost, however, makes our current implementation of
1665 synth_mult use unnecessary temporary registers, causing regressions on
1666 several SPECfp benchmarks. */
1667 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1668 COSTS_N_INSNS (1), /* variable shift costs */
1669 COSTS_N_INSNS (1), /* constant shift costs */
1670 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1671 COSTS_N_INSNS (4), /* HI */
1672 COSTS_N_INSNS (3), /* SI */
1673 COSTS_N_INSNS (4), /* DI */
1674 COSTS_N_INSNS (2)}, /* other */
1675 0, /* cost of multiply per each bit set */
1676 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1677 COSTS_N_INSNS (26), /* HI */
1678 COSTS_N_INSNS (42), /* SI */
1679 COSTS_N_INSNS (74), /* DI */
1680 COSTS_N_INSNS (74)}, /* other */
1681 COSTS_N_INSNS (1), /* cost of movsx */
1682 COSTS_N_INSNS (1), /* cost of movzx */
1683 8, /* "large" insn */
1684 17, /* MOVE_RATIO */
1685 4, /* cost for loading QImode using movzbl */
1686 {4, 4, 4}, /* cost of loading integer registers
1687 in QImode, HImode and SImode.
1688 Relative to reg-reg move (2). */
1689 {4, 4, 4}, /* cost of storing integer registers */
1690 4, /* cost of reg,reg fld/fst */
1691 {12, 12, 12}, /* cost of loading fp registers
1692 in SFmode, DFmode and XFmode */
1693 {6, 6, 8}, /* cost of storing fp registers
1694 in SFmode, DFmode and XFmode */
1695 2, /* cost of moving MMX register */
1696 {8, 8}, /* cost of loading MMX registers
1697 in SImode and DImode */
1698 {8, 8}, /* cost of storing MMX registers
1699 in SImode and DImode */
1700 2, /* cost of moving SSE register */
1701 {8, 8, 8}, /* cost of loading SSE registers
1702 in SImode, DImode and TImode */
1703 {8, 8, 8}, /* cost of storing SSE registers
1704 in SImode, DImode and TImode */
1705 5, /* MMX or SSE register to integer */
1706 32, /* size of l1 cache. */
1707 512, /* size of l2 cache. */
1708 64, /* size of prefetch block */
1709 6, /* number of parallel prefetches */
1710 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1711 value is increased to the perhaps more appropriate value of 5. */
1712 3, /* Branch cost */
1713 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1714 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1715 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1716 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1717 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1718 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1719 generic_memcpy,
1720 generic_memset,
1721 1, /* scalar_stmt_cost. */
1722 1, /* scalar load_cost. */
1723 1, /* scalar_store_cost. */
1724 1, /* vec_stmt_cost. */
1725 1, /* vec_to_scalar_cost. */
1726 1, /* scalar_to_vec_cost. */
1727 1, /* vec_align_load_cost. */
1728 2, /* vec_unalign_load_cost. */
1729 1, /* vec_store_cost. */
1730 3, /* cond_taken_branch_cost. */
1731 1, /* cond_not_taken_branch_cost. */
1732 };
1733
1734 /* core_cost should produce code tuned for the Core family of CPUs. */
1735 static stringop_algs core_memcpy[2] = {
1736 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1737 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1738 {-1, libcall, false}}}};
1739 static stringop_algs core_memset[2] = {
1740 {libcall, {{6, loop_1_byte, true},
1741 {24, loop, true},
1742 {8192, rep_prefix_4_byte, true},
1743 {-1, libcall, false}}},
1744 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1745 {-1, libcall, false}}}};
1746
1747 static const
1748 struct processor_costs core_cost = {
1749 COSTS_N_INSNS (1), /* cost of an add instruction */
1750 /* On all chips taken into consideration, lea takes 2 cycles or more.
1751 Modelling that cost, however, makes our current implementation of
1752 synth_mult use unnecessary temporary registers, causing regressions on
1753 several SPECfp benchmarks. */
1754 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1755 COSTS_N_INSNS (1), /* variable shift costs */
1756 COSTS_N_INSNS (1), /* constant shift costs */
1757 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1758 COSTS_N_INSNS (4), /* HI */
1759 COSTS_N_INSNS (3), /* SI */
1760 COSTS_N_INSNS (4), /* DI */
1761 COSTS_N_INSNS (2)}, /* other */
1762 0, /* cost of multiply per each bit set */
1763 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1764 COSTS_N_INSNS (26), /* HI */
1765 COSTS_N_INSNS (42), /* SI */
1766 COSTS_N_INSNS (74), /* DI */
1767 COSTS_N_INSNS (74)}, /* other */
1768 COSTS_N_INSNS (1), /* cost of movsx */
1769 COSTS_N_INSNS (1), /* cost of movzx */
1770 8, /* "large" insn */
1771 17, /* MOVE_RATIO */
1772 4, /* cost for loading QImode using movzbl */
1773 {4, 4, 4}, /* cost of loading integer registers
1774 in QImode, HImode and SImode.
1775 Relative to reg-reg move (2). */
1776 {4, 4, 4}, /* cost of storing integer registers */
1777 4, /* cost of reg,reg fld/fst */
1778 {12, 12, 12}, /* cost of loading fp registers
1779 in SFmode, DFmode and XFmode */
1780 {6, 6, 8}, /* cost of storing fp registers
1781 in SFmode, DFmode and XFmode */
1782 2, /* cost of moving MMX register */
1783 {8, 8}, /* cost of loading MMX registers
1784 in SImode and DImode */
1785 {8, 8}, /* cost of storing MMX registers
1786 in SImode and DImode */
1787 2, /* cost of moving SSE register */
1788 {8, 8, 8}, /* cost of loading SSE registers
1789 in SImode, DImode and TImode */
1790 {8, 8, 8}, /* cost of storing SSE registers
1791 in SImode, DImode and TImode */
1792 5, /* MMX or SSE register to integer */
1793 64, /* size of l1 cache. */
1794 512, /* size of l2 cache. */
1795 64, /* size of prefetch block */
1796 6, /* number of parallel prefetches */
1797 /* FIXME perhaps more appropriate value is 5. */
1798 3, /* Branch cost */
1799 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1800 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1801 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1802 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1803 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1804 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1805 core_memcpy,
1806 core_memset,
1807 1, /* scalar_stmt_cost. */
1808 1, /* scalar load_cost. */
1809 1, /* scalar_store_cost. */
1810 1, /* vec_stmt_cost. */
1811 1, /* vec_to_scalar_cost. */
1812 1, /* scalar_to_vec_cost. */
1813 1, /* vec_align_load_cost. */
1814 2, /* vec_unalign_load_cost. */
1815 1, /* vec_store_cost. */
1816 3, /* cond_taken_branch_cost. */
1817 1, /* cond_not_taken_branch_cost. */
1818 };
1819
1820
1821 /* Set by -mtune. */
1822 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1823
1824 /* Set by -mtune or -Os. */
1825 const struct processor_costs *ix86_cost = &pentium_cost;
1826
1827 /* Processor feature/optimization bitmasks. */
1828 #define m_386 (1<<PROCESSOR_I386)
1829 #define m_486 (1<<PROCESSOR_I486)
1830 #define m_PENT (1<<PROCESSOR_PENTIUM)
1831 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1832 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1833 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1834 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1835 #define m_CORE2 (1<<PROCESSOR_CORE2)
1836 #define m_COREI7 (1<<PROCESSOR_COREI7)
1837 #define m_COREI7_AVX (1<<PROCESSOR_COREI7_AVX)
1838 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1839 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_COREI7_AVX | m_HASWELL)
1840 #define m_ATOM (1<<PROCESSOR_ATOM)
1841 #define m_SLM (1<<PROCESSOR_SLM)
1842
1843 #define m_GEODE (1<<PROCESSOR_GEODE)
1844 #define m_K6 (1<<PROCESSOR_K6)
1845 #define m_K6_GEODE (m_K6 | m_GEODE)
1846 #define m_K8 (1<<PROCESSOR_K8)
1847 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1848 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1849 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1850 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1851 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1852 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1853 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1854 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1855 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1856 #define m_BTVER (m_BTVER1 | m_BTVER2)
1857 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1858
1859 #define m_GENERIC (1<<PROCESSOR_GENERIC)
1860
1861 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
1862 #undef DEF_TUNE
1863 #define DEF_TUNE(tune, name, selector) name,
1864 #include "x86-tune.def"
1865 #undef DEF_TUNE
1866 };
1867
1868 /* Feature tests against the various tunings. */
1869 unsigned char ix86_tune_features[X86_TUNE_LAST];
1870
1871 /* Feature tests against the various tunings used to create ix86_tune_features
1872 based on the processor mask. */
1873 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1874 #undef DEF_TUNE
1875 #define DEF_TUNE(tune, name, selector) selector,
1876 #include "x86-tune.def"
1877 #undef DEF_TUNE
1878 };
1879
1880 /* Feature tests against the various architecture variations. */
1881 unsigned char ix86_arch_features[X86_ARCH_LAST];
1882
1883 /* Feature tests against the various architecture variations, used to create
1884 ix86_arch_features based on the processor mask. */
1885 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
1886 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
1887 ~(m_386 | m_486 | m_PENT | m_K6),
1888
1889 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
1890 ~m_386,
1891
1892 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1893 ~(m_386 | m_486),
1894
1895 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
1896 ~m_386,
1897
1898 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
1899 ~m_386,
1900 };
1901
1902 /* In case the average insn count for a single function invocation is
1903 lower than this constant, emit fast (but longer) prologue and
1904 epilogue code. */
1905 #define FAST_PROLOGUE_INSN_COUNT 20
1906
1907 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
1908 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
1909 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
1910 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
1911
1912 /* Array of the smallest class containing reg number REGNO, indexed by
1913 REGNO. Used by REGNO_REG_CLASS in i386.h. */
1914
1915 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
1916 {
1917 /* ax, dx, cx, bx */
1918 AREG, DREG, CREG, BREG,
1919 /* si, di, bp, sp */
1920 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
1921 /* FP registers */
1922 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
1923 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
1924 /* arg pointer */
1925 NON_Q_REGS,
1926 /* flags, fpsr, fpcr, frame */
1927 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
1928 /* SSE registers */
1929 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1930 SSE_REGS, SSE_REGS,
1931 /* MMX registers */
1932 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
1933 MMX_REGS, MMX_REGS,
1934 /* REX registers */
1935 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1936 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1937 /* SSE REX registers */
1938 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1939 SSE_REGS, SSE_REGS,
1940 /* AVX-512 SSE registers */
1941 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1942 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1943 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1944 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1945 /* Mask registers. */
1946 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
1947 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
1948 /* MPX bound registers */
1949 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
1950 };
1951
1952 /* The "default" register map used in 32bit mode. */
1953
1954 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
1955 {
1956 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
1957 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
1958 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1959 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
1960 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
1961 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1962 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
1963 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
1964 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
1965 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
1966 101, 102, 103, 104, /* bound registers */
1967 };
1968
1969 /* The "default" register map used in 64bit mode. */
1970
1971 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
1972 {
1973 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
1974 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
1975 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1976 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
1977 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
1978 8,9,10,11,12,13,14,15, /* extended integer registers */
1979 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
1980 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
1981 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
1982 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
1983 126, 127, 128, 129, /* bound registers */
1984 };
1985
1986 /* Define the register numbers to be used in Dwarf debugging information.
1987 The SVR4 reference port C compiler uses the following register numbers
1988 in its Dwarf output code:
1989 0 for %eax (gcc regno = 0)
1990 1 for %ecx (gcc regno = 2)
1991 2 for %edx (gcc regno = 1)
1992 3 for %ebx (gcc regno = 3)
1993 4 for %esp (gcc regno = 7)
1994 5 for %ebp (gcc regno = 6)
1995 6 for %esi (gcc regno = 4)
1996 7 for %edi (gcc regno = 5)
1997 The following three DWARF register numbers are never generated by
1998 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
1999 believes these numbers have these meanings.
2000 8 for %eip (no gcc equivalent)
2001 9 for %eflags (gcc regno = 17)
2002 10 for %trapno (no gcc equivalent)
2003 It is not at all clear how we should number the FP stack registers
2004 for the x86 architecture. If the version of SDB on x86/svr4 were
2005 a bit less brain dead with respect to floating-point then we would
2006 have a precedent to follow with respect to DWARF register numbers
2007 for x86 FP registers, but the SDB on x86/svr4 is so completely
2008 broken with respect to FP registers that it is hardly worth thinking
2009 of it as something to strive for compatibility with.
2010 The version of x86/svr4 SDB I have at the moment does (partially)
2011 seem to believe that DWARF register number 11 is associated with
2012 the x86 register %st(0), but that's about all. Higher DWARF
2013 register numbers don't seem to be associated with anything in
2014 particular, and even for DWARF regno 11, SDB only seems to under-
2015 stand that it should say that a variable lives in %st(0) (when
2016 asked via an `=' command) if we said it was in DWARF regno 11,
2017 but SDB still prints garbage when asked for the value of the
2018 variable in question (via a `/' command).
2019 (Also note that the labels SDB prints for various FP stack regs
2020 when doing an `x' command are all wrong.)
2021 Note that these problems generally don't affect the native SVR4
2022 C compiler because it doesn't allow the use of -O with -g and
2023 because when it is *not* optimizing, it allocates a memory
2024 location for each floating-point variable, and the memory
2025 location is what gets described in the DWARF AT_location
2026 attribute for the variable in question.
2027 Regardless of the severe mental illness of the x86/svr4 SDB, we
2028 do something sensible here and we use the following DWARF
2029 register numbers. Note that these are all stack-top-relative
2030 numbers.
2031 11 for %st(0) (gcc regno = 8)
2032 12 for %st(1) (gcc regno = 9)
2033 13 for %st(2) (gcc regno = 10)
2034 14 for %st(3) (gcc regno = 11)
2035 15 for %st(4) (gcc regno = 12)
2036 16 for %st(5) (gcc regno = 13)
2037 17 for %st(6) (gcc regno = 14)
2038 18 for %st(7) (gcc regno = 15)
2039 */
2040 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2041 {
2042 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2043 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2044 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2045 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2046 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2047 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2048 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2049 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2050 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2051 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2052 -1, -1, -1, -1, /* bound registers */
2053 };
2054
2055 /* Define parameter passing and return registers. */
2056
2057 static int const x86_64_int_parameter_registers[6] =
2058 {
2059 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2060 };
2061
2062 static int const x86_64_ms_abi_int_parameter_registers[4] =
2063 {
2064 CX_REG, DX_REG, R8_REG, R9_REG
2065 };
2066
2067 static int const x86_64_int_return_registers[4] =
2068 {
2069 AX_REG, DX_REG, DI_REG, SI_REG
2070 };
2071
2072 /* Additional registers that are clobbered by SYSV calls. */
2073
2074 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2075 {
2076 SI_REG, DI_REG,
2077 XMM6_REG, XMM7_REG,
2078 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2079 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2080 };
2081
2082 /* Define the structure for the machine field in struct function. */
2083
2084 struct GTY(()) stack_local_entry {
2085 unsigned short mode;
2086 unsigned short n;
2087 rtx rtl;
2088 struct stack_local_entry *next;
2089 };
2090
2091 /* Structure describing stack frame layout.
2092 Stack grows downward:
2093
2094 [arguments]
2095 <- ARG_POINTER
2096 saved pc
2097
2098 saved static chain if ix86_static_chain_on_stack
2099
2100 saved frame pointer if frame_pointer_needed
2101 <- HARD_FRAME_POINTER
2102 [saved regs]
2103 <- regs_save_offset
2104 [padding0]
2105
2106 [saved SSE regs]
2107 <- sse_regs_save_offset
2108 [padding1] |
2109 | <- FRAME_POINTER
2110 [va_arg registers] |
2111 |
2112 [frame] |
2113 |
2114 [padding2] | = to_allocate
2115 <- STACK_POINTER
2116 */
2117 struct ix86_frame
2118 {
2119 int nsseregs;
2120 int nregs;
2121 int va_arg_size;
2122 int red_zone_size;
2123 int outgoing_arguments_size;
2124
2125 /* The offsets relative to ARG_POINTER. */
2126 HOST_WIDE_INT frame_pointer_offset;
2127 HOST_WIDE_INT hard_frame_pointer_offset;
2128 HOST_WIDE_INT stack_pointer_offset;
2129 HOST_WIDE_INT hfp_save_offset;
2130 HOST_WIDE_INT reg_save_offset;
2131 HOST_WIDE_INT sse_reg_save_offset;
2132
2133 /* When save_regs_using_mov is set, emit prologue using
2134 move instead of push instructions. */
2135 bool save_regs_using_mov;
2136 };
2137
2138 /* Which cpu are we scheduling for. */
2139 enum attr_cpu ix86_schedule;
2140
2141 /* Which cpu are we optimizing for. */
2142 enum processor_type ix86_tune;
2143
2144 /* Which instruction set architecture to use. */
2145 enum processor_type ix86_arch;
2146
2147 /* True if processor has SSE prefetch instruction. */
2148 unsigned char x86_prefetch_sse;
2149
2150 /* -mstackrealign option */
2151 static const char ix86_force_align_arg_pointer_string[]
2152 = "force_align_arg_pointer";
2153
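/* Generator functions for a few common insn patterns.  They are initialized
   when the target options are processed, picking the SImode or DImode
   variant of each pattern to match the target word size.  */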
2154 static rtx (*ix86_gen_leave) (void);
2155 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2156 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2157 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2158 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2159 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2160 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2161 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2162 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2163 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2164 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2165 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2166
2167 /* Preferred alignment for stack boundary in bits. */
2168 unsigned int ix86_preferred_stack_boundary;
2169
2170 /* Alignment for incoming stack boundary in bits specified at
2171 command line. */
2172 static unsigned int ix86_user_incoming_stack_boundary;
2173
2174 /* Default alignment for incoming stack boundary in bits. */
2175 static unsigned int ix86_default_incoming_stack_boundary;
2176
2177 /* Alignment for incoming stack boundary in bits. */
2178 unsigned int ix86_incoming_stack_boundary;
2179
2180 /* Calling abi specific va_list type nodes. */
2181 static GTY(()) tree sysv_va_list_type_node;
2182 static GTY(()) tree ms_va_list_type_node;
2183
2184 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2185 char internal_label_prefix[16];
2186 int internal_label_prefix_len;
2187
2188 /* Fence to use after loop using movnt. */
2189 tree x86_mfence;
2190
2191 /* Register class used for passing a given 64-bit part of the argument.
2192 These represent the classes documented by the psABI, with the exception
2193 of the SSESF and SSEDF classes, which are basically the SSE class: gcc just
2194 uses an SFmode or DFmode move instead of DImode to avoid reformatting penalties.
2195
2196 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2197 whenever possible (the upper half then contains only padding). */
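/* For example, a scalar float argument is classified as X86_64_SSESF_CLASS
   and a double as X86_64_SSEDF_CLASS, so the value can be moved in SFmode or
   DFmode; a 32-bit int is classified as X86_64_INTEGERSI_CLASS so that an
   SImode move can be used.  */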
2198 enum x86_64_reg_class
2199 {
2200 X86_64_NO_CLASS,
2201 X86_64_INTEGER_CLASS,
2202 X86_64_INTEGERSI_CLASS,
2203 X86_64_SSE_CLASS,
2204 X86_64_SSESF_CLASS,
2205 X86_64_SSEDF_CLASS,
2206 X86_64_SSEUP_CLASS,
2207 X86_64_X87_CLASS,
2208 X86_64_X87UP_CLASS,
2209 X86_64_COMPLEX_X87_CLASS,
2210 X86_64_MEMORY_CLASS
2211 };
2212
2213 #define MAX_CLASSES 4
2214
2215 /* Table of constants used by fldpi, fldln2, etc.... */
2216 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2217 static bool ext_80387_constants_init = 0;
2218
2219 \f
2220 static struct machine_function * ix86_init_machine_status (void);
2221 static rtx ix86_function_value (const_tree, const_tree, bool);
2222 static bool ix86_function_value_regno_p (const unsigned int);
2223 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2224 const_tree);
2225 static rtx ix86_static_chain (const_tree, bool);
2226 static int ix86_function_regparm (const_tree, const_tree);
2227 static void ix86_compute_frame_layout (struct ix86_frame *);
2228 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2229 rtx, rtx, int);
2230 static void ix86_add_new_builtins (HOST_WIDE_INT);
2231 static tree ix86_canonical_va_list_type (tree);
2232 static void predict_jump (int);
2233 static unsigned int split_stack_prologue_scratch_regno (void);
2234 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2235
2236 enum ix86_function_specific_strings
2237 {
2238 IX86_FUNCTION_SPECIFIC_ARCH,
2239 IX86_FUNCTION_SPECIFIC_TUNE,
2240 IX86_FUNCTION_SPECIFIC_MAX
2241 };
2242
2243 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2244 const char *, enum fpmath_unit, bool);
2245 static void ix86_function_specific_save (struct cl_target_option *,
2246 struct gcc_options *opts);
2247 static void ix86_function_specific_restore (struct gcc_options *opts,
2248 struct cl_target_option *);
2249 static void ix86_function_specific_print (FILE *, int,
2250 struct cl_target_option *);
2251 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2252 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2253 struct gcc_options *,
2254 struct gcc_options *,
2255 struct gcc_options *);
2256 static bool ix86_can_inline_p (tree, tree);
2257 static void ix86_set_current_function (tree);
2258 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2259
2260 static enum calling_abi ix86_function_abi (const_tree);
2261
2262 \f
2263 #ifndef SUBTARGET32_DEFAULT_CPU
2264 #define SUBTARGET32_DEFAULT_CPU "i386"
2265 #endif
2266
2267 /* Whether -mtune= or -march= were specified */
2268 static int ix86_tune_defaulted;
2269 static int ix86_arch_specified;
2270
2271 /* Vectorization library interface and handlers. */
2272 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2273
2274 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2275 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2276
2277 /* Processor target table, indexed by processor number */
2278 struct ptt
2279 {
2280 const struct processor_costs *cost; /* Processor costs */
2281 const int align_loop; /* Default alignments. */
2282 const int align_loop_max_skip;
2283 const int align_jump;
2284 const int align_jump_max_skip;
2285 const int align_func;
2286 };
2287
2288 static const struct ptt processor_target_table[PROCESSOR_max] =
2289 {
2290 {&i386_cost, 4, 3, 4, 3, 4},
2291 {&i486_cost, 16, 15, 16, 15, 16},
2292 {&pentium_cost, 16, 7, 16, 7, 16},
2293 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2294 {&geode_cost, 0, 0, 0, 0, 0},
2295 {&k6_cost, 32, 7, 32, 7, 32},
2296 {&athlon_cost, 16, 7, 16, 7, 16},
2297 {&pentium4_cost, 0, 0, 0, 0, 0},
2298 {&k8_cost, 16, 7, 16, 7, 16},
2299 {&nocona_cost, 0, 0, 0, 0, 0},
2300 /* Core 2 */
2301 {&core_cost, 16, 10, 16, 10, 16},
2302 /* Core i7 */
2303 {&core_cost, 16, 10, 16, 10, 16},
2304 /* Core i7 avx */
2305 {&core_cost, 16, 10, 16, 10, 16},
2306 /* Core avx2 */
2307 {&core_cost, 16, 10, 16, 10, 16},
2308 {&generic_cost, 16, 10, 16, 10, 16},
2309 {&amdfam10_cost, 32, 24, 32, 7, 32},
2310 {&bdver1_cost, 16, 10, 16, 7, 11},
2311 {&bdver2_cost, 16, 10, 16, 7, 11},
2312 {&bdver3_cost, 16, 10, 16, 7, 11},
2313 {&btver1_cost, 16, 10, 16, 7, 11},
2314 {&btver2_cost, 16, 10, 16, 7, 11},
2315 {&atom_cost, 16, 15, 16, 7, 16},
2316 {&slm_cost, 16, 15, 16, 7, 16}
2317 };
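/* The table above is indexed by enum processor_type, so its rows must be
   kept in the same order as the PROCESSOR_* values.  */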
2318
2319 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2320 {
2321 "generic",
2322 "i386",
2323 "i486",
2324 "pentium",
2325 "pentium-mmx",
2326 "pentiumpro",
2327 "pentium2",
2328 "pentium3",
2329 "pentium4",
2330 "pentium-m",
2331 "prescott",
2332 "nocona",
2333 "core2",
2334 "corei7",
2335 "corei7-avx",
2336 "core-avx2",
2337 "atom",
2338 "slm",
2339 "geode",
2340 "k6",
2341 "k6-2",
2342 "k6-3",
2343 "athlon",
2344 "athlon-4",
2345 "k8",
2346 "amdfam10",
2347 "bdver1",
2348 "bdver2",
2349 "bdver3",
2350 "btver1",
2351 "btver2"
2352 };
2353 \f
2354 static bool
2355 gate_insert_vzeroupper (void)
2356 {
2357 return TARGET_AVX && !TARGET_AVX512F && TARGET_VZEROUPPER;
2358 }
2359
2360 static unsigned int
2361 rest_of_handle_insert_vzeroupper (void)
2362 {
2363 int i;
2364
2365 /* vzeroupper instructions are inserted immediately after reload to
2366 account for possible spills from 256-bit registers. The pass
2367 reuses the mode-switching infrastructure by re-running the mode-insertion
2368 pass, so disable entities that have already been processed. */
2369 for (i = 0; i < MAX_386_ENTITIES; i++)
2370 ix86_optimize_mode_switching[i] = 0;
2371
2372 ix86_optimize_mode_switching[AVX_U128] = 1;
2373
2374 /* Call optimize_mode_switching. */
2375 g->get_passes ()->execute_pass_mode_switching ();
2376 return 0;
2377 }
2378
2379 namespace {
2380
2381 const pass_data pass_data_insert_vzeroupper =
2382 {
2383 RTL_PASS, /* type */
2384 "vzeroupper", /* name */
2385 OPTGROUP_NONE, /* optinfo_flags */
2386 true, /* has_gate */
2387 true, /* has_execute */
2388 TV_NONE, /* tv_id */
2389 0, /* properties_required */
2390 0, /* properties_provided */
2391 0, /* properties_destroyed */
2392 0, /* todo_flags_start */
2393 ( TODO_df_finish | TODO_verify_rtl_sharing | 0 ), /* todo_flags_finish */
2394 };
2395
2396 class pass_insert_vzeroupper : public rtl_opt_pass
2397 {
2398 public:
2399 pass_insert_vzeroupper(gcc::context *ctxt)
2400 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2401 {}
2402
2403 /* opt_pass methods: */
2404 bool gate () { return gate_insert_vzeroupper (); }
2405 unsigned int execute () { return rest_of_handle_insert_vzeroupper (); }
2406
2407 }; // class pass_insert_vzeroupper
2408
2409 } // anon namespace
2410
2411 rtl_opt_pass *
2412 make_pass_insert_vzeroupper (gcc::context *ctxt)
2413 {
2414 return new pass_insert_vzeroupper (ctxt);
2415 }
2416
2417 /* Return true if a red-zone is in use. */
2418
2419 static inline bool
2420 ix86_using_red_zone (void)
2421 {
2422 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2423 }
2424 \f
2425 /* Return a string that documents the current -m options. The caller is
2426 responsible for freeing the string. */
2427
2428 static char *
2429 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2430 const char *tune, enum fpmath_unit fpmath,
2431 bool add_nl_p)
2432 {
2433 struct ix86_target_opts
2434 {
2435 const char *option; /* option string */
2436 HOST_WIDE_INT mask; /* isa mask options */
2437 };
2438
2439 /* This table is ordered so that options like -msse4.2, which imply
2440 preceding options, are matched first. */
2441 static struct ix86_target_opts isa_opts[] =
2442 {
2443 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2444 { "-mfma", OPTION_MASK_ISA_FMA },
2445 { "-mxop", OPTION_MASK_ISA_XOP },
2446 { "-mlwp", OPTION_MASK_ISA_LWP },
2447 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2448 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2449 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2450 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2451 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2452 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2453 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2454 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2455 { "-msse3", OPTION_MASK_ISA_SSE3 },
2456 { "-msse2", OPTION_MASK_ISA_SSE2 },
2457 { "-msse", OPTION_MASK_ISA_SSE },
2458 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2459 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2460 { "-mmmx", OPTION_MASK_ISA_MMX },
2461 { "-mabm", OPTION_MASK_ISA_ABM },
2462 { "-mbmi", OPTION_MASK_ISA_BMI },
2463 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2464 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2465 { "-mhle", OPTION_MASK_ISA_HLE },
2466 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2467 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2468 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2469 { "-madx", OPTION_MASK_ISA_ADX },
2470 { "-mtbm", OPTION_MASK_ISA_TBM },
2471 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2472 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2473 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2474 { "-maes", OPTION_MASK_ISA_AES },
2475 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2476 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2477 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2478 { "-mf16c", OPTION_MASK_ISA_F16C },
2479 { "-mrtm", OPTION_MASK_ISA_RTM },
2480 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2481 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2482 { "-mmpx", OPTION_MASK_ISA_MPX },
2483 };
2484
2485 /* Flag options. */
2486 static struct ix86_target_opts flag_opts[] =
2487 {
2488 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2489 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2490 { "-m80387", MASK_80387 },
2491 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2492 { "-malign-double", MASK_ALIGN_DOUBLE },
2493 { "-mcld", MASK_CLD },
2494 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2495 { "-mieee-fp", MASK_IEEE_FP },
2496 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2497 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2498 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2499 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2500 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2501 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2502 { "-mno-red-zone", MASK_NO_RED_ZONE },
2503 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2504 { "-mrecip", MASK_RECIP },
2505 { "-mrtd", MASK_RTD },
2506 { "-msseregparm", MASK_SSEREGPARM },
2507 { "-mstack-arg-probe", MASK_STACK_PROBE },
2508 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2509 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2510 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2511 { "-mvzeroupper", MASK_VZEROUPPER },
2512 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2513 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2514 { "-mprefer-avx128", MASK_PREFER_AVX128},
2515 };
2516
2517 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2518
2519 char isa_other[40];
2520 char target_other[40];
2521 unsigned num = 0;
2522 unsigned i, j;
2523 char *ret;
2524 char *ptr;
2525 size_t len;
2526 size_t line_len;
2527 size_t sep_len;
2528 const char *abi;
2529
2530 memset (opts, '\0', sizeof (opts));
2531
2532 /* Add -march= option. */
2533 if (arch)
2534 {
2535 opts[num][0] = "-march=";
2536 opts[num++][1] = arch;
2537 }
2538
2539 /* Add -mtune= option. */
2540 if (tune)
2541 {
2542 opts[num][0] = "-mtune=";
2543 opts[num++][1] = tune;
2544 }
2545
2546 /* Add -m32/-m64/-mx32. */
2547 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2548 {
2549 if ((isa & OPTION_MASK_ABI_64) != 0)
2550 abi = "-m64";
2551 else
2552 abi = "-mx32";
2553 isa &= ~ (OPTION_MASK_ISA_64BIT
2554 | OPTION_MASK_ABI_64
2555 | OPTION_MASK_ABI_X32);
2556 }
2557 else
2558 abi = "-m32";
2559 opts[num++][0] = abi;
2560
2561 /* Pick out the options in isa options. */
2562 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2563 {
2564 if ((isa & isa_opts[i].mask) != 0)
2565 {
2566 opts[num++][0] = isa_opts[i].option;
2567 isa &= ~ isa_opts[i].mask;
2568 }
2569 }
2570
2571 if (isa && add_nl_p)
2572 {
2573 opts[num++][0] = isa_other;
2574 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2575 isa);
2576 }
2577
2578 /* Add flag options. */
2579 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2580 {
2581 if ((flags & flag_opts[i].mask) != 0)
2582 {
2583 opts[num++][0] = flag_opts[i].option;
2584 flags &= ~ flag_opts[i].mask;
2585 }
2586 }
2587
2588 if (flags && add_nl_p)
2589 {
2590 opts[num++][0] = target_other;
2591 sprintf (target_other, "(other flags: %#x)", flags);
2592 }
2593
2594 /* Add -fpmath= option. */
2595 if (fpmath)
2596 {
2597 opts[num][0] = "-mfpmath=";
2598 switch ((int) fpmath)
2599 {
2600 case FPMATH_387:
2601 opts[num++][1] = "387";
2602 break;
2603
2604 case FPMATH_SSE:
2605 opts[num++][1] = "sse";
2606 break;
2607
2608 case FPMATH_387 | FPMATH_SSE:
2609 opts[num++][1] = "sse+387";
2610 break;
2611
2612 default:
2613 gcc_unreachable ();
2614 }
2615 }
2616
2617 /* Any options? */
2618 if (num == 0)
2619 return NULL;
2620
2621 gcc_assert (num < ARRAY_SIZE (opts));
2622
2623 /* Size the string. */
2624 len = 0;
2625 sep_len = (add_nl_p) ? 3 : 1;
2626 for (i = 0; i < num; i++)
2627 {
2628 len += sep_len;
2629 for (j = 0; j < 2; j++)
2630 if (opts[i][j])
2631 len += strlen (opts[i][j]);
2632 }
2633
2634 /* Build the string. */
2635 ret = ptr = (char *) xmalloc (len);
2636 line_len = 0;
2637
2638 for (i = 0; i < num; i++)
2639 {
2640 size_t len2[2];
2641
2642 for (j = 0; j < 2; j++)
2643 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2644
2645 if (i != 0)
2646 {
2647 *ptr++ = ' ';
2648 line_len++;
2649
2650 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2651 {
2652 *ptr++ = '\\';
2653 *ptr++ = '\n';
2654 line_len = 0;
2655 }
2656 }
2657
2658 for (j = 0; j < 2; j++)
2659 if (opts[i][j])
2660 {
2661 memcpy (ptr, opts[i][j], len2[j]);
2662 ptr += len2[j];
2663 line_len += len2[j];
2664 }
2665 }
2666
2667 *ptr = '\0';
2668 gcc_assert (ret + len >= ptr);
2669
2670 return ret;
2671 }
2672
2673 /* Return true if profiling code should be emitted before the prologue,
2674 and false otherwise.
2675 On x86 this is the case when -mfentry is in effect. */
2676 static bool
2677 ix86_profile_before_prologue (void)
2678 {
2679 return flag_fentry != 0;
2680 }
2681
2682 /* Function that is callable from the debugger to print the current
2683 options. */
2684 void ATTRIBUTE_UNUSED
2685 ix86_debug_options (void)
2686 {
2687 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2688 ix86_arch_string, ix86_tune_string,
2689 ix86_fpmath, true);
2690
2691 if (opts)
2692 {
2693 fprintf (stderr, "%s\n\n", opts);
2694 free (opts);
2695 }
2696 else
2697 fputs ("<no options>\n\n", stderr);
2698
2699 return;
2700 }
2701
2702 static const char *stringop_alg_names[] = {
2703 #define DEF_ENUM
2704 #define DEF_ALG(alg, name) #name,
2705 #include "stringop.def"
2706 #undef DEF_ENUM
2707 #undef DEF_ALG
2708 };
2709
2710 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
2711 The string has the following form (or is a comma-separated list of such entries):
2712
2713 strategy_alg:max_size:[align|noalign]
2714
2715 where the full size range for the strategy is either [0, max_size] or
2716 [min_size, max_size], in which min_size is the max_size + 1 of the
2717 preceding range. The last size range must have max_size == -1.
2718
2719 Examples:
2720
2721 1.
2722 -mmemcpy-strategy=libcall:-1:noalign
2723
2724 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
2725
2726
2727 2.
2728 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
2729
2730 This tells the compiler to use the following strategy for memset:
2731 1) when the expected size is in [1, 16], use the rep_8byte strategy;
2732 2) when the size is in [17, 2048], use vector_loop;
2733 3) when the size is > 2048, use libcall. */
2734
2735 struct stringop_size_range
2736 {
2737 int max;
2738 stringop_alg alg;
2739 bool noalign;
2740 };
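/* For instance, the second example above is parsed into three
   stringop_size_range entries: max 16 using the rep_8byte algorithm with
   noalign set, max 2048 using vector_loop with noalign clear, and max -1
   using libcall with noalign set.  */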
2741
2742 static void
2743 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
2744 {
2745 const struct stringop_algs *default_algs;
2746 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
2747 char *curr_range_str, *next_range_str;
2748 int i = 0, n = 0;
2749
2750 if (is_memset)
2751 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
2752 else
2753 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
2754
2755 curr_range_str = strategy_str;
2756
2757 do
2758 {
2759 int maxs;
2760 stringop_alg alg;
2761 char alg_name[128];
2762 char align[16];
2763 next_range_str = strchr (curr_range_str, ',');
2764 if (next_range_str)
2765 *next_range_str++ = '\0';
2766
2767 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
2768 alg_name, &maxs, align))
2769 {
2770 error ("wrong arg %s to option %s", curr_range_str,
2771 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2772 return;
2773 }
2774
2775 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
2776 {
2777 error ("size ranges of option %s should be increasing",
2778 is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
2779 return;
2780 }
2781
2782 for (i = 0; i < last_alg; i++)
2783 {
2784 if (!strcmp (alg_name, stringop_alg_names[i]))
2785 {
2786 alg = (stringop_alg) i;
2787 break;
2788 }
2789 }
2790
2791 if (i == last_alg)
2792 {
2793 error ("wrong stringop strategy name %s specified for option %s",
2794 alg_name,
2795 is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
2796 return;
2797 }
2798
2799 input_ranges[n].max = maxs;
2800 input_ranges[n].alg = alg;
2801 if (!strcmp (align, "align"))
2802 input_ranges[n].noalign = false;
2803 else if (!strcmp (align, "noalign"))
2804 input_ranges[n].noalign = true;
2805 else
2806 {
2807 error ("unknown alignment %s specified for option %s",
2808 align, is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
2809 return;
2810 }
2811 n++;
2812 curr_range_str = next_range_str;
2813 }
2814 while (curr_range_str);
2815
2816 if (input_ranges[n - 1].max != -1)
2817 {
2818 error ("the max value for the last size range should be -1"
2819 " for option %s",
2820 is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
2821 return;
2822 }
2823
2824 if (n > MAX_STRINGOP_ALGS)
2825 {
2826 error ("too many size ranges specified in option %s",
2827 is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
2828 return;
2829 }
2830
2831 /* Now override the default algs array. */
2832 for (i = 0; i < n; i++)
2833 {
2834 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
2835 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
2836 = input_ranges[i].alg;
2837 *const_cast<int *>(&default_algs->size[i].noalign)
2838 = input_ranges[i].noalign;
2839 }
2840 }
2841
2842 \f
2843 /* Parse the -mtune-ctrl= option. When DUMP is true,
2844 print the features that are explicitly set. */
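/* The option value is a comma-separated list of feature names from
   x86-tune.def; a name prefixed with '^' clears that feature instead of
   setting it.  */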
2845
2846 static void
2847 parse_mtune_ctrl_str (bool dump)
2848 {
2849 if (!ix86_tune_ctrl_string)
2850 return;
2851
2852 char *next_feature_string = NULL;
2853 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
2854 char *orig = curr_feature_string;
2855 int i;
2856 do
2857 {
2858 bool clear = false;
2859
2860 next_feature_string = strchr (curr_feature_string, ',');
2861 if (next_feature_string)
2862 *next_feature_string++ = '\0';
2863 if (*curr_feature_string == '^')
2864 {
2865 curr_feature_string++;
2866 clear = true;
2867 }
2868 for (i = 0; i < X86_TUNE_LAST; i++)
2869 {
2870 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
2871 {
2872 ix86_tune_features[i] = !clear;
2873 if (dump)
2874 fprintf (stderr, "Explicitly %s feature %s\n",
2875 clear ? "clear" : "set", ix86_tune_feature_names[i]);
2876 break;
2877 }
2878 }
2879 if (i == X86_TUNE_LAST)
2880 error ("Unknown parameter to option -mtune-ctrl: %s",
2881 clear ? curr_feature_string - 1 : curr_feature_string);
2882 curr_feature_string = next_feature_string;
2883 }
2884 while (curr_feature_string);
2885 free (orig);
2886 }
2887
2888 /* Helper function to set ix86_tune_features. IX86_TUNE is the
2889 processor type. */
2890
2891 static void
2892 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
2893 {
2894 unsigned int ix86_tune_mask = 1u << ix86_tune;
2895 int i;
2896
2897 for (i = 0; i < X86_TUNE_LAST; ++i)
2898 {
2899 if (ix86_tune_no_default)
2900 ix86_tune_features[i] = 0;
2901 else
2902 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
2903 }
2904
2905 if (dump)
2906 {
2907 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
2908 for (i = 0; i < X86_TUNE_LAST; i++)
2909 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
2910 ix86_tune_features[i] ? "on" : "off");
2911 }
2912
2913 parse_mtune_ctrl_str (dump);
2914 }
2915
2916
2917 /* Override various settings based on options. If MAIN_ARGS_P, the
2918 options are from the command line, otherwise they are from
2919 attributes. */
2920
2921 static void
2922 ix86_option_override_internal (bool main_args_p,
2923 struct gcc_options *opts,
2924 struct gcc_options *opts_set)
2925 {
2926 int i;
2927 unsigned int ix86_arch_mask;
2928 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
2929 const char *prefix;
2930 const char *suffix;
2931 const char *sw;
2932
2933 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2934 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2935 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2936 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2937 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2938 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2939 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2940 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2941 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2942 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2943 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2944 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2945 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2946 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2947 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2948 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2949 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2950 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2951 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2952 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2953 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2954 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2955 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2956 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2957 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2958 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2959 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2960 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2961 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2962 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2963 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2964 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2965 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2966 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2967 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2968 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2969 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2970 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2971 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2972 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2973 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
2974 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
2975 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
2976 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
2977 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
2978
2979 /* If this reaches 64, we need to widen the flags field of struct pta below. */
2980
2981 static struct pta
2982 {
2983 const char *const name; /* processor name or nickname. */
2984 const enum processor_type processor;
2985 const enum attr_cpu schedule;
2986 const unsigned HOST_WIDE_INT flags;
2987 }
2988 const processor_alias_table[] =
2989 {
2990 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2991 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2992 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2993 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2994 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2995 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2996 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2997 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2998 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2999 PTA_MMX | PTA_SSE | PTA_FXSR},
3000 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3001 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3002 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3003 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3004 PTA_MMX | PTA_SSE | PTA_FXSR},
3005 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3006 PTA_MMX | PTA_SSE | PTA_FXSR},
3007 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3008 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3009 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3010 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
3011 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3012 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3013 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3014 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3015 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3016 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3017 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3018 {"core2", PROCESSOR_CORE2, CPU_CORE2,
3019 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3020 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
3021 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
3022 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
3023 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR},
3024 {"corei7-avx", PROCESSOR_COREI7_AVX, CPU_COREI7,
3025 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3026 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3027 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
3028 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3029 {"core-avx-i", PROCESSOR_COREI7_AVX, CPU_COREI7,
3030 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3031 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3032 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3033 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3034 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
3035 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3036 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3037 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3038 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3039 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
3040 | PTA_XSAVEOPT},
3041 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3042 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3043 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
3044 {"slm", PROCESSOR_SLM, CPU_SLM,
3045 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3046 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_MOVBE
3047 | PTA_FXSR},
3048 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3049 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3050 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3051 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3052 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3053 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3054 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3055 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3056 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3057 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3058 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3059 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3060 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3061 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3062 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3063 {"x86-64", PROCESSOR_K8, CPU_K8,
3064 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3065 {"k8", PROCESSOR_K8, CPU_K8,
3066 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3067 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3068 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3069 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3070 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3071 {"opteron", PROCESSOR_K8, CPU_K8,
3072 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3073 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3074 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3075 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3076 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3077 {"athlon64", PROCESSOR_K8, CPU_K8,
3078 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3079 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3080 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3081 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3082 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3083 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3084 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3085 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3086 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3087 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3088 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3089 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3090 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3091 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3092 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3093 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3094 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3095 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3096 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3097 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3098 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3099 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3100 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3101 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3102 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3103 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3104 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3105 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3106 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3107 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3108 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3109 | PTA_XSAVEOPT | PTA_FSGSBASE},
3110 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3111 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3112 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3113 | PTA_FXSR | PTA_XSAVE},
3114 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3115 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3116 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3117 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3118 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3119 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3120
3121 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3122 PTA_64BIT
3123 | PTA_HLE /* flags are only used for -march switch. */ },
3124 };
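/* Illustrative sketch (not part of GCC): how an -march=/-mtune= name is
   resolved against the table above.  The helper below is hypothetical;
   the real lookups are open-coded loops in ix86_option_override_internal
   further down in this file.  */
#if 0
static const struct pta *
find_processor_alias (const char *name)
{
  unsigned int i;
  for (i = 0; i < ARRAY_SIZE (processor_alias_table); i++)
    if (! strcmp (name, processor_alias_table[i].name))
      return &processor_alias_table[i];
  return NULL;  /* Unknown name; the caller reports a bad value.  */
}
#endif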
3125
3126 /* -mrecip options. */
3127 static struct
3128 {
3129 const char *string; /* option name */
3130 unsigned int mask; /* mask bits to set */
3131 }
3132 const recip_options[] =
3133 {
3134 { "all", RECIP_MASK_ALL },
3135 { "none", RECIP_MASK_NONE },
3136 { "div", RECIP_MASK_DIV },
3137 { "sqrt", RECIP_MASK_SQRT },
3138 { "vec-div", RECIP_MASK_VEC_DIV },
3139 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3140 };
3141
3142 int const pta_size = ARRAY_SIZE (processor_alias_table);
3143
3144 /* Set up prefix/suffix so the error messages refer to either the command
3145 line argument, or the attribute(target). */
3146 if (main_args_p)
3147 {
3148 prefix = "-m";
3149 suffix = "";
3150 sw = "switch";
3151 }
3152 else
3153 {
3154 prefix = "option(\"";
3155 suffix = "\")";
3156 sw = "attribute";
3157 }
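/* Worked example (illustrative): with main_args_p set, a call such as
     error ("bad value (%s) for %stune=%s %s", "foo", prefix, suffix, sw);
   prints "bad value (foo) for -mtune= switch", while for the
   attribute(target(...)) form it prints
   "bad value (foo) for option("tune=") attribute".  */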
3158
3159 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3160 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3161 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3162 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3163 #ifdef TARGET_BI_ARCH
3164 else
3165 {
3166 #if TARGET_BI_ARCH == 1
3167 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3168 is on and OPTION_MASK_ABI_X32 is off. We turn off
3169 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3170 -mx32. */
3171 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3172 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3173 #else
3174 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3175 on and OPTION_MASK_ABI_64 is off. We turn off
3176 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3177 -m64. */
3178 if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3179 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3180 #endif
3181 }
3182 #endif
3183
3184 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3185 {
3186 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3187 OPTION_MASK_ABI_64 for TARGET_X32. */
3188 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3189 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3190 }
3191 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3192 {
3193 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3194 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3195 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3196 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3197 }
3198
3199 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3200 SUBTARGET_OVERRIDE_OPTIONS;
3201 #endif
3202
3203 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3204 SUBSUBTARGET_OVERRIDE_OPTIONS;
3205 #endif
3206
3207 /* -fPIC is the default for x86_64. */
3208 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3209 opts->x_flag_pic = 2;
3210
3211 /* Need to check -mtune=generic first. */
3212 if (opts->x_ix86_tune_string)
3213 {
3214 if (!strcmp (opts->x_ix86_tune_string, "generic")
3215 || !strcmp (opts->x_ix86_tune_string, "i686")
3216 /* As special support for cross compilers we read -mtune=native
3217 as -mtune=generic. With native compilers we never see
3218 -mtune=native here, as the driver has already rewritten it. */
3219 || !strcmp (opts->x_ix86_tune_string, "native"))
3220 {
3221 opts->x_ix86_tune_string = "generic";
3222 }
3223 /* If this call is for setting the option attribute, allow the
3224 generic that was previously set. */
3225 else if (!main_args_p
3226 && !strcmp (opts->x_ix86_tune_string, "generic"))
3227 ;
3228 else if (!strncmp (opts->x_ix86_tune_string, "generic", 7))
3229 error ("bad value (%s) for %stune=%s %s",
3230 opts->x_ix86_tune_string, prefix, suffix, sw);
3231 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3232 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3233 "%stune=k8%s or %stune=generic%s instead as appropriate",
3234 prefix, suffix, prefix, suffix, prefix, suffix);
3235 }
3236 else
3237 {
3238 if (opts->x_ix86_arch_string)
3239 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3240 if (!opts->x_ix86_tune_string)
3241 {
3242 opts->x_ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3243 ix86_tune_defaulted = 1;
3244 }
3245
3246 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3247 or defaulted. We need to use a sensible tune option. */
3248 if (!strcmp (opts->x_ix86_tune_string, "generic")
3249 || !strcmp (opts->x_ix86_tune_string, "x86-64")
3250 || !strcmp (opts->x_ix86_tune_string, "i686"))
3251 {
3252 opts->x_ix86_tune_string = "generic";
3253 }
3254 }
3255
3256 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3257 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3258 {
3259 /* rep; movq isn't available in 32-bit code. */
3260 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3261 opts->x_ix86_stringop_alg = no_stringop;
3262 }
3263
3264 if (!opts->x_ix86_arch_string)
3265 opts->x_ix86_arch_string
3266 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3267 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3268 else
3269 ix86_arch_specified = 1;
3270
3271 if (opts_set->x_ix86_pmode)
3272 {
3273 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3274 && opts->x_ix86_pmode == PMODE_SI)
3275 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3276 && opts->x_ix86_pmode == PMODE_DI))
3277 error ("address mode %qs not supported in the %s bit mode",
3278 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3279 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3280 }
3281 else
3282 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3283 ? PMODE_DI : PMODE_SI;
3284
3285 if (!opts_set->x_ix86_abi)
3286 opts->x_ix86_abi = DEFAULT_ABI;
3287
3288 /* For targets using the MS ABI, enable ms-extensions unless it was
3289 explicitly turned off. For non-MS ABIs we turn this option
3290 off. */
3291 if (!opts_set->x_flag_ms_extensions)
3292 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3293
3294 if (opts_set->x_ix86_cmodel)
3295 {
3296 switch (opts->x_ix86_cmodel)
3297 {
3298 case CM_SMALL:
3299 case CM_SMALL_PIC:
3300 if (opts->x_flag_pic)
3301 opts->x_ix86_cmodel = CM_SMALL_PIC;
3302 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3303 error ("code model %qs not supported in the %s bit mode",
3304 "small", "32");
3305 break;
3306
3307 case CM_MEDIUM:
3308 case CM_MEDIUM_PIC:
3309 if (opts->x_flag_pic)
3310 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3311 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3312 error ("code model %qs not supported in the %s bit mode",
3313 "medium", "32");
3314 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3315 error ("code model %qs not supported in x32 mode",
3316 "medium");
3317 break;
3318
3319 case CM_LARGE:
3320 case CM_LARGE_PIC:
3321 if (opts->x_flag_pic)
3322 opts->x_ix86_cmodel = CM_LARGE_PIC;
3323 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3324 error ("code model %qs not supported in the %s bit mode",
3325 "large", "32");
3326 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3327 error ("code model %qs not supported in x32 mode",
3328 "large");
3329 break;
3330
3331 case CM_32:
3332 if (opts->x_flag_pic)
3333 error ("code model %s does not support PIC mode", "32");
3334 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3335 error ("code model %qs not supported in the %s bit mode",
3336 "32", "64");
3337 break;
3338
3339 case CM_KERNEL:
3340 if (opts->x_flag_pic)
3341 {
3342 error ("code model %s does not support PIC mode", "kernel");
3343 opts->x_ix86_cmodel = CM_32;
3344 }
3345 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3346 error ("code model %qs not supported in the %s bit mode",
3347 "kernel", "32");
3348 break;
3349
3350 default:
3351 gcc_unreachable ();
3352 }
3353 }
3354 else
3355 {
3356 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3357 use of rip-relative addressing. This eliminates fixups that
3358 would otherwise be needed if this object is to be placed in a
3359 DLL, and is essentially just as efficient as direct addressing. */
3360 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3361 && (TARGET_RDOS || TARGET_PECOFF))
3362 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3363 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3364 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3365 else
3366 opts->x_ix86_cmodel = CM_32;
3367 }
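/* For example (following the defaults above): a plain 64-bit compile gets
   CM_SMALL, "-m64 -fPIC" gets CM_SMALL_PIC, 64-bit RDOS/PE-COFF targets get
   CM_MEDIUM_PIC with PIC forced on, and any 32-bit compile gets CM_32.  */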
3368 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3369 {
3370 error ("-masm=intel not supported in this configuration");
3371 opts->x_ix86_asm_dialect = ASM_ATT;
3372 }
3373 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3374 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3375 sorry ("%i-bit mode not compiled in",
3376 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3377
3378 for (i = 0; i < pta_size; i++)
3379 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3380 {
3381 ix86_schedule = processor_alias_table[i].schedule;
3382 ix86_arch = processor_alias_table[i].processor;
3383 /* Default cpu tuning to the architecture. */
3384 ix86_tune = ix86_arch;
3385
3386 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3387 && !(processor_alias_table[i].flags & PTA_64BIT))
3388 error ("CPU you selected does not support x86-64 "
3389 "instruction set");
3390
3391 if (processor_alias_table[i].flags & PTA_MMX
3392 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3393 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3394 if (processor_alias_table[i].flags & PTA_3DNOW
3395 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3396 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3397 if (processor_alias_table[i].flags & PTA_3DNOW_A
3398 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3399 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3400 if (processor_alias_table[i].flags & PTA_SSE
3401 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3402 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3403 if (processor_alias_table[i].flags & PTA_SSE2
3404 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3405 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3406 if (processor_alias_table[i].flags & PTA_SSE3
3407 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3408 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3409 if (processor_alias_table[i].flags & PTA_SSSE3
3410 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3411 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3412 if (processor_alias_table[i].flags & PTA_SSE4_1
3413 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3414 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3415 if (processor_alias_table[i].flags & PTA_SSE4_2
3416 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3417 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3418 if (processor_alias_table[i].flags & PTA_AVX
3419 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3420 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3421 if (processor_alias_table[i].flags & PTA_AVX2
3422 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3423 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3424 if (processor_alias_table[i].flags & PTA_FMA
3425 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3426 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3427 if (processor_alias_table[i].flags & PTA_SSE4A
3428 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3429 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3430 if (processor_alias_table[i].flags & PTA_FMA4
3431 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3432 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3433 if (processor_alias_table[i].flags & PTA_XOP
3434 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3435 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3436 if (processor_alias_table[i].flags & PTA_LWP
3437 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3438 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3439 if (processor_alias_table[i].flags & PTA_ABM
3440 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3441 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3442 if (processor_alias_table[i].flags & PTA_BMI
3443 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3444 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3445 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3446 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3447 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3448 if (processor_alias_table[i].flags & PTA_TBM
3449 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3450 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3451 if (processor_alias_table[i].flags & PTA_BMI2
3452 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3453 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3454 if (processor_alias_table[i].flags & PTA_CX16
3455 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3456 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3457 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3458 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3459 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3460 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
3461 && (processor_alias_table[i].flags & PTA_NO_SAHF))
3462 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3463 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3464 if (processor_alias_table[i].flags & PTA_MOVBE
3465 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3466 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3467 if (processor_alias_table[i].flags & PTA_AES
3468 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3469 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
3470 if (processor_alias_table[i].flags & PTA_PCLMUL
3471 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3472 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3473 if (processor_alias_table[i].flags & PTA_FSGSBASE
3474 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3475 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3476 if (processor_alias_table[i].flags & PTA_RDRND
3477 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3478 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3479 if (processor_alias_table[i].flags & PTA_F16C
3480 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3481 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3482 if (processor_alias_table[i].flags & PTA_RTM
3483 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3484 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3485 if (processor_alias_table[i].flags & PTA_HLE
3486 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3487 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3488 if (processor_alias_table[i].flags & PTA_PRFCHW
3489 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3490 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3491 if (processor_alias_table[i].flags & PTA_RDSEED
3492 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3493 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3494 if (processor_alias_table[i].flags & PTA_ADX
3495 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3496 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3497 if (processor_alias_table[i].flags & PTA_FXSR
3498 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3499 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3500 if (processor_alias_table[i].flags & PTA_XSAVE
3501 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3502 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3503 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3504 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3505 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3506 if (processor_alias_table[i].flags & PTA_AVX512F
3507 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
3508 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
3509 if (processor_alias_table[i].flags & PTA_AVX512ER
3510 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
3511 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
3512 if (processor_alias_table[i].flags & PTA_AVX512PF
3513 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
3514 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
3515 if (processor_alias_table[i].flags & PTA_AVX512CD
3516 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
3517 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
3518 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3519 x86_prefetch_sse = true;
3520
3521 break;
3522 }
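/* Worked example (illustrative): with "-march=corei7 -mno-sse4.2", the
   corei7 table entry carries PTA_SSE4_2, but the explicit -mno-sse4.2 has
   already set OPTION_MASK_ISA_SSE4_2 in x_ix86_isa_flags_explicit, so the
   guards above leave SSE4.2 disabled while the other corei7 ISA bits are
   still added.  */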
3523
3524 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3525 error ("generic CPU can be used only for %stune=%s %s",
3526 prefix, suffix, sw);
3527 else if (!strncmp (opts->x_ix86_arch_string, "generic", 7) || i == pta_size)
3528 error ("bad value (%s) for %sarch=%s %s",
3529 opts->x_ix86_arch_string, prefix, suffix, sw);
3530
3531 ix86_arch_mask = 1u << ix86_arch;
3532 for (i = 0; i < X86_ARCH_LAST; ++i)
3533 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3534
3535 for (i = 0; i < pta_size; i++)
3536 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
3537 {
3538 ix86_schedule = processor_alias_table[i].schedule;
3539 ix86_tune = processor_alias_table[i].processor;
3540 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3541 {
3542 if (!(processor_alias_table[i].flags & PTA_64BIT))
3543 {
3544 if (ix86_tune_defaulted)
3545 {
3546 opts->x_ix86_tune_string = "x86-64";
3547 for (i = 0; i < pta_size; i++)
3548 if (! strcmp (opts->x_ix86_tune_string,
3549 processor_alias_table[i].name))
3550 break;
3551 ix86_schedule = processor_alias_table[i].schedule;
3552 ix86_tune = processor_alias_table[i].processor;
3553 }
3554 else
3555 error ("CPU you selected does not support x86-64 "
3556 "instruction set");
3557 }
3558 }
3559 /* Intel CPUs have always interpreted SSE prefetch instructions as
3560 NOPs; so, we can enable SSE prefetch instructions even when
3561 -mtune (rather than -march) points us to a processor that has them.
3562 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3563 higher processors. */
3564 if (TARGET_CMOV
3565 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3566 x86_prefetch_sse = true;
3567 break;
3568 }
3569
3570 if (ix86_tune_specified && i == pta_size)
3571 error ("bad value (%s) for %stune=%s %s",
3572 opts->x_ix86_tune_string, prefix, suffix, sw);
3573
3574 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
3575
3576 #ifndef USE_IX86_FRAME_POINTER
3577 #define USE_IX86_FRAME_POINTER 0
3578 #endif
3579
3580 #ifndef USE_X86_64_FRAME_POINTER
3581 #define USE_X86_64_FRAME_POINTER 0
3582 #endif
3583
3584 /* Set the default values for switches whose default depends on TARGET_64BIT
3585 in case they weren't overwritten by command line options. */
3586 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3587 {
3588 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
3589 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3590 if (opts->x_flag_asynchronous_unwind_tables == 2)
3591 opts->x_flag_unwind_tables
3592 = opts->x_flag_asynchronous_unwind_tables = 1;
3593 if (opts->x_flag_pcc_struct_return == 2)
3594 opts->x_flag_pcc_struct_return = 0;
3595 }
3596 else
3597 {
3598 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
3599 opts->x_flag_omit_frame_pointer
3600 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
3601 if (opts->x_flag_asynchronous_unwind_tables == 2)
3602 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3603 if (opts->x_flag_pcc_struct_return == 2)
3604 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3605 }
3606
3607 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3608 if (opts->x_optimize_size)
3609 ix86_cost = &ix86_size_cost;
3610 else
3611 ix86_cost = ix86_tune_cost;
3612
3613 /* Arrange to set up i386_stack_locals for all functions. */
3614 init_machine_status = ix86_init_machine_status;
3615
3616 /* Validate -mregparm= value. */
3617 if (opts_set->x_ix86_regparm)
3618 {
3619 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3620 warning (0, "-mregparm is ignored in 64-bit mode");
3621 if (opts->x_ix86_regparm > REGPARM_MAX)
3622 {
3623 error ("-mregparm=%d is not between 0 and %d",
3624 opts->x_ix86_regparm, REGPARM_MAX);
3625 opts->x_ix86_regparm = 0;
3626 }
3627 }
3628 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3629 opts->x_ix86_regparm = REGPARM_MAX;
3630
3631 /* Default align_* from the processor table. */
3632 if (opts->x_align_loops == 0)
3633 {
3634 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3635 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3636 }
3637 if (opts->x_align_jumps == 0)
3638 {
3639 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3640 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3641 }
3642 if (opts->x_align_functions == 0)
3643 {
3644 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3645 }
3646
3647 /* Provide default for -mbranch-cost= value. */
3648 if (!opts_set->x_ix86_branch_cost)
3649 opts->x_ix86_branch_cost = ix86_cost->branch_cost;
3650
3651 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3652 {
3653 opts->x_target_flags
3654 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
3655
3656 /* Enable by default the SSE and MMX builtins. Do allow the user to
3657 explicitly disable any of these. In particular, disabling SSE and
3658 MMX for kernel code is extremely useful. */
3659 if (!ix86_arch_specified)
3660 opts->x_ix86_isa_flags
3661 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3662 | TARGET_SUBTARGET64_ISA_DEFAULT)
3663 & ~opts->x_ix86_isa_flags_explicit);
3664
3665 if (TARGET_RTD_P (opts->x_target_flags))
3666 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3667 }
3668 else
3669 {
3670 opts->x_target_flags
3671 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
3672
3673 if (!ix86_arch_specified)
3674 opts->x_ix86_isa_flags
3675 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
3676
3677 /* The i386 ABI does not specify a red zone. It still makes sense to use
3678 one when the programmer takes care to keep the stack from being destroyed. */
3679 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
3680 opts->x_target_flags |= MASK_NO_RED_ZONE;
3681 }
3682
3683 /* Keep nonleaf frame pointers. */
3684 if (opts->x_flag_omit_frame_pointer)
3685 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3686 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
3687 opts->x_flag_omit_frame_pointer = 1;
3688
3689 /* If we're doing fast math, we don't care about comparison order
3690 wrt NaNs. This lets us use a shorter comparison sequence. */
3691 if (opts->x_flag_finite_math_only)
3692 opts->x_target_flags &= ~MASK_IEEE_FP;
3693
3694 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3695 since the insns won't need emulation. */
3696 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
3697 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
3698
3699 /* Likewise, if the target doesn't have a 387, or we've specified
3700 software floating point, don't use 387 inline intrinsics. */
3701 if (!TARGET_80387_P (opts->x_target_flags))
3702 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
3703
3704 /* Turn on MMX builtins for -msse. */
3705 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
3706 opts->x_ix86_isa_flags
3707 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
3708
3709 /* Enable SSE prefetch. */
3710 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
3711 || (TARGET_PRFCHW && !TARGET_3DNOW_P (opts->x_ix86_isa_flags)))
3712 x86_prefetch_sse = true;
3713
3714 /* Enable prefetch{,w} instructions for -m3dnow. */
3715 if (TARGET_3DNOW_P (opts->x_ix86_isa_flags))
3716 opts->x_ix86_isa_flags
3717 |= OPTION_MASK_ISA_PRFCHW & ~opts->x_ix86_isa_flags_explicit;
3718
3719 /* Enable popcnt instruction for -msse4.2 or -mabm. */
3720 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
3721 || TARGET_ABM_P (opts->x_ix86_isa_flags))
3722 opts->x_ix86_isa_flags
3723 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
3724
3725 /* Enable lzcnt instruction for -mabm. */
3726 if (TARGET_ABM_P (opts->x_ix86_isa_flags))
3727 opts->x_ix86_isa_flags
3728 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
3729
3730 /* Validate -mpreferred-stack-boundary= value or default it to
3731 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3732 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3733 if (opts_set->x_ix86_preferred_stack_boundary_arg)
3734 {
3735 int min = (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3736 ? (TARGET_SSE_P (opts->x_ix86_isa_flags) ? 4 : 3) : 2);
3737 int max = (TARGET_SEH ? 4 : 12);
3738
3739 if (opts->x_ix86_preferred_stack_boundary_arg < min
3740 || opts->x_ix86_preferred_stack_boundary_arg > max)
3741 {
3742 if (min == max)
3743 error ("-mpreferred-stack-boundary is not supported "
3744 "for this target");
3745 else
3746 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3747 opts->x_ix86_preferred_stack_boundary_arg, min, max);
3748 }
3749 else
3750 ix86_preferred_stack_boundary
3751 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3752 }
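/* Worked example: "-mpreferred-stack-boundary=4" passes the range check
   above and yields (1 << 4) * BITS_PER_UNIT = 16 * 8 = 128 bits, i.e. a
   16-byte-aligned stack; the 64-bit SSE minimum of 4 thus corresponds to
   the usual psABI 16-byte alignment.  */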
3753
3754 /* Set the default value for -mstackrealign. */
3755 if (opts->x_ix86_force_align_arg_pointer == -1)
3756 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3757
3758 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3759
3760 /* Validate -mincoming-stack-boundary= value or default it to
3761 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3762 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3763 if (opts_set->x_ix86_incoming_stack_boundary_arg)
3764 {
3765 if (ix86_incoming_stack_boundary_arg
3766 < (TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2)
3767 || ix86_incoming_stack_boundary_arg > 12)
3768 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3769 ix86_incoming_stack_boundary_arg,
3770 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2);
3771 else
3772 {
3773 ix86_user_incoming_stack_boundary
3774 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3775 ix86_incoming_stack_boundary
3776 = ix86_user_incoming_stack_boundary;
3777 }
3778 }
3779
3780 /* Accept -msseregparm only if at least SSE support is enabled. */
3781 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
3782 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
3783 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3784
3785 if (opts_set->x_ix86_fpmath)
3786 {
3787 if (opts->x_ix86_fpmath & FPMATH_SSE)
3788 {
3789 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
3790 {
3791 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3792 opts->x_ix86_fpmath = FPMATH_387;
3793 }
3794 else if ((opts->x_ix86_fpmath & FPMATH_387)
3795 && !TARGET_80387_P (opts->x_target_flags))
3796 {
3797 warning (0, "387 instruction set disabled, using SSE arithmetics");
3798 opts->x_ix86_fpmath = FPMATH_SSE;
3799 }
3800 }
3801 }
3802 /* For all chips supporting SSE2, -mfpmath=sse performs better than
3803 -mfpmath=387. The latter is nevertheless the default on many targets,
3804 since the extra 80-bit precision of temporaries is considered part of
3805 the ABI. Override the default at least for -ffast-math.
3806 TODO: -mfpmath=both seems to produce equally fast code with slightly
3807 smaller binaries. It is however not clear whether register allocation
3808 is ready for this setting.
3809 Also, -mfpmath=387 codegen is overall a lot more compact (about 4-5%)
3810 than SSE codegen. We may switch to 387 with -ffast-math for
3811 size-optimized functions. */
3812 else if (fast_math_flags_set_p (&global_options)
3813 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
3814 ix86_fpmath = FPMATH_SSE;
3815 else
3816 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
3817
3818 /* If the i387 is disabled, then do not return values in it. */
3819 if (!TARGET_80387_P (opts->x_target_flags))
3820 opts->x_target_flags &= ~MASK_FLOAT_RETURNS;
3821
3822 /* Use external vectorized library in vectorizing intrinsics. */
3823 if (opts_set->x_ix86_veclibabi_type)
3824 switch (opts->x_ix86_veclibabi_type)
3825 {
3826 case ix86_veclibabi_type_svml:
3827 ix86_veclib_handler = ix86_veclibabi_svml;
3828 break;
3829
3830 case ix86_veclibabi_type_acml:
3831 ix86_veclib_handler = ix86_veclibabi_acml;
3832 break;
3833
3834 default:
3835 gcc_unreachable ();
3836 }
3837
3838 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
3839 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
3840 && !opts->x_optimize_size)
3841 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3842
3843 /* If stack probes are required, the space used for large function
3844 arguments on the stack must also be probed, so enable
3845 -maccumulate-outgoing-args so this happens in the prologue. */
3846 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
3847 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3848 {
3849 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
3850 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3851 "for correctness", prefix, suffix);
3852 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3853 }
3854
3855 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3856 {
3857 char *p;
3858 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3859 p = strchr (internal_label_prefix, 'X');
3860 internal_label_prefix_len = p - internal_label_prefix;
3861 *p = '\0';
3862 }
3863
3864 /* When no scheduling description is available, disable the scheduler
3865 passes so they do not slow down compilation or make x87 code slower. */
3866 if (!TARGET_SCHEDULE)
3867 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
3868
3869 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3870 ix86_tune_cost->simultaneous_prefetches,
3871 opts->x_param_values,
3872 opts_set->x_param_values);
3873 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3874 ix86_tune_cost->prefetch_block,
3875 opts->x_param_values,
3876 opts_set->x_param_values);
3877 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3878 ix86_tune_cost->l1_cache_size,
3879 opts->x_param_values,
3880 opts_set->x_param_values);
3881 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3882 ix86_tune_cost->l2_cache_size,
3883 opts->x_param_values,
3884 opts_set->x_param_values);
3885
3886 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3887 if (opts->x_flag_prefetch_loop_arrays < 0
3888 && HAVE_prefetch
3889 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
3890 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3891 opts->x_flag_prefetch_loop_arrays = 1;
3892
3893 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3894 can be optimized to ap = __builtin_next_arg (0). */
3895 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
3896 targetm.expand_builtin_va_start = NULL;
3897
3898 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3899 {
3900 ix86_gen_leave = gen_leave_rex64;
3901 if (Pmode == DImode)
3902 {
3903 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3904 ix86_gen_tls_local_dynamic_base_64
3905 = gen_tls_local_dynamic_base_64_di;
3906 }
3907 else
3908 {
3909 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3910 ix86_gen_tls_local_dynamic_base_64
3911 = gen_tls_local_dynamic_base_64_si;
3912 }
3913 }
3914 else
3915 ix86_gen_leave = gen_leave;
3916
3917 if (Pmode == DImode)
3918 {
3919 ix86_gen_add3 = gen_adddi3;
3920 ix86_gen_sub3 = gen_subdi3;
3921 ix86_gen_sub3_carry = gen_subdi3_carry;
3922 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3923 ix86_gen_andsp = gen_anddi3;
3924 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3925 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3926 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3927 ix86_gen_monitor = gen_sse3_monitor_di;
3928 }
3929 else
3930 {
3931 ix86_gen_add3 = gen_addsi3;
3932 ix86_gen_sub3 = gen_subsi3;
3933 ix86_gen_sub3_carry = gen_subsi3_carry;
3934 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3935 ix86_gen_andsp = gen_andsi3;
3936 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3937 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3938 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3939 ix86_gen_monitor = gen_sse3_monitor_si;
3940 }
3941
3942 #ifdef USE_IX86_CLD
3943 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3944 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3945 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
3946 #endif
3947
3948 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
3949 {
3950 if (opts->x_flag_fentry > 0)
3951 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3952 "with -fpic");
3953 opts->x_flag_fentry = 0;
3954 }
3955 else if (TARGET_SEH)
3956 {
3957 if (opts->x_flag_fentry == 0)
3958 sorry ("-mno-fentry isn%'t compatible with SEH");
3959 opts->x_flag_fentry = 1;
3960 }
3961 else if (opts->x_flag_fentry < 0)
3962 {
3963 #if defined(PROFILE_BEFORE_PROLOGUE)
3964 opts->x_flag_fentry = 1;
3965 #else
3966 opts->x_flag_fentry = 0;
3967 #endif
3968 }
3969
3970 /* When not optimizing for size, enable the vzeroupper optimization for
3971 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3972 AVX unaligned loads/stores. */
3973 if (!opts->x_optimize_size)
3974 {
3975 if (flag_expensive_optimizations
3976 && !(opts_set->x_target_flags & MASK_VZEROUPPER))
3977 opts->x_target_flags |= MASK_VZEROUPPER;
3978 if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL]
3979 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3980 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3981 if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL]
3982 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3983 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3984 /* Enable 128-bit AVX instruction generation
3985 for the auto-vectorizer. */
3986 if (TARGET_AVX128_OPTIMAL
3987 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
3988 opts->x_target_flags |= MASK_PREFER_AVX128;
3989 }
3990
3991 if (opts->x_ix86_recip_name)
3992 {
3993 char *p = ASTRDUP (opts->x_ix86_recip_name);
3994 char *q;
3995 unsigned int mask, i;
3996 bool invert;
3997
3998 while ((q = strtok (p, ",")) != NULL)
3999 {
4000 p = NULL;
4001 if (*q == '!')
4002 {
4003 invert = true;
4004 q++;
4005 }
4006 else
4007 invert = false;
4008
4009 if (!strcmp (q, "default"))
4010 mask = RECIP_MASK_ALL;
4011 else
4012 {
4013 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4014 if (!strcmp (q, recip_options[i].string))
4015 {
4016 mask = recip_options[i].mask;
4017 break;
4018 }
4019
4020 if (i == ARRAY_SIZE (recip_options))
4021 {
4022 error ("unknown option for -mrecip=%s", q);
4023 invert = false;
4024 mask = RECIP_MASK_NONE;
4025 }
4026 }
4027
4028 opts->x_recip_mask_explicit |= mask;
4029 if (invert)
4030 opts->x_recip_mask &= ~mask;
4031 else
4032 opts->x_recip_mask |= mask;
4033 }
4034 }
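/* Worked example (illustrative): "-mrecip=all,!sqrt" is tokenized above
   into "all" and "!sqrt"; the first sets every bit of RECIP_MASK_ALL, the
   second clears only RECIP_MASK_SQRT, so scalar square roots stay on the
   precise path while divisions and the vector variants use the reciprocal
   approximations.  */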
4035
4036 if (TARGET_RECIP_P (opts->x_target_flags))
4037 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4038 else if (opts_set->x_target_flags & MASK_RECIP)
4039 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
4040
4041 /* Default long double to 64-bit for Bionic. */
4042 if (TARGET_HAS_BIONIC
4043 && !(opts_set->x_target_flags & MASK_LONG_DOUBLE_64))
4044 opts->x_target_flags |= MASK_LONG_DOUBLE_64;
4045
4046 /* Save the initial options in case the user uses function-specific
4047 options later. */
4048 if (main_args_p)
4049 target_option_default_node = target_option_current_node
4050 = build_target_option_node (opts);
4051
4052 /* Handle stack protector */
4053 if (!opts_set->x_ix86_stack_protector_guard)
4054 opts->x_ix86_stack_protector_guard
4055 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4056
4057 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4058 if (opts->x_ix86_tune_memcpy_strategy)
4059 {
4060 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4061 ix86_parse_stringop_strategy_string (str, false);
4062 free (str);
4063 }
4064
4065 if (opts->x_ix86_tune_memset_strategy)
4066 {
4067 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4068 ix86_parse_stringop_strategy_string (str, true);
4069 free (str);
4070 }
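/* Illustrative example (see the GCC manual for the authoritative syntax):
   the strategy strings parsed above are comma-separated alg:max_size:align
   triplets, e.g.
     -mmemcpy-strategy=unrolled_loop:256:noalign,libcall:-1:noalign
   meaning "use an unrolled loop for copies up to 256 bytes, otherwise call
   the library".  */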
4071 }
4072
4073 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4074
4075 static void
4076 ix86_option_override (void)
4077 {
4078 opt_pass *pass_insert_vzeroupper = make_pass_insert_vzeroupper (g);
4079 static struct register_pass_info insert_vzeroupper_info
4080 = { pass_insert_vzeroupper, "reload",
4081 1, PASS_POS_INSERT_AFTER
4082 };
4083
4084 ix86_option_override_internal (true, &global_options, &global_options_set);
4085
4086
4087 /* This needs to be done at start up. It's convenient to do it here. */
4088 register_pass (&insert_vzeroupper_info);
4089 }
4090
4091 /* Update register usage after having seen the compiler flags. */
4092
4093 static void
4094 ix86_conditional_register_usage (void)
4095 {
4096 int i, c_mask;
4097 unsigned int j;
4098
4099 /* The PIC register, if it exists, is fixed. */
4100 j = PIC_OFFSET_TABLE_REGNUM;
4101 if (j != INVALID_REGNUM)
4102 fixed_regs[j] = call_used_regs[j] = 1;
4103
4104 /* For 32-bit targets, squash the REX registers. */
4105 if (! TARGET_64BIT)
4106 {
4107 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4108 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4109 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4110 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4111 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4112 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4113 }
4114
4115 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4116 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
4117 : TARGET_64BIT ? (1 << 2)
4118 : (1 << 1));
4119
4120 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4121
4122 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4123 {
4124 /* Set/reset conditionally defined registers from
4125 CALL_USED_REGISTERS initializer. */
4126 if (call_used_regs[i] > 1)
4127 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4128
4129 /* Build the CLOBBERED_REGS register set from the call-used
4130 registers of the GENERAL_REGS register set. */
4131 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4132 && call_used_regs[i])
4133 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4134 }
4135
4136 /* If MMX is disabled, squash the registers. */
4137 if (! TARGET_MMX)
4138 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4139 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4140 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4141
4142 /* If SSE is disabled, squash the registers. */
4143 if (! TARGET_SSE)
4144 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4145 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4146 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4147
4148 /* If the FPU is disabled, squash the registers. */
4149 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4150 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4151 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4152 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4153
4154 /* If AVX512F is disabled, squash the registers. */
4155 if (! TARGET_AVX512F)
4156 {
4157 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4158 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4159
4160 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4161 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4162 }
4163
4164 /* If MPX is disabled, squash the registers. */
4165 if (! TARGET_MPX)
4166 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
4167 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4168 }
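/* For example: c_mask above selects the conditional CALL_USED_REGISTERS
   entries, so a register such as %rsi or %rdi ends up call-clobbered for
   the 64-bit SysV ABI but preserved (callee-saved) for the 64-bit MS ABI.
   (Illustrative; the authoritative values live in the CALL_USED_REGISTERS
   initializer in i386.h.)  */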
4169
4170 \f
4171 /* Save the current options */
4172
4173 static void
4174 ix86_function_specific_save (struct cl_target_option *ptr,
4175 struct gcc_options *opts)
4176 {
4177 ptr->arch = ix86_arch;
4178 ptr->schedule = ix86_schedule;
4179 ptr->tune = ix86_tune;
4180 ptr->branch_cost = ix86_branch_cost;
4181 ptr->tune_defaulted = ix86_tune_defaulted;
4182 ptr->arch_specified = ix86_arch_specified;
4183 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
4184 ptr->x_ix86_target_flags_explicit = opts->x_ix86_target_flags_explicit;
4185 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
4186
4187 /* The fields are char but the variables are not; make sure the
4188 values fit in the fields. */
4189 gcc_assert (ptr->arch == ix86_arch);
4190 gcc_assert (ptr->schedule == ix86_schedule);
4191 gcc_assert (ptr->tune == ix86_tune);
4192 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4193 }
4194
4195 /* Restore the current options */
4196
4197 static void
4198 ix86_function_specific_restore (struct gcc_options *opts,
4199 struct cl_target_option *ptr)
4200 {
4201 enum processor_type old_tune = ix86_tune;
4202 enum processor_type old_arch = ix86_arch;
4203 unsigned int ix86_arch_mask;
4204 int i;
4205
4206 ix86_arch = (enum processor_type) ptr->arch;
4207 ix86_schedule = (enum attr_cpu) ptr->schedule;
4208 ix86_tune = (enum processor_type) ptr->tune;
4209 opts->x_ix86_branch_cost = ptr->branch_cost;
4210 ix86_tune_defaulted = ptr->tune_defaulted;
4211 ix86_arch_specified = ptr->arch_specified;
4212 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4213 opts->x_ix86_target_flags_explicit = ptr->x_ix86_target_flags_explicit;
4214 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
4215
4216 /* Recreate the arch feature tests if the arch changed */
4217 if (old_arch != ix86_arch)
4218 {
4219 ix86_arch_mask = 1u << ix86_arch;
4220 for (i = 0; i < X86_ARCH_LAST; ++i)
4221 ix86_arch_features[i]
4222 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4223 }
4224
4225 /* Recreate the tune optimization tests */
4226 if (old_tune != ix86_tune)
4227 set_ix86_tune_features (ix86_tune, false);
4228 }
4229
4230 /* Print the current options */
4231
4232 static void
4233 ix86_function_specific_print (FILE *file, int indent,
4234 struct cl_target_option *ptr)
4235 {
4236 char *target_string
4237 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4238 NULL, NULL, ptr->x_ix86_fpmath, false);
4239
4240 fprintf (file, "%*sarch = %d (%s)\n",
4241 indent, "",
4242 ptr->arch,
4243 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4244 ? cpu_names[ptr->arch]
4245 : "<unknown>"));
4246
4247 fprintf (file, "%*stune = %d (%s)\n",
4248 indent, "",
4249 ptr->tune,
4250 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4251 ? cpu_names[ptr->tune]
4252 : "<unknown>"));
4253
4254 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4255
4256 if (target_string)
4257 {
4258 fprintf (file, "%*s%s\n", indent, "", target_string);
4259 free (target_string);
4260 }
4261 }
4262
4263 \f
4264 /* Inner function to process attribute((target(...))): take an argument and
4265 set the current options from it. If the argument is a list, recursively
4266 process the list. */
4267
4268 static bool
4269 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4270 struct gcc_options *opts,
4271 struct gcc_options *opts_set,
4272 struct gcc_options *enum_opts_set)
4273 {
4274 char *next_optstr;
4275 bool ret = true;
4276
4277 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4278 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4279 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4280 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4281 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4282
4283 enum ix86_opt_type
4284 {
4285 ix86_opt_unknown,
4286 ix86_opt_yes,
4287 ix86_opt_no,
4288 ix86_opt_str,
4289 ix86_opt_enum,
4290 ix86_opt_isa
4291 };
4292
4293 static const struct
4294 {
4295 const char *string;
4296 size_t len;
4297 enum ix86_opt_type type;
4298 int opt;
4299 int mask;
4300 } attrs[] = {
4301 /* isa options */
4302 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4303 IX86_ATTR_ISA ("abm", OPT_mabm),
4304 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4305 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4306 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4307 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4308 IX86_ATTR_ISA ("aes", OPT_maes),
4309 IX86_ATTR_ISA ("avx", OPT_mavx),
4310 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4311 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
4312 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
4313 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
4314 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
4315 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4316 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4317 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4318 IX86_ATTR_ISA ("sse", OPT_msse),
4319 IX86_ATTR_ISA ("sse2", OPT_msse2),
4320 IX86_ATTR_ISA ("sse3", OPT_msse3),
4321 IX86_ATTR_ISA ("sse4", OPT_msse4),
4322 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4323 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4324 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4325 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4326 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4327 IX86_ATTR_ISA ("fma", OPT_mfma),
4328 IX86_ATTR_ISA ("xop", OPT_mxop),
4329 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4330 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4331 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4332 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4333 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4334 IX86_ATTR_ISA ("hle", OPT_mhle),
4335 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4336 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4337 IX86_ATTR_ISA ("adx", OPT_madx),
4338 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4339 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4340 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4341
4342 /* enum options */
4343 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4344
4345 /* string options */
4346 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4347 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4348
4349 /* flag options */
4350 IX86_ATTR_YES ("cld",
4351 OPT_mcld,
4352 MASK_CLD),
4353
4354 IX86_ATTR_NO ("fancy-math-387",
4355 OPT_mfancy_math_387,
4356 MASK_NO_FANCY_MATH_387),
4357
4358 IX86_ATTR_YES ("ieee-fp",
4359 OPT_mieee_fp,
4360 MASK_IEEE_FP),
4361
4362 IX86_ATTR_YES ("inline-all-stringops",
4363 OPT_minline_all_stringops,
4364 MASK_INLINE_ALL_STRINGOPS),
4365
4366 IX86_ATTR_YES ("inline-stringops-dynamically",
4367 OPT_minline_stringops_dynamically,
4368 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4369
4370 IX86_ATTR_NO ("align-stringops",
4371 OPT_mno_align_stringops,
4372 MASK_NO_ALIGN_STRINGOPS),
4373
4374 IX86_ATTR_YES ("recip",
4375 OPT_mrecip,
4376 MASK_RECIP),
4377
4378 };
4379
4380 /* If this is a list, recurse to get the options. */
4381 if (TREE_CODE (args) == TREE_LIST)
4382 {
4383 bool ret = true;
4384
4385 for (; args; args = TREE_CHAIN (args))
4386 if (TREE_VALUE (args)
4387 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4388 p_strings, opts, opts_set,
4389 enum_opts_set))
4390 ret = false;
4391
4392 return ret;
4393 }
4394
4395 else if (TREE_CODE (args) != STRING_CST)
4396 {
4397 error ("attribute %<target%> argument not a string");
4398 return false;
4399 }
4400
4401 /* Handle multiple arguments separated by commas. */
4402 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4403
4404 while (next_optstr && *next_optstr != '\0')
4405 {
4406 char *p = next_optstr;
4407 char *orig_p = p;
4408 char *comma = strchr (next_optstr, ',');
4409 const char *opt_string;
4410 size_t len, opt_len;
4411 int opt;
4412 bool opt_set_p;
4413 char ch;
4414 unsigned i;
4415 enum ix86_opt_type type = ix86_opt_unknown;
4416 int mask = 0;
4417
4418 if (comma)
4419 {
4420 *comma = '\0';
4421 len = comma - next_optstr;
4422 next_optstr = comma + 1;
4423 }
4424 else
4425 {
4426 len = strlen (p);
4427 next_optstr = NULL;
4428 }
4429
4430 /* Recognize no-xxx. */
4431 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4432 {
4433 opt_set_p = false;
4434 p += 3;
4435 len -= 3;
4436 }
4437 else
4438 opt_set_p = true;
4439
4440 /* Find the option. */
4441 ch = *p;
4442 opt = N_OPTS;
4443 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4444 {
4445 type = attrs[i].type;
4446 opt_len = attrs[i].len;
4447 if (ch == attrs[i].string[0]
4448 && ((type != ix86_opt_str && type != ix86_opt_enum)
4449 ? len == opt_len
4450 : len > opt_len)
4451 && memcmp (p, attrs[i].string, opt_len) == 0)
4452 {
4453 opt = attrs[i].opt;
4454 mask = attrs[i].mask;
4455 opt_string = attrs[i].string;
4456 break;
4457 }
4458 }
4459
4460 /* Process the option. */
4461 if (opt == N_OPTS)
4462 {
4463 error ("attribute(target(\"%s\")) is unknown", orig_p);
4464 ret = false;
4465 }
4466
4467 else if (type == ix86_opt_isa)
4468 {
4469 struct cl_decoded_option decoded;
4470
4471 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4472 ix86_handle_option (opts, opts_set,
4473 &decoded, input_location);
4474 }
4475
4476 else if (type == ix86_opt_yes || type == ix86_opt_no)
4477 {
4478 if (type == ix86_opt_no)
4479 opt_set_p = !opt_set_p;
4480
4481 if (opt_set_p)
4482 opts->x_target_flags |= mask;
4483 else
4484 opts->x_target_flags &= ~mask;
4485 }
4486
4487 else if (type == ix86_opt_str)
4488 {
4489 if (p_strings[opt])
4490 {
4491 error ("option(\"%s\") was already specified", opt_string);
4492 ret = false;
4493 }
4494 else
4495 p_strings[opt] = xstrdup (p + opt_len);
4496 }
4497
4498 else if (type == ix86_opt_enum)
4499 {
4500 bool arg_ok;
4501 int value;
4502
4503 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4504 if (arg_ok)
4505 set_option (opts, enum_opts_set, opt, value,
4506 p + opt_len, DK_UNSPECIFIED, input_location,
4507 global_dc);
4508 else
4509 {
4510 error ("attribute(target(\"%s\")) is unknown", orig_p);
4511 ret = false;
4512 }
4513 }
4514
4515 else
4516 gcc_unreachable ();
4517 }
4518
4519 return ret;
4520 }
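/* Illustrative usage (not part of this file): the strings recognized above
   are the same ones a user writes in attribute((target(...))).  The
   function below is hypothetical.  */
#if 0
__attribute__((target("avx,no-fma4,arch=core2,fpmath=sse")))
static void
saxpy (float *restrict y, const float *restrict x, float a, int n)
{
  int i;
  for (i = 0; i < n; i++)
    y[i] += a * x[i];
}
#endif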
4521
4522 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4523
4524 tree
4525 ix86_valid_target_attribute_tree (tree args,
4526 struct gcc_options *opts,
4527 struct gcc_options *opts_set)
4528 {
4529 const char *orig_arch_string = ix86_arch_string;
4530 const char *orig_tune_string = ix86_tune_string;
4531 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
4532 int orig_tune_defaulted = ix86_tune_defaulted;
4533 int orig_arch_specified = ix86_arch_specified;
4534 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4535 tree t = NULL_TREE;
4536 int i;
4537 struct cl_target_option *def
4538 = TREE_TARGET_OPTION (target_option_default_node);
4539 struct gcc_options enum_opts_set;
4540
4541 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4542
4543 /* Process each of the options on the chain. */
4544 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
4545 opts_set, &enum_opts_set))
4546 return error_mark_node;
4547
4548 /* If the changed options are different from the default, rerun
4549 ix86_option_override_internal, and then save the options away.
4550 The string options are attribute options, and will be undone
4551 when we copy the save structure. */
4552 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
4553 || opts->x_target_flags != def->x_target_flags
4554 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4555 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4556 || enum_opts_set.x_ix86_fpmath)
4557 {
4558 /* If we are using the default tune= or arch=, undo the string assigned,
4559 and use the default. */
4560 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4561 opts->x_ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4562 else if (!orig_arch_specified)
4563 opts->x_ix86_arch_string = NULL;
4564
4565 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4566 opts->x_ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4567 else if (orig_tune_defaulted)
4568 opts->x_ix86_tune_string = NULL;
4569
4570 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4571 if (enum_opts_set.x_ix86_fpmath)
4572 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
4573 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4574 && TARGET_SSE_P (opts->x_ix86_isa_flags))
4575 {
4576 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4577 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
4578 }
4579
4580 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4581 ix86_option_override_internal (false, opts, opts_set);
4582
4583 /* Add any builtin functions with the new isa if any. */
4584 ix86_add_new_builtins (opts->x_ix86_isa_flags);
4585
4586 /* Save the current options unless we are validating options for
4587 #pragma. */
4588 t = build_target_option_node (opts);
4589
4590 opts->x_ix86_arch_string = orig_arch_string;
4591 opts->x_ix86_tune_string = orig_tune_string;
4592 opts_set->x_ix86_fpmath = orig_fpmath_set;
4593
4594 /* Free up memory allocated to hold the strings */
4595 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4596 free (option_strings[i]);
4597 }
4598
4599 return t;
4600 }
4601
4602 /* Hook to validate attribute((target("string"))). */
4603
4604 static bool
4605 ix86_valid_target_attribute_p (tree fndecl,
4606 tree ARG_UNUSED (name),
4607 tree args,
4608 int ARG_UNUSED (flags))
4609 {
4610 struct gcc_options func_options;
4611 tree new_target, new_optimize;
4612 bool ret = true;
4613
4614 /* attribute((target("default"))) does nothing, beyond
4615 affecting multi-versioning. */
4616 if (TREE_VALUE (args)
4617 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4618 && TREE_CHAIN (args) == NULL_TREE
4619 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4620 return true;
4621
4622 tree old_optimize = build_optimization_node (&global_options);
4623
4624 /* Get the optimization options of the current function. */
4625 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4626
4627 if (!func_optimize)
4628 func_optimize = old_optimize;
4629
4630 /* Init func_options. */
4631 memset (&func_options, 0, sizeof (func_options));
4632 init_options_struct (&func_options, NULL);
4633 lang_hooks.init_options_struct (&func_options);
4634
4635 cl_optimization_restore (&func_options,
4636 TREE_OPTIMIZATION (func_optimize));
4637
4638 /* Initialize func_options to the default before its target options can
4639 be set. */
4640 cl_target_option_restore (&func_options,
4641 TREE_TARGET_OPTION (target_option_default_node));
4642
4643 new_target = ix86_valid_target_attribute_tree (args, &func_options,
4644 &global_options_set);
4645
4646 new_optimize = build_optimization_node (&func_options);
4647
4648 if (new_target == error_mark_node)
4649 ret = false;
4650
4651 else if (fndecl && new_target)
4652 {
4653 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4654
4655 if (old_optimize != new_optimize)
4656 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4657 }
4658
4659 return ret;
4660 }
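/* Illustrative usage sketch (hypothetical user code, not part of this file):
   the hook above validates strings such as the ones below and attaches the
   resulting option nodes to the FUNCTION_DECL.

     __attribute__ ((target ("sse4.2,popcnt")))
     int popcount_fast (unsigned int x) { return __builtin_popcount (x); }

     __attribute__ ((target ("default")))
     int popcount_fast (unsigned int x);

   The first declaration re-runs the option override machinery with SSE4.2
   and POPCNT enabled for that one function; the "default" variant is
   accepted without changing any options, as handled at the top of the
   hook.  */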
4661
4662 \f
4663 /* Hook to determine if one function can safely inline another. */
4664
4665 static bool
4666 ix86_can_inline_p (tree caller, tree callee)
4667 {
4668 bool ret = false;
4669 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4670 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4671
4672 /* If callee has no option attributes, then it is ok to inline. */
4673 if (!callee_tree)
4674 ret = true;
4675
4676 /* If caller has no option attributes, but callee does then it is not ok to
4677 inline. */
4678 else if (!caller_tree)
4679 ret = false;
4680
4681 else
4682 {
4683 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4684 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4685
4686 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
4687 function can inline an SSE2 function but an SSE2 function can't inline
4688 an SSE4 function. */
4689 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4690 != callee_opts->x_ix86_isa_flags)
4691 ret = false;
4692
4693 /* See if we have the same non-isa options. */
4694 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4695 ret = false;
4696
4697 /* See if arch, tune, etc. are the same. */
4698 else if (caller_opts->arch != callee_opts->arch)
4699 ret = false;
4700
4701 else if (caller_opts->tune != callee_opts->tune)
4702 ret = false;
4703
4704 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4705 ret = false;
4706
4707 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4708 ret = false;
4709
4710 else
4711 ret = true;
4712 }
4713
4714 return ret;
4715 }
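/* Worked example (illustrative sketch, hypothetical function names):

     __attribute__ ((target ("sse2"))) static int lo (int x) { return x + 1; }
     __attribute__ ((target ("avx")))  int hi (int x) { return lo (x); }

   hi may inline lo because lo's ISA flags (SSE2) are a subset of hi's
   (AVX implies SSE2); swapping the two attributes makes the subset test
   above fail, and the arch, tune, fpmath and branch-cost checks must
   match as well.  */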
4716
4717 \f
4718 /* Remember the last target of ix86_set_current_function. */
4719 static GTY(()) tree ix86_previous_fndecl;
4720
4721 /* Invalidate ix86_previous_fndecl cache. */
4722 void
4723 ix86_reset_previous_fndecl (void)
4724 {
4725 ix86_previous_fndecl = NULL_TREE;
4726 }
4727
4728 /* Establish appropriate back-end context for processing the function
4729 FNDECL. The argument might be NULL to indicate processing at top
4730 level, outside of any function scope. */
4731 static void
4732 ix86_set_current_function (tree fndecl)
4733 {
4734 /* Only change the context if the function changes. This hook is called
4735 several times in the course of compiling a function, and we don't want to
4736 slow things down too much or call target_reinit when it isn't safe. */
4737 if (fndecl && fndecl != ix86_previous_fndecl)
4738 {
4739 tree old_tree = (ix86_previous_fndecl
4740 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4741 : NULL_TREE);
4742
4743 tree new_tree = (fndecl
4744 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4745 : NULL_TREE);
4746
4747 ix86_previous_fndecl = fndecl;
4748 if (old_tree == new_tree)
4749 ;
4750
4751 else if (new_tree)
4752 {
4753 cl_target_option_restore (&global_options,
4754 TREE_TARGET_OPTION (new_tree));
4755 target_reinit ();
4756 }
4757
4758 else if (old_tree)
4759 {
4760 struct cl_target_option *def
4761 = TREE_TARGET_OPTION (target_option_current_node);
4762
4763 cl_target_option_restore (&global_options, def);
4764 target_reinit ();
4765 }
4766 }
4767 }
4768
4769 \f
4770 /* Return true if this goes in large data/bss. */
4771
4772 static bool
4773 ix86_in_large_data_p (tree exp)
4774 {
4775 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4776 return false;
4777
4778 /* Functions are never large data. */
4779 if (TREE_CODE (exp) == FUNCTION_DECL)
4780 return false;
4781
4782 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4783 {
4784 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4785 if (strcmp (section, ".ldata") == 0
4786 || strcmp (section, ".lbss") == 0)
4787 return true;
4788 return false;
4789 }
4790 else
4791 {
4792 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4793
4794 /* If this is an incomplete type with size 0, then we can't put it
4795 in data because it might be too big when completed. */
4796 if (!size || size > ix86_section_threshold)
4797 return true;
4798 }
4799
4800 return false;
4801 }
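/* Example (illustrative, hypothetical symbol names): with -mcmodel=medium
   and the default -mlarge-data-threshold of 65536,

     static char big_buf[1 << 20];    placed in .lbss (1 MiB > threshold)
     static int  counter;             stays in the ordinary .bss

   because the predicate above only returns true for the medium code models
   and for objects whose size exceeds ix86_section_threshold (or whose size
   is still unknown).  */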
4802
4803 /* Switch to the appropriate section for output of DECL.
4804 DECL is either a `VAR_DECL' node or a constant of some sort.
4805 RELOC indicates whether forming the initial value of DECL requires
4806 link-time relocations. */
4807
4808 ATTRIBUTE_UNUSED static section *
4809 x86_64_elf_select_section (tree decl, int reloc,
4810 unsigned HOST_WIDE_INT align)
4811 {
4812 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4813 && ix86_in_large_data_p (decl))
4814 {
4815 const char *sname = NULL;
4816 unsigned int flags = SECTION_WRITE;
4817 switch (categorize_decl_for_section (decl, reloc))
4818 {
4819 case SECCAT_DATA:
4820 sname = ".ldata";
4821 break;
4822 case SECCAT_DATA_REL:
4823 sname = ".ldata.rel";
4824 break;
4825 case SECCAT_DATA_REL_LOCAL:
4826 sname = ".ldata.rel.local";
4827 break;
4828 case SECCAT_DATA_REL_RO:
4829 sname = ".ldata.rel.ro";
4830 break;
4831 case SECCAT_DATA_REL_RO_LOCAL:
4832 sname = ".ldata.rel.ro.local";
4833 break;
4834 case SECCAT_BSS:
4835 sname = ".lbss";
4836 flags |= SECTION_BSS;
4837 break;
4838 case SECCAT_RODATA:
4839 case SECCAT_RODATA_MERGE_STR:
4840 case SECCAT_RODATA_MERGE_STR_INIT:
4841 case SECCAT_RODATA_MERGE_CONST:
4842 sname = ".lrodata";
4843 flags = 0;
4844 break;
4845 case SECCAT_SRODATA:
4846 case SECCAT_SDATA:
4847 case SECCAT_SBSS:
4848 gcc_unreachable ();
4849 case SECCAT_TEXT:
4850 case SECCAT_TDATA:
4851 case SECCAT_TBSS:
4852 /* We don't split these for the medium model. Place them into
4853 default sections and hope for the best. */
4854 break;
4855 }
4856 if (sname)
4857 {
4858 /* We might get called with string constants, but get_named_section
4859 doesn't like them as they are not DECLs. Also, we need to set
4860 flags in that case. */
4861 if (!DECL_P (decl))
4862 return get_section (sname, flags, NULL);
4863 return get_named_section (decl, sname, reloc);
4864 }
4865 }
4866 return default_elf_select_section (decl, reloc, align);
4867 }
4868
4869 /* Select a set of attributes for section NAME based on the properties
4870 of DECL and whether or not RELOC indicates that DECL's initializer
4871 might contain runtime relocations. */
4872
4873 static unsigned int ATTRIBUTE_UNUSED
4874 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
4875 {
4876 unsigned int flags = default_section_type_flags (decl, name, reloc);
4877
4878 if (decl == NULL_TREE
4879 && (strcmp (name, ".ldata.rel.ro") == 0
4880 || strcmp (name, ".ldata.rel.ro.local") == 0))
4881 flags |= SECTION_RELRO;
4882
4883 if (strcmp (name, ".lbss") == 0
4884 || strncmp (name, ".lbss.", 6) == 0
4885 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
4886 flags |= SECTION_BSS;
4887
4888 return flags;
4889 }
4890
4891 /* Build up a unique section name, expressed as a
4892 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4893 RELOC indicates whether the initial value of EXP requires
4894 link-time relocations. */
4895
4896 static void ATTRIBUTE_UNUSED
4897 x86_64_elf_unique_section (tree decl, int reloc)
4898 {
4899 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4900 && ix86_in_large_data_p (decl))
4901 {
4902 const char *prefix = NULL;
4903 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4904 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4905
4906 switch (categorize_decl_for_section (decl, reloc))
4907 {
4908 case SECCAT_DATA:
4909 case SECCAT_DATA_REL:
4910 case SECCAT_DATA_REL_LOCAL:
4911 case SECCAT_DATA_REL_RO:
4912 case SECCAT_DATA_REL_RO_LOCAL:
4913 prefix = one_only ? ".ld" : ".ldata";
4914 break;
4915 case SECCAT_BSS:
4916 prefix = one_only ? ".lb" : ".lbss";
4917 break;
4918 case SECCAT_RODATA:
4919 case SECCAT_RODATA_MERGE_STR:
4920 case SECCAT_RODATA_MERGE_STR_INIT:
4921 case SECCAT_RODATA_MERGE_CONST:
4922 prefix = one_only ? ".lr" : ".lrodata";
4923 break;
4924 case SECCAT_SRODATA:
4925 case SECCAT_SDATA:
4926 case SECCAT_SBSS:
4927 gcc_unreachable ();
4928 case SECCAT_TEXT:
4929 case SECCAT_TDATA:
4930 case SECCAT_TBSS:
4931 /* We don't split these for the medium model. Place them into
4932 default sections and hope for the best. */
4933 break;
4934 }
4935 if (prefix)
4936 {
4937 const char *name, *linkonce;
4938 char *string;
4939
4940 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4941 name = targetm.strip_name_encoding (name);
4942
4943 /* If we're using one_only, then there needs to be a .gnu.linkonce
4944 prefix to the section name. */
4945 linkonce = one_only ? ".gnu.linkonce" : "";
4946
4947 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4948
4949 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4950 return;
4951 }
4952 }
4953 default_unique_section (decl, reloc);
4954 }
4955
4956 #ifdef COMMON_ASM_OP
4957 /* This says how to output assembler code to declare an
4958 uninitialized external linkage data object.
4959
4960 For the medium model on x86-64 we need to use the .largecomm directive
4961 for large objects. */
4962 void
4963 x86_elf_aligned_common (FILE *file,
4964 const char *name, unsigned HOST_WIDE_INT size,
4965 int align)
4966 {
4967 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4968 && size > (unsigned int)ix86_section_threshold)
4969 fputs (".largecomm\t", file);
4970 else
4971 fputs (COMMON_ASM_OP, file);
4972 assemble_name (file, name);
4973 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4974 size, align / BITS_PER_UNIT);
4975 }
4976 #endif
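/* Example of the emitted assembly (illustrative, hypothetical symbols):
   under -mcmodel=medium a common object larger than ix86_section_threshold
   is declared with the large-model directive, otherwise with COMMON_ASM_OP:

     .largecomm	big_buf,1048576,32
     .comm	small_buf,16,16

   where the last field is the alignment in bytes (align / BITS_PER_UNIT),
   as produced by the fprintf above.  */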
4977
4978 /* Utility function for targets to use in implementing
4979 ASM_OUTPUT_ALIGNED_BSS. */
4980
4981 void
4982 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4983 const char *name, unsigned HOST_WIDE_INT size,
4984 int align)
4985 {
4986 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4987 && size > (unsigned int)ix86_section_threshold)
4988 switch_to_section (get_named_section (decl, ".lbss", 0));
4989 else
4990 switch_to_section (bss_section);
4991 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4992 #ifdef ASM_DECLARE_OBJECT_NAME
4993 last_assemble_variable_decl = decl;
4994 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4995 #else
4996 /* Standard thing is just output label for the object. */
4997 ASM_OUTPUT_LABEL (file, name);
4998 #endif /* ASM_DECLARE_OBJECT_NAME */
4999 ASM_OUTPUT_SKIP (file, size ? size : 1);
5000 }
5001 \f
5002 /* Decide whether we must probe the stack before any space allocation
5003 on this target. It's essentially TARGET_STACK_PROBE except when
5004 -fstack-check causes the stack to be already probed differently. */
5005
5006 bool
5007 ix86_target_stack_probe (void)
5008 {
5009 /* Do not probe the stack twice if static stack checking is enabled. */
5010 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5011 return false;
5012
5013 return TARGET_STACK_PROBE;
5014 }
5015 \f
5016 /* Decide whether we can make a sibling call to a function. DECL is the
5017 declaration of the function being targeted by the call and EXP is the
5018 CALL_EXPR representing the call. */
5019
5020 static bool
5021 ix86_function_ok_for_sibcall (tree decl, tree exp)
5022 {
5023 tree type, decl_or_type;
5024 rtx a, b;
5025
5026 /* If we are generating position-independent code, we cannot sibcall
5027 optimize any indirect call, or a direct call to a global function,
5028 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
5029 if (!TARGET_MACHO
5030 && !TARGET_64BIT
5031 && flag_pic
5032 && (!decl || !targetm.binds_local_p (decl)))
5033 return false;
5034
5035 /* If we need to align the outgoing stack, then sibcalling would
5036 unalign the stack, which may break the called function. */
5037 if (ix86_minimum_incoming_stack_boundary (true)
5038 < PREFERRED_STACK_BOUNDARY)
5039 return false;
5040
5041 if (decl)
5042 {
5043 decl_or_type = decl;
5044 type = TREE_TYPE (decl);
5045 }
5046 else
5047 {
5048 /* We're looking at the CALL_EXPR, we need the type of the function. */
5049 type = CALL_EXPR_FN (exp); /* pointer expression */
5050 type = TREE_TYPE (type); /* pointer type */
5051 type = TREE_TYPE (type); /* function type */
5052 decl_or_type = type;
5053 }
5054
5055 /* Check that the return value locations are the same. Like
5056 if we are returning floats on the 80387 register stack, we cannot
5057 make a sibcall from a function that doesn't return a float to a
5058 function that does or, conversely, from a function that does return
5059 a float to a function that doesn't; the necessary stack adjustment
5060 would not be executed. This is also the place we notice
5061 differences in the return value ABI. Note that it is ok for one
5062 of the functions to have void return type as long as the return
5063 value of the other is passed in a register. */
5064 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5065 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5066 cfun->decl, false);
5067 if (STACK_REG_P (a) || STACK_REG_P (b))
5068 {
5069 if (!rtx_equal_p (a, b))
5070 return false;
5071 }
5072 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5073 ;
5074 else if (!rtx_equal_p (a, b))
5075 return false;
5076
5077 if (TARGET_64BIT)
5078 {
5079 /* The SYSV ABI has more call-clobbered registers;
5080 disallow sibcalls from MS to SYSV. */
5081 if (cfun->machine->call_abi == MS_ABI
5082 && ix86_function_type_abi (type) == SYSV_ABI)
5083 return false;
5084 }
5085 else
5086 {
5087 /* If this call is indirect, we'll need to be able to use a
5088 call-clobbered register for the address of the target function.
5089 Make sure that all such registers are not used for passing
5090 parameters. Note that DLLIMPORT functions are indirect. */
5091 if (!decl
5092 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5093 {
5094 if (ix86_function_regparm (type, NULL) >= 3)
5095 {
5096 /* ??? Need to count the actual number of registers to be used,
5097 not the possible number of registers. Fix later. */
5098 return false;
5099 }
5100 }
5101 }
5102
5103 /* Otherwise okay. That also includes certain types of indirect calls. */
5104 return true;
5105 }
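/* Illustrative example (hypothetical functions): in 32-bit PIC code,

     extern int callee (int);
     int caller (int x) { return callee (x); }

   is not turned into a sibling call when callee does not bind locally,
   because the call would go through the PLT and %ebx must stay live, as
   checked at the top of the predicate above.  */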
5106
5107 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5108 and "sseregparm" calling convention attributes;
5109 arguments as in struct attribute_spec.handler. */
5110
5111 static tree
5112 ix86_handle_cconv_attribute (tree *node, tree name,
5113 tree args,
5114 int flags ATTRIBUTE_UNUSED,
5115 bool *no_add_attrs)
5116 {
5117 if (TREE_CODE (*node) != FUNCTION_TYPE
5118 && TREE_CODE (*node) != METHOD_TYPE
5119 && TREE_CODE (*node) != FIELD_DECL
5120 && TREE_CODE (*node) != TYPE_DECL)
5121 {
5122 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5123 name);
5124 *no_add_attrs = true;
5125 return NULL_TREE;
5126 }
5127
5128 /* Can combine regparm with all attributes but fastcall and thiscall. */
5129 if (is_attribute_p ("regparm", name))
5130 {
5131 tree cst;
5132
5133 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5134 {
5135 error ("fastcall and regparm attributes are not compatible");
5136 }
5137
5138 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5139 {
5140 error ("regparm and thiscall attributes are not compatible");
5141 }
5142
5143 cst = TREE_VALUE (args);
5144 if (TREE_CODE (cst) != INTEGER_CST)
5145 {
5146 warning (OPT_Wattributes,
5147 "%qE attribute requires an integer constant argument",
5148 name);
5149 *no_add_attrs = true;
5150 }
5151 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5152 {
5153 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5154 name, REGPARM_MAX);
5155 *no_add_attrs = true;
5156 }
5157
5158 return NULL_TREE;
5159 }
5160
5161 if (TARGET_64BIT)
5162 {
5163 /* Do not warn when emulating the MS ABI. */
5164 if ((TREE_CODE (*node) != FUNCTION_TYPE
5165 && TREE_CODE (*node) != METHOD_TYPE)
5166 || ix86_function_type_abi (*node) != MS_ABI)
5167 warning (OPT_Wattributes, "%qE attribute ignored",
5168 name);
5169 *no_add_attrs = true;
5170 return NULL_TREE;
5171 }
5172
5173 /* Can combine fastcall only with sseregparm. */
5174 if (is_attribute_p ("fastcall", name))
5175 {
5176 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5177 {
5178 error ("fastcall and cdecl attributes are not compatible");
5179 }
5180 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5181 {
5182 error ("fastcall and stdcall attributes are not compatible");
5183 }
5184 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5185 {
5186 error ("fastcall and regparm attributes are not compatible");
5187 }
5188 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5189 {
5190 error ("fastcall and thiscall attributes are not compatible");
5191 }
5192 }
5193
5194 /* Can combine stdcall with regparm and sseregparm. */
5196 else if (is_attribute_p ("stdcall", name))
5197 {
5198 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5199 {
5200 error ("stdcall and cdecl attributes are not compatible");
5201 }
5202 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5203 {
5204 error ("stdcall and fastcall attributes are not compatible");
5205 }
5206 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5207 {
5208 error ("stdcall and thiscall attributes are not compatible");
5209 }
5210 }
5211
5212 /* Can combine cdecl with regparm and sseregparm. */
5213 else if (is_attribute_p ("cdecl", name))
5214 {
5215 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5216 {
5217 error ("stdcall and cdecl attributes are not compatible");
5218 }
5219 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5220 {
5221 error ("fastcall and cdecl attributes are not compatible");
5222 }
5223 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5224 {
5225 error ("cdecl and thiscall attributes are not compatible");
5226 }
5227 }
5228 else if (is_attribute_p ("thiscall", name))
5229 {
5230 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5231 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5232 name);
5233 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5234 {
5235 error ("stdcall and thiscall attributes are not compatible");
5236 }
5237 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5238 {
5239 error ("fastcall and thiscall attributes are not compatible");
5240 }
5241 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5242 {
5243 error ("cdecl and thiscall attributes are not compatible");
5244 }
5245 }
5246
5247 /* Can combine sseregparm with all attributes. */
5248
5249 return NULL_TREE;
5250 }
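/* Illustrative examples of the combinations handled above (hypothetical
   declarations):

     void ok  (int, int) __attribute__ ((stdcall, regparm (2)));
     void bad (int, int) __attribute__ ((fastcall, cdecl));

   The first is accepted (stdcall combines with regparm and sseregparm);
   the second is rejected with "fastcall and cdecl attributes are not
   compatible".  */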
5251
5252 /* The transactional memory builtins are implicitly regparm or fastcall
5253 depending on the ABI. Override the generic do-nothing attribute that
5254 these builtins were declared with, and replace it with one of the two
5255 attributes that we expect elsewhere. */
5256
5257 static tree
5258 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5259 tree args ATTRIBUTE_UNUSED,
5260 int flags, bool *no_add_attrs)
5261 {
5262 tree alt;
5263
5264 /* In no case do we want to add the placeholder attribute. */
5265 *no_add_attrs = true;
5266
5267 /* The 64-bit ABI is unchanged for transactional memory. */
5268 if (TARGET_64BIT)
5269 return NULL_TREE;
5270
5271 /* ??? Is there a better way to validate 32-bit windows? We have
5272 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5273 if (CHECK_STACK_LIMIT > 0)
5274 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5275 else
5276 {
5277 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5278 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5279 }
5280 decl_attributes (node, alt, flags);
5281
5282 return NULL_TREE;
5283 }
5284
5285 /* This function determines from TYPE the calling-convention. */
5286
5287 unsigned int
5288 ix86_get_callcvt (const_tree type)
5289 {
5290 unsigned int ret = 0;
5291 bool is_stdarg;
5292 tree attrs;
5293
5294 if (TARGET_64BIT)
5295 return IX86_CALLCVT_CDECL;
5296
5297 attrs = TYPE_ATTRIBUTES (type);
5298 if (attrs != NULL_TREE)
5299 {
5300 if (lookup_attribute ("cdecl", attrs))
5301 ret |= IX86_CALLCVT_CDECL;
5302 else if (lookup_attribute ("stdcall", attrs))
5303 ret |= IX86_CALLCVT_STDCALL;
5304 else if (lookup_attribute ("fastcall", attrs))
5305 ret |= IX86_CALLCVT_FASTCALL;
5306 else if (lookup_attribute ("thiscall", attrs))
5307 ret |= IX86_CALLCVT_THISCALL;
5308
5309 /* Regparm isn't allowed for thiscall and fastcall. */
5310 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5311 {
5312 if (lookup_attribute ("regparm", attrs))
5313 ret |= IX86_CALLCVT_REGPARM;
5314 if (lookup_attribute ("sseregparm", attrs))
5315 ret |= IX86_CALLCVT_SSEREGPARM;
5316 }
5317
5318 if (IX86_BASE_CALLCVT(ret) != 0)
5319 return ret;
5320 }
5321
5322 is_stdarg = stdarg_p (type);
5323 if (TARGET_RTD && !is_stdarg)
5324 return IX86_CALLCVT_STDCALL | ret;
5325
5326 if (ret != 0
5327 || is_stdarg
5328 || TREE_CODE (type) != METHOD_TYPE
5329 || ix86_function_type_abi (type) != MS_ABI)
5330 return IX86_CALLCVT_CDECL | ret;
5331
5332 return IX86_CALLCVT_THISCALL;
5333 }
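/* Typical use of the returned bit mask (a sketch mirroring the callers
   further below):

     unsigned int ccvt = ix86_get_callcvt (funtype);
     if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
       ...  first two integer arguments arrive in %ecx and %edx  ...
     else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
       ...  only the this pointer arrives in %ecx  ...

   IX86_BASE_CALLCVT isolates the base convention, while the REGPARM and
   SSEREGPARM bits may be or-ed on top of cdecl or stdcall.  */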
5334
5335 /* Return 0 if the attributes for two types are incompatible, 1 if they
5336 are compatible, and 2 if they are nearly compatible (which causes a
5337 warning to be generated). */
5338
5339 static int
5340 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5341 {
5342 unsigned int ccvt1, ccvt2;
5343
5344 if (TREE_CODE (type1) != FUNCTION_TYPE
5345 && TREE_CODE (type1) != METHOD_TYPE)
5346 return 1;
5347
5348 ccvt1 = ix86_get_callcvt (type1);
5349 ccvt2 = ix86_get_callcvt (type2);
5350 if (ccvt1 != ccvt2)
5351 return 0;
5352 if (ix86_function_regparm (type1, NULL)
5353 != ix86_function_regparm (type2, NULL))
5354 return 0;
5355
5356 return 1;
5357 }
5358 \f
5359 /* Return the regparm value for a function with the indicated TYPE and DECL.
5360 DECL may be NULL when calling function indirectly
5361 or considering a libcall. */
5362
5363 static int
5364 ix86_function_regparm (const_tree type, const_tree decl)
5365 {
5366 tree attr;
5367 int regparm;
5368 unsigned int ccvt;
5369
5370 if (TARGET_64BIT)
5371 return (ix86_function_type_abi (type) == SYSV_ABI
5372 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5373 ccvt = ix86_get_callcvt (type);
5374 regparm = ix86_regparm;
5375
5376 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5377 {
5378 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5379 if (attr)
5380 {
5381 regparm = tree_to_hwi (TREE_VALUE (TREE_VALUE (attr)));
5382 return regparm;
5383 }
5384 }
5385 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5386 return 2;
5387 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5388 return 1;
5389
5390 /* Use register calling convention for local functions when possible. */
5391 if (decl
5392 && TREE_CODE (decl) == FUNCTION_DECL
5393 && optimize
5394 && !(profile_flag && !flag_fentry))
5395 {
5396 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5397 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5398 if (i && i->local && i->can_change_signature)
5399 {
5400 int local_regparm, globals = 0, regno;
5401
5402 /* Make sure no regparm register is taken by a
5403 fixed register variable. */
5404 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5405 if (fixed_regs[local_regparm])
5406 break;
5407
5408 /* We don't want to use regparm(3) for nested functions as
5409 these use a static chain pointer in the third argument. */
5410 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5411 local_regparm = 2;
5412
5413 /* In 32-bit mode save a register for the split stack. */
5414 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5415 local_regparm = 2;
5416
5417 /* Each fixed register usage increases register pressure,
5418 so fewer registers should be used for argument passing.
5419 This functionality can be overridden by an explicit
5420 regparm value. */
5421 for (regno = AX_REG; regno <= DI_REG; regno++)
5422 if (fixed_regs[regno])
5423 globals++;
5424
5425 local_regparm
5426 = globals < local_regparm ? local_regparm - globals : 0;
5427
5428 if (local_regparm > regparm)
5429 regparm = local_regparm;
5430 }
5431 }
5432
5433 return regparm;
5434 }
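/* Example (hypothetical declaration): with

     int mul3 (int a, int b, int c) __attribute__ ((regparm (3)));

   ix86_function_regparm returns 3 and a, b, c are passed in %eax, %edx and
   %ecx; fastcall yields 2 and thiscall 1, as hard-coded above.  For plain
   local functions the loop above may raise the value up to REGPARM_MAX
   when no fixed registers are in the way.  */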
5435
5436 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5437 DFmode (2) arguments in SSE registers for a function with the
5438 indicated TYPE and DECL. DECL may be NULL when calling function
5439 indirectly or considering a libcall. Otherwise return 0. */
5440
5441 static int
5442 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5443 {
5444 gcc_assert (!TARGET_64BIT);
5445
5446 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5447 by the sseregparm attribute. */
5448 if (TARGET_SSEREGPARM
5449 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5450 {
5451 if (!TARGET_SSE)
5452 {
5453 if (warn)
5454 {
5455 if (decl)
5456 error ("calling %qD with attribute sseregparm without "
5457 "SSE/SSE2 enabled", decl);
5458 else
5459 error ("calling %qT with attribute sseregparm without "
5460 "SSE/SSE2 enabled", type);
5461 }
5462 return 0;
5463 }
5464
5465 return 2;
5466 }
5467
5468 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5469 (and DFmode for SSE2) arguments in SSE registers. */
5470 if (decl && TARGET_SSE_MATH && optimize
5471 && !(profile_flag && !flag_fentry))
5472 {
5473 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5474 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5475 if (i && i->local && i->can_change_signature)
5476 return TARGET_SSE2 ? 2 : 1;
5477 }
5478
5479 return 0;
5480 }
5481
5482 /* Return true if EAX is live at the start of the function. Used by
5483 ix86_expand_prologue to determine if we need special help before
5484 calling allocate_stack_worker. */
5485
5486 static bool
5487 ix86_eax_live_at_start_p (void)
5488 {
5489 /* Cheat. Don't bother working forward from ix86_function_regparm
5490 to the function type to whether an actual argument is located in
5491 eax. Instead just look at cfg info, which is still close enough
5492 to correct at this point. This gives false positives for broken
5493 functions that might use uninitialized data that happens to be
5494 allocated in eax, but who cares? */
5495 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5496 }
5497
5498 static bool
5499 ix86_keep_aggregate_return_pointer (tree fntype)
5500 {
5501 tree attr;
5502
5503 if (!TARGET_64BIT)
5504 {
5505 attr = lookup_attribute ("callee_pop_aggregate_return",
5506 TYPE_ATTRIBUTES (fntype));
5507 if (attr)
5508 return (tree_to_hwi (TREE_VALUE (TREE_VALUE (attr))) == 0);
5509
5510 /* For 32-bit MS-ABI the default is to keep aggregate
5511 return pointer. */
5512 if (ix86_function_type_abi (fntype) == MS_ABI)
5513 return true;
5514 }
5515 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5516 }
5517
5518 /* Value is the number of bytes of arguments automatically
5519 popped when returning from a subroutine call.
5520 FUNDECL is the declaration node of the function (as a tree),
5521 FUNTYPE is the data type of the function (as a tree),
5522 or for a library call it is an identifier node for the subroutine name.
5523 SIZE is the number of bytes of arguments passed on the stack.
5524
5525 On the 80386, the RTD insn may be used to pop them if the number
5526 of args is fixed, but if the number is variable then the caller
5527 must pop them all. RTD can't be used for library calls now
5528 because the library is compiled with the Unix compiler.
5529 Use of RTD is a selectable option, since it is incompatible with
5530 standard Unix calling sequences. If the option is not selected,
5531 the caller must always pop the args.
5532
5533 The attribute stdcall is equivalent to RTD on a per module basis. */
5534
5535 static int
5536 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5537 {
5538 unsigned int ccvt;
5539
5540 /* None of the 64-bit ABIs pop arguments. */
5541 if (TARGET_64BIT)
5542 return 0;
5543
5544 ccvt = ix86_get_callcvt (funtype);
5545
5546 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5547 | IX86_CALLCVT_THISCALL)) != 0
5548 && ! stdarg_p (funtype))
5549 return size;
5550
5551 /* Lose any fake structure return argument if it is passed on the stack. */
5552 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5553 && !ix86_keep_aggregate_return_pointer (funtype))
5554 {
5555 int nregs = ix86_function_regparm (funtype, fundecl);
5556 if (nregs == 0)
5557 return GET_MODE_SIZE (Pmode);
5558 }
5559
5560 return 0;
5561 }
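/* Example (illustrative): a 32-bit stdcall function taking two ints returns
   with "ret $8", popping its own stack arguments (this hook returns 8 for
   it), whereas the default cdecl convention returns 0 here and the caller
   pops the arguments.  */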
5562
5563 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5564
5565 static bool
5566 ix86_legitimate_combined_insn (rtx insn)
5567 {
5568 /* Check operand constraints in case hard registers were propagated
5569 into insn pattern. This check prevents combine pass from
5570 generating insn patterns with invalid hard register operands.
5571 These invalid insns can eventually confuse reload to error out
5572 with a spill failure. See also PRs 46829 and 46843. */
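/* Note: the assignment inside the condition below is intentional; it
   records the result of recog in INSN_CODE (insn) while testing whether
   the insn was recognized at all. */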
5573 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5574 {
5575 int i;
5576
5577 extract_insn (insn);
5578 preprocess_constraints ();
5579
5580 for (i = 0; i < recog_data.n_operands; i++)
5581 {
5582 rtx op = recog_data.operand[i];
5583 enum machine_mode mode = GET_MODE (op);
5584 struct operand_alternative *op_alt;
5585 int offset = 0;
5586 bool win;
5587 int j;
5588
5589 /* A unary operator may be accepted by the predicate, but it
5590 is irrelevant for matching constraints. */
5591 if (UNARY_P (op))
5592 op = XEXP (op, 0);
5593
5594 if (GET_CODE (op) == SUBREG)
5595 {
5596 if (REG_P (SUBREG_REG (op))
5597 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5598 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5599 GET_MODE (SUBREG_REG (op)),
5600 SUBREG_BYTE (op),
5601 GET_MODE (op));
5602 op = SUBREG_REG (op);
5603 }
5604
5605 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5606 continue;
5607
5608 op_alt = recog_op_alt[i];
5609
5610 /* Operand has no constraints, anything is OK. */
5611 win = !recog_data.n_alternatives;
5612
5613 for (j = 0; j < recog_data.n_alternatives; j++)
5614 {
5615 if (op_alt[j].anything_ok
5616 || (op_alt[j].matches != -1
5617 && operands_match_p
5618 (recog_data.operand[i],
5619 recog_data.operand[op_alt[j].matches]))
5620 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5621 {
5622 win = true;
5623 break;
5624 }
5625 }
5626
5627 if (!win)
5628 return false;
5629 }
5630 }
5631
5632 return true;
5633 }
5634 \f
5635 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5636
5637 static unsigned HOST_WIDE_INT
5638 ix86_asan_shadow_offset (void)
5639 {
5640 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
5641 : HOST_WIDE_INT_C (0x7fff8000))
5642 : (HOST_WIDE_INT_1 << 29);
5643 }
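/* The returned constant is the Offset term of the usual AddressSanitizer
   shadow mapping (a sketch of the instrumented check, not code emitted
   here):

     shadow_addr = (addr >> 3) + ix86_asan_shadow_offset ();

   i.e. 0x7fff8000 for 64-bit Linux LP64, 1 << 44 for 64-bit Mach-O, and
   1 << 29 otherwise (x32 and 32-bit).  */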
5644 \f
5645 /* Argument support functions. */
5646
5647 /* Return true when register may be used to pass function parameters. */
5648 bool
5649 ix86_function_arg_regno_p (int regno)
5650 {
5651 int i;
5652 const int *parm_regs;
5653
5654 if (!TARGET_64BIT)
5655 {
5656 if (TARGET_MACHO)
5657 return (regno < REGPARM_MAX
5658 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5659 else
5660 return (regno < REGPARM_MAX
5661 || (TARGET_MMX && MMX_REGNO_P (regno)
5662 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5663 || (TARGET_SSE && SSE_REGNO_P (regno)
5664 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5665 }
5666
5667 if (TARGET_SSE && SSE_REGNO_P (regno)
5668 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5669 return true;
5670
5671 /* TODO: The function should depend on current function ABI but
5672 builtins.c would need updating then. Therefore we use the
5673 default ABI. */
5674
5675 /* RAX is used as hidden argument to va_arg functions. */
5676 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5677 return true;
5678
5679 if (ix86_abi == MS_ABI)
5680 parm_regs = x86_64_ms_abi_int_parameter_registers;
5681 else
5682 parm_regs = x86_64_int_parameter_registers;
5683 for (i = 0; i < (ix86_abi == MS_ABI
5684 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5685 if (regno == parm_regs[i])
5686 return true;
5687 return false;
5688 }
5689
5690 /* Return if we do not know how to pass TYPE solely in registers. */
5691
5692 static bool
5693 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5694 {
5695 if (must_pass_in_stack_var_size_or_pad (mode, type))
5696 return true;
5697
5698 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5699 The layout_type routine is crafty and tries to trick us into passing
5700 currently unsupported vector types on the stack by using TImode. */
5701 return (!TARGET_64BIT && mode == TImode
5702 && type && TREE_CODE (type) != VECTOR_TYPE);
5703 }
5704
5705 /* Return the size, in bytes, of the area reserved for arguments passed
5706 in registers for the function represented by FNDECL, depending on the
5707 ABI used. */
5708 int
5709 ix86_reg_parm_stack_space (const_tree fndecl)
5710 {
5711 enum calling_abi call_abi = SYSV_ABI;
5712 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5713 call_abi = ix86_function_abi (fndecl);
5714 else
5715 call_abi = ix86_function_type_abi (fndecl);
5716 if (TARGET_64BIT && call_abi == MS_ABI)
5717 return 32;
5718 return 0;
5719 }
5720
5721 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5722 call abi used. */
5723 enum calling_abi
5724 ix86_function_type_abi (const_tree fntype)
5725 {
5726 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5727 {
5728 enum calling_abi abi = ix86_abi;
5729 if (abi == SYSV_ABI)
5730 {
5731 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5732 abi = MS_ABI;
5733 }
5734 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5735 abi = SYSV_ABI;
5736 return abi;
5737 }
5738 return ix86_abi;
5739 }
5740
5741 /* We add this as a workaround in order to use libc_has_function
5742 hook in i386.md. */
5743 bool
5744 ix86_libc_has_function (enum function_class fn_class)
5745 {
5746 return targetm.libc_has_function (fn_class);
5747 }
5748
5749 static bool
5750 ix86_function_ms_hook_prologue (const_tree fn)
5751 {
5752 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5753 {
5754 if (decl_function_context (fn) != NULL_TREE)
5755 error_at (DECL_SOURCE_LOCATION (fn),
5756 "ms_hook_prologue is not compatible with nested function");
5757 else
5758 return true;
5759 }
5760 return false;
5761 }
5762
5763 static enum calling_abi
5764 ix86_function_abi (const_tree fndecl)
5765 {
5766 if (! fndecl)
5767 return ix86_abi;
5768 return ix86_function_type_abi (TREE_TYPE (fndecl));
5769 }
5770
5771 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5772 call abi used. */
5773 enum calling_abi
5774 ix86_cfun_abi (void)
5775 {
5776 if (! cfun)
5777 return ix86_abi;
5778 return cfun->machine->call_abi;
5779 }
5780
5781 /* Write the extra assembler code needed to declare a function properly. */
5782
5783 void
5784 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5785 tree decl)
5786 {
5787 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5788
5789 if (is_ms_hook)
5790 {
5791 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5792 unsigned int filler_cc = 0xcccccccc;
5793
5794 for (i = 0; i < filler_count; i += 4)
5795 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5796 }
5797
5798 #ifdef SUBTARGET_ASM_UNWIND_INIT
5799 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5800 #endif
5801
5802 ASM_OUTPUT_LABEL (asm_out_file, fname);
5803
5804 /* Output magic byte marker, if hot-patch attribute is set. */
5805 if (is_ms_hook)
5806 {
5807 if (TARGET_64BIT)
5808 {
5809 /* leaq [%rsp + 0], %rsp */
5810 asm_fprintf (asm_out_file, ASM_BYTE
5811 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5812 }
5813 else
5814 {
5815 /* movl.s %edi, %edi
5816 push %ebp
5817 movl.s %esp, %ebp */
5818 asm_fprintf (asm_out_file, ASM_BYTE
5819 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5820 }
5821 }
5822 }
5823
5824 /* regclass.c */
5825 extern void init_regs (void);
5826
5827 /* Implementation of call abi switching target hook. Specific to FNDECL
5828 the specific call register sets are set. See also
5829 ix86_conditional_register_usage for more details. */
5830 void
5831 ix86_call_abi_override (const_tree fndecl)
5832 {
5833 if (fndecl == NULL_TREE)
5834 cfun->machine->call_abi = ix86_abi;
5835 else
5836 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5837 }
5838
5839 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5840 Avoid expensive re-initialization of init_regs each time we switch function
5841 context, since this is needed only during RTL expansion. */
5842 static void
5843 ix86_maybe_switch_abi (void)
5844 {
5845 if (TARGET_64BIT &&
5846 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5847 reinit_regs ();
5848 }
5849
5850 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5851 for a call to a function whose data type is FNTYPE.
5852 For a library call, FNTYPE is 0. */
5853
5854 void
5855 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5856 tree fntype, /* tree ptr for function decl */
5857 rtx libname, /* SYMBOL_REF of library name or 0 */
5858 tree fndecl,
5859 int caller)
5860 {
5861 struct cgraph_local_info *i;
5862
5863 memset (cum, 0, sizeof (*cum));
5864
5865 if (fndecl)
5866 {
5867 i = cgraph_local_info (fndecl);
5868 cum->call_abi = ix86_function_abi (fndecl);
5869 }
5870 else
5871 {
5872 i = NULL;
5873 cum->call_abi = ix86_function_type_abi (fntype);
5874 }
5875
5876 cum->caller = caller;
5877
5878 /* Set up the number of registers to use for passing arguments. */
5879
5880 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5881 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5882 "or subtarget optimization implying it");
5883 cum->nregs = ix86_regparm;
5884 if (TARGET_64BIT)
5885 {
5886 cum->nregs = (cum->call_abi == SYSV_ABI
5887 ? X86_64_REGPARM_MAX
5888 : X86_64_MS_REGPARM_MAX);
5889 }
5890 if (TARGET_SSE)
5891 {
5892 cum->sse_nregs = SSE_REGPARM_MAX;
5893 if (TARGET_64BIT)
5894 {
5895 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5896 ? X86_64_SSE_REGPARM_MAX
5897 : X86_64_MS_SSE_REGPARM_MAX);
5898 }
5899 }
5900 if (TARGET_MMX)
5901 cum->mmx_nregs = MMX_REGPARM_MAX;
5902 cum->warn_avx = true;
5903 cum->warn_sse = true;
5904 cum->warn_mmx = true;
5905
5906 /* Because the type might mismatch between caller and callee, we need to
5907 use the actual type of the function for local calls.
5908 FIXME: cgraph_analyze can be told to actually record whether a function
5909 uses va_start, so for local functions maybe_vaarg can be made aggressive,
5910 helping K&R code.
5911 FIXME: once the type system is fixed, we won't need this code anymore. */
5912 if (i && i->local && i->can_change_signature)
5913 fntype = TREE_TYPE (fndecl);
5914 cum->maybe_vaarg = (fntype
5915 ? (!prototype_p (fntype) || stdarg_p (fntype))
5916 : !libname);
5917
5918 if (!TARGET_64BIT)
5919 {
5920 /* If there are variable arguments, then we won't pass anything
5921 in registers in 32-bit mode. */
5922 if (stdarg_p (fntype))
5923 {
5924 cum->nregs = 0;
5925 cum->sse_nregs = 0;
5926 cum->mmx_nregs = 0;
5927 cum->warn_avx = 0;
5928 cum->warn_sse = 0;
5929 cum->warn_mmx = 0;
5930 return;
5931 }
5932
5933 /* Use ecx and edx registers if function has fastcall attribute,
5934 else look for regparm information. */
5935 if (fntype)
5936 {
5937 unsigned int ccvt = ix86_get_callcvt (fntype);
5938 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5939 {
5940 cum->nregs = 1;
5941 cum->fastcall = 1; /* Same first register as in fastcall. */
5942 }
5943 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5944 {
5945 cum->nregs = 2;
5946 cum->fastcall = 1;
5947 }
5948 else
5949 cum->nregs = ix86_function_regparm (fntype, fndecl);
5950 }
5951
5952 /* Set up the number of SSE registers used for passing SFmode
5953 and DFmode arguments. Warn for mismatching ABI. */
5954 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5955 }
5956 }
5957
5958 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5959 But in the case of vector types, it is some vector mode.
5960
5961 When we have only some of our vector isa extensions enabled, then there
5962 are some modes for which vector_mode_supported_p is false. For these
5963 modes, the generic vector support in gcc will choose some non-vector mode
5964 in order to implement the type. By computing the natural mode, we'll
5965 select the proper ABI location for the operand and not depend on whatever
5966 the middle-end decides to do with these vector types.
5967
5968 The middle-end can't deal with vector types > 16 bytes. In this
5969 case, we return the original mode and warn about the ABI change if CUM
5970 isn't NULL. */
5971
5972 static enum machine_mode
5973 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5974 {
5975 enum machine_mode mode = TYPE_MODE (type);
5976
5977 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5978 {
5979 HOST_WIDE_INT size = int_size_in_bytes (type);
5980 if ((size == 8 || size == 16 || size == 32)
5981 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5982 && TYPE_VECTOR_SUBPARTS (type) > 1)
5983 {
5984 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5985
5986 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5987 mode = MIN_MODE_VECTOR_FLOAT;
5988 else
5989 mode = MIN_MODE_VECTOR_INT;
5990
5991 /* Get the mode which has this inner mode and number of units. */
5992 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5993 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5994 && GET_MODE_INNER (mode) == innermode)
5995 {
5996 if (size == 32 && !TARGET_AVX)
5997 {
5998 static bool warnedavx;
5999
6000 if (cum
6001 && !warnedavx
6002 && cum->warn_avx)
6003 {
6004 warnedavx = true;
6005 warning (0, "AVX vector argument without AVX "
6006 "enabled changes the ABI");
6007 }
6008 return TYPE_MODE (type);
6009 }
6010 else if ((size == 8 || size == 16) && !TARGET_SSE)
6011 {
6012 static bool warnedsse;
6013
6014 if (cum
6015 && !warnedsse
6016 && cum->warn_sse)
6017 {
6018 warnedsse = true;
6019 warning (0, "SSE vector argument without SSE "
6020 "enabled changes the ABI");
6021 }
6022 return mode;
6023 }
6024 else
6025 return mode;
6026 }
6027
6028 gcc_unreachable ();
6029 }
6030 }
6031
6032 return mode;
6033 }
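/* Example (hypothetical type): given

     typedef int v8si __attribute__ ((vector_size (32)));

   type_natural_mode returns V8SImode when AVX is enabled; without AVX the
   generic code has chosen a non-vector mode, so the original TYPE_MODE is
   returned instead and the one-time "AVX vector argument without AVX
   enabled changes the ABI" warning is issued if CUM asked for it.  */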
6034
6035 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
6036 this may not agree with the mode that the type system has chosen for the
6037 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
6038 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
6039
6040 static rtx
6041 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
6042 unsigned int regno)
6043 {
6044 rtx tmp;
6045
6046 if (orig_mode != BLKmode)
6047 tmp = gen_rtx_REG (orig_mode, regno);
6048 else
6049 {
6050 tmp = gen_rtx_REG (mode, regno);
6051 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
6052 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
6053 }
6054
6055 return tmp;
6056 }
6057
6058 /* x86-64 register passing implementation. See the x86-64 ABI for details.
6059 The goal of this code is to classify each eightbyte of an incoming argument
6060 by register class and assign registers accordingly. */
6061
6062 /* Return the union class of CLASS1 and CLASS2.
6063 See the x86-64 PS ABI for details. */
6064
6065 static enum x86_64_reg_class
6066 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
6067 {
6068 /* Rule #1: If both classes are equal, this is the resulting class. */
6069 if (class1 == class2)
6070 return class1;
6071
6072 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
6073 the other class. */
6074 if (class1 == X86_64_NO_CLASS)
6075 return class2;
6076 if (class2 == X86_64_NO_CLASS)
6077 return class1;
6078
6079 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
6080 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
6081 return X86_64_MEMORY_CLASS;
6082
6083 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
6084 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
6085 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
6086 return X86_64_INTEGERSI_CLASS;
6087 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
6088 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
6089 return X86_64_INTEGER_CLASS;
6090
6091 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
6092 MEMORY is used. */
6093 if (class1 == X86_64_X87_CLASS
6094 || class1 == X86_64_X87UP_CLASS
6095 || class1 == X86_64_COMPLEX_X87_CLASS
6096 || class2 == X86_64_X87_CLASS
6097 || class2 == X86_64_X87UP_CLASS
6098 || class2 == X86_64_COMPLEX_X87_CLASS)
6099 return X86_64_MEMORY_CLASS;
6100
6101 /* Rule #6: Otherwise class SSE is used. */
6102 return X86_64_SSE_CLASS;
6103 }
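/* Worked examples of the rules above: merging X86_64_NO_CLASS with
   X86_64_SSE_CLASS gives X86_64_SSE_CLASS (rule 2); X86_64_INTEGERSI_CLASS
   with X86_64_SSE_CLASS gives X86_64_INTEGER_CLASS (rule 4); and any
   combination involving X86_64_X87_CLASS falls back to X86_64_MEMORY_CLASS
   (rule 5), forcing the argument onto the stack.  */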
6104
6105 /* Classify the argument of type TYPE and mode MODE.
6106 CLASSES will be filled by the register class used to pass each word
6107 of the operand. The number of words is returned. In case the parameter
6108 should be passed in memory, 0 is returned. As a special case for zero
6109 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6110
6111 BIT_OFFSET is used internally for handling records; it specifies the
6112 offset in bits, taken modulo 256 to avoid overflow cases.
6113
6114 See the x86-64 PS ABI for details.
6115 */
6116
6117 static int
6118 classify_argument (enum machine_mode mode, const_tree type,
6119 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6120 {
6121 HOST_WIDE_INT bytes =
6122 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6123 int words
6124 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6125
6126 /* Variable sized entities are always passed/returned in memory. */
6127 if (bytes < 0)
6128 return 0;
6129
6130 if (mode != VOIDmode
6131 && targetm.calls.must_pass_in_stack (mode, type))
6132 return 0;
6133
6134 if (type && AGGREGATE_TYPE_P (type))
6135 {
6136 int i;
6137 tree field;
6138 enum x86_64_reg_class subclasses[MAX_CLASSES];
6139
6140 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6141 if (bytes > 32)
6142 return 0;
6143
6144 for (i = 0; i < words; i++)
6145 classes[i] = X86_64_NO_CLASS;
6146
6147 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
6148 signal the memory class, so handle it as a special case. */
6149 if (!words)
6150 {
6151 classes[0] = X86_64_NO_CLASS;
6152 return 1;
6153 }
6154
6155 /* Classify each field of record and merge classes. */
6156 switch (TREE_CODE (type))
6157 {
6158 case RECORD_TYPE:
6159 /* And now merge the fields of structure. */
6160 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6161 {
6162 if (TREE_CODE (field) == FIELD_DECL)
6163 {
6164 int num;
6165
6166 if (TREE_TYPE (field) == error_mark_node)
6167 continue;
6168
6169 /* Bitfields are always classified as integer. Handle them
6170 early, since later code would consider them to be
6171 misaligned integers. */
6172 if (DECL_BIT_FIELD (field))
6173 {
6174 for (i = (int_bit_position (field)
6175 + (bit_offset % 64)) / 8 / 8;
6176 i < ((int_bit_position (field) + (bit_offset % 64))
6177 + tree_to_shwi (DECL_SIZE (field))
6178 + 63) / 8 / 8; i++)
6179 classes[i] =
6180 merge_classes (X86_64_INTEGER_CLASS,
6181 classes[i]);
6182 }
6183 else
6184 {
6185 int pos;
6186
6187 type = TREE_TYPE (field);
6188
6189 /* Flexible array member is ignored. */
6190 if (TYPE_MODE (type) == BLKmode
6191 && TREE_CODE (type) == ARRAY_TYPE
6192 && TYPE_SIZE (type) == NULL_TREE
6193 && TYPE_DOMAIN (type) != NULL_TREE
6194 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6195 == NULL_TREE))
6196 {
6197 static bool warned;
6198
6199 if (!warned && warn_psabi)
6200 {
6201 warned = true;
6202 inform (input_location,
6203 "the ABI of passing struct with"
6204 " a flexible array member has"
6205 " changed in GCC 4.4");
6206 }
6207 continue;
6208 }
6209 num = classify_argument (TYPE_MODE (type), type,
6210 subclasses,
6211 (int_bit_position (field)
6212 + bit_offset) % 256);
6213 if (!num)
6214 return 0;
6215 pos = (int_bit_position (field)
6216 + (bit_offset % 64)) / 8 / 8;
6217 for (i = 0; i < num && (i + pos) < words; i++)
6218 classes[i + pos] =
6219 merge_classes (subclasses[i], classes[i + pos]);
6220 }
6221 }
6222 }
6223 break;
6224
6225 case ARRAY_TYPE:
6226 /* Arrays are handled as small records. */
6227 {
6228 int num;
6229 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6230 TREE_TYPE (type), subclasses, bit_offset);
6231 if (!num)
6232 return 0;
6233
6234 /* The partial classes are now full classes. */
6235 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6236 subclasses[0] = X86_64_SSE_CLASS;
6237 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6238 && !((bit_offset % 64) == 0 && bytes == 4))
6239 subclasses[0] = X86_64_INTEGER_CLASS;
6240
6241 for (i = 0; i < words; i++)
6242 classes[i] = subclasses[i % num];
6243
6244 break;
6245 }
6246 case UNION_TYPE:
6247 case QUAL_UNION_TYPE:
6248 /* Unions are similar to RECORD_TYPE but offset is always 0.
6249 */
6250 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6251 {
6252 if (TREE_CODE (field) == FIELD_DECL)
6253 {
6254 int num;
6255
6256 if (TREE_TYPE (field) == error_mark_node)
6257 continue;
6258
6259 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6260 TREE_TYPE (field), subclasses,
6261 bit_offset);
6262 if (!num)
6263 return 0;
6264 for (i = 0; i < num; i++)
6265 classes[i] = merge_classes (subclasses[i], classes[i]);
6266 }
6267 }
6268 break;
6269
6270 default:
6271 gcc_unreachable ();
6272 }
6273
6274 if (words > 2)
6275 {
6276 /* When size > 16 bytes, if the first eightbyte isn't
6277 X86_64_SSE_CLASS or any of the remaining ones isn't
6278 X86_64_SSEUP_CLASS, everything should be passed in
6279 memory. */
6280 if (classes[0] != X86_64_SSE_CLASS)
6281 return 0;
6282
6283 for (i = 1; i < words; i++)
6284 if (classes[i] != X86_64_SSEUP_CLASS)
6285 return 0;
6286 }
6287
6288 /* Final merger cleanup. */
6289 for (i = 0; i < words; i++)
6290 {
6291 /* If one class is MEMORY, everything should be passed in
6292 memory. */
6293 if (classes[i] == X86_64_MEMORY_CLASS)
6294 return 0;
6295
6296 /* The X86_64_SSEUP_CLASS should be always preceded by
6297 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6298 if (classes[i] == X86_64_SSEUP_CLASS
6299 && classes[i - 1] != X86_64_SSE_CLASS
6300 && classes[i - 1] != X86_64_SSEUP_CLASS)
6301 {
6302 /* The first one should never be X86_64_SSEUP_CLASS. */
6303 gcc_assert (i != 0);
6304 classes[i] = X86_64_SSE_CLASS;
6305 }
6306
6307 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6308 everything should be passed in memory. */
6309 if (classes[i] == X86_64_X87UP_CLASS
6310 && (classes[i - 1] != X86_64_X87_CLASS))
6311 {
6312 static bool warned;
6313
6314 /* The first one should never be X86_64_X87UP_CLASS. */
6315 gcc_assert (i != 0);
6316 if (!warned && warn_psabi)
6317 {
6318 warned = true;
6319 inform (input_location,
6320 "the ABI of passing union with long double"
6321 " has changed in GCC 4.4");
6322 }
6323 return 0;
6324 }
6325 }
6326 return words;
6327 }
6328
6329 /* Compute alignment needed. We align all types to natural boundaries with
6330 exception of XFmode that is aligned to 64bits. */
6331 if (mode != VOIDmode && mode != BLKmode)
6332 {
6333 int mode_alignment = GET_MODE_BITSIZE (mode);
6334
6335 if (mode == XFmode)
6336 mode_alignment = 128;
6337 else if (mode == XCmode)
6338 mode_alignment = 256;
6339 if (COMPLEX_MODE_P (mode))
6340 mode_alignment /= 2;
6341 /* Misaligned fields are always returned in memory. */
6342 if (bit_offset % mode_alignment)
6343 return 0;
6344 }
6345
6346 /* For V1xx modes, just use the base mode. */
6347 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6348 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6349 mode = GET_MODE_INNER (mode);
6350
6351 /* Classification of atomic types. */
6352 switch (mode)
6353 {
6354 case SDmode:
6355 case DDmode:
6356 classes[0] = X86_64_SSE_CLASS;
6357 return 1;
6358 case TDmode:
6359 classes[0] = X86_64_SSE_CLASS;
6360 classes[1] = X86_64_SSEUP_CLASS;
6361 return 2;
6362 case DImode:
6363 case SImode:
6364 case HImode:
6365 case QImode:
6366 case CSImode:
6367 case CHImode:
6368 case CQImode:
6369 {
6370 int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6371
6372 if (size <= 32)
6373 {
6374 classes[0] = X86_64_INTEGERSI_CLASS;
6375 return 1;
6376 }
6377 else if (size <= 64)
6378 {
6379 classes[0] = X86_64_INTEGER_CLASS;
6380 return 1;
6381 }
6382 else if (size <= 64+32)
6383 {
6384 classes[0] = X86_64_INTEGER_CLASS;
6385 classes[1] = X86_64_INTEGERSI_CLASS;
6386 return 2;
6387 }
6388 else if (size <= 64+64)
6389 {
6390 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6391 return 2;
6392 }
6393 else
6394 gcc_unreachable ();
6395 }
6396 case CDImode:
6397 case TImode:
6398 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6399 return 2;
6400 case COImode:
6401 case OImode:
6402 /* OImode shouldn't be used directly. */
6403 gcc_unreachable ();
6404 case CTImode:
6405 return 0;
6406 case SFmode:
6407 if (!(bit_offset % 64))
6408 classes[0] = X86_64_SSESF_CLASS;
6409 else
6410 classes[0] = X86_64_SSE_CLASS;
6411 return 1;
6412 case DFmode:
6413 classes[0] = X86_64_SSEDF_CLASS;
6414 return 1;
6415 case XFmode:
6416 classes[0] = X86_64_X87_CLASS;
6417 classes[1] = X86_64_X87UP_CLASS;
6418 return 2;
6419 case TFmode:
6420 classes[0] = X86_64_SSE_CLASS;
6421 classes[1] = X86_64_SSEUP_CLASS;
6422 return 2;
6423 case SCmode:
6424 classes[0] = X86_64_SSE_CLASS;
6425 if (!(bit_offset % 64))
6426 return 1;
6427 else
6428 {
6429 static bool warned;
6430
6431 if (!warned && warn_psabi)
6432 {
6433 warned = true;
6434 inform (input_location,
6435 "the ABI of passing structure with complex float"
6436 " member has changed in GCC 4.4");
6437 }
6438 classes[1] = X86_64_SSESF_CLASS;
6439 return 2;
6440 }
6441 case DCmode:
6442 classes[0] = X86_64_SSEDF_CLASS;
6443 classes[1] = X86_64_SSEDF_CLASS;
6444 return 2;
6445 case XCmode:
6446 classes[0] = X86_64_COMPLEX_X87_CLASS;
6447 return 1;
6448 case TCmode:
6449       /* This mode is larger than 16 bytes.  */
6450 return 0;
6451 case V8SFmode:
6452 case V8SImode:
6453 case V32QImode:
6454 case V16HImode:
6455 case V4DFmode:
6456 case V4DImode:
6457 classes[0] = X86_64_SSE_CLASS;
6458 classes[1] = X86_64_SSEUP_CLASS;
6459 classes[2] = X86_64_SSEUP_CLASS;
6460 classes[3] = X86_64_SSEUP_CLASS;
6461 return 4;
6462 case V4SFmode:
6463 case V4SImode:
6464 case V16QImode:
6465 case V8HImode:
6466 case V2DFmode:
6467 case V2DImode:
6468 classes[0] = X86_64_SSE_CLASS;
6469 classes[1] = X86_64_SSEUP_CLASS;
6470 return 2;
6471 case V1TImode:
6472 case V1DImode:
6473 case V2SFmode:
6474 case V2SImode:
6475 case V4HImode:
6476 case V8QImode:
6477 classes[0] = X86_64_SSE_CLASS;
6478 return 1;
6479 case BLKmode:
6480 case VOIDmode:
6481 return 0;
6482 default:
6483 gcc_assert (VECTOR_MODE_P (mode));
6484
6485 if (bytes > 16)
6486 return 0;
6487
6488 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6489
6490 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6491 classes[0] = X86_64_INTEGERSI_CLASS;
6492 else
6493 classes[0] = X86_64_INTEGER_CLASS;
6494 classes[1] = X86_64_INTEGER_CLASS;
6495 return 1 + (bytes > 8);
6496 }
6497 }
6498
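/* Illustrative sketch (not used by the compiler): the classification above
   follows the x86-64 psABI eightbyte rules.  Under the SysV ABI one would
   expect roughly:

     struct { double x, y; }       -> SSEDF, SSEDF    (two XMM registers)
     struct { long a; double b; }  -> INTEGER, SSEDF  (one GPR + one XMM)
     long double                   -> X87, X87UP      (memory when used as an argument)
     __m256 (with AVX)             -> SSE, SSEUP, SSEUP, SSEUP (one YMM register)

   The exact result still depends on field offsets, alignment and target
   flags; this comment is only a reader's aid.  */
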
6499 /* Examine the argument and set the number of registers required in each
6500    class.  Return 0 iff the parameter should be passed in memory.  */
6501 static int
6502 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6503 int *int_nregs, int *sse_nregs)
6504 {
6505 enum x86_64_reg_class regclass[MAX_CLASSES];
6506 int n = classify_argument (mode, type, regclass, 0);
6507
6508 *int_nregs = 0;
6509 *sse_nregs = 0;
6510 if (!n)
6511 return 0;
6512 for (n--; n >= 0; n--)
6513 switch (regclass[n])
6514 {
6515 case X86_64_INTEGER_CLASS:
6516 case X86_64_INTEGERSI_CLASS:
6517 (*int_nregs)++;
6518 break;
6519 case X86_64_SSE_CLASS:
6520 case X86_64_SSESF_CLASS:
6521 case X86_64_SSEDF_CLASS:
6522 (*sse_nregs)++;
6523 break;
6524 case X86_64_NO_CLASS:
6525 case X86_64_SSEUP_CLASS:
6526 break;
6527 case X86_64_X87_CLASS:
6528 case X86_64_X87UP_CLASS:
6529 if (!in_return)
6530 return 0;
6531 break;
6532 case X86_64_COMPLEX_X87_CLASS:
6533 return in_return ? 2 : 0;
6534 case X86_64_MEMORY_CLASS:
6535 gcc_unreachable ();
6536 }
6537 return 1;
6538 }
6539
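/* Reader's note (illustration only): for a SysV argument such as
   struct { long a; double b; }, classify_argument yields
   { INTEGER, SSEDF }, so examine_argument would be expected to set
   *int_nregs = 1 and *sse_nregs = 1 and return nonzero.  X87 classes make
   it return 0 for arguments, since long double arguments always live in
   memory, but are accepted for return values.  */
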
6540 /* Construct container for the argument used by GCC interface. See
6541 FUNCTION_ARG for the detailed description. */
6542
6543 static rtx
6544 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6545 const_tree type, int in_return, int nintregs, int nsseregs,
6546 const int *intreg, int sse_regno)
6547 {
6548 /* The following variables hold the static issued_error state. */
6549 static bool issued_sse_arg_error;
6550 static bool issued_sse_ret_error;
6551 static bool issued_x87_ret_error;
6552
6553 enum machine_mode tmpmode;
6554 int bytes =
6555 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6556 enum x86_64_reg_class regclass[MAX_CLASSES];
6557 int n;
6558 int i;
6559 int nexps = 0;
6560 int needed_sseregs, needed_intregs;
6561 rtx exp[MAX_CLASSES];
6562 rtx ret;
6563
6564 n = classify_argument (mode, type, regclass, 0);
6565 if (!n)
6566 return NULL;
6567 if (!examine_argument (mode, type, in_return, &needed_intregs,
6568 &needed_sseregs))
6569 return NULL;
6570 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6571 return NULL;
6572
6573 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6574 some less clueful developer tries to use floating-point anyway. */
6575 if (needed_sseregs && !TARGET_SSE)
6576 {
6577 if (in_return)
6578 {
6579 if (!issued_sse_ret_error)
6580 {
6581 error ("SSE register return with SSE disabled");
6582 issued_sse_ret_error = true;
6583 }
6584 }
6585 else if (!issued_sse_arg_error)
6586 {
6587 error ("SSE register argument with SSE disabled");
6588 issued_sse_arg_error = true;
6589 }
6590 return NULL;
6591 }
6592
6593 /* Likewise, error if the ABI requires us to return values in the
6594 x87 registers and the user specified -mno-80387. */
6595 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
6596 for (i = 0; i < n; i++)
6597 if (regclass[i] == X86_64_X87_CLASS
6598 || regclass[i] == X86_64_X87UP_CLASS
6599 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6600 {
6601 if (!issued_x87_ret_error)
6602 {
6603 error ("x87 register return with x87 disabled");
6604 issued_x87_ret_error = true;
6605 }
6606 return NULL;
6607 }
6608
6609   /* First construct simple cases.  Avoid SCmode, since we want to use
6610      a single register to pass this type.  */
6611 if (n == 1 && mode != SCmode)
6612 switch (regclass[0])
6613 {
6614 case X86_64_INTEGER_CLASS:
6615 case X86_64_INTEGERSI_CLASS:
6616 return gen_rtx_REG (mode, intreg[0]);
6617 case X86_64_SSE_CLASS:
6618 case X86_64_SSESF_CLASS:
6619 case X86_64_SSEDF_CLASS:
6620 if (mode != BLKmode)
6621 return gen_reg_or_parallel (mode, orig_mode,
6622 SSE_REGNO (sse_regno));
6623 break;
6624 case X86_64_X87_CLASS:
6625 case X86_64_COMPLEX_X87_CLASS:
6626 return gen_rtx_REG (mode, FIRST_STACK_REG);
6627 case X86_64_NO_CLASS:
6628 /* Zero sized array, struct or class. */
6629 return NULL;
6630 default:
6631 gcc_unreachable ();
6632 }
6633 if (n == 2
6634 && regclass[0] == X86_64_SSE_CLASS
6635 && regclass[1] == X86_64_SSEUP_CLASS
6636 && mode != BLKmode)
6637 return gen_reg_or_parallel (mode, orig_mode,
6638 SSE_REGNO (sse_regno));
6639 if (n == 4
6640 && regclass[0] == X86_64_SSE_CLASS
6641 && regclass[1] == X86_64_SSEUP_CLASS
6642 && regclass[2] == X86_64_SSEUP_CLASS
6643 && regclass[3] == X86_64_SSEUP_CLASS
6644 && mode != BLKmode)
6645 return gen_reg_or_parallel (mode, orig_mode,
6646 SSE_REGNO (sse_regno));
6647 if (n == 2
6648 && regclass[0] == X86_64_X87_CLASS
6649 && regclass[1] == X86_64_X87UP_CLASS)
6650 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6651
6652 if (n == 2
6653 && regclass[0] == X86_64_INTEGER_CLASS
6654 && regclass[1] == X86_64_INTEGER_CLASS
6655 && (mode == CDImode || mode == TImode || mode == TFmode)
6656 && intreg[0] + 1 == intreg[1])
6657 return gen_rtx_REG (mode, intreg[0]);
6658
6659 /* Otherwise figure out the entries of the PARALLEL. */
6660 for (i = 0; i < n; i++)
6661 {
6662 int pos;
6663
6664 switch (regclass[i])
6665 {
6666 case X86_64_NO_CLASS:
6667 break;
6668 case X86_64_INTEGER_CLASS:
6669 case X86_64_INTEGERSI_CLASS:
6670 /* Merge TImodes on aligned occasions here too. */
6671 if (i * 8 + 8 > bytes)
6672 tmpmode
6673 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6674 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6675 tmpmode = SImode;
6676 else
6677 tmpmode = DImode;
6678 	  /* We've requested 24 bytes for which we
6679 	     don't have a mode.  Use DImode.  */
6680 if (tmpmode == BLKmode)
6681 tmpmode = DImode;
6682 exp [nexps++]
6683 = gen_rtx_EXPR_LIST (VOIDmode,
6684 gen_rtx_REG (tmpmode, *intreg),
6685 GEN_INT (i*8));
6686 intreg++;
6687 break;
6688 case X86_64_SSESF_CLASS:
6689 exp [nexps++]
6690 = gen_rtx_EXPR_LIST (VOIDmode,
6691 gen_rtx_REG (SFmode,
6692 SSE_REGNO (sse_regno)),
6693 GEN_INT (i*8));
6694 sse_regno++;
6695 break;
6696 case X86_64_SSEDF_CLASS:
6697 exp [nexps++]
6698 = gen_rtx_EXPR_LIST (VOIDmode,
6699 gen_rtx_REG (DFmode,
6700 SSE_REGNO (sse_regno)),
6701 GEN_INT (i*8));
6702 sse_regno++;
6703 break;
6704 case X86_64_SSE_CLASS:
6705 pos = i;
6706 switch (n)
6707 {
6708 case 1:
6709 tmpmode = DImode;
6710 break;
6711 case 2:
6712 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6713 {
6714 tmpmode = TImode;
6715 i++;
6716 }
6717 else
6718 tmpmode = DImode;
6719 break;
6720 case 4:
6721 gcc_assert (i == 0
6722 && regclass[1] == X86_64_SSEUP_CLASS
6723 && regclass[2] == X86_64_SSEUP_CLASS
6724 && regclass[3] == X86_64_SSEUP_CLASS);
6725 tmpmode = OImode;
6726 i += 3;
6727 break;
6728 default:
6729 gcc_unreachable ();
6730 }
6731 exp [nexps++]
6732 = gen_rtx_EXPR_LIST (VOIDmode,
6733 gen_rtx_REG (tmpmode,
6734 SSE_REGNO (sse_regno)),
6735 GEN_INT (pos*8));
6736 sse_regno++;
6737 break;
6738 default:
6739 gcc_unreachable ();
6740 }
6741 }
6742
6743 /* Empty aligned struct, union or class. */
6744 if (nexps == 0)
6745 return NULL;
6746
6747 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6748 for (i = 0; i < nexps; i++)
6749 XVECEXP (ret, 0, i) = exp [i];
6750 return ret;
6751 }
6752
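/* Reader's sketch of what construct_container builds (illustration only):
   for the SysV argument  struct { long a; double b; }  passed while both
   register classes still have free registers, the result is roughly

     (parallel [(expr_list (reg:DI rdi)  (const_int 0))
                (expr_list (reg:DF xmm0) (const_int 8))])

   i.e. the first eightbyte in an integer register and the second in an SSE
   register, each tagged with its byte offset.  The actual register numbers
   depend on how many arguments precede this one.  */
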
6753 /* Update the data in CUM to advance over an argument of mode MODE
6754 and data type TYPE. (TYPE is null for libcalls where that information
6755 may not be available.) */
6756
6757 static void
6758 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6759 const_tree type, HOST_WIDE_INT bytes,
6760 HOST_WIDE_INT words)
6761 {
6762 switch (mode)
6763 {
6764 default:
6765 break;
6766
6767 case BLKmode:
6768 if (bytes < 0)
6769 break;
6770 /* FALLTHRU */
6771
6772 case DImode:
6773 case SImode:
6774 case HImode:
6775 case QImode:
6776 cum->words += words;
6777 cum->nregs -= words;
6778 cum->regno += words;
6779
6780 if (cum->nregs <= 0)
6781 {
6782 cum->nregs = 0;
6783 cum->regno = 0;
6784 }
6785 break;
6786
6787 case OImode:
6788 /* OImode shouldn't be used directly. */
6789 gcc_unreachable ();
6790
6791 case DFmode:
6792 if (cum->float_in_sse < 2)
6793 break;
6794 case SFmode:
6795 if (cum->float_in_sse < 1)
6796 break;
6797 /* FALLTHRU */
6798
6799 case V8SFmode:
6800 case V8SImode:
6801 case V32QImode:
6802 case V16HImode:
6803 case V4DFmode:
6804 case V4DImode:
6805 case TImode:
6806 case V16QImode:
6807 case V8HImode:
6808 case V4SImode:
6809 case V2DImode:
6810 case V4SFmode:
6811 case V2DFmode:
6812 if (!type || !AGGREGATE_TYPE_P (type))
6813 {
6814 cum->sse_words += words;
6815 cum->sse_nregs -= 1;
6816 cum->sse_regno += 1;
6817 if (cum->sse_nregs <= 0)
6818 {
6819 cum->sse_nregs = 0;
6820 cum->sse_regno = 0;
6821 }
6822 }
6823 break;
6824
6825 case V8QImode:
6826 case V4HImode:
6827 case V2SImode:
6828 case V2SFmode:
6829 case V1TImode:
6830 case V1DImode:
6831 if (!type || !AGGREGATE_TYPE_P (type))
6832 {
6833 cum->mmx_words += words;
6834 cum->mmx_nregs -= 1;
6835 cum->mmx_regno += 1;
6836 if (cum->mmx_nregs <= 0)
6837 {
6838 cum->mmx_nregs = 0;
6839 cum->mmx_regno = 0;
6840 }
6841 }
6842 break;
6843 }
6844 }
6845
6846 static void
6847 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6848 const_tree type, HOST_WIDE_INT words, bool named)
6849 {
6850 int int_nregs, sse_nregs;
6851
6852 /* Unnamed 256bit vector mode parameters are passed on stack. */
6853 if (!named && VALID_AVX256_REG_MODE (mode))
6854 return;
6855
6856 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6857 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6858 {
6859 cum->nregs -= int_nregs;
6860 cum->sse_nregs -= sse_nregs;
6861 cum->regno += int_nregs;
6862 cum->sse_regno += sse_nregs;
6863 }
6864 else
6865 {
6866 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6867 cum->words = (cum->words + align - 1) & ~(align - 1);
6868 cum->words += words;
6869 }
6870 }
6871
6872 static void
6873 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6874 HOST_WIDE_INT words)
6875 {
6876 /* Otherwise, this should be passed indirect. */
6877 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6878
6879 cum->words += words;
6880 if (cum->nregs > 0)
6881 {
6882 cum->nregs -= 1;
6883 cum->regno += 1;
6884 }
6885 }
6886
6887 /* Update the data in CUM to advance over an argument of mode MODE and
6888 data type TYPE. (TYPE is null for libcalls where that information
6889 may not be available.) */
6890
6891 static void
6892 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6893 const_tree type, bool named)
6894 {
6895 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6896 HOST_WIDE_INT bytes, words;
6897
6898 if (mode == BLKmode)
6899 bytes = int_size_in_bytes (type);
6900 else
6901 bytes = GET_MODE_SIZE (mode);
6902 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6903
6904 if (type)
6905 mode = type_natural_mode (type, NULL);
6906
6907 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6908 function_arg_advance_ms_64 (cum, bytes, words);
6909 else if (TARGET_64BIT)
6910 function_arg_advance_64 (cum, mode, type, words, named);
6911 else
6912 function_arg_advance_32 (cum, mode, type, bytes, words);
6913 }
6914
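/* Reader's note (illustration only): CUMULATIVE_ARGS walks the argument
   list.  For the SysV prototype  void f (int a, double b, __int128 c)  one
   would expect the advance functions above to consume one GPR for A, one
   XMM register for B and two GPRs for C, leaving cum->regno == 3 and
   cum->sse_regno == 1.  The exact bookkeeping depends on target flags.  */
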
6915 /* Define where to put the arguments to a function.
6916 Value is zero to push the argument on the stack,
6917 or a hard register in which to store the argument.
6918
6919 MODE is the argument's machine mode.
6920 TYPE is the data type of the argument (as a tree).
6921 This is null for libcalls where that information may
6922 not be available.
6923 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6924 the preceding args and about the function being called.
6925 NAMED is nonzero if this argument is a named parameter
6926 (otherwise it is an extra parameter matching an ellipsis). */
6927
6928 static rtx
6929 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6930 enum machine_mode orig_mode, const_tree type,
6931 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6932 {
6933 static bool warnedsse, warnedmmx;
6934
6935 /* Avoid the AL settings for the Unix64 ABI. */
6936 if (mode == VOIDmode)
6937 return constm1_rtx;
6938
6939 switch (mode)
6940 {
6941 default:
6942 break;
6943
6944 case BLKmode:
6945 if (bytes < 0)
6946 break;
6947 /* FALLTHRU */
6948 case DImode:
6949 case SImode:
6950 case HImode:
6951 case QImode:
6952 if (words <= cum->nregs)
6953 {
6954 int regno = cum->regno;
6955
6956 	  /* Fastcall allocates the first two DWORD (SImode) or
6957 	     smaller arguments to ECX and EDX if the argument is
6958 	     not an aggregate type.  */
6959 if (cum->fastcall)
6960 {
6961 if (mode == BLKmode
6962 || mode == DImode
6963 || (type && AGGREGATE_TYPE_P (type)))
6964 break;
6965
6966 	      /* ECX, not EAX, is the first allocated register.  */
6967 if (regno == AX_REG)
6968 regno = CX_REG;
6969 }
6970 return gen_rtx_REG (mode, regno);
6971 }
6972 break;
6973
6974 case DFmode:
6975 if (cum->float_in_sse < 2)
6976 break;
6977 case SFmode:
6978 if (cum->float_in_sse < 1)
6979 break;
6980 /* FALLTHRU */
6981 case TImode:
6982 /* In 32bit, we pass TImode in xmm registers. */
6983 case V16QImode:
6984 case V8HImode:
6985 case V4SImode:
6986 case V2DImode:
6987 case V4SFmode:
6988 case V2DFmode:
6989 if (!type || !AGGREGATE_TYPE_P (type))
6990 {
6991 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6992 {
6993 warnedsse = true;
6994 warning (0, "SSE vector argument without SSE enabled "
6995 "changes the ABI");
6996 }
6997 if (cum->sse_nregs)
6998 return gen_reg_or_parallel (mode, orig_mode,
6999 cum->sse_regno + FIRST_SSE_REG);
7000 }
7001 break;
7002
7003 case OImode:
7004 /* OImode shouldn't be used directly. */
7005 gcc_unreachable ();
7006
7007 case V8SFmode:
7008 case V8SImode:
7009 case V32QImode:
7010 case V16HImode:
7011 case V4DFmode:
7012 case V4DImode:
7013 if (!type || !AGGREGATE_TYPE_P (type))
7014 {
7015 if (cum->sse_nregs)
7016 return gen_reg_or_parallel (mode, orig_mode,
7017 cum->sse_regno + FIRST_SSE_REG);
7018 }
7019 break;
7020
7021 case V8QImode:
7022 case V4HImode:
7023 case V2SImode:
7024 case V2SFmode:
7025 case V1TImode:
7026 case V1DImode:
7027 if (!type || !AGGREGATE_TYPE_P (type))
7028 {
7029 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
7030 {
7031 warnedmmx = true;
7032 warning (0, "MMX vector argument without MMX enabled "
7033 "changes the ABI");
7034 }
7035 if (cum->mmx_nregs)
7036 return gen_reg_or_parallel (mode, orig_mode,
7037 cum->mmx_regno + FIRST_MMX_REG);
7038 }
7039 break;
7040 }
7041
7042 return NULL_RTX;
7043 }
7044
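/* Reader's example for the 32-bit path above (illustration only): with

     __attribute__((fastcall)) int f (int a, int b, int c);

   the first two integer arguments are expected to land in ECX and EDX and
   the third on the stack, while plain cdecl passes everything on the stack
   unless regparm (or -mregparm) makes integer registers available.  */
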
7045 static rtx
7046 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7047 enum machine_mode orig_mode, const_tree type, bool named)
7048 {
7049 /* Handle a hidden AL argument containing number of registers
7050 for varargs x86-64 functions. */
7051 if (mode == VOIDmode)
7052 return GEN_INT (cum->maybe_vaarg
7053 ? (cum->sse_nregs < 0
7054 ? X86_64_SSE_REGPARM_MAX
7055 : cum->sse_regno)
7056 : -1);
7057
7058 switch (mode)
7059 {
7060 default:
7061 break;
7062
7063 case V8SFmode:
7064 case V8SImode:
7065 case V32QImode:
7066 case V16HImode:
7067 case V4DFmode:
7068 case V4DImode:
7069 /* Unnamed 256bit vector mode parameters are passed on stack. */
7070 if (!named)
7071 return NULL;
7072 break;
7073 }
7074
7075 return construct_container (mode, orig_mode, type, 0, cum->nregs,
7076 cum->sse_nregs,
7077 &x86_64_int_parameter_registers [cum->regno],
7078 cum->sse_regno);
7079 }
7080
7081 static rtx
7082 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7083 enum machine_mode orig_mode, bool named,
7084 HOST_WIDE_INT bytes)
7085 {
7086 unsigned int regno;
7087
7088   /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
7089      We use a value of -2 to specify that the current function call is MS_ABI.  */
7090 if (mode == VOIDmode)
7091 return GEN_INT (-2);
7092
7093 /* If we've run out of registers, it goes on the stack. */
7094 if (cum->nregs == 0)
7095 return NULL_RTX;
7096
7097 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
7098
7099 /* Only floating point modes are passed in anything but integer regs. */
7100 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7101 {
7102 if (named)
7103 regno = cum->regno + FIRST_SSE_REG;
7104 else
7105 {
7106 rtx t1, t2;
7107
7108 /* Unnamed floating parameters are passed in both the
7109 SSE and integer registers. */
7110 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7111 t2 = gen_rtx_REG (mode, regno);
7112 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7113 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7114 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7115 }
7116 }
7117   /* Handle aggregate types passed in registers.  */
7118 if (orig_mode == BLKmode)
7119 {
7120 if (bytes > 0 && bytes <= 8)
7121 mode = (bytes > 4 ? DImode : SImode);
7122 if (mode == BLKmode)
7123 mode = DImode;
7124 }
7125
7126 return gen_reg_or_parallel (mode, orig_mode, regno);
7127 }
7128
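/* Reader's note on the MS ABI path above: the Microsoft x64 convention uses
   the fixed slots RCX, RDX, R8, R9 (or XMM0-XMM3 for named SFmode/DFmode
   arguments), one slot per argument regardless of class, and aggregates
   whose size is not 1, 2, 4 or 8 bytes are passed by reference instead.  */
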
7129 /* Return where to put the arguments to a function.
7130 Return zero to push the argument on the stack, or a hard register in which to store the argument.
7131
7132 MODE is the argument's machine mode. TYPE is the data type of the
7133 argument. It is null for libcalls where that information may not be
7134 available. CUM gives information about the preceding args and about
7135 the function being called. NAMED is nonzero if this argument is a
7136 named parameter (otherwise it is an extra parameter matching an
7137 ellipsis). */
7138
7139 static rtx
7140 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7141 const_tree type, bool named)
7142 {
7143 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7144 enum machine_mode mode = omode;
7145 HOST_WIDE_INT bytes, words;
7146 rtx arg;
7147
7148 if (mode == BLKmode)
7149 bytes = int_size_in_bytes (type);
7150 else
7151 bytes = GET_MODE_SIZE (mode);
7152 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7153
7154 /* To simplify the code below, represent vector types with a vector mode
7155 even if MMX/SSE are not active. */
7156 if (type && TREE_CODE (type) == VECTOR_TYPE)
7157 mode = type_natural_mode (type, cum);
7158
7159 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7160 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7161 else if (TARGET_64BIT)
7162 arg = function_arg_64 (cum, mode, omode, type, named);
7163 else
7164 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7165
7166 return arg;
7167 }
7168
7169 /* A C expression that indicates when an argument must be passed by
7170 reference. If nonzero for an argument, a copy of that argument is
7171 made in memory and a pointer to the argument is passed instead of
7172 the argument itself. The pointer is passed in whatever way is
7173 appropriate for passing a pointer to that type. */
7174
7175 static bool
7176 ix86_pass_by_reference (cumulative_args_t cum_v, enum machine_mode mode,
7177 const_tree type, bool named ATTRIBUTE_UNUSED)
7178 {
7179 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7180
7181 /* See Windows x64 Software Convention. */
7182 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7183 {
7184 int msize = (int) GET_MODE_SIZE (mode);
7185 if (type)
7186 {
7187 /* Arrays are passed by reference. */
7188 if (TREE_CODE (type) == ARRAY_TYPE)
7189 return true;
7190
7191 if (AGGREGATE_TYPE_P (type))
7192 {
7193 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7194 are passed by reference. */
7195 msize = int_size_in_bytes (type);
7196 }
7197 }
7198
7199 /* __m128 is passed by reference. */
7200 switch (msize) {
7201 case 1: case 2: case 4: case 8:
7202 break;
7203 default:
7204 return true;
7205 }
7206 }
7207 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7208 return 1;
7209
7210 return 0;
7211 }
7212
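/* Reader's example (illustration only): under the MS ABI the checks above
   make  struct big { char c[24]; }  and  __m128  pass-by-reference (size
   not in {1, 2, 4, 8}), while under the SysV 64-bit ABI only variable-sized
   types take this path; large SysV aggregates are instead classified as
   MEMORY and copied onto the stack by the caller.  */
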
7213 /* Return true when TYPE should be 128bit aligned for 32bit argument
7214 passing ABI. XXX: This function is obsolete and is only used for
7215 checking psABI compatibility with previous versions of GCC. */
7216
7217 static bool
7218 ix86_compat_aligned_value_p (const_tree type)
7219 {
7220 enum machine_mode mode = TYPE_MODE (type);
7221 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7222 || mode == TDmode
7223 || mode == TFmode
7224 || mode == TCmode)
7225 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7226 return true;
7227 if (TYPE_ALIGN (type) < 128)
7228 return false;
7229
7230 if (AGGREGATE_TYPE_P (type))
7231 {
7232 /* Walk the aggregates recursively. */
7233 switch (TREE_CODE (type))
7234 {
7235 case RECORD_TYPE:
7236 case UNION_TYPE:
7237 case QUAL_UNION_TYPE:
7238 {
7239 tree field;
7240
7241 /* Walk all the structure fields. */
7242 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7243 {
7244 if (TREE_CODE (field) == FIELD_DECL
7245 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7246 return true;
7247 }
7248 break;
7249 }
7250
7251 case ARRAY_TYPE:
7252       /* Just for use if some language passes arrays by value.  */
7253 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7254 return true;
7255 break;
7256
7257 default:
7258 gcc_unreachable ();
7259 }
7260 }
7261 return false;
7262 }
7263
7264 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7265 XXX: This function is obsolete and is only used for checking psABI
7266 compatibility with previous versions of GCC. */
7267
7268 static unsigned int
7269 ix86_compat_function_arg_boundary (enum machine_mode mode,
7270 const_tree type, unsigned int align)
7271 {
7272 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7273 natural boundaries. */
7274 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7275 {
7276 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7277 make an exception for SSE modes since these require 128bit
7278 alignment.
7279
7280 The handling here differs from field_alignment. ICC aligns MMX
7281 arguments to 4 byte boundaries, while structure fields are aligned
7282 to 8 byte boundaries. */
7283 if (!type)
7284 {
7285 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7286 align = PARM_BOUNDARY;
7287 }
7288 else
7289 {
7290 if (!ix86_compat_aligned_value_p (type))
7291 align = PARM_BOUNDARY;
7292 }
7293 }
7294 if (align > BIGGEST_ALIGNMENT)
7295 align = BIGGEST_ALIGNMENT;
7296 return align;
7297 }
7298
7299 /* Return true when TYPE should be 128bit aligned for 32bit argument
7300 passing ABI. */
7301
7302 static bool
7303 ix86_contains_aligned_value_p (const_tree type)
7304 {
7305 enum machine_mode mode = TYPE_MODE (type);
7306
7307 if (mode == XFmode || mode == XCmode)
7308 return false;
7309
7310 if (TYPE_ALIGN (type) < 128)
7311 return false;
7312
7313 if (AGGREGATE_TYPE_P (type))
7314 {
7315 /* Walk the aggregates recursively. */
7316 switch (TREE_CODE (type))
7317 {
7318 case RECORD_TYPE:
7319 case UNION_TYPE:
7320 case QUAL_UNION_TYPE:
7321 {
7322 tree field;
7323
7324 /* Walk all the structure fields. */
7325 for (field = TYPE_FIELDS (type);
7326 field;
7327 field = DECL_CHAIN (field))
7328 {
7329 if (TREE_CODE (field) == FIELD_DECL
7330 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7331 return true;
7332 }
7333 break;
7334 }
7335
7336 case ARRAY_TYPE:
7337       /* Just for use if some language passes arrays by value.  */
7338 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7339 return true;
7340 break;
7341
7342 default:
7343 gcc_unreachable ();
7344 }
7345 }
7346 else
7347 return TYPE_ALIGN (type) >= 128;
7348
7349 return false;
7350 }
7351
7352 /* Gives the alignment boundary, in bits, of an argument with the
7353 specified mode and type. */
7354
7355 static unsigned int
7356 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7357 {
7358 unsigned int align;
7359 if (type)
7360 {
7361       /* Since the main variant type is used for the call, convert
7362 	 the type to its main variant.  */
7363 type = TYPE_MAIN_VARIANT (type);
7364 align = TYPE_ALIGN (type);
7365 }
7366 else
7367 align = GET_MODE_ALIGNMENT (mode);
7368 if (align < PARM_BOUNDARY)
7369 align = PARM_BOUNDARY;
7370 else
7371 {
7372 static bool warned;
7373 unsigned int saved_align = align;
7374
7375 if (!TARGET_64BIT)
7376 {
7377 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7378 if (!type)
7379 {
7380 if (mode == XFmode || mode == XCmode)
7381 align = PARM_BOUNDARY;
7382 }
7383 else if (!ix86_contains_aligned_value_p (type))
7384 align = PARM_BOUNDARY;
7385
7386 if (align < 128)
7387 align = PARM_BOUNDARY;
7388 }
7389
7390 if (warn_psabi
7391 && !warned
7392 && align != ix86_compat_function_arg_boundary (mode, type,
7393 saved_align))
7394 {
7395 warned = true;
7396 inform (input_location,
7397 "The ABI for passing parameters with %d-byte"
7398 " alignment has changed in GCC 4.6",
7399 align / BITS_PER_UNIT);
7400 }
7401 }
7402
7403 return align;
7404 }
7405
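/* Reader's example (illustration only): on 32-bit targets the boundary
   computed above is normally PARM_BOUNDARY (32 bits), but an argument of
   type __m128 or _Decimal128 is expected to come back as 128 bits; on
   64-bit targets a plain double stays at 64 bits while __m256 yields
   256 bits.  */
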
7406 /* Return true if N is a possible register number of function value. */
7407
7408 static bool
7409 ix86_function_value_regno_p (const unsigned int regno)
7410 {
7411 switch (regno)
7412 {
7413 case AX_REG:
7414 case DX_REG:
7415 return true;
7416 case DI_REG:
7417 case SI_REG:
7418 return TARGET_64BIT && ix86_abi != MS_ABI;
7419
7420 /* Complex values are returned in %st(0)/%st(1) pair. */
7421 case ST0_REG:
7422 case ST1_REG:
7423 /* TODO: The function should depend on current function ABI but
7424 builtins.c would need updating then. Therefore we use the
7425 default ABI. */
7426 if (TARGET_64BIT && ix86_abi == MS_ABI)
7427 return false;
7428 return TARGET_FLOAT_RETURNS_IN_80387;
7429
7430 /* Complex values are returned in %xmm0/%xmm1 pair. */
7431 case XMM0_REG:
7432 case XMM1_REG:
7433 return TARGET_SSE;
7434
7435 case MM0_REG:
7436 if (TARGET_MACHO || TARGET_64BIT)
7437 return false;
7438 return TARGET_MMX;
7439 }
7440
7441 return false;
7442 }
7443
7444 /* Define how to find the value returned by a function.
7445 VALTYPE is the data type of the value (as a tree).
7446 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7447 otherwise, FUNC is 0. */
7448
7449 static rtx
7450 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7451 const_tree fntype, const_tree fn)
7452 {
7453 unsigned int regno;
7454
7455 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7456 we normally prevent this case when mmx is not available. However
7457 some ABIs may require the result to be returned like DImode. */
7458 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7459 regno = FIRST_MMX_REG;
7460
7461 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7462 we prevent this case when sse is not available. However some ABIs
7463 may require the result to be returned like integer TImode. */
7464 else if (mode == TImode
7465 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7466 regno = FIRST_SSE_REG;
7467
7468 /* 32-byte vector modes in %ymm0. */
7469 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7470 regno = FIRST_SSE_REG;
7471
7472 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7473 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7474 regno = FIRST_FLOAT_REG;
7475 else
7476 /* Most things go in %eax. */
7477 regno = AX_REG;
7478
7479 /* Override FP return register with %xmm0 for local functions when
7480 SSE math is enabled or for functions with sseregparm attribute. */
7481 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7482 {
7483 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7484 if ((sse_level >= 1 && mode == SFmode)
7485 || (sse_level == 2 && mode == DFmode))
7486 regno = FIRST_SSE_REG;
7487 }
7488
7489 /* OImode shouldn't be used directly. */
7490 gcc_assert (mode != OImode);
7491
7492 return gen_rtx_REG (orig_mode, regno);
7493 }
7494
7495 static rtx
7496 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7497 const_tree valtype)
7498 {
7499 rtx ret;
7500
7501 /* Handle libcalls, which don't provide a type node. */
7502 if (valtype == NULL)
7503 {
7504 unsigned int regno;
7505
7506 switch (mode)
7507 {
7508 case SFmode:
7509 case SCmode:
7510 case DFmode:
7511 case DCmode:
7512 case TFmode:
7513 case SDmode:
7514 case DDmode:
7515 case TDmode:
7516 regno = FIRST_SSE_REG;
7517 break;
7518 case XFmode:
7519 case XCmode:
7520 regno = FIRST_FLOAT_REG;
7521 break;
7522 case TCmode:
7523 return NULL;
7524 default:
7525 regno = AX_REG;
7526 }
7527
7528 return gen_rtx_REG (mode, regno);
7529 }
7530 else if (POINTER_TYPE_P (valtype))
7531 {
7532 /* Pointers are always returned in word_mode. */
7533 mode = word_mode;
7534 }
7535
7536 ret = construct_container (mode, orig_mode, valtype, 1,
7537 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7538 x86_64_int_return_registers, 0);
7539
7540   /* For zero sized structures, construct_container returns NULL, but we
7541      need to keep the rest of the compiler happy by returning a meaningful value.  */
7542 if (!ret)
7543 ret = gen_rtx_REG (orig_mode, AX_REG);
7544
7545 return ret;
7546 }
7547
7548 static rtx
7549 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7550 const_tree valtype)
7551 {
7552 unsigned int regno = AX_REG;
7553
7554 if (TARGET_SSE)
7555 {
7556 switch (GET_MODE_SIZE (mode))
7557 {
7558 case 16:
7559 if (valtype != NULL_TREE
7560 	    && !VECTOR_INTEGER_TYPE_P (valtype)
7562 && !INTEGRAL_TYPE_P (valtype)
7563 && !VECTOR_FLOAT_TYPE_P (valtype))
7564 break;
7565 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7566 && !COMPLEX_MODE_P (mode))
7567 regno = FIRST_SSE_REG;
7568 break;
7569 case 8:
7570 case 4:
7571 if (mode == SFmode || mode == DFmode)
7572 regno = FIRST_SSE_REG;
7573 break;
7574 default:
7575 break;
7576 }
7577 }
7578 return gen_rtx_REG (orig_mode, regno);
7579 }
7580
7581 static rtx
7582 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7583 enum machine_mode orig_mode, enum machine_mode mode)
7584 {
7585 const_tree fn, fntype;
7586
7587 fn = NULL_TREE;
7588 if (fntype_or_decl && DECL_P (fntype_or_decl))
7589 fn = fntype_or_decl;
7590 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7591
7592 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7593 return function_value_ms_64 (orig_mode, mode, valtype);
7594 else if (TARGET_64BIT)
7595 return function_value_64 (orig_mode, mode, valtype);
7596 else
7597 return function_value_32 (orig_mode, mode, fntype, fn);
7598 }
7599
7600 static rtx
7601 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7602 bool outgoing ATTRIBUTE_UNUSED)
7603 {
7604 enum machine_mode mode, orig_mode;
7605
7606 orig_mode = TYPE_MODE (valtype);
7607 mode = type_natural_mode (valtype, NULL);
7608 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7609 }
7610
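/* Reader's summary of return-value placement (illustrative, SysV defaults):

     int, pointers             -> %eax / %rax
     double (64-bit)           -> %xmm0
     long double               -> %st(0)
     _Complex double (64-bit)  -> %xmm0 / %xmm1
     small structs (64-bit)    -> %rax/%rdx and/or %xmm0/%xmm1 per class

   The 32-bit and MS ABI paths above differ; e.g. 32-bit doubles are
   returned in %st(0) unless SSE math or the sseregparm attribute overrides
   it for local functions.  */
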
7611 /* Pointer function arguments and return values are promoted to
7612 word_mode. */
7613
7614 static enum machine_mode
7615 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7616 int *punsignedp, const_tree fntype,
7617 int for_return)
7618 {
7619 if (type != NULL_TREE && POINTER_TYPE_P (type))
7620 {
7621 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7622 return word_mode;
7623 }
7624 return default_promote_function_mode (type, mode, punsignedp, fntype,
7625 for_return);
7626 }
7627
7628 /* Return true if a structure, union or array with MODE containing FIELD
7629 should be accessed using BLKmode. */
7630
7631 static bool
7632 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7633 {
7634 /* Union with XFmode must be in BLKmode. */
7635 return (mode == XFmode
7636 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7637 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7638 }
7639
7640 rtx
7641 ix86_libcall_value (enum machine_mode mode)
7642 {
7643 return ix86_function_value_1 (NULL, NULL, mode, mode);
7644 }
7645
7646 /* Return true iff type is returned in memory. */
7647
7648 static bool ATTRIBUTE_UNUSED
7649 return_in_memory_32 (const_tree type, enum machine_mode mode)
7650 {
7651 HOST_WIDE_INT size;
7652
7653 if (mode == BLKmode)
7654 return true;
7655
7656 size = int_size_in_bytes (type);
7657
7658 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7659 return false;
7660
7661 if (VECTOR_MODE_P (mode) || mode == TImode)
7662 {
7663 /* User-created vectors small enough to fit in EAX. */
7664 if (size < 8)
7665 return false;
7666
7667 /* MMX/3dNow values are returned in MM0,
7668 	 except when it doesn't exist or the ABI prescribes otherwise.  */
7669 if (size == 8)
7670 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7671
7672 /* SSE values are returned in XMM0, except when it doesn't exist. */
7673 if (size == 16)
7674 return !TARGET_SSE;
7675
7676 /* AVX values are returned in YMM0, except when it doesn't exist. */
7677 if (size == 32)
7678 return !TARGET_AVX;
7679 }
7680
7681 if (mode == XFmode)
7682 return false;
7683
7684 if (size > 12)
7685 return true;
7686
7687 /* OImode shouldn't be used directly. */
7688 gcc_assert (mode != OImode);
7689
7690 return false;
7691 }
7692
7693 static bool ATTRIBUTE_UNUSED
7694 return_in_memory_64 (const_tree type, enum machine_mode mode)
7695 {
7696 int needed_intregs, needed_sseregs;
7697 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7698 }
7699
7700 static bool ATTRIBUTE_UNUSED
7701 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7702 {
7703 HOST_WIDE_INT size = int_size_in_bytes (type);
7704
7705 /* __m128 is returned in xmm0. */
7706 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7707 || VECTOR_FLOAT_TYPE_P (type))
7708 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7709 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7710 return false;
7711
7712   /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes.  */
7713 return size != 1 && size != 2 && size != 4 && size != 8;
7714 }
7715
7716 static bool
7717 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7718 {
7719 #ifdef SUBTARGET_RETURN_IN_MEMORY
7720 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7721 #else
7722 const enum machine_mode mode = type_natural_mode (type, NULL);
7723
7724 if (TARGET_64BIT)
7725 {
7726 if (ix86_function_type_abi (fntype) == MS_ABI)
7727 return return_in_memory_ms_64 (type, mode);
7728 else
7729 return return_in_memory_64 (type, mode);
7730 }
7731 else
7732 return return_in_memory_32 (type, mode);
7733 #endif
7734 }
7735
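/* Reader's example (illustration only): on 32-bit targets a 16-byte
   struct { int a, b, c, d; } is returned in memory through a hidden
   pointer, while on SysV 64-bit the same struct comes back in %rax:%rdx;
   a 32-byte all-integer struct is returned in memory on both.
   MS_AGGREGATE_RETURN and the MS ABI change the small-struct cases.  */
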
7736 /* When returning SSE vector types, we have a choice of either
7737 (1) being abi incompatible with a -march switch, or
7738 (2) generating an error.
7739 Given no good solution, I think the safest thing is one warning.
7740 The user won't be able to use -Werror, but....
7741
7742 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7743 called in response to actually generating a caller or callee that
7744 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7745 via aggregate_value_p for general type probing from tree-ssa. */
7746
7747 static rtx
7748 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7749 {
7750 static bool warnedsse, warnedmmx;
7751
7752 if (!TARGET_64BIT && type)
7753 {
7754 /* Look at the return type of the function, not the function type. */
7755 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7756
7757 if (!TARGET_SSE && !warnedsse)
7758 {
7759 if (mode == TImode
7760 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7761 {
7762 warnedsse = true;
7763 warning (0, "SSE vector return without SSE enabled "
7764 "changes the ABI");
7765 }
7766 }
7767
7768 if (!TARGET_MMX && !warnedmmx)
7769 {
7770 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7771 {
7772 warnedmmx = true;
7773 warning (0, "MMX vector return without MMX enabled "
7774 "changes the ABI");
7775 }
7776 }
7777 }
7778
7779 return NULL;
7780 }
7781
7782 \f
7783 /* Create the va_list data type. */
7784
7785 /* Returns the calling convention specific va_list data type.
7786 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7787
7788 static tree
7789 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7790 {
7791 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7792
7793 /* For i386 we use plain pointer to argument area. */
7794 if (!TARGET_64BIT || abi == MS_ABI)
7795 return build_pointer_type (char_type_node);
7796
7797 record = lang_hooks.types.make_type (RECORD_TYPE);
7798 type_decl = build_decl (BUILTINS_LOCATION,
7799 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7800
7801 f_gpr = build_decl (BUILTINS_LOCATION,
7802 FIELD_DECL, get_identifier ("gp_offset"),
7803 unsigned_type_node);
7804 f_fpr = build_decl (BUILTINS_LOCATION,
7805 FIELD_DECL, get_identifier ("fp_offset"),
7806 unsigned_type_node);
7807 f_ovf = build_decl (BUILTINS_LOCATION,
7808 FIELD_DECL, get_identifier ("overflow_arg_area"),
7809 ptr_type_node);
7810 f_sav = build_decl (BUILTINS_LOCATION,
7811 FIELD_DECL, get_identifier ("reg_save_area"),
7812 ptr_type_node);
7813
7814 va_list_gpr_counter_field = f_gpr;
7815 va_list_fpr_counter_field = f_fpr;
7816
7817 DECL_FIELD_CONTEXT (f_gpr) = record;
7818 DECL_FIELD_CONTEXT (f_fpr) = record;
7819 DECL_FIELD_CONTEXT (f_ovf) = record;
7820 DECL_FIELD_CONTEXT (f_sav) = record;
7821
7822 TYPE_STUB_DECL (record) = type_decl;
7823 TYPE_NAME (record) = type_decl;
7824 TYPE_FIELDS (record) = f_gpr;
7825 DECL_CHAIN (f_gpr) = f_fpr;
7826 DECL_CHAIN (f_fpr) = f_ovf;
7827 DECL_CHAIN (f_ovf) = f_sav;
7828
7829 layout_type (record);
7830
7831 /* The correct type is an array type of one element. */
7832 return build_array_type (record, build_index_type (size_zero_node));
7833 }
7834
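/* The record built above corresponds to the well-known SysV x86-64 va_list
   layout, roughly (comment added for readability; the compiler only
   consumes the tree nodes constructed above):

     typedef struct __va_list_tag {
       unsigned int gp_offset;        /* 0..48, offset into reg_save_area  */
       unsigned int fp_offset;        /* 48..176                           */
       void *overflow_arg_area;       /* stack arguments                   */
       void *reg_save_area;           /* saved GPRs followed by XMM regs   */
     } __builtin_va_list[1];  */
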
7835 /* Setup the builtin va_list data type and for 64-bit the additional
7836 calling convention specific va_list data types. */
7837
7838 static tree
7839 ix86_build_builtin_va_list (void)
7840 {
7841 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7842
7843 /* Initialize abi specific va_list builtin types. */
7844 if (TARGET_64BIT)
7845 {
7846 tree t;
7847 if (ix86_abi == MS_ABI)
7848 {
7849 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7850 if (TREE_CODE (t) != RECORD_TYPE)
7851 t = build_variant_type_copy (t);
7852 sysv_va_list_type_node = t;
7853 }
7854 else
7855 {
7856 t = ret;
7857 if (TREE_CODE (t) != RECORD_TYPE)
7858 t = build_variant_type_copy (t);
7859 sysv_va_list_type_node = t;
7860 }
7861 if (ix86_abi != MS_ABI)
7862 {
7863 t = ix86_build_builtin_va_list_abi (MS_ABI);
7864 if (TREE_CODE (t) != RECORD_TYPE)
7865 t = build_variant_type_copy (t);
7866 ms_va_list_type_node = t;
7867 }
7868 else
7869 {
7870 t = ret;
7871 if (TREE_CODE (t) != RECORD_TYPE)
7872 t = build_variant_type_copy (t);
7873 ms_va_list_type_node = t;
7874 }
7875 }
7876
7877 return ret;
7878 }
7879
7880 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7881
7882 static void
7883 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7884 {
7885 rtx save_area, mem;
7886 alias_set_type set;
7887 int i, max;
7888
7889 /* GPR size of varargs save area. */
7890 if (cfun->va_list_gpr_size)
7891 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7892 else
7893 ix86_varargs_gpr_size = 0;
7894
7895 /* FPR size of varargs save area. We don't need it if we don't pass
7896 anything in SSE registers. */
7897 if (TARGET_SSE && cfun->va_list_fpr_size)
7898 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7899 else
7900 ix86_varargs_fpr_size = 0;
7901
7902 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7903 return;
7904
7905 save_area = frame_pointer_rtx;
7906 set = get_varargs_alias_set ();
7907
7908 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7909 if (max > X86_64_REGPARM_MAX)
7910 max = X86_64_REGPARM_MAX;
7911
7912 for (i = cum->regno; i < max; i++)
7913 {
7914 mem = gen_rtx_MEM (word_mode,
7915 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7916 MEM_NOTRAP_P (mem) = 1;
7917 set_mem_alias_set (mem, set);
7918 emit_move_insn (mem,
7919 gen_rtx_REG (word_mode,
7920 x86_64_int_parameter_registers[i]));
7921 }
7922
7923 if (ix86_varargs_fpr_size)
7924 {
7925 enum machine_mode smode;
7926 rtx label, test;
7927
7928 /* Now emit code to save SSE registers. The AX parameter contains number
7929 of SSE parameter registers used to call this function, though all we
7930 actually check here is the zero/non-zero status. */
7931
7932 label = gen_label_rtx ();
7933 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7934 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7935 label));
7936
7937 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7938 we used movdqa (i.e. TImode) instead? Perhaps even better would
7939 be if we could determine the real mode of the data, via a hook
7940 into pass_stdarg. Ignore all that for now. */
7941 smode = V4SFmode;
7942 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7943 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7944
7945 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7946 if (max > X86_64_SSE_REGPARM_MAX)
7947 max = X86_64_SSE_REGPARM_MAX;
7948
7949 for (i = cum->sse_regno; i < max; ++i)
7950 {
7951 mem = plus_constant (Pmode, save_area,
7952 i * 16 + ix86_varargs_gpr_size);
7953 mem = gen_rtx_MEM (smode, mem);
7954 MEM_NOTRAP_P (mem) = 1;
7955 set_mem_alias_set (mem, set);
7956 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7957
7958 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7959 }
7960
7961 emit_label (label);
7962 }
7963 }
7964
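/* Reader's note: the save area written above therefore has the layout the
   va_list expansion relies on: X86_64_REGPARM_MAX (6) word-sized GPR slots
   first, followed by X86_64_SSE_REGPARM_MAX (8) 16-byte XMM slots, guarded
   by the AL-based check so the XMM stores are skipped when the caller
   passed no vector registers.  */
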
7965 static void
7966 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7967 {
7968 alias_set_type set = get_varargs_alias_set ();
7969 int i;
7970
7971 /* Reset to zero, as there might be a sysv vaarg used
7972 before. */
7973 ix86_varargs_gpr_size = 0;
7974 ix86_varargs_fpr_size = 0;
7975
7976 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7977 {
7978 rtx reg, mem;
7979
7980 mem = gen_rtx_MEM (Pmode,
7981 plus_constant (Pmode, virtual_incoming_args_rtx,
7982 i * UNITS_PER_WORD));
7983 MEM_NOTRAP_P (mem) = 1;
7984 set_mem_alias_set (mem, set);
7985
7986 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7987 emit_move_insn (mem, reg);
7988 }
7989 }
7990
7991 static void
7992 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7993 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7994 int no_rtl)
7995 {
7996 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7997 CUMULATIVE_ARGS next_cum;
7998 tree fntype;
7999
8000 /* This argument doesn't appear to be used anymore. Which is good,
8001 because the old code here didn't suppress rtl generation. */
8002 gcc_assert (!no_rtl);
8003
8004 if (!TARGET_64BIT)
8005 return;
8006
8007 fntype = TREE_TYPE (current_function_decl);
8008
8009 /* For varargs, we do not want to skip the dummy va_dcl argument.
8010 For stdargs, we do want to skip the last named argument. */
8011 next_cum = *cum;
8012 if (stdarg_p (fntype))
8013 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
8014 true);
8015
8016 if (cum->call_abi == MS_ABI)
8017 setup_incoming_varargs_ms_64 (&next_cum);
8018 else
8019 setup_incoming_varargs_64 (&next_cum);
8020 }
8021
8022 /* Checks if TYPE is of kind va_list char *. */
8023
8024 static bool
8025 is_va_list_char_pointer (tree type)
8026 {
8027 tree canonic;
8028
8029 /* For 32-bit it is always true. */
8030 if (!TARGET_64BIT)
8031 return true;
8032 canonic = ix86_canonical_va_list_type (type);
8033 return (canonic == ms_va_list_type_node
8034 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
8035 }
8036
8037 /* Implement va_start. */
8038
8039 static void
8040 ix86_va_start (tree valist, rtx nextarg)
8041 {
8042 HOST_WIDE_INT words, n_gpr, n_fpr;
8043 tree f_gpr, f_fpr, f_ovf, f_sav;
8044 tree gpr, fpr, ovf, sav, t;
8045 tree type;
8046 rtx ovf_rtx;
8047
8048 if (flag_split_stack
8049 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8050 {
8051 unsigned int scratch_regno;
8052
8053 /* When we are splitting the stack, we can't refer to the stack
8054 arguments using internal_arg_pointer, because they may be on
8055 the old stack. The split stack prologue will arrange to
8056 leave a pointer to the old stack arguments in a scratch
8057 register, which we here copy to a pseudo-register. The split
8058 stack prologue can't set the pseudo-register directly because
8059 it (the prologue) runs before any registers have been saved. */
8060
8061 scratch_regno = split_stack_prologue_scratch_regno ();
8062 if (scratch_regno != INVALID_REGNUM)
8063 {
8064 rtx reg, seq;
8065
8066 reg = gen_reg_rtx (Pmode);
8067 cfun->machine->split_stack_varargs_pointer = reg;
8068
8069 start_sequence ();
8070 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
8071 seq = get_insns ();
8072 end_sequence ();
8073
8074 push_topmost_sequence ();
8075 emit_insn_after (seq, entry_of_function ());
8076 pop_topmost_sequence ();
8077 }
8078 }
8079
8080 /* Only 64bit target needs something special. */
8081 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8082 {
8083 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8084 std_expand_builtin_va_start (valist, nextarg);
8085 else
8086 {
8087 rtx va_r, next;
8088
8089 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
8090 next = expand_binop (ptr_mode, add_optab,
8091 cfun->machine->split_stack_varargs_pointer,
8092 crtl->args.arg_offset_rtx,
8093 NULL_RTX, 0, OPTAB_LIB_WIDEN);
8094 convert_move (va_r, next, 0);
8095 }
8096 return;
8097 }
8098
8099 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8100 f_fpr = DECL_CHAIN (f_gpr);
8101 f_ovf = DECL_CHAIN (f_fpr);
8102 f_sav = DECL_CHAIN (f_ovf);
8103
8104 valist = build_simple_mem_ref (valist);
8105 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8106 /* The following should be folded into the MEM_REF offset. */
8107 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8108 f_gpr, NULL_TREE);
8109 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8110 f_fpr, NULL_TREE);
8111 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8112 f_ovf, NULL_TREE);
8113 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8114 f_sav, NULL_TREE);
8115
8116 /* Count number of gp and fp argument registers used. */
8117 words = crtl->args.info.words;
8118 n_gpr = crtl->args.info.regno;
8119 n_fpr = crtl->args.info.sse_regno;
8120
8121 if (cfun->va_list_gpr_size)
8122 {
8123 type = TREE_TYPE (gpr);
8124 t = build2 (MODIFY_EXPR, type,
8125 gpr, build_int_cst (type, n_gpr * 8));
8126 TREE_SIDE_EFFECTS (t) = 1;
8127 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8128 }
8129
8130 if (TARGET_SSE && cfun->va_list_fpr_size)
8131 {
8132 type = TREE_TYPE (fpr);
8133 t = build2 (MODIFY_EXPR, type, fpr,
8134 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8135 TREE_SIDE_EFFECTS (t) = 1;
8136 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8137 }
8138
8139 /* Find the overflow area. */
8140 type = TREE_TYPE (ovf);
8141 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8142 ovf_rtx = crtl->args.internal_arg_pointer;
8143 else
8144 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8145 t = make_tree (type, ovf_rtx);
8146 if (words != 0)
8147 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8148 t = build2 (MODIFY_EXPR, type, ovf, t);
8149 TREE_SIDE_EFFECTS (t) = 1;
8150 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8151
8152 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8153 {
8154       /* Find the register save area.
8155 	 The function prologue saves it right above the stack frame.  */
8156 type = TREE_TYPE (sav);
8157 t = make_tree (type, frame_pointer_rtx);
8158 if (!ix86_varargs_gpr_size)
8159 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8160 t = build2 (MODIFY_EXPR, type, sav, t);
8161 TREE_SIDE_EFFECTS (t) = 1;
8162 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8163 }
8164 }
8165
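/* Reader's sketch of the effect of the expansion above (illustration only)
   for a SysV function  int f (int a, ...)  where one GPR and no SSE
   registers are used by the named arguments:

     ap->gp_offset = 1 * 8;                   /* next free GPR slot       */
     ap->fp_offset = 8 * X86_64_REGPARM_MAX;  /* == 48, first XMM slot    */
     ap->overflow_arg_area = <incoming stack arguments>;
     ap->reg_save_area = <start of the register save area>;

   Actual values come from crtl->args.info.  */
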
8166 /* Implement va_arg. */
8167
8168 static tree
8169 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8170 gimple_seq *post_p)
8171 {
8172 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8173 tree f_gpr, f_fpr, f_ovf, f_sav;
8174 tree gpr, fpr, ovf, sav, t;
8175 int size, rsize;
8176 tree lab_false, lab_over = NULL_TREE;
8177 tree addr, t2;
8178 rtx container;
8179 int indirect_p = 0;
8180 tree ptrtype;
8181 enum machine_mode nat_mode;
8182 unsigned int arg_boundary;
8183
8184 /* Only 64bit target needs something special. */
8185 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8186 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8187
8188 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8189 f_fpr = DECL_CHAIN (f_gpr);
8190 f_ovf = DECL_CHAIN (f_fpr);
8191 f_sav = DECL_CHAIN (f_ovf);
8192
8193 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8194 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8195 valist = build_va_arg_indirect_ref (valist);
8196 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8197 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8198 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8199
8200 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8201 if (indirect_p)
8202 type = build_pointer_type (type);
8203 size = int_size_in_bytes (type);
8204 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8205
8206 nat_mode = type_natural_mode (type, NULL);
8207 switch (nat_mode)
8208 {
8209 case V8SFmode:
8210 case V8SImode:
8211 case V32QImode:
8212 case V16HImode:
8213 case V4DFmode:
8214 case V4DImode:
8215 /* Unnamed 256bit vector mode parameters are passed on stack. */
8216 if (!TARGET_64BIT_MS_ABI)
8217 {
8218 container = NULL;
8219 break;
8220 }
8221
8222 default:
8223 container = construct_container (nat_mode, TYPE_MODE (type),
8224 type, 0, X86_64_REGPARM_MAX,
8225 X86_64_SSE_REGPARM_MAX, intreg,
8226 0);
8227 break;
8228 }
8229
8230 /* Pull the value out of the saved registers. */
8231
8232 addr = create_tmp_var (ptr_type_node, "addr");
8233
8234 if (container)
8235 {
8236 int needed_intregs, needed_sseregs;
8237 bool need_temp;
8238 tree int_addr, sse_addr;
8239
8240 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8241 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8242
8243 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8244
8245 need_temp = (!REG_P (container)
8246 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8247 || TYPE_ALIGN (type) > 128));
8248
8249       /* In case we are passing a structure, verify that it is a consecutive
8250 	 block in the register save area.  If not, we need to do moves.  */
8251 if (!need_temp && !REG_P (container))
8252 {
8253 	  /* Verify that all registers are strictly consecutive.  */
8254 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8255 {
8256 int i;
8257
8258 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8259 {
8260 rtx slot = XVECEXP (container, 0, i);
8261 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8262 || INTVAL (XEXP (slot, 1)) != i * 16)
8263 need_temp = 1;
8264 }
8265 }
8266 else
8267 {
8268 int i;
8269
8270 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8271 {
8272 rtx slot = XVECEXP (container, 0, i);
8273 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8274 || INTVAL (XEXP (slot, 1)) != i * 8)
8275 need_temp = 1;
8276 }
8277 }
8278 }
8279 if (!need_temp)
8280 {
8281 int_addr = addr;
8282 sse_addr = addr;
8283 }
8284 else
8285 {
8286 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8287 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8288 }
8289
8290 /* First ensure that we fit completely in registers. */
8291 if (needed_intregs)
8292 {
8293 t = build_int_cst (TREE_TYPE (gpr),
8294 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8295 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8296 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8297 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8298 gimplify_and_add (t, pre_p);
8299 }
8300 if (needed_sseregs)
8301 {
8302 t = build_int_cst (TREE_TYPE (fpr),
8303 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8304 + X86_64_REGPARM_MAX * 8);
8305 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8306 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8307 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8308 gimplify_and_add (t, pre_p);
8309 }
8310
8311 /* Compute index to start of area used for integer regs. */
8312 if (needed_intregs)
8313 {
8314 /* int_addr = gpr + sav; */
8315 t = fold_build_pointer_plus (sav, gpr);
8316 gimplify_assign (int_addr, t, pre_p);
8317 }
8318 if (needed_sseregs)
8319 {
8320 /* sse_addr = fpr + sav; */
8321 t = fold_build_pointer_plus (sav, fpr);
8322 gimplify_assign (sse_addr, t, pre_p);
8323 }
8324 if (need_temp)
8325 {
8326 int i, prev_size = 0;
8327 tree temp = create_tmp_var (type, "va_arg_tmp");
8328
8329 /* addr = &temp; */
8330 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8331 gimplify_assign (addr, t, pre_p);
8332
8333 for (i = 0; i < XVECLEN (container, 0); i++)
8334 {
8335 rtx slot = XVECEXP (container, 0, i);
8336 rtx reg = XEXP (slot, 0);
8337 enum machine_mode mode = GET_MODE (reg);
8338 tree piece_type;
8339 tree addr_type;
8340 tree daddr_type;
8341 tree src_addr, src;
8342 int src_offset;
8343 tree dest_addr, dest;
8344 int cur_size = GET_MODE_SIZE (mode);
8345
8346 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8347 prev_size = INTVAL (XEXP (slot, 1));
8348 if (prev_size + cur_size > size)
8349 {
8350 cur_size = size - prev_size;
8351 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8352 if (mode == BLKmode)
8353 mode = QImode;
8354 }
8355 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8356 if (mode == GET_MODE (reg))
8357 addr_type = build_pointer_type (piece_type);
8358 else
8359 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8360 true);
8361 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8362 true);
8363
8364 if (SSE_REGNO_P (REGNO (reg)))
8365 {
8366 src_addr = sse_addr;
8367 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8368 }
8369 else
8370 {
8371 src_addr = int_addr;
8372 src_offset = REGNO (reg) * 8;
8373 }
8374 src_addr = fold_convert (addr_type, src_addr);
8375 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8376
8377 dest_addr = fold_convert (daddr_type, addr);
8378 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8379 if (cur_size == GET_MODE_SIZE (mode))
8380 {
8381 src = build_va_arg_indirect_ref (src_addr);
8382 dest = build_va_arg_indirect_ref (dest_addr);
8383
8384 gimplify_assign (dest, src, pre_p);
8385 }
8386 else
8387 {
8388 tree copy
8389 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8390 3, dest_addr, src_addr,
8391 size_int (cur_size));
8392 gimplify_and_add (copy, pre_p);
8393 }
8394 prev_size += cur_size;
8395 }
8396 }
8397
8398 if (needed_intregs)
8399 {
8400 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8401 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8402 gimplify_assign (gpr, t, pre_p);
8403 }
8404
8405 if (needed_sseregs)
8406 {
8407 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8408 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8409 gimplify_assign (fpr, t, pre_p);
8410 }
8411
8412 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8413
8414 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8415 }
8416
8417 /* ... otherwise out of the overflow area. */
8418
8419 /* When we align a parameter on the stack for the caller, if the
8420 parameter alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will
8421 be aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here
8422 with the caller. */
8423 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8424 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8425 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8426
8427 /* Care for on-stack alignment if needed. */
8428 if (arg_boundary <= 64 || size == 0)
8429 t = ovf;
8430 else
8431 {
8432 HOST_WIDE_INT align = arg_boundary / 8;
8433 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8434 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8435 build_int_cst (TREE_TYPE (t), -align));
8436 }
8437
8438 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8439 gimplify_assign (addr, t, pre_p);
8440
8441 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8442 gimplify_assign (unshare_expr (ovf), t, pre_p);
8443
8444 if (container)
8445 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8446
8447 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8448 addr = fold_convert (ptrtype, addr);
8449
8450 if (indirect_p)
8451 addr = build_va_arg_indirect_ref (addr);
8452 return build_va_arg_indirect_ref (addr);
8453 }
8454 \f
8455 /* Return true if OPNUM's MEM should be matched
8456 in movabs* patterns. */
8457
8458 bool
8459 ix86_check_movabs (rtx insn, int opnum)
8460 {
8461 rtx set, mem;
8462
8463 set = PATTERN (insn);
8464 if (GET_CODE (set) == PARALLEL)
8465 set = XVECEXP (set, 0, 0);
8466 gcc_assert (GET_CODE (set) == SET);
8467 mem = XEXP (set, opnum);
8468 while (GET_CODE (mem) == SUBREG)
8469 mem = SUBREG_REG (mem);
8470 gcc_assert (MEM_P (mem));
8471 return volatile_ok || !MEM_VOLATILE_P (mem);
8472 }
8473 \f
8474 /* Initialize the table of extra 80387 mathematical constants. */
8475
8476 static void
8477 init_ext_80387_constants (void)
8478 {
8479 static const char * cst[5] =
8480 {
8481 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8482 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8483 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8484 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8485 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8486 };
8487 int i;
8488
8489 for (i = 0; i < 5; i++)
8490 {
8491 real_from_string (&ext_80387_constants_table[i], cst[i]);
8492 /* Ensure each constant is rounded to XFmode precision. */
8493 real_convert (&ext_80387_constants_table[i],
8494 XFmode, &ext_80387_constants_table[i]);
8495 }
8496
8497 ext_80387_constants_init = 1;
8498 }
8499
8500 /* Return non-zero if the constant is something that
8501 can be loaded with a special instruction. */
8502
8503 int
8504 standard_80387_constant_p (rtx x)
8505 {
8506 enum machine_mode mode = GET_MODE (x);
8507
8508 REAL_VALUE_TYPE r;
8509
8510 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8511 return -1;
8512
8513 if (x == CONST0_RTX (mode))
8514 return 1;
8515 if (x == CONST1_RTX (mode))
8516 return 2;
8517
8518 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8519
8520 /* For XFmode constants, try to find a special 80387 instruction when
8521 optimizing for size or on those CPUs that benefit from them. */
8522 if (mode == XFmode
8523 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8524 {
8525 int i;
8526
8527 if (! ext_80387_constants_init)
8528 init_ext_80387_constants ();
8529
8530 for (i = 0; i < 5; i++)
8531 if (real_identical (&r, &ext_80387_constants_table[i]))
8532 return i + 3;
8533 }
8534
8535 /* A load of the constant -0.0 or -1.0 will be split into an
8536 fldz;fchs or fld1;fchs sequence. */
8537 if (real_isnegzero (&r))
8538 return 8;
8539 if (real_identical (&r, &dconstm1))
8540 return 9;
8541
8542 return 0;
8543 }
8544
8545 /* Return the opcode of the special instruction to be used to load
8546 the constant X. */
8547
8548 const char *
8549 standard_80387_constant_opcode (rtx x)
8550 {
8551 switch (standard_80387_constant_p (x))
8552 {
8553 case 1:
8554 return "fldz";
8555 case 2:
8556 return "fld1";
8557 case 3:
8558 return "fldlg2";
8559 case 4:
8560 return "fldln2";
8561 case 5:
8562 return "fldl2e";
8563 case 6:
8564 return "fldl2t";
8565 case 7:
8566 return "fldpi";
8567 case 8:
8568 case 9:
8569 return "#";
8570 default:
8571 gcc_unreachable ();
8572 }
8573 }
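/* Illustrative sketch (not part of GCC): an output template would
   typically classify the operand first and then emit the matching
   opcode, e.g.

     if (standard_80387_constant_p (operands[1]) > 0)
       return standard_80387_constant_opcode (operands[1]);
     return "fld%Z1\t%1";	/* hypothetical fallback: load from memory */

   The "#" returned for cases 8 and 9 tells the caller that the insn must
   be split (into fldz;fchs or fld1;fchs) rather than printed directly.  */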
8574
8575 /* Return the CONST_DOUBLE representing the 80387 constant that is
8576 loaded by the specified special instruction. The argument IDX
8577 matches the return value from standard_80387_constant_p. */
8578
8579 rtx
8580 standard_80387_constant_rtx (int idx)
8581 {
8582 int i;
8583
8584 if (! ext_80387_constants_init)
8585 init_ext_80387_constants ();
8586
8587 switch (idx)
8588 {
8589 case 3:
8590 case 4:
8591 case 5:
8592 case 6:
8593 case 7:
8594 i = idx - 3;
8595 break;
8596
8597 default:
8598 gcc_unreachable ();
8599 }
8600
8601 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8602 XFmode);
8603 }
8604
8605 /* Return 1 if X is all 0s and 2 if X is all 1s
8606 in a supported SSE/AVX vector mode. */
8607
8608 int
8609 standard_sse_constant_p (rtx x)
8610 {
8611 enum machine_mode mode = GET_MODE (x);
8612
8613 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8614 return 1;
8615 if (vector_all_ones_operand (x, mode))
8616 switch (mode)
8617 {
8618 case V16QImode:
8619 case V8HImode:
8620 case V4SImode:
8621 case V2DImode:
8622 if (TARGET_SSE2)
8623 return 2;
8624 case V32QImode:
8625 case V16HImode:
8626 case V8SImode:
8627 case V4DImode:
8628 if (TARGET_AVX2)
8629 return 2;
8630 default:
8631 break;
8632 }
8633
8634 return 0;
8635 }
8636
8637 /* Return the opcode of the special instruction to be used to load
8638 the constant X. */
8639
8640 const char *
8641 standard_sse_constant_opcode (rtx insn, rtx x)
8642 {
8643 switch (standard_sse_constant_p (x))
8644 {
8645 case 1:
8646 switch (get_attr_mode (insn))
8647 {
8648 case MODE_TI:
8649 return "%vpxor\t%0, %d0";
8650 case MODE_V2DF:
8651 return "%vxorpd\t%0, %d0";
8652 case MODE_V4SF:
8653 return "%vxorps\t%0, %d0";
8654
8655 case MODE_OI:
8656 return "vpxor\t%x0, %x0, %x0";
8657 case MODE_V4DF:
8658 return "vxorpd\t%x0, %x0, %x0";
8659 case MODE_V8SF:
8660 return "vxorps\t%x0, %x0, %x0";
8661
8662 default:
8663 break;
8664 }
8665
8666 case 2:
8667 if (get_attr_mode (insn) == MODE_XI
8668 || get_attr_mode (insn) == MODE_V8DF
8669 || get_attr_mode (insn) == MODE_V16SF)
8670 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
8671 if (TARGET_AVX)
8672 return "vpcmpeqd\t%0, %0, %0";
8673 else
8674 return "pcmpeqd\t%0, %0";
8675
8676 default:
8677 break;
8678 }
8679 gcc_unreachable ();
8680 }
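/* Illustrative sketch (not part of GCC): for a constant-load pattern the
   two helpers above combine roughly as

     switch (standard_sse_constant_p (operands[1]))
       {
       case 1:	/* all zeros -> xor the destination with itself */
       case 2:	/* all ones  -> compare the destination with itself */
	 return standard_sse_constant_opcode (insn, operands[1]);
       default:	/* anything else is loaded from the constant pool */
	 return "%vmovaps\t{%1, %0|%0, %1}";	/* hypothetical fallback */
       }
*/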
8681
8682 /* Return true if OP contains a symbol reference. */
8683
8684 bool
8685 symbolic_reference_mentioned_p (rtx op)
8686 {
8687 const char *fmt;
8688 int i;
8689
8690 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8691 return true;
8692
8693 fmt = GET_RTX_FORMAT (GET_CODE (op));
8694 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8695 {
8696 if (fmt[i] == 'E')
8697 {
8698 int j;
8699
8700 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8701 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8702 return true;
8703 }
8704
8705 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8706 return true;
8707 }
8708
8709 return false;
8710 }
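/* Example (illustrative only): for an address such as
   (plus:SI (reg:SI 0) (symbol_ref:SI ("foo"))) the walk above recurses
   into the 'e' operands of the PLUS and returns true on the SYMBOL_REF;
   for (plus:SI (reg:SI 0) (const_int 4)) it returns false.  */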
8711
8712 /* Return true if it is appropriate to emit `ret' instructions in the
8713 body of a function. Do this only if the epilogue is simple, needing a
8714 couple of insns. Prior to reloading, we can't tell how many registers
8715 must be saved, so return false then. Return false if there is no frame
8716 marker to de-allocate. */
8717
8718 bool
8719 ix86_can_use_return_insn_p (void)
8720 {
8721 struct ix86_frame frame;
8722
8723 if (! reload_completed || frame_pointer_needed)
8724 return 0;
8725
8726 /* Don't allow more than 32k pop, since that's all we can do
8727 with one instruction. */
8728 if (crtl->args.pops_args && crtl->args.size >= 32768)
8729 return 0;
8730
8731 ix86_compute_frame_layout (&frame);
8732 return (frame.stack_pointer_offset == UNITS_PER_WORD
8733 && (frame.nregs + frame.nsseregs) == 0);
8734 }
8735 \f
8736 /* Value should be nonzero if functions must have frame pointers.
8737 Zero means the frame pointer need not be set up (and parms may
8738 be accessed via the stack pointer) in functions that seem suitable. */
8739
8740 static bool
8741 ix86_frame_pointer_required (void)
8742 {
8743 /* If we accessed previous frames, then the generated code expects
8744 to be able to access the saved ebp value in our frame. */
8745 if (cfun->machine->accesses_prev_frame)
8746 return true;
8747
8748 /* Several x86 OSes need a frame pointer for other reasons,
8749 usually pertaining to setjmp. */
8750 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8751 return true;
8752
8753 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8754 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8755 return true;
8756
8757 /* Win64 SEH: very large frames need a frame pointer, as the maximum
8758 stack allocation is 4GB. */
8759 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8760 return true;
8761
8762 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8763 turns off the frame pointer by default. Turn it back on now if
8764 we've not got a leaf function. */
8765 if (TARGET_OMIT_LEAF_FRAME_POINTER
8766 && (!crtl->is_leaf
8767 || ix86_current_function_calls_tls_descriptor))
8768 return true;
8769
8770 if (crtl->profile && !flag_fentry)
8771 return true;
8772
8773 return false;
8774 }
8775
8776 /* Record that the current function accesses previous call frames. */
8777
8778 void
8779 ix86_setup_frame_addresses (void)
8780 {
8781 cfun->machine->accesses_prev_frame = 1;
8782 }
8783 \f
8784 #ifndef USE_HIDDEN_LINKONCE
8785 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8786 # define USE_HIDDEN_LINKONCE 1
8787 # else
8788 # define USE_HIDDEN_LINKONCE 0
8789 # endif
8790 #endif
8791
8792 static int pic_labels_used;
8793
8794 /* Fills in the label name that should be used for a pc thunk for
8795 the given register. */
8796
8797 static void
8798 get_pc_thunk_name (char name[32], unsigned int regno)
8799 {
8800 gcc_assert (!TARGET_64BIT);
8801
8802 if (USE_HIDDEN_LINKONCE)
8803 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8804 else
8805 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8806 }
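/* Example (illustrative only): with USE_HIDDEN_LINKONCE, regno == BX_REG
   yields the name "__x86.get_pc_thunk.bx"; without it, an internal label
   such as "*LPR3" is generated instead (the exact spelling depends on
   ASM_GENERATE_INTERNAL_LABEL for the target).  */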
8807
8808
8809 /* This function emits the pc thunks used for -fpic: each thunk loads
8810 its register with the return address of the caller and then returns. */
8811
8812 static void
8813 ix86_code_end (void)
8814 {
8815 rtx xops[2];
8816 int regno;
8817
8818 for (regno = AX_REG; regno <= SP_REG; regno++)
8819 {
8820 char name[32];
8821 tree decl;
8822
8823 if (!(pic_labels_used & (1 << regno)))
8824 continue;
8825
8826 get_pc_thunk_name (name, regno);
8827
8828 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8829 get_identifier (name),
8830 build_function_type_list (void_type_node, NULL_TREE));
8831 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8832 NULL_TREE, void_type_node);
8833 TREE_PUBLIC (decl) = 1;
8834 TREE_STATIC (decl) = 1;
8835 DECL_IGNORED_P (decl) = 1;
8836
8837 #if TARGET_MACHO
8838 if (TARGET_MACHO)
8839 {
8840 switch_to_section (darwin_sections[text_coal_section]);
8841 fputs ("\t.weak_definition\t", asm_out_file);
8842 assemble_name (asm_out_file, name);
8843 fputs ("\n\t.private_extern\t", asm_out_file);
8844 assemble_name (asm_out_file, name);
8845 putc ('\n', asm_out_file);
8846 ASM_OUTPUT_LABEL (asm_out_file, name);
8847 DECL_WEAK (decl) = 1;
8848 }
8849 else
8850 #endif
8851 if (USE_HIDDEN_LINKONCE)
8852 {
8853 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8854
8855 targetm.asm_out.unique_section (decl, 0);
8856 switch_to_section (get_named_section (decl, NULL, 0));
8857
8858 targetm.asm_out.globalize_label (asm_out_file, name);
8859 fputs ("\t.hidden\t", asm_out_file);
8860 assemble_name (asm_out_file, name);
8861 putc ('\n', asm_out_file);
8862 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8863 }
8864 else
8865 {
8866 switch_to_section (text_section);
8867 ASM_OUTPUT_LABEL (asm_out_file, name);
8868 }
8869
8870 DECL_INITIAL (decl) = make_node (BLOCK);
8871 current_function_decl = decl;
8872 init_function_start (decl);
8873 first_function_block_is_cold = false;
8874 /* Make sure unwind info is emitted for the thunk if needed. */
8875 final_start_function (emit_barrier (), asm_out_file, 1);
8876
8877 /* Pad stack IP move with 4 instructions (two NOPs count
8878 as one instruction). */
8879 if (TARGET_PAD_SHORT_FUNCTION)
8880 {
8881 int i = 8;
8882
8883 while (i--)
8884 fputs ("\tnop\n", asm_out_file);
8885 }
8886
8887 xops[0] = gen_rtx_REG (Pmode, regno);
8888 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8889 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8890 output_asm_insn ("%!ret", NULL);
8891 final_end_function ();
8892 init_insn_lengths ();
8893 free_after_compilation (cfun);
8894 set_cfun (NULL);
8895 current_function_decl = NULL;
8896 }
8897
8898 if (flag_split_stack)
8899 file_end_indicate_split_stack ();
8900 }
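/* Illustrative example (not emitted verbatim by this file): for %ebx the
   thunk produced above amounts to

	__x86.get_pc_thunk.bx:
		movl	(%esp), %ebx
		ret

   i.e. it copies the caller's return address (the address of the insn
   following the call) into the PIC register.  */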
8901
8902 /* Emit code for the SET_GOT patterns. */
8903
8904 const char *
8905 output_set_got (rtx dest, rtx label)
8906 {
8907 rtx xops[3];
8908
8909 xops[0] = dest;
8910
8911 if (TARGET_VXWORKS_RTP && flag_pic)
8912 {
8913 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8914 xops[2] = gen_rtx_MEM (Pmode,
8915 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8916 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8917
8918 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8919 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8920 an unadorned address. */
8921 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8922 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8923 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8924 return "";
8925 }
8926
8927 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8928
8929 if (!flag_pic)
8930 {
8931 if (TARGET_MACHO)
8932 /* We don't need a pic base, we're not producing pic. */
8933 gcc_unreachable ();
8934
8935 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8936 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8937 targetm.asm_out.internal_label (asm_out_file, "L",
8938 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8939 }
8940 else
8941 {
8942 char name[32];
8943 get_pc_thunk_name (name, REGNO (dest));
8944 pic_labels_used |= 1 << REGNO (dest);
8945
8946 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8947 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8948 output_asm_insn ("%!call\t%X2", xops);
8949
8950 #if TARGET_MACHO
8951 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
8952 This is what will be referenced by the Mach-O PIC subsystem. */
8953 if (machopic_should_output_picbase_label () || !label)
8954 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8955
8956 /* When we are restoring the pic base at the site of a nonlocal label,
8957 and we decided to emit the pic base above, we will still output a
8958 local label used for calculating the correction offset (even though
8959 the offset will be 0 in that case). */
8960 if (label)
8961 targetm.asm_out.internal_label (asm_out_file, "L",
8962 CODE_LABEL_NUMBER (label));
8963 #endif
8964 }
8965
8966 if (!TARGET_MACHO)
8967 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8968
8969 return "";
8970 }
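/* Illustrative example (AT&T syntax, 32-bit PIC, not a verbatim dump):
   for DEST == %ebx the sequence emitted above is essentially

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   where the final add of GOT_SYMBOL_NAME is skipped on Mach-O targets.  */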
8971
8972 /* Generate a "push" pattern for input ARG. */
8973
8974 static rtx
8975 gen_push (rtx arg)
8976 {
8977 struct machine_function *m = cfun->machine;
8978
8979 if (m->fs.cfa_reg == stack_pointer_rtx)
8980 m->fs.cfa_offset += UNITS_PER_WORD;
8981 m->fs.sp_offset += UNITS_PER_WORD;
8982
8983 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8984 arg = gen_rtx_REG (word_mode, REGNO (arg));
8985
8986 return gen_rtx_SET (VOIDmode,
8987 gen_rtx_MEM (word_mode,
8988 gen_rtx_PRE_DEC (Pmode,
8989 stack_pointer_rtx)),
8990 arg);
8991 }
8992
8993 /* Generate a "pop" pattern for input ARG. */
8994
8995 static rtx
8996 gen_pop (rtx arg)
8997 {
8998 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8999 arg = gen_rtx_REG (word_mode, REGNO (arg));
9000
9001 return gen_rtx_SET (VOIDmode,
9002 arg,
9003 gen_rtx_MEM (word_mode,
9004 gen_rtx_POST_INC (Pmode,
9005 stack_pointer_rtx)));
9006 }
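/* Illustrative example (not part of GCC): in 64-bit mode gen_push for
   %rbx builds the pattern

     (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI bx))

   and gen_pop the mirror image

     (set (reg:DI bx) (mem:DI (post_inc:DI (reg:DI sp))))

   with the frame-state bookkeeping (cfa_offset, sp_offset) updated by
   gen_push as a side effect.  */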
9007
9008 /* Return the number of an unused call-clobbered register if one is
9009 available for the entire function, otherwise INVALID_REGNUM. */
9010
9011 static unsigned int
9012 ix86_select_alt_pic_regnum (void)
9013 {
9014 if (crtl->is_leaf
9015 && !crtl->profile
9016 && !ix86_current_function_calls_tls_descriptor)
9017 {
9018 int i, drap;
9019 /* Can't use the same register for both PIC and DRAP. */
9020 if (crtl->drap_reg)
9021 drap = REGNO (crtl->drap_reg);
9022 else
9023 drap = -1;
9024 for (i = 2; i >= 0; --i)
9025 if (i != drap && !df_regs_ever_live_p (i))
9026 return i;
9027 }
9028
9029 return INVALID_REGNUM;
9030 }
9031
9032 /* Return TRUE if we need to save REGNO. */
9033
9034 static bool
9035 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
9036 {
9037 if (pic_offset_table_rtx
9038 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
9039 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
9040 || crtl->profile
9041 || crtl->calls_eh_return
9042 || crtl->uses_const_pool
9043 || cfun->has_nonlocal_label))
9044 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
9045
9046 if (crtl->calls_eh_return && maybe_eh_return)
9047 {
9048 unsigned i;
9049 for (i = 0; ; i++)
9050 {
9051 unsigned test = EH_RETURN_DATA_REGNO (i);
9052 if (test == INVALID_REGNUM)
9053 break;
9054 if (test == regno)
9055 return true;
9056 }
9057 }
9058
9059 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
9060 return true;
9061
9062 return (df_regs_ever_live_p (regno)
9063 && !call_used_regs[regno]
9064 && !fixed_regs[regno]
9065 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
9066 }
9067
9068 /* Return the number of saved general purpose registers. */
9069
9070 static int
9071 ix86_nsaved_regs (void)
9072 {
9073 int nregs = 0;
9074 int regno;
9075
9076 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9077 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9078 nregs ++;
9079 return nregs;
9080 }
9081
9082 /* Return the number of saved SSE registers. */
9083
9084 static int
9085 ix86_nsaved_sseregs (void)
9086 {
9087 int nregs = 0;
9088 int regno;
9089
9090 if (!TARGET_64BIT_MS_ABI)
9091 return 0;
9092 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9093 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9094 nregs ++;
9095 return nregs;
9096 }
9097
9098 /* Given FROM and TO register numbers, say whether this elimination is
9099 allowed. If stack alignment is needed, we can only replace argument
9100 pointer with hard frame pointer, or replace frame pointer with stack
9101 pointer. Otherwise, frame pointer elimination is automatically
9102 handled and all other eliminations are valid. */
9103
9104 static bool
9105 ix86_can_eliminate (const int from, const int to)
9106 {
9107 if (stack_realign_fp)
9108 return ((from == ARG_POINTER_REGNUM
9109 && to == HARD_FRAME_POINTER_REGNUM)
9110 || (from == FRAME_POINTER_REGNUM
9111 && to == STACK_POINTER_REGNUM));
9112 else
9113 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
9114 }
9115
9116 /* Return the offset between two registers, one to be eliminated, and the other
9117 its replacement, at the start of a routine. */
9118
9119 HOST_WIDE_INT
9120 ix86_initial_elimination_offset (int from, int to)
9121 {
9122 struct ix86_frame frame;
9123 ix86_compute_frame_layout (&frame);
9124
9125 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9126 return frame.hard_frame_pointer_offset;
9127 else if (from == FRAME_POINTER_REGNUM
9128 && to == HARD_FRAME_POINTER_REGNUM)
9129 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9130 else
9131 {
9132 gcc_assert (to == STACK_POINTER_REGNUM);
9133
9134 if (from == ARG_POINTER_REGNUM)
9135 return frame.stack_pointer_offset;
9136
9137 gcc_assert (from == FRAME_POINTER_REGNUM);
9138 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9139 }
9140 }
9141
9142 /* In a dynamically-aligned function, we can't know the offset from
9143 stack pointer to frame pointer, so we must ensure that setjmp
9144 eliminates fp against the hard fp (%ebp) rather than trying to
9145 index from %esp up to the top of the frame across a gap that is
9146 of unknown (at compile-time) size. */
9147 static rtx
9148 ix86_builtin_setjmp_frame_value (void)
9149 {
9150 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9151 }
9152
9153 /* When using -fsplit-stack, the allocation routines set a field in
9154 the TCB to the bottom of the stack plus this much space, measured
9155 in bytes. */
9156
9157 #define SPLIT_STACK_AVAILABLE 256
9158
9159 /* Fill the ix86_frame structure describing the frame of the current function. */
9160
9161 static void
9162 ix86_compute_frame_layout (struct ix86_frame *frame)
9163 {
9164 unsigned HOST_WIDE_INT stack_alignment_needed;
9165 HOST_WIDE_INT offset;
9166 unsigned HOST_WIDE_INT preferred_alignment;
9167 HOST_WIDE_INT size = get_frame_size ();
9168 HOST_WIDE_INT to_allocate;
9169
9170 frame->nregs = ix86_nsaved_regs ();
9171 frame->nsseregs = ix86_nsaved_sseregs ();
9172
9173 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9174 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9175
9176 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
9177 except for function prologues and leaf functions. */
9178 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9179 && (!crtl->is_leaf || cfun->calls_alloca != 0
9180 || ix86_current_function_calls_tls_descriptor))
9181 {
9182 preferred_alignment = 16;
9183 stack_alignment_needed = 16;
9184 crtl->preferred_stack_boundary = 128;
9185 crtl->stack_alignment_needed = 128;
9186 }
9187
9188 gcc_assert (!size || stack_alignment_needed);
9189 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9190 gcc_assert (preferred_alignment <= stack_alignment_needed);
9191
9192 /* For SEH we have to limit the amount of code movement into the prologue.
9193 At present we do this via a BLOCKAGE, at which point there's very little
9194 scheduling that can be done, which means that there's very little point
9195 in doing anything except PUSHs. */
9196 if (TARGET_SEH)
9197 cfun->machine->use_fast_prologue_epilogue = false;
9198
9199 /* During reload the number of saved registers can change. Recompute
9200 the value as needed. Do not recompute it when the number of registers
9201 didn't change, as reload calls this function multiple times and does
9202 not expect the decision to change within a single iteration. */
9203 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR)
9204 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9205 {
9206 int count = frame->nregs;
9207 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9208
9209 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9210
9211 /* The fast prologue uses moves instead of pushes to save registers. This
9212 is significantly longer, but it also executes faster, as modern hardware
9213 can execute the moves in parallel but cannot do that for push/pop.
9214
9215 Be careful about choosing which prologue to emit: when the function
9216 takes many instructions to execute we may as well use the slow
9217 version, and likewise when the function is known to be outside a hot
9218 spot (this is known with feedback only). Weight the size of the
9219 function by the number of registers to save, as it is cheap to use
9220 one or two push instructions but very slow to use many of them. */
9221 if (count)
9222 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9223 if (node->frequency < NODE_FREQUENCY_NORMAL
9224 || (flag_branch_probabilities
9225 && node->frequency < NODE_FREQUENCY_HOT))
9226 cfun->machine->use_fast_prologue_epilogue = false;
9227 else
9228 cfun->machine->use_fast_prologue_epilogue
9229 = !expensive_function_p (count);
9230 }
9231
9232 frame->save_regs_using_mov
9233 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9234 /* If static stack checking is enabled and done with probes,
9235 the registers need to be saved before allocating the frame. */
9236 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9237
9238 /* Skip return address. */
9239 offset = UNITS_PER_WORD;
9240
9241 /* Skip pushed static chain. */
9242 if (ix86_static_chain_on_stack)
9243 offset += UNITS_PER_WORD;
9244
9245 /* Skip saved base pointer. */
9246 if (frame_pointer_needed)
9247 offset += UNITS_PER_WORD;
9248 frame->hfp_save_offset = offset;
9249
9250 /* The traditional frame pointer location is at the top of the frame. */
9251 frame->hard_frame_pointer_offset = offset;
9252
9253 /* Register save area */
9254 offset += frame->nregs * UNITS_PER_WORD;
9255 frame->reg_save_offset = offset;
9256
9257 /* On SEH target, registers are pushed just before the frame pointer
9258 location. */
9259 if (TARGET_SEH)
9260 frame->hard_frame_pointer_offset = offset;
9261
9262 /* Align and set SSE register save area. */
9263 if (frame->nsseregs)
9264 {
9265 /* The only ABI that has saved SSE registers (Win64) also has a
9266 16-byte aligned default stack, and thus we don't need to be
9267 within the re-aligned local stack frame to save them. */
9268 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9269 offset = (offset + 16 - 1) & -16;
9270 offset += frame->nsseregs * 16;
9271 }
9272 frame->sse_reg_save_offset = offset;
9273
9274 /* The re-aligned stack starts here. Values before this point are not
9275 directly comparable with values below this point. In order to make
9276 sure that no value happens to be the same before and after, force
9277 the alignment computation below to add a non-zero value. */
9278 if (stack_realign_fp)
9279 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9280
9281 /* Va-arg area */
9282 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9283 offset += frame->va_arg_size;
9284
9285 /* Align start of frame for local function. */
9286 if (stack_realign_fp
9287 || offset != frame->sse_reg_save_offset
9288 || size != 0
9289 || !crtl->is_leaf
9290 || cfun->calls_alloca
9291 || ix86_current_function_calls_tls_descriptor)
9292 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9293
9294 /* Frame pointer points here. */
9295 frame->frame_pointer_offset = offset;
9296
9297 offset += size;
9298
9299 /* Add outgoing arguments area. Can be skipped if we eliminated
9300 all the function calls as dead code.
9301 Skipping is however impossible when the function calls alloca. The
9302 alloca expander assumes that the last crtl->outgoing_args_size bytes
9303 of the stack frame are unused. */
9304 if (ACCUMULATE_OUTGOING_ARGS
9305 && (!crtl->is_leaf || cfun->calls_alloca
9306 || ix86_current_function_calls_tls_descriptor))
9307 {
9308 offset += crtl->outgoing_args_size;
9309 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9310 }
9311 else
9312 frame->outgoing_arguments_size = 0;
9313
9314 /* Align stack boundary. Only needed if we're calling another function
9315 or using alloca. */
9316 if (!crtl->is_leaf || cfun->calls_alloca
9317 || ix86_current_function_calls_tls_descriptor)
9318 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9319
9320 /* We've reached end of stack frame. */
9321 frame->stack_pointer_offset = offset;
9322
9323 /* Size prologue needs to allocate. */
9324 to_allocate = offset - frame->sse_reg_save_offset;
9325
9326 if ((!to_allocate && frame->nregs <= 1)
9327 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9328 frame->save_regs_using_mov = false;
9329
9330 if (ix86_using_red_zone ()
9331 && crtl->sp_is_unchanging
9332 && crtl->is_leaf
9333 && !ix86_current_function_calls_tls_descriptor)
9334 {
9335 frame->red_zone_size = to_allocate;
9336 if (frame->save_regs_using_mov)
9337 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9338 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9339 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9340 }
9341 else
9342 frame->red_zone_size = 0;
9343 frame->stack_pointer_offset -= frame->red_zone_size;
9344
9345 /* The SEH frame pointer location is near the bottom of the frame.
9346 This is enforced by the fact that the difference between the
9347 stack pointer and the frame pointer is limited to 240 bytes in
9348 the unwind data structure. */
9349 if (TARGET_SEH)
9350 {
9351 HOST_WIDE_INT diff;
9352
9353 /* If we can leave the frame pointer where it is, do so. This also
9354 returns the establisher frame for __builtin_frame_address (0). */
9355 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9356 if (diff <= SEH_MAX_FRAME_SIZE
9357 && (diff > 240 || (diff & 15) != 0)
9358 && !crtl->accesses_prior_frames)
9359 {
9360 /* Ideally we'd determine what portion of the local stack frame
9361 (within the constraint of the lowest 240) is most heavily used.
9362 But without that complication, simply bias the frame pointer
9363 by 128 bytes so as to maximize the amount of the local stack
9364 frame that is addressable with 8-bit offsets. */
9365 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9366 }
9367 }
9368 }
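/* Illustrative summary (offsets grow away from the incoming CFA; this is
   a sketch of the layout computed above, not an additional invariant):

	[ return address       ]  <- offset 0
	[ pushed static chain  ]  (if ix86_static_chain_on_stack)
	[ saved frame pointer  ]  <- hard_frame_pointer_offset (non-SEH)
	[ GP register save     ]  <- reg_save_offset
	[ SSE register save    ]  <- sse_reg_save_offset (16-byte aligned)
	[ va-arg save area     ]
	[ local variables      ]  <- frame_pointer_offset
	[ outgoing arguments   ]
	                          <- stack_pointer_offset (minus red zone)  */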
9369
9370 /* This is semi-inlined memory_address_length, but simplified
9371 since we know that we're always dealing with reg+offset, and
9372 to avoid having to create and discard all that rtl. */
9373
9374 static inline int
9375 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9376 {
9377 int len = 4;
9378
9379 if (offset == 0)
9380 {
9381 /* EBP and R13 cannot be encoded without an offset. */
9382 len = (regno == BP_REG || regno == R13_REG);
9383 }
9384 else if (IN_RANGE (offset, -128, 127))
9385 len = 1;
9386
9387 /* ESP and R12 must be encoded with a SIB byte. */
9388 if (regno == SP_REG || regno == R12_REG)
9389 len++;
9390
9391 return len;
9392 }
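/* Worked example (illustrative only): a %ebp-relative access with offset
   0 still needs a one-byte displacement, so len == 1; %esp with offset 0
   needs no displacement but does need a SIB byte, so len == 1 as well;
   %eax with an offset of 200 needs a four-byte displacement, len == 4;
   and %r12 with the same offset needs 4 + 1 == 5 bytes.  */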
9393
9394 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9395 The valid base registers are taken from CFUN->MACHINE->FS. */
9396
9397 static rtx
9398 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9399 {
9400 const struct machine_function *m = cfun->machine;
9401 rtx base_reg = NULL;
9402 HOST_WIDE_INT base_offset = 0;
9403
9404 if (m->use_fast_prologue_epilogue)
9405 {
9406 /* Choose the base register most likely to allow the most scheduling
9407 opportunities. Generally FP is valid throughout the function,
9408 while DRAP must be reloaded within the epilogue. But choose either
9409 over the SP due to increased encoding size. */
9410
9411 if (m->fs.fp_valid)
9412 {
9413 base_reg = hard_frame_pointer_rtx;
9414 base_offset = m->fs.fp_offset - cfa_offset;
9415 }
9416 else if (m->fs.drap_valid)
9417 {
9418 base_reg = crtl->drap_reg;
9419 base_offset = 0 - cfa_offset;
9420 }
9421 else if (m->fs.sp_valid)
9422 {
9423 base_reg = stack_pointer_rtx;
9424 base_offset = m->fs.sp_offset - cfa_offset;
9425 }
9426 }
9427 else
9428 {
9429 HOST_WIDE_INT toffset;
9430 int len = 16, tlen;
9431
9432 /* Choose the base register with the smallest address encoding.
9433 With a tie, choose FP > DRAP > SP. */
9434 if (m->fs.sp_valid)
9435 {
9436 base_reg = stack_pointer_rtx;
9437 base_offset = m->fs.sp_offset - cfa_offset;
9438 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9439 }
9440 if (m->fs.drap_valid)
9441 {
9442 toffset = 0 - cfa_offset;
9443 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9444 if (tlen <= len)
9445 {
9446 base_reg = crtl->drap_reg;
9447 base_offset = toffset;
9448 len = tlen;
9449 }
9450 }
9451 if (m->fs.fp_valid)
9452 {
9453 toffset = m->fs.fp_offset - cfa_offset;
9454 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9455 if (tlen <= len)
9456 {
9457 base_reg = hard_frame_pointer_rtx;
9458 base_offset = toffset;
9459 len = tlen;
9460 }
9461 }
9462 }
9463 gcc_assert (base_reg != NULL);
9464
9465 return plus_constant (Pmode, base_reg, base_offset);
9466 }
9467
9468 /* Emit code to save registers in the prologue. */
9469
9470 static void
9471 ix86_emit_save_regs (void)
9472 {
9473 unsigned int regno;
9474 rtx insn;
9475
9476 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9477 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9478 {
9479 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9480 RTX_FRAME_RELATED_P (insn) = 1;
9481 }
9482 }
9483
9484 /* Emit a single register save at CFA - CFA_OFFSET. */
9485
9486 static void
9487 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9488 HOST_WIDE_INT cfa_offset)
9489 {
9490 struct machine_function *m = cfun->machine;
9491 rtx reg = gen_rtx_REG (mode, regno);
9492 rtx mem, addr, base, insn;
9493
9494 addr = choose_baseaddr (cfa_offset);
9495 mem = gen_frame_mem (mode, addr);
9496
9497 /* For SSE saves, we need to indicate the 128-bit alignment. */
9498 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9499
9500 insn = emit_move_insn (mem, reg);
9501 RTX_FRAME_RELATED_P (insn) = 1;
9502
9503 base = addr;
9504 if (GET_CODE (base) == PLUS)
9505 base = XEXP (base, 0);
9506 gcc_checking_assert (REG_P (base));
9507
9508 /* When saving registers into a re-aligned local stack frame, avoid
9509 any tricky guessing by dwarf2out. */
9510 if (m->fs.realigned)
9511 {
9512 gcc_checking_assert (stack_realign_drap);
9513
9514 if (regno == REGNO (crtl->drap_reg))
9515 {
9516 /* A bit of a hack. We force the DRAP register to be saved in
9517 the re-aligned stack frame, which provides us with a copy
9518 of the CFA that will last past the prologue. Install it. */
9519 gcc_checking_assert (cfun->machine->fs.fp_valid);
9520 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9521 cfun->machine->fs.fp_offset - cfa_offset);
9522 mem = gen_rtx_MEM (mode, addr);
9523 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9524 }
9525 else
9526 {
9527 /* The frame pointer is a stable reference within the
9528 aligned frame. Use it. */
9529 gcc_checking_assert (cfun->machine->fs.fp_valid);
9530 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9531 cfun->machine->fs.fp_offset - cfa_offset);
9532 mem = gen_rtx_MEM (mode, addr);
9533 add_reg_note (insn, REG_CFA_EXPRESSION,
9534 gen_rtx_SET (VOIDmode, mem, reg));
9535 }
9536 }
9537
9538 /* The memory may not be relative to the current CFA register,
9539 which means that we may need to generate a new pattern for
9540 use by the unwind info. */
9541 else if (base != m->fs.cfa_reg)
9542 {
9543 addr = plus_constant (Pmode, m->fs.cfa_reg,
9544 m->fs.cfa_offset - cfa_offset);
9545 mem = gen_rtx_MEM (mode, addr);
9546 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9547 }
9548 }
9549
9550 /* Emit code to save registers using MOV insns.
9551 First register is stored at CFA - CFA_OFFSET. */
9552 static void
9553 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9554 {
9555 unsigned int regno;
9556
9557 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9558 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9559 {
9560 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9561 cfa_offset -= UNITS_PER_WORD;
9562 }
9563 }
9564
9565 /* Emit code to save SSE registers using MOV insns.
9566 First register is stored at CFA - CFA_OFFSET. */
9567 static void
9568 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9569 {
9570 unsigned int regno;
9571
9572 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9573 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9574 {
9575 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9576 cfa_offset -= 16;
9577 }
9578 }
9579
9580 static GTY(()) rtx queued_cfa_restores;
9581
9582 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next
9583 stack manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9584 Don't add the note if the previously saved value will be left untouched
9585 within stack red-zone till return, as unwinders can find the same value
9586 in the register and on the stack. */
9587
9588 static void
9589 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9590 {
9591 if (!crtl->shrink_wrapped
9592 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9593 return;
9594
9595 if (insn)
9596 {
9597 add_reg_note (insn, REG_CFA_RESTORE, reg);
9598 RTX_FRAME_RELATED_P (insn) = 1;
9599 }
9600 else
9601 queued_cfa_restores
9602 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9603 }
9604
9605 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9606
9607 static void
9608 ix86_add_queued_cfa_restore_notes (rtx insn)
9609 {
9610 rtx last;
9611 if (!queued_cfa_restores)
9612 return;
9613 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9614 ;
9615 XEXP (last, 1) = REG_NOTES (insn);
9616 REG_NOTES (insn) = queued_cfa_restores;
9617 queued_cfa_restores = NULL_RTX;
9618 RTX_FRAME_RELATED_P (insn) = 1;
9619 }
9620
9621 /* Expand prologue or epilogue stack adjustment.
9622 The pattern exists to put a dependency on all ebp-based memory accesses.
9623 STYLE should be negative if instructions should be marked as frame related,
9624 zero if %r11 register is live and cannot be freely used and positive
9625 otherwise. */
9626
9627 static void
9628 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9629 int style, bool set_cfa)
9630 {
9631 struct machine_function *m = cfun->machine;
9632 rtx insn;
9633 bool add_frame_related_expr = false;
9634
9635 if (Pmode == SImode)
9636 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9637 else if (x86_64_immediate_operand (offset, DImode))
9638 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9639 else
9640 {
9641 rtx tmp;
9642 /* r11 is used by indirect sibcall return as well, set before the
9643 epilogue and used after the epilogue. */
9644 if (style)
9645 tmp = gen_rtx_REG (DImode, R11_REG);
9646 else
9647 {
9648 gcc_assert (src != hard_frame_pointer_rtx
9649 && dest != hard_frame_pointer_rtx);
9650 tmp = hard_frame_pointer_rtx;
9651 }
9652 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9653 if (style < 0)
9654 add_frame_related_expr = true;
9655
9656 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9657 }
9658
9659 insn = emit_insn (insn);
9660 if (style >= 0)
9661 ix86_add_queued_cfa_restore_notes (insn);
9662
9663 if (set_cfa)
9664 {
9665 rtx r;
9666
9667 gcc_assert (m->fs.cfa_reg == src);
9668 m->fs.cfa_offset += INTVAL (offset);
9669 m->fs.cfa_reg = dest;
9670
9671 r = gen_rtx_PLUS (Pmode, src, offset);
9672 r = gen_rtx_SET (VOIDmode, dest, r);
9673 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9674 RTX_FRAME_RELATED_P (insn) = 1;
9675 }
9676 else if (style < 0)
9677 {
9678 RTX_FRAME_RELATED_P (insn) = 1;
9679 if (add_frame_related_expr)
9680 {
9681 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9682 r = gen_rtx_SET (VOIDmode, dest, r);
9683 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9684 }
9685 }
9686
9687 if (dest == stack_pointer_rtx)
9688 {
9689 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9690 bool valid = m->fs.sp_valid;
9691
9692 if (src == hard_frame_pointer_rtx)
9693 {
9694 valid = m->fs.fp_valid;
9695 ooffset = m->fs.fp_offset;
9696 }
9697 else if (src == crtl->drap_reg)
9698 {
9699 valid = m->fs.drap_valid;
9700 ooffset = 0;
9701 }
9702 else
9703 {
9704 /* Else there are two possibilities: SP itself, which we set
9705 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9706 taken care of by hand along the eh_return path. */
9707 gcc_checking_assert (src == stack_pointer_rtx
9708 || offset == const0_rtx);
9709 }
9710
9711 m->fs.sp_offset = ooffset - INTVAL (offset);
9712 m->fs.sp_valid = valid;
9713 }
9714 }
9715
9716 /* Find an available register to be used as the dynamic realign
9717 argument pointer register. Such a register will be written in the
9718 prologue and used at the beginning of the body, so it must not be
9719 1. a parameter passing register.
9720 2. the GOT pointer.
9721 We reuse the static-chain register if it is available. Otherwise, we
9722 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9723 shorter encoding.
9724
9725 Return: the regno of the chosen register. */
9726
9727 static unsigned int
9728 find_drap_reg (void)
9729 {
9730 tree decl = cfun->decl;
9731
9732 if (TARGET_64BIT)
9733 {
9734 /* Use R13 for a nested function or a function that needs a static
9735 chain. Since a function with a tail call may use any caller-saved
9736 register in the epilogue, DRAP must not use a caller-saved
9737 register in that case. */
9738 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9739 return R13_REG;
9740
9741 return R10_REG;
9742 }
9743 else
9744 {
9745 /* Use DI for a nested function or a function that needs a static
9746 chain. Since a function with a tail call may use any caller-saved
9747 register in the epilogue, DRAP must not use a caller-saved
9748 register in that case. */
9749 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9750 return DI_REG;
9751
9752 /* Reuse static chain register if it isn't used for parameter
9753 passing. */
9754 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9755 {
9756 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9757 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9758 return CX_REG;
9759 }
9760 return DI_REG;
9761 }
9762 }
9763
9764 /* Return minimum incoming stack alignment. */
9765
9766 static unsigned int
9767 ix86_minimum_incoming_stack_boundary (bool sibcall)
9768 {
9769 unsigned int incoming_stack_boundary;
9770
9771 /* Prefer the one specified at command line. */
9772 if (ix86_user_incoming_stack_boundary)
9773 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9774 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack
9775 boundary when -mstackrealign is used, this isn't a sibcall check,
9776 and the estimated stack alignment is 128 bits. */
9777 else if (!sibcall
9778 && !TARGET_64BIT
9779 && ix86_force_align_arg_pointer
9780 && crtl->stack_alignment_estimated == 128)
9781 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9782 else
9783 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9784
9785 /* Incoming stack alignment can be changed on individual functions
9786 via force_align_arg_pointer attribute. We use the smallest
9787 incoming stack boundary. */
9788 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9789 && lookup_attribute (ix86_force_align_arg_pointer_string,
9790 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9791 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9792
9793 /* The incoming stack frame has to be aligned at least at
9794 parm_stack_boundary. */
9795 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9796 incoming_stack_boundary = crtl->parm_stack_boundary;
9797
9798 /* Stack at entrance of main is aligned by runtime. We use the
9799 smallest incoming stack boundary. */
9800 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9801 && DECL_NAME (current_function_decl)
9802 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9803 && DECL_FILE_SCOPE_P (current_function_decl))
9804 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9805
9806 return incoming_stack_boundary;
9807 }
9808
9809 /* Update incoming stack boundary and estimated stack alignment. */
9810
9811 static void
9812 ix86_update_stack_boundary (void)
9813 {
9814 ix86_incoming_stack_boundary
9815 = ix86_minimum_incoming_stack_boundary (false);
9816
9817 /* x86_64 vararg needs 16byte stack alignment for register save
9818 area. */
9819 if (TARGET_64BIT
9820 && cfun->stdarg
9821 && crtl->stack_alignment_estimated < 128)
9822 crtl->stack_alignment_estimated = 128;
9823 }
9824
9825 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9826 needed or an rtx for DRAP otherwise. */
9827
9828 static rtx
9829 ix86_get_drap_rtx (void)
9830 {
9831 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9832 crtl->need_drap = true;
9833
9834 if (stack_realign_drap)
9835 {
9836 /* Assign DRAP to vDRAP and return vDRAP. */
9837 unsigned int regno = find_drap_reg ();
9838 rtx drap_vreg;
9839 rtx arg_ptr;
9840 rtx seq, insn;
9841
9842 arg_ptr = gen_rtx_REG (Pmode, regno);
9843 crtl->drap_reg = arg_ptr;
9844
9845 start_sequence ();
9846 drap_vreg = copy_to_reg (arg_ptr);
9847 seq = get_insns ();
9848 end_sequence ();
9849
9850 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9851 if (!optimize)
9852 {
9853 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9854 RTX_FRAME_RELATED_P (insn) = 1;
9855 }
9856 return drap_vreg;
9857 }
9858 else
9859 return NULL;
9860 }
9861
9862 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9863
9864 static rtx
9865 ix86_internal_arg_pointer (void)
9866 {
9867 return virtual_incoming_args_rtx;
9868 }
9869
9870 struct scratch_reg {
9871 rtx reg;
9872 bool saved;
9873 };
9874
9875 /* Return a short-lived scratch register for use on function entry.
9876 In 32-bit mode, it is valid only after the registers are saved
9877 in the prologue. This register must be released by means of
9878 release_scratch_register_on_entry once it is dead. */
9879
9880 static void
9881 get_scratch_register_on_entry (struct scratch_reg *sr)
9882 {
9883 int regno;
9884
9885 sr->saved = false;
9886
9887 if (TARGET_64BIT)
9888 {
9889 /* We always use R11 in 64-bit mode. */
9890 regno = R11_REG;
9891 }
9892 else
9893 {
9894 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9895 bool fastcall_p
9896 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9897 bool thiscall_p
9898 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9899 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9900 int regparm = ix86_function_regparm (fntype, decl);
9901 int drap_regno
9902 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9903
9904 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9905 for the static chain register. */
9906 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9907 && drap_regno != AX_REG)
9908 regno = AX_REG;
9909 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9910 for the static chain register. */
9911 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9912 regno = AX_REG;
9913 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9914 regno = DX_REG;
9915 /* ecx is the static chain register. */
9916 else if (regparm < 3 && !fastcall_p && !thiscall_p
9917 && !static_chain_p
9918 && drap_regno != CX_REG)
9919 regno = CX_REG;
9920 else if (ix86_save_reg (BX_REG, true))
9921 regno = BX_REG;
9922 /* esi is the static chain register. */
9923 else if (!(regparm == 3 && static_chain_p)
9924 && ix86_save_reg (SI_REG, true))
9925 regno = SI_REG;
9926 else if (ix86_save_reg (DI_REG, true))
9927 regno = DI_REG;
9928 else
9929 {
9930 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9931 sr->saved = true;
9932 }
9933 }
9934
9935 sr->reg = gen_rtx_REG (Pmode, regno);
9936 if (sr->saved)
9937 {
9938 rtx insn = emit_insn (gen_push (sr->reg));
9939 RTX_FRAME_RELATED_P (insn) = 1;
9940 }
9941 }
9942
9943 /* Release a scratch register obtained from the preceding function. */
9944
9945 static void
9946 release_scratch_register_on_entry (struct scratch_reg *sr)
9947 {
9948 if (sr->saved)
9949 {
9950 struct machine_function *m = cfun->machine;
9951 rtx x, insn = emit_insn (gen_pop (sr->reg));
9952
9953 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9954 RTX_FRAME_RELATED_P (insn) = 1;
9955 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9956 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9957 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9958 m->fs.sp_offset -= UNITS_PER_WORD;
9959 }
9960 }
9961
9962 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
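/* Illustrative note: with the common default of STACK_CHECK_PROBE_INTERVAL_EXP
   == 12 this evaluates to 4096, i.e. one probe per 4 KiB page; the exact
   value is a target/default assumption, not something this file enforces.  */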
9963
9964 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9965
9966 static void
9967 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9968 {
9969 /* We skip the probe for the first interval + a small dope of 4 words and
9970 probe that many bytes past the specified size to maintain a protection
9971 area at the bottom of the stack. */
9972 const int dope = 4 * UNITS_PER_WORD;
9973 rtx size_rtx = GEN_INT (size), last;
9974
9975 /* See if we have a constant small number of probes to generate. If so,
9976 that's the easy case. The run-time loop is made up of 11 insns in the
9977 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9978 for n # of intervals. */
9979 if (size <= 5 * PROBE_INTERVAL)
9980 {
9981 HOST_WIDE_INT i, adjust;
9982 bool first_probe = true;
9983
9984 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9985 values of N from 1 until it exceeds SIZE. If only one probe is
9986 needed, this will not generate any code. Then adjust and probe
9987 to PROBE_INTERVAL + SIZE. */
9988 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9989 {
9990 if (first_probe)
9991 {
9992 adjust = 2 * PROBE_INTERVAL + dope;
9993 first_probe = false;
9994 }
9995 else
9996 adjust = PROBE_INTERVAL;
9997
9998 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9999 plus_constant (Pmode, stack_pointer_rtx,
10000 -adjust)));
10001 emit_stack_probe (stack_pointer_rtx);
10002 }
10003
10004 if (first_probe)
10005 adjust = size + PROBE_INTERVAL + dope;
10006 else
10007 adjust = size + PROBE_INTERVAL - i;
10008
10009 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10010 plus_constant (Pmode, stack_pointer_rtx,
10011 -adjust)));
10012 emit_stack_probe (stack_pointer_rtx);
10013
10014 /* Adjust back to account for the additional first interval. */
10015 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10016 plus_constant (Pmode, stack_pointer_rtx,
10017 PROBE_INTERVAL + dope)));
10018 }
10019
10020 /* Otherwise, do the same as above, but in a loop. Note that we must be
10021 extra careful with variables wrapping around because we might be at
10022 the very top (or the very bottom) of the address space and we have
10023 to be able to handle this case properly; in particular, we use an
10024 equality test for the loop condition. */
10025 else
10026 {
10027 HOST_WIDE_INT rounded_size;
10028 struct scratch_reg sr;
10029
10030 get_scratch_register_on_entry (&sr);
10031
10032
10033 /* Step 1: round SIZE to the previous multiple of the interval. */
10034
10035 rounded_size = size & -PROBE_INTERVAL;
10036
10037
10038 /* Step 2: compute initial and final value of the loop counter. */
10039
10040 /* SP = SP_0 + PROBE_INTERVAL. */
10041 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10042 plus_constant (Pmode, stack_pointer_rtx,
10043 - (PROBE_INTERVAL + dope))));
10044
10045 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
10046 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
10047 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
10048 gen_rtx_PLUS (Pmode, sr.reg,
10049 stack_pointer_rtx)));
10050
10051
10052 /* Step 3: the loop
10053
10054 while (SP != LAST_ADDR)
10055 {
10056 SP = SP + PROBE_INTERVAL
10057 probe at SP
10058 }
10059
10060 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
10061 values of N from 1 until it is equal to ROUNDED_SIZE. */
10062
10063 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
10064
10065
10066 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
10067 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
10068
10069 if (size != rounded_size)
10070 {
10071 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10072 plus_constant (Pmode, stack_pointer_rtx,
10073 rounded_size - size)));
10074 emit_stack_probe (stack_pointer_rtx);
10075 }
10076
10077 /* Adjust back to account for the additional first interval. */
10078 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10079 plus_constant (Pmode, stack_pointer_rtx,
10080 PROBE_INTERVAL + dope)));
10081
10082 release_scratch_register_on_entry (&sr);
10083 }
10084
10085 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
10086
10087 /* Even if the stack pointer isn't the CFA register, we need to correctly
10088 describe the adjustments made to it, in particular differentiate the
10089 frame-related ones from the frame-unrelated ones. */
10090 if (size > 0)
10091 {
10092 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
10093 XVECEXP (expr, 0, 0)
10094 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10095 plus_constant (Pmode, stack_pointer_rtx, -size));
10096 XVECEXP (expr, 0, 1)
10097 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10098 plus_constant (Pmode, stack_pointer_rtx,
10099 PROBE_INTERVAL + dope + size));
10100 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
10101 RTX_FRAME_RELATED_P (last) = 1;
10102
10103 cfun->machine->fs.sp_offset += size;
10104 }
10105
10106 /* Make sure nothing is scheduled before we are done. */
10107 emit_insn (gen_blockage ());
10108 }
10109
10110 /* Adjust the stack pointer up to REG while probing it. */
10111
10112 const char *
10113 output_adjust_stack_and_probe (rtx reg)
10114 {
10115 static int labelno = 0;
10116 char loop_lab[32], end_lab[32];
10117 rtx xops[2];
10118
10119 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10120 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10121
10122 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10123
10124 /* Jump to END_LAB if SP == LAST_ADDR. */
10125 xops[0] = stack_pointer_rtx;
10126 xops[1] = reg;
10127 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10128 fputs ("\tje\t", asm_out_file);
10129 assemble_name_raw (asm_out_file, end_lab);
10130 fputc ('\n', asm_out_file);
10131
10132 /* SP = SP + PROBE_INTERVAL. */
10133 xops[1] = GEN_INT (PROBE_INTERVAL);
10134 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10135
10136 /* Probe at SP. */
10137 xops[1] = const0_rtx;
10138 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10139
10140 fprintf (asm_out_file, "\tjmp\t");
10141 assemble_name_raw (asm_out_file, loop_lab);
10142 fputc ('\n', asm_out_file);
10143
10144 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10145
10146 return "";
10147 }
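/* Illustrative example (AT&T syntax, 64-bit, labels abbreviated): the
   routine above prints a loop of roughly this shape

	.LPSRL0:
		cmpq	%r11, %rsp
		je	.LPSRE0
		subq	$4096, %rsp
		orq	$0, (%rsp)
		jmp	.LPSRL0
	.LPSRE0:

   assuming a 4096-byte PROBE_INTERVAL and %r11 as the scratch register
   holding LAST_ADDR.  */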
10148
10149 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10150 inclusive. These are offsets from the current stack pointer. */
10151
10152 static void
10153 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10154 {
10155 /* See if we have a constant small number of probes to generate. If so,
10156 that's the easy case. The run-time loop is made up of 7 insns in the
10157 generic case while the compile-time loop is made up of n insns for n #
10158 of intervals. */
10159 if (size <= 7 * PROBE_INTERVAL)
10160 {
10161 HOST_WIDE_INT i;
10162
10163 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10164 it exceeds SIZE. If only one probe is needed, this will not
10165 generate any code. Then probe at FIRST + SIZE. */
10166 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10167 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10168 -(first + i)));
10169
10170 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10171 -(first + size)));
10172 }
10173
10174 /* Otherwise, do the same as above, but in a loop. Note that we must be
10175 extra careful with variables wrapping around because we might be at
10176 the very top (or the very bottom) of the address space and we have
10177 to be able to handle this case properly; in particular, we use an
10178 equality test for the loop condition. */
10179 else
10180 {
10181 HOST_WIDE_INT rounded_size, last;
10182 struct scratch_reg sr;
10183
10184 get_scratch_register_on_entry (&sr);
10185
10186
10187 /* Step 1: round SIZE to the previous multiple of the interval. */
10188
10189 rounded_size = size & -PROBE_INTERVAL;
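/* E.g. with a 4096-byte PROBE_INTERVAL, SIZE == 10000 gives
   ROUNDED_SIZE == 8192.  */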
10190
10191
10192 /* Step 2: compute initial and final value of the loop counter. */
10193
10194 /* TEST_OFFSET = FIRST. */
10195 emit_move_insn (sr.reg, GEN_INT (-first));
10196
10197 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10198 last = first + rounded_size;
10199
10200
10201 /* Step 3: the loop
10202
10203 while (TEST_ADDR != LAST_ADDR)
10204 {
10205 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10206 probe at TEST_ADDR
10207 }
10208
10209 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10210 until it is equal to ROUNDED_SIZE. */
10211
10212 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10213
10214
10215 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10216 that SIZE is equal to ROUNDED_SIZE. */
10217
10218 if (size != rounded_size)
10219 emit_stack_probe (plus_constant (Pmode,
10220 gen_rtx_PLUS (Pmode,
10221 stack_pointer_rtx,
10222 sr.reg),
10223 rounded_size - size));
10224
10225 release_scratch_register_on_entry (&sr);
10226 }
10227
10228 /* Make sure nothing is scheduled before we are done. */
10229 emit_insn (gen_blockage ());
10230 }
10231
10232 /* Probe a range of stack addresses from REG to END, inclusive. These are
10233 offsets from the current stack pointer. */
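/* Sketch of the emitted loop, for illustration only (operand names and the
   4096-byte interval are placeholders; REG holds a negative offset from SP
   and END is the final offset):

	.LPSRL1:
		cmp	END, REG	# done once the whole range is probed
		je	.LPSRE1
		sub	$4096, REG	# advance by one probe interval
		or	$0, (%rsp,REG)	# probe at SP + REG
		jmp	.LPSRL1
	.LPSRE1:
*/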
10234
10235 const char *
10236 output_probe_stack_range (rtx reg, rtx end)
10237 {
10238 static int labelno = 0;
10239 char loop_lab[32], end_lab[32];
10240 rtx xops[3];
10241
10242 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10243 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10244
10245 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10246
10247 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10248 xops[0] = reg;
10249 xops[1] = end;
10250 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10251 fputs ("\tje\t", asm_out_file);
10252 assemble_name_raw (asm_out_file, end_lab);
10253 fputc ('\n', asm_out_file);
10254
10255 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10256 xops[1] = GEN_INT (PROBE_INTERVAL);
10257 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10258
10259 /* Probe at TEST_ADDR. */
10260 xops[0] = stack_pointer_rtx;
10261 xops[1] = reg;
10262 xops[2] = const0_rtx;
10263 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10264
10265 fprintf (asm_out_file, "\tjmp\t");
10266 assemble_name_raw (asm_out_file, loop_lab);
10267 fputc ('\n', asm_out_file);
10268
10269 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10270
10271 return "";
10272 }
10273
10274 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10275 to be generated in correct form. */
10276 static void
10277 ix86_finalize_stack_realign_flags (void)
10278 {
10279 /* Check if stack realign is really needed after reload, and
10280 store the result in cfun.  */
10281 unsigned int incoming_stack_boundary
10282 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10283 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10284 unsigned int stack_realign = (incoming_stack_boundary
10285 < (crtl->is_leaf
10286 ? crtl->max_used_stack_slot_alignment
10287 : crtl->stack_alignment_needed));
10288
10289 if (crtl->stack_realign_finalized)
10290 {
10291 /* After stack_realign_needed is finalized, we can no longer
10292 change it. */
10293 gcc_assert (crtl->stack_realign_needed == stack_realign);
10294 return;
10295 }
10296
10297 /* If the only reason for frame_pointer_needed is that we conservatively
10298 assumed stack realignment might be needed, but in the end nothing that
10299 needed the stack alignment had been spilled, clear frame_pointer_needed
10300 and say we don't need stack realignment. */
10301 if (stack_realign
10302 && !crtl->need_drap
10303 && frame_pointer_needed
10304 && crtl->is_leaf
10305 && flag_omit_frame_pointer
10306 && crtl->sp_is_unchanging
10307 && !ix86_current_function_calls_tls_descriptor
10308 && !crtl->accesses_prior_frames
10309 && !cfun->calls_alloca
10310 && !crtl->calls_eh_return
10311 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10312 && !ix86_frame_pointer_required ()
10313 && get_frame_size () == 0
10314 && ix86_nsaved_sseregs () == 0
10315 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10316 {
10317 HARD_REG_SET set_up_by_prologue, prologue_used;
10318 basic_block bb;
10319
10320 CLEAR_HARD_REG_SET (prologue_used);
10321 CLEAR_HARD_REG_SET (set_up_by_prologue);
10322 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10323 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10324 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10325 HARD_FRAME_POINTER_REGNUM);
10326 FOR_EACH_BB (bb)
10327 {
10328 rtx insn;
10329 FOR_BB_INSNS (bb, insn)
10330 if (NONDEBUG_INSN_P (insn)
10331 && requires_stack_frame_p (insn, prologue_used,
10332 set_up_by_prologue))
10333 {
10334 crtl->stack_realign_needed = stack_realign;
10335 crtl->stack_realign_finalized = true;
10336 return;
10337 }
10338 }
10339
10340 frame_pointer_needed = false;
10341 stack_realign = false;
10342 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10343 crtl->stack_alignment_needed = incoming_stack_boundary;
10344 crtl->stack_alignment_estimated = incoming_stack_boundary;
10345 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10346 crtl->preferred_stack_boundary = incoming_stack_boundary;
10347 df_finish_pass (true);
10348 df_scan_alloc (NULL);
10349 df_scan_blocks ();
10350 df_compute_regs_ever_live (true);
10351 df_analyze ();
10352 }
10353
10354 crtl->stack_realign_needed = stack_realign;
10355 crtl->stack_realign_finalized = true;
10356 }
10357
10358 /* Expand the prologue into a bunch of separate insns. */
10359
10360 void
10361 ix86_expand_prologue (void)
10362 {
10363 struct machine_function *m = cfun->machine;
10364 rtx insn, t;
10365 bool pic_reg_used;
10366 struct ix86_frame frame;
10367 HOST_WIDE_INT allocate;
10368 bool int_registers_saved;
10369 bool sse_registers_saved;
10370
10371 ix86_finalize_stack_realign_flags ();
10372
10373 /* DRAP should not coexist with stack_realign_fp */
10374 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10375
10376 memset (&m->fs, 0, sizeof (m->fs));
10377
10378 /* Initialize CFA state for before the prologue. */
10379 m->fs.cfa_reg = stack_pointer_rtx;
10380 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10381
10382 /* Track SP offset to the CFA. We continue tracking this after we've
10383 swapped the CFA register away from SP. In the case of re-alignment
10384 this is fudged; we're interested in offsets within the local frame.  */
10385 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10386 m->fs.sp_valid = true;
10387
10388 ix86_compute_frame_layout (&frame);
10389
10390 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10391 {
10392 /* We should have already generated an error for any use of
10393 ms_hook on a nested function. */
10394 gcc_checking_assert (!ix86_static_chain_on_stack);
10395
10396 /* Check if profiling is active and we shall use the profiling-before-
10397 prologue variant.  If so, issue a sorry.  */
10398 if (crtl->profile && flag_fentry != 0)
10399 sorry ("ms_hook_prologue attribute isn%'t compatible "
10400 "with -mfentry for 32-bit");
10401
10402 /* In ix86_asm_output_function_label we emitted:
10403 8b ff movl.s %edi,%edi
10404 55 push %ebp
10405 8b ec movl.s %esp,%ebp
10406
10407 This matches the hookable function prologue in Win32 API
10408 functions in Microsoft Windows XP Service Pack 2 and newer.
10409 Wine uses this to enable Windows apps to hook the Win32 API
10410 functions provided by Wine.
10411
10412 What that means is that we've already set up the frame pointer. */
10413
10414 if (frame_pointer_needed
10415 && !(crtl->drap_reg && crtl->stack_realign_needed))
10416 {
10417 rtx push, mov;
10418
10419 /* We've decided to use the frame pointer already set up.
10420 Describe this to the unwinder by pretending that both
10421 push and mov insns happen right here.
10422
10423 Putting the unwind info here at the end of the ms_hook
10424 is done so that we can make absolutely certain we get
10425 the required byte sequence at the start of the function,
10426 rather than relying on an assembler that can produce
10427 the exact encoding required.
10428
10429 However it does mean (in the unpatched case) that we have
10430 a 1 insn window where the asynchronous unwind info is
10431 incorrect. However, if we placed the unwind info at
10432 its correct location we would have incorrect unwind info
10433 in the patched case. Which is probably all moot since
10434 I don't expect Wine generates dwarf2 unwind info for the
10435 system libraries that use this feature. */
10436
10437 insn = emit_insn (gen_blockage ());
10438
10439 push = gen_push (hard_frame_pointer_rtx);
10440 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10441 stack_pointer_rtx);
10442 RTX_FRAME_RELATED_P (push) = 1;
10443 RTX_FRAME_RELATED_P (mov) = 1;
10444
10445 RTX_FRAME_RELATED_P (insn) = 1;
10446 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10447 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10448
10449 /* Note that gen_push incremented m->fs.cfa_offset, even
10450 though we didn't emit the push insn here. */
10451 m->fs.cfa_reg = hard_frame_pointer_rtx;
10452 m->fs.fp_offset = m->fs.cfa_offset;
10453 m->fs.fp_valid = true;
10454 }
10455 else
10456 {
10457 /* The frame pointer is not needed so pop %ebp again.
10458 This leaves us with a pristine state. */
10459 emit_insn (gen_pop (hard_frame_pointer_rtx));
10460 }
10461 }
10462
10463 /* The first insn of a function that accepts its static chain on the
10464 stack is to push the register that would be filled in by a direct
10465 call. This insn will be skipped by the trampoline. */
10466 else if (ix86_static_chain_on_stack)
10467 {
10468 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10469 emit_insn (gen_blockage ());
10470
10471 /* We don't want to interpret this push insn as a register save,
10472 only as a stack adjustment. The real copy of the register as
10473 a save will be done later, if needed. */
10474 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10475 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10476 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10477 RTX_FRAME_RELATED_P (insn) = 1;
10478 }
10479
10480 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10481 DRAP is needed and stack realignment is really needed after reload.  */
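/* As a purely illustrative example, in 32-bit code with %ecx chosen as the
   DRAP register and a 16-byte alignment requirement, the sequence emitted
   below boils down to something like:

	leal	4(%esp), %ecx	# remember the incoming argument pointer
	andl	$-16, %esp	# realign the stack
	pushl	-4(%ecx)	# replicate the return address

   (the register choice and alignment here are assumptions, not fixed).  */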
10482 if (stack_realign_drap)
10483 {
10484 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10485
10486 /* Only need to push parameter pointer reg if it is caller saved. */
10487 if (!call_used_regs[REGNO (crtl->drap_reg)])
10488 {
10489 /* Push arg pointer reg */
10490 insn = emit_insn (gen_push (crtl->drap_reg));
10491 RTX_FRAME_RELATED_P (insn) = 1;
10492 }
10493
10494 /* Grab the argument pointer. */
10495 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10496 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10497 RTX_FRAME_RELATED_P (insn) = 1;
10498 m->fs.cfa_reg = crtl->drap_reg;
10499 m->fs.cfa_offset = 0;
10500
10501 /* Align the stack. */
10502 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10503 stack_pointer_rtx,
10504 GEN_INT (-align_bytes)));
10505 RTX_FRAME_RELATED_P (insn) = 1;
10506
10507 /* Replicate the return address on the stack so that return
10508 address can be reached via (argp - 1) slot. This is needed
10509 to implement macro RETURN_ADDR_RTX and intrinsic function
10510 expand_builtin_return_addr etc. */
10511 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10512 t = gen_frame_mem (word_mode, t);
10513 insn = emit_insn (gen_push (t));
10514 RTX_FRAME_RELATED_P (insn) = 1;
10515
10516 /* For the purposes of frame and register save area addressing,
10517 we've started over with a new frame. */
10518 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10519 m->fs.realigned = true;
10520 }
10521
10522 int_registers_saved = (frame.nregs == 0);
10523 sse_registers_saved = (frame.nsseregs == 0);
10524
10525 if (frame_pointer_needed && !m->fs.fp_valid)
10526 {
10527 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10528 slower on all targets. Also sdb doesn't like it. */
10529 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10530 RTX_FRAME_RELATED_P (insn) = 1;
10531
10532 /* Push registers now, before setting the frame pointer
10533 on SEH target. */
10534 if (!int_registers_saved
10535 && TARGET_SEH
10536 && !frame.save_regs_using_mov)
10537 {
10538 ix86_emit_save_regs ();
10539 int_registers_saved = true;
10540 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10541 }
10542
10543 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10544 {
10545 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10546 RTX_FRAME_RELATED_P (insn) = 1;
10547
10548 if (m->fs.cfa_reg == stack_pointer_rtx)
10549 m->fs.cfa_reg = hard_frame_pointer_rtx;
10550 m->fs.fp_offset = m->fs.sp_offset;
10551 m->fs.fp_valid = true;
10552 }
10553 }
10554
10555 if (!int_registers_saved)
10556 {
10557 /* If saving registers via PUSH, do so now. */
10558 if (!frame.save_regs_using_mov)
10559 {
10560 ix86_emit_save_regs ();
10561 int_registers_saved = true;
10562 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10563 }
10564
10565 /* When using red zone we may start register saving before allocating
10566 the stack frame saving one cycle of the prologue. However, avoid
10567 doing this if we have to probe the stack; at least on x86_64 the
10568 stack probe can turn into a call that clobbers a red zone location. */
10569 else if (ix86_using_red_zone ()
10570 && (! TARGET_STACK_PROBE
10571 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10572 {
10573 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10574 int_registers_saved = true;
10575 }
10576 }
10577
10578 if (stack_realign_fp)
10579 {
10580 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10581 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10582
10583 /* The computation of the size of the re-aligned stack frame means
10584 that we must allocate the size of the register save area before
10585 performing the actual alignment. Otherwise we cannot guarantee
10586 that there's enough storage above the realignment point. */
10587 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10588 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10589 GEN_INT (m->fs.sp_offset
10590 - frame.sse_reg_save_offset),
10591 -1, false);
10592
10593 /* Align the stack. */
10594 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10595 stack_pointer_rtx,
10596 GEN_INT (-align_bytes)));
10597
10598 /* For the purposes of register save area addressing, the stack
10599 pointer is no longer valid. As for the value of sp_offset,
10600 see ix86_compute_frame_layout, which we need to match in order
10601 to pass verification of stack_pointer_offset at the end. */
10602 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10603 m->fs.sp_valid = false;
10604 }
10605
10606 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10607
10608 if (flag_stack_usage_info)
10609 {
10610 /* We start to count from ARG_POINTER. */
10611 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10612
10613 /* If it was realigned, take into account the fake frame. */
10614 if (stack_realign_drap)
10615 {
10616 if (ix86_static_chain_on_stack)
10617 stack_size += UNITS_PER_WORD;
10618
10619 if (!call_used_regs[REGNO (crtl->drap_reg)])
10620 stack_size += UNITS_PER_WORD;
10621
10622 /* This over-estimates by 1 minimal-stack-alignment-unit but
10623 mitigates that by counting in the new return address slot. */
10624 current_function_dynamic_stack_size
10625 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10626 }
10627
10628 current_function_static_stack_size = stack_size;
10629 }
10630
10631 /* On SEH target with very large frame size, allocate an area to save
10632 SSE registers (as the very large allocation won't be described). */
10633 if (TARGET_SEH
10634 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10635 && !sse_registers_saved)
10636 {
10637 HOST_WIDE_INT sse_size =
10638 frame.sse_reg_save_offset - frame.reg_save_offset;
10639
10640 gcc_assert (int_registers_saved);
10641
10642 /* No need to do stack checking as the area will be immediately
10643 written. */
10644 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10645 GEN_INT (-sse_size), -1,
10646 m->fs.cfa_reg == stack_pointer_rtx);
10647 allocate -= sse_size;
10648 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10649 sse_registers_saved = true;
10650 }
10651
10652 /* The stack has already been decremented by the instruction calling us
10653 so probe if the size is non-negative to preserve the protection area. */
10654 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10655 {
10656 /* We expect the registers to be saved when probes are used. */
10657 gcc_assert (int_registers_saved);
10658
10659 if (STACK_CHECK_MOVING_SP)
10660 {
10661 if (!(crtl->is_leaf && !cfun->calls_alloca
10662 && allocate <= PROBE_INTERVAL))
10663 {
10664 ix86_adjust_stack_and_probe (allocate);
10665 allocate = 0;
10666 }
10667 }
10668 else
10669 {
10670 HOST_WIDE_INT size = allocate;
10671
10672 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10673 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10674
10675 if (TARGET_STACK_PROBE)
10676 {
10677 if (crtl->is_leaf && !cfun->calls_alloca)
10678 {
10679 if (size > PROBE_INTERVAL)
10680 ix86_emit_probe_stack_range (0, size);
10681 }
10682 else
10683 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10684 }
10685 else
10686 {
10687 if (crtl->is_leaf && !cfun->calls_alloca)
10688 {
10689 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
10690 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
10691 size - STACK_CHECK_PROTECT);
10692 }
10693 else
10694 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10695 }
10696 }
10697 }
10698
10699 if (allocate == 0)
10700 ;
10701 else if (!ix86_target_stack_probe ()
10702 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10703 {
10704 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10705 GEN_INT (-allocate), -1,
10706 m->fs.cfa_reg == stack_pointer_rtx);
10707 }
10708 else
10709 {
10710 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10711 rtx r10 = NULL;
10712 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10713 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10714 bool eax_live = false;
10715 bool r10_live = false;
10716
10717 if (TARGET_64BIT)
10718 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10719 if (!TARGET_64BIT_MS_ABI)
10720 eax_live = ix86_eax_live_at_start_p ();
10721
10722 /* Note that SEH directives need to continue tracking the stack
10723 pointer even after the frame pointer has been set up. */
10724 if (eax_live)
10725 {
10726 insn = emit_insn (gen_push (eax));
10727 allocate -= UNITS_PER_WORD;
10728 if (sp_is_cfa_reg || TARGET_SEH)
10729 {
10730 if (sp_is_cfa_reg)
10731 m->fs.cfa_offset += UNITS_PER_WORD;
10732 RTX_FRAME_RELATED_P (insn) = 1;
10733 }
10734 }
10735
10736 if (r10_live)
10737 {
10738 r10 = gen_rtx_REG (Pmode, R10_REG);
10739 insn = emit_insn (gen_push (r10));
10740 allocate -= UNITS_PER_WORD;
10741 if (sp_is_cfa_reg || TARGET_SEH)
10742 {
10743 if (sp_is_cfa_reg)
10744 m->fs.cfa_offset += UNITS_PER_WORD;
10745 RTX_FRAME_RELATED_P (insn) = 1;
10746 }
10747 }
10748
10749 emit_move_insn (eax, GEN_INT (allocate));
10750 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10751
10752 /* Use the fact that AX still contains ALLOCATE. */
10753 adjust_stack_insn = (Pmode == DImode
10754 ? gen_pro_epilogue_adjust_stack_di_sub
10755 : gen_pro_epilogue_adjust_stack_si_sub);
10756
10757 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10758 stack_pointer_rtx, eax));
10759
10760 if (sp_is_cfa_reg || TARGET_SEH)
10761 {
10762 if (sp_is_cfa_reg)
10763 m->fs.cfa_offset += allocate;
10764 RTX_FRAME_RELATED_P (insn) = 1;
10765 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10766 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10767 plus_constant (Pmode, stack_pointer_rtx,
10768 -allocate)));
10769 }
10770 m->fs.sp_offset += allocate;
10771
10772 if (r10_live && eax_live)
10773 {
10774 t = choose_baseaddr (m->fs.sp_offset - allocate);
10775 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10776 gen_frame_mem (word_mode, t));
10777 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10778 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10779 gen_frame_mem (word_mode, t));
10780 }
10781 else if (eax_live || r10_live)
10782 {
10783 t = choose_baseaddr (m->fs.sp_offset - allocate);
10784 emit_move_insn (gen_rtx_REG (word_mode,
10785 (eax_live ? AX_REG : R10_REG)),
10786 gen_frame_mem (word_mode, t));
10787 }
10788 }
10789 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10790
10791 /* If we haven't already set up the frame pointer, do so now.  */
10792 if (frame_pointer_needed && !m->fs.fp_valid)
10793 {
10794 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10795 GEN_INT (frame.stack_pointer_offset
10796 - frame.hard_frame_pointer_offset));
10797 insn = emit_insn (insn);
10798 RTX_FRAME_RELATED_P (insn) = 1;
10799 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10800
10801 if (m->fs.cfa_reg == stack_pointer_rtx)
10802 m->fs.cfa_reg = hard_frame_pointer_rtx;
10803 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10804 m->fs.fp_valid = true;
10805 }
10806
10807 if (!int_registers_saved)
10808 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10809 if (!sse_registers_saved)
10810 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10811
10812 pic_reg_used = false;
10813 /* We don't use the PIC register for the pe-coff target.  */
10814 if (pic_offset_table_rtx
10815 && !TARGET_PECOFF
10816 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10817 || crtl->profile))
10818 {
10819 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10820
10821 if (alt_pic_reg_used != INVALID_REGNUM)
10822 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10823
10824 pic_reg_used = true;
10825 }
10826
10827 if (pic_reg_used)
10828 {
10829 if (TARGET_64BIT)
10830 {
10831 if (ix86_cmodel == CM_LARGE_PIC)
10832 {
10833 rtx label, tmp_reg;
10834
10835 gcc_assert (Pmode == DImode);
10836 label = gen_label_rtx ();
10837 emit_label (label);
10838 LABEL_PRESERVE_P (label) = 1;
10839 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10840 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10841 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10842 label));
10843 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10844 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10845 pic_offset_table_rtx, tmp_reg));
10846 }
10847 else
10848 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10849 }
10850 else
10851 {
10852 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10853 RTX_FRAME_RELATED_P (insn) = 1;
10854 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10855 }
10856 }
10857
10858 /* In the pic_reg_used case, make sure that the got load isn't deleted
10859 when mcount needs it. Blockage to avoid call movement across mcount
10860 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10861 note. */
10862 if (crtl->profile && !flag_fentry && pic_reg_used)
10863 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10864
10865 if (crtl->drap_reg && !crtl->stack_realign_needed)
10866 {
10867 /* vDRAP is set up, but after reload it turns out stack realignment
10868 isn't necessary; here we emit the prologue to set up DRAP
10869 without the stack realignment adjustment.  */
10870 t = choose_baseaddr (0);
10871 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10872 }
10873
10874 /* Prevent instructions from being scheduled into register save push
10875 sequence when access to the redzone area is done through frame pointer.
10876 The offset between the frame pointer and the stack pointer is calculated
10877 relative to the value of the stack pointer at the end of the function
10878 prologue, and moving instructions that access redzone area via frame
10879 pointer inside push sequence violates this assumption. */
10880 if (frame_pointer_needed && frame.red_zone_size)
10881 emit_insn (gen_memory_blockage ());
10882
10883 /* Emit cld instruction if stringops are used in the function. */
10884 if (TARGET_CLD && ix86_current_function_needs_cld)
10885 emit_insn (gen_cld ());
10886
10887 /* SEH requires that the prologue end within 256 bytes of the start of
10888 the function. Prevent instruction schedules that would extend that.
10889 Further, prevent alloca modifications to the stack pointer from being
10890 combined with prologue modifications. */
10891 if (TARGET_SEH)
10892 emit_insn (gen_prologue_use (stack_pointer_rtx));
10893 }
10894
10895 /* Emit code to restore REG using a POP insn. */
10896
10897 static void
10898 ix86_emit_restore_reg_using_pop (rtx reg)
10899 {
10900 struct machine_function *m = cfun->machine;
10901 rtx insn = emit_insn (gen_pop (reg));
10902
10903 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10904 m->fs.sp_offset -= UNITS_PER_WORD;
10905
10906 if (m->fs.cfa_reg == crtl->drap_reg
10907 && REGNO (reg) == REGNO (crtl->drap_reg))
10908 {
10909 /* Previously we'd represented the CFA as an expression
10910 like *(%ebp - 8). We've just popped that value from
10911 the stack, which means we need to reset the CFA to
10912 the drap register. This will remain until we restore
10913 the stack pointer. */
10914 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10915 RTX_FRAME_RELATED_P (insn) = 1;
10916
10917 /* This means that the DRAP register is valid for addressing too. */
10918 m->fs.drap_valid = true;
10919 return;
10920 }
10921
10922 if (m->fs.cfa_reg == stack_pointer_rtx)
10923 {
10924 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10925 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10926 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10927 RTX_FRAME_RELATED_P (insn) = 1;
10928
10929 m->fs.cfa_offset -= UNITS_PER_WORD;
10930 }
10931
10932 /* When the frame pointer is the CFA, and we pop it, we are
10933 swapping back to the stack pointer as the CFA. This happens
10934 for stack frames that don't allocate other data, so we assume
10935 the stack pointer is now pointing at the return address, i.e.
10936 the function entry state, which makes the offset be 1 word. */
10937 if (reg == hard_frame_pointer_rtx)
10938 {
10939 m->fs.fp_valid = false;
10940 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10941 {
10942 m->fs.cfa_reg = stack_pointer_rtx;
10943 m->fs.cfa_offset -= UNITS_PER_WORD;
10944
10945 add_reg_note (insn, REG_CFA_DEF_CFA,
10946 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10947 GEN_INT (m->fs.cfa_offset)));
10948 RTX_FRAME_RELATED_P (insn) = 1;
10949 }
10950 }
10951 }
10952
10953 /* Emit code to restore saved registers using POP insns. */
10954
10955 static void
10956 ix86_emit_restore_regs_using_pop (void)
10957 {
10958 unsigned int regno;
10959
10960 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10961 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10962 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10963 }
10964
10965 /* Emit code and notes for the LEAVE instruction. */
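/* (leave is equivalent to restoring the stack pointer from the frame
   pointer and then popping the saved frame pointer, i.e.
   mov %ebp, %esp; pop %ebp, or the %rbp/%rsp forms in 64-bit code.)  */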
10966
10967 static void
10968 ix86_emit_leave (void)
10969 {
10970 struct machine_function *m = cfun->machine;
10971 rtx insn = emit_insn (ix86_gen_leave ());
10972
10973 ix86_add_queued_cfa_restore_notes (insn);
10974
10975 gcc_assert (m->fs.fp_valid);
10976 m->fs.sp_valid = true;
10977 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10978 m->fs.fp_valid = false;
10979
10980 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10981 {
10982 m->fs.cfa_reg = stack_pointer_rtx;
10983 m->fs.cfa_offset = m->fs.sp_offset;
10984
10985 add_reg_note (insn, REG_CFA_DEF_CFA,
10986 plus_constant (Pmode, stack_pointer_rtx,
10987 m->fs.sp_offset));
10988 RTX_FRAME_RELATED_P (insn) = 1;
10989 }
10990 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10991 m->fs.fp_offset);
10992 }
10993
10994 /* Emit code to restore saved registers using MOV insns.
10995 First register is restored from CFA - CFA_OFFSET. */
10996 static void
10997 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10998 bool maybe_eh_return)
10999 {
11000 struct machine_function *m = cfun->machine;
11001 unsigned int regno;
11002
11003 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11004 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11005 {
11006 rtx reg = gen_rtx_REG (word_mode, regno);
11007 rtx insn, mem;
11008
11009 mem = choose_baseaddr (cfa_offset);
11010 mem = gen_frame_mem (word_mode, mem);
11011 insn = emit_move_insn (reg, mem);
11012
11013 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
11014 {
11015 /* Previously we'd represented the CFA as an expression
11016 like *(%ebp - 8). We've just popped that value from
11017 the stack, which means we need to reset the CFA to
11018 the drap register. This will remain until we restore
11019 the stack pointer. */
11020 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
11021 RTX_FRAME_RELATED_P (insn) = 1;
11022
11023 /* This means that the DRAP register is valid for addressing. */
11024 m->fs.drap_valid = true;
11025 }
11026 else
11027 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11028
11029 cfa_offset -= UNITS_PER_WORD;
11030 }
11031 }
11032
11033 /* Emit code to restore saved registers using MOV insns.
11034 First register is restored from CFA - CFA_OFFSET. */
11035 static void
11036 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
11037 bool maybe_eh_return)
11038 {
11039 unsigned int regno;
11040
11041 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11042 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11043 {
11044 rtx reg = gen_rtx_REG (V4SFmode, regno);
11045 rtx mem;
11046
11047 mem = choose_baseaddr (cfa_offset);
11048 mem = gen_rtx_MEM (V4SFmode, mem);
11049 set_mem_align (mem, 128);
11050 emit_move_insn (reg, mem);
11051
11052 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11053
11054 cfa_offset -= 16;
11055 }
11056 }
11057
11058 /* Restore function stack, frame, and registers. */
11059
11060 void
11061 ix86_expand_epilogue (int style)
11062 {
11063 struct machine_function *m = cfun->machine;
11064 struct machine_frame_state frame_state_save = m->fs;
11065 struct ix86_frame frame;
11066 bool restore_regs_via_mov;
11067 bool using_drap;
11068
11069 ix86_finalize_stack_realign_flags ();
11070 ix86_compute_frame_layout (&frame);
11071
11072 m->fs.sp_valid = (!frame_pointer_needed
11073 || (crtl->sp_is_unchanging
11074 && !stack_realign_fp));
11075 gcc_assert (!m->fs.sp_valid
11076 || m->fs.sp_offset == frame.stack_pointer_offset);
11077
11078 /* The FP must be valid if the frame pointer is present. */
11079 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
11080 gcc_assert (!m->fs.fp_valid
11081 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
11082
11083 /* We must have *some* valid pointer to the stack frame. */
11084 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
11085
11086 /* The DRAP is never valid at this point. */
11087 gcc_assert (!m->fs.drap_valid);
11088
11089 /* See the comment about red zone and frame
11090 pointer usage in ix86_expand_prologue. */
11091 if (frame_pointer_needed && frame.red_zone_size)
11092 emit_insn (gen_memory_blockage ());
11093
11094 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
11095 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
11096
11097 /* Determine the CFA offset of the end of the red-zone. */
11098 m->fs.red_zone_offset = 0;
11099 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
11100 {
11101 /* The red-zone begins below the return address. */
11102 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
11103
11104 /* When the register save area is in the aligned portion of
11105 the stack, determine the maximum runtime displacement that
11106 matches up with the aligned frame. */
11107 if (stack_realign_drap)
11108 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
11109 + UNITS_PER_WORD);
11110 }
11111
11112 /* Special care must be taken for the normal return case of a function
11113 using eh_return: the eax and edx registers are marked as saved, but
11114 not restored along this path. Adjust the save location to match. */
11115 if (crtl->calls_eh_return && style != 2)
11116 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
11117
11118 /* EH_RETURN requires the use of moves to function properly. */
11119 if (crtl->calls_eh_return)
11120 restore_regs_via_mov = true;
11121 /* SEH requires the use of pops to identify the epilogue. */
11122 else if (TARGET_SEH)
11123 restore_regs_via_mov = false;
11124 /* If we're only restoring one register and sp is not valid, then
11125 use a move instruction to restore the register, since it's
11126 less work than reloading sp and popping the register.  */
11127 else if (!m->fs.sp_valid && frame.nregs <= 1)
11128 restore_regs_via_mov = true;
11129 else if (TARGET_EPILOGUE_USING_MOVE
11130 && cfun->machine->use_fast_prologue_epilogue
11131 && (frame.nregs > 1
11132 || m->fs.sp_offset != frame.reg_save_offset))
11133 restore_regs_via_mov = true;
11134 else if (frame_pointer_needed
11135 && !frame.nregs
11136 && m->fs.sp_offset != frame.reg_save_offset)
11137 restore_regs_via_mov = true;
11138 else if (frame_pointer_needed
11139 && TARGET_USE_LEAVE
11140 && cfun->machine->use_fast_prologue_epilogue
11141 && frame.nregs == 1)
11142 restore_regs_via_mov = true;
11143 else
11144 restore_regs_via_mov = false;
11145
11146 if (restore_regs_via_mov || frame.nsseregs)
11147 {
11148 /* Ensure that the entire register save area is addressable via
11149 the stack pointer, if we will restore via sp. */
11150 if (TARGET_64BIT
11151 && m->fs.sp_offset > 0x7fffffff
11152 && !(m->fs.fp_valid || m->fs.drap_valid)
11153 && (frame.nsseregs + frame.nregs) != 0)
11154 {
11155 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11156 GEN_INT (m->fs.sp_offset
11157 - frame.sse_reg_save_offset),
11158 style,
11159 m->fs.cfa_reg == stack_pointer_rtx);
11160 }
11161 }
11162
11163 /* If there are any SSE registers to restore, then we have to do it
11164 via moves, since there's obviously no pop for SSE regs. */
11165 if (frame.nsseregs)
11166 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11167 style == 2);
11168
11169 if (restore_regs_via_mov)
11170 {
11171 rtx t;
11172
11173 if (frame.nregs)
11174 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11175
11176 /* eh_return epilogues need %ecx added to the stack pointer. */
11177 if (style == 2)
11178 {
11179 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11180
11181 /* Stack align doesn't work with eh_return. */
11182 gcc_assert (!stack_realign_drap);
11183 /* Neither do regparm nested functions.  */
11184 gcc_assert (!ix86_static_chain_on_stack);
11185
11186 if (frame_pointer_needed)
11187 {
11188 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11189 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11190 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11191
11192 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11193 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11194
11195 /* Note that we use SA as a temporary CFA, as the return
11196 address is at the proper place relative to it. We
11197 pretend this happens at the FP restore insn because
11198 prior to this insn the FP would be stored at the wrong
11199 offset relative to SA, and after this insn we have no
11200 other reasonable register to use for the CFA. We don't
11201 bother resetting the CFA to the SP for the duration of
11202 the return insn. */
11203 add_reg_note (insn, REG_CFA_DEF_CFA,
11204 plus_constant (Pmode, sa, UNITS_PER_WORD));
11205 ix86_add_queued_cfa_restore_notes (insn);
11206 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11207 RTX_FRAME_RELATED_P (insn) = 1;
11208
11209 m->fs.cfa_reg = sa;
11210 m->fs.cfa_offset = UNITS_PER_WORD;
11211 m->fs.fp_valid = false;
11212
11213 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11214 const0_rtx, style, false);
11215 }
11216 else
11217 {
11218 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11219 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11220 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11221 ix86_add_queued_cfa_restore_notes (insn);
11222
11223 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11224 if (m->fs.cfa_offset != UNITS_PER_WORD)
11225 {
11226 m->fs.cfa_offset = UNITS_PER_WORD;
11227 add_reg_note (insn, REG_CFA_DEF_CFA,
11228 plus_constant (Pmode, stack_pointer_rtx,
11229 UNITS_PER_WORD));
11230 RTX_FRAME_RELATED_P (insn) = 1;
11231 }
11232 }
11233 m->fs.sp_offset = UNITS_PER_WORD;
11234 m->fs.sp_valid = true;
11235 }
11236 }
11237 else
11238 {
11239 /* SEH requires that the function end with (1) a stack adjustment
11240 if necessary, (2) a sequence of pops, and (3) a return or
11241 jump instruction. Prevent insns from the function body from
11242 being scheduled into this sequence. */
11243 if (TARGET_SEH)
11244 {
11245 /* Prevent a catch region from being adjacent to the standard
11246 epilogue sequence.  Unfortunately, neither crtl->uses_eh_lsda nor
11247 several other flags that would be interesting to test are
11248 set up yet.  */
11249 if (flag_non_call_exceptions)
11250 emit_insn (gen_nops (const1_rtx));
11251 else
11252 emit_insn (gen_blockage ());
11253 }
11254
11255 /* First step is to deallocate the stack frame so that we can
11256 pop the registers. Also do it on SEH target for very large
11257 frame as the emitted instructions aren't allowed by the ABI in
11258 epilogues. */
11259 if (!m->fs.sp_valid
11260 || (TARGET_SEH
11261 && (m->fs.sp_offset - frame.reg_save_offset
11262 >= SEH_MAX_FRAME_SIZE)))
11263 {
11264 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11265 GEN_INT (m->fs.fp_offset
11266 - frame.reg_save_offset),
11267 style, false);
11268 }
11269 else if (m->fs.sp_offset != frame.reg_save_offset)
11270 {
11271 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11272 GEN_INT (m->fs.sp_offset
11273 - frame.reg_save_offset),
11274 style,
11275 m->fs.cfa_reg == stack_pointer_rtx);
11276 }
11277
11278 ix86_emit_restore_regs_using_pop ();
11279 }
11280
11281 /* If we used a frame pointer and haven't already got rid of it,
11282 then do so now. */
11283 if (m->fs.fp_valid)
11284 {
11285 /* If the stack pointer is valid and pointing at the frame
11286 pointer store address, then we only need a pop. */
11287 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11288 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11289 /* Leave results in shorter dependency chains on CPUs that are
11290 able to grok it fast. */
11291 else if (TARGET_USE_LEAVE
11292 || optimize_bb_for_size_p (EXIT_BLOCK_PTR)
11293 || !cfun->machine->use_fast_prologue_epilogue)
11294 ix86_emit_leave ();
11295 else
11296 {
11297 pro_epilogue_adjust_stack (stack_pointer_rtx,
11298 hard_frame_pointer_rtx,
11299 const0_rtx, style, !using_drap);
11300 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11301 }
11302 }
11303
11304 if (using_drap)
11305 {
11306 int param_ptr_offset = UNITS_PER_WORD;
11307 rtx insn;
11308
11309 gcc_assert (stack_realign_drap);
11310
11311 if (ix86_static_chain_on_stack)
11312 param_ptr_offset += UNITS_PER_WORD;
11313 if (!call_used_regs[REGNO (crtl->drap_reg)])
11314 param_ptr_offset += UNITS_PER_WORD;
11315
11316 insn = emit_insn (gen_rtx_SET
11317 (VOIDmode, stack_pointer_rtx,
11318 gen_rtx_PLUS (Pmode,
11319 crtl->drap_reg,
11320 GEN_INT (-param_ptr_offset))));
11321 m->fs.cfa_reg = stack_pointer_rtx;
11322 m->fs.cfa_offset = param_ptr_offset;
11323 m->fs.sp_offset = param_ptr_offset;
11324 m->fs.realigned = false;
11325
11326 add_reg_note (insn, REG_CFA_DEF_CFA,
11327 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11328 GEN_INT (param_ptr_offset)));
11329 RTX_FRAME_RELATED_P (insn) = 1;
11330
11331 if (!call_used_regs[REGNO (crtl->drap_reg)])
11332 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11333 }
11334
11335 /* At this point the stack pointer must be valid, and we must have
11336 restored all of the registers. We may not have deallocated the
11337 entire stack frame. We've delayed this until now because it may
11338 be possible to merge the local stack deallocation with the
11339 deallocation forced by ix86_static_chain_on_stack. */
11340 gcc_assert (m->fs.sp_valid);
11341 gcc_assert (!m->fs.fp_valid);
11342 gcc_assert (!m->fs.realigned);
11343 if (m->fs.sp_offset != UNITS_PER_WORD)
11344 {
11345 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11346 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11347 style, true);
11348 }
11349 else
11350 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11351
11352 /* Sibcall epilogues don't want a return instruction. */
11353 if (style == 0)
11354 {
11355 m->fs = frame_state_save;
11356 return;
11357 }
11358
11359 if (crtl->args.pops_args && crtl->args.size)
11360 {
11361 rtx popc = GEN_INT (crtl->args.pops_args);
11362
11363 /* i386 can only pop 64K bytes.  If asked to pop more, pop the return
11364 address, do an explicit add, and jump indirectly to the caller.  */
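/* Roughly, for a 32-bit function popping N (>= 64K) bytes of arguments:

	popl	%ecx		# return address
	addl	$N, %esp	# drop the arguments
	jmp	*%ecx		# return

   (N stands for crtl->args.pops_args; this is only a sketch of the
   emitted sequence).  */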
11365
11366 if (crtl->args.pops_args >= 65536)
11367 {
11368 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11369 rtx insn;
11370
11371 /* There is no "pascal" calling convention in any 64bit ABI. */
11372 gcc_assert (!TARGET_64BIT);
11373
11374 insn = emit_insn (gen_pop (ecx));
11375 m->fs.cfa_offset -= UNITS_PER_WORD;
11376 m->fs.sp_offset -= UNITS_PER_WORD;
11377
11378 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11379 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11380 add_reg_note (insn, REG_CFA_REGISTER,
11381 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11382 RTX_FRAME_RELATED_P (insn) = 1;
11383
11384 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11385 popc, -1, true);
11386 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11387 }
11388 else
11389 emit_jump_insn (gen_simple_return_pop_internal (popc));
11390 }
11391 else
11392 emit_jump_insn (gen_simple_return_internal ());
11393
11394 /* Restore the state back to the state from the prologue,
11395 so that it's correct for the next epilogue. */
11396 m->fs = frame_state_save;
11397 }
11398
11399 /* Reset from the function's potential modifications. */
11400
11401 static void
11402 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11403 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11404 {
11405 if (pic_offset_table_rtx)
11406 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11407 #if TARGET_MACHO
11408 /* Mach-O doesn't support labels at the end of objects, so if
11409 it looks like we might want one, insert a NOP. */
11410 {
11411 rtx insn = get_last_insn ();
11412 rtx deleted_debug_label = NULL_RTX;
11413 while (insn
11414 && NOTE_P (insn)
11415 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11416 {
11417 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11418 notes only, instead set their CODE_LABEL_NUMBER to -1,
11419 otherwise there would be code generation differences
11420 in between -g and -g0. */
11421 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11422 deleted_debug_label = insn;
11423 insn = PREV_INSN (insn);
11424 }
11425 if (insn
11426 && (LABEL_P (insn)
11427 || (NOTE_P (insn)
11428 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11429 fputs ("\tnop\n", file);
11430 else if (deleted_debug_label)
11431 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11432 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11433 CODE_LABEL_NUMBER (insn) = -1;
11434 }
11435 #endif
11436
11437 }
11438
11439 /* Return a scratch register to use in the split stack prologue. The
11440 split stack prologue is used for -fsplit-stack. It is the first
11441 instructions in the function, even before the regular prologue.
11442 The scratch register can be any caller-saved register which is not
11443 used for parameters or for the static chain. */
11444
11445 static unsigned int
11446 split_stack_prologue_scratch_regno (void)
11447 {
11448 if (TARGET_64BIT)
11449 return R11_REG;
11450 else
11451 {
11452 bool is_fastcall, is_thiscall;
11453 int regparm;
11454
11455 is_fastcall = (lookup_attribute ("fastcall",
11456 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11457 != NULL);
11458 is_thiscall = (lookup_attribute ("thiscall",
11459 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11460 != NULL);
11461 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11462
11463 if (is_fastcall)
11464 {
11465 if (DECL_STATIC_CHAIN (cfun->decl))
11466 {
11467 sorry ("-fsplit-stack does not support fastcall with "
11468 "nested function");
11469 return INVALID_REGNUM;
11470 }
11471 return AX_REG;
11472 }
11473 else if (is_thiscall)
11474 {
11475 if (!DECL_STATIC_CHAIN (cfun->decl))
11476 return DX_REG;
11477 return AX_REG;
11478 }
11479 else if (regparm < 3)
11480 {
11481 if (!DECL_STATIC_CHAIN (cfun->decl))
11482 return CX_REG;
11483 else
11484 {
11485 if (regparm >= 2)
11486 {
11487 sorry ("-fsplit-stack does not support 2 register "
11488 "parameters for a nested function");
11489 return INVALID_REGNUM;
11490 }
11491 return DX_REG;
11492 }
11493 }
11494 else
11495 {
11496 /* FIXME: We could make this work by pushing a register
11497 around the addition and comparison. */
11498 sorry ("-fsplit-stack does not support 3 register parameters");
11499 return INVALID_REGNUM;
11500 }
11501 }
11502 }
11503
11504 /* A SYMBOL_REF for the function which allocates new stackspace for
11505 -fsplit-stack. */
11506
11507 static GTY(()) rtx split_stack_fn;
11508
11509 /* A SYMBOL_REF for the more stack function when using the large
11510 model. */
11511
11512 static GTY(()) rtx split_stack_fn_large;
11513
11514 /* Handle -fsplit-stack. These are the first instructions in the
11515 function, even before the regular prologue. */
11516
11517 void
11518 ix86_expand_split_stack_prologue (void)
11519 {
11520 struct ix86_frame frame;
11521 HOST_WIDE_INT allocate;
11522 unsigned HOST_WIDE_INT args_size;
11523 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11524 rtx scratch_reg = NULL_RTX;
11525 rtx varargs_label = NULL_RTX;
11526 rtx fn;
11527
11528 gcc_assert (flag_split_stack && reload_completed);
11529
11530 ix86_finalize_stack_realign_flags ();
11531 ix86_compute_frame_layout (&frame);
11532 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11533
11534 /* This is the label we will branch to if we have enough stack
11535 space. We expect the basic block reordering pass to reverse this
11536 branch if optimizing, so that we branch in the unlikely case. */
11537 label = gen_label_rtx ();
11538
11539 /* We need to compare the stack pointer minus the frame size with
11540 the stack boundary in the TCB. The stack boundary always gives
11541 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11542 can compare directly. Otherwise we need to do an addition. */
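/* Illustrative shape of the emitted check on a 64-bit GNU/Linux target
   (the TCB offset is target-defined, and a scratch register replaces %rsp
   when the frame is large):

	cmpq	%fs:<boundary-offset>, %rsp
	jae	.Lhave_enough_stack
	... set up arguments and call __morestack ...
	.Lhave_enough_stack:
*/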
11543
11544 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11545 UNSPEC_STACK_CHECK);
11546 limit = gen_rtx_CONST (Pmode, limit);
11547 limit = gen_rtx_MEM (Pmode, limit);
11548 if (allocate < SPLIT_STACK_AVAILABLE)
11549 current = stack_pointer_rtx;
11550 else
11551 {
11552 unsigned int scratch_regno;
11553 rtx offset;
11554
11555 /* We need a scratch register to hold the stack pointer minus
11556 the required frame size. Since this is the very start of the
11557 function, the scratch register can be any caller-saved
11558 register which is not used for parameters. */
11559 offset = GEN_INT (- allocate);
11560 scratch_regno = split_stack_prologue_scratch_regno ();
11561 if (scratch_regno == INVALID_REGNUM)
11562 return;
11563 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11564 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11565 {
11566 /* We don't use ix86_gen_add3 in this case because it will
11567 want to split to lea, but when not optimizing the insn
11568 will not be split after this point. */
11569 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11570 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11571 offset)));
11572 }
11573 else
11574 {
11575 emit_move_insn (scratch_reg, offset);
11576 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11577 stack_pointer_rtx));
11578 }
11579 current = scratch_reg;
11580 }
11581
11582 ix86_expand_branch (GEU, current, limit, label);
11583 jump_insn = get_last_insn ();
11584 JUMP_LABEL (jump_insn) = label;
11585
11586 /* Mark the jump as very likely to be taken. */
11587 add_int_reg_note (jump_insn, REG_BR_PROB,
11588 REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
11589
11590 if (split_stack_fn == NULL_RTX)
11591 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11592 fn = split_stack_fn;
11593
11594 /* Get more stack space. We pass in the desired stack space and the
11595 size of the arguments to copy to the new stack. In 32-bit mode
11596 we push the parameters; __morestack will return on a new stack
11597 anyhow. In 64-bit mode we pass the parameters in r10 and
11598 r11. */
11599 allocate_rtx = GEN_INT (allocate);
11600 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11601 call_fusage = NULL_RTX;
11602 if (TARGET_64BIT)
11603 {
11604 rtx reg10, reg11;
11605
11606 reg10 = gen_rtx_REG (Pmode, R10_REG);
11607 reg11 = gen_rtx_REG (Pmode, R11_REG);
11608
11609 /* If this function uses a static chain, it will be in %r10.
11610 Preserve it across the call to __morestack. */
11611 if (DECL_STATIC_CHAIN (cfun->decl))
11612 {
11613 rtx rax;
11614
11615 rax = gen_rtx_REG (word_mode, AX_REG);
11616 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11617 use_reg (&call_fusage, rax);
11618 }
11619
11620 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11621 && !TARGET_PECOFF)
11622 {
11623 HOST_WIDE_INT argval;
11624
11625 gcc_assert (Pmode == DImode);
11626 /* When using the large model we need to load the address
11627 into a register, and we've run out of registers. So we
11628 switch to a different calling convention, and we call a
11629 different function: __morestack_large. We pass the
11630 argument size in the upper 32 bits of r10 and pass the
11631 frame size in the lower 32 bits. */
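/* E.g. args_size == 16 and allocate == 4096 would yield
   0x0000001000001000 in r10 (purely illustrative numbers).  */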
11632 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11633 gcc_assert ((args_size & 0xffffffff) == args_size);
11634
11635 if (split_stack_fn_large == NULL_RTX)
11636 split_stack_fn_large =
11637 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11638
11639 if (ix86_cmodel == CM_LARGE_PIC)
11640 {
11641 rtx label, x;
11642
11643 label = gen_label_rtx ();
11644 emit_label (label);
11645 LABEL_PRESERVE_P (label) = 1;
11646 emit_insn (gen_set_rip_rex64 (reg10, label));
11647 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11648 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11649 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11650 UNSPEC_GOT);
11651 x = gen_rtx_CONST (Pmode, x);
11652 emit_move_insn (reg11, x);
11653 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11654 x = gen_const_mem (Pmode, x);
11655 emit_move_insn (reg11, x);
11656 }
11657 else
11658 emit_move_insn (reg11, split_stack_fn_large);
11659
11660 fn = reg11;
11661
11662 argval = ((args_size << 16) << 16) + allocate;
11663 emit_move_insn (reg10, GEN_INT (argval));
11664 }
11665 else
11666 {
11667 emit_move_insn (reg10, allocate_rtx);
11668 emit_move_insn (reg11, GEN_INT (args_size));
11669 use_reg (&call_fusage, reg11);
11670 }
11671
11672 use_reg (&call_fusage, reg10);
11673 }
11674 else
11675 {
11676 emit_insn (gen_push (GEN_INT (args_size)));
11677 emit_insn (gen_push (allocate_rtx));
11678 }
11679 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11680 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11681 NULL_RTX, false);
11682 add_function_usage_to (call_insn, call_fusage);
11683
11684 /* In order to make call/return prediction work right, we now need
11685 to execute a return instruction. See
11686 libgcc/config/i386/morestack.S for the details on how this works.
11687
11688 For flow purposes gcc must not see this as a return
11689 instruction--we need control flow to continue at the subsequent
11690 label. Therefore, we use an unspec. */
11691 gcc_assert (crtl->args.pops_args < 65536);
11692 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11693
11694 /* If we are in 64-bit mode and this function uses a static chain,
11695 we saved %r10 in %rax before calling __morestack.  */
11696 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11697 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11698 gen_rtx_REG (word_mode, AX_REG));
11699
11700 /* If this function calls va_start, we need to store a pointer to
11701 the arguments on the old stack, because they may not have been
11702 all copied to the new stack. At this point the old stack can be
11703 found at the frame pointer value used by __morestack, because
11704 __morestack has set that up before calling back to us. Here we
11705 store that pointer in a scratch register, and in
11706 ix86_expand_prologue we store the scratch register in a stack
11707 slot. */
11708 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11709 {
11710 unsigned int scratch_regno;
11711 rtx frame_reg;
11712 int words;
11713
11714 scratch_regno = split_stack_prologue_scratch_regno ();
11715 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11716 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11717
11718 /* 64-bit:
11719 fp -> old fp value
11720 return address within this function
11721 return address of caller of this function
11722 stack arguments
11723 So we add three words to get to the stack arguments.
11724
11725 32-bit:
11726 fp -> old fp value
11727 return address within this function
11728 first argument to __morestack
11729 second argument to __morestack
11730 return address of caller of this function
11731 stack arguments
11732 So we add five words to get to the stack arguments.
11733 */
11734 words = TARGET_64BIT ? 3 : 5;
11735 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11736 gen_rtx_PLUS (Pmode, frame_reg,
11737 GEN_INT (words * UNITS_PER_WORD))));
11738
11739 varargs_label = gen_label_rtx ();
11740 emit_jump_insn (gen_jump (varargs_label));
11741 JUMP_LABEL (get_last_insn ()) = varargs_label;
11742
11743 emit_barrier ();
11744 }
11745
11746 emit_label (label);
11747 LABEL_NUSES (label) = 1;
11748
11749 /* If this function calls va_start, we now have to set the scratch
11750 register for the case where we do not call __morestack. In this
11751 case we need to set it based on the stack pointer. */
11752 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11753 {
11754 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11755 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11756 GEN_INT (UNITS_PER_WORD))));
11757
11758 emit_label (varargs_label);
11759 LABEL_NUSES (varargs_label) = 1;
11760 }
11761 }
11762
11763 /* We may have to tell the dataflow pass that the split stack prologue
11764 is initializing a scratch register. */
11765
11766 static void
11767 ix86_live_on_entry (bitmap regs)
11768 {
11769 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11770 {
11771 gcc_assert (flag_split_stack);
11772 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11773 }
11774 }
11775 \f
11776 /* Determine if op is a suitable SUBREG RTX for an address.  */
11777
11778 static bool
11779 ix86_address_subreg_operand (rtx op)
11780 {
11781 enum machine_mode mode;
11782
11783 if (!REG_P (op))
11784 return false;
11785
11786 mode = GET_MODE (op);
11787
11788 if (GET_MODE_CLASS (mode) != MODE_INT)
11789 return false;
11790
11791 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11792 failures when the register is one word out of a two word structure. */
11793 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11794 return false;
11795
11796 /* Allow only SUBREGs of non-eliminable hard registers. */
11797 return register_no_elim_operand (op, mode);
11798 }
11799
11800 /* Extract the parts of an RTL expression that is a valid memory address
11801 for an instruction. Return 0 if the structure of the address is
11802 grossly off. Return -1 if the address contains ASHIFT, so it is not
11803 strictly valid, but still used for computing length of lea instruction. */
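/* As an example, the address (plus (plus (mult (reg A) (const_int 4))
   (reg B)) (const_int 12)) decomposes into base B, index A, scale 4 and
   displacement 12, i.e. the operand 12(B,A,4) in AT&T syntax.  */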
11804
11805 int
11806 ix86_decompose_address (rtx addr, struct ix86_address *out)
11807 {
11808 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11809 rtx base_reg, index_reg;
11810 HOST_WIDE_INT scale = 1;
11811 rtx scale_rtx = NULL_RTX;
11812 rtx tmp;
11813 int retval = 1;
11814 enum ix86_address_seg seg = SEG_DEFAULT;
11815
11816 /* Allow zero-extended SImode addresses;
11817 they will be emitted with the addr32 prefix. */
11818 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11819 {
11820 if (GET_CODE (addr) == ZERO_EXTEND
11821 && GET_MODE (XEXP (addr, 0)) == SImode)
11822 {
11823 addr = XEXP (addr, 0);
11824 if (CONST_INT_P (addr))
11825 return 0;
11826 }
11827 else if (GET_CODE (addr) == AND
11828 && const_32bit_mask (XEXP (addr, 1), DImode))
11829 {
11830 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11831 if (addr == NULL_RTX)
11832 return 0;
11833
11834 if (CONST_INT_P (addr))
11835 return 0;
11836 }
11837 }
11838
11839 /* Allow SImode subregs of DImode addresses;
11840 they will be emitted with the addr32 prefix. */
11841 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11842 {
11843 if (GET_CODE (addr) == SUBREG
11844 && GET_MODE (SUBREG_REG (addr)) == DImode)
11845 {
11846 addr = SUBREG_REG (addr);
11847 if (CONST_INT_P (addr))
11848 return 0;
11849 }
11850 }
11851
11852 if (REG_P (addr))
11853 base = addr;
11854 else if (GET_CODE (addr) == SUBREG)
11855 {
11856 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11857 base = addr;
11858 else
11859 return 0;
11860 }
11861 else if (GET_CODE (addr) == PLUS)
11862 {
11863 rtx addends[4], op;
11864 int n = 0, i;
11865
11866 op = addr;
11867 do
11868 {
11869 if (n >= 4)
11870 return 0;
11871 addends[n++] = XEXP (op, 1);
11872 op = XEXP (op, 0);
11873 }
11874 while (GET_CODE (op) == PLUS);
11875 if (n >= 4)
11876 return 0;
11877 addends[n] = op;
11878
11879 for (i = n; i >= 0; --i)
11880 {
11881 op = addends[i];
11882 switch (GET_CODE (op))
11883 {
11884 case MULT:
11885 if (index)
11886 return 0;
11887 index = XEXP (op, 0);
11888 scale_rtx = XEXP (op, 1);
11889 break;
11890
11891 case ASHIFT:
11892 if (index)
11893 return 0;
11894 index = XEXP (op, 0);
11895 tmp = XEXP (op, 1);
11896 if (!CONST_INT_P (tmp))
11897 return 0;
11898 scale = INTVAL (tmp);
11899 if ((unsigned HOST_WIDE_INT) scale > 3)
11900 return 0;
11901 scale = 1 << scale;
11902 break;
11903
11904 case ZERO_EXTEND:
11905 op = XEXP (op, 0);
11906 if (GET_CODE (op) != UNSPEC)
11907 return 0;
11908 /* FALLTHRU */
11909
11910 case UNSPEC:
11911 if (XINT (op, 1) == UNSPEC_TP
11912 && TARGET_TLS_DIRECT_SEG_REFS
11913 && seg == SEG_DEFAULT)
11914 seg = DEFAULT_TLS_SEG_REG;
11915 else
11916 return 0;
11917 break;
11918
11919 case SUBREG:
11920 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11921 return 0;
11922 /* FALLTHRU */
11923
11924 case REG:
11925 if (!base)
11926 base = op;
11927 else if (!index)
11928 index = op;
11929 else
11930 return 0;
11931 break;
11932
11933 case CONST:
11934 case CONST_INT:
11935 case SYMBOL_REF:
11936 case LABEL_REF:
11937 if (disp)
11938 return 0;
11939 disp = op;
11940 break;
11941
11942 default:
11943 return 0;
11944 }
11945 }
11946 }
11947 else if (GET_CODE (addr) == MULT)
11948 {
11949 index = XEXP (addr, 0); /* index*scale */
11950 scale_rtx = XEXP (addr, 1);
11951 }
11952 else if (GET_CODE (addr) == ASHIFT)
11953 {
11954 /* We're called for lea too, which implements ashift on occasion. */
11955 index = XEXP (addr, 0);
11956 tmp = XEXP (addr, 1);
11957 if (!CONST_INT_P (tmp))
11958 return 0;
11959 scale = INTVAL (tmp);
11960 if ((unsigned HOST_WIDE_INT) scale > 3)
11961 return 0;
11962 scale = 1 << scale;
11963 retval = -1;
11964 }
11965 else if (CONST_INT_P (addr))
11966 {
11967 if (!x86_64_immediate_operand (addr, VOIDmode))
11968 return 0;
11969
11970 /* Constant addresses are sign extended to 64bit, so we have to
11971 prevent addresses from 0x80000000 to 0xffffffff in x32 mode. */
11972 if (TARGET_X32
11973 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11974 return 0;
11975
11976 disp = addr;
11977 }
11978 else
11979 disp = addr; /* displacement */
11980
11981 if (index)
11982 {
11983 if (REG_P (index))
11984 ;
11985 else if (GET_CODE (index) == SUBREG
11986 && ix86_address_subreg_operand (SUBREG_REG (index)))
11987 ;
11988 else
11989 return 0;
11990 }
11991
11992 /* Address override works only on the (%reg) part of %fs:(%reg). */
11993 if (seg != SEG_DEFAULT
11994 && ((base && GET_MODE (base) != word_mode)
11995 || (index && GET_MODE (index) != word_mode)))
11996 return 0;
11997
11998 /* Extract the integral value of scale. */
11999 if (scale_rtx)
12000 {
12001 if (!CONST_INT_P (scale_rtx))
12002 return 0;
12003 scale = INTVAL (scale_rtx);
12004 }
12005
12006 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
12007 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
12008
12009 /* Avoid useless 0 displacement. */
12010 if (disp == const0_rtx && (base || index))
12011 disp = NULL_RTX;
12012
12013 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
12014 if (base_reg && index_reg && scale == 1
12015 && (index_reg == arg_pointer_rtx
12016 || index_reg == frame_pointer_rtx
12017 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
12018 {
12019 rtx tmp;
12020 tmp = base, base = index, index = tmp;
12021 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
12022 }
12023
12024 /* Special case: %ebp cannot be encoded as a base without a displacement.
12025 Similarly %r13. */
12026 if (!disp
12027 && base_reg
12028 && (base_reg == hard_frame_pointer_rtx
12029 || base_reg == frame_pointer_rtx
12030 || base_reg == arg_pointer_rtx
12031 || (REG_P (base_reg)
12032 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
12033 || REGNO (base_reg) == R13_REG))))
12034 disp = const0_rtx;
12035
12036 /* Special case: on K6, [%esi] makes the instruction vector decoded.
12037 Avoid this by transforming to [%esi+0].
12038 Reload calls address legitimization without cfun defined, so we need
12039 to test cfun for being non-NULL. */
12040 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
12041 && base_reg && !index_reg && !disp
12042 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
12043 disp = const0_rtx;
12044
12045 /* Special case: encode reg+reg instead of reg*2. */
12046 if (!base && index && scale == 2)
12047 base = index, base_reg = index_reg, scale = 1;
12048
12049 /* Special case: scaling cannot be encoded without base or displacement. */
12050 if (!base && !disp && index && scale != 1)
12051 disp = const0_rtx;
12052
12053 out->base = base;
12054 out->index = index;
12055 out->disp = disp;
12056 out->scale = scale;
12057 out->seg = seg;
12058
12059 return retval;
12060 }
12061 \f
12062 /* Return cost of the memory address x.
12063 For i386, it is better to use a complex address than let gcc copy
12064 the address into a reg and make a new pseudo. But not if the address
12065 requires two regs - that would mean more pseudos with longer
12066 lifetimes. */
12067 static int
12068 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
12069 addr_space_t as ATTRIBUTE_UNUSED,
12070 bool speed ATTRIBUTE_UNUSED)
12071 {
12072 struct ix86_address parts;
12073 int cost = 1;
12074 int ok = ix86_decompose_address (x, &parts);
12075
12076 gcc_assert (ok);
12077
12078 if (parts.base && GET_CODE (parts.base) == SUBREG)
12079 parts.base = SUBREG_REG (parts.base);
12080 if (parts.index && GET_CODE (parts.index) == SUBREG)
12081 parts.index = SUBREG_REG (parts.index);
12082
12083 /* Attempt to minimize the number of registers in the address. */
12084 if ((parts.base
12085 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
12086 || (parts.index
12087 && (!REG_P (parts.index)
12088 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
12089 cost++;
12090
12091 if (parts.base
12092 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
12093 && parts.index
12094 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
12095 && parts.base != parts.index)
12096 cost++;
12097
12098 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
12099 since its predecode logic can't detect the length of instructions
12100 and it degenerates to vector decoding. Increase the cost of such
12101 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
12102 to split such addresses or even refuse such addresses at all.
12103
12104 The following addressing modes are affected:
12105 [base+scale*index]
12106 [scale*index+disp]
12107 [base+index]
12108
12109 The first and last case may be avoidable by explicitly coding the zero into
12110 the memory address, but I don't have an AMD-K6 machine handy to check this
12111 theory. */
12112
12113 if (TARGET_K6
12114 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
12115 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
12116 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
12117 cost += 10;
12118
12119 return cost;
12120 }
12121 \f
12122 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
12123 this is used to form addresses to local data when -fPIC is in
12124 use. */
12125
12126 static bool
12127 darwin_local_data_pic (rtx disp)
12128 {
12129 return (GET_CODE (disp) == UNSPEC
12130 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
12131 }
12132
12133 /* Determine if a given RTX is a valid constant. We already know this
12134 satisfies CONSTANT_P. */
12135
12136 static bool
12137 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
12138 {
12139 switch (GET_CODE (x))
12140 {
12141 case CONST:
12142 x = XEXP (x, 0);
12143
12144 if (GET_CODE (x) == PLUS)
12145 {
12146 if (!CONST_INT_P (XEXP (x, 1)))
12147 return false;
12148 x = XEXP (x, 0);
12149 }
12150
12151 if (TARGET_MACHO && darwin_local_data_pic (x))
12152 return true;
12153
12154 /* Only some unspecs are valid as "constants". */
12155 if (GET_CODE (x) == UNSPEC)
12156 switch (XINT (x, 1))
12157 {
12158 case UNSPEC_GOT:
12159 case UNSPEC_GOTOFF:
12160 case UNSPEC_PLTOFF:
12161 return TARGET_64BIT;
12162 case UNSPEC_TPOFF:
12163 case UNSPEC_NTPOFF:
12164 x = XVECEXP (x, 0, 0);
12165 return (GET_CODE (x) == SYMBOL_REF
12166 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12167 case UNSPEC_DTPOFF:
12168 x = XVECEXP (x, 0, 0);
12169 return (GET_CODE (x) == SYMBOL_REF
12170 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12171 default:
12172 return false;
12173 }
12174
12175 /* We must have drilled down to a symbol. */
12176 if (GET_CODE (x) == LABEL_REF)
12177 return true;
12178 if (GET_CODE (x) != SYMBOL_REF)
12179 return false;
12180 /* FALLTHRU */
12181
12182 case SYMBOL_REF:
12183 /* TLS symbols are never valid. */
12184 if (SYMBOL_REF_TLS_MODEL (x))
12185 return false;
12186
12187 /* DLLIMPORT symbols are never valid. */
12188 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12189 && SYMBOL_REF_DLLIMPORT_P (x))
12190 return false;
12191
12192 #if TARGET_MACHO
12193 /* mdynamic-no-pic */
12194 if (MACHO_DYNAMIC_NO_PIC_P)
12195 return machopic_symbol_defined_p (x);
12196 #endif
12197 break;
12198
12199 case CONST_DOUBLE:
12200 if (GET_MODE (x) == TImode
12201 && x != CONST0_RTX (TImode)
12202 && !TARGET_64BIT)
12203 return false;
12204 break;
12205
12206 case CONST_VECTOR:
12207 if (!standard_sse_constant_p (x))
12208 return false;
12209
12210 default:
12211 break;
12212 }
12213
12214 /* Otherwise we handle everything else in the move patterns. */
12215 return true;
12216 }
12217
12218 /* Determine if it's legal to put X into the constant pool. This
12219 is not possible for the address of thread-local symbols, which
12220 is checked above. */
12221
12222 static bool
12223 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12224 {
12225 /* We can always put integral constants and vectors in memory. */
12226 switch (GET_CODE (x))
12227 {
12228 case CONST_INT:
12229 case CONST_DOUBLE:
12230 case CONST_VECTOR:
12231 return false;
12232
12233 default:
12234 break;
12235 }
12236 return !ix86_legitimate_constant_p (mode, x);
12237 }
12238
12239 /* Return true if the symbol is marked as dllimport or as a stub-variable,
12240 otherwise false. */
12241
12242 static bool
12243 is_imported_p (rtx x)
12244 {
12245 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
12246 || GET_CODE (x) != SYMBOL_REF)
12247 return false;
12248
12249 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
12250 }
12251
12252
12253 /* Nonzero if the constant value X is a legitimate general operand
12254 when generating PIC code. It is given that flag_pic is on and
12255 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12256
12257 bool
12258 legitimate_pic_operand_p (rtx x)
12259 {
12260 rtx inner;
12261
12262 switch (GET_CODE (x))
12263 {
12264 case CONST:
12265 inner = XEXP (x, 0);
12266 if (GET_CODE (inner) == PLUS
12267 && CONST_INT_P (XEXP (inner, 1)))
12268 inner = XEXP (inner, 0);
12269
12270 /* Only some unspecs are valid as "constants". */
12271 if (GET_CODE (inner) == UNSPEC)
12272 switch (XINT (inner, 1))
12273 {
12274 case UNSPEC_GOT:
12275 case UNSPEC_GOTOFF:
12276 case UNSPEC_PLTOFF:
12277 return TARGET_64BIT;
12278 case UNSPEC_TPOFF:
12279 x = XVECEXP (inner, 0, 0);
12280 return (GET_CODE (x) == SYMBOL_REF
12281 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12282 case UNSPEC_MACHOPIC_OFFSET:
12283 return legitimate_pic_address_disp_p (x);
12284 default:
12285 return false;
12286 }
12287 /* FALLTHRU */
12288
12289 case SYMBOL_REF:
12290 case LABEL_REF:
12291 return legitimate_pic_address_disp_p (x);
12292
12293 default:
12294 return true;
12295 }
12296 }
12297
12298 /* Determine if a given CONST RTX is a valid memory displacement
12299 in PIC mode. */
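/* In simplified RTL notation: for 32-bit PIC, a displacement like

     (const:SI (unspec:SI [(symbol_ref:SI ("x"))] UNSPEC_GOTOFF))

   is accepted when "x" passes gotoff_operand, whereas a bare
   (symbol_ref:SI ("x")) is not, since outside 64-bit mode the
   displacement must be a CONST wrapping one of the UNSPECs handled
   below.  In 64-bit mode a plain SYMBOL_REF to a local, non-far symbol
   is usually allowed directly.  */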
12300
12301 bool
12302 legitimate_pic_address_disp_p (rtx disp)
12303 {
12304 bool saw_plus;
12305
12306 /* In 64bit mode we can allow direct addresses of symbols and labels
12307 when they are not dynamic symbols. */
12308 if (TARGET_64BIT)
12309 {
12310 rtx op0 = disp, op1;
12311
12312 switch (GET_CODE (disp))
12313 {
12314 case LABEL_REF:
12315 return true;
12316
12317 case CONST:
12318 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12319 break;
12320 op0 = XEXP (XEXP (disp, 0), 0);
12321 op1 = XEXP (XEXP (disp, 0), 1);
12322 if (!CONST_INT_P (op1)
12323 || INTVAL (op1) >= 16*1024*1024
12324 || INTVAL (op1) < -16*1024*1024)
12325 break;
12326 if (GET_CODE (op0) == LABEL_REF)
12327 return true;
12328 if (GET_CODE (op0) == CONST
12329 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12330 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12331 return true;
12332 if (GET_CODE (op0) == UNSPEC
12333 && XINT (op0, 1) == UNSPEC_PCREL)
12334 return true;
12335 if (GET_CODE (op0) != SYMBOL_REF)
12336 break;
12337 /* FALLTHRU */
12338
12339 case SYMBOL_REF:
12340 /* TLS references should always be enclosed in UNSPEC.
12341 The dllimported symbol always needs to be resolved. */
12342 if (SYMBOL_REF_TLS_MODEL (op0)
12343 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
12344 return false;
12345
12346 if (TARGET_PECOFF)
12347 {
12348 if (is_imported_p (op0))
12349 return true;
12350
12351 if (SYMBOL_REF_FAR_ADDR_P (op0)
12352 || !SYMBOL_REF_LOCAL_P (op0))
12353 break;
12354
12355 /* Function symbols need to be resolved only for the
12356 large model.
12357 For the small model we don't need to resolve anything
12358 here. */
12359 if ((ix86_cmodel != CM_LARGE_PIC
12360 && SYMBOL_REF_FUNCTION_P (op0))
12361 || ix86_cmodel == CM_SMALL_PIC)
12362 return true;
12363 /* Non-external symbols don't need to be resolved for
12364 the large and medium models. */
12365 if ((ix86_cmodel == CM_LARGE_PIC
12366 || ix86_cmodel == CM_MEDIUM_PIC)
12367 && !SYMBOL_REF_EXTERNAL_P (op0))
12368 return true;
12369 }
12370 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
12371 && SYMBOL_REF_LOCAL_P (op0)
12372 && ix86_cmodel != CM_LARGE_PIC)
12373 return true;
12374 break;
12375
12376 default:
12377 break;
12378 }
12379 }
12380 if (GET_CODE (disp) != CONST)
12381 return false;
12382 disp = XEXP (disp, 0);
12383
12384 if (TARGET_64BIT)
12385 {
12386 /* It is unsafe to allow PLUS expressions. This limits the allowed distance
12387 of GOT tables. We should not need these anyway. */
12388 if (GET_CODE (disp) != UNSPEC
12389 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12390 && XINT (disp, 1) != UNSPEC_GOTOFF
12391 && XINT (disp, 1) != UNSPEC_PCREL
12392 && XINT (disp, 1) != UNSPEC_PLTOFF))
12393 return false;
12394
12395 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12396 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12397 return false;
12398 return true;
12399 }
12400
12401 saw_plus = false;
12402 if (GET_CODE (disp) == PLUS)
12403 {
12404 if (!CONST_INT_P (XEXP (disp, 1)))
12405 return false;
12406 disp = XEXP (disp, 0);
12407 saw_plus = true;
12408 }
12409
12410 if (TARGET_MACHO && darwin_local_data_pic (disp))
12411 return true;
12412
12413 if (GET_CODE (disp) != UNSPEC)
12414 return false;
12415
12416 switch (XINT (disp, 1))
12417 {
12418 case UNSPEC_GOT:
12419 if (saw_plus)
12420 return false;
12421 /* We need to check for both symbols and labels because VxWorks loads
12422 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12423 details. */
12424 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12425 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12426 case UNSPEC_GOTOFF:
12427 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12428 While the ABI also specifies a 32bit relocation, we don't produce it in
12429 the small PIC model at all. */
12430 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12431 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12432 && !TARGET_64BIT)
12433 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12434 return false;
12435 case UNSPEC_GOTTPOFF:
12436 case UNSPEC_GOTNTPOFF:
12437 case UNSPEC_INDNTPOFF:
12438 if (saw_plus)
12439 return false;
12440 disp = XVECEXP (disp, 0, 0);
12441 return (GET_CODE (disp) == SYMBOL_REF
12442 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12443 case UNSPEC_NTPOFF:
12444 disp = XVECEXP (disp, 0, 0);
12445 return (GET_CODE (disp) == SYMBOL_REF
12446 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12447 case UNSPEC_DTPOFF:
12448 disp = XVECEXP (disp, 0, 0);
12449 return (GET_CODE (disp) == SYMBOL_REF
12450 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12451 }
12452
12453 return false;
12454 }
12455
12456 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Return true if
12457 (part of) the address was pushed for reload, in which case the
12458 calling macro should goto WIN; return false if no replacement is
12459 called for. */
12460
12461 bool
12462 ix86_legitimize_reload_address (rtx x,
12463 enum machine_mode mode ATTRIBUTE_UNUSED,
12464 int opnum, int type,
12465 int ind_levels ATTRIBUTE_UNUSED)
12466 {
12467 /* Reload can generate:
12468
12469 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12470 (reg:DI 97))
12471 (reg:DI 2 cx))
12472
12473 This RTX is rejected by ix86_legitimate_address_p due to
12474 non-strictness of base register 97. Following this rejection,
12475 reload pushes all three components into separate registers,
12476 creating an invalid memory address RTX.
12477 
12478 The following code reloads only the invalid part of the
12479 memory address RTX. */
12480
12481 if (GET_CODE (x) == PLUS
12482 && REG_P (XEXP (x, 1))
12483 && GET_CODE (XEXP (x, 0)) == PLUS
12484 && REG_P (XEXP (XEXP (x, 0), 1)))
12485 {
12486 rtx base, index;
12487 bool something_reloaded = false;
12488
12489 base = XEXP (XEXP (x, 0), 1);
12490 if (!REG_OK_FOR_BASE_STRICT_P (base))
12491 {
12492 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12493 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12494 opnum, (enum reload_type) type);
12495 something_reloaded = true;
12496 }
12497
12498 index = XEXP (x, 1);
12499 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12500 {
12501 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12502 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12503 opnum, (enum reload_type) type);
12504 something_reloaded = true;
12505 }
12506
12507 gcc_assert (something_reloaded);
12508 return true;
12509 }
12510
12511 return false;
12512 }
12513
12514 /* Recognizes RTL expressions that are valid memory addresses for an
12515 instruction. The MODE argument is the machine mode for the MEM
12516 expression that wants to use this address.
12517
12518 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12519 convert common non-canonical forms to canonical form so that they will
12520 be recognized. */
12521
12522 static bool
12523 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12524 rtx addr, bool strict)
12525 {
12526 struct ix86_address parts;
12527 rtx base, index, disp;
12528 HOST_WIDE_INT scale;
12529
12530 if (ix86_decompose_address (addr, &parts) <= 0)
12531 /* Decomposition failed. */
12532 return false;
12533
12534 base = parts.base;
12535 index = parts.index;
12536 disp = parts.disp;
12537 scale = parts.scale;
12538
12539 /* Validate base register. */
12540 if (base)
12541 {
12542 rtx reg;
12543
12544 if (REG_P (base))
12545 reg = base;
12546 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12547 reg = SUBREG_REG (base);
12548 else
12549 /* Base is not a register. */
12550 return false;
12551
12552 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12553 return false;
12554
12555 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12556 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12557 /* Base is not valid. */
12558 return false;
12559 }
12560
12561 /* Validate index register. */
12562 if (index)
12563 {
12564 rtx reg;
12565
12566 if (REG_P (index))
12567 reg = index;
12568 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12569 reg = SUBREG_REG (index);
12570 else
12571 /* Index is not a register. */
12572 return false;
12573
12574 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12575 return false;
12576
12577 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12578 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12579 /* Index is not valid. */
12580 return false;
12581 }
12582
12583 /* Index and base should have the same mode. */
12584 if (base && index
12585 && GET_MODE (base) != GET_MODE (index))
12586 return false;
12587
12588 /* Validate scale factor. */
12589 if (scale != 1)
12590 {
12591 if (!index)
12592 /* Scale without index. */
12593 return false;
12594
12595 if (scale != 2 && scale != 4 && scale != 8)
12596 /* Scale is not a valid multiplier. */
12597 return false;
12598 }
12599
12600 /* Validate displacement. */
12601 if (disp)
12602 {
12603 if (GET_CODE (disp) == CONST
12604 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12605 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12606 switch (XINT (XEXP (disp, 0), 1))
12607 {
12608 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12609 used. While the ABI also specifies 32bit relocations, we don't produce
12610 them at all and use IP relative instead. */
12611 case UNSPEC_GOT:
12612 case UNSPEC_GOTOFF:
12613 gcc_assert (flag_pic);
12614 if (!TARGET_64BIT)
12615 goto is_legitimate_pic;
12616
12617 /* 64bit address unspec. */
12618 return false;
12619
12620 case UNSPEC_GOTPCREL:
12621 case UNSPEC_PCREL:
12622 gcc_assert (flag_pic);
12623 goto is_legitimate_pic;
12624
12625 case UNSPEC_GOTTPOFF:
12626 case UNSPEC_GOTNTPOFF:
12627 case UNSPEC_INDNTPOFF:
12628 case UNSPEC_NTPOFF:
12629 case UNSPEC_DTPOFF:
12630 break;
12631
12632 case UNSPEC_STACK_CHECK:
12633 gcc_assert (flag_split_stack);
12634 break;
12635
12636 default:
12637 /* Invalid address unspec. */
12638 return false;
12639 }
12640
12641 else if (SYMBOLIC_CONST (disp)
12642 && (flag_pic
12643 || (TARGET_MACHO
12644 #if TARGET_MACHO
12645 && MACHOPIC_INDIRECT
12646 && !machopic_operand_p (disp)
12647 #endif
12648 )))
12649 {
12650
12651 is_legitimate_pic:
12652 if (TARGET_64BIT && (index || base))
12653 {
12654 /* foo@dtpoff(%rX) is ok. */
12655 if (GET_CODE (disp) != CONST
12656 || GET_CODE (XEXP (disp, 0)) != PLUS
12657 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12658 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12659 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12660 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12661 /* Non-constant pic memory reference. */
12662 return false;
12663 }
12664 else if ((!TARGET_MACHO || flag_pic)
12665 && ! legitimate_pic_address_disp_p (disp))
12666 /* Displacement is an invalid pic construct. */
12667 return false;
12668 #if TARGET_MACHO
12669 else if (MACHO_DYNAMIC_NO_PIC_P
12670 && !ix86_legitimate_constant_p (Pmode, disp))
12671 /* Displacement must be referenced via non_lazy_pointer. */
12672 return false;
12673 #endif
12674
12675 /* This code used to verify that a symbolic pic displacement
12676 includes the pic_offset_table_rtx register.
12677
12678 While this is a good idea, unfortunately these constructs may
12679 be created by the "adds using lea" optimization for incorrect
12680 code like:
12681
12682 int a;
12683 int foo(int i)
12684 {
12685 return *(&a+i);
12686 }
12687
12688 This code is nonsensical, but results in addressing the
12689 GOT table with a pic_offset_table_rtx base. We can't
12690 just refuse it easily, since it gets matched by the
12691 "addsi3" pattern, which later gets split to lea when the
12692 output register differs from the input. While this
12693 could be handled by a separate addsi pattern for this case
12694 that never results in lea, disabling this test seems to be the
12695 easier and correct fix for the crash. */
12696 }
12697 else if (GET_CODE (disp) != LABEL_REF
12698 && !CONST_INT_P (disp)
12699 && (GET_CODE (disp) != CONST
12700 || !ix86_legitimate_constant_p (Pmode, disp))
12701 && (GET_CODE (disp) != SYMBOL_REF
12702 || !ix86_legitimate_constant_p (Pmode, disp)))
12703 /* Displacement is not constant. */
12704 return false;
12705 else if (TARGET_64BIT
12706 && !x86_64_immediate_operand (disp, VOIDmode))
12707 /* Displacement is out of range. */
12708 return false;
12709 }
12710
12711 /* Everything looks valid. */
12712 return true;
12713 }
12714
12715 /* Determine if a given RTX is a valid constant address. */
12716
12717 bool
12718 constant_address_p (rtx x)
12719 {
12720 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12721 }
12722 \f
12723 /* Return a unique alias set for the GOT. */
12724
12725 static alias_set_type
12726 ix86_GOT_alias_set (void)
12727 {
12728 static alias_set_type set = -1;
12729 if (set == -1)
12730 set = new_alias_set ();
12731 return set;
12732 }
12733
12734 /* Return a legitimate reference for ORIG (an address) using the
12735 register REG. If REG is 0, a new pseudo is generated.
12736
12737 There are two types of references that must be handled:
12738
12739 1. Global data references must load the address from the GOT, via
12740 the PIC reg. An insn is emitted to do this load, and the reg is
12741 returned.
12742
12743 2. Static data references, constant pool addresses, and code labels
12744 compute the address as an offset from the GOT, whose base is in
12745 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12746 differentiate them from global data objects. The returned
12747 address is the PIC reg + an unspec constant.
12748
12749 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12750 reg also appears in the address. */
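/* As a rough sketch of the two cases for 32-bit ELF PIC: a reference to
   a global symbol "foo" becomes a load from the GOT,

     (mem:SI (plus:SI (reg:SI ebx)
                      (const:SI (unspec:SI [(symbol_ref:SI ("foo"))]
                                           UNSPEC_GOT))))

   while a local symbol is computed as an offset from the GOT base,

     (plus:SI (reg:SI ebx)
              (const:SI (unspec:SI [(symbol_ref:SI ("foo"))]
                                   UNSPEC_GOTOFF)))

   where ebx stands for pic_offset_table_rtx.  */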
12751
12752 static rtx
12753 legitimize_pic_address (rtx orig, rtx reg)
12754 {
12755 rtx addr = orig;
12756 rtx new_rtx = orig;
12757
12758 #if TARGET_MACHO
12759 if (TARGET_MACHO && !TARGET_64BIT)
12760 {
12761 if (reg == 0)
12762 reg = gen_reg_rtx (Pmode);
12763 /* Use the generic Mach-O PIC machinery. */
12764 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12765 }
12766 #endif
12767
12768 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12769 {
12770 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12771 if (tmp)
12772 return tmp;
12773 }
12774
12775 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12776 new_rtx = addr;
12777 else if (TARGET_64BIT && !TARGET_PECOFF
12778 && ix86_cmodel != CM_SMALL_PIC && gotoff_operand (addr, Pmode))
12779 {
12780 rtx tmpreg;
12781 /* This symbol may be referenced via a displacement from the PIC
12782 base address (@GOTOFF). */
12783
12784 if (reload_in_progress)
12785 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12786 if (GET_CODE (addr) == CONST)
12787 addr = XEXP (addr, 0);
12788 if (GET_CODE (addr) == PLUS)
12789 {
12790 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12791 UNSPEC_GOTOFF);
12792 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12793 }
12794 else
12795 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12796 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12797 if (!reg)
12798 tmpreg = gen_reg_rtx (Pmode);
12799 else
12800 tmpreg = reg;
12801 emit_move_insn (tmpreg, new_rtx);
12802
12803 if (reg != 0)
12804 {
12805 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12806 tmpreg, 1, OPTAB_DIRECT);
12807 new_rtx = reg;
12808 }
12809 else
12810 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12811 }
12812 else if (!TARGET_64BIT && !TARGET_PECOFF && gotoff_operand (addr, Pmode))
12813 {
12814 /* This symbol may be referenced via a displacement from the PIC
12815 base address (@GOTOFF). */
12816
12817 if (reload_in_progress)
12818 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12819 if (GET_CODE (addr) == CONST)
12820 addr = XEXP (addr, 0);
12821 if (GET_CODE (addr) == PLUS)
12822 {
12823 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12824 UNSPEC_GOTOFF);
12825 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12826 }
12827 else
12828 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12829 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12830 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12831
12832 if (reg != 0)
12833 {
12834 emit_move_insn (reg, new_rtx);
12835 new_rtx = reg;
12836 }
12837 }
12838 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12839 /* We can't use @GOTOFF for text labels on VxWorks;
12840 see gotoff_operand. */
12841 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12842 {
12843 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12844 if (tmp)
12845 return tmp;
12846
12847 /* For x64 PE-COFF there is no GOT table, so we use the address
12848 directly. */
12849 if (TARGET_64BIT && TARGET_PECOFF)
12850 {
12851 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12852 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12853
12854 if (reg == 0)
12855 reg = gen_reg_rtx (Pmode);
12856 emit_move_insn (reg, new_rtx);
12857 new_rtx = reg;
12858 }
12859 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12860 {
12861 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12862 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12863 new_rtx = gen_const_mem (Pmode, new_rtx);
12864 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12865
12866 if (reg == 0)
12867 reg = gen_reg_rtx (Pmode);
12868 /* Use gen_movsi directly, otherwise the address is loaded
12869 into a register for CSE. We don't want to CSE these addresses;
12870 instead we CSE addresses from the GOT table, so skip this. */
12871 emit_insn (gen_movsi (reg, new_rtx));
12872 new_rtx = reg;
12873 }
12874 else
12875 {
12876 /* This symbol must be referenced via a load from the
12877 Global Offset Table (@GOT). */
12878
12879 if (reload_in_progress)
12880 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12881 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12882 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12883 if (TARGET_64BIT)
12884 new_rtx = force_reg (Pmode, new_rtx);
12885 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12886 new_rtx = gen_const_mem (Pmode, new_rtx);
12887 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12888
12889 if (reg == 0)
12890 reg = gen_reg_rtx (Pmode);
12891 emit_move_insn (reg, new_rtx);
12892 new_rtx = reg;
12893 }
12894 }
12895 else
12896 {
12897 if (CONST_INT_P (addr)
12898 && !x86_64_immediate_operand (addr, VOIDmode))
12899 {
12900 if (reg)
12901 {
12902 emit_move_insn (reg, addr);
12903 new_rtx = reg;
12904 }
12905 else
12906 new_rtx = force_reg (Pmode, addr);
12907 }
12908 else if (GET_CODE (addr) == CONST)
12909 {
12910 addr = XEXP (addr, 0);
12911
12912 /* We must match stuff we generate before. Assume the only
12913 unspecs that can get here are ours. Not that we could do
12914 anything with them anyway.... */
12915 if (GET_CODE (addr) == UNSPEC
12916 || (GET_CODE (addr) == PLUS
12917 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12918 return orig;
12919 gcc_assert (GET_CODE (addr) == PLUS);
12920 }
12921 if (GET_CODE (addr) == PLUS)
12922 {
12923 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12924
12925 /* Check first to see if this is a constant offset from a @GOTOFF
12926 symbol reference. */
12927 if (!TARGET_PECOFF && gotoff_operand (op0, Pmode)
12928 && CONST_INT_P (op1))
12929 {
12930 if (!TARGET_64BIT)
12931 {
12932 if (reload_in_progress)
12933 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12934 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12935 UNSPEC_GOTOFF);
12936 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12937 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12938 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12939
12940 if (reg != 0)
12941 {
12942 emit_move_insn (reg, new_rtx);
12943 new_rtx = reg;
12944 }
12945 }
12946 else
12947 {
12948 if (INTVAL (op1) < -16*1024*1024
12949 || INTVAL (op1) >= 16*1024*1024)
12950 {
12951 if (!x86_64_immediate_operand (op1, Pmode))
12952 op1 = force_reg (Pmode, op1);
12953 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12954 }
12955 }
12956 }
12957 else
12958 {
12959 rtx base = legitimize_pic_address (op0, reg);
12960 enum machine_mode mode = GET_MODE (base);
12961 new_rtx
12962 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12963
12964 if (CONST_INT_P (new_rtx))
12965 {
12966 if (INTVAL (new_rtx) < -16*1024*1024
12967 || INTVAL (new_rtx) >= 16*1024*1024)
12968 {
12969 if (!x86_64_immediate_operand (new_rtx, mode))
12970 new_rtx = force_reg (mode, new_rtx);
12971 new_rtx
12972 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12973 }
12974 else
12975 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12976 }
12977 else
12978 {
12979 if (GET_CODE (new_rtx) == PLUS
12980 && CONSTANT_P (XEXP (new_rtx, 1)))
12981 {
12982 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12983 new_rtx = XEXP (new_rtx, 1);
12984 }
12985 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12986 }
12987 }
12988 }
12989 }
12990 return new_rtx;
12991 }
12992 \f
12993 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12994
12995 static rtx
12996 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12997 {
12998 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12999
13000 if (GET_MODE (tp) != tp_mode)
13001 {
13002 gcc_assert (GET_MODE (tp) == SImode);
13003 gcc_assert (tp_mode == DImode);
13004
13005 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
13006 }
13007
13008 if (to_reg)
13009 tp = copy_to_mode_reg (tp_mode, tp);
13010
13011 return tp;
13012 }
13013
13014 /* Construct the SYMBOL_REF for the tls_get_addr function. */
13015
13016 static GTY(()) rtx ix86_tls_symbol;
13017
13018 static rtx
13019 ix86_tls_get_addr (void)
13020 {
13021 if (!ix86_tls_symbol)
13022 {
13023 const char *sym
13024 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
13025 ? "___tls_get_addr" : "__tls_get_addr");
13026
13027 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
13028 }
13029
13030 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
13031 {
13032 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
13033 UNSPEC_PLTOFF);
13034 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
13035 gen_rtx_CONST (Pmode, unspec));
13036 }
13037
13038 return ix86_tls_symbol;
13039 }
13040
13041 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
13042
13043 static GTY(()) rtx ix86_tls_module_base_symbol;
13044
13045 rtx
13046 ix86_tls_module_base (void)
13047 {
13048 if (!ix86_tls_module_base_symbol)
13049 {
13050 ix86_tls_module_base_symbol
13051 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
13052
13053 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
13054 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
13055 }
13056
13057 return ix86_tls_module_base_symbol;
13058 }
13059
13060 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
13061 false if we expect this to be used for a memory address and true if
13062 we expect to load the address into a register. */
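/* As an illustration (simplified RTL, modes omitted), a local-exec access
   to a TLS variable "x" with GNU TLS support is expanded to roughly

     (plus (unspec [(const_int 0)] UNSPEC_TP)
           (const (unspec [(symbol_ref ("x"))] UNSPEC_NTPOFF)))

   i.e. the thread pointer (the %fs/%gs segment base) plus an offset
   resolved by the linker; the other models go through the GOT or call
   __tls_get_addr as handled below.  */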
13063
13064 static rtx
13065 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
13066 {
13067 rtx dest, base, off;
13068 rtx pic = NULL_RTX, tp = NULL_RTX;
13069 enum machine_mode tp_mode = Pmode;
13070 int type;
13071
13072 switch (model)
13073 {
13074 case TLS_MODEL_GLOBAL_DYNAMIC:
13075 dest = gen_reg_rtx (Pmode);
13076
13077 if (!TARGET_64BIT)
13078 {
13079 if (flag_pic && !TARGET_PECOFF)
13080 pic = pic_offset_table_rtx;
13081 else
13082 {
13083 pic = gen_reg_rtx (Pmode);
13084 emit_insn (gen_set_got (pic));
13085 }
13086 }
13087
13088 if (TARGET_GNU2_TLS)
13089 {
13090 if (TARGET_64BIT)
13091 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
13092 else
13093 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
13094
13095 tp = get_thread_pointer (Pmode, true);
13096 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
13097
13098 if (GET_MODE (x) != Pmode)
13099 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13100
13101 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13102 }
13103 else
13104 {
13105 rtx caddr = ix86_tls_get_addr ();
13106
13107 if (TARGET_64BIT)
13108 {
13109 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13110 rtx insns;
13111
13112 start_sequence ();
13113 emit_call_insn
13114 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
13115 insns = get_insns ();
13116 end_sequence ();
13117
13118 if (GET_MODE (x) != Pmode)
13119 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13120
13121 RTL_CONST_CALL_P (insns) = 1;
13122 emit_libcall_block (insns, dest, rax, x);
13123 }
13124 else
13125 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
13126 }
13127 break;
13128
13129 case TLS_MODEL_LOCAL_DYNAMIC:
13130 base = gen_reg_rtx (Pmode);
13131
13132 if (!TARGET_64BIT)
13133 {
13134 if (flag_pic)
13135 pic = pic_offset_table_rtx;
13136 else
13137 {
13138 pic = gen_reg_rtx (Pmode);
13139 emit_insn (gen_set_got (pic));
13140 }
13141 }
13142
13143 if (TARGET_GNU2_TLS)
13144 {
13145 rtx tmp = ix86_tls_module_base ();
13146
13147 if (TARGET_64BIT)
13148 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
13149 else
13150 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
13151
13152 tp = get_thread_pointer (Pmode, true);
13153 set_unique_reg_note (get_last_insn (), REG_EQUAL,
13154 gen_rtx_MINUS (Pmode, tmp, tp));
13155 }
13156 else
13157 {
13158 rtx caddr = ix86_tls_get_addr ();
13159
13160 if (TARGET_64BIT)
13161 {
13162 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13163 rtx insns, eqv;
13164
13165 start_sequence ();
13166 emit_call_insn
13167 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
13168 insns = get_insns ();
13169 end_sequence ();
13170
13171 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
13172 share the LD_BASE result with other LD model accesses. */
13173 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
13174 UNSPEC_TLS_LD_BASE);
13175
13176 RTL_CONST_CALL_P (insns) = 1;
13177 emit_libcall_block (insns, base, rax, eqv);
13178 }
13179 else
13180 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
13181 }
13182
13183 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
13184 off = gen_rtx_CONST (Pmode, off);
13185
13186 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
13187
13188 if (TARGET_GNU2_TLS)
13189 {
13190 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
13191
13192 if (GET_MODE (x) != Pmode)
13193 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13194
13195 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13196 }
13197 break;
13198
13199 case TLS_MODEL_INITIAL_EXEC:
13200 if (TARGET_64BIT)
13201 {
13202 if (TARGET_SUN_TLS && !TARGET_X32)
13203 {
13204 /* The Sun linker took the AMD64 TLS spec literally
13205 and can only handle %rax as the destination of the
13206 initial-exec code sequence. */
13207
13208 dest = gen_reg_rtx (DImode);
13209 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
13210 return dest;
13211 }
13212
13213 /* Generate DImode references to avoid %fs:(%reg32)
13214 problems and the linker IE->LE relaxation bug. */
13215 tp_mode = DImode;
13216 pic = NULL;
13217 type = UNSPEC_GOTNTPOFF;
13218 }
13219 else if (flag_pic)
13220 {
13221 if (reload_in_progress)
13222 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13223 pic = pic_offset_table_rtx;
13224 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13225 }
13226 else if (!TARGET_ANY_GNU_TLS)
13227 {
13228 pic = gen_reg_rtx (Pmode);
13229 emit_insn (gen_set_got (pic));
13230 type = UNSPEC_GOTTPOFF;
13231 }
13232 else
13233 {
13234 pic = NULL;
13235 type = UNSPEC_INDNTPOFF;
13236 }
13237
13238 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13239 off = gen_rtx_CONST (tp_mode, off);
13240 if (pic)
13241 off = gen_rtx_PLUS (tp_mode, pic, off);
13242 off = gen_const_mem (tp_mode, off);
13243 set_mem_alias_set (off, ix86_GOT_alias_set ());
13244
13245 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13246 {
13247 base = get_thread_pointer (tp_mode,
13248 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13249 off = force_reg (tp_mode, off);
13250 return gen_rtx_PLUS (tp_mode, base, off);
13251 }
13252 else
13253 {
13254 base = get_thread_pointer (Pmode, true);
13255 dest = gen_reg_rtx (Pmode);
13256 emit_insn (ix86_gen_sub3 (dest, base, off));
13257 }
13258 break;
13259
13260 case TLS_MODEL_LOCAL_EXEC:
13261 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13262 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13263 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13264 off = gen_rtx_CONST (Pmode, off);
13265
13266 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13267 {
13268 base = get_thread_pointer (Pmode,
13269 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13270 return gen_rtx_PLUS (Pmode, base, off);
13271 }
13272 else
13273 {
13274 base = get_thread_pointer (Pmode, true);
13275 dest = gen_reg_rtx (Pmode);
13276 emit_insn (ix86_gen_sub3 (dest, base, off));
13277 }
13278 break;
13279
13280 default:
13281 gcc_unreachable ();
13282 }
13283
13284 return dest;
13285 }
13286
13287 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13288 to symbol DECL if BEIMPORT is true. Otherwise create or return the
13289 unique refptr-DECL symbol corresponding to symbol DECL. */
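/* For instance, a dllimport reference to "foo" is redirected through an
   artificial read-only pointer variable whose assembler name is
   "*__imp_foo" (or "*__imp__foo" when a user label prefix is in use),
   and the refptr case analogously uses "*.refptr.foo" or "*refptr.foo";
   the original access then becomes a load through that pointer.  */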
13290
13291 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13292 htab_t dllimport_map;
13293
13294 static tree
13295 get_dllimport_decl (tree decl, bool beimport)
13296 {
13297 struct tree_map *h, in;
13298 void **loc;
13299 const char *name;
13300 const char *prefix;
13301 size_t namelen, prefixlen;
13302 char *imp_name;
13303 tree to;
13304 rtx rtl;
13305
13306 if (!dllimport_map)
13307 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13308
13309 in.hash = htab_hash_pointer (decl);
13310 in.base.from = decl;
13311 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13312 h = (struct tree_map *) *loc;
13313 if (h)
13314 return h->to;
13315
13316 *loc = h = ggc_alloc_tree_map ();
13317 h->hash = in.hash;
13318 h->base.from = decl;
13319 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13320 VAR_DECL, NULL, ptr_type_node);
13321 DECL_ARTIFICIAL (to) = 1;
13322 DECL_IGNORED_P (to) = 1;
13323 DECL_EXTERNAL (to) = 1;
13324 TREE_READONLY (to) = 1;
13325
13326 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13327 name = targetm.strip_name_encoding (name);
13328 if (beimport)
13329 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13330 ? "*__imp_" : "*__imp__";
13331 else
13332 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
13333 namelen = strlen (name);
13334 prefixlen = strlen (prefix);
13335 imp_name = (char *) alloca (namelen + prefixlen + 1);
13336 memcpy (imp_name, prefix, prefixlen);
13337 memcpy (imp_name + prefixlen, name, namelen + 1);
13338
13339 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13340 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13341 SET_SYMBOL_REF_DECL (rtl, to);
13342 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
13343 if (!beimport)
13344 {
13345 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
13346 #ifdef SUB_TARGET_RECORD_STUB
13347 SUB_TARGET_RECORD_STUB (name);
13348 #endif
13349 }
13350
13351 rtl = gen_const_mem (Pmode, rtl);
13352 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13353
13354 SET_DECL_RTL (to, rtl);
13355 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13356
13357 return to;
13358 }
13359
13360 /* Expand SYMBOL into its corresponding far-address symbol.
13361 WANT_REG is true if we require the result to be a register. */
13362
13363 static rtx
13364 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
13365 {
13366 tree imp_decl;
13367 rtx x;
13368
13369 gcc_assert (SYMBOL_REF_DECL (symbol));
13370 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
13371
13372 x = DECL_RTL (imp_decl);
13373 if (want_reg)
13374 x = force_reg (Pmode, x);
13375 return x;
13376 }
13377
13378 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13379 true if we require the result to be a register. */
13380
13381 static rtx
13382 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13383 {
13384 tree imp_decl;
13385 rtx x;
13386
13387 gcc_assert (SYMBOL_REF_DECL (symbol));
13388 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
13389
13390 x = DECL_RTL (imp_decl);
13391 if (want_reg)
13392 x = force_reg (Pmode, x);
13393 return x;
13394 }
13395
13396 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
13397 is true if we require the result to be a register. */
13398
13399 static rtx
13400 legitimize_pe_coff_symbol (rtx addr, bool inreg)
13401 {
13402 if (!TARGET_PECOFF)
13403 return NULL_RTX;
13404
13405 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13406 {
13407 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
13408 return legitimize_dllimport_symbol (addr, inreg);
13409 if (GET_CODE (addr) == CONST
13410 && GET_CODE (XEXP (addr, 0)) == PLUS
13411 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13412 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
13413 {
13414 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
13415 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13416 }
13417 }
13418
13419 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
13420 return NULL_RTX;
13421 if (GET_CODE (addr) == SYMBOL_REF
13422 && !is_imported_p (addr)
13423 && SYMBOL_REF_EXTERNAL_P (addr)
13424 && SYMBOL_REF_DECL (addr))
13425 return legitimize_pe_coff_extern_decl (addr, inreg);
13426
13427 if (GET_CODE (addr) == CONST
13428 && GET_CODE (XEXP (addr, 0)) == PLUS
13429 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13430 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
13431 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
13432 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
13433 {
13434 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
13435 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13436 }
13437 return NULL_RTX;
13438 }
13439
13440 /* Try machine-dependent ways of modifying an illegitimate address
13441 to be legitimate. If we find one, return the new, valid address.
13442 This macro is used in only one place: `memory_address' in explow.c.
13443
13444 OLDX is the address as it was before break_out_memory_refs was called.
13445 In some cases it is useful to look at this to decide what needs to be done.
13446
13447 It is always safe for this macro to do nothing. It exists to recognize
13448 opportunities to optimize the output.
13449
13450 For the 80386, we handle X+REG by loading X into a register R and
13451 using R+REG. R will go in a general reg and indexing will be used.
13452 However, if REG is a broken-out memory address or multiplication,
13453 nothing needs to be done because REG can certainly go in a general reg.
13454
13455 When -fpic is used, special handling is needed for symbolic references.
13456 See comments by legitimize_pic_address in i386.c for details. */
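/* A couple of concrete transformations performed below: an address like
   (plus (ashift (reg) (const_int 2)) (reg)) is first rewritten as
   (plus (mult (reg) (const_int 4)) (reg)) so that it matches the
   scaled-index form, and a symbolic constant in a PIC function is
   handed off to legitimize_pic_address.  */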
13457
13458 static rtx
13459 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13460 enum machine_mode mode)
13461 {
13462 int changed = 0;
13463 unsigned log;
13464
13465 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13466 if (log)
13467 return legitimize_tls_address (x, (enum tls_model) log, false);
13468 if (GET_CODE (x) == CONST
13469 && GET_CODE (XEXP (x, 0)) == PLUS
13470 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13471 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13472 {
13473 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13474 (enum tls_model) log, false);
13475 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13476 }
13477
13478 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13479 {
13480 rtx tmp = legitimize_pe_coff_symbol (x, true);
13481 if (tmp)
13482 return tmp;
13483 }
13484
13485 if (flag_pic && SYMBOLIC_CONST (x))
13486 return legitimize_pic_address (x, 0);
13487
13488 #if TARGET_MACHO
13489 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13490 return machopic_indirect_data_reference (x, 0);
13491 #endif
13492
13493 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13494 if (GET_CODE (x) == ASHIFT
13495 && CONST_INT_P (XEXP (x, 1))
13496 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13497 {
13498 changed = 1;
13499 log = INTVAL (XEXP (x, 1));
13500 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13501 GEN_INT (1 << log));
13502 }
13503
13504 if (GET_CODE (x) == PLUS)
13505 {
13506 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13507
13508 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13509 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13510 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13511 {
13512 changed = 1;
13513 log = INTVAL (XEXP (XEXP (x, 0), 1));
13514 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13515 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13516 GEN_INT (1 << log));
13517 }
13518
13519 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13520 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13521 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13522 {
13523 changed = 1;
13524 log = INTVAL (XEXP (XEXP (x, 1), 1));
13525 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13526 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13527 GEN_INT (1 << log));
13528 }
13529
13530 /* Put multiply first if it isn't already. */
13531 if (GET_CODE (XEXP (x, 1)) == MULT)
13532 {
13533 rtx tmp = XEXP (x, 0);
13534 XEXP (x, 0) = XEXP (x, 1);
13535 XEXP (x, 1) = tmp;
13536 changed = 1;
13537 }
13538
13539 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13540 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13541 created by virtual register instantiation, register elimination, and
13542 similar optimizations. */
13543 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13544 {
13545 changed = 1;
13546 x = gen_rtx_PLUS (Pmode,
13547 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13548 XEXP (XEXP (x, 1), 0)),
13549 XEXP (XEXP (x, 1), 1));
13550 }
13551
13552 /* Canonicalize
13553 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13554 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13555 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13556 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13557 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13558 && CONSTANT_P (XEXP (x, 1)))
13559 {
13560 rtx constant;
13561 rtx other = NULL_RTX;
13562
13563 if (CONST_INT_P (XEXP (x, 1)))
13564 {
13565 constant = XEXP (x, 1);
13566 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13567 }
13568 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13569 {
13570 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13571 other = XEXP (x, 1);
13572 }
13573 else
13574 constant = 0;
13575
13576 if (constant)
13577 {
13578 changed = 1;
13579 x = gen_rtx_PLUS (Pmode,
13580 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13581 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13582 plus_constant (Pmode, other,
13583 INTVAL (constant)));
13584 }
13585 }
13586
13587 if (changed && ix86_legitimate_address_p (mode, x, false))
13588 return x;
13589
13590 if (GET_CODE (XEXP (x, 0)) == MULT)
13591 {
13592 changed = 1;
13593 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13594 }
13595
13596 if (GET_CODE (XEXP (x, 1)) == MULT)
13597 {
13598 changed = 1;
13599 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13600 }
13601
13602 if (changed
13603 && REG_P (XEXP (x, 1))
13604 && REG_P (XEXP (x, 0)))
13605 return x;
13606
13607 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13608 {
13609 changed = 1;
13610 x = legitimize_pic_address (x, 0);
13611 }
13612
13613 if (changed && ix86_legitimate_address_p (mode, x, false))
13614 return x;
13615
13616 if (REG_P (XEXP (x, 0)))
13617 {
13618 rtx temp = gen_reg_rtx (Pmode);
13619 rtx val = force_operand (XEXP (x, 1), temp);
13620 if (val != temp)
13621 {
13622 val = convert_to_mode (Pmode, val, 1);
13623 emit_move_insn (temp, val);
13624 }
13625
13626 XEXP (x, 1) = temp;
13627 return x;
13628 }
13629
13630 else if (REG_P (XEXP (x, 1)))
13631 {
13632 rtx temp = gen_reg_rtx (Pmode);
13633 rtx val = force_operand (XEXP (x, 0), temp);
13634 if (val != temp)
13635 {
13636 val = convert_to_mode (Pmode, val, 1);
13637 emit_move_insn (temp, val);
13638 }
13639
13640 XEXP (x, 0) = temp;
13641 return x;
13642 }
13643 }
13644
13645 return x;
13646 }
13647 \f
13648 /* Print an integer constant expression in assembler syntax. Addition
13649 and subtraction are the only arithmetic that may appear in these
13650 expressions. FILE is the stdio stream to write to, X is the rtx, and
13651 CODE is the operand print code from the output string. */
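/* For example, (const (unspec [(symbol_ref ("foo"))] UNSPEC_GOTOFF)) is
   printed as "foo@GOTOFF", and the UNSPEC_GOTPCREL case prints
   "foo@GOTPCREL(%rip)" in AT&T syntax or "foo@GOTPCREL[rip]" in Intel
   syntax.  */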
13652
13653 static void
13654 output_pic_addr_const (FILE *file, rtx x, int code)
13655 {
13656 char buf[256];
13657
13658 switch (GET_CODE (x))
13659 {
13660 case PC:
13661 gcc_assert (flag_pic);
13662 putc ('.', file);
13663 break;
13664
13665 case SYMBOL_REF:
13666 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13667 output_addr_const (file, x);
13668 else
13669 {
13670 const char *name = XSTR (x, 0);
13671
13672 /* Mark the decl as referenced so that cgraph will
13673 output the function. */
13674 if (SYMBOL_REF_DECL (x))
13675 mark_decl_referenced (SYMBOL_REF_DECL (x));
13676
13677 #if TARGET_MACHO
13678 if (MACHOPIC_INDIRECT
13679 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13680 name = machopic_indirection_name (x, /*stub_p=*/true);
13681 #endif
13682 assemble_name (file, name);
13683 }
13684 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
13685 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13686 fputs ("@PLT", file);
13687 break;
13688
13689 case LABEL_REF:
13690 x = XEXP (x, 0);
13691 /* FALLTHRU */
13692 case CODE_LABEL:
13693 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13694 assemble_name (asm_out_file, buf);
13695 break;
13696
13697 case CONST_INT:
13698 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13699 break;
13700
13701 case CONST:
13702 /* This used to output parentheses around the expression,
13703 but that does not work on the 386 (either ATT or BSD assembler). */
13704 output_pic_addr_const (file, XEXP (x, 0), code);
13705 break;
13706
13707 case CONST_DOUBLE:
13708 if (GET_MODE (x) == VOIDmode)
13709 {
13710 /* We can use %d if the number is <32 bits and positive. */
13711 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13712 fprintf (file, "0x%lx%08lx",
13713 (unsigned long) CONST_DOUBLE_HIGH (x),
13714 (unsigned long) CONST_DOUBLE_LOW (x));
13715 else
13716 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13717 }
13718 else
13719 /* We can't handle floating point constants;
13720 TARGET_PRINT_OPERAND must handle them. */
13721 output_operand_lossage ("floating constant misused");
13722 break;
13723
13724 case PLUS:
13725 /* Some assemblers need integer constants to appear first. */
13726 if (CONST_INT_P (XEXP (x, 0)))
13727 {
13728 output_pic_addr_const (file, XEXP (x, 0), code);
13729 putc ('+', file);
13730 output_pic_addr_const (file, XEXP (x, 1), code);
13731 }
13732 else
13733 {
13734 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13735 output_pic_addr_const (file, XEXP (x, 1), code);
13736 putc ('+', file);
13737 output_pic_addr_const (file, XEXP (x, 0), code);
13738 }
13739 break;
13740
13741 case MINUS:
13742 if (!TARGET_MACHO)
13743 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13744 output_pic_addr_const (file, XEXP (x, 0), code);
13745 putc ('-', file);
13746 output_pic_addr_const (file, XEXP (x, 1), code);
13747 if (!TARGET_MACHO)
13748 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13749 break;
13750
13751 case UNSPEC:
13752 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13753 {
13754 bool f = i386_asm_output_addr_const_extra (file, x);
13755 gcc_assert (f);
13756 break;
13757 }
13758
13759 gcc_assert (XVECLEN (x, 0) == 1);
13760 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13761 switch (XINT (x, 1))
13762 {
13763 case UNSPEC_GOT:
13764 fputs ("@GOT", file);
13765 break;
13766 case UNSPEC_GOTOFF:
13767 fputs ("@GOTOFF", file);
13768 break;
13769 case UNSPEC_PLTOFF:
13770 fputs ("@PLTOFF", file);
13771 break;
13772 case UNSPEC_PCREL:
13773 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13774 "(%rip)" : "[rip]", file);
13775 break;
13776 case UNSPEC_GOTPCREL:
13777 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13778 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13779 break;
13780 case UNSPEC_GOTTPOFF:
13781 /* FIXME: This might be @TPOFF in Sun ld too. */
13782 fputs ("@gottpoff", file);
13783 break;
13784 case UNSPEC_TPOFF:
13785 fputs ("@tpoff", file);
13786 break;
13787 case UNSPEC_NTPOFF:
13788 if (TARGET_64BIT)
13789 fputs ("@tpoff", file);
13790 else
13791 fputs ("@ntpoff", file);
13792 break;
13793 case UNSPEC_DTPOFF:
13794 fputs ("@dtpoff", file);
13795 break;
13796 case UNSPEC_GOTNTPOFF:
13797 if (TARGET_64BIT)
13798 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13799 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13800 else
13801 fputs ("@gotntpoff", file);
13802 break;
13803 case UNSPEC_INDNTPOFF:
13804 fputs ("@indntpoff", file);
13805 break;
13806 #if TARGET_MACHO
13807 case UNSPEC_MACHOPIC_OFFSET:
13808 putc ('-', file);
13809 machopic_output_function_base_name (file);
13810 break;
13811 #endif
13812 default:
13813 output_operand_lossage ("invalid UNSPEC as operand");
13814 break;
13815 }
13816 break;
13817
13818 default:
13819 output_operand_lossage ("invalid expression as operand");
13820 }
13821 }
13822
13823 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13824 We need to emit DTP-relative relocations. */
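/* A sketch of the output, assuming ASM_LONG is "\t.long\t" and X is
   (symbol_ref "foo"): SIZE == 4 emits "\t.long\tfoo@dtpoff", while
   SIZE == 8 emits "\t.long\tfoo@dtpoff, 0".  */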
13825
13826 static void ATTRIBUTE_UNUSED
13827 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13828 {
13829 fputs (ASM_LONG, file);
13830 output_addr_const (file, x);
13831 fputs ("@dtpoff", file);
13832 switch (size)
13833 {
13834 case 4:
13835 break;
13836 case 8:
13837 fputs (", 0", file);
13838 break;
13839 default:
13840 gcc_unreachable ();
13841 }
13842 }
13843
13844 /* Return true if X is a representation of the PIC register. This copes
13845 with calls from ix86_find_base_term, where the register might have
13846 been replaced by a cselib value. */
13847
13848 static bool
13849 ix86_pic_register_p (rtx x)
13850 {
13851 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13852 return (pic_offset_table_rtx
13853 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13854 else
13855 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13856 }
13857
13858 /* Helper function for ix86_delegitimize_address.
13859 Attempt to delegitimize TLS local-exec accesses. */
13860
13861 static rtx
13862 ix86_delegitimize_tls_address (rtx orig_x)
13863 {
13864 rtx x = orig_x, unspec;
13865 struct ix86_address addr;
13866
13867 if (!TARGET_TLS_DIRECT_SEG_REFS)
13868 return orig_x;
13869 if (MEM_P (x))
13870 x = XEXP (x, 0);
13871 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13872 return orig_x;
13873 if (ix86_decompose_address (x, &addr) == 0
13874 || addr.seg != DEFAULT_TLS_SEG_REG
13875 || addr.disp == NULL_RTX
13876 || GET_CODE (addr.disp) != CONST)
13877 return orig_x;
13878 unspec = XEXP (addr.disp, 0);
13879 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13880 unspec = XEXP (unspec, 0);
13881 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13882 return orig_x;
13883 x = XVECEXP (unspec, 0, 0);
13884 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13885 if (unspec != XEXP (addr.disp, 0))
13886 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13887 if (addr.index)
13888 {
13889 rtx idx = addr.index;
13890 if (addr.scale != 1)
13891 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13892 x = gen_rtx_PLUS (Pmode, idx, x);
13893 }
13894 if (addr.base)
13895 x = gen_rtx_PLUS (Pmode, addr.base, x);
13896 if (MEM_P (orig_x))
13897 x = replace_equiv_address_nv (orig_x, x);
13898 return x;
13899 }
13900
13901 /* In the name of slightly smaller debug output, and to cater to
13902 general assembler lossage, recognize PIC+GOTOFF and turn it back
13903 into a direct symbol reference.
13904
13905 On Darwin, this is necessary to avoid a crash, because Darwin
13906 has a different PIC label for each routine but the DWARF debugging
13907 information is not associated with any particular routine, so it's
13908 necessary to remove references to the PIC label from RTL stored by
13909 the DWARF output code. */
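/* As an illustration, a legitimized address such as
   (plus (reg ebx) (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)))
   is rewritten back to the bare (symbol_ref "foo") below.  */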
13910
13911 static rtx
13912 ix86_delegitimize_address (rtx x)
13913 {
13914 rtx orig_x = delegitimize_mem_from_attrs (x);
13915 /* addend is NULL or some rtx if x is something+GOTOFF where
13916 something doesn't include the PIC register. */
13917 rtx addend = NULL_RTX;
13918 /* reg_addend is NULL or a multiple of some register. */
13919 rtx reg_addend = NULL_RTX;
13920 /* const_addend is NULL or a const_int. */
13921 rtx const_addend = NULL_RTX;
13922 /* This is the result, or NULL. */
13923 rtx result = NULL_RTX;
13924
13925 x = orig_x;
13926
13927 if (MEM_P (x))
13928 x = XEXP (x, 0);
13929
13930 if (TARGET_64BIT)
13931 {
13932 if (GET_CODE (x) == CONST
13933 && GET_CODE (XEXP (x, 0)) == PLUS
13934 && GET_MODE (XEXP (x, 0)) == Pmode
13935 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13936 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13937 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13938 {
13939 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13940 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13941 if (MEM_P (orig_x))
13942 x = replace_equiv_address_nv (orig_x, x);
13943 return x;
13944 }
13945
13946 if (GET_CODE (x) == CONST
13947 && GET_CODE (XEXP (x, 0)) == UNSPEC
13948 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
13949 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
13950 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
13951 {
13952 x = XVECEXP (XEXP (x, 0), 0, 0);
13953 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13954 {
13955 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13956 GET_MODE (x), 0);
13957 if (x == NULL_RTX)
13958 return orig_x;
13959 }
13960 return x;
13961 }
13962
13963 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
13964 return ix86_delegitimize_tls_address (orig_x);
13965
13966 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
13967 and -mcmodel=medium -fpic. */
13968 }
13969
13970 if (GET_CODE (x) != PLUS
13971 || GET_CODE (XEXP (x, 1)) != CONST)
13972 return ix86_delegitimize_tls_address (orig_x);
13973
13974 if (ix86_pic_register_p (XEXP (x, 0)))
13975 /* %ebx + GOT/GOTOFF */
13976 ;
13977 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13978 {
13979 /* %ebx + %reg * scale + GOT/GOTOFF */
13980 reg_addend = XEXP (x, 0);
13981 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13982 reg_addend = XEXP (reg_addend, 1);
13983 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13984 reg_addend = XEXP (reg_addend, 0);
13985 else
13986 {
13987 reg_addend = NULL_RTX;
13988 addend = XEXP (x, 0);
13989 }
13990 }
13991 else
13992 addend = XEXP (x, 0);
13993
13994 x = XEXP (XEXP (x, 1), 0);
13995 if (GET_CODE (x) == PLUS
13996 && CONST_INT_P (XEXP (x, 1)))
13997 {
13998 const_addend = XEXP (x, 1);
13999 x = XEXP (x, 0);
14000 }
14001
14002 if (GET_CODE (x) == UNSPEC
14003 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
14004 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
14005 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
14006 && !MEM_P (orig_x) && !addend)))
14007 result = XVECEXP (x, 0, 0);
14008
14009 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
14010 && !MEM_P (orig_x))
14011 result = XVECEXP (x, 0, 0);
14012
14013 if (! result)
14014 return ix86_delegitimize_tls_address (orig_x);
14015
14016 if (const_addend)
14017 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
14018 if (reg_addend)
14019 result = gen_rtx_PLUS (Pmode, reg_addend, result);
14020 if (addend)
14021 {
14022 /* If the rest of original X doesn't involve the PIC register, add
14023 addend and subtract pic_offset_table_rtx. This can happen e.g.
14024 for code like:
14025 leal (%ebx, %ecx, 4), %ecx
14026 ...
14027 movl foo@GOTOFF(%ecx), %edx
14028 in which case we return (%ecx - %ebx) + foo. */
14029 if (pic_offset_table_rtx)
14030 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
14031 pic_offset_table_rtx),
14032 result);
14033 else
14034 return orig_x;
14035 }
14036 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
14037 {
14038 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
14039 if (result == NULL_RTX)
14040 return orig_x;
14041 }
14042 return result;
14043 }
14044
14045 /* If X is a machine specific address (i.e. a symbol or label being
14046 referenced as a displacement from the GOT implemented using an
14047 UNSPEC), then return the base term. Otherwise return X. */
14048
14049 rtx
14050 ix86_find_base_term (rtx x)
14051 {
14052 rtx term;
14053
14054 if (TARGET_64BIT)
14055 {
14056 if (GET_CODE (x) != CONST)
14057 return x;
14058 term = XEXP (x, 0);
14059 if (GET_CODE (term) == PLUS
14060 && (CONST_INT_P (XEXP (term, 1))
14061 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
14062 term = XEXP (term, 0);
14063 if (GET_CODE (term) != UNSPEC
14064 || (XINT (term, 1) != UNSPEC_GOTPCREL
14065 && XINT (term, 1) != UNSPEC_PCREL))
14066 return x;
14067
14068 return XVECEXP (term, 0, 0);
14069 }
14070
14071 return ix86_delegitimize_address (x);
14072 }
14073 \f
14074 static void
14075 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
14076 bool fp, FILE *file)
14077 {
14078 const char *suffix;
14079
14080 if (mode == CCFPmode || mode == CCFPUmode)
14081 {
14082 code = ix86_fp_compare_code_to_integer (code);
14083 mode = CCmode;
14084 }
14085 if (reverse)
14086 code = reverse_condition (code);
14087
14088 switch (code)
14089 {
14090 case EQ:
14091 switch (mode)
14092 {
14093 case CCAmode:
14094 suffix = "a";
14095 break;
14096
14097 case CCCmode:
14098 suffix = "c";
14099 break;
14100
14101 case CCOmode:
14102 suffix = "o";
14103 break;
14104
14105 case CCSmode:
14106 suffix = "s";
14107 break;
14108
14109 default:
14110 suffix = "e";
14111 }
14112 break;
14113 case NE:
14114 switch (mode)
14115 {
14116 case CCAmode:
14117 suffix = "na";
14118 break;
14119
14120 case CCCmode:
14121 suffix = "nc";
14122 break;
14123
14124 case CCOmode:
14125 suffix = "no";
14126 break;
14127
14128 case CCSmode:
14129 suffix = "ns";
14130 break;
14131
14132 default:
14133 suffix = "ne";
14134 }
14135 break;
14136 case GT:
14137 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
14138 suffix = "g";
14139 break;
14140 case GTU:
14141 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
14142 Those same assemblers have the same but opposite lossage on cmov. */
14143 if (mode == CCmode)
14144 suffix = fp ? "nbe" : "a";
14145 else
14146 gcc_unreachable ();
14147 break;
14148 case LT:
14149 switch (mode)
14150 {
14151 case CCNOmode:
14152 case CCGOCmode:
14153 suffix = "s";
14154 break;
14155
14156 case CCmode:
14157 case CCGCmode:
14158 suffix = "l";
14159 break;
14160
14161 default:
14162 gcc_unreachable ();
14163 }
14164 break;
14165 case LTU:
14166 if (mode == CCmode)
14167 suffix = "b";
14168 else if (mode == CCCmode)
14169 suffix = "c";
14170 else
14171 gcc_unreachable ();
14172 break;
14173 case GE:
14174 switch (mode)
14175 {
14176 case CCNOmode:
14177 case CCGOCmode:
14178 suffix = "ns";
14179 break;
14180
14181 case CCmode:
14182 case CCGCmode:
14183 suffix = "ge";
14184 break;
14185
14186 default:
14187 gcc_unreachable ();
14188 }
14189 break;
14190 case GEU:
14191 if (mode == CCmode)
14192 suffix = fp ? "nb" : "ae";
14193 else if (mode == CCCmode)
14194 suffix = "nc";
14195 else
14196 gcc_unreachable ();
14197 break;
14198 case LE:
14199 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
14200 suffix = "le";
14201 break;
14202 case LEU:
14203 if (mode == CCmode)
14204 suffix = "be";
14205 else
14206 gcc_unreachable ();
14207 break;
14208 case UNORDERED:
14209 suffix = fp ? "u" : "p";
14210 break;
14211 case ORDERED:
14212 suffix = fp ? "nu" : "np";
14213 break;
14214 default:
14215 gcc_unreachable ();
14216 }
14217 fputs (suffix, file);
14218 }
14219
14220 /* Print the name of register X to FILE based on its machine mode and number.
14221 If CODE is 'w', pretend the mode is HImode.
14222 If CODE is 'b', pretend the mode is QImode.
14223 If CODE is 'k', pretend the mode is SImode.
14224 If CODE is 'q', pretend the mode is DImode.
14225 If CODE is 'x', pretend the mode is V4SFmode.
14226 If CODE is 't', pretend the mode is V8SFmode.
14227 If CODE is 'g', pretend the mode is V16SFmode.
14228 If CODE is 'h', pretend the reg is the 'high' byte register.
14229 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack operand.
14230 If CODE is 'd', duplicate the operand for AVX instruction.
14231 */
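/* A few illustrative cases, assuming AT&T syntax: for the ax register,
   CODE 'b' prints "%al", 'w' prints "%ax", 'k' prints "%eax" and, in
   64-bit mode, 'q' prints "%rax"; for the extended register r8, 'b'
   prints "%r8b" and 'k' prints "%r8d".  */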
14232
14233 void
14234 print_reg (rtx x, int code, FILE *file)
14235 {
14236 const char *reg;
14237 unsigned int regno;
14238 bool duplicated = code == 'd' && TARGET_AVX;
14239
14240 if (ASSEMBLER_DIALECT == ASM_ATT)
14241 putc ('%', file);
14242
14243 if (x == pc_rtx)
14244 {
14245 gcc_assert (TARGET_64BIT);
14246 fputs ("rip", file);
14247 return;
14248 }
14249
14250 regno = true_regnum (x);
14251 gcc_assert (regno != ARG_POINTER_REGNUM
14252 && regno != FRAME_POINTER_REGNUM
14253 && regno != FLAGS_REG
14254 && regno != FPSR_REG
14255 && regno != FPCR_REG);
14256
14257 if (code == 'w' || MMX_REG_P (x))
14258 code = 2;
14259 else if (code == 'b')
14260 code = 1;
14261 else if (code == 'k')
14262 code = 4;
14263 else if (code == 'q')
14264 code = 8;
14265 else if (code == 'y')
14266 code = 3;
14267 else if (code == 'h')
14268 code = 0;
14269 else if (code == 'x')
14270 code = 16;
14271 else if (code == 't')
14272 code = 32;
14273 else if (code == 'g')
14274 code = 64;
14275 else
14276 code = GET_MODE_SIZE (GET_MODE (x));
14277
14278 /* Irritatingly, AMD extended registers use a different naming convention
14279 from the normal registers: "r%d[bwd]". */
14280 if (REX_INT_REGNO_P (regno))
14281 {
14282 gcc_assert (TARGET_64BIT);
14283 putc ('r', file);
14284 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
14285 switch (code)
14286 {
14287 case 0:
14288 error ("extended registers have no high halves");
14289 break;
14290 case 1:
14291 putc ('b', file);
14292 break;
14293 case 2:
14294 putc ('w', file);
14295 break;
14296 case 4:
14297 putc ('d', file);
14298 break;
14299 case 8:
14300 /* no suffix */
14301 break;
14302 default:
14303 error ("unsupported operand size for extended register");
14304 break;
14305 }
14306 return;
14307 }
14308
14309 reg = NULL;
14310 switch (code)
14311 {
14312 case 3:
14313 if (STACK_TOP_P (x))
14314 {
14315 reg = "st(0)";
14316 break;
14317 }
14318 /* FALLTHRU */
14319 case 8:
14320 case 4:
14321 case 12:
14322 if (! ANY_FP_REG_P (x) && ! ANY_BND_REG_P (x))
14323 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14324 /* FALLTHRU */
14325 case 16:
14326 case 2:
14327 normal:
14328 reg = hi_reg_name[regno];
14329 break;
14330 case 1:
14331 if (regno >= ARRAY_SIZE (qi_reg_name))
14332 goto normal;
14333 reg = qi_reg_name[regno];
14334 break;
14335 case 0:
14336 if (regno >= ARRAY_SIZE (qi_high_reg_name))
14337 goto normal;
14338 reg = qi_high_reg_name[regno];
14339 break;
14340 case 32:
14341 if (SSE_REG_P (x))
14342 {
14343 gcc_assert (!duplicated);
14344 putc ('y', file);
14345 fputs (hi_reg_name[regno] + 1, file);
14346 return;
14347 }
14348 case 64:
14349 if (SSE_REG_P (x))
14350 {
14351 gcc_assert (!duplicated);
14352 putc ('z', file);
14353 fputs (hi_reg_name[REGNO (x)] + 1, file);
14354 return;
14355 }
14356 break;
14357 default:
14358 gcc_unreachable ();
14359 }
14360
14361 fputs (reg, file);
14362 if (duplicated)
14363 {
14364 if (ASSEMBLER_DIALECT == ASM_ATT)
14365 fprintf (file, ", %%%s", reg);
14366 else
14367 fprintf (file, ", %s", reg);
14368 }
14369 }
14370
14371 /* Locate some local-dynamic symbol still in use by this function
14372 so that we can print its name in some tls_local_dynamic_base
14373 pattern. */
14374
14375 static int
14376 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14377 {
14378 rtx x = *px;
14379
14380 if (GET_CODE (x) == SYMBOL_REF
14381 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14382 {
14383 cfun->machine->some_ld_name = XSTR (x, 0);
14384 return 1;
14385 }
14386
14387 return 0;
14388 }
14389
14390 static const char *
14391 get_some_local_dynamic_name (void)
14392 {
14393 rtx insn;
14394
14395 if (cfun->machine->some_ld_name)
14396 return cfun->machine->some_ld_name;
14397
14398 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14399 if (NONDEBUG_INSN_P (insn)
14400 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14401 return cfun->machine->some_ld_name;
14402
14403 return NULL;
14404 }
14405
14406 /* Meaning of CODE:
14407 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14408 C -- print opcode suffix for set/cmov insn.
14409 c -- like C, but print reversed condition
14410 F,f -- likewise, but for floating-point.
14411 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14412 otherwise nothing
14413 R -- print the prefix for register names.
14414 z -- print the opcode suffix for the size of the current operand.
14415 Z -- likewise, with special suffixes for x87 instructions.
14416 * -- print a star (in certain assembler syntax)
14417 A -- print an absolute memory reference.
14418 E -- print address with DImode register names if TARGET_64BIT.
14419 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14420 s -- print a shift double count, followed by the assembler's argument
14421 delimiter.
14422 b -- print the QImode name of the register for the indicated operand.
14423 %b0 would print %al if operands[0] is reg 0.
14424 w -- likewise, print the HImode name of the register.
14425 k -- likewise, print the SImode name of the register.
14426 q -- likewise, print the DImode name of the register.
14427 x -- likewise, print the V4SFmode name of the register.
14428 t -- likewise, print the V8SFmode name of the register.
14429 g -- likewise, print the V16SFmode name of the register.
14430 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14431 y -- print "st(0)" instead of "st" as a register.
14432 d -- print duplicated register operand for AVX instruction.
14433 D -- print condition for SSE cmp instruction.
14434 P -- if PIC, print an @PLT suffix.
14435 p -- print raw symbol name.
14436 X -- don't print any sort of PIC '@' suffix for a symbol.
14437 & -- print some in-use local-dynamic symbol name.
14438 H -- print a memory address offset by 8; used for sse high-parts
14439 Y -- print condition for XOP pcom* instruction.
14440 + -- print a branch hint as 'cs' or 'ds' prefix
14441 ; -- print a semicolon (after prefixes due to bug in older gas).
14442 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14443 @ -- print a segment register of thread base pointer load
14444 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14445 ! -- print MPX prefix for jxx/call/ret instructions if required.
14446 */
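/* An illustrative template (a sketch, not taken from any particular .md
   pattern): "mov%z0\t{%1, %0|%0, %1}" prints "movl" for an SImode
   operand 0 and "movq" for a DImode one, while "%b0" prints the QImode
   register name, e.g. "%al" when operand 0 is the ax register.  */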
14447
14448 void
14449 ix86_print_operand (FILE *file, rtx x, int code)
14450 {
14451 if (code)
14452 {
14453 switch (code)
14454 {
14455 case 'A':
14456 switch (ASSEMBLER_DIALECT)
14457 {
14458 case ASM_ATT:
14459 putc ('*', file);
14460 break;
14461
14462 case ASM_INTEL:
14463 /* Intel syntax. For absolute addresses, registers should not
14464 be surrounded by brackets. */
14465 if (!REG_P (x))
14466 {
14467 putc ('[', file);
14468 ix86_print_operand (file, x, 0);
14469 putc (']', file);
14470 return;
14471 }
14472 break;
14473
14474 default:
14475 gcc_unreachable ();
14476 }
14477
14478 ix86_print_operand (file, x, 0);
14479 return;
14480
14481 case 'E':
14482 /* Wrap address in an UNSPEC to declare special handling. */
14483 if (TARGET_64BIT)
14484 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14485
14486 output_address (x);
14487 return;
14488
14489 case 'L':
14490 if (ASSEMBLER_DIALECT == ASM_ATT)
14491 putc ('l', file);
14492 return;
14493
14494 case 'W':
14495 if (ASSEMBLER_DIALECT == ASM_ATT)
14496 putc ('w', file);
14497 return;
14498
14499 case 'B':
14500 if (ASSEMBLER_DIALECT == ASM_ATT)
14501 putc ('b', file);
14502 return;
14503
14504 case 'Q':
14505 if (ASSEMBLER_DIALECT == ASM_ATT)
14506 putc ('l', file);
14507 return;
14508
14509 case 'S':
14510 if (ASSEMBLER_DIALECT == ASM_ATT)
14511 putc ('s', file);
14512 return;
14513
14514 case 'T':
14515 if (ASSEMBLER_DIALECT == ASM_ATT)
14516 putc ('t', file);
14517 return;
14518
14519 case 'O':
14520 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14521 if (ASSEMBLER_DIALECT != ASM_ATT)
14522 return;
14523
14524 switch (GET_MODE_SIZE (GET_MODE (x)))
14525 {
14526 case 2:
14527 putc ('w', file);
14528 break;
14529
14530 case 4:
14531 putc ('l', file);
14532 break;
14533
14534 case 8:
14535 putc ('q', file);
14536 break;
14537
14538 default:
14539 output_operand_lossage
14540 ("invalid operand size for operand code 'O'");
14541 return;
14542 }
14543
14544 putc ('.', file);
14545 #endif
14546 return;
14547
14548 case 'z':
14549 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14550 {
14551 /* Opcodes don't get size suffixes if using Intel syntax. */
14552 if (ASSEMBLER_DIALECT == ASM_INTEL)
14553 return;
14554
14555 switch (GET_MODE_SIZE (GET_MODE (x)))
14556 {
14557 case 1:
14558 putc ('b', file);
14559 return;
14560
14561 case 2:
14562 putc ('w', file);
14563 return;
14564
14565 case 4:
14566 putc ('l', file);
14567 return;
14568
14569 case 8:
14570 putc ('q', file);
14571 return;
14572
14573 default:
14574 output_operand_lossage
14575 ("invalid operand size for operand code 'z'");
14576 return;
14577 }
14578 }
14579
14580 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14581 warning
14582 (0, "non-integer operand used with operand code 'z'");
14583 /* FALLTHRU */
14584
14585 case 'Z':
14586 /* 387 opcodes don't get size suffixes if using Intel syntax. */
14587 if (ASSEMBLER_DIALECT == ASM_INTEL)
14588 return;
14589
14590 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14591 {
14592 switch (GET_MODE_SIZE (GET_MODE (x)))
14593 {
14594 case 2:
14595 #ifdef HAVE_AS_IX86_FILDS
14596 putc ('s', file);
14597 #endif
14598 return;
14599
14600 case 4:
14601 putc ('l', file);
14602 return;
14603
14604 case 8:
14605 #ifdef HAVE_AS_IX86_FILDQ
14606 putc ('q', file);
14607 #else
14608 fputs ("ll", file);
14609 #endif
14610 return;
14611
14612 default:
14613 break;
14614 }
14615 }
14616 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14617 {
14618 /* 387 opcodes don't get size suffixes
14619 if the operands are registers. */
14620 if (STACK_REG_P (x))
14621 return;
14622
14623 switch (GET_MODE_SIZE (GET_MODE (x)))
14624 {
14625 case 4:
14626 putc ('s', file);
14627 return;
14628
14629 case 8:
14630 putc ('l', file);
14631 return;
14632
14633 case 12:
14634 case 16:
14635 putc ('t', file);
14636 return;
14637
14638 default:
14639 break;
14640 }
14641 }
14642 else
14643 {
14644 output_operand_lossage
14645 ("invalid operand type used with operand code 'Z'");
14646 return;
14647 }
14648
14649 output_operand_lossage
14650 ("invalid operand size for operand code 'Z'");
14651 return;
14652
14653 case 'd':
14654 case 'b':
14655 case 'w':
14656 case 'k':
14657 case 'q':
14658 case 'h':
14659 case 't':
14660 case 'g':
14661 case 'y':
14662 case 'x':
14663 case 'X':
14664 case 'P':
14665 case 'p':
14666 break;
14667
14668 case 's':
14669 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14670 {
14671 ix86_print_operand (file, x, 0);
14672 fputs (", ", file);
14673 }
14674 return;
14675
14676 case 'Y':
14677 switch (GET_CODE (x))
14678 {
14679 case NE:
14680 fputs ("neq", file);
14681 break;
14682 case EQ:
14683 fputs ("eq", file);
14684 break;
14685 case GE:
14686 case GEU:
14687 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14688 break;
14689 case GT:
14690 case GTU:
14691 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14692 break;
14693 case LE:
14694 case LEU:
14695 fputs ("le", file);
14696 break;
14697 case LT:
14698 case LTU:
14699 fputs ("lt", file);
14700 break;
14701 case UNORDERED:
14702 fputs ("unord", file);
14703 break;
14704 case ORDERED:
14705 fputs ("ord", file);
14706 break;
14707 case UNEQ:
14708 fputs ("ueq", file);
14709 break;
14710 case UNGE:
14711 fputs ("nlt", file);
14712 break;
14713 case UNGT:
14714 fputs ("nle", file);
14715 break;
14716 case UNLE:
14717 fputs ("ule", file);
14718 break;
14719 case UNLT:
14720 fputs ("ult", file);
14721 break;
14722 case LTGT:
14723 fputs ("une", file);
14724 break;
14725 default:
14726 output_operand_lossage ("operand is not a condition code, "
14727 "invalid operand code 'Y'");
14728 return;
14729 }
14730 return;
14731
14732 case 'D':
14733 /* Little bit of braindamage here. The SSE compare instructions
14734 use completely different names for the comparisons than the
14735 fp conditional moves do. */
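/* For example, LT prints "lt" and UNORDERED prints "unord", so a
   hypothetical template such as "cmp%D3ss\t{%2, %0|%0, %2}" would emit
   "cmpltss" for an LT comparison.  */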
14736 switch (GET_CODE (x))
14737 {
14738 case UNEQ:
14739 if (TARGET_AVX)
14740 {
14741 fputs ("eq_us", file);
14742 break;
14743 }
14744 case EQ:
14745 fputs ("eq", file);
14746 break;
14747 case UNLT:
14748 if (TARGET_AVX)
14749 {
14750 fputs ("nge", file);
14751 break;
14752 }
14753 case LT:
14754 fputs ("lt", file);
14755 break;
14756 case UNLE:
14757 if (TARGET_AVX)
14758 {
14759 fputs ("ngt", file);
14760 break;
14761 }
14762 case LE:
14763 fputs ("le", file);
14764 break;
14765 case UNORDERED:
14766 fputs ("unord", file);
14767 break;
14768 case LTGT:
14769 if (TARGET_AVX)
14770 {
14771 fputs ("neq_oq", file);
14772 break;
14773 }
14774 case NE:
14775 fputs ("neq", file);
14776 break;
14777 case GE:
14778 if (TARGET_AVX)
14779 {
14780 fputs ("ge", file);
14781 break;
14782 }
14783 case UNGE:
14784 fputs ("nlt", file);
14785 break;
14786 case GT:
14787 if (TARGET_AVX)
14788 {
14789 fputs ("gt", file);
14790 break;
14791 }
14792 case UNGT:
14793 fputs ("nle", file);
14794 break;
14795 case ORDERED:
14796 fputs ("ord", file);
14797 break;
14798 default:
14799 output_operand_lossage ("operand is not a condition code, "
14800 "invalid operand code 'D'");
14801 return;
14802 }
14803 return;
14804
14805 case 'F':
14806 case 'f':
14807 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14808 if (ASSEMBLER_DIALECT == ASM_ATT)
14809 putc ('.', file);
14810 #endif
14811
14812 case 'C':
14813 case 'c':
14814 if (!COMPARISON_P (x))
14815 {
14816 output_operand_lossage ("operand is not a condition code, "
14817 "invalid operand code '%c'", code);
14818 return;
14819 }
14820 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14821 code == 'c' || code == 'f',
14822 code == 'F' || code == 'f',
14823 file);
14824 return;
14825
14826 case 'H':
14827 if (!offsettable_memref_p (x))
14828 {
14829 output_operand_lossage ("operand is not an offsettable memory "
14830 "reference, invalid operand code 'H'");
14831 return;
14832 }
14833 /* It doesn't actually matter what mode we use here, as we're
14834 only going to use this for printing. */
14835 x = adjust_address_nv (x, DImode, 8);
14836 /* Output 'qword ptr' for intel assembler dialect. */
14837 if (ASSEMBLER_DIALECT == ASM_INTEL)
14838 code = 'q';
14839 break;
14840
14841 case 'K':
14842 gcc_assert (CONST_INT_P (x));
14843
14844 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14845 #ifdef HAVE_AS_IX86_HLE
14846 fputs ("xacquire ", file);
14847 #else
14848 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14849 #endif
14850 else if (INTVAL (x) & IX86_HLE_RELEASE)
14851 #ifdef HAVE_AS_IX86_HLE
14852 fputs ("xrelease ", file);
14853 #else
14854 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14855 #endif
14856 /* We do not want to print value of the operand. */
14857 return;
14858
14859 case '*':
14860 if (ASSEMBLER_DIALECT == ASM_ATT)
14861 putc ('*', file);
14862 return;
14863
14864 case '&':
14865 {
14866 const char *name = get_some_local_dynamic_name ();
14867 if (name == NULL)
14868 output_operand_lossage ("'%%&' used without any "
14869 "local dynamic TLS references");
14870 else
14871 assemble_name (file, name);
14872 return;
14873 }
14874
14875 case '+':
14876 {
14877 rtx x;
14878
14879 if (!optimize
14880 || optimize_function_for_size_p (cfun)
14881 || !TARGET_BRANCH_PREDICTION_HINTS)
14882 return;
14883
14884 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14885 if (x)
14886 {
14887 int pred_val = XINT (x, 0);
14888
14889 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14890 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14891 {
14892 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14893 bool cputaken
14894 = final_forward_branch_p (current_output_insn) == 0;
14895
14896 /* Emit hints only in the case where the default branch
14897 prediction heuristics would fail. */
14898 if (taken != cputaken)
14899 {
14900 /* We use 3e (DS) prefix for taken branches and
14901 2e (CS) prefix for not taken branches. */
14902 if (taken)
14903 fputs ("ds ; ", file);
14904 else
14905 fputs ("cs ; ", file);
14906 }
14907 }
14908 }
14909 return;
14910 }
14911
14912 case ';':
14913 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14914 putc (';', file);
14915 #endif
14916 return;
14917
14918 case '@':
14919 if (ASSEMBLER_DIALECT == ASM_ATT)
14920 putc ('%', file);
14921
14922 /* The kernel uses a different segment register for performance
14923 reasons; this way a system call does not have to trash the
14924 userspace segment register, which would be expensive. */
14925 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14926 fputs ("fs", file);
14927 else
14928 fputs ("gs", file);
14929 return;
14930
14931 case '~':
14932 putc (TARGET_AVX2 ? 'i' : 'f', file);
14933 return;
14934
14935 case '^':
14936 if (TARGET_64BIT && Pmode != word_mode)
14937 fputs ("addr32 ", file);
14938 return;
14939
14940 case '!':
14941 if (ix86_bnd_prefixed_insn_p (NULL_RTX))
14942 fputs ("bnd ", file);
14943 return;
14944
14945 default:
14946 output_operand_lossage ("invalid operand code '%c'", code);
14947 }
14948 }
14949
14950 if (REG_P (x))
14951 print_reg (x, code, file);
14952
14953 else if (MEM_P (x))
14954 {
14955 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14956 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14957 && GET_MODE (x) != BLKmode)
14958 {
14959 const char * size;
14960 switch (GET_MODE_SIZE (GET_MODE (x)))
14961 {
14962 case 1: size = "BYTE"; break;
14963 case 2: size = "WORD"; break;
14964 case 4: size = "DWORD"; break;
14965 case 8: size = "QWORD"; break;
14966 case 12: size = "TBYTE"; break;
14967 case 16:
14968 if (GET_MODE (x) == XFmode)
14969 size = "TBYTE";
14970 else
14971 size = "XMMWORD";
14972 break;
14973 case 32: size = "YMMWORD"; break;
14974 case 64: size = "ZMMWORD"; break;
14975 default:
14976 gcc_unreachable ();
14977 }
14978
14979 /* Check for explicit size override (codes 'b', 'w', 'k',
14980 'q' and 'x') */
14981 if (code == 'b')
14982 size = "BYTE";
14983 else if (code == 'w')
14984 size = "WORD";
14985 else if (code == 'k')
14986 size = "DWORD";
14987 else if (code == 'q')
14988 size = "QWORD";
14989 else if (code == 'x')
14990 size = "XMMWORD";
14991
14992 fputs (size, file);
14993 fputs (" PTR ", file);
14994 }
14995
14996 x = XEXP (x, 0);
14997 /* Avoid (%rip) for call operands. */
14998 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14999 && !CONST_INT_P (x))
15000 output_addr_const (file, x);
15001 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
15002 output_operand_lossage ("invalid constraints for operand");
15003 else
15004 output_address (x);
15005 }
15006
15007 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
15008 {
15009 REAL_VALUE_TYPE r;
15010 long l;
15011
15012 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
15013 REAL_VALUE_TO_TARGET_SINGLE (r, l);
15014
15015 if (ASSEMBLER_DIALECT == ASM_ATT)
15016 putc ('$', file);
15017 /* Sign extend 32bit SFmode immediate to 8 bytes. */
15018 if (code == 'q')
15019 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
15020 (unsigned long long) (int) l);
15021 else
15022 fprintf (file, "0x%08x", (unsigned int) l);
15023 }
15024
15025 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
15026 {
15027 REAL_VALUE_TYPE r;
15028 long l[2];
15029
15030 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
15031 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
15032
15033 if (ASSEMBLER_DIALECT == ASM_ATT)
15034 putc ('$', file);
15035 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
15036 }
15037
15038 /* These float cases don't actually occur as immediate operands. */
15039 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
15040 {
15041 char dstr[30];
15042
15043 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
15044 fputs (dstr, file);
15045 }
15046
15047 else
15048 {
15049 /* We have patterns that allow zero sets of memory, for instance.
15050 In 64-bit mode, we should probably support all 8-byte vectors,
15051 since we can in fact encode that into an immediate. */
15052 if (GET_CODE (x) == CONST_VECTOR)
15053 {
15054 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
15055 x = const0_rtx;
15056 }
15057
15058 if (code != 'P' && code != 'p')
15059 {
15060 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
15061 {
15062 if (ASSEMBLER_DIALECT == ASM_ATT)
15063 putc ('$', file);
15064 }
15065 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
15066 || GET_CODE (x) == LABEL_REF)
15067 {
15068 if (ASSEMBLER_DIALECT == ASM_ATT)
15069 putc ('$', file);
15070 else
15071 fputs ("OFFSET FLAT:", file);
15072 }
15073 }
15074 if (CONST_INT_P (x))
15075 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
15076 else if (flag_pic || MACHOPIC_INDIRECT)
15077 output_pic_addr_const (file, x, code);
15078 else
15079 output_addr_const (file, x);
15080 }
15081 }
15082
15083 static bool
15084 ix86_print_operand_punct_valid_p (unsigned char code)
15085 {
15086 return (code == '@' || code == '*' || code == '+' || code == '&'
15087 || code == ';' || code == '~' || code == '^' || code == '!');
15088 }
15089 \f
15090 /* Print a memory operand whose address is ADDR. */
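/* An illustrative case: for base %ebx, index %ecx, scale 4 and
   displacement 8, the AT&T output below is "8(%ebx,%ecx,4)" and the
   Intel output is "[ebx+8+ecx*4]".  */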
15091
15092 static void
15093 ix86_print_operand_address (FILE *file, rtx addr)
15094 {
15095 struct ix86_address parts;
15096 rtx base, index, disp;
15097 int scale;
15098 int ok;
15099 bool vsib = false;
15100 int code = 0;
15101
15102 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
15103 {
15104 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15105 gcc_assert (parts.index == NULL_RTX);
15106 parts.index = XVECEXP (addr, 0, 1);
15107 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
15108 addr = XVECEXP (addr, 0, 0);
15109 vsib = true;
15110 }
15111 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
15112 {
15113 gcc_assert (TARGET_64BIT);
15114 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15115 code = 'q';
15116 }
15117 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
15118 {
15119 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
15120 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
15121 if (parts.base != NULL_RTX)
15122 {
15123 parts.index = parts.base;
15124 parts.scale = 1;
15125 }
15126 parts.base = XVECEXP (addr, 0, 0);
15127 addr = XVECEXP (addr, 0, 0);
15128 }
15129 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
15130 {
15131 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15132 gcc_assert (parts.index == NULL_RTX);
15133 parts.index = XVECEXP (addr, 0, 1);
15134 addr = XVECEXP (addr, 0, 0);
15135 }
15136 else
15137 ok = ix86_decompose_address (addr, &parts);
15138
15139 gcc_assert (ok);
15140
15141 base = parts.base;
15142 index = parts.index;
15143 disp = parts.disp;
15144 scale = parts.scale;
15145
15146 switch (parts.seg)
15147 {
15148 case SEG_DEFAULT:
15149 break;
15150 case SEG_FS:
15151 case SEG_GS:
15152 if (ASSEMBLER_DIALECT == ASM_ATT)
15153 putc ('%', file);
15154 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
15155 break;
15156 default:
15157 gcc_unreachable ();
15158 }
15159
15160 /* Use one byte shorter RIP relative addressing for 64bit mode. */
15161 if (TARGET_64BIT && !base && !index)
15162 {
15163 rtx symbol = disp;
15164
15165 if (GET_CODE (disp) == CONST
15166 && GET_CODE (XEXP (disp, 0)) == PLUS
15167 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15168 symbol = XEXP (XEXP (disp, 0), 0);
15169
15170 if (GET_CODE (symbol) == LABEL_REF
15171 || (GET_CODE (symbol) == SYMBOL_REF
15172 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
15173 base = pc_rtx;
15174 }
15175 if (!base && !index)
15176 {
15177 /* A displacement-only address requires special attention. */
15178
15179 if (CONST_INT_P (disp))
15180 {
15181 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
15182 fputs ("ds:", file);
15183 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
15184 }
15185 else if (flag_pic)
15186 output_pic_addr_const (file, disp, 0);
15187 else
15188 output_addr_const (file, disp);
15189 }
15190 else
15191 {
15192 /* Print SImode register names to force addr32 prefix. */
15193 if (SImode_address_operand (addr, VOIDmode))
15194 {
15195 #ifdef ENABLE_CHECKING
15196 gcc_assert (TARGET_64BIT);
15197 switch (GET_CODE (addr))
15198 {
15199 case SUBREG:
15200 gcc_assert (GET_MODE (addr) == SImode);
15201 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
15202 break;
15203 case ZERO_EXTEND:
15204 case AND:
15205 gcc_assert (GET_MODE (addr) == DImode);
15206 break;
15207 default:
15208 gcc_unreachable ();
15209 }
15210 #endif
15211 gcc_assert (!code);
15212 code = 'k';
15213 }
15214 else if (code == 0
15215 && TARGET_X32
15216 && disp
15217 && CONST_INT_P (disp)
15218 && INTVAL (disp) < -16*1024*1024)
15219 {
15220 /* X32 runs in 64-bit mode, where displacement, DISP, in
15221 address DISP(%r64), is encoded as 32-bit immediate sign-
15222 extended from 32-bit to 64-bit. For -0x40000300(%r64),
15223 address is %r64 + 0xffffffffbffffd00. When %r64 <
15224 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
15225 which is invalid for x32. The correct address is %r64
15226 - 0x40000300 == 0xf7ffdd64. To properly encode
15227 -0x40000300(%r64) for x32, we zero-extend negative
15228 displacement by forcing addr32 prefix which truncates
15229 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
15230 zero-extend all negative displacements, including -1(%rsp).
15231 However, for small negative displacements, sign-extension
15232 won't cause overflow. We only zero-extend negative
15233 displacements if they are < -16*1024*1024, which is also used
15234 to check legitimate address displacements for PIC. */
15235 code = 'k';
15236 }
15237
15238 if (ASSEMBLER_DIALECT == ASM_ATT)
15239 {
15240 if (disp)
15241 {
15242 if (flag_pic)
15243 output_pic_addr_const (file, disp, 0);
15244 else if (GET_CODE (disp) == LABEL_REF)
15245 output_asm_label (disp);
15246 else
15247 output_addr_const (file, disp);
15248 }
15249
15250 putc ('(', file);
15251 if (base)
15252 print_reg (base, code, file);
15253 if (index)
15254 {
15255 putc (',', file);
15256 print_reg (index, vsib ? 0 : code, file);
15257 if (scale != 1 || vsib)
15258 fprintf (file, ",%d", scale);
15259 }
15260 putc (')', file);
15261 }
15262 else
15263 {
15264 rtx offset = NULL_RTX;
15265
15266 if (disp)
15267 {
15268 /* Pull out the offset of a symbol; print any symbol itself. */
15269 if (GET_CODE (disp) == CONST
15270 && GET_CODE (XEXP (disp, 0)) == PLUS
15271 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15272 {
15273 offset = XEXP (XEXP (disp, 0), 1);
15274 disp = gen_rtx_CONST (VOIDmode,
15275 XEXP (XEXP (disp, 0), 0));
15276 }
15277
15278 if (flag_pic)
15279 output_pic_addr_const (file, disp, 0);
15280 else if (GET_CODE (disp) == LABEL_REF)
15281 output_asm_label (disp);
15282 else if (CONST_INT_P (disp))
15283 offset = disp;
15284 else
15285 output_addr_const (file, disp);
15286 }
15287
15288 putc ('[', file);
15289 if (base)
15290 {
15291 print_reg (base, code, file);
15292 if (offset)
15293 {
15294 if (INTVAL (offset) >= 0)
15295 putc ('+', file);
15296 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15297 }
15298 }
15299 else if (offset)
15300 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15301 else
15302 putc ('0', file);
15303
15304 if (index)
15305 {
15306 putc ('+', file);
15307 print_reg (index, vsib ? 0 : code, file);
15308 if (scale != 1 || vsib)
15309 fprintf (file, "*%d", scale);
15310 }
15311 putc (']', file);
15312 }
15313 }
15314 }
15315
15316 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
15317
15318 static bool
15319 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15320 {
15321 rtx op;
15322
15323 if (GET_CODE (x) != UNSPEC)
15324 return false;
15325
15326 op = XVECEXP (x, 0, 0);
15327 switch (XINT (x, 1))
15328 {
15329 case UNSPEC_GOTTPOFF:
15330 output_addr_const (file, op);
15331 /* FIXME: This might be @TPOFF in Sun ld. */
15332 fputs ("@gottpoff", file);
15333 break;
15334 case UNSPEC_TPOFF:
15335 output_addr_const (file, op);
15336 fputs ("@tpoff", file);
15337 break;
15338 case UNSPEC_NTPOFF:
15339 output_addr_const (file, op);
15340 if (TARGET_64BIT)
15341 fputs ("@tpoff", file);
15342 else
15343 fputs ("@ntpoff", file);
15344 break;
15345 case UNSPEC_DTPOFF:
15346 output_addr_const (file, op);
15347 fputs ("@dtpoff", file);
15348 break;
15349 case UNSPEC_GOTNTPOFF:
15350 output_addr_const (file, op);
15351 if (TARGET_64BIT)
15352 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15353 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15354 else
15355 fputs ("@gotntpoff", file);
15356 break;
15357 case UNSPEC_INDNTPOFF:
15358 output_addr_const (file, op);
15359 fputs ("@indntpoff", file);
15360 break;
15361 #if TARGET_MACHO
15362 case UNSPEC_MACHOPIC_OFFSET:
15363 output_addr_const (file, op);
15364 putc ('-', file);
15365 machopic_output_function_base_name (file);
15366 break;
15367 #endif
15368
15369 case UNSPEC_STACK_CHECK:
15370 {
15371 int offset;
15372
15373 gcc_assert (flag_split_stack);
15374
15375 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15376 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15377 #else
15378 gcc_unreachable ();
15379 #endif
15380
15381 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15382 }
15383 break;
15384
15385 default:
15386 return false;
15387 }
15388
15389 return true;
15390 }
15391 \f
15392 /* Split one or more double-mode RTL references into pairs of half-mode
15393 references. The RTL can be REG, offsettable MEM, integer constant, or
15394 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15395 split and "num" is its length. lo_half and hi_half are output arrays
15396 that parallel "operands". */
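/* A small example of the intent, assuming DImode operands on a 32-bit
   little-endian target: a DImode MEM is split into an SImode MEM at
   offset 0 (lo_half) and one at offset 4 (hi_half), while the constant
   0x100000002 splits into the SImode constants 2 (lo) and 1 (hi).  */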
15397
15398 void
15399 split_double_mode (enum machine_mode mode, rtx operands[],
15400 int num, rtx lo_half[], rtx hi_half[])
15401 {
15402 enum machine_mode half_mode;
15403 unsigned int byte;
15404
15405 switch (mode)
15406 {
15407 case TImode:
15408 half_mode = DImode;
15409 break;
15410 case DImode:
15411 half_mode = SImode;
15412 break;
15413 default:
15414 gcc_unreachable ();
15415 }
15416
15417 byte = GET_MODE_SIZE (half_mode);
15418
15419 while (num--)
15420 {
15421 rtx op = operands[num];
15422
15423 /* simplify_subreg refuses to split volatile memory addresses,
15424 but we still have to handle them. */
15425 if (MEM_P (op))
15426 {
15427 lo_half[num] = adjust_address (op, half_mode, 0);
15428 hi_half[num] = adjust_address (op, half_mode, byte);
15429 }
15430 else
15431 {
15432 lo_half[num] = simplify_gen_subreg (half_mode, op,
15433 GET_MODE (op) == VOIDmode
15434 ? mode : GET_MODE (op), 0);
15435 hi_half[num] = simplify_gen_subreg (half_mode, op,
15436 GET_MODE (op) == VOIDmode
15437 ? mode : GET_MODE (op), byte);
15438 }
15439 }
15440 }
15441 \f
15442 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15443 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15444 is the expression of the binary operation. The output may either be
15445 emitted here, or returned to the caller, like all output_* functions.
15446
15447 There is no guarantee that the operands are the same mode, as they
15448 might be within FLOAT or FLOAT_EXTEND expressions. */
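/* Two illustrative return values (a sketch): a DFmode PLUS on SSE
   registers yields "addsd\t{%2, %0|%0, %2}" without AVX and
   "vaddsd\t{%2, %1, %0|%0, %1, %2}" with AVX, while the x87 case with
   operands[2] in memory yields "fadd%Z2\t%2".  */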
15449
15450 #ifndef SYSV386_COMPAT
15451 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15452 wants to fix the assemblers because that causes incompatibility
15453 with gcc. No-one wants to fix gcc because that causes
15454 incompatibility with assemblers... You can use the option of
15455 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15456 #define SYSV386_COMPAT 1
15457 #endif
15458
15459 const char *
15460 output_387_binary_op (rtx insn, rtx *operands)
15461 {
15462 static char buf[40];
15463 const char *p;
15464 const char *ssep;
15465 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15466
15467 #ifdef ENABLE_CHECKING
15468 /* Even if we do not want to check the inputs, this documents the input
15469 constraints, which helps in understanding the following code. */
15470 if (STACK_REG_P (operands[0])
15471 && ((REG_P (operands[1])
15472 && REGNO (operands[0]) == REGNO (operands[1])
15473 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15474 || (REG_P (operands[2])
15475 && REGNO (operands[0]) == REGNO (operands[2])
15476 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15477 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15478 ; /* ok */
15479 else
15480 gcc_assert (is_sse);
15481 #endif
15482
15483 switch (GET_CODE (operands[3]))
15484 {
15485 case PLUS:
15486 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15487 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15488 p = "fiadd";
15489 else
15490 p = "fadd";
15491 ssep = "vadd";
15492 break;
15493
15494 case MINUS:
15495 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15496 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15497 p = "fisub";
15498 else
15499 p = "fsub";
15500 ssep = "vsub";
15501 break;
15502
15503 case MULT:
15504 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15505 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15506 p = "fimul";
15507 else
15508 p = "fmul";
15509 ssep = "vmul";
15510 break;
15511
15512 case DIV:
15513 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15514 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15515 p = "fidiv";
15516 else
15517 p = "fdiv";
15518 ssep = "vdiv";
15519 break;
15520
15521 default:
15522 gcc_unreachable ();
15523 }
15524
15525 if (is_sse)
15526 {
15527 if (TARGET_AVX)
15528 {
15529 strcpy (buf, ssep);
15530 if (GET_MODE (operands[0]) == SFmode)
15531 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15532 else
15533 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15534 }
15535 else
15536 {
15537 strcpy (buf, ssep + 1);
15538 if (GET_MODE (operands[0]) == SFmode)
15539 strcat (buf, "ss\t{%2, %0|%0, %2}");
15540 else
15541 strcat (buf, "sd\t{%2, %0|%0, %2}");
15542 }
15543 return buf;
15544 }
15545 strcpy (buf, p);
15546
15547 switch (GET_CODE (operands[3]))
15548 {
15549 case MULT:
15550 case PLUS:
15551 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15552 {
15553 rtx temp = operands[2];
15554 operands[2] = operands[1];
15555 operands[1] = temp;
15556 }
15557
15558 /* We now know operands[0] == operands[1]. */
15559
15560 if (MEM_P (operands[2]))
15561 {
15562 p = "%Z2\t%2";
15563 break;
15564 }
15565
15566 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15567 {
15568 if (STACK_TOP_P (operands[0]))
15569 /* How is it that we are storing to a dead operand[2]?
15570 Well, presumably operands[1] is dead too. We can't
15571 store the result to st(0) as st(0) gets popped on this
15572 instruction. Instead store to operands[2] (which I
15573 think has to be st(1)). st(1) will be popped later.
15574 gcc <= 2.8.1 didn't have this check and generated
15575 assembly code that the Unixware assembler rejected. */
15576 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15577 else
15578 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15579 break;
15580 }
15581
15582 if (STACK_TOP_P (operands[0]))
15583 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15584 else
15585 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15586 break;
15587
15588 case MINUS:
15589 case DIV:
15590 if (MEM_P (operands[1]))
15591 {
15592 p = "r%Z1\t%1";
15593 break;
15594 }
15595
15596 if (MEM_P (operands[2]))
15597 {
15598 p = "%Z2\t%2";
15599 break;
15600 }
15601
15602 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15603 {
15604 #if SYSV386_COMPAT
15605 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15606 derived assemblers, confusingly reverse the direction of
15607 the operation for fsub{r} and fdiv{r} when the
15608 destination register is not st(0). The Intel assembler
15609 doesn't have this brain damage. Read !SYSV386_COMPAT to
15610 figure out what the hardware really does. */
15611 if (STACK_TOP_P (operands[0]))
15612 p = "{p\t%0, %2|rp\t%2, %0}";
15613 else
15614 p = "{rp\t%2, %0|p\t%0, %2}";
15615 #else
15616 if (STACK_TOP_P (operands[0]))
15617 /* As above for fmul/fadd, we can't store to st(0). */
15618 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15619 else
15620 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15621 #endif
15622 break;
15623 }
15624
15625 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15626 {
15627 #if SYSV386_COMPAT
15628 if (STACK_TOP_P (operands[0]))
15629 p = "{rp\t%0, %1|p\t%1, %0}";
15630 else
15631 p = "{p\t%1, %0|rp\t%0, %1}";
15632 #else
15633 if (STACK_TOP_P (operands[0]))
15634 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15635 else
15636 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15637 #endif
15638 break;
15639 }
15640
15641 if (STACK_TOP_P (operands[0]))
15642 {
15643 if (STACK_TOP_P (operands[1]))
15644 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15645 else
15646 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15647 break;
15648 }
15649 else if (STACK_TOP_P (operands[1]))
15650 {
15651 #if SYSV386_COMPAT
15652 p = "{\t%1, %0|r\t%0, %1}";
15653 #else
15654 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15655 #endif
15656 }
15657 else
15658 {
15659 #if SYSV386_COMPAT
15660 p = "{r\t%2, %0|\t%0, %2}";
15661 #else
15662 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15663 #endif
15664 }
15665 break;
15666
15667 default:
15668 gcc_unreachable ();
15669 }
15670
15671 strcat (buf, p);
15672 return buf;
15673 }
15674
15675 /* Check if a 256bit AVX register is referenced inside of EXP. */
15676
15677 static int
15678 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15679 {
15680 rtx exp = *pexp;
15681
15682 if (GET_CODE (exp) == SUBREG)
15683 exp = SUBREG_REG (exp);
15684
15685 if (REG_P (exp)
15686 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15687 return 1;
15688
15689 return 0;
15690 }
15691
15692 /* Return needed mode for entity in optimize_mode_switching pass. */
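/* As a rough illustration: an insn that reads or writes %ymm0 needs
   AVX_U128_DIRTY, while a call whose arguments use no 256bit registers
   needs AVX_U128_CLEAN, so the mode-switching pass will insert a
   vzeroupper between the two to avoid AVX/SSE transition penalties.  */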
15693
15694 static int
15695 ix86_avx_u128_mode_needed (rtx insn)
15696 {
15697 if (CALL_P (insn))
15698 {
15699 rtx link;
15700
15701 /* Needed mode is set to AVX_U128_CLEAN if there are
15702 no 256bit modes used in function arguments. */
15703 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15704 link;
15705 link = XEXP (link, 1))
15706 {
15707 if (GET_CODE (XEXP (link, 0)) == USE)
15708 {
15709 rtx arg = XEXP (XEXP (link, 0), 0);
15710
15711 if (ix86_check_avx256_register (&arg, NULL))
15712 return AVX_U128_DIRTY;
15713 }
15714 }
15715
15716 return AVX_U128_CLEAN;
15717 }
15718
15719 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
15720 changes state only when a 256bit register is written to, but we need
15721 to prevent the compiler from moving the optimal insertion point above
15722 an eventual read from a 256bit register. */
15723 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15724 return AVX_U128_DIRTY;
15725
15726 return AVX_U128_ANY;
15727 }
15728
15729 /* Return mode that i387 must be switched into
15730 prior to the execution of insn. */
15731
15732 static int
15733 ix86_i387_mode_needed (int entity, rtx insn)
15734 {
15735 enum attr_i387_cw mode;
15736
15737 /* The mode UNINITIALIZED is used to store the control word after a
15738 function call or ASM pattern. The mode ANY specifies that the function
15739 has no requirements on the control word and makes no changes in the
15740 bits we are interested in. */
15741
15742 if (CALL_P (insn)
15743 || (NONJUMP_INSN_P (insn)
15744 && (asm_noperands (PATTERN (insn)) >= 0
15745 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15746 return I387_CW_UNINITIALIZED;
15747
15748 if (recog_memoized (insn) < 0)
15749 return I387_CW_ANY;
15750
15751 mode = get_attr_i387_cw (insn);
15752
15753 switch (entity)
15754 {
15755 case I387_TRUNC:
15756 if (mode == I387_CW_TRUNC)
15757 return mode;
15758 break;
15759
15760 case I387_FLOOR:
15761 if (mode == I387_CW_FLOOR)
15762 return mode;
15763 break;
15764
15765 case I387_CEIL:
15766 if (mode == I387_CW_CEIL)
15767 return mode;
15768 break;
15769
15770 case I387_MASK_PM:
15771 if (mode == I387_CW_MASK_PM)
15772 return mode;
15773 break;
15774
15775 default:
15776 gcc_unreachable ();
15777 }
15778
15779 return I387_CW_ANY;
15780 }
15781
15782 /* Return mode that entity must be switched into
15783 prior to the execution of insn. */
15784
15785 int
15786 ix86_mode_needed (int entity, rtx insn)
15787 {
15788 switch (entity)
15789 {
15790 case AVX_U128:
15791 return ix86_avx_u128_mode_needed (insn);
15792 case I387_TRUNC:
15793 case I387_FLOOR:
15794 case I387_CEIL:
15795 case I387_MASK_PM:
15796 return ix86_i387_mode_needed (entity, insn);
15797 default:
15798 gcc_unreachable ();
15799 }
15800 return 0;
15801 }
15802
15803 /* Check if a 256bit AVX register is referenced in stores. */
15804
15805 static void
15806 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15807 {
15808 if (ix86_check_avx256_register (&dest, NULL))
15809 {
15810 bool *used = (bool *) data;
15811 *used = true;
15812 }
15813 }
15814
15815 /* Calculate mode of upper 128bit AVX registers after the insn. */
15816
15817 static int
15818 ix86_avx_u128_mode_after (int mode, rtx insn)
15819 {
15820 rtx pat = PATTERN (insn);
15821
15822 if (vzeroupper_operation (pat, VOIDmode)
15823 || vzeroall_operation (pat, VOIDmode))
15824 return AVX_U128_CLEAN;
15825
15826 /* We know that the state is clean after a CALL insn if there are no
15827 256bit registers used in the function return register. */
15828 if (CALL_P (insn))
15829 {
15830 bool avx_reg256_found = false;
15831 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15832
15833 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
15834 }
15835
15836 /* Otherwise, return the current mode. Remember that if the insn
15837 references AVX 256bit registers, the mode was already changed
15838 to DIRTY by MODE_NEEDED. */
15839 return mode;
15840 }
15841
15842 /* Return the mode that an insn results in. */
15843
15844 int
15845 ix86_mode_after (int entity, int mode, rtx insn)
15846 {
15847 switch (entity)
15848 {
15849 case AVX_U128:
15850 return ix86_avx_u128_mode_after (mode, insn);
15851 case I387_TRUNC:
15852 case I387_FLOOR:
15853 case I387_CEIL:
15854 case I387_MASK_PM:
15855 return mode;
15856 default:
15857 gcc_unreachable ();
15858 }
15859 }
15860
15861 static int
15862 ix86_avx_u128_mode_entry (void)
15863 {
15864 tree arg;
15865
15866 /* Entry mode is set to AVX_U128_DIRTY if there are
15867 256bit modes used in function arguments. */
15868 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15869 arg = TREE_CHAIN (arg))
15870 {
15871 rtx incoming = DECL_INCOMING_RTL (arg);
15872
15873 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15874 return AVX_U128_DIRTY;
15875 }
15876
15877 return AVX_U128_CLEAN;
15878 }
15879
15880 /* Return a mode that ENTITY is assumed to be
15881 switched to at function entry. */
15882
15883 int
15884 ix86_mode_entry (int entity)
15885 {
15886 switch (entity)
15887 {
15888 case AVX_U128:
15889 return ix86_avx_u128_mode_entry ();
15890 case I387_TRUNC:
15891 case I387_FLOOR:
15892 case I387_CEIL:
15893 case I387_MASK_PM:
15894 return I387_CW_ANY;
15895 default:
15896 gcc_unreachable ();
15897 }
15898 }
15899
15900 static int
15901 ix86_avx_u128_mode_exit (void)
15902 {
15903 rtx reg = crtl->return_rtx;
15904
15905 /* Exit mode is set to AVX_U128_DIRTY if there are
15906 256bit modes used in the function return register. */
15907 if (reg && ix86_check_avx256_register (&reg, NULL))
15908 return AVX_U128_DIRTY;
15909
15910 return AVX_U128_CLEAN;
15911 }
15912
15913 /* Return a mode that ENTITY is assumed to be
15914 switched to at function exit. */
15915
15916 int
15917 ix86_mode_exit (int entity)
15918 {
15919 switch (entity)
15920 {
15921 case AVX_U128:
15922 return ix86_avx_u128_mode_exit ();
15923 case I387_TRUNC:
15924 case I387_FLOOR:
15925 case I387_CEIL:
15926 case I387_MASK_PM:
15927 return I387_CW_ANY;
15928 default:
15929 gcc_unreachable ();
15930 }
15931 }
15932
15933 /* Output code to initialize the control word copies used by the trunc?f?i
15934 and rounding patterns. MODE selects which control word variant (truncate,
15935 floor, ceil or precision-mask) is set up in its stack slot. */
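/* For reference, the relevant x87 control word fields: the rounding-control
   field occupies bits 11:10 (00 = to nearest, 01 = down, 10 = up,
   11 = toward zero) and bit 5 (0x0020) masks the precision exception; the
   0x0c00, 0x0400, 0x0800 and 0x0020 constants below encode exactly these
   fields.  */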
15936
15937 static void
15938 emit_i387_cw_initialization (int mode)
15939 {
15940 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15941 rtx new_mode;
15942
15943 enum ix86_stack_slot slot;
15944
15945 rtx reg = gen_reg_rtx (HImode);
15946
15947 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15948 emit_move_insn (reg, copy_rtx (stored_mode));
15949
15950 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15951 || optimize_insn_for_size_p ())
15952 {
15953 switch (mode)
15954 {
15955 case I387_CW_TRUNC:
15956 /* round toward zero (truncate) */
15957 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15958 slot = SLOT_CW_TRUNC;
15959 break;
15960
15961 case I387_CW_FLOOR:
15962 /* round down toward -oo */
15963 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15964 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15965 slot = SLOT_CW_FLOOR;
15966 break;
15967
15968 case I387_CW_CEIL:
15969 /* round up toward +oo */
15970 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15971 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15972 slot = SLOT_CW_CEIL;
15973 break;
15974
15975 case I387_CW_MASK_PM:
15976 /* mask precision exception for nearbyint() */
15977 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15978 slot = SLOT_CW_MASK_PM;
15979 break;
15980
15981 default:
15982 gcc_unreachable ();
15983 }
15984 }
15985 else
15986 {
15987 switch (mode)
15988 {
15989 case I387_CW_TRUNC:
15990 /* round toward zero (truncate) */
15991 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15992 slot = SLOT_CW_TRUNC;
15993 break;
15994
15995 case I387_CW_FLOOR:
15996 /* round down toward -oo */
15997 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15998 slot = SLOT_CW_FLOOR;
15999 break;
16000
16001 case I387_CW_CEIL:
16002 /* round up toward +oo */
16003 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
16004 slot = SLOT_CW_CEIL;
16005 break;
16006
16007 case I387_CW_MASK_PM:
16008 /* mask precision exception for nearbyint() */
16009 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
16010 slot = SLOT_CW_MASK_PM;
16011 break;
16012
16013 default:
16014 gcc_unreachable ();
16015 }
16016 }
16017
16018 gcc_assert (slot < MAX_386_STACK_LOCALS);
16019
16020 new_mode = assign_386_stack_local (HImode, slot);
16021 emit_move_insn (new_mode, reg);
16022 }
16023
16024 /* Emit vzeroupper. */
16025
16026 void
16027 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
16028 {
16029 int i;
16030
16031 /* Cancel automatic vzeroupper insertion if there are
16032 live call-saved SSE registers at the insertion point. */
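/* (For context: vzeroupper zeroes bits 255:128 of every ymm register, so
   emitting it while a live register still holds a 256bit value would
   corrupt that value's upper half.)  */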
16033
16034 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
16035 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
16036 return;
16037
16038 if (TARGET_64BIT)
16039 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
16040 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
16041 return;
16042
16043 emit_insn (gen_avx_vzeroupper ());
16044 }
16045
16046 /* Generate one or more insns to set ENTITY to MODE. */
16047
16048 void
16049 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
16050 {
16051 switch (entity)
16052 {
16053 case AVX_U128:
16054 if (mode == AVX_U128_CLEAN)
16055 ix86_avx_emit_vzeroupper (regs_live);
16056 break;
16057 case I387_TRUNC:
16058 case I387_FLOOR:
16059 case I387_CEIL:
16060 case I387_MASK_PM:
16061 if (mode != I387_CW_ANY
16062 && mode != I387_CW_UNINITIALIZED)
16063 emit_i387_cw_initialization (mode);
16064 break;
16065 default:
16066 gcc_unreachable ();
16067 }
16068 }
16069
16070 /* Output code for INSN to convert a float to a signed int. OPERANDS
16071 are the insn operands. The output may be [HSD]Imode and the input
16072 operand may be [SDX]Fmode. */
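/* (Note: fisttp, available with SSE3, always truncates regardless of the
   control word's rounding bits, so it needs no control word switching;
   the non-fisttp path below instead brackets the store with an fldcw of
   the special control word and then of the original one.)  */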
16073
16074 const char *
16075 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
16076 {
16077 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16078 int dimode_p = GET_MODE (operands[0]) == DImode;
16079 int round_mode = get_attr_i387_cw (insn);
16080
16081 /* Jump through a hoop or two for DImode, since the hardware has no
16082 non-popping instruction. We used to do this a different way, but
16083 that was somewhat fragile and broke with post-reload splitters. */
16084 if ((dimode_p || fisttp) && !stack_top_dies)
16085 output_asm_insn ("fld\t%y1", operands);
16086
16087 gcc_assert (STACK_TOP_P (operands[1]));
16088 gcc_assert (MEM_P (operands[0]));
16089 gcc_assert (GET_MODE (operands[1]) != TFmode);
16090
16091 if (fisttp)
16092 output_asm_insn ("fisttp%Z0\t%0", operands);
16093 else
16094 {
16095 if (round_mode != I387_CW_ANY)
16096 output_asm_insn ("fldcw\t%3", operands);
16097 if (stack_top_dies || dimode_p)
16098 output_asm_insn ("fistp%Z0\t%0", operands);
16099 else
16100 output_asm_insn ("fist%Z0\t%0", operands);
16101 if (round_mode != I387_CW_ANY)
16102 output_asm_insn ("fldcw\t%2", operands);
16103 }
16104
16105 return "";
16106 }
16107
16108 /* Output code for x87 ffreep insn. The OPNO argument, which may only
16109 have the values zero or one, indicates the ffreep insn's operand
16110 from the OPERANDS array. */
16111
16112 static const char *
16113 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
16114 {
16115 if (TARGET_USE_FFREEP)
16116 #ifdef HAVE_AS_IX86_FFREEP
16117 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
16118 #else
16119 {
16120 static char retval[32];
16121 int regno = REGNO (operands[opno]);
16122
16123 gcc_assert (STACK_REGNO_P (regno));
16124
16125 regno -= FIRST_STACK_REG;
16126
16127 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
16128 return retval;
16129 }
16130 #endif
16131
16132 return opno ? "fstp\t%y1" : "fstp\t%y0";
16133 }
16134
16135
16136 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
16137 should be used. UNORDERED_P is true when fucom should be used. */
16138
16139 const char *
16140 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
16141 {
16142 int stack_top_dies;
16143 rtx cmp_op0, cmp_op1;
16144 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
16145
16146 if (eflags_p)
16147 {
16148 cmp_op0 = operands[0];
16149 cmp_op1 = operands[1];
16150 }
16151 else
16152 {
16153 cmp_op0 = operands[1];
16154 cmp_op1 = operands[2];
16155 }
16156
16157 if (is_sse)
16158 {
16159 if (GET_MODE (operands[0]) == SFmode)
16160 if (unordered_p)
16161 return "%vucomiss\t{%1, %0|%0, %1}";
16162 else
16163 return "%vcomiss\t{%1, %0|%0, %1}";
16164 else
16165 if (unordered_p)
16166 return "%vucomisd\t{%1, %0|%0, %1}";
16167 else
16168 return "%vcomisd\t{%1, %0|%0, %1}";
16169 }
16170
16171 gcc_assert (STACK_TOP_P (cmp_op0));
16172
16173 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16174
16175 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
16176 {
16177 if (stack_top_dies)
16178 {
16179 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
16180 return output_387_ffreep (operands, 1);
16181 }
16182 else
16183 return "ftst\n\tfnstsw\t%0";
16184 }
16185
16186 if (STACK_REG_P (cmp_op1)
16187 && stack_top_dies
16188 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
16189 && REGNO (cmp_op1) != FIRST_STACK_REG)
16190 {
16191 /* If the top of the 387 stack dies, and the other operand is also
16192 a stack register that dies, then this must be a `fcompp' float
16193 compare. */
16194
16195 if (eflags_p)
16196 {
16197 /* There is no double popping fcomi variant. Fortunately,
16198 eflags is immune from the fstp's cc clobbering. */
16199 if (unordered_p)
16200 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
16201 else
16202 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
16203 return output_387_ffreep (operands, 0);
16204 }
16205 else
16206 {
16207 if (unordered_p)
16208 return "fucompp\n\tfnstsw\t%0";
16209 else
16210 return "fcompp\n\tfnstsw\t%0";
16211 }
16212 }
16213 else
16214 {
16215 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
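/* For example, mask == 11 == (1 << 3) | (1 << 1) | 1 means an fcomi-style
   compare (eflags_p) with a floating-point operand, unordered, with the
   stack top dying, and selects "fucomip" from the table below.  */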
16216
16217 static const char * const alt[16] =
16218 {
16219 "fcom%Z2\t%y2\n\tfnstsw\t%0",
16220 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
16221 "fucom%Z2\t%y2\n\tfnstsw\t%0",
16222 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
16223
16224 "ficom%Z2\t%y2\n\tfnstsw\t%0",
16225 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
16226 NULL,
16227 NULL,
16228
16229 "fcomi\t{%y1, %0|%0, %y1}",
16230 "fcomip\t{%y1, %0|%0, %y1}",
16231 "fucomi\t{%y1, %0|%0, %y1}",
16232 "fucomip\t{%y1, %0|%0, %y1}",
16233
16234 NULL,
16235 NULL,
16236 NULL,
16237 NULL
16238 };
16239
16240 int mask;
16241 const char *ret;
16242
16243 mask = eflags_p << 3;
16244 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
16245 mask |= unordered_p << 1;
16246 mask |= stack_top_dies;
16247
16248 gcc_assert (mask < 16);
16249 ret = alt[mask];
16250 gcc_assert (ret);
16251
16252 return ret;
16253 }
16254 }
16255
16256 void
16257 ix86_output_addr_vec_elt (FILE *file, int value)
16258 {
16259 const char *directive = ASM_LONG;
16260
16261 #ifdef ASM_QUAD
16262 if (TARGET_LP64)
16263 directive = ASM_QUAD;
16264 #else
16265 gcc_assert (!TARGET_64BIT);
16266 #endif
16267
16268 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
16269 }
16270
16271 void
16272 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
16273 {
16274 const char *directive = ASM_LONG;
16275
16276 #ifdef ASM_QUAD
16277 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
16278 directive = ASM_QUAD;
16279 #else
16280 gcc_assert (!TARGET_64BIT);
16281 #endif
16282 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
16283 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
16284 fprintf (file, "%s%s%d-%s%d\n",
16285 directive, LPREFIX, value, LPREFIX, rel);
16286 else if (HAVE_AS_GOTOFF_IN_DATA)
16287 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
16288 #if TARGET_MACHO
16289 else if (TARGET_MACHO)
16290 {
16291 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
16292 machopic_output_function_base_name (file);
16293 putc ('\n', file);
16294 }
16295 #endif
16296 else
16297 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
16298 GOT_SYMBOL_NAME, LPREFIX, value);
16299 }
16300 \f
16301 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
16302 for the target. */
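/* (xor reg,reg sets the flags, which is why the xor form below is wrapped
   in a PARALLEL with a CLOBBER of FLAGS_REG, while the plain "mov $0" form
   leaves the flags untouched.)  */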
16303
16304 void
16305 ix86_expand_clear (rtx dest)
16306 {
16307 rtx tmp;
16308
16309 /* We play register width games, which are only valid after reload. */
16310 gcc_assert (reload_completed);
16311
16312 /* Avoid HImode and its attendant prefix byte. */
16313 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
16314 dest = gen_rtx_REG (SImode, REGNO (dest));
16315 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
16316
16317 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
16318 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
16319 {
16320 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16321 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
16322 }
16323
16324 emit_insn (tmp);
16325 }
16326
16327 /* X is an unchanging MEM. If it is a constant pool reference, return
16328 the constant pool rtx, else NULL. */
16329
16330 rtx
16331 maybe_get_pool_constant (rtx x)
16332 {
16333 x = ix86_delegitimize_address (XEXP (x, 0));
16334
16335 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
16336 return get_pool_constant (x);
16337
16338 return NULL_RTX;
16339 }
16340
16341 void
16342 ix86_expand_move (enum machine_mode mode, rtx operands[])
16343 {
16344 rtx op0, op1;
16345 enum tls_model model;
16346
16347 op0 = operands[0];
16348 op1 = operands[1];
16349
16350 if (GET_CODE (op1) == SYMBOL_REF)
16351 {
16352 rtx tmp;
16353
16354 model = SYMBOL_REF_TLS_MODEL (op1);
16355 if (model)
16356 {
16357 op1 = legitimize_tls_address (op1, model, true);
16358 op1 = force_operand (op1, op0);
16359 if (op1 == op0)
16360 return;
16361 op1 = convert_to_mode (mode, op1, 1);
16362 }
16363 else if ((tmp = legitimize_pe_coff_symbol (op1, false)) != NULL_RTX)
16364 op1 = tmp;
16365 }
16366 else if (GET_CODE (op1) == CONST
16367 && GET_CODE (XEXP (op1, 0)) == PLUS
16368 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
16369 {
16370 rtx addend = XEXP (XEXP (op1, 0), 1);
16371 rtx symbol = XEXP (XEXP (op1, 0), 0);
16372 rtx tmp;
16373
16374 model = SYMBOL_REF_TLS_MODEL (symbol);
16375 if (model)
16376 tmp = legitimize_tls_address (symbol, model, true);
16377 else
16378 tmp = legitimize_pe_coff_symbol (symbol, true);
16379
16380 if (tmp)
16381 {
16382 tmp = force_operand (tmp, NULL);
16383 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
16384 op0, 1, OPTAB_DIRECT);
16385 if (tmp == op0)
16386 return;
16387 op1 = convert_to_mode (mode, tmp, 1);
16388 }
16389 }
16390
16391 if ((flag_pic || MACHOPIC_INDIRECT)
16392 && symbolic_operand (op1, mode))
16393 {
16394 if (TARGET_MACHO && !TARGET_64BIT)
16395 {
16396 #if TARGET_MACHO
16397 /* dynamic-no-pic */
16398 if (MACHOPIC_INDIRECT)
16399 {
16400 rtx temp = ((reload_in_progress
16401 || ((op0 && REG_P (op0))
16402 && mode == Pmode))
16403 ? op0 : gen_reg_rtx (Pmode));
16404 op1 = machopic_indirect_data_reference (op1, temp);
16405 if (MACHOPIC_PURE)
16406 op1 = machopic_legitimize_pic_address (op1, mode,
16407 temp == op1 ? 0 : temp);
16408 }
16409 if (op0 != op1 && GET_CODE (op0) != MEM)
16410 {
16411 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16412 emit_insn (insn);
16413 return;
16414 }
16415 if (GET_CODE (op0) == MEM)
16416 op1 = force_reg (Pmode, op1);
16417 else
16418 {
16419 rtx temp = op0;
16420 if (GET_CODE (temp) != REG)
16421 temp = gen_reg_rtx (Pmode);
16422 temp = legitimize_pic_address (op1, temp);
16423 if (temp == op0)
16424 return;
16425 op1 = temp;
16426 }
16427 /* dynamic-no-pic */
16428 #endif
16429 }
16430 else
16431 {
16432 if (MEM_P (op0))
16433 op1 = force_reg (mode, op1);
16434 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16435 {
16436 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16437 op1 = legitimize_pic_address (op1, reg);
16438 if (op0 == op1)
16439 return;
16440 op1 = convert_to_mode (mode, op1, 1);
16441 }
16442 }
16443 }
16444 else
16445 {
16446 if (MEM_P (op0)
16447 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16448 || !push_operand (op0, mode))
16449 && MEM_P (op1))
16450 op1 = force_reg (mode, op1);
16451
16452 if (push_operand (op0, mode)
16453 && ! general_no_elim_operand (op1, mode))
16454 op1 = copy_to_mode_reg (mode, op1);
16455
16456 /* Force large constants in 64bit compilation into a register
16457 so that they get CSEed. */
16458 if (can_create_pseudo_p ()
16459 && (mode == DImode) && TARGET_64BIT
16460 && immediate_operand (op1, mode)
16461 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16462 && !register_operand (op0, mode)
16463 && optimize)
16464 op1 = copy_to_mode_reg (mode, op1);
16465
16466 if (can_create_pseudo_p ()
16467 && FLOAT_MODE_P (mode)
16468 && GET_CODE (op1) == CONST_DOUBLE)
16469 {
16470 /* If we are loading a floating point constant to a register,
16471 force the value to memory now, since we'll get better code
16472 out of the back end. */
16473
16474 op1 = validize_mem (force_const_mem (mode, op1));
16475 if (!register_operand (op0, mode))
16476 {
16477 rtx temp = gen_reg_rtx (mode);
16478 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16479 emit_move_insn (op0, temp);
16480 return;
16481 }
16482 }
16483 }
16484
16485 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16486 }
16487
16488 void
16489 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16490 {
16491 rtx op0 = operands[0], op1 = operands[1];
16492 unsigned int align = GET_MODE_ALIGNMENT (mode);
16493
16494 /* Force constants other than zero into memory. We do not know how
16495 the instructions used to build constants modify the upper 64 bits
16496 of the register; once we have that information we may be able
16497 to handle some of them more efficiently. */
16498 if (can_create_pseudo_p ()
16499 && register_operand (op0, mode)
16500 && (CONSTANT_P (op1)
16501 || (GET_CODE (op1) == SUBREG
16502 && CONSTANT_P (SUBREG_REG (op1))))
16503 && !standard_sse_constant_p (op1))
16504 op1 = validize_mem (force_const_mem (mode, op1));
16505
16506 /* We need to check memory alignment for SSE modes since an attribute
16507 can make operands unaligned. */
16508 if (can_create_pseudo_p ()
16509 && SSE_REG_MODE_P (mode)
16510 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16511 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16512 {
16513 rtx tmp[2];
16514
16515 /* ix86_expand_vector_move_misalign() does not like constants ... */
16516 if (CONSTANT_P (op1)
16517 || (GET_CODE (op1) == SUBREG
16518 && CONSTANT_P (SUBREG_REG (op1))))
16519 op1 = validize_mem (force_const_mem (mode, op1));
16520
16521 /* ... nor both arguments in memory. */
16522 if (!register_operand (op0, mode)
16523 && !register_operand (op1, mode))
16524 op1 = force_reg (mode, op1);
16525
16526 tmp[0] = op0; tmp[1] = op1;
16527 ix86_expand_vector_move_misalign (mode, tmp);
16528 return;
16529 }
16530
16531 /* Make operand1 a register if neither operand is already a register. */
16532 if (can_create_pseudo_p ()
16533 && !register_operand (op0, mode)
16534 && !register_operand (op1, mode))
16535 {
16536 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16537 return;
16538 }
16539
16540 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16541 }
16542
16543 /* Split 32-byte AVX unaligned load and store if needed. */
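/* (Sketch of what follows: when splitting is enabled, an unaligned 256bit
   load is emitted as a 128bit load of the low half followed by a VEC_CONCAT
   with the high half from memory -- normally matched as vinsertf128 -- and
   an unaligned 256bit store as two vextractf128 stores, one per 128bit
   half.)  */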
16544
16545 static void
16546 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16547 {
16548 rtx m;
16549 rtx (*extract) (rtx, rtx, rtx);
16550 rtx (*load_unaligned) (rtx, rtx);
16551 rtx (*store_unaligned) (rtx, rtx);
16552 enum machine_mode mode;
16553
16554 switch (GET_MODE (op0))
16555 {
16556 default:
16557 gcc_unreachable ();
16558 case V32QImode:
16559 extract = gen_avx_vextractf128v32qi;
16560 load_unaligned = gen_avx_loaddquv32qi;
16561 store_unaligned = gen_avx_storedquv32qi;
16562 mode = V16QImode;
16563 break;
16564 case V8SFmode:
16565 extract = gen_avx_vextractf128v8sf;
16566 load_unaligned = gen_avx_loadups256;
16567 store_unaligned = gen_avx_storeups256;
16568 mode = V4SFmode;
16569 break;
16570 case V4DFmode:
16571 extract = gen_avx_vextractf128v4df;
16572 load_unaligned = gen_avx_loadupd256;
16573 store_unaligned = gen_avx_storeupd256;
16574 mode = V2DFmode;
16575 break;
16576 }
16577
16578 if (MEM_P (op1))
16579 {
16580 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16581 {
16582 rtx r = gen_reg_rtx (mode);
16583 m = adjust_address (op1, mode, 0);
16584 emit_move_insn (r, m);
16585 m = adjust_address (op1, mode, 16);
16586 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16587 emit_move_insn (op0, r);
16588 }
16589 /* Normal *mov<mode>_internal pattern will handle
16590 unaligned loads just fine if misaligned_operand
16591 is true, and without the UNSPEC it can be combined
16592 with arithmetic instructions. */
16593 else if (misaligned_operand (op1, GET_MODE (op1)))
16594 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16595 else
16596 emit_insn (load_unaligned (op0, op1));
16597 }
16598 else if (MEM_P (op0))
16599 {
16600 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16601 {
16602 m = adjust_address (op0, mode, 0);
16603 emit_insn (extract (m, op1, const0_rtx));
16604 m = adjust_address (op0, mode, 16);
16605 emit_insn (extract (m, op1, const1_rtx));
16606 }
16607 else
16608 emit_insn (store_unaligned (op0, op1));
16609 }
16610 else
16611 gcc_unreachable ();
16612 }
16613
16614 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16615 straight to ix86_expand_vector_move. */
16616 /* Code generation for scalar reg-reg moves of single and double precision data:
16617 if (x86_sse_partial_reg_dependency == true || x86_sse_split_regs == true)
16618 movaps reg, reg
16619 else
16620 movss reg, reg
16621 if (x86_sse_partial_reg_dependency == true)
16622 movapd reg, reg
16623 else
16624 movsd reg, reg
16625
16626 Code generation for scalar loads of double precision data:
16627 if (x86_sse_split_regs == true)
16628 movlpd mem, reg (gas syntax)
16629 else
16630 movsd mem, reg
16631
16632 Code generation for unaligned packed loads of single precision data
16633 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16634 if (x86_sse_unaligned_move_optimal)
16635 movups mem, reg
16636
16637 if (x86_sse_partial_reg_dependency == true)
16638 {
16639 xorps reg, reg
16640 movlps mem, reg
16641 movhps mem+8, reg
16642 }
16643 else
16644 {
16645 movlps mem, reg
16646 movhps mem+8, reg
16647 }
16648
16649 Code generation for unaligned packed loads of double precision data
16650 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16651 if (x86_sse_unaligned_move_optimal)
16652 movupd mem, reg
16653
16654 if (x86_sse_split_regs == true)
16655 {
16656 movlpd mem, reg
16657 movhpd mem+8, reg
16658 }
16659 else
16660 {
16661 movsd mem, reg
16662 movhpd mem+8, reg
16663 }
16664 */
16665
16666 void
16667 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16668 {
16669 rtx op0, op1, orig_op0 = NULL_RTX, m;
16670 rtx (*load_unaligned) (rtx, rtx);
16671 rtx (*store_unaligned) (rtx, rtx);
16672
16673 op0 = operands[0];
16674 op1 = operands[1];
16675
16676 if (GET_MODE_SIZE (mode) == 64)
16677 {
16678 switch (GET_MODE_CLASS (mode))
16679 {
16680 case MODE_VECTOR_INT:
16681 case MODE_INT:
16682 if (GET_MODE (op0) != V16SImode)
16683 {
16684 if (!MEM_P (op0))
16685 {
16686 orig_op0 = op0;
16687 op0 = gen_reg_rtx (V16SImode);
16688 }
16689 else
16690 op0 = gen_lowpart (V16SImode, op0);
16691 }
16692 op1 = gen_lowpart (V16SImode, op1);
16693 /* FALLTHRU */
16694
16695 case MODE_VECTOR_FLOAT:
16696 switch (GET_MODE (op0))
16697 {
16698 default:
16699 gcc_unreachable ();
16700 case V16SImode:
16701 load_unaligned = gen_avx512f_loaddquv16si;
16702 store_unaligned = gen_avx512f_storedquv16si;
16703 break;
16704 case V16SFmode:
16705 load_unaligned = gen_avx512f_loadups512;
16706 store_unaligned = gen_avx512f_storeups512;
16707 break;
16708 case V8DFmode:
16709 load_unaligned = gen_avx512f_loadupd512;
16710 store_unaligned = gen_avx512f_storeupd512;
16711 break;
16712 }
16713
16714 if (MEM_P (op1))
16715 emit_insn (load_unaligned (op0, op1));
16716 else if (MEM_P (op0))
16717 emit_insn (store_unaligned (op0, op1));
16718 else
16719 gcc_unreachable ();
16720 if (orig_op0)
16721 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16722 break;
16723
16724 default:
16725 gcc_unreachable ();
16726 }
16727
16728 return;
16729 }
16730
16731 if (TARGET_AVX
16732 && GET_MODE_SIZE (mode) == 32)
16733 {
16734 switch (GET_MODE_CLASS (mode))
16735 {
16736 case MODE_VECTOR_INT:
16737 case MODE_INT:
16738 if (GET_MODE (op0) != V32QImode)
16739 {
16740 if (!MEM_P (op0))
16741 {
16742 orig_op0 = op0;
16743 op0 = gen_reg_rtx (V32QImode);
16744 }
16745 else
16746 op0 = gen_lowpart (V32QImode, op0);
16747 }
16748 op1 = gen_lowpart (V32QImode, op1);
16749 /* FALLTHRU */
16750
16751 case MODE_VECTOR_FLOAT:
16752 ix86_avx256_split_vector_move_misalign (op0, op1);
16753 if (orig_op0)
16754 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16755 break;
16756
16757 default:
16758 gcc_unreachable ();
16759 }
16760
16761 return;
16762 }
16763
16764 if (MEM_P (op1))
16765 {
16766 /* Normal *mov<mode>_internal pattern will handle
16767 unaligned loads just fine if misaligned_operand
16768 is true, and without the UNSPEC it can be combined
16769 with arithmetic instructions. */
16770 if (TARGET_AVX
16771 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
16772 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
16773 && misaligned_operand (op1, GET_MODE (op1)))
16774 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16775 /* ??? If we have typed data, then it would appear that using
16776 movdqu is the only way to get unaligned data loaded with
16777 integer type. */
16778 else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16779 {
16780 if (GET_MODE (op0) != V16QImode)
16781 {
16782 orig_op0 = op0;
16783 op0 = gen_reg_rtx (V16QImode);
16784 }
16785 op1 = gen_lowpart (V16QImode, op1);
16786 /* We will eventually emit movups based on insn attributes. */
16787 emit_insn (gen_sse2_loaddquv16qi (op0, op1));
16788 if (orig_op0)
16789 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16790 }
16791 else if (TARGET_SSE2 && mode == V2DFmode)
16792 {
16793 rtx zero;
16794
16795 if (TARGET_AVX
16796 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16797 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16798 || optimize_insn_for_size_p ())
16799 {
16800 /* We will eventually emit movups based on insn attributes. */
16801 emit_insn (gen_sse2_loadupd (op0, op1));
16802 return;
16803 }
16804
16805 /* When SSE registers are split into halves, we can avoid
16806 writing to the top half twice. */
16807 if (TARGET_SSE_SPLIT_REGS)
16808 {
16809 emit_clobber (op0);
16810 zero = op0;
16811 }
16812 else
16813 {
16814 /* ??? Not sure about the best option for the Intel chips.
16815 The following would seem to satisfy; the register is
16816 entirely cleared, breaking the dependency chain. We
16817 then store to the upper half, with a dependency depth
16818 of one. A rumor has it that Intel recommends two movsd
16819 followed by an unpacklpd, but this is unconfirmed. And
16820 given that the dependency depth of the unpacklpd would
16821 still be one, I'm not sure why this would be better. */
16822 zero = CONST0_RTX (V2DFmode);
16823 }
16824
16825 m = adjust_address (op1, DFmode, 0);
16826 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16827 m = adjust_address (op1, DFmode, 8);
16828 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16829 }
16830 else
16831 {
16832 rtx t;
16833
16834 if (TARGET_AVX
16835 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16836 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16837 || optimize_insn_for_size_p ())
16838 {
16839 if (GET_MODE (op0) != V4SFmode)
16840 {
16841 orig_op0 = op0;
16842 op0 = gen_reg_rtx (V4SFmode);
16843 }
16844 op1 = gen_lowpart (V4SFmode, op1);
16845 emit_insn (gen_sse_loadups (op0, op1));
16846 if (orig_op0)
16847 emit_move_insn (orig_op0,
16848 gen_lowpart (GET_MODE (orig_op0), op0));
16849 return;
16850 }
16851
16852 if (mode != V4SFmode)
16853 t = gen_reg_rtx (V4SFmode);
16854 else
16855 t = op0;
16856
16857 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16858 emit_move_insn (t, CONST0_RTX (V4SFmode));
16859 else
16860 emit_clobber (t);
16861
16862 m = adjust_address (op1, V2SFmode, 0);
16863 emit_insn (gen_sse_loadlps (t, t, m));
16864 m = adjust_address (op1, V2SFmode, 8);
16865 emit_insn (gen_sse_loadhps (t, t, m));
16866 if (mode != V4SFmode)
16867 emit_move_insn (op0, gen_lowpart (mode, t));
16868 }
16869 }
16870 else if (MEM_P (op0))
16871 {
16872 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16873 {
16874 op0 = gen_lowpart (V16QImode, op0);
16875 op1 = gen_lowpart (V16QImode, op1);
16876 /* We will eventually emit movups based on insn attributes. */
16877 emit_insn (gen_sse2_storedquv16qi (op0, op1));
16878 }
16879 else if (TARGET_SSE2 && mode == V2DFmode)
16880 {
16881 if (TARGET_AVX
16882 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16883 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16884 || optimize_insn_for_size_p ())
16885 /* We will eventually emit movups based on insn attributes. */
16886 emit_insn (gen_sse2_storeupd (op0, op1));
16887 else
16888 {
16889 m = adjust_address (op0, DFmode, 0);
16890 emit_insn (gen_sse2_storelpd (m, op1));
16891 m = adjust_address (op0, DFmode, 8);
16892 emit_insn (gen_sse2_storehpd (m, op1));
16893 }
16894 }
16895 else
16896 {
16897 if (mode != V4SFmode)
16898 op1 = gen_lowpart (V4SFmode, op1);
16899
16900 if (TARGET_AVX
16901 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16902 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16903 || optimize_insn_for_size_p ())
16904 {
16905 op0 = gen_lowpart (V4SFmode, op0);
16906 emit_insn (gen_sse_storeups (op0, op1));
16907 }
16908 else
16909 {
16910 m = adjust_address (op0, V2SFmode, 0);
16911 emit_insn (gen_sse_storelps (m, op1));
16912 m = adjust_address (op0, V2SFmode, 8);
16913 emit_insn (gen_sse_storehps (m, op1));
16914 }
16915 }
16916 }
16917 else
16918 gcc_unreachable ();
16919 }
16920
16921 /* Expand a push in MODE. This is some mode for which we do not support
16922 proper push instructions, at least from the registers that we expect
16923 the value to live in. */
16924
16925 void
16926 ix86_expand_push (enum machine_mode mode, rtx x)
16927 {
16928 rtx tmp;
16929
16930 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16931 GEN_INT (-GET_MODE_SIZE (mode)),
16932 stack_pointer_rtx, 1, OPTAB_DIRECT);
16933 if (tmp != stack_pointer_rtx)
16934 emit_move_insn (stack_pointer_rtx, tmp);
16935
16936 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16937
16938 /* When we push an operand onto the stack, it has to be aligned at least
16939 at the function argument boundary. However, since we don't have
16940 the argument type, we can't determine the actual argument
16941 boundary. */
16942 emit_move_insn (tmp, x);
16943 }
16944
16945 /* Helper function of ix86_fixup_binary_operands to canonicalize
16946 operand order. Returns true if the operands should be swapped. */
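/* (For instance, the commutative "dst = x + dst" is swapped to
   "dst = dst + x" so that src1 matches dst, as the two-address x86 forms
   want; immediates and memory operands are likewise pushed into the
   second slot.)  */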
16947
16948 static bool
16949 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16950 rtx operands[])
16951 {
16952 rtx dst = operands[0];
16953 rtx src1 = operands[1];
16954 rtx src2 = operands[2];
16955
16956 /* If the operation is not commutative, we can't do anything. */
16957 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16958 return false;
16959
16960 /* Highest priority is that src1 should match dst. */
16961 if (rtx_equal_p (dst, src1))
16962 return false;
16963 if (rtx_equal_p (dst, src2))
16964 return true;
16965
16966 /* Next highest priority is that immediate constants come second. */
16967 if (immediate_operand (src2, mode))
16968 return false;
16969 if (immediate_operand (src1, mode))
16970 return true;
16971
16972 /* Lowest priority is that memory references should come second. */
16973 if (MEM_P (src2))
16974 return false;
16975 if (MEM_P (src1))
16976 return true;
16977
16978 return false;
16979 }
16980
16981
16982 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16983 destination to use for the operation. If different from the true
16984 destination in operands[0], a copy operation will be required. */
16985
16986 rtx
16987 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16988 rtx operands[])
16989 {
16990 rtx dst = operands[0];
16991 rtx src1 = operands[1];
16992 rtx src2 = operands[2];
16993
16994 /* Canonicalize operand order. */
16995 if (ix86_swap_binary_operands_p (code, mode, operands))
16996 {
16997 rtx temp;
16998
16999 /* It is invalid to swap operands of different modes. */
17000 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
17001
17002 temp = src1;
17003 src1 = src2;
17004 src2 = temp;
17005 }
17006
17007 /* Both source operands cannot be in memory. */
17008 if (MEM_P (src1) && MEM_P (src2))
17009 {
17010 /* Optimization: Only read from memory once. */
17011 if (rtx_equal_p (src1, src2))
17012 {
17013 src2 = force_reg (mode, src2);
17014 src1 = src2;
17015 }
17016 else if (rtx_equal_p (dst, src1))
17017 src2 = force_reg (mode, src2);
17018 else
17019 src1 = force_reg (mode, src1);
17020 }
17021
17022 /* If the destination is memory, and we do not have matching source
17023 operands, do things in registers. */
17024 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17025 dst = gen_reg_rtx (mode);
17026
17027 /* Source 1 cannot be a constant. */
17028 if (CONSTANT_P (src1))
17029 src1 = force_reg (mode, src1);
17030
17031 /* Source 1 cannot be a non-matching memory. */
17032 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17033 src1 = force_reg (mode, src1);
17034
17035 /* Improve address combine. */
17036 if (code == PLUS
17037 && GET_MODE_CLASS (mode) == MODE_INT
17038 && MEM_P (src2))
17039 src2 = force_reg (mode, src2);
17040
17041 operands[1] = src1;
17042 operands[2] = src2;
17043 return dst;
17044 }
17045
17046 /* Similarly, but assume that the destination has already been
17047 set up properly. */
17048
17049 void
17050 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
17051 enum machine_mode mode, rtx operands[])
17052 {
17053 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
17054 gcc_assert (dst == operands[0]);
17055 }
17056
17057 /* Attempt to expand a binary operator. Make the expansion closer to the
17058 actual machine than just general_operand, which will allow 3 separate
17059 memory references (one output, two input) in a single insn. */
17060
17061 void
17062 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
17063 rtx operands[])
17064 {
17065 rtx src1, src2, dst, op, clob;
17066
17067 dst = ix86_fixup_binary_operands (code, mode, operands);
17068 src1 = operands[1];
17069 src2 = operands[2];
17070
17071 /* Emit the instruction. */
17072
17073 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
17074 if (reload_in_progress)
17075 {
17076 /* Reload doesn't know about the flags register, and doesn't know that
17077 it doesn't want to clobber it. We can only do this with PLUS. */
17078 gcc_assert (code == PLUS);
17079 emit_insn (op);
17080 }
17081 else if (reload_completed
17082 && code == PLUS
17083 && !rtx_equal_p (dst, src1))
17084 {
17085 /* This is going to be an LEA; avoid splitting it later. */
17086 emit_insn (op);
17087 }
17088 else
17089 {
17090 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17091 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17092 }
17093
17094 /* Fix up the destination if needed. */
17095 if (dst != operands[0])
17096 emit_move_insn (operands[0], dst);
17097 }
17098
17099 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
17100 the given OPERANDS. */
17101
17102 void
17103 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
17104 rtx operands[])
17105 {
17106 rtx op1 = NULL_RTX, op2 = NULL_RTX;
17107 if (GET_CODE (operands[1]) == SUBREG)
17108 {
17109 op1 = operands[1];
17110 op2 = operands[2];
17111 }
17112 else if (GET_CODE (operands[2]) == SUBREG)
17113 {
17114 op1 = operands[2];
17115 op2 = operands[1];
17116 }
17117 /* Optimize (__m128i) d | (__m128i) e and similar code
17118 when d and e are float vectors into float vector logical
17119 insn. In C/C++ without using intrinsics there is no other way
17120 to express vector logical operation on float vectors than
17121 to cast them temporarily to integer vectors. */
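/* (E.g. for two V4SF values this emits the operation as andps/orps/xorps
   in the floating-point domain instead of bouncing the data through the
   integer domain, which on many chips would add bypass latency.)  */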
17122 if (op1
17123 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
17124 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
17125 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
17126 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
17127 && SUBREG_BYTE (op1) == 0
17128 && (GET_CODE (op2) == CONST_VECTOR
17129 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
17130 && SUBREG_BYTE (op2) == 0))
17131 && can_create_pseudo_p ())
17132 {
17133 rtx dst;
17134 switch (GET_MODE (SUBREG_REG (op1)))
17135 {
17136 case V4SFmode:
17137 case V8SFmode:
17138 case V2DFmode:
17139 case V4DFmode:
17140 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
17141 if (GET_CODE (op2) == CONST_VECTOR)
17142 {
17143 op2 = gen_lowpart (GET_MODE (dst), op2);
17144 op2 = force_reg (GET_MODE (dst), op2);
17145 }
17146 else
17147 {
17148 op1 = operands[1];
17149 op2 = SUBREG_REG (operands[2]);
17150 if (!nonimmediate_operand (op2, GET_MODE (dst)))
17151 op2 = force_reg (GET_MODE (dst), op2);
17152 }
17153 op1 = SUBREG_REG (op1);
17154 if (!nonimmediate_operand (op1, GET_MODE (dst)))
17155 op1 = force_reg (GET_MODE (dst), op1);
17156 emit_insn (gen_rtx_SET (VOIDmode, dst,
17157 gen_rtx_fmt_ee (code, GET_MODE (dst),
17158 op1, op2)));
17159 emit_move_insn (operands[0], gen_lowpart (mode, dst));
17160 return;
17161 default:
17162 break;
17163 }
17164 }
17165 if (!nonimmediate_operand (operands[1], mode))
17166 operands[1] = force_reg (mode, operands[1]);
17167 if (!nonimmediate_operand (operands[2], mode))
17168 operands[2] = force_reg (mode, operands[2]);
17169 ix86_fixup_binary_operands_no_copy (code, mode, operands);
17170 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
17171 gen_rtx_fmt_ee (code, mode, operands[1],
17172 operands[2])));
17173 }
17174
17175 /* Return TRUE or FALSE depending on whether the binary operator meets the
17176 appropriate constraints. */
17177
17178 bool
17179 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
17180 rtx operands[3])
17181 {
17182 rtx dst = operands[0];
17183 rtx src1 = operands[1];
17184 rtx src2 = operands[2];
17185
17186 /* Both source operands cannot be in memory. */
17187 if (MEM_P (src1) && MEM_P (src2))
17188 return false;
17189
17190 /* Canonicalize operand order for commutative operators. */
17191 if (ix86_swap_binary_operands_p (code, mode, operands))
17192 {
17193 rtx temp = src1;
17194 src1 = src2;
17195 src2 = temp;
17196 }
17197
17198 /* If the destination is memory, we must have a matching source operand. */
17199 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17200 return false;
17201
17202 /* Source 1 cannot be a constant. */
17203 if (CONSTANT_P (src1))
17204 return false;
17205
17206 /* Source 1 cannot be a non-matching memory. */
17207 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17208 /* Support "andhi/andsi/anddi" as a zero-extending move. */
17209 return (code == AND
17210 && (mode == HImode
17211 || mode == SImode
17212 || (TARGET_64BIT && mode == DImode))
17213 && satisfies_constraint_L (src2));
17214
17215 return true;
17216 }
17217
17218 /* Attempt to expand a unary operator. Make the expansion closer to the
17219 actual machine than just general_operand, which will allow 2 separate
17220 memory references (one output, one input) in a single insn. */
17221
17222 void
17223 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
17224 rtx operands[])
17225 {
17226 int matching_memory;
17227 rtx src, dst, op, clob;
17228
17229 dst = operands[0];
17230 src = operands[1];
17231
17232 /* If the destination is memory, and we do not have matching source
17233 operands, do things in registers. */
17234 matching_memory = 0;
17235 if (MEM_P (dst))
17236 {
17237 if (rtx_equal_p (dst, src))
17238 matching_memory = 1;
17239 else
17240 dst = gen_reg_rtx (mode);
17241 }
17242
17243 /* When source operand is memory, destination must match. */
17244 if (MEM_P (src) && !matching_memory)
17245 src = force_reg (mode, src);
17246
17247 /* Emit the instruction. */
17248
17249 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
17250 if (reload_in_progress || code == NOT)
17251 {
17252 /* Reload doesn't know about the flags register, and doesn't know that
17253 it doesn't want to clobber it. */
17254 gcc_assert (code == NOT);
17255 emit_insn (op);
17256 }
17257 else
17258 {
17259 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17260 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17261 }
17262
17263 /* Fix up the destination if needed. */
17264 if (dst != operands[0])
17265 emit_move_insn (operands[0], dst);
17266 }
17267
17268 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
17269 divisor are within the range [0-255]. */
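/* (Sketch of the strategy implemented below: OR dividend and divisor
   together and test the result against -0x100; if no bit above bit 7 is
   set, both values fit in 8 bits and a single 8bit unsigned divide
   suffices, which on x86 leaves the quotient in AL and the remainder
   in AH.)  */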
17270
17271 void
17272 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
17273 bool signed_p)
17274 {
17275 rtx end_label, qimode_label;
17276 rtx insn, div, mod;
17277 rtx scratch, tmp0, tmp1, tmp2;
17278 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
17279 rtx (*gen_zero_extend) (rtx, rtx);
17280 rtx (*gen_test_ccno_1) (rtx, rtx);
17281
17282 switch (mode)
17283 {
17284 case SImode:
17285 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
17286 gen_test_ccno_1 = gen_testsi_ccno_1;
17287 gen_zero_extend = gen_zero_extendqisi2;
17288 break;
17289 case DImode:
17290 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
17291 gen_test_ccno_1 = gen_testdi_ccno_1;
17292 gen_zero_extend = gen_zero_extendqidi2;
17293 break;
17294 default:
17295 gcc_unreachable ();
17296 }
17297
17298 end_label = gen_label_rtx ();
17299 qimode_label = gen_label_rtx ();
17300
17301 scratch = gen_reg_rtx (mode);
17302
17303 /* Use 8bit unsigned divmod if dividend and divisor are within
17304 the range [0-255]. */
17305 emit_move_insn (scratch, operands[2]);
17306 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
17307 scratch, 1, OPTAB_DIRECT);
17308 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
17309 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
17310 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
17311 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
17312 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
17313 pc_rtx);
17314 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
17315 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17316 JUMP_LABEL (insn) = qimode_label;
17317
17318 /* Generate the original signed/unsigned divmod. */
17319 div = gen_divmod4_1 (operands[0], operands[1],
17320 operands[2], operands[3]);
17321 emit_insn (div);
17322
17323 /* Branch to the end. */
17324 emit_jump_insn (gen_jump (end_label));
17325 emit_barrier ();
17326
17327 /* Generate 8bit unsigned divide. */
17328 emit_label (qimode_label);
17329 /* Don't use operands[0] for result of 8bit divide since not all
17330 registers support QImode ZERO_EXTRACT. */
17331 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
17332 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
17333 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
17334 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
17335
17336 if (signed_p)
17337 {
17338 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
17339 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
17340 }
17341 else
17342 {
17343 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
17344 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
17345 }
17346
17347 /* Extract remainder from AH. */
17348 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
17349 if (REG_P (operands[1]))
17350 insn = emit_move_insn (operands[1], tmp1);
17351 else
17352 {
17353 /* Need a new scratch register since the old one has result
17354 of 8bit divide. */
17355 scratch = gen_reg_rtx (mode);
17356 emit_move_insn (scratch, tmp1);
17357 insn = emit_move_insn (operands[1], scratch);
17358 }
17359 set_unique_reg_note (insn, REG_EQUAL, mod);
17360
17361 /* Zero extend quotient from AL. */
17362 tmp1 = gen_lowpart (QImode, tmp0);
17363 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
17364 set_unique_reg_note (insn, REG_EQUAL, div);
17365
17366 emit_label (end_label);
17367 }
17368
17369 /* Whether it is OK to emit CFI directives when emitting asm code. */
17370
17371 bool
17372 ix86_emit_cfi ()
17373 {
17374 return dwarf2out_do_cfi_asm ();
17375 }
17376
17377 #define LEA_MAX_STALL (3)
17378 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
17379
17380 /* Increase given DISTANCE in half-cycles according to
17381 dependencies between PREV and NEXT instructions.
17382 Add 1 half-cycle if there is no dependency and
17383 go to the next cycle if there is some dependency. */
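/* (Illustration of the accounting below: with DISTANCE == 3 and a
   dependency between PREV and NEXT the result is 3 + (3 & 1) + 2 == 6,
   i.e. rounded up to the start of the next full cycle; with no dependency
   it is simply 3 + 1 == 4.)  */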
17384
17385 static unsigned int
17386 increase_distance (rtx prev, rtx next, unsigned int distance)
17387 {
17388 df_ref *use_rec;
17389 df_ref *def_rec;
17390
17391 if (!prev || !next)
17392 return distance + (distance & 1) + 2;
17393
17394 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
17395 return distance + 1;
17396
17397 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
17398 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
17399 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
17400 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
17401 return distance + (distance & 1) + 2;
17402
17403 return distance + 1;
17404 }
17405
17406 /* Function checks if instruction INSN defines register number
17407 REGNO1 or REGNO2. */
17408
17409 static bool
17410 insn_defines_reg (unsigned int regno1, unsigned int regno2,
17411 rtx insn)
17412 {
17413 df_ref *def_rec;
17414
17415 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
17416 if (DF_REF_REG_DEF_P (*def_rec)
17417 && !DF_REF_IS_ARTIFICIAL (*def_rec)
17418 && (regno1 == DF_REF_REGNO (*def_rec)
17419 || regno2 == DF_REF_REGNO (*def_rec)))
17420 {
17421 return true;
17422 }
17423
17424 return false;
17425 }
17426
17427 /* Function checks if instruction INSN uses register number
17428 REGNO as a part of address expression. */
17429
17430 static bool
17431 insn_uses_reg_mem (unsigned int regno, rtx insn)
17432 {
17433 df_ref *use_rec;
17434
17435 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
17436 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
17437 return true;
17438
17439 return false;
17440 }
17441
17442 /* Search backward for non-agu definition of register number REGNO1
17443 or register number REGNO2 in basic block starting from instruction
17444 START up to head of basic block or instruction INSN.
17445
17446 Function puts true value into *FOUND var if definition was found
17447 and false otherwise.
17448
17449 Distance in half-cycles between START and found instruction or head
17450 of BB is added to DISTANCE and returned. */
17451
17452 static int
17453 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
17454 rtx insn, int distance,
17455 rtx start, bool *found)
17456 {
17457 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17458 rtx prev = start;
17459 rtx next = NULL;
17460
17461 *found = false;
17462
17463 while (prev
17464 && prev != insn
17465 && distance < LEA_SEARCH_THRESHOLD)
17466 {
17467 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
17468 {
17469 distance = increase_distance (prev, next, distance);
17470 if (insn_defines_reg (regno1, regno2, prev))
17471 {
17472 if (recog_memoized (prev) < 0
17473 || get_attr_type (prev) != TYPE_LEA)
17474 {
17475 *found = true;
17476 return distance;
17477 }
17478 }
17479
17480 next = prev;
17481 }
17482 if (prev == BB_HEAD (bb))
17483 break;
17484
17485 prev = PREV_INSN (prev);
17486 }
17487
17488 return distance;
17489 }
17490
17491 /* Search backward for non-agu definition of register number REGNO1
17492 or register number REGNO2 in INSN's basic block until
17493 1. Pass LEA_SEARCH_THRESHOLD instructions, or
17494 2. Reach neighbour BBs boundary, or
17495 3. Reach agu definition.
17496 Returns the distance between the non-agu definition point and INSN.
17497 If no definition point, returns -1. */
17498
17499 static int
17500 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
17501 rtx insn)
17502 {
17503 basic_block bb = BLOCK_FOR_INSN (insn);
17504 int distance = 0;
17505 bool found = false;
17506
17507 if (insn != BB_HEAD (bb))
17508 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
17509 distance, PREV_INSN (insn),
17510 &found);
17511
17512 if (!found && distance < LEA_SEARCH_THRESHOLD)
17513 {
17514 edge e;
17515 edge_iterator ei;
17516 bool simple_loop = false;
17517
17518 FOR_EACH_EDGE (e, ei, bb->preds)
17519 if (e->src == bb)
17520 {
17521 simple_loop = true;
17522 break;
17523 }
17524
17525 if (simple_loop)
17526 distance = distance_non_agu_define_in_bb (regno1, regno2,
17527 insn, distance,
17528 BB_END (bb), &found);
17529 else
17530 {
17531 int shortest_dist = -1;
17532 bool found_in_bb = false;
17533
17534 FOR_EACH_EDGE (e, ei, bb->preds)
17535 {
17536 int bb_dist
17537 = distance_non_agu_define_in_bb (regno1, regno2,
17538 insn, distance,
17539 BB_END (e->src),
17540 &found_in_bb);
17541 if (found_in_bb)
17542 {
17543 if (shortest_dist < 0)
17544 shortest_dist = bb_dist;
17545 else if (bb_dist > 0)
17546 shortest_dist = MIN (bb_dist, shortest_dist);
17547
17548 found = true;
17549 }
17550 }
17551
17552 distance = shortest_dist;
17553 }
17554 }
17555
17556 /* get_attr_type may modify recog data. We want to make sure
17557 that recog data is valid for instruction INSN, on which
17558 distance_non_agu_define is called. INSN is unchanged here. */
17559 extract_insn_cached (insn);
17560
17561 if (!found)
17562 return -1;
17563
17564 return distance >> 1;
17565 }
17566
17567 /* Return the distance in half-cycles between INSN and the next
17568 insn that uses register number REGNO in a memory address, added
17569 to DISTANCE. Return -1 if REGNO is set.
17570
17571 Put true value into *FOUND if register usage was found and
17572 false otherwise.
17573 Put true value into *REDEFINED if register redefinition was
17574 found and false otherwise. */
17575
17576 static int
17577 distance_agu_use_in_bb (unsigned int regno,
17578 rtx insn, int distance, rtx start,
17579 bool *found, bool *redefined)
17580 {
17581 basic_block bb = NULL;
17582 rtx next = start;
17583 rtx prev = NULL;
17584
17585 *found = false;
17586 *redefined = false;
17587
17588 if (start != NULL_RTX)
17589 {
17590 bb = BLOCK_FOR_INSN (start);
17591 if (start != BB_HEAD (bb))
17592 /* If insn and start belong to the same bb, set prev to insn,
17593 so the call to increase_distance will increase the distance
17594 between insns by 1. */
17595 prev = insn;
17596 }
17597
17598 while (next
17599 && next != insn
17600 && distance < LEA_SEARCH_THRESHOLD)
17601 {
17602 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17603 {
17604 distance = increase_distance(prev, next, distance);
17605 if (insn_uses_reg_mem (regno, next))
17606 {
17607 /* Return DISTANCE if OP0 is used in memory
17608 address in NEXT. */
17609 *found = true;
17610 return distance;
17611 }
17612
17613 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17614 {
17615 /* Return -1 if OP0 is set in NEXT. */
17616 *redefined = true;
17617 return -1;
17618 }
17619
17620 prev = next;
17621 }
17622
17623 if (next == BB_END (bb))
17624 break;
17625
17626 next = NEXT_INSN (next);
17627 }
17628
17629 return distance;
17630 }
17631
17632 /* Return the distance between INSN and the next insn that uses
17633 register number REGNO0 in a memory address. Return -1 if no such
17634 use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set. */
17635
17636 static int
17637 distance_agu_use (unsigned int regno0, rtx insn)
17638 {
17639 basic_block bb = BLOCK_FOR_INSN (insn);
17640 int distance = 0;
17641 bool found = false;
17642 bool redefined = false;
17643
17644 if (insn != BB_END (bb))
17645 distance = distance_agu_use_in_bb (regno0, insn, distance,
17646 NEXT_INSN (insn),
17647 &found, &redefined);
17648
17649 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17650 {
17651 edge e;
17652 edge_iterator ei;
17653 bool simple_loop = false;
17654
17655 FOR_EACH_EDGE (e, ei, bb->succs)
17656 if (e->dest == bb)
17657 {
17658 simple_loop = true;
17659 break;
17660 }
17661
17662 if (simple_loop)
17663 distance = distance_agu_use_in_bb (regno0, insn,
17664 distance, BB_HEAD (bb),
17665 &found, &redefined);
17666 else
17667 {
17668 int shortest_dist = -1;
17669 bool found_in_bb = false;
17670 bool redefined_in_bb = false;
17671
17672 FOR_EACH_EDGE (e, ei, bb->succs)
17673 {
17674 int bb_dist
17675 = distance_agu_use_in_bb (regno0, insn,
17676 distance, BB_HEAD (e->dest),
17677 &found_in_bb, &redefined_in_bb);
17678 if (found_in_bb)
17679 {
17680 if (shortest_dist < 0)
17681 shortest_dist = bb_dist;
17682 else if (bb_dist > 0)
17683 shortest_dist = MIN (bb_dist, shortest_dist);
17684
17685 found = true;
17686 }
17687 }
17688
17689 distance = shortest_dist;
17690 }
17691 }
17692
17693 if (!found || redefined)
17694 return -1;
17695
17696 return distance >> 1;
17697 }
17698
17699 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17700 there is a dilemma of choosing LEA or ADD.
17701 Negative value: ADD is preferred over LEA
17702 Zero: Neutral
17703 Positive value: LEA is preferred over ADD. */
17704 #define IX86_LEA_PRIORITY 0
17705
17706 /* Return true if using lea INSN has a performance advantage over an
17707 equivalent sequence of simpler instructions, where that sequence has
17708 SPLIT_COST cycles higher latency than the lea. */
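/* (Background, roughly: on the in-order Atom-class cores that TARGET_OPT_AGU
   targets, lea executes in the address-generation unit, and a value passed
   between the ALU and the AGU with too little slack stalls; the distance
   heuristics below estimate whether such a stall would actually be
   visible.)  */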
17709
17710 static bool
17711 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17712 unsigned int regno2, int split_cost, bool has_scale)
17713 {
17714 int dist_define, dist_use;
17715
17716 /* For Silvermont, if a 2-source or 3-source LEA is used for a
17717 non-destructive destination, or because the ability to use a
17718 SCALE is wanted, the use of LEA is justified. */
17719 if (ix86_tune == PROCESSOR_SLM)
17720 {
17721 if (has_scale)
17722 return true;
17723 if (split_cost < 1)
17724 return false;
17725 if (regno0 == regno1 || regno0 == regno2)
17726 return false;
17727 return true;
17728 }
17729
17730 dist_define = distance_non_agu_define (regno1, regno2, insn);
17731 dist_use = distance_agu_use (regno0, insn);
17732
17733 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17734 {
17735 /* If there is no non-AGU operand definition, no AGU
17736 operand usage and the split cost is 0, then the lea
17737 and non-lea variants have the same priority. Currently
17738 we prefer lea for 64 bit code and non-lea for 32 bit
17739 code. */
17740 if (dist_use < 0 && split_cost == 0)
17741 return TARGET_64BIT || IX86_LEA_PRIORITY;
17742 else
17743 return true;
17744 }
17745
17746 /* With a longer definition distance, lea is more preferable.
17747 Here we adjust that distance to take into account the splitting
17748 cost and the lea priority. */
17749 dist_define += split_cost + IX86_LEA_PRIORITY;
17750
17751 /* If there is no use in a memory address then we just check
17752 that split cost exceeds AGU stall. */
17753 if (dist_use < 0)
17754 return dist_define > LEA_MAX_STALL;
17755
17756 /* If this insn has both backward non-agu dependence and forward
17757 agu dependence, the one with short distance takes effect. */
17758 return dist_define >= dist_use;
17759 }
17760
17761 /* Return true if it is legal to clobber flags by INSN and
17762 false otherwise. */
17763
17764 static bool
17765 ix86_ok_to_clobber_flags (rtx insn)
17766 {
17767 basic_block bb = BLOCK_FOR_INSN (insn);
17768 df_ref *use;
17769 bitmap live;
17770
17771 while (insn)
17772 {
17773 if (NONDEBUG_INSN_P (insn))
17774 {
17775 for (use = DF_INSN_USES (insn); *use; use++)
17776 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17777 return false;
17778
17779 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17780 return true;
17781 }
17782
17783 if (insn == BB_END (bb))
17784 break;
17785
17786 insn = NEXT_INSN (insn);
17787 }
17788
17789 live = df_get_live_out(bb);
17790 return !REGNO_REG_SET_P (live, FLAGS_REG);
17791 }
17792
17793 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17794 move and add to avoid AGU stalls. */
17795
17796 bool
17797 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17798 {
17799 unsigned int regno0, regno1, regno2;
17800
17801 /* Check if we need to optimize. */
17802 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17803 return false;
17804
17805 /* Check it is correct to split here. */
17806 if (!ix86_ok_to_clobber_flags(insn))
17807 return false;
17808
17809 regno0 = true_regnum (operands[0]);
17810 regno1 = true_regnum (operands[1]);
17811 regno2 = true_regnum (operands[2]);
17812
17813 /* We need to split only adds with a non-destructive
17814 destination operand. */
17815 if (regno0 == regno1 || regno0 == regno2)
17816 return false;
17817 else
17818 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
17819 }
17820
17821 /* Return true if we should emit lea instruction instead of mov
17822 instruction. */
17823
17824 bool
17825 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17826 {
17827 unsigned int regno0, regno1;
17828
17829 /* Check if we need to optimize. */
17830 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17831 return false;
17832
17833 /* Use lea for reg to reg moves only. */
17834 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17835 return false;
17836
17837 regno0 = true_regnum (operands[0]);
17838 regno1 = true_regnum (operands[1]);
17839
17840 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
17841 }
17842
17843 /* Return true if we need to split lea into a sequence of
17844 instructions to avoid AGU stalls. */
17845
17846 bool
17847 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17848 {
17849 unsigned int regno0, regno1, regno2;
17850 int split_cost;
17851 struct ix86_address parts;
17852 int ok;
17853
17854 /* Check we need to optimize. */
17855 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17856 return false;
17857
17858 /* Check it is correct to split here. */
17859 if (!ix86_ok_to_clobber_flags (insn))
17860 return false;
17861
17862 ok = ix86_decompose_address (operands[1], &parts);
17863 gcc_assert (ok);
17864
17865 /* There should be at least two components in the address. */
17866 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17867 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17868 return false;
17869
17870 /* We should not split into add if a non-legitimate pic
17871 operand is used as the displacement. */
17872 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17873 return false;
17874
17875 regno0 = true_regnum (operands[0]);
17876 regno1 = INVALID_REGNUM;
17877 regno2 = INVALID_REGNUM;
17878
17879 if (parts.base)
17880 regno1 = true_regnum (parts.base);
17881 if (parts.index)
17882 regno2 = true_regnum (parts.index);
17883
17884 split_cost = 0;
17885
17886 /* Compute how many cycles we will add to the execution time
17887 if we split the lea into a sequence of instructions. */
17888 if (parts.base || parts.index)
17889 {
17890 /* Have to use a mov instruction if the non-destructive
17891 destination form is used. */
17892 if (regno1 != regno0 && regno2 != regno0)
17893 split_cost += 1;
17894
17895 /* Have to add index to base if both exist. */
17896 if (parts.base && parts.index)
17897 split_cost += 1;
17898
17899 /* Have to use shift and adds if scale is 2 or greater. */
17900 if (parts.scale > 1)
17901 {
17902 if (regno0 != regno1)
17903 split_cost += 1;
17904 else if (regno2 == regno0)
17905 split_cost += 4;
17906 else
17907 split_cost += parts.scale;
17908 }
17909
17910 /* Have to use an add instruction with an immediate if
17911 disp is nonzero. */
17912 if (parts.disp && parts.disp != const0_rtx)
17913 split_cost += 1;
17914
17915 /* Subtract the price of lea. */
17916 split_cost -= 1;
17917 }
17918
17919 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
17920 parts.scale > 1);
17921 }
17922
17923 /* Emit x86 binary operation CODE in mode MODE, where the first operand
17924 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
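/* For example, ix86_emit_binop (PLUS, SImode, dst, src) emits roughly
   (parallel [(set dst (plus:SI dst src))
              (clobber (reg:CC FLAGS_REG))]). */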
17925
17926 static void
17927 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17928 rtx dst, rtx src)
17929 {
17930 rtx op, clob;
17931
17932 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17933 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17934
17935 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17936 }
17937
17938 /* Return true if the definition of regno1 is nearer to INSN than that of regno2. */
17939
17940 static bool
17941 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17942 {
17943 rtx prev = insn;
17944 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17945
17946 if (insn == start)
17947 return false;
17948 while (prev && prev != start)
17949 {
17950 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17951 {
17952 prev = PREV_INSN (prev);
17953 continue;
17954 }
17955 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17956 return true;
17957 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17958 return false;
17959 prev = PREV_INSN (prev);
17960 }
17961
17962 /* None of the regs is defined in the bb. */
17963 return false;
17964 }
17965
17966 /* Split a lea instruction into a sequence of instructions
17967 which are executed on the ALU to avoid AGU stalls.
17968 It is assumed that it is allowed to clobber the flags register
17969 at the lea position. */
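/* For instance, on a 64-bit target a lea such as
     lea 0x4(%rbx,%rcx,4), %rax
   may be split into
     mov %rcx, %rax
     shl $2, %rax
     add %rbx, %rax
     add $4, %rax
   so that the whole computation runs on the ALU. */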
17970
17971 void
17972 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17973 {
17974 unsigned int regno0, regno1, regno2;
17975 struct ix86_address parts;
17976 rtx target, tmp;
17977 int ok, adds;
17978
17979 ok = ix86_decompose_address (operands[1], &parts);
17980 gcc_assert (ok);
17981
17982 target = gen_lowpart (mode, operands[0]);
17983
17984 regno0 = true_regnum (target);
17985 regno1 = INVALID_REGNUM;
17986 regno2 = INVALID_REGNUM;
17987
17988 if (parts.base)
17989 {
17990 parts.base = gen_lowpart (mode, parts.base);
17991 regno1 = true_regnum (parts.base);
17992 }
17993
17994 if (parts.index)
17995 {
17996 parts.index = gen_lowpart (mode, parts.index);
17997 regno2 = true_regnum (parts.index);
17998 }
17999
18000 if (parts.disp)
18001 parts.disp = gen_lowpart (mode, parts.disp);
18002
18003 if (parts.scale > 1)
18004 {
18005 /* Case r1 = r1 + ... */
18006 if (regno1 == regno0)
18007 {
18008 /* If we have the case r1 = r1 + C * r1 then we
18009 would have to use multiplication, which is very
18010 expensive. Assume the cost model is wrong if we
18011 reach such a case here. */
18012 gcc_assert (regno2 != regno0);
18013
18014 for (adds = parts.scale; adds > 0; adds--)
18015 ix86_emit_binop (PLUS, mode, target, parts.index);
18016 }
18017 else
18018 {
18019 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
18020 if (regno0 != regno2)
18021 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
18022
18023 /* Use shift for scaling. */
18024 ix86_emit_binop (ASHIFT, mode, target,
18025 GEN_INT (exact_log2 (parts.scale)));
18026
18027 if (parts.base)
18028 ix86_emit_binop (PLUS, mode, target, parts.base);
18029
18030 if (parts.disp && parts.disp != const0_rtx)
18031 ix86_emit_binop (PLUS, mode, target, parts.disp);
18032 }
18033 }
18034 else if (!parts.base && !parts.index)
18035 {
18036 gcc_assert (parts.disp);
18037 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
18038 }
18039 else
18040 {
18041 if (!parts.base)
18042 {
18043 if (regno0 != regno2)
18044 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
18045 }
18046 else if (!parts.index)
18047 {
18048 if (regno0 != regno1)
18049 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
18050 }
18051 else
18052 {
18053 if (regno0 == regno1)
18054 tmp = parts.index;
18055 else if (regno0 == regno2)
18056 tmp = parts.base;
18057 else
18058 {
18059 rtx tmp1;
18060
18061 /* Find better operand for SET instruction, depending
18062 on which definition is farther from the insn. */
18063 if (find_nearest_reg_def (insn, regno1, regno2))
18064 tmp = parts.index, tmp1 = parts.base;
18065 else
18066 tmp = parts.base, tmp1 = parts.index;
18067
18068 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
18069
18070 if (parts.disp && parts.disp != const0_rtx)
18071 ix86_emit_binop (PLUS, mode, target, parts.disp);
18072
18073 ix86_emit_binop (PLUS, mode, target, tmp1);
18074 return;
18075 }
18076
18077 ix86_emit_binop (PLUS, mode, target, tmp);
18078 }
18079
18080 if (parts.disp && parts.disp != const0_rtx)
18081 ix86_emit_binop (PLUS, mode, target, parts.disp);
18082 }
18083 }
18084
18085 /* Return true if it is ok to optimize an ADD operation to a LEA
18086 operation to avoid flag register consumption. For most processors,
18087 ADD is faster than LEA. For processors like ATOM, if the
18088 destination register of the LEA holds an actual address which will be
18089 used soon, LEA is better; otherwise ADD is better. */
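/* For example,
     addl %ebx, %eax
   can be rewritten as
     leal (%eax,%ebx), %eax
   which computes the same sum without clobbering the flags register. */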
18090
18091 bool
18092 ix86_lea_for_add_ok (rtx insn, rtx operands[])
18093 {
18094 unsigned int regno0 = true_regnum (operands[0]);
18095 unsigned int regno1 = true_regnum (operands[1]);
18096 unsigned int regno2 = true_regnum (operands[2]);
18097
18098 /* If a = b + c, (a != b && a != c), we must use the lea form. */
18099 if (regno0 != regno1 && regno0 != regno2)
18100 return true;
18101
18102 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
18103 return false;
18104
18105 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
18106 }
18107
18108 /* Return true if destination reg of SET_BODY is shift count of
18109 USE_BODY. */
18110
18111 static bool
18112 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
18113 {
18114 rtx set_dest;
18115 rtx shift_rtx;
18116 int i;
18117
18118 /* Retrieve destination of SET_BODY. */
18119 switch (GET_CODE (set_body))
18120 {
18121 case SET:
18122 set_dest = SET_DEST (set_body);
18123 if (!set_dest || !REG_P (set_dest))
18124 return false;
18125 break;
18126 case PARALLEL:
18127 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
18128 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
18129 use_body))
18130 return true;
18131 default:
18132 return false;
18133 break;
18134 }
18135
18136 /* Retrieve shift count of USE_BODY. */
18137 switch (GET_CODE (use_body))
18138 {
18139 case SET:
18140 shift_rtx = XEXP (use_body, 1);
18141 break;
18142 case PARALLEL:
18143 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
18144 if (ix86_dep_by_shift_count_body (set_body,
18145 XVECEXP (use_body, 0, i)))
18146 return true;
18147 default:
18148 return false;
18149 break;
18150 }
18151
18152 if (shift_rtx
18153 && (GET_CODE (shift_rtx) == ASHIFT
18154 || GET_CODE (shift_rtx) == LSHIFTRT
18155 || GET_CODE (shift_rtx) == ASHIFTRT
18156 || GET_CODE (shift_rtx) == ROTATE
18157 || GET_CODE (shift_rtx) == ROTATERT))
18158 {
18159 rtx shift_count = XEXP (shift_rtx, 1);
18160
18161 /* Return true if shift count is dest of SET_BODY. */
18162 if (REG_P (shift_count))
18163 {
18164 /* Add this check since it can be invoked before register
18165 allocation in the pre-reload scheduler. */
18166 if (reload_completed
18167 && true_regnum (set_dest) == true_regnum (shift_count))
18168 return true;
18169 else if (REGNO (set_dest) == REGNO (shift_count))
18170 return true;
18171 }
18172 }
18173
18174 return false;
18175 }
18176
18177 /* Return true if destination reg of SET_INSN is shift count of
18178 USE_INSN. */
18179
18180 bool
18181 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
18182 {
18183 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
18184 PATTERN (use_insn));
18185 }
18186
18187 /* Return TRUE or FALSE depending on whether the unary operator meets the
18188 appropriate constraints. */
18189
18190 bool
18191 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
18192 enum machine_mode mode ATTRIBUTE_UNUSED,
18193 rtx operands[2])
18194 {
18195 /* If one of operands is memory, source and destination must match. */
18196 if ((MEM_P (operands[0])
18197 || MEM_P (operands[1]))
18198 && ! rtx_equal_p (operands[0], operands[1]))
18199 return false;
18200 return true;
18201 }
18202
18203 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
18204 are ok, keeping in mind the possible movddup alternative. */
18205
18206 bool
18207 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
18208 {
18209 if (MEM_P (operands[0]))
18210 return rtx_equal_p (operands[0], operands[1 + high]);
18211 if (MEM_P (operands[1]) && MEM_P (operands[2]))
18212 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
18213 return true;
18214 }
18215
18216 /* Post-reload splitter for converting an SF or DFmode value in an
18217 SSE register into an unsigned SImode. */
18218
18219 void
18220 ix86_split_convert_uns_si_sse (rtx operands[])
18221 {
18222 enum machine_mode vecmode;
18223 rtx value, large, zero_or_two31, input, two31, x;
18224
18225 large = operands[1];
18226 zero_or_two31 = operands[2];
18227 input = operands[3];
18228 two31 = operands[4];
18229 vecmode = GET_MODE (large);
18230 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
18231
18232 /* Load up the value into the low element. We must ensure that the other
18233 elements are valid floats -- zero is the easiest such value. */
18234 if (MEM_P (input))
18235 {
18236 if (vecmode == V4SFmode)
18237 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
18238 else
18239 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
18240 }
18241 else
18242 {
18243 input = gen_rtx_REG (vecmode, REGNO (input));
18244 emit_move_insn (value, CONST0_RTX (vecmode));
18245 if (vecmode == V4SFmode)
18246 emit_insn (gen_sse_movss (value, value, input));
18247 else
18248 emit_insn (gen_sse2_movsd (value, value, input));
18249 }
18250
18251 emit_move_insn (large, two31);
18252 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
18253
18254 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
18255 emit_insn (gen_rtx_SET (VOIDmode, large, x));
18256
18257 x = gen_rtx_AND (vecmode, zero_or_two31, large);
18258 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
18259
18260 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
18261 emit_insn (gen_rtx_SET (VOIDmode, value, x));
18262
18263 large = gen_rtx_REG (V4SImode, REGNO (large));
18264 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
18265
18266 x = gen_rtx_REG (V4SImode, REGNO (value));
18267 if (vecmode == V4SFmode)
18268 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
18269 else
18270 emit_insn (gen_sse2_cvttpd2dq (x, value));
18271 value = x;
18272
18273 emit_insn (gen_xorv4si3 (value, value, large));
18274 }
18275
18276 /* Convert an unsigned DImode value into a DFmode, using only SSE.
18277 Expects the 64-bit DImode to be supplied in a pair of integral
18278 registers. Requires SSE2; will use SSE3 if available. For x86_32,
18279 -mfpmath=sse, !optimize_size only. */
18280
18281 void
18282 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
18283 {
18284 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
18285 rtx int_xmm, fp_xmm;
18286 rtx biases, exponents;
18287 rtx x;
18288
18289 int_xmm = gen_reg_rtx (V4SImode);
18290 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
18291 emit_insn (gen_movdi_to_sse (int_xmm, input));
18292 else if (TARGET_SSE_SPLIT_REGS)
18293 {
18294 emit_clobber (int_xmm);
18295 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
18296 }
18297 else
18298 {
18299 x = gen_reg_rtx (V2DImode);
18300 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
18301 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
18302 }
18303
18304 x = gen_rtx_CONST_VECTOR (V4SImode,
18305 gen_rtvec (4, GEN_INT (0x43300000UL),
18306 GEN_INT (0x45300000UL),
18307 const0_rtx, const0_rtx));
18308 exponents = validize_mem (force_const_mem (V4SImode, x));
18309
18310 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
18311 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
18312
18313 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
18314 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
18315 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
18316 (0x1.0p84 + double(fp_value_hi_xmm)).
18317 Note these exponents differ by 32. */
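/* As a concrete illustration, for the input 0x0000000100000002 the low
   half yields 0x1.0p52 + 2.0 and the high half yields 0x1.0p84 + 0x1.0p32;
   after the bias subtraction below this leaves 2.0 and 4294967296.0, whose
   sum 4294967298.0 is the expected DFmode result. */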
18318
18319 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
18320
18321 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
18322 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
18323 real_ldexp (&bias_lo_rvt, &dconst1, 52);
18324 real_ldexp (&bias_hi_rvt, &dconst1, 84);
18325 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
18326 x = const_double_from_real_value (bias_hi_rvt, DFmode);
18327 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
18328 biases = validize_mem (force_const_mem (V2DFmode, biases));
18329 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
18330
18331 /* Add the upper and lower DFmode values together. */
18332 if (TARGET_SSE3)
18333 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
18334 else
18335 {
18336 x = copy_to_mode_reg (V2DFmode, fp_xmm);
18337 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
18338 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
18339 }
18340
18341 ix86_expand_vector_extract (false, target, fp_xmm, 0);
18342 }
18343
18344 /* Not used, but eases macroization of patterns. */
18345 void
18346 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
18347 rtx input ATTRIBUTE_UNUSED)
18348 {
18349 gcc_unreachable ();
18350 }
18351
18352 /* Convert an unsigned SImode value into a DFmode. Only currently used
18353 for SSE, but applicable anywhere. */
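/* E.g. for the input 0xfffffffb (4294967291) the PLUS below wraps to
   0x7ffffffb, the signed conversion gives 2147483643.0, and adding 0x1.0p31
   recovers 4294967291.0. */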
18354
18355 void
18356 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
18357 {
18358 REAL_VALUE_TYPE TWO31r;
18359 rtx x, fp;
18360
18361 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
18362 NULL, 1, OPTAB_DIRECT);
18363
18364 fp = gen_reg_rtx (DFmode);
18365 emit_insn (gen_floatsidf2 (fp, x));
18366
18367 real_ldexp (&TWO31r, &dconst1, 31);
18368 x = const_double_from_real_value (TWO31r, DFmode);
18369
18370 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
18371 if (x != target)
18372 emit_move_insn (target, x);
18373 }
18374
18375 /* Convert a signed DImode value into a DFmode. Only used for SSE in
18376 32-bit mode; otherwise we have a direct convert instruction. */
18377
18378 void
18379 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
18380 {
18381 REAL_VALUE_TYPE TWO32r;
18382 rtx fp_lo, fp_hi, x;
18383
18384 fp_lo = gen_reg_rtx (DFmode);
18385 fp_hi = gen_reg_rtx (DFmode);
18386
18387 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
18388
18389 real_ldexp (&TWO32r, &dconst1, 32);
18390 x = const_double_from_real_value (TWO32r, DFmode);
18391 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
18392
18393 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
18394
18395 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
18396 0, OPTAB_DIRECT);
18397 if (x != target)
18398 emit_move_insn (target, x);
18399 }
18400
18401 /* Convert an unsigned SImode value into a SFmode, using only SSE.
18402 For x86_32, -mfpmath=sse, !optimize_size only. */
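/* The value is split into its low and high 16-bit halves, each of which is
   exactly representable in SFmode, so the only rounding happens in the final
   addition; e.g. 0x12345678 is computed as 0x1234 * 65536.0f + 0x5678. */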
18403 void
18404 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
18405 {
18406 REAL_VALUE_TYPE ONE16r;
18407 rtx fp_hi, fp_lo, int_hi, int_lo, x;
18408
18409 real_ldexp (&ONE16r, &dconst1, 16);
18410 x = const_double_from_real_value (ONE16r, SFmode);
18411 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
18412 NULL, 0, OPTAB_DIRECT);
18413 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
18414 NULL, 0, OPTAB_DIRECT);
18415 fp_hi = gen_reg_rtx (SFmode);
18416 fp_lo = gen_reg_rtx (SFmode);
18417 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
18418 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
18419 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
18420 0, OPTAB_DIRECT);
18421 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
18422 0, OPTAB_DIRECT);
18423 if (!rtx_equal_p (target, fp_hi))
18424 emit_move_insn (target, fp_hi);
18425 }
18426
18427 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
18428 a vector of unsigned ints VAL to a vector of floats TARGET. */
18429
18430 void
18431 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
18432 {
18433 rtx tmp[8];
18434 REAL_VALUE_TYPE TWO16r;
18435 enum machine_mode intmode = GET_MODE (val);
18436 enum machine_mode fltmode = GET_MODE (target);
18437 rtx (*cvt) (rtx, rtx);
18438
18439 if (intmode == V4SImode)
18440 cvt = gen_floatv4siv4sf2;
18441 else
18442 cvt = gen_floatv8siv8sf2;
18443 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
18444 tmp[0] = force_reg (intmode, tmp[0]);
18445 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
18446 OPTAB_DIRECT);
18447 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
18448 NULL_RTX, 1, OPTAB_DIRECT);
18449 tmp[3] = gen_reg_rtx (fltmode);
18450 emit_insn (cvt (tmp[3], tmp[1]));
18451 tmp[4] = gen_reg_rtx (fltmode);
18452 emit_insn (cvt (tmp[4], tmp[2]));
18453 real_ldexp (&TWO16r, &dconst1, 16);
18454 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
18455 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
18456 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
18457 OPTAB_DIRECT);
18458 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
18459 OPTAB_DIRECT);
18460 if (tmp[7] != target)
18461 emit_move_insn (target, tmp[7]);
18462 }
18463
18464 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
18465 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
18466 This is done by doing just signed conversion if < 0x1p31, and otherwise by
18467 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
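/* For instance, the lane value 3000000000.0 is adjusted to
   3000000000.0 - 0x1p31 = 852516352.0, the signed truncation yields
   852516352, and xoring in 0x80000000 restores 3000000000. */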
18468
18469 rtx
18470 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
18471 {
18472 REAL_VALUE_TYPE TWO31r;
18473 rtx two31r, tmp[4];
18474 enum machine_mode mode = GET_MODE (val);
18475 enum machine_mode scalarmode = GET_MODE_INNER (mode);
18476 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
18477 rtx (*cmp) (rtx, rtx, rtx, rtx);
18478 int i;
18479
18480 for (i = 0; i < 3; i++)
18481 tmp[i] = gen_reg_rtx (mode);
18482 real_ldexp (&TWO31r, &dconst1, 31);
18483 two31r = const_double_from_real_value (TWO31r, scalarmode);
18484 two31r = ix86_build_const_vector (mode, 1, two31r);
18485 two31r = force_reg (mode, two31r);
18486 switch (mode)
18487 {
18488 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
18489 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
18490 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
18491 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
18492 default: gcc_unreachable ();
18493 }
18494 tmp[3] = gen_rtx_LE (mode, two31r, val);
18495 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
18496 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
18497 0, OPTAB_DIRECT);
18498 if (intmode == V4SImode || TARGET_AVX2)
18499 *xorp = expand_simple_binop (intmode, ASHIFT,
18500 gen_lowpart (intmode, tmp[0]),
18501 GEN_INT (31), NULL_RTX, 0,
18502 OPTAB_DIRECT);
18503 else
18504 {
18505 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
18506 two31 = ix86_build_const_vector (intmode, 1, two31);
18507 *xorp = expand_simple_binop (intmode, AND,
18508 gen_lowpart (intmode, tmp[0]),
18509 two31, NULL_RTX, 0,
18510 OPTAB_DIRECT);
18511 }
18512 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
18513 0, OPTAB_DIRECT);
18514 }
18515
18516 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
18517 then replicate the value for all elements of the vector
18518 register. */
18519
18520 rtx
18521 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
18522 {
18523 int i, n_elt;
18524 rtvec v;
18525 enum machine_mode scalar_mode;
18526
18527 switch (mode)
18528 {
18529 case V32QImode:
18530 case V16QImode:
18531 case V16HImode:
18532 case V8HImode:
18533 case V8SImode:
18534 case V4SImode:
18535 case V4DImode:
18536 case V2DImode:
18537 gcc_assert (vect);
18538 case V8SFmode:
18539 case V4SFmode:
18540 case V4DFmode:
18541 case V2DFmode:
18542 n_elt = GET_MODE_NUNITS (mode);
18543 v = rtvec_alloc (n_elt);
18544 scalar_mode = GET_MODE_INNER (mode);
18545
18546 RTVEC_ELT (v, 0) = value;
18547
18548 for (i = 1; i < n_elt; ++i)
18549 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18550
18551 return gen_rtx_CONST_VECTOR (mode, v);
18552
18553 default:
18554 gcc_unreachable ();
18555 }
18556 }
18557
18558 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18559 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18560 for an SSE register. If VECT is true, then replicate the mask for
18561 all elements of the vector register. If INVERT is true, then create
18562 a mask excluding the sign bit. */
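/* E.g. for V4SFmode with VECT set this builds the bit pattern
   { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; with INVERT also set
   each element instead holds 0x7fffffff, i.e. every bit except the sign bit. */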
18563
18564 rtx
18565 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18566 {
18567 enum machine_mode vec_mode, imode;
18568 HOST_WIDE_INT hi, lo;
18569 int shift = 63;
18570 rtx v;
18571 rtx mask;
18572
18573 /* Find the sign bit, sign extended to 2*HWI. */
18574 switch (mode)
18575 {
18576 case V8SImode:
18577 case V4SImode:
18578 case V8SFmode:
18579 case V4SFmode:
18580 vec_mode = mode;
18581 mode = GET_MODE_INNER (mode);
18582 imode = SImode;
18583 lo = 0x80000000, hi = lo < 0;
18584 break;
18585
18586 case V4DImode:
18587 case V2DImode:
18588 case V4DFmode:
18589 case V2DFmode:
18590 vec_mode = mode;
18591 mode = GET_MODE_INNER (mode);
18592 imode = DImode;
18593 if (HOST_BITS_PER_WIDE_INT >= 64)
18594 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18595 else
18596 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18597 break;
18598
18599 case TImode:
18600 case TFmode:
18601 vec_mode = VOIDmode;
18602 if (HOST_BITS_PER_WIDE_INT >= 64)
18603 {
18604 imode = TImode;
18605 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18606 }
18607 else
18608 {
18609 rtvec vec;
18610
18611 imode = DImode;
18612 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18613
18614 if (invert)
18615 {
18616 lo = ~lo, hi = ~hi;
18617 v = constm1_rtx;
18618 }
18619 else
18620 v = const0_rtx;
18621
18622 mask = immed_double_const (lo, hi, imode);
18623
18624 vec = gen_rtvec (2, v, mask);
18625 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18626 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18627
18628 return v;
18629 }
18630 break;
18631
18632 default:
18633 gcc_unreachable ();
18634 }
18635
18636 if (invert)
18637 lo = ~lo, hi = ~hi;
18638
18639 /* Force this value into the low part of a fp vector constant. */
18640 mask = immed_double_const (lo, hi, imode);
18641 mask = gen_lowpart (mode, mask);
18642
18643 if (vec_mode == VOIDmode)
18644 return force_reg (mode, mask);
18645
18646 v = ix86_build_const_vector (vec_mode, vect, mask);
18647 return force_reg (vec_mode, v);
18648 }
18649
18650 /* Generate code for floating point ABS or NEG. */
18651
18652 void
18653 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18654 rtx operands[])
18655 {
18656 rtx mask, set, dst, src;
18657 bool use_sse = false;
18658 bool vector_mode = VECTOR_MODE_P (mode);
18659 enum machine_mode vmode = mode;
18660
18661 if (vector_mode)
18662 use_sse = true;
18663 else if (mode == TFmode)
18664 use_sse = true;
18665 else if (TARGET_SSE_MATH)
18666 {
18667 use_sse = SSE_FLOAT_MODE_P (mode);
18668 if (mode == SFmode)
18669 vmode = V4SFmode;
18670 else if (mode == DFmode)
18671 vmode = V2DFmode;
18672 }
18673
18674 /* NEG and ABS performed with SSE use bitwise mask operations.
18675 Create the appropriate mask now. */
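/* NEG is then implemented as an XOR with the sign-bit mask, while ABS
   is an AND with the inverted mask that clears the sign bit. */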
18676 if (use_sse)
18677 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18678 else
18679 mask = NULL_RTX;
18680
18681 dst = operands[0];
18682 src = operands[1];
18683
18684 set = gen_rtx_fmt_e (code, mode, src);
18685 set = gen_rtx_SET (VOIDmode, dst, set);
18686
18687 if (mask)
18688 {
18689 rtx use, clob;
18690 rtvec par;
18691
18692 use = gen_rtx_USE (VOIDmode, mask);
18693 if (vector_mode)
18694 par = gen_rtvec (2, set, use);
18695 else
18696 {
18697 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18698 par = gen_rtvec (3, set, use, clob);
18699 }
18700 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18701 }
18702 else
18703 emit_insn (set);
18704 }
18705
18706 /* Expand a copysign operation. Special case operand 0 being a constant. */
18707
18708 void
18709 ix86_expand_copysign (rtx operands[])
18710 {
18711 enum machine_mode mode, vmode;
18712 rtx dest, op0, op1, mask, nmask;
18713
18714 dest = operands[0];
18715 op0 = operands[1];
18716 op1 = operands[2];
18717
18718 mode = GET_MODE (dest);
18719
18720 if (mode == SFmode)
18721 vmode = V4SFmode;
18722 else if (mode == DFmode)
18723 vmode = V2DFmode;
18724 else
18725 vmode = mode;
18726
18727 if (GET_CODE (op0) == CONST_DOUBLE)
18728 {
18729 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18730
18731 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18732 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18733
18734 if (mode == SFmode || mode == DFmode)
18735 {
18736 if (op0 == CONST0_RTX (mode))
18737 op0 = CONST0_RTX (vmode);
18738 else
18739 {
18740 rtx v = ix86_build_const_vector (vmode, false, op0);
18741
18742 op0 = force_reg (vmode, v);
18743 }
18744 }
18745 else if (op0 != CONST0_RTX (mode))
18746 op0 = force_reg (mode, op0);
18747
18748 mask = ix86_build_signbit_mask (vmode, 0, 0);
18749
18750 if (mode == SFmode)
18751 copysign_insn = gen_copysignsf3_const;
18752 else if (mode == DFmode)
18753 copysign_insn = gen_copysigndf3_const;
18754 else
18755 copysign_insn = gen_copysigntf3_const;
18756
18757 emit_insn (copysign_insn (dest, op0, op1, mask));
18758 }
18759 else
18760 {
18761 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18762
18763 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18764 mask = ix86_build_signbit_mask (vmode, 0, 0);
18765
18766 if (mode == SFmode)
18767 copysign_insn = gen_copysignsf3_var;
18768 else if (mode == DFmode)
18769 copysign_insn = gen_copysigndf3_var;
18770 else
18771 copysign_insn = gen_copysigntf3_var;
18772
18773 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18774 }
18775 }
18776
18777 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18778 be a constant, and so has already been expanded into a vector constant. */
18779
18780 void
18781 ix86_split_copysign_const (rtx operands[])
18782 {
18783 enum machine_mode mode, vmode;
18784 rtx dest, op0, mask, x;
18785
18786 dest = operands[0];
18787 op0 = operands[1];
18788 mask = operands[3];
18789
18790 mode = GET_MODE (dest);
18791 vmode = GET_MODE (mask);
18792
18793 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18794 x = gen_rtx_AND (vmode, dest, mask);
18795 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18796
18797 if (op0 != CONST0_RTX (vmode))
18798 {
18799 x = gen_rtx_IOR (vmode, dest, op0);
18800 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18801 }
18802 }
18803
18804 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18805 so we have to do two masks. */
18806
18807 void
18808 ix86_split_copysign_var (rtx operands[])
18809 {
18810 enum machine_mode mode, vmode;
18811 rtx dest, scratch, op0, op1, mask, nmask, x;
18812
18813 dest = operands[0];
18814 scratch = operands[1];
18815 op0 = operands[2];
18816 op1 = operands[3];
18817 nmask = operands[4];
18818 mask = operands[5];
18819
18820 mode = GET_MODE (dest);
18821 vmode = GET_MODE (mask);
18822
18823 if (rtx_equal_p (op0, op1))
18824 {
18825 /* Shouldn't happen often (it's useless, obviously), but when it does
18826 we'd generate incorrect code if we continue below. */
18827 emit_move_insn (dest, op0);
18828 return;
18829 }
18830
18831 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18832 {
18833 gcc_assert (REGNO (op1) == REGNO (scratch));
18834
18835 x = gen_rtx_AND (vmode, scratch, mask);
18836 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18837
18838 dest = mask;
18839 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18840 x = gen_rtx_NOT (vmode, dest);
18841 x = gen_rtx_AND (vmode, x, op0);
18842 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18843 }
18844 else
18845 {
18846 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18847 {
18848 x = gen_rtx_AND (vmode, scratch, mask);
18849 }
18850 else /* alternative 2,4 */
18851 {
18852 gcc_assert (REGNO (mask) == REGNO (scratch));
18853 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18854 x = gen_rtx_AND (vmode, scratch, op1);
18855 }
18856 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18857
18858 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18859 {
18860 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18861 x = gen_rtx_AND (vmode, dest, nmask);
18862 }
18863 else /* alternative 3,4 */
18864 {
18865 gcc_assert (REGNO (nmask) == REGNO (dest));
18866 dest = nmask;
18867 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18868 x = gen_rtx_AND (vmode, dest, op0);
18869 }
18870 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18871 }
18872
18873 x = gen_rtx_IOR (vmode, dest, scratch);
18874 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18875 }
18876
18877 /* Return TRUE or FALSE depending on whether the first SET in INSN
18878 has source and destination with matching CC modes, and that the
18879 CC mode is at least as constrained as REQ_MODE. */
18880
18881 bool
18882 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18883 {
18884 rtx set;
18885 enum machine_mode set_mode;
18886
18887 set = PATTERN (insn);
18888 if (GET_CODE (set) == PARALLEL)
18889 set = XVECEXP (set, 0, 0);
18890 gcc_assert (GET_CODE (set) == SET);
18891 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18892
18893 set_mode = GET_MODE (SET_DEST (set));
18894 switch (set_mode)
18895 {
18896 case CCNOmode:
18897 if (req_mode != CCNOmode
18898 && (req_mode != CCmode
18899 || XEXP (SET_SRC (set), 1) != const0_rtx))
18900 return false;
18901 break;
18902 case CCmode:
18903 if (req_mode == CCGCmode)
18904 return false;
18905 /* FALLTHRU */
18906 case CCGCmode:
18907 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18908 return false;
18909 /* FALLTHRU */
18910 case CCGOCmode:
18911 if (req_mode == CCZmode)
18912 return false;
18913 /* FALLTHRU */
18914 case CCZmode:
18915 break;
18916
18917 case CCAmode:
18918 case CCCmode:
18919 case CCOmode:
18920 case CCSmode:
18921 if (set_mode != req_mode)
18922 return false;
18923 break;
18924
18925 default:
18926 gcc_unreachable ();
18927 }
18928
18929 return GET_MODE (SET_SRC (set)) == set_mode;
18930 }
18931
18932 /* Generate insn patterns to do an integer compare of OPERANDS. */
18933
18934 static rtx
18935 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18936 {
18937 enum machine_mode cmpmode;
18938 rtx tmp, flags;
18939
18940 cmpmode = SELECT_CC_MODE (code, op0, op1);
18941 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18942
18943 /* This is very simple, but making the interface the same as in the
18944 FP case makes the rest of the code easier. */
18945 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18946 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18947
18948 /* Return the test that should be put into the flags user, i.e.
18949 the bcc, scc, or cmov instruction. */
18950 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18951 }
18952
18953 /* Figure out whether to use ordered or unordered fp comparisons.
18954 Return the appropriate mode to use. */
18955
18956 enum machine_mode
18957 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18958 {
18959 /* ??? In order to make all comparisons reversible, we do all comparisons
18960 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18961 all forms of trapping and nontrapping comparisons, we can make inequality
18962 comparisons trapping again, since that results in better code when using
18963 FCOM-based compares. */
18964 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18965 }
18966
18967 enum machine_mode
18968 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18969 {
18970 enum machine_mode mode = GET_MODE (op0);
18971
18972 if (SCALAR_FLOAT_MODE_P (mode))
18973 {
18974 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18975 return ix86_fp_compare_mode (code);
18976 }
18977
18978 switch (code)
18979 {
18980 /* Only zero flag is needed. */
18981 case EQ: /* ZF=0 */
18982 case NE: /* ZF!=0 */
18983 return CCZmode;
18984 /* Codes needing carry flag. */
18985 case GEU: /* CF=0 */
18986 case LTU: /* CF=1 */
18987 /* Detect overflow checks. They need just the carry flag. */
18988 if (GET_CODE (op0) == PLUS
18989 && rtx_equal_p (op1, XEXP (op0, 0)))
18990 return CCCmode;
18991 else
18992 return CCmode;
18993 case GTU: /* CF=0 & ZF=0 */
18994 case LEU: /* CF=1 | ZF=1 */
18995 return CCmode;
18996 /* Codes possibly doable only with sign flag when
18997 comparing against zero. */
18998 case GE: /* SF=OF or SF=0 */
18999 case LT: /* SF<>OF or SF=1 */
19000 if (op1 == const0_rtx)
19001 return CCGOCmode;
19002 else
19003 /* For other cases the carry flag is not required. */
19004 return CCGCmode;
19005 /* Codes doable only with the sign flag when comparing
19006 against zero, but we lack a jump instruction for it,
19007 so we need to use relational tests against overflow,
19008 which thus needs to be zero. */
19009 case GT: /* ZF=0 & SF=OF */
19010 case LE: /* ZF=1 | SF<>OF */
19011 if (op1 == const0_rtx)
19012 return CCNOmode;
19013 else
19014 return CCGCmode;
19015 /* The strcmp pattern does (use flags) and combine may ask us for the
19016 proper mode. */
19017 case USE:
19018 return CCmode;
19019 default:
19020 gcc_unreachable ();
19021 }
19022 }
19023
19024 /* Return the fixed registers used for condition codes. */
19025
19026 static bool
19027 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
19028 {
19029 *p1 = FLAGS_REG;
19030 *p2 = FPSR_REG;
19031 return true;
19032 }
19033
19034 /* If two condition code modes are compatible, return a condition code
19035 mode which is compatible with both. Otherwise, return
19036 VOIDmode. */
19037
19038 static enum machine_mode
19039 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
19040 {
19041 if (m1 == m2)
19042 return m1;
19043
19044 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
19045 return VOIDmode;
19046
19047 if ((m1 == CCGCmode && m2 == CCGOCmode)
19048 || (m1 == CCGOCmode && m2 == CCGCmode))
19049 return CCGCmode;
19050
19051 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
19052 return m2;
19053 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
19054 return m1;
19055
19056 switch (m1)
19057 {
19058 default:
19059 gcc_unreachable ();
19060
19061 case CCmode:
19062 case CCGCmode:
19063 case CCGOCmode:
19064 case CCNOmode:
19065 case CCAmode:
19066 case CCCmode:
19067 case CCOmode:
19068 case CCSmode:
19069 case CCZmode:
19070 switch (m2)
19071 {
19072 default:
19073 return VOIDmode;
19074
19075 case CCmode:
19076 case CCGCmode:
19077 case CCGOCmode:
19078 case CCNOmode:
19079 case CCAmode:
19080 case CCCmode:
19081 case CCOmode:
19082 case CCSmode:
19083 case CCZmode:
19084 return CCmode;
19085 }
19086
19087 case CCFPmode:
19088 case CCFPUmode:
19089 /* These are only compatible with themselves, which we already
19090 checked above. */
19091 return VOIDmode;
19092 }
19093 }
19094
19095
19096 /* Return a comparison we can do that is equivalent to
19097 swap_condition (code), apart possibly from orderedness.
19098 But never change orderedness if TARGET_IEEE_FP, returning
19099 UNKNOWN in that case if necessary. */
19100
19101 static enum rtx_code
19102 ix86_fp_swap_condition (enum rtx_code code)
19103 {
19104 switch (code)
19105 {
19106 case GT: /* GTU - CF=0 & ZF=0 */
19107 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
19108 case GE: /* GEU - CF=0 */
19109 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
19110 case UNLT: /* LTU - CF=1 */
19111 return TARGET_IEEE_FP ? UNKNOWN : GT;
19112 case UNLE: /* LEU - CF=1 | ZF=1 */
19113 return TARGET_IEEE_FP ? UNKNOWN : GE;
19114 default:
19115 return swap_condition (code);
19116 }
19117 }
19118
19119 /* Return the cost of comparison CODE using the best strategy for performance.
19120 All following functions use the number of instructions as a cost metric.
19121 In the future this should be tweaked to compute bytes for optimize_size and
19122 take into account the performance of various instructions on various CPUs. */
19123
19124 static int
19125 ix86_fp_comparison_cost (enum rtx_code code)
19126 {
19127 int arith_cost;
19128
19129 /* The cost of code using bit-twiddling on %ah. */
19130 switch (code)
19131 {
19132 case UNLE:
19133 case UNLT:
19134 case LTGT:
19135 case GT:
19136 case GE:
19137 case UNORDERED:
19138 case ORDERED:
19139 case UNEQ:
19140 arith_cost = 4;
19141 break;
19142 case LT:
19143 case NE:
19144 case EQ:
19145 case UNGE:
19146 arith_cost = TARGET_IEEE_FP ? 5 : 4;
19147 break;
19148 case LE:
19149 case UNGT:
19150 arith_cost = TARGET_IEEE_FP ? 6 : 4;
19151 break;
19152 default:
19153 gcc_unreachable ();
19154 }
19155
19156 switch (ix86_fp_comparison_strategy (code))
19157 {
19158 case IX86_FPCMP_COMI:
19159 return arith_cost > 4 ? 3 : 2;
19160 case IX86_FPCMP_SAHF:
19161 return arith_cost > 4 ? 4 : 3;
19162 default:
19163 return arith_cost;
19164 }
19165 }
19166
19167 /* Return the strategy to use for floating-point comparisons. We assume that
19168 fcomi is always preferable where available, since that is also true when
19169 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
19170
19171 enum ix86_fpcmp_strategy
19172 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
19173 {
19174 /* Do fcomi/sahf based test when profitable. */
19175
19176 if (TARGET_CMOVE)
19177 return IX86_FPCMP_COMI;
19178
19179 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
19180 return IX86_FPCMP_SAHF;
19181
19182 return IX86_FPCMP_ARITH;
19183 }
19184
19185 /* Swap, force into registers, or otherwise massage the two operands
19186 to a fp comparison. The operands are updated in place; the new
19187 comparison code is returned. */
19188
19189 static enum rtx_code
19190 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
19191 {
19192 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
19193 rtx op0 = *pop0, op1 = *pop1;
19194 enum machine_mode op_mode = GET_MODE (op0);
19195 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
19196
19197 /* All of the unordered compare instructions only work on registers.
19198 The same is true of the fcomi compare instructions. The XFmode
19199 compare instructions require registers except when comparing
19200 against zero or when converting operand 1 from fixed point to
19201 floating point. */
19202
19203 if (!is_sse
19204 && (fpcmp_mode == CCFPUmode
19205 || (op_mode == XFmode
19206 && ! (standard_80387_constant_p (op0) == 1
19207 || standard_80387_constant_p (op1) == 1)
19208 && GET_CODE (op1) != FLOAT)
19209 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
19210 {
19211 op0 = force_reg (op_mode, op0);
19212 op1 = force_reg (op_mode, op1);
19213 }
19214 else
19215 {
19216 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
19217 things around if they appear profitable, otherwise force op0
19218 into a register. */
19219
19220 if (standard_80387_constant_p (op0) == 0
19221 || (MEM_P (op0)
19222 && ! (standard_80387_constant_p (op1) == 0
19223 || MEM_P (op1))))
19224 {
19225 enum rtx_code new_code = ix86_fp_swap_condition (code);
19226 if (new_code != UNKNOWN)
19227 {
19228 rtx tmp;
19229 tmp = op0, op0 = op1, op1 = tmp;
19230 code = new_code;
19231 }
19232 }
19233
19234 if (!REG_P (op0))
19235 op0 = force_reg (op_mode, op0);
19236
19237 if (CONSTANT_P (op1))
19238 {
19239 int tmp = standard_80387_constant_p (op1);
19240 if (tmp == 0)
19241 op1 = validize_mem (force_const_mem (op_mode, op1));
19242 else if (tmp == 1)
19243 {
19244 if (TARGET_CMOVE)
19245 op1 = force_reg (op_mode, op1);
19246 }
19247 else
19248 op1 = force_reg (op_mode, op1);
19249 }
19250 }
19251
19252 /* Try to rearrange the comparison to make it cheaper. */
19253 if (ix86_fp_comparison_cost (code)
19254 > ix86_fp_comparison_cost (swap_condition (code))
19255 && (REG_P (op1) || can_create_pseudo_p ()))
19256 {
19257 rtx tmp;
19258 tmp = op0, op0 = op1, op1 = tmp;
19259 code = swap_condition (code);
19260 if (!REG_P (op0))
19261 op0 = force_reg (op_mode, op0);
19262 }
19263
19264 *pop0 = op0;
19265 *pop1 = op1;
19266 return code;
19267 }
19268
19269 /* Convert comparison codes we use to represent FP comparison to integer
19270 code that will result in proper branch. Return UNKNOWN if no such code
19271 is available. */
19272
19273 enum rtx_code
19274 ix86_fp_compare_code_to_integer (enum rtx_code code)
19275 {
19276 switch (code)
19277 {
19278 case GT:
19279 return GTU;
19280 case GE:
19281 return GEU;
19282 case ORDERED:
19283 case UNORDERED:
19284 return code;
19285 break;
19286 case UNEQ:
19287 return EQ;
19288 break;
19289 case UNLT:
19290 return LTU;
19291 break;
19292 case UNLE:
19293 return LEU;
19294 break;
19295 case LTGT:
19296 return NE;
19297 break;
19298 default:
19299 return UNKNOWN;
19300 }
19301 }
19302
19303 /* Generate insn patterns to do a floating point compare of OPERANDS. */
19304
19305 static rtx
19306 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
19307 {
19308 enum machine_mode fpcmp_mode, intcmp_mode;
19309 rtx tmp, tmp2;
19310
19311 fpcmp_mode = ix86_fp_compare_mode (code);
19312 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
19313
19314 /* Do fcomi/sahf based test when profitable. */
19315 switch (ix86_fp_comparison_strategy (code))
19316 {
19317 case IX86_FPCMP_COMI:
19318 intcmp_mode = fpcmp_mode;
19319 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19320 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19321 tmp);
19322 emit_insn (tmp);
19323 break;
19324
19325 case IX86_FPCMP_SAHF:
19326 intcmp_mode = fpcmp_mode;
19327 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19328 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19329 tmp);
19330
19331 if (!scratch)
19332 scratch = gen_reg_rtx (HImode);
19333 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
19334 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
19335 break;
19336
19337 case IX86_FPCMP_ARITH:
19338 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
19339 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19340 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
19341 if (!scratch)
19342 scratch = gen_reg_rtx (HImode);
19343 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
19344
19345 /* In the unordered case, we have to check C2 for NaNs, which
19346 doesn't happen to work out to anything nice combination-wise.
19347 So do some bit twiddling on the value we've got in AH to come
19348 up with an appropriate set of condition codes. */
19349
19350 intcmp_mode = CCNOmode;
19351 switch (code)
19352 {
19353 case GT:
19354 case UNGT:
19355 if (code == GT || !TARGET_IEEE_FP)
19356 {
19357 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19358 code = EQ;
19359 }
19360 else
19361 {
19362 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19363 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19364 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
19365 intcmp_mode = CCmode;
19366 code = GEU;
19367 }
19368 break;
19369 case LT:
19370 case UNLT:
19371 if (code == LT && TARGET_IEEE_FP)
19372 {
19373 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19374 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
19375 intcmp_mode = CCmode;
19376 code = EQ;
19377 }
19378 else
19379 {
19380 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
19381 code = NE;
19382 }
19383 break;
19384 case GE:
19385 case UNGE:
19386 if (code == GE || !TARGET_IEEE_FP)
19387 {
19388 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
19389 code = EQ;
19390 }
19391 else
19392 {
19393 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19394 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
19395 code = NE;
19396 }
19397 break;
19398 case LE:
19399 case UNLE:
19400 if (code == LE && TARGET_IEEE_FP)
19401 {
19402 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19403 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19404 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19405 intcmp_mode = CCmode;
19406 code = LTU;
19407 }
19408 else
19409 {
19410 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19411 code = NE;
19412 }
19413 break;
19414 case EQ:
19415 case UNEQ:
19416 if (code == EQ && TARGET_IEEE_FP)
19417 {
19418 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19419 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19420 intcmp_mode = CCmode;
19421 code = EQ;
19422 }
19423 else
19424 {
19425 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19426 code = NE;
19427 }
19428 break;
19429 case NE:
19430 case LTGT:
19431 if (code == NE && TARGET_IEEE_FP)
19432 {
19433 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19434 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
19435 GEN_INT (0x40)));
19436 code = NE;
19437 }
19438 else
19439 {
19440 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19441 code = EQ;
19442 }
19443 break;
19444
19445 case UNORDERED:
19446 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19447 code = NE;
19448 break;
19449 case ORDERED:
19450 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19451 code = EQ;
19452 break;
19453
19454 default:
19455 gcc_unreachable ();
19456 }
19457 break;
19458
19459 default:
19460 gcc_unreachable ();
19461 }
19462
19463 /* Return the test that should be put into the flags user, i.e.
19464 the bcc, scc, or cmov instruction. */
19465 return gen_rtx_fmt_ee (code, VOIDmode,
19466 gen_rtx_REG (intcmp_mode, FLAGS_REG),
19467 const0_rtx);
19468 }
19469
19470 static rtx
19471 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
19472 {
19473 rtx ret;
19474
19475 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
19476 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
19477
19478 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
19479 {
19480 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
19481 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19482 }
19483 else
19484 ret = ix86_expand_int_compare (code, op0, op1);
19485
19486 return ret;
19487 }
19488
19489 void
19490 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
19491 {
19492 enum machine_mode mode = GET_MODE (op0);
19493 rtx tmp;
19494
19495 switch (mode)
19496 {
19497 case SFmode:
19498 case DFmode:
19499 case XFmode:
19500 case QImode:
19501 case HImode:
19502 case SImode:
19503 simple:
19504 tmp = ix86_expand_compare (code, op0, op1);
19505 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
19506 gen_rtx_LABEL_REF (VOIDmode, label),
19507 pc_rtx);
19508 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
19509 return;
19510
19511 case DImode:
19512 if (TARGET_64BIT)
19513 goto simple;
19514 case TImode:
19515 /* Expand DImode branch into multiple compare+branch. */
19516 {
19517 rtx lo[2], hi[2], label2;
19518 enum rtx_code code1, code2, code3;
19519 enum machine_mode submode;
19520
19521 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
19522 {
19523 tmp = op0, op0 = op1, op1 = tmp;
19524 code = swap_condition (code);
19525 }
19526
19527 split_double_mode (mode, &op0, 1, lo+0, hi+0);
19528 split_double_mode (mode, &op1, 1, lo+1, hi+1);
19529
19530 submode = mode == DImode ? SImode : DImode;
19531
19532 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
19533 avoid two branches. This costs one extra insn, so disable when
19534 optimizing for size. */
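/* I.e. for a DImode compare on a 32-bit target, a == b is tested as
   ((hi(a) ^ hi(b)) | (lo(a) ^ lo(b))) == 0 with a single branch. */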
19535
19536 if ((code == EQ || code == NE)
19537 && (!optimize_insn_for_size_p ()
19538 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19539 {
19540 rtx xor0, xor1;
19541
19542 xor1 = hi[0];
19543 if (hi[1] != const0_rtx)
19544 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19545 NULL_RTX, 0, OPTAB_WIDEN);
19546
19547 xor0 = lo[0];
19548 if (lo[1] != const0_rtx)
19549 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19550 NULL_RTX, 0, OPTAB_WIDEN);
19551
19552 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19553 NULL_RTX, 0, OPTAB_WIDEN);
19554
19555 ix86_expand_branch (code, tmp, const0_rtx, label);
19556 return;
19557 }
19558
19559 /* Otherwise, if we are doing a less-than or greater-or-equal-than
19560 comparison, op1 is a constant and the low word is zero, then we can
19561 just examine the high word. Similarly for a low word of -1 and
19562 less-or-equal-than or greater-than. */
19563
19564 if (CONST_INT_P (hi[1]))
19565 switch (code)
19566 {
19567 case LT: case LTU: case GE: case GEU:
19568 if (lo[1] == const0_rtx)
19569 {
19570 ix86_expand_branch (code, hi[0], hi[1], label);
19571 return;
19572 }
19573 break;
19574 case LE: case LEU: case GT: case GTU:
19575 if (lo[1] == constm1_rtx)
19576 {
19577 ix86_expand_branch (code, hi[0], hi[1], label);
19578 return;
19579 }
19580 break;
19581 default:
19582 break;
19583 }
19584
19585 /* Otherwise, we need two or three jumps. */
19586
19587 label2 = gen_label_rtx ();
19588
19589 code1 = code;
19590 code2 = swap_condition (code);
19591 code3 = unsigned_condition (code);
19592
19593 switch (code)
19594 {
19595 case LT: case GT: case LTU: case GTU:
19596 break;
19597
19598 case LE: code1 = LT; code2 = GT; break;
19599 case GE: code1 = GT; code2 = LT; break;
19600 case LEU: code1 = LTU; code2 = GTU; break;
19601 case GEU: code1 = GTU; code2 = LTU; break;
19602
19603 case EQ: code1 = UNKNOWN; code2 = NE; break;
19604 case NE: code2 = UNKNOWN; break;
19605
19606 default:
19607 gcc_unreachable ();
19608 }
19609
19610 /*
19611 * a < b =>
19612 * if (hi(a) < hi(b)) goto true;
19613 * if (hi(a) > hi(b)) goto false;
19614 * if (lo(a) < lo(b)) goto true;
19615 * false:
19616 */
19617
19618 if (code1 != UNKNOWN)
19619 ix86_expand_branch (code1, hi[0], hi[1], label);
19620 if (code2 != UNKNOWN)
19621 ix86_expand_branch (code2, hi[0], hi[1], label2);
19622
19623 ix86_expand_branch (code3, lo[0], lo[1], label);
19624
19625 if (code2 != UNKNOWN)
19626 emit_label (label2);
19627 return;
19628 }
19629
19630 default:
19631 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19632 goto simple;
19633 }
19634 }
19635
19636 /* Split a branch based on a floating point condition. */
19637 void
19638 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19639 rtx target1, rtx target2, rtx tmp, rtx pushed)
19640 {
19641 rtx condition;
19642 rtx i;
19643
19644 if (target2 != pc_rtx)
19645 {
19646 rtx tmp = target2;
19647 code = reverse_condition_maybe_unordered (code);
19648 target2 = target1;
19649 target1 = tmp;
19650 }
19651
19652 condition = ix86_expand_fp_compare (code, op1, op2,
19653 tmp);
19654
19655 /* Remove pushed operand from stack. */
19656 if (pushed)
19657 ix86_free_from_memory (GET_MODE (pushed));
19658
19659 i = emit_jump_insn (gen_rtx_SET
19660 (VOIDmode, pc_rtx,
19661 gen_rtx_IF_THEN_ELSE (VOIDmode,
19662 condition, target1, target2)));
19663 if (split_branch_probability >= 0)
19664 add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
19665 }
19666
19667 void
19668 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19669 {
19670 rtx ret;
19671
19672 gcc_assert (GET_MODE (dest) == QImode);
19673
19674 ret = ix86_expand_compare (code, op0, op1);
19675 PUT_MODE (ret, QImode);
19676 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19677 }
19678
19679 /* Expand comparison setting or clearing carry flag. Return true when
19680 successful and set pop for the operation. */
19681 static bool
19682 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19683 {
19684 enum machine_mode mode =
19685 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19686
19687 /* Do not handle double-mode compares that go through the special path. */
19688 if (mode == (TARGET_64BIT ? TImode : DImode))
19689 return false;
19690
19691 if (SCALAR_FLOAT_MODE_P (mode))
19692 {
19693 rtx compare_op, compare_seq;
19694
19695 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19696
19697 /* Shortcut: the following common codes never translate
19698 into carry flag compares. */
19699 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19700 || code == ORDERED || code == UNORDERED)
19701 return false;
19702
19703 /* These comparisons require the zero flag; swap the operands so they won't. */
19704 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19705 && !TARGET_IEEE_FP)
19706 {
19707 rtx tmp = op0;
19708 op0 = op1;
19709 op1 = tmp;
19710 code = swap_condition (code);
19711 }
19712
19713 /* Try to expand the comparison and verify that we end up with
19714 a carry-flag-based comparison. This fails to be true only when
19715 we decide to expand the comparison using arithmetic, which is not
19716 too common a scenario. */
19717 start_sequence ();
19718 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19719 compare_seq = get_insns ();
19720 end_sequence ();
19721
19722 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19723 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19724 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19725 else
19726 code = GET_CODE (compare_op);
19727
19728 if (code != LTU && code != GEU)
19729 return false;
19730
19731 emit_insn (compare_seq);
19732 *pop = compare_op;
19733 return true;
19734 }
19735
19736 if (!INTEGRAL_MODE_P (mode))
19737 return false;
19738
19739 switch (code)
19740 {
19741 case LTU:
19742 case GEU:
19743 break;
19744
19745 /* Convert a==0 into (unsigned)a<1. */
19746 case EQ:
19747 case NE:
19748 if (op1 != const0_rtx)
19749 return false;
19750 op1 = const1_rtx;
19751 code = (code == EQ ? LTU : GEU);
19752 break;
19753
19754 /* Convert a>b into b<a or a>=b+1. */
19755 case GTU:
19756 case LEU:
19757 if (CONST_INT_P (op1))
19758 {
19759 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19760 /* Bail out on overflow. We still can swap operands but that
19761 would force loading of the constant into register. */
19762 if (op1 == const0_rtx
19763 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19764 return false;
19765 code = (code == GTU ? GEU : LTU);
19766 }
19767 else
19768 {
19769 rtx tmp = op1;
19770 op1 = op0;
19771 op0 = tmp;
19772 code = (code == GTU ? LTU : GEU);
19773 }
19774 break;
19775
19776 /* Convert a>=0 into (unsigned)a<0x80000000. */
19777 case LT:
19778 case GE:
19779 if (mode == DImode || op1 != const0_rtx)
19780 return false;
19781 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19782 code = (code == LT ? GEU : LTU);
19783 break;
19784 case LE:
19785 case GT:
19786 if (mode == DImode || op1 != constm1_rtx)
19787 return false;
19788 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19789 code = (code == LE ? GEU : LTU);
19790 break;
19791
19792 default:
19793 return false;
19794 }
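/* Worked examples of the conversions above (illustrative, unsigned
   arithmetic assumed):

     a == 0   ->  (unsigned) a < 1           EQ  -> LTU
     a >u 5   ->  (unsigned) a >= 6          GTU -> GEU, op1 = 5 + 1
     a >= 0   ->  (unsigned) a < 0x80000000  GE  -> LTU (sign bit test)

   In each case the resulting LTU/GEU test is exactly the carry flag
   produced by a single cmp instruction.  */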
19795 /* Swapping operands may cause the constant to appear as the first operand.  */
19796 if (!nonimmediate_operand (op0, VOIDmode))
19797 {
19798 if (!can_create_pseudo_p ())
19799 return false;
19800 op0 = force_reg (mode, op0);
19801 }
19802 *pop = ix86_expand_compare (code, op0, op1);
19803 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19804 return true;
19805 }
19806
19807 bool
19808 ix86_expand_int_movcc (rtx operands[])
19809 {
19810 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19811 rtx compare_seq, compare_op;
19812 enum machine_mode mode = GET_MODE (operands[0]);
19813 bool sign_bit_compare_p = false;
19814 rtx op0 = XEXP (operands[1], 0);
19815 rtx op1 = XEXP (operands[1], 1);
19816
19817 if (GET_MODE (op0) == TImode
19818 || (GET_MODE (op0) == DImode
19819 && !TARGET_64BIT))
19820 return false;
19821
19822 start_sequence ();
19823 compare_op = ix86_expand_compare (code, op0, op1);
19824 compare_seq = get_insns ();
19825 end_sequence ();
19826
19827 compare_code = GET_CODE (compare_op);
19828
19829 if ((op1 == const0_rtx && (code == GE || code == LT))
19830 || (op1 == constm1_rtx && (code == GT || code == LE)))
19831 sign_bit_compare_p = true;
19832
19833 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19834 HImode insns, we'd be swallowed in word prefix ops. */
19835
19836 if ((mode != HImode || TARGET_FAST_PREFIX)
19837 && (mode != (TARGET_64BIT ? TImode : DImode))
19838 && CONST_INT_P (operands[2])
19839 && CONST_INT_P (operands[3]))
19840 {
19841 rtx out = operands[0];
19842 HOST_WIDE_INT ct = INTVAL (operands[2]);
19843 HOST_WIDE_INT cf = INTVAL (operands[3]);
19844 HOST_WIDE_INT diff;
19845
19846 diff = ct - cf;
19847 /* Sign bit compares are better done using shifts than by using
19848 sbb.  */
19849 if (sign_bit_compare_p
19850 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19851 {
19852 /* Detect overlap between destination and compare sources. */
19853 rtx tmp = out;
19854
19855 if (!sign_bit_compare_p)
19856 {
19857 rtx flags;
19858 bool fpcmp = false;
19859
19860 compare_code = GET_CODE (compare_op);
19861
19862 flags = XEXP (compare_op, 0);
19863
19864 if (GET_MODE (flags) == CCFPmode
19865 || GET_MODE (flags) == CCFPUmode)
19866 {
19867 fpcmp = true;
19868 compare_code
19869 = ix86_fp_compare_code_to_integer (compare_code);
19870 }
19871
19872 /* To simplify the rest of the code, restrict to the GEU case.  */
19873 if (compare_code == LTU)
19874 {
19875 HOST_WIDE_INT tmp = ct;
19876 ct = cf;
19877 cf = tmp;
19878 compare_code = reverse_condition (compare_code);
19879 code = reverse_condition (code);
19880 }
19881 else
19882 {
19883 if (fpcmp)
19884 PUT_CODE (compare_op,
19885 reverse_condition_maybe_unordered
19886 (GET_CODE (compare_op)));
19887 else
19888 PUT_CODE (compare_op,
19889 reverse_condition (GET_CODE (compare_op)));
19890 }
19891 diff = ct - cf;
19892
19893 if (reg_overlap_mentioned_p (out, op0)
19894 || reg_overlap_mentioned_p (out, op1))
19895 tmp = gen_reg_rtx (mode);
19896
19897 if (mode == DImode)
19898 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19899 else
19900 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19901 flags, compare_op));
19902 }
19903 else
19904 {
19905 if (code == GT || code == GE)
19906 code = reverse_condition (code);
19907 else
19908 {
19909 HOST_WIDE_INT tmp = ct;
19910 ct = cf;
19911 cf = tmp;
19912 diff = ct - cf;
19913 }
19914 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19915 }
19916
19917 if (diff == 1)
19918 {
19919 /*
19920 * cmpl op0,op1
19921 * sbbl dest,dest
19922 * [addl dest, ct]
19923 *
19924 * Size 5 - 8.
19925 */
19926 if (ct)
19927 tmp = expand_simple_binop (mode, PLUS,
19928 tmp, GEN_INT (ct),
19929 copy_rtx (tmp), 1, OPTAB_DIRECT);
19930 }
19931 else if (cf == -1)
19932 {
19933 /*
19934 * cmpl op0,op1
19935 * sbbl dest,dest
19936 * orl $ct, dest
19937 *
19938 * Size 8.
19939 */
19940 tmp = expand_simple_binop (mode, IOR,
19941 tmp, GEN_INT (ct),
19942 copy_rtx (tmp), 1, OPTAB_DIRECT);
19943 }
19944 else if (diff == -1 && ct)
19945 {
19946 /*
19947 * cmpl op0,op1
19948 * sbbl dest,dest
19949 * notl dest
19950 * [addl dest, cf]
19951 *
19952 * Size 8 - 11.
19953 */
19954 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19955 if (cf)
19956 tmp = expand_simple_binop (mode, PLUS,
19957 copy_rtx (tmp), GEN_INT (cf),
19958 copy_rtx (tmp), 1, OPTAB_DIRECT);
19959 }
19960 else
19961 {
19962 /*
19963 * cmpl op0,op1
19964 * sbbl dest,dest
19965 * [notl dest]
19966 * andl cf - ct, dest
19967 * [addl dest, ct]
19968 *
19969 * Size 8 - 11.
19970 */
19971
19972 if (cf == 0)
19973 {
19974 cf = ct;
19975 ct = 0;
19976 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19977 }
19978
19979 tmp = expand_simple_binop (mode, AND,
19980 copy_rtx (tmp),
19981 gen_int_mode (cf - ct, mode),
19982 copy_rtx (tmp), 1, OPTAB_DIRECT);
19983 if (ct)
19984 tmp = expand_simple_binop (mode, PLUS,
19985 copy_rtx (tmp), GEN_INT (ct),
19986 copy_rtx (tmp), 1, OPTAB_DIRECT);
19987 }
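/* Illustrative check of the general sequence above: with ct = 7 and
   cf = 3 the sbb (plus the optional not) leaves dest = -1 or 0, so

     dest = -1:  (-1 & (3 - 7)) + 7 = -4 + 7 = 3 = cf
     dest =  0:  ( 0 & (3 - 7)) + 7 =  0 + 7 = 7 = ct

   i.e. the and/add pair selects between the two constants without a
   branch.  */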
19988
19989 if (!rtx_equal_p (tmp, out))
19990 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19991
19992 return true;
19993 }
19994
19995 if (diff < 0)
19996 {
19997 enum machine_mode cmp_mode = GET_MODE (op0);
19998
19999 HOST_WIDE_INT tmp;
20000 tmp = ct, ct = cf, cf = tmp;
20001 diff = -diff;
20002
20003 if (SCALAR_FLOAT_MODE_P (cmp_mode))
20004 {
20005 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
20006
20007 /* We may be reversing an unordered compare to a normal compare, which
20008 is not valid in general (we may convert a non-trapping condition
20009 to a trapping one); however, on i386 we currently emit all
20010 comparisons unordered.  */
20011 compare_code = reverse_condition_maybe_unordered (compare_code);
20012 code = reverse_condition_maybe_unordered (code);
20013 }
20014 else
20015 {
20016 compare_code = reverse_condition (compare_code);
20017 code = reverse_condition (code);
20018 }
20019 }
20020
20021 compare_code = UNKNOWN;
20022 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
20023 && CONST_INT_P (op1))
20024 {
20025 if (op1 == const0_rtx
20026 && (code == LT || code == GE))
20027 compare_code = code;
20028 else if (op1 == constm1_rtx)
20029 {
20030 if (code == LE)
20031 compare_code = LT;
20032 else if (code == GT)
20033 compare_code = GE;
20034 }
20035 }
20036
20037 /* Optimize dest = (op0 < 0) ? -1 : cf. */
20038 if (compare_code != UNKNOWN
20039 && GET_MODE (op0) == GET_MODE (out)
20040 && (cf == -1 || ct == -1))
20041 {
20042 /* If the lea code below could be used, only optimize
20043 if it results in a 2-insn sequence.  */
20044
20045 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
20046 || diff == 3 || diff == 5 || diff == 9)
20047 || (compare_code == LT && ct == -1)
20048 || (compare_code == GE && cf == -1))
20049 {
20050 /*
20051 * notl op1 (if necessary)
20052 * sarl $31, op1
20053 * orl cf, op1
20054 */
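/* For example (illustrative): dest = (x < 0) ? -1 : 5 becomes
     sarl $31, x      ; x -> -1 when negative, 0 otherwise
     orl  $5,  x      ; -1 stays -1, 0 becomes 5  */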
20055 if (ct != -1)
20056 {
20057 cf = ct;
20058 ct = -1;
20059 code = reverse_condition (code);
20060 }
20061
20062 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20063
20064 out = expand_simple_binop (mode, IOR,
20065 out, GEN_INT (cf),
20066 out, 1, OPTAB_DIRECT);
20067 if (out != operands[0])
20068 emit_move_insn (operands[0], out);
20069
20070 return true;
20071 }
20072 }
20073
20074
20075 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
20076 || diff == 3 || diff == 5 || diff == 9)
20077 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
20078 && (mode != DImode
20079 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
20080 {
20081 /*
20082 * xorl dest,dest
20083 * cmpl op1,op2
20084 * setcc dest
20085 * lea cf(dest*(ct-cf)),dest
20086 *
20087 * Size 14.
20088 *
20089 * This also catches the degenerate setcc-only case.
20090 */
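/* For instance (illustrative only), ct = 5 and cf = 2 give diff = 3 and
   a sequence roughly like

       xorl  %eax, %eax
       cmpl  op2, op1
       setcc %al
       leal  2(%eax,%eax,2), %eax     ; eax * 3 + 2 -> 5 or 2

   so the 0/1 setcc result is scaled and offset straight into the two
   constants.  */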
20091
20092 rtx tmp;
20093 int nops;
20094
20095 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20096
20097 nops = 0;
20098 /* On x86_64 the lea instruction operates on Pmode, so we need
20099 to get the arithmetic done in the proper mode to match.  */
20100 if (diff == 1)
20101 tmp = copy_rtx (out);
20102 else
20103 {
20104 rtx out1;
20105 out1 = copy_rtx (out);
20106 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
20107 nops++;
20108 if (diff & 1)
20109 {
20110 tmp = gen_rtx_PLUS (mode, tmp, out1);
20111 nops++;
20112 }
20113 }
20114 if (cf != 0)
20115 {
20116 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
20117 nops++;
20118 }
20119 if (!rtx_equal_p (tmp, out))
20120 {
20121 if (nops == 1)
20122 out = force_operand (tmp, copy_rtx (out));
20123 else
20124 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
20125 }
20126 if (!rtx_equal_p (out, operands[0]))
20127 emit_move_insn (operands[0], copy_rtx (out));
20128
20129 return true;
20130 }
20131
20132 /*
20133 * General case: Jumpful:
20134 * xorl dest,dest cmpl op1, op2
20135 * cmpl op1, op2 movl ct, dest
20136 * setcc dest jcc 1f
20137 * decl dest movl cf, dest
20138 * andl (cf-ct),dest 1:
20139 * addl ct,dest
20140 *
20141 * Size 20. Size 14.
20142 *
20143 * This is reasonably steep, but branch mispredict costs are
20144 * high on modern cpus, so consider failing only if optimizing
20145 * for space.
20146 */
20147
20148 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20149 && BRANCH_COST (optimize_insn_for_speed_p (),
20150 false) >= 2)
20151 {
20152 if (cf == 0)
20153 {
20154 enum machine_mode cmp_mode = GET_MODE (op0);
20155
20156 cf = ct;
20157 ct = 0;
20158
20159 if (SCALAR_FLOAT_MODE_P (cmp_mode))
20160 {
20161 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
20162
20163 /* We may be reversing an unordered compare to a normal compare,
20164 which is not valid in general (we may convert a non-trapping
20165 condition to a trapping one); however, on i386 we currently
20166 emit all comparisons unordered.  */
20167 code = reverse_condition_maybe_unordered (code);
20168 }
20169 else
20170 {
20171 code = reverse_condition (code);
20172 if (compare_code != UNKNOWN)
20173 compare_code = reverse_condition (compare_code);
20174 }
20175 }
20176
20177 if (compare_code != UNKNOWN)
20178 {
20179 /* notl op1 (if needed)
20180 sarl $31, op1
20181 andl (cf-ct), op1
20182 addl ct, op1
20183
20184 For x < 0 (resp. x <= -1) there will be no notl,
20185 so if possible swap the constants to get rid of the
20186 complement.
20187 True/false will be -1/0 while code below (store flag
20188 followed by decrement) is 0/-1, so the constants need
20189 to be exchanged once more. */
20190
20191 if (compare_code == GE || !cf)
20192 {
20193 code = reverse_condition (code);
20194 compare_code = LT;
20195 }
20196 else
20197 {
20198 HOST_WIDE_INT tmp = cf;
20199 cf = ct;
20200 ct = tmp;
20201 }
20202
20203 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20204 }
20205 else
20206 {
20207 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20208
20209 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
20210 constm1_rtx,
20211 copy_rtx (out), 1, OPTAB_DIRECT);
20212 }
20213
20214 out = expand_simple_binop (mode, AND, copy_rtx (out),
20215 gen_int_mode (cf - ct, mode),
20216 copy_rtx (out), 1, OPTAB_DIRECT);
20217 if (ct)
20218 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
20219 copy_rtx (out), 1, OPTAB_DIRECT);
20220 if (!rtx_equal_p (out, operands[0]))
20221 emit_move_insn (operands[0], copy_rtx (out));
20222
20223 return true;
20224 }
20225 }
20226
20227 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20228 {
20229 /* Try a few things more with specific constants and a variable. */
20230
20231 optab op;
20232 rtx var, orig_out, out, tmp;
20233
20234 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
20235 return false;
20236
20237 /* If one of the two operands is an interesting constant (0 or -1), load
20238 a 0/-1 mask using the code above and mask the variable in logically.  */
20239
20240 if (CONST_INT_P (operands[2]))
20241 {
20242 var = operands[3];
20243 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
20244 operands[3] = constm1_rtx, op = and_optab;
20245 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
20246 operands[3] = const0_rtx, op = ior_optab;
20247 else
20248 return false;
20249 }
20250 else if (CONST_INT_P (operands[3]))
20251 {
20252 var = operands[2];
20253 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
20254 operands[2] = constm1_rtx, op = and_optab;
20255 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
20256 operands[2] = const0_rtx, op = ior_optab;
20257 else
20258 return false;
20259 }
20260 else
20261 return false;
20262
20263 orig_out = operands[0];
20264 tmp = gen_reg_rtx (mode);
20265 operands[0] = tmp;
20266
20267 /* Recurse to get the constant loaded. */
20268 if (ix86_expand_int_movcc (operands) == 0)
20269 return false;
20270
20271 /* Mask in the interesting variable. */
20272 out = expand_binop (mode, op, var, tmp, orig_out, 0,
20273 OPTAB_WIDEN);
20274 if (!rtx_equal_p (out, orig_out))
20275 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
20276
20277 return true;
20278 }
20279
20280 /*
20281 * For comparison with above,
20282 *
20283 * movl cf,dest
20284 * movl ct,tmp
20285 * cmpl op1,op2
20286 * cmovcc tmp,dest
20287 *
20288 * Size 15.
20289 */
20290
20291 if (! nonimmediate_operand (operands[2], mode))
20292 operands[2] = force_reg (mode, operands[2]);
20293 if (! nonimmediate_operand (operands[3], mode))
20294 operands[3] = force_reg (mode, operands[3]);
20295
20296 if (! register_operand (operands[2], VOIDmode)
20297 && (mode == QImode
20298 || ! register_operand (operands[3], VOIDmode)))
20299 operands[2] = force_reg (mode, operands[2]);
20300
20301 if (mode == QImode
20302 && ! register_operand (operands[3], VOIDmode))
20303 operands[3] = force_reg (mode, operands[3]);
20304
20305 emit_insn (compare_seq);
20306 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20307 gen_rtx_IF_THEN_ELSE (mode,
20308 compare_op, operands[2],
20309 operands[3])));
20310 return true;
20311 }
20312
20313 /* Swap, force into registers, or otherwise massage the two operands
20314 to an sse comparison with a mask result. Thus we differ a bit from
20315 ix86_prepare_fp_compare_args which expects to produce a flags result.
20316
20317 The DEST operand exists to help determine whether to commute commutative
20318 operators. The POP0/POP1 operands are updated in place. The new
20319 comparison code is returned, or UNKNOWN if not implementable. */
20320
20321 static enum rtx_code
20322 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
20323 rtx *pop0, rtx *pop1)
20324 {
20325 rtx tmp;
20326
20327 switch (code)
20328 {
20329 case LTGT:
20330 case UNEQ:
20331 /* AVX supports all the needed comparisons. */
20332 if (TARGET_AVX)
20333 break;
20334 /* We have no LTGT as an operator. We could implement it with
20335 NE & ORDERED, but this requires an extra temporary. It's
20336 not clear that it's worth it. */
20337 return UNKNOWN;
20338
20339 case LT:
20340 case LE:
20341 case UNGT:
20342 case UNGE:
20343 /* These are supported directly. */
20344 break;
20345
20346 case EQ:
20347 case NE:
20348 case UNORDERED:
20349 case ORDERED:
20350 /* AVX has 3-operand comparisons, so there is no need to swap anything.  */
20351 if (TARGET_AVX)
20352 break;
20353 /* For commutative operators, try to canonicalize the destination
20354 operand to be first in the comparison - this helps reload to
20355 avoid extra moves. */
20356 if (!dest || !rtx_equal_p (dest, *pop1))
20357 break;
20358 /* FALLTHRU */
20359
20360 case GE:
20361 case GT:
20362 case UNLE:
20363 case UNLT:
20364 /* These are not supported directly before AVX, and furthermore
20365 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
20366 comparison operands to transform into something that is
20367 supported. */
20368 tmp = *pop0;
20369 *pop0 = *pop1;
20370 *pop1 = tmp;
20371 code = swap_condition (code);
20372 break;
20373
20374 default:
20375 gcc_unreachable ();
20376 }
20377
20378 return code;
20379 }
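/* A hypothetical example of the swap above: before AVX there is no
   "compare greater" predicate for cmpps/cmppd, so DEST = (a >= b)
   is rewritten as DEST = (b <= a), which maps onto the available LE
   predicate with the operands exchanged.  */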
20380
20381 /* Detect conditional moves that exactly match min/max operational
20382 semantics. Note that this is IEEE safe, as long as we don't
20383 interchange the operands.
20384
20385 Returns FALSE if this conditional move doesn't match a MIN/MAX,
20386 and TRUE if the operation is successful and instructions are emitted. */
20387
20388 static bool
20389 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
20390 rtx cmp_op1, rtx if_true, rtx if_false)
20391 {
20392 enum machine_mode mode;
20393 bool is_min;
20394 rtx tmp;
20395
20396 if (code == LT)
20397 ;
20398 else if (code == UNGE)
20399 {
20400 tmp = if_true;
20401 if_true = if_false;
20402 if_false = tmp;
20403 }
20404 else
20405 return false;
20406
20407 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
20408 is_min = true;
20409 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
20410 is_min = false;
20411 else
20412 return false;
20413
20414 mode = GET_MODE (dest);
20415
20416 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
20417 but MODE may be a vector mode and thus not appropriate. */
20418 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
20419 {
20420 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
20421 rtvec v;
20422
20423 if_true = force_reg (mode, if_true);
20424 v = gen_rtvec (2, if_true, if_false);
20425 tmp = gen_rtx_UNSPEC (mode, v, u);
20426 }
20427 else
20428 {
20429 code = is_min ? SMIN : SMAX;
20430 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
20431 }
20432
20433 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
20434 return true;
20435 }
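/* Why operand order matters above (informal note): minps/maxps return
   their second source operand when the inputs are unordered (NaN) or
   are zeros of opposite sign, so "x < y ? x : y" can be emitted as a
   min only while X stays the first operand; swapping the operands
   would silently change the NaN and -0.0/+0.0 behaviour.  */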
20436
20437 /* Expand an sse vector comparison. Return the register with the result. */
20438
20439 static rtx
20440 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
20441 rtx op_true, rtx op_false)
20442 {
20443 enum machine_mode mode = GET_MODE (dest);
20444 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
20445 rtx x;
20446
20447 cmp_op0 = force_reg (cmp_mode, cmp_op0);
20448 if (!nonimmediate_operand (cmp_op1, cmp_mode))
20449 cmp_op1 = force_reg (cmp_mode, cmp_op1);
20450
20451 if (optimize
20452 || reg_overlap_mentioned_p (dest, op_true)
20453 || reg_overlap_mentioned_p (dest, op_false))
20454 dest = gen_reg_rtx (mode);
20455
20456 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
20457 if (cmp_mode != mode)
20458 {
20459 x = force_reg (cmp_mode, x);
20460 convert_move (dest, x, false);
20461 }
20462 else
20463 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20464
20465 return dest;
20466 }
20467
20468 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
20469 operations. This is used for both scalar and vector conditional moves. */
20470
20471 static void
20472 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
20473 {
20474 enum machine_mode mode = GET_MODE (dest);
20475 rtx t2, t3, x;
20476
20477 if (vector_all_ones_operand (op_true, mode)
20478 && rtx_equal_p (op_false, CONST0_RTX (mode)))
20479 {
20480 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
20481 }
20482 else if (op_false == CONST0_RTX (mode))
20483 {
20484 op_true = force_reg (mode, op_true);
20485 x = gen_rtx_AND (mode, cmp, op_true);
20486 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20487 }
20488 else if (op_true == CONST0_RTX (mode))
20489 {
20490 op_false = force_reg (mode, op_false);
20491 x = gen_rtx_NOT (mode, cmp);
20492 x = gen_rtx_AND (mode, x, op_false);
20493 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20494 }
20495 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
20496 {
20497 op_false = force_reg (mode, op_false);
20498 x = gen_rtx_IOR (mode, cmp, op_false);
20499 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20500 }
20501 else if (TARGET_XOP)
20502 {
20503 op_true = force_reg (mode, op_true);
20504
20505 if (!nonimmediate_operand (op_false, mode))
20506 op_false = force_reg (mode, op_false);
20507
20508 emit_insn (gen_rtx_SET (mode, dest,
20509 gen_rtx_IF_THEN_ELSE (mode, cmp,
20510 op_true,
20511 op_false)));
20512 }
20513 else
20514 {
20515 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
20516 rtx d = dest;
20517
20518 if (!nonimmediate_operand (op_true, mode))
20519 op_true = force_reg (mode, op_true);
20520
20521 op_false = force_reg (mode, op_false);
20522
20523 switch (mode)
20524 {
20525 case V4SFmode:
20526 if (TARGET_SSE4_1)
20527 gen = gen_sse4_1_blendvps;
20528 break;
20529 case V2DFmode:
20530 if (TARGET_SSE4_1)
20531 gen = gen_sse4_1_blendvpd;
20532 break;
20533 case V16QImode:
20534 case V8HImode:
20535 case V4SImode:
20536 case V2DImode:
20537 if (TARGET_SSE4_1)
20538 {
20539 gen = gen_sse4_1_pblendvb;
20540 if (mode != V16QImode)
20541 d = gen_reg_rtx (V16QImode);
20542 op_false = gen_lowpart (V16QImode, op_false);
20543 op_true = gen_lowpart (V16QImode, op_true);
20544 cmp = gen_lowpart (V16QImode, cmp);
20545 }
20546 break;
20547 case V8SFmode:
20548 if (TARGET_AVX)
20549 gen = gen_avx_blendvps256;
20550 break;
20551 case V4DFmode:
20552 if (TARGET_AVX)
20553 gen = gen_avx_blendvpd256;
20554 break;
20555 case V32QImode:
20556 case V16HImode:
20557 case V8SImode:
20558 case V4DImode:
20559 if (TARGET_AVX2)
20560 {
20561 gen = gen_avx2_pblendvb;
20562 if (mode != V32QImode)
20563 d = gen_reg_rtx (V32QImode);
20564 op_false = gen_lowpart (V32QImode, op_false);
20565 op_true = gen_lowpart (V32QImode, op_true);
20566 cmp = gen_lowpart (V32QImode, cmp);
20567 }
20568 break;
20569 default:
20570 break;
20571 }
20572
20573 if (gen != NULL)
20574 {
20575 emit_insn (gen (d, op_false, op_true, cmp));
20576 if (d != dest)
20577 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
20578 }
20579 else
20580 {
20581 op_true = force_reg (mode, op_true);
20582
20583 t2 = gen_reg_rtx (mode);
20584 if (optimize)
20585 t3 = gen_reg_rtx (mode);
20586 else
20587 t3 = dest;
20588
20589 x = gen_rtx_AND (mode, op_true, cmp);
20590 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20591
20592 x = gen_rtx_NOT (mode, cmp);
20593 x = gen_rtx_AND (mode, x, op_false);
20594 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20595
20596 x = gen_rtx_IOR (mode, t3, t2);
20597 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20598 }
20599 }
20600 }
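/* Sketch of the three-instruction fallback used above when no blend
   instruction is available (illustrative, per-element view with an
   all-ones/all-zeros mask CMP):

     t2   = op_true  &  cmp
     t3   = op_false & ~cmp
     dest = t2 | t3

   If cmp is all ones the IOR yields op_true, if it is all zeros it
   yields op_false; elements are never mixed because cmp is a full
   element mask.  */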
20601
20602 /* Expand a floating-point conditional move. Return true if successful. */
20603
20604 bool
20605 ix86_expand_fp_movcc (rtx operands[])
20606 {
20607 enum machine_mode mode = GET_MODE (operands[0]);
20608 enum rtx_code code = GET_CODE (operands[1]);
20609 rtx tmp, compare_op;
20610 rtx op0 = XEXP (operands[1], 0);
20611 rtx op1 = XEXP (operands[1], 1);
20612
20613 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20614 {
20615 enum machine_mode cmode;
20616
20617 /* Since we've no cmove for sse registers, don't force bad register
20618 allocation just to gain access to it. Deny movcc when the
20619 comparison mode doesn't match the move mode. */
20620 cmode = GET_MODE (op0);
20621 if (cmode == VOIDmode)
20622 cmode = GET_MODE (op1);
20623 if (cmode != mode)
20624 return false;
20625
20626 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20627 if (code == UNKNOWN)
20628 return false;
20629
20630 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20631 operands[2], operands[3]))
20632 return true;
20633
20634 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20635 operands[2], operands[3]);
20636 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20637 return true;
20638 }
20639
20640 if (GET_MODE (op0) == TImode
20641 || (GET_MODE (op0) == DImode
20642 && !TARGET_64BIT))
20643 return false;
20644
20645 /* The floating point conditional move instructions don't directly
20646 support conditions resulting from a signed integer comparison. */
20647
20648 compare_op = ix86_expand_compare (code, op0, op1);
20649 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20650 {
20651 tmp = gen_reg_rtx (QImode);
20652 ix86_expand_setcc (tmp, code, op0, op1);
20653
20654 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20655 }
20656
20657 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20658 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20659 operands[2], operands[3])));
20660
20661 return true;
20662 }
20663
20664 /* Expand a floating-point vector conditional move; a vcond operation
20665 rather than a movcc operation. */
20666
20667 bool
20668 ix86_expand_fp_vcond (rtx operands[])
20669 {
20670 enum rtx_code code = GET_CODE (operands[3]);
20671 rtx cmp;
20672
20673 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20674 &operands[4], &operands[5]);
20675 if (code == UNKNOWN)
20676 {
20677 rtx temp;
20678 switch (GET_CODE (operands[3]))
20679 {
20680 case LTGT:
20681 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20682 operands[5], operands[0], operands[0]);
20683 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20684 operands[5], operands[1], operands[2]);
20685 code = AND;
20686 break;
20687 case UNEQ:
20688 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20689 operands[5], operands[0], operands[0]);
20690 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20691 operands[5], operands[1], operands[2]);
20692 code = IOR;
20693 break;
20694 default:
20695 gcc_unreachable ();
20696 }
20697 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20698 OPTAB_DIRECT);
20699 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20700 return true;
20701 }
20702
20703 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20704 operands[5], operands[1], operands[2]))
20705 return true;
20706
20707 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20708 operands[1], operands[2]);
20709 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20710 return true;
20711 }
20712
20713 /* Expand a signed/unsigned integral vector conditional move. */
20714
20715 bool
20716 ix86_expand_int_vcond (rtx operands[])
20717 {
20718 enum machine_mode data_mode = GET_MODE (operands[0]);
20719 enum machine_mode mode = GET_MODE (operands[4]);
20720 enum rtx_code code = GET_CODE (operands[3]);
20721 bool negate = false;
20722 rtx x, cop0, cop1;
20723
20724 cop0 = operands[4];
20725 cop1 = operands[5];
20726
20727 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20728 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20729 if ((code == LT || code == GE)
20730 && data_mode == mode
20731 && cop1 == CONST0_RTX (mode)
20732 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20733 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20734 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20735 && (GET_MODE_SIZE (data_mode) == 16
20736 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20737 {
20738 rtx negop = operands[2 - (code == LT)];
20739 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20740 if (negop == CONST1_RTX (data_mode))
20741 {
20742 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20743 operands[0], 1, OPTAB_DIRECT);
20744 if (res != operands[0])
20745 emit_move_insn (operands[0], res);
20746 return true;
20747 }
20748 else if (GET_MODE_INNER (data_mode) != DImode
20749 && vector_all_ones_operand (negop, data_mode))
20750 {
20751 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20752 operands[0], 0, OPTAB_DIRECT);
20753 if (res != operands[0])
20754 emit_move_insn (operands[0], res);
20755 return true;
20756 }
20757 }
20758
20759 if (!nonimmediate_operand (cop1, mode))
20760 cop1 = force_reg (mode, cop1);
20761 if (!general_operand (operands[1], data_mode))
20762 operands[1] = force_reg (data_mode, operands[1]);
20763 if (!general_operand (operands[2], data_mode))
20764 operands[2] = force_reg (data_mode, operands[2]);
20765
20766 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20767 if (TARGET_XOP
20768 && (mode == V16QImode || mode == V8HImode
20769 || mode == V4SImode || mode == V2DImode))
20770 ;
20771 else
20772 {
20773 /* Canonicalize the comparison to EQ, GT, GTU. */
20774 switch (code)
20775 {
20776 case EQ:
20777 case GT:
20778 case GTU:
20779 break;
20780
20781 case NE:
20782 case LE:
20783 case LEU:
20784 code = reverse_condition (code);
20785 negate = true;
20786 break;
20787
20788 case GE:
20789 case GEU:
20790 code = reverse_condition (code);
20791 negate = true;
20792 /* FALLTHRU */
20793
20794 case LT:
20795 case LTU:
20796 code = swap_condition (code);
20797 x = cop0, cop0 = cop1, cop1 = x;
20798 break;
20799
20800 default:
20801 gcc_unreachable ();
20802 }
20803
20804 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20805 if (mode == V2DImode)
20806 {
20807 switch (code)
20808 {
20809 case EQ:
20810 /* SSE4.1 supports EQ. */
20811 if (!TARGET_SSE4_1)
20812 return false;
20813 break;
20814
20815 case GT:
20816 case GTU:
20817 /* SSE4.2 supports GT/GTU. */
20818 if (!TARGET_SSE4_2)
20819 return false;
20820 break;
20821
20822 default:
20823 gcc_unreachable ();
20824 }
20825 }
20826
20827 /* Unsigned parallel compare is not supported by the hardware.
20828 Play some tricks to turn this into a signed comparison
20829 against 0. */
20830 if (code == GTU)
20831 {
20832 cop0 = force_reg (mode, cop0);
20833
20834 switch (mode)
20835 {
20836 case V8SImode:
20837 case V4DImode:
20838 case V4SImode:
20839 case V2DImode:
20840 {
20841 rtx t1, t2, mask;
20842 rtx (*gen_sub3) (rtx, rtx, rtx);
20843
20844 switch (mode)
20845 {
20846 case V8SImode: gen_sub3 = gen_subv8si3; break;
20847 case V4DImode: gen_sub3 = gen_subv4di3; break;
20848 case V4SImode: gen_sub3 = gen_subv4si3; break;
20849 case V2DImode: gen_sub3 = gen_subv2di3; break;
20850 default:
20851 gcc_unreachable ();
20852 }
20853 /* Subtract (-(INT MAX) - 1) from both operands to make
20854 them signed. */
20855 mask = ix86_build_signbit_mask (mode, true, false);
20856 t1 = gen_reg_rtx (mode);
20857 emit_insn (gen_sub3 (t1, cop0, mask));
20858
20859 t2 = gen_reg_rtx (mode);
20860 emit_insn (gen_sub3 (t2, cop1, mask));
20861
20862 cop0 = t1;
20863 cop1 = t2;
20864 code = GT;
20865 }
20866 break;
20867
20868 case V32QImode:
20869 case V16HImode:
20870 case V16QImode:
20871 case V8HImode:
20872 /* Perform a parallel unsigned saturating subtraction. */
20873 x = gen_reg_rtx (mode);
20874 emit_insn (gen_rtx_SET (VOIDmode, x,
20875 gen_rtx_US_MINUS (mode, cop0, cop1)));
20876
20877 cop0 = x;
20878 cop1 = CONST0_RTX (mode);
20879 code = EQ;
20880 negate = !negate;
20881 break;
20882
20883 default:
20884 gcc_unreachable ();
20885 }
20886 }
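/* Quick sanity check of the two GTU tricks above (illustrative):
   for the 32/64-bit bias trick, 0xFFFFFFFF >u 1 becomes
   0x7FFFFFFF >s 0x80000001, i.e. INT_MAX >s INT_MIN + 1, which is
   still true; for the byte/word saturating-subtract trick,
   SUBUS (a, b) == 0 exactly when a <=u b, and flipping the selected
   operands (negate) turns that equality test back into the GTU
   result.  */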
20887 }
20888
20889 /* Allow the comparison to be done in one mode, but the movcc to
20890 happen in another mode. */
20891 if (data_mode == mode)
20892 {
20893 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20894 operands[1+negate], operands[2-negate]);
20895 }
20896 else
20897 {
20898 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20899 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
20900 operands[1+negate], operands[2-negate]);
20901 x = gen_lowpart (data_mode, x);
20902 }
20903
20904 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20905 operands[2-negate]);
20906 return true;
20907 }
20908
20909 /* Expand a variable vector permutation. */
20910
20911 void
20912 ix86_expand_vec_perm (rtx operands[])
20913 {
20914 rtx target = operands[0];
20915 rtx op0 = operands[1];
20916 rtx op1 = operands[2];
20917 rtx mask = operands[3];
20918 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
20919 enum machine_mode mode = GET_MODE (op0);
20920 enum machine_mode maskmode = GET_MODE (mask);
20921 int w, e, i;
20922 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20923
20924 /* Number of elements in the vector. */
20925 w = GET_MODE_NUNITS (mode);
20926 e = GET_MODE_UNIT_SIZE (mode);
20927 gcc_assert (w <= 32);
20928
20929 if (TARGET_AVX2)
20930 {
20931 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20932 {
20933 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20934 a constant shuffle operand.  With a tiny bit of effort we can
20935 use VPERMD instead.  A re-interpretation stall for V4DFmode is
20936 unfortunate but there's no avoiding it.
20937 Similarly for V16HImode we don't have instructions for variable
20938 shuffling, while for V32QImode we can, after preparing suitable
20939 masks, use vpshufb; vpshufb; vpermq; vpor.  */
20940
20941 if (mode == V16HImode)
20942 {
20943 maskmode = mode = V32QImode;
20944 w = 32;
20945 e = 1;
20946 }
20947 else
20948 {
20949 maskmode = mode = V8SImode;
20950 w = 8;
20951 e = 4;
20952 }
20953 t1 = gen_reg_rtx (maskmode);
20954
20955 /* Replicate the low bits of the V4DImode mask into V8SImode:
20956 mask = { A B C D }
20957 t1 = { A A B B C C D D }. */
20958 for (i = 0; i < w / 2; ++i)
20959 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20960 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20961 vt = force_reg (maskmode, vt);
20962 mask = gen_lowpart (maskmode, mask);
20963 if (maskmode == V8SImode)
20964 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20965 else
20966 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20967
20968 /* Multiply the shuffle indices by two.  */
20969 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20970 OPTAB_DIRECT);
20971
20972 /* Add one to the odd shuffle indices:
20973 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20974 for (i = 0; i < w / 2; ++i)
20975 {
20976 vec[i * 2] = const0_rtx;
20977 vec[i * 2 + 1] = const1_rtx;
20978 }
20979 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20980 vt = validize_mem (force_const_mem (maskmode, vt));
20981 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20982 OPTAB_DIRECT);
20983
20984 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20985 operands[3] = mask = t1;
20986 target = gen_reg_rtx (mode);
20987 op0 = gen_lowpart (mode, op0);
20988 op1 = gen_lowpart (mode, op1);
20989 }
20990
20991 switch (mode)
20992 {
20993 case V8SImode:
20994 /* The VPERMD and VPERMPS instructions already properly ignore
20995 the high bits of the shuffle elements. No need for us to
20996 perform an AND ourselves. */
20997 if (one_operand_shuffle)
20998 {
20999 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
21000 if (target != operands[0])
21001 emit_move_insn (operands[0],
21002 gen_lowpart (GET_MODE (operands[0]), target));
21003 }
21004 else
21005 {
21006 t1 = gen_reg_rtx (V8SImode);
21007 t2 = gen_reg_rtx (V8SImode);
21008 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
21009 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
21010 goto merge_two;
21011 }
21012 return;
21013
21014 case V8SFmode:
21015 mask = gen_lowpart (V8SFmode, mask);
21016 if (one_operand_shuffle)
21017 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
21018 else
21019 {
21020 t1 = gen_reg_rtx (V8SFmode);
21021 t2 = gen_reg_rtx (V8SFmode);
21022 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
21023 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
21024 goto merge_two;
21025 }
21026 return;
21027
21028 case V4SImode:
21029 /* By combining the two 128-bit input vectors into one 256-bit
21030 input vector, we can use VPERMD and VPERMPS for the full
21031 two-operand shuffle. */
21032 t1 = gen_reg_rtx (V8SImode);
21033 t2 = gen_reg_rtx (V8SImode);
21034 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
21035 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
21036 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
21037 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
21038 return;
21039
21040 case V4SFmode:
21041 t1 = gen_reg_rtx (V8SFmode);
21042 t2 = gen_reg_rtx (V8SImode);
21043 mask = gen_lowpart (V4SImode, mask);
21044 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
21045 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
21046 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
21047 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
21048 return;
21049
21050 case V32QImode:
21051 t1 = gen_reg_rtx (V32QImode);
21052 t2 = gen_reg_rtx (V32QImode);
21053 t3 = gen_reg_rtx (V32QImode);
21054 vt2 = GEN_INT (128);
21055 for (i = 0; i < 32; i++)
21056 vec[i] = vt2;
21057 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
21058 vt = force_reg (V32QImode, vt);
21059 for (i = 0; i < 32; i++)
21060 vec[i] = i < 16 ? vt2 : const0_rtx;
21061 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
21062 vt2 = force_reg (V32QImode, vt2);
21063 /* From mask create two adjusted masks, which contain the same
21064 bits as mask in the low 7 bits of each vector element.
21065 The first mask will have the most significant bit clear
21066 if it requests element from the same 128-bit lane
21067 and MSB set if it requests element from the other 128-bit lane.
21068 The second mask will have the opposite values of the MSB,
21069 and additionally will have its 128-bit lanes swapped.
21070 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
21071 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
21072 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
21073 stands for other 12 bytes. */
21074 /* Bit 4 indicates whether an element comes from the same lane or the
21075 other lane, so shift it up by 3 to the MSB position.  */
21076 t5 = gen_reg_rtx (V4DImode);
21077 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
21078 GEN_INT (3)));
21079 /* Clear MSB bits from the mask just in case it had them set. */
21080 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
21081 /* After this t1 will have MSB set for elements from other lane. */
21082 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
21083 /* Clear bits other than MSB. */
21084 emit_insn (gen_andv32qi3 (t1, t1, vt));
21085 /* Or in the lower bits from mask into t3. */
21086 emit_insn (gen_iorv32qi3 (t3, t1, t2));
21087 /* And invert MSB bits in t1, so MSB is set for elements from the same
21088 lane. */
21089 emit_insn (gen_xorv32qi3 (t1, t1, vt));
21090 /* Swap 128-bit lanes in t3. */
21091 t6 = gen_reg_rtx (V4DImode);
21092 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
21093 const2_rtx, GEN_INT (3),
21094 const0_rtx, const1_rtx));
21095 /* And or in the lower bits from mask into t1. */
21096 emit_insn (gen_iorv32qi3 (t1, t1, t2));
21097 if (one_operand_shuffle)
21098 {
21099 /* Each of these shuffles will put 0s in places where an
21100 element from the other 128-bit lane is needed; otherwise
21101 it will shuffle in the requested value.  */
21102 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
21103 gen_lowpart (V32QImode, t6)));
21104 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
21105 /* For t3 the 128-bit lanes are swapped again. */
21106 t7 = gen_reg_rtx (V4DImode);
21107 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
21108 const2_rtx, GEN_INT (3),
21109 const0_rtx, const1_rtx));
21110 /* ORing both together then leads to the result.  */
21111 emit_insn (gen_iorv32qi3 (target, t1,
21112 gen_lowpart (V32QImode, t7)));
21113 if (target != operands[0])
21114 emit_move_insn (operands[0],
21115 gen_lowpart (GET_MODE (operands[0]), target));
21116 return;
21117 }
21118
21119 t4 = gen_reg_rtx (V32QImode);
21120 /* Similar to the one_operand_shuffle code above,
21121 just repeated twice, once for each operand.  The merge_two:
21122 code below will merge the two results together.  */
21123 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
21124 gen_lowpart (V32QImode, t6)));
21125 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
21126 gen_lowpart (V32QImode, t6)));
21127 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
21128 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
21129 t7 = gen_reg_rtx (V4DImode);
21130 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
21131 const2_rtx, GEN_INT (3),
21132 const0_rtx, const1_rtx));
21133 t8 = gen_reg_rtx (V4DImode);
21134 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
21135 const2_rtx, GEN_INT (3),
21136 const0_rtx, const1_rtx));
21137 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
21138 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
21139 t1 = t4;
21140 t2 = t3;
21141 goto merge_two;
21142
21143 default:
21144 gcc_assert (GET_MODE_SIZE (mode) <= 16);
21145 break;
21146 }
21147 }
21148
21149 if (TARGET_XOP)
21150 {
21151 /* The XOP VPPERM insn supports three inputs. By ignoring the
21152 one_operand_shuffle special case, we avoid creating another
21153 set of constant vectors in memory. */
21154 one_operand_shuffle = false;
21155
21156 /* mask = mask & {2*w-1, ...} */
21157 vt = GEN_INT (2*w - 1);
21158 }
21159 else
21160 {
21161 /* mask = mask & {w-1, ...} */
21162 vt = GEN_INT (w - 1);
21163 }
21164
21165 for (i = 0; i < w; i++)
21166 vec[i] = vt;
21167 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21168 mask = expand_simple_binop (maskmode, AND, mask, vt,
21169 NULL_RTX, 0, OPTAB_DIRECT);
21170
21171 /* For non-QImode operations, convert the word permutation control
21172 into a byte permutation control. */
21173 if (mode != V16QImode)
21174 {
21175 mask = expand_simple_binop (maskmode, ASHIFT, mask,
21176 GEN_INT (exact_log2 (e)),
21177 NULL_RTX, 0, OPTAB_DIRECT);
21178
21179 /* Convert mask to vector of chars. */
21180 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
21181
21182 /* Replicate each of the input bytes into byte positions:
21183 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
21184 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
21185 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
21186 for (i = 0; i < 16; ++i)
21187 vec[i] = GEN_INT (i/e * e);
21188 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21189 vt = validize_mem (force_const_mem (V16QImode, vt));
21190 if (TARGET_XOP)
21191 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
21192 else
21193 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
21194
21195 /* Convert it into the byte positions by doing
21196 mask = mask + {0,1,..,e-1, 0,1,..,e-1, ...} */
21197 for (i = 0; i < 16; ++i)
21198 vec[i] = GEN_INT (i % e);
21199 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21200 vt = validize_mem (force_const_mem (V16QImode, vt));
21201 emit_insn (gen_addv16qi3 (mask, mask, vt));
21202 }
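/* For example (illustrative), with V4SImode elements (e == 4) a word
   index of 3 in the control vector becomes 3 * 4 = 12 after the shift,
   is replicated to {12,12,12,12} by the pshufb, and the final add of
   {0,1,2,3} turns it into the byte indices {12,13,14,15} of that
   element.  */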
21203
21204 /* The actual shuffle operations all operate on V16QImode. */
21205 op0 = gen_lowpart (V16QImode, op0);
21206 op1 = gen_lowpart (V16QImode, op1);
21207
21208 if (TARGET_XOP)
21209 {
21210 if (GET_MODE (target) != V16QImode)
21211 target = gen_reg_rtx (V16QImode);
21212 emit_insn (gen_xop_pperm (target, op0, op1, mask));
21213 if (target != operands[0])
21214 emit_move_insn (operands[0],
21215 gen_lowpart (GET_MODE (operands[0]), target));
21216 }
21217 else if (one_operand_shuffle)
21218 {
21219 if (GET_MODE (target) != V16QImode)
21220 target = gen_reg_rtx (V16QImode);
21221 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
21222 if (target != operands[0])
21223 emit_move_insn (operands[0],
21224 gen_lowpart (GET_MODE (operands[0]), target));
21225 }
21226 else
21227 {
21228 rtx xops[6];
21229 bool ok;
21230
21231 /* Shuffle the two input vectors independently. */
21232 t1 = gen_reg_rtx (V16QImode);
21233 t2 = gen_reg_rtx (V16QImode);
21234 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
21235 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
21236
21237 merge_two:
21238 /* Then merge them together. The key is whether any given control
21239 element contained a bit set that indicates the second word. */
21240 mask = operands[3];
21241 vt = GEN_INT (w);
21242 if (maskmode == V2DImode && !TARGET_SSE4_1)
21243 {
21244 /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
21245 more shuffle to convert the V2DI input mask into a V4SI
21246 input mask, at which point the masking done by
21247 ix86_expand_int_vcond will work as desired.  */
21248 rtx t3 = gen_reg_rtx (V4SImode);
21249 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
21250 const0_rtx, const0_rtx,
21251 const2_rtx, const2_rtx));
21252 mask = t3;
21253 maskmode = V4SImode;
21254 e = w = 4;
21255 }
21256
21257 for (i = 0; i < w; i++)
21258 vec[i] = vt;
21259 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21260 vt = force_reg (maskmode, vt);
21261 mask = expand_simple_binop (maskmode, AND, mask, vt,
21262 NULL_RTX, 0, OPTAB_DIRECT);
21263
21264 if (GET_MODE (target) != mode)
21265 target = gen_reg_rtx (mode);
21266 xops[0] = target;
21267 xops[1] = gen_lowpart (mode, t2);
21268 xops[2] = gen_lowpart (mode, t1);
21269 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
21270 xops[4] = mask;
21271 xops[5] = vt;
21272 ok = ix86_expand_int_vcond (xops);
21273 gcc_assert (ok);
21274 if (target != operands[0])
21275 emit_move_insn (operands[0],
21276 gen_lowpart (GET_MODE (operands[0]), target));
21277 }
21278 }
21279
21280 /* Unpack SRC into DEST, the next wider integer vector type.  UNSIGNED_P is
21281 true if we should do zero extension, else sign extension. HIGH_P is
21282 true if we want the N/2 high elements, else the low elements. */
21283
21284 void
21285 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
21286 {
21287 enum machine_mode imode = GET_MODE (src);
21288 rtx tmp;
21289
21290 if (TARGET_SSE4_1)
21291 {
21292 rtx (*unpack)(rtx, rtx);
21293 rtx (*extract)(rtx, rtx) = NULL;
21294 enum machine_mode halfmode = BLKmode;
21295
21296 switch (imode)
21297 {
21298 case V32QImode:
21299 if (unsigned_p)
21300 unpack = gen_avx2_zero_extendv16qiv16hi2;
21301 else
21302 unpack = gen_avx2_sign_extendv16qiv16hi2;
21303 halfmode = V16QImode;
21304 extract
21305 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
21306 break;
21307 case V16HImode:
21308 if (unsigned_p)
21309 unpack = gen_avx2_zero_extendv8hiv8si2;
21310 else
21311 unpack = gen_avx2_sign_extendv8hiv8si2;
21312 halfmode = V8HImode;
21313 extract
21314 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
21315 break;
21316 case V8SImode:
21317 if (unsigned_p)
21318 unpack = gen_avx2_zero_extendv4siv4di2;
21319 else
21320 unpack = gen_avx2_sign_extendv4siv4di2;
21321 halfmode = V4SImode;
21322 extract
21323 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
21324 break;
21325 case V16QImode:
21326 if (unsigned_p)
21327 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
21328 else
21329 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
21330 break;
21331 case V8HImode:
21332 if (unsigned_p)
21333 unpack = gen_sse4_1_zero_extendv4hiv4si2;
21334 else
21335 unpack = gen_sse4_1_sign_extendv4hiv4si2;
21336 break;
21337 case V4SImode:
21338 if (unsigned_p)
21339 unpack = gen_sse4_1_zero_extendv2siv2di2;
21340 else
21341 unpack = gen_sse4_1_sign_extendv2siv2di2;
21342 break;
21343 default:
21344 gcc_unreachable ();
21345 }
21346
21347 if (GET_MODE_SIZE (imode) == 32)
21348 {
21349 tmp = gen_reg_rtx (halfmode);
21350 emit_insn (extract (tmp, src));
21351 }
21352 else if (high_p)
21353 {
21354 /* Shift higher 8 bytes to lower 8 bytes. */
21355 tmp = gen_reg_rtx (V1TImode);
21356 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
21357 GEN_INT (64)));
21358 tmp = gen_lowpart (imode, tmp);
21359 }
21360 else
21361 tmp = src;
21362
21363 emit_insn (unpack (dest, tmp));
21364 }
21365 else
21366 {
21367 rtx (*unpack)(rtx, rtx, rtx);
21368
21369 switch (imode)
21370 {
21371 case V16QImode:
21372 if (high_p)
21373 unpack = gen_vec_interleave_highv16qi;
21374 else
21375 unpack = gen_vec_interleave_lowv16qi;
21376 break;
21377 case V8HImode:
21378 if (high_p)
21379 unpack = gen_vec_interleave_highv8hi;
21380 else
21381 unpack = gen_vec_interleave_lowv8hi;
21382 break;
21383 case V4SImode:
21384 if (high_p)
21385 unpack = gen_vec_interleave_highv4si;
21386 else
21387 unpack = gen_vec_interleave_lowv4si;
21388 break;
21389 default:
21390 gcc_unreachable ();
21391 }
21392
21393 if (unsigned_p)
21394 tmp = force_reg (imode, CONST0_RTX (imode));
21395 else
21396 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
21397 src, pc_rtx, pc_rtx);
21398
21399 rtx tmp2 = gen_reg_rtx (imode);
21400 emit_insn (unpack (tmp2, src, tmp));
21401 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
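/* Illustration of this pre-SSE4.1 path (informal): to sign-extend
   the low half of a V8HImode vector, TMP is first set per element to
   all ones when the element is negative (the GT compare against zero),
   and the interleave-low then pairs each element with its sign word:

     src = { a0 a1 a2 a3 a4 a5 a6 a7 }
     tmp = { s0 s1 s2 s3 s4 s5 s6 s7 }      ; si = ai < 0 ? -1 : 0
     dst = { a0 s0 a1 s1 a2 s2 a3 s3 }      ; read as 4 x 32-bit

   which is exactly the sign-extended V4SImode value on a
   little-endian target.  */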
21402 }
21403 }
21404
21405 /* Expand conditional increment or decrement using adc/sbb instructions.
21406 The default case using setcc followed by the conditional move can be
21407 done by generic code. */
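/* For instance (illustrative only), "x += (a < b)" with unsigned
   operands can be emitted as

       cmpl  b, a        ; CF = (a < b)
       adcl  $0, x       ; x += CF

   and "x -= (a < b)" likewise as a cmp followed by sbbl $0, x.  */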
21408 bool
21409 ix86_expand_int_addcc (rtx operands[])
21410 {
21411 enum rtx_code code = GET_CODE (operands[1]);
21412 rtx flags;
21413 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
21414 rtx compare_op;
21415 rtx val = const0_rtx;
21416 bool fpcmp = false;
21417 enum machine_mode mode;
21418 rtx op0 = XEXP (operands[1], 0);
21419 rtx op1 = XEXP (operands[1], 1);
21420
21421 if (operands[3] != const1_rtx
21422 && operands[3] != constm1_rtx)
21423 return false;
21424 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
21425 return false;
21426 code = GET_CODE (compare_op);
21427
21428 flags = XEXP (compare_op, 0);
21429
21430 if (GET_MODE (flags) == CCFPmode
21431 || GET_MODE (flags) == CCFPUmode)
21432 {
21433 fpcmp = true;
21434 code = ix86_fp_compare_code_to_integer (code);
21435 }
21436
21437 if (code != LTU)
21438 {
21439 val = constm1_rtx;
21440 if (fpcmp)
21441 PUT_CODE (compare_op,
21442 reverse_condition_maybe_unordered
21443 (GET_CODE (compare_op)));
21444 else
21445 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
21446 }
21447
21448 mode = GET_MODE (operands[0]);
21449
21450 /* Construct either adc or sbb insn. */
21451 if ((code == LTU) == (operands[3] == constm1_rtx))
21452 {
21453 switch (mode)
21454 {
21455 case QImode:
21456 insn = gen_subqi3_carry;
21457 break;
21458 case HImode:
21459 insn = gen_subhi3_carry;
21460 break;
21461 case SImode:
21462 insn = gen_subsi3_carry;
21463 break;
21464 case DImode:
21465 insn = gen_subdi3_carry;
21466 break;
21467 default:
21468 gcc_unreachable ();
21469 }
21470 }
21471 else
21472 {
21473 switch (mode)
21474 {
21475 case QImode:
21476 insn = gen_addqi3_carry;
21477 break;
21478 case HImode:
21479 insn = gen_addhi3_carry;
21480 break;
21481 case SImode:
21482 insn = gen_addsi3_carry;
21483 break;
21484 case DImode:
21485 insn = gen_adddi3_carry;
21486 break;
21487 default:
21488 gcc_unreachable ();
21489 }
21490 }
21491 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
21492
21493 return true;
21494 }
21495
21496
21497 /* Split OPERAND into half-mode parts.  Similar to split_double_mode,
21498 but works for floating point parameters and non-offsettable memories.
21499 For pushes, it returns just stack offsets; the values will be saved
21500 in the right order.  At most four parts are generated.  */
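/* A concrete example (illustrative): on a 32-bit target the DFmode
   constant 1.0 splits into two SImode immediates, parts[0] = 0x0 and
   parts[1] = 0x3ff00000, matching the little-endian IEEE layout; an
   XFmode value yields three SImode parts and a TFmode value four.  */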
21501
21502 static int
21503 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
21504 {
21505 int size;
21506
21507 if (!TARGET_64BIT)
21508 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
21509 else
21510 size = (GET_MODE_SIZE (mode) + 4) / 8;
21511
21512 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
21513 gcc_assert (size >= 2 && size <= 4);
21514
21515 /* Optimize constant pool references to immediates.  This is used by fp
21516 moves, which force all constants to memory to allow combining.  */
21517 if (MEM_P (operand) && MEM_READONLY_P (operand))
21518 {
21519 rtx tmp = maybe_get_pool_constant (operand);
21520 if (tmp)
21521 operand = tmp;
21522 }
21523
21524 if (MEM_P (operand) && !offsettable_memref_p (operand))
21525 {
21526 /* The only non-offsettable memories we handle are pushes.  */
21527 int ok = push_operand (operand, VOIDmode);
21528
21529 gcc_assert (ok);
21530
21531 operand = copy_rtx (operand);
21532 PUT_MODE (operand, word_mode);
21533 parts[0] = parts[1] = parts[2] = parts[3] = operand;
21534 return size;
21535 }
21536
21537 if (GET_CODE (operand) == CONST_VECTOR)
21538 {
21539 enum machine_mode imode = int_mode_for_mode (mode);
21540 /* Caution: if we looked through a constant pool memory above,
21541 the operand may actually have a different mode now. That's
21542 ok, since we want to pun this all the way back to an integer. */
21543 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
21544 gcc_assert (operand != NULL);
21545 mode = imode;
21546 }
21547
21548 if (!TARGET_64BIT)
21549 {
21550 if (mode == DImode)
21551 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21552 else
21553 {
21554 int i;
21555
21556 if (REG_P (operand))
21557 {
21558 gcc_assert (reload_completed);
21559 for (i = 0; i < size; i++)
21560 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
21561 }
21562 else if (offsettable_memref_p (operand))
21563 {
21564 operand = adjust_address (operand, SImode, 0);
21565 parts[0] = operand;
21566 for (i = 1; i < size; i++)
21567 parts[i] = adjust_address (operand, SImode, 4 * i);
21568 }
21569 else if (GET_CODE (operand) == CONST_DOUBLE)
21570 {
21571 REAL_VALUE_TYPE r;
21572 long l[4];
21573
21574 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21575 switch (mode)
21576 {
21577 case TFmode:
21578 real_to_target (l, &r, mode);
21579 parts[3] = gen_int_mode (l[3], SImode);
21580 parts[2] = gen_int_mode (l[2], SImode);
21581 break;
21582 case XFmode:
21583 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21584 long double may not be 80-bit. */
21585 real_to_target (l, &r, mode);
21586 parts[2] = gen_int_mode (l[2], SImode);
21587 break;
21588 case DFmode:
21589 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21590 break;
21591 default:
21592 gcc_unreachable ();
21593 }
21594 parts[1] = gen_int_mode (l[1], SImode);
21595 parts[0] = gen_int_mode (l[0], SImode);
21596 }
21597 else
21598 gcc_unreachable ();
21599 }
21600 }
21601 else
21602 {
21603 if (mode == TImode)
21604 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21605 if (mode == XFmode || mode == TFmode)
21606 {
21607 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21608 if (REG_P (operand))
21609 {
21610 gcc_assert (reload_completed);
21611 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21612 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21613 }
21614 else if (offsettable_memref_p (operand))
21615 {
21616 operand = adjust_address (operand, DImode, 0);
21617 parts[0] = operand;
21618 parts[1] = adjust_address (operand, upper_mode, 8);
21619 }
21620 else if (GET_CODE (operand) == CONST_DOUBLE)
21621 {
21622 REAL_VALUE_TYPE r;
21623 long l[4];
21624
21625 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21626 real_to_target (l, &r, mode);
21627
21628 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21629 if (HOST_BITS_PER_WIDE_INT >= 64)
21630 parts[0]
21631 = gen_int_mode
21632 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21633 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21634 DImode);
21635 else
21636 parts[0] = immed_double_const (l[0], l[1], DImode);
21637
21638 if (upper_mode == SImode)
21639 parts[1] = gen_int_mode (l[2], SImode);
21640 else if (HOST_BITS_PER_WIDE_INT >= 64)
21641 parts[1]
21642 = gen_int_mode
21643 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21644 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21645 DImode);
21646 else
21647 parts[1] = immed_double_const (l[2], l[3], DImode);
21648 }
21649 else
21650 gcc_unreachable ();
21651 }
21652 }
21653
21654 return size;
21655 }
21656
21657 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21658 Operand 0 is the destination and operand 1 the source; all required
21659 insns are emitted directly.  */
21661
21662 void
21663 ix86_split_long_move (rtx operands[])
21664 {
21665 rtx part[2][4];
21666 int nparts, i, j;
21667 int push = 0;
21668 int collisions = 0;
21669 enum machine_mode mode = GET_MODE (operands[0]);
21670 bool collisionparts[4];
21671
21672 /* The DFmode expanders may ask us to move a double.
21673 For a 64-bit target this is a single move.  By hiding that fact
21674 here we simplify the i386.md splitters.  */
21675 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21676 {
21677 /* Optimize constant pool references into immediates. This is used by
21678 FP moves, which force all constants to memory to allow combining. */
21679
21680 if (MEM_P (operands[1])
21681 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21682 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21683 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21684 if (push_operand (operands[0], VOIDmode))
21685 {
21686 operands[0] = copy_rtx (operands[0]);
21687 PUT_MODE (operands[0], word_mode);
21688 }
21689 else
21690 operands[0] = gen_lowpart (DImode, operands[0]);
21691 operands[1] = gen_lowpart (DImode, operands[1]);
21692 emit_move_insn (operands[0], operands[1]);
21693 return;
21694 }
21695
21696 /* The only non-offsettable memory we handle is push. */
21697 if (push_operand (operands[0], VOIDmode))
21698 push = 1;
21699 else
21700 gcc_assert (!MEM_P (operands[0])
21701 || offsettable_memref_p (operands[0]));
21702
21703 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21704 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21705
21706 /* When emitting a push, watch out for source operands located on the stack. */
21707 if (push && MEM_P (operands[1])
21708 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21709 {
21710 rtx src_base = XEXP (part[1][nparts - 1], 0);
21711
21712 /* Compensate for the stack decrement by 4. */
21713 if (!TARGET_64BIT && nparts == 3
21714 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21715 src_base = plus_constant (Pmode, src_base, 4);
21716
21717 /* src_base refers to the stack pointer and is
21718 automatically decreased by each emitted push. */
21719 for (i = 0; i < nparts; i++)
21720 part[1][i] = change_address (part[1][i],
21721 GET_MODE (part[1][i]), src_base);
21722 }
21723
21724 /* We need to do the copy in the right order in case an address register
21725 of the source overlaps the destination. */
21726 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21727 {
21728 rtx tmp;
21729
21730 for (i = 0; i < nparts; i++)
21731 {
21732 collisionparts[i]
21733 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21734 if (collisionparts[i])
21735 collisions++;
21736 }
21737
21738 /* Collision in the middle part can be handled by reordering. */
21739 if (collisions == 1 && nparts == 3 && collisionparts [1])
21740 {
21741 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21742 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21743 }
21744 else if (collisions == 1
21745 && nparts == 4
21746 && (collisionparts [1] || collisionparts [2]))
21747 {
21748 if (collisionparts [1])
21749 {
21750 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21751 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21752 }
21753 else
21754 {
21755 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21756 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21757 }
21758 }
21759
21760 /* If there are more collisions, we can't handle it by reordering.
21761 Do an lea to the last part and use only one colliding move. */
21762 else if (collisions > 1)
21763 {
21764 rtx base;
21765
21766 collisions = 1;
21767
21768 base = part[0][nparts - 1];
21769
21770 /* Handle the case when the last part isn't valid for lea.
21771 Happens in 64-bit mode storing the 12-byte XFmode. */
21772 if (GET_MODE (base) != Pmode)
21773 base = gen_rtx_REG (Pmode, REGNO (base));
21774
21775 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21776 part[1][0] = replace_equiv_address (part[1][0], base);
21777 for (i = 1; i < nparts; i++)
21778 {
21779 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21780 part[1][i] = replace_equiv_address (part[1][i], tmp);
21781 }
21782 }
21783 }
21784
21785 if (push)
21786 {
21787 if (!TARGET_64BIT)
21788 {
21789 if (nparts == 3)
21790 {
21791 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21792 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21793 stack_pointer_rtx, GEN_INT (-4)));
21794 emit_move_insn (part[0][2], part[1][2]);
21795 }
21796 else if (nparts == 4)
21797 {
21798 emit_move_insn (part[0][3], part[1][3]);
21799 emit_move_insn (part[0][2], part[1][2]);
21800 }
21801 }
21802 else
21803 {
21804 /* In 64-bit mode we don't have a 32-bit push available. In case this is
21805 a register, it is OK: we will just use the larger counterpart. We also
21806 retype memory; these come from an attempt to avoid a REX prefix when
21807 moving the second half of a TFmode value. */
21808 if (GET_MODE (part[1][1]) == SImode)
21809 {
21810 switch (GET_CODE (part[1][1]))
21811 {
21812 case MEM:
21813 part[1][1] = adjust_address (part[1][1], DImode, 0);
21814 break;
21815
21816 case REG:
21817 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21818 break;
21819
21820 default:
21821 gcc_unreachable ();
21822 }
21823
21824 if (GET_MODE (part[1][0]) == SImode)
21825 part[1][0] = part[1][1];
21826 }
21827 }
21828 emit_move_insn (part[0][1], part[1][1]);
21829 emit_move_insn (part[0][0], part[1][0]);
21830 return;
21831 }
21832
21833 /* Choose the correct order so that we do not overwrite the source before it is copied. */
21834 if ((REG_P (part[0][0])
21835 && REG_P (part[1][1])
21836 && (REGNO (part[0][0]) == REGNO (part[1][1])
21837 || (nparts == 3
21838 && REGNO (part[0][0]) == REGNO (part[1][2]))
21839 || (nparts == 4
21840 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21841 || (collisions > 0
21842 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21843 {
21844 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21845 {
21846 operands[2 + i] = part[0][j];
21847 operands[6 + i] = part[1][j];
21848 }
21849 }
21850 else
21851 {
21852 for (i = 0; i < nparts; i++)
21853 {
21854 operands[2 + i] = part[0][i];
21855 operands[6 + i] = part[1][i];
21856 }
21857 }
21858
21859 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21860 if (optimize_insn_for_size_p ())
21861 {
21862 for (j = 0; j < nparts - 1; j++)
21863 if (CONST_INT_P (operands[6 + j])
21864 && operands[6 + j] != const0_rtx
21865 && REG_P (operands[2 + j]))
21866 for (i = j; i < nparts - 1; i++)
21867 if (CONST_INT_P (operands[7 + i])
21868 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21869 operands[7 + i] = operands[2 + j];
21870 }
21871
21872 for (i = 0; i < nparts; i++)
21873 emit_move_insn (operands[2 + i], operands[6 + i]);
21874
21875 return;
21876 }
21877
21878 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21879 left shift by a constant, either using a single shift or
21880 a sequence of add instructions. */
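/* Illustrative example (added for exposition): with MODE == DImode the
   OPERAND is an SImode part, so a left shift by 2 may be expanded as either

       addl %eax, %eax       .. two self-adds when adds are cheap enough
       addl %eax, %eax

   or a single "shll $2, %eax", whichever the cost check below prefers.  */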
21881
21882 static void
21883 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21884 {
21885 rtx (*insn)(rtx, rtx, rtx);
21886
21887 if (count == 1
21888 || (count * ix86_cost->add <= ix86_cost->shift_const
21889 && !optimize_insn_for_size_p ()))
21890 {
21891 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21892 while (count-- > 0)
21893 emit_insn (insn (operand, operand, operand));
21894 }
21895 else
21896 {
21897 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21898 emit_insn (insn (operand, operand, GEN_INT (count)));
21899 }
21900 }
21901
21902 void
21903 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21904 {
21905 rtx (*gen_ashl3)(rtx, rtx, rtx);
21906 rtx (*gen_shld)(rtx, rtx, rtx);
21907 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21908
21909 rtx low[2], high[2];
21910 int count;
21911
21912 if (CONST_INT_P (operands[2]))
21913 {
21914 split_double_mode (mode, operands, 2, low, high);
21915 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21916
21917 if (count >= half_width)
21918 {
21919 emit_move_insn (high[0], low[1]);
21920 emit_move_insn (low[0], const0_rtx);
21921
21922 if (count > half_width)
21923 ix86_expand_ashl_const (high[0], count - half_width, mode);
21924 }
21925 else
21926 {
21927 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21928
21929 if (!rtx_equal_p (operands[0], operands[1]))
21930 emit_move_insn (operands[0], operands[1]);
21931
21932 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21933 ix86_expand_ashl_const (low[0], count, mode);
21934 }
21935 return;
21936 }
21937
21938 split_double_mode (mode, operands, 1, low, high);
21939
21940 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21941
21942 if (operands[1] == const1_rtx)
21943 {
21944 /* Assuming we've chosen QImode-capable registers, 1 << N
21945 can be done with two 32/64-bit shifts, no branches, no cmoves. */
21946 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21947 {
21948 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21949
21950 ix86_expand_clear (low[0]);
21951 ix86_expand_clear (high[0]);
21952 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21953
21954 d = gen_lowpart (QImode, low[0]);
21955 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21956 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21957 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21958
21959 d = gen_lowpart (QImode, high[0]);
21960 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21961 s = gen_rtx_NE (QImode, flags, const0_rtx);
21962 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21963 }
21964
21965 /* Otherwise, we can get the same results by manually performing
21966 a bit extract operation on bit 5/6, and then performing the two
21967 shifts. The two methods of getting 0/1 into low/high are exactly
21968 the same size. Avoiding the shift in the bit extract case helps
21969 pentium4 a bit; no one else seems to care much either way. */
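/* Illustrative sketch (added for exposition): for DImode on a 32-bit
   target the branch below computes, in effect,

       high = (count >> 5) & 1;   .. bit 5 selects the upper half
       low  = high ^ 1;
       low <<= count;             .. the hardware masks the shift count
       high <<= count;

   so no branch or cmove is needed to place the single set bit.  */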
21970 else
21971 {
21972 enum machine_mode half_mode;
21973 rtx (*gen_lshr3)(rtx, rtx, rtx);
21974 rtx (*gen_and3)(rtx, rtx, rtx);
21975 rtx (*gen_xor3)(rtx, rtx, rtx);
21976 HOST_WIDE_INT bits;
21977 rtx x;
21978
21979 if (mode == DImode)
21980 {
21981 half_mode = SImode;
21982 gen_lshr3 = gen_lshrsi3;
21983 gen_and3 = gen_andsi3;
21984 gen_xor3 = gen_xorsi3;
21985 bits = 5;
21986 }
21987 else
21988 {
21989 half_mode = DImode;
21990 gen_lshr3 = gen_lshrdi3;
21991 gen_and3 = gen_anddi3;
21992 gen_xor3 = gen_xordi3;
21993 bits = 6;
21994 }
21995
21996 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21997 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21998 else
21999 x = gen_lowpart (half_mode, operands[2]);
22000 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
22001
22002 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
22003 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
22004 emit_move_insn (low[0], high[0]);
22005 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
22006 }
22007
22008 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
22009 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
22010 return;
22011 }
22012
22013 if (operands[1] == constm1_rtx)
22014 {
22015 /* For -1 << N, we can avoid the shld instruction, because we
22016 know that we're shifting 0...31/63 ones into a -1. */
22017 emit_move_insn (low[0], constm1_rtx);
22018 if (optimize_insn_for_size_p ())
22019 emit_move_insn (high[0], low[0]);
22020 else
22021 emit_move_insn (high[0], constm1_rtx);
22022 }
22023 else
22024 {
22025 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
22026
22027 if (!rtx_equal_p (operands[0], operands[1]))
22028 emit_move_insn (operands[0], operands[1]);
22029
22030 split_double_mode (mode, operands, 1, low, high);
22031 emit_insn (gen_shld (high[0], low[0], operands[2]));
22032 }
22033
22034 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
22035
22036 if (TARGET_CMOVE && scratch)
22037 {
22038 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22039 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22040
22041 ix86_expand_clear (scratch);
22042 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
22043 }
22044 else
22045 {
22046 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22047 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22048
22049 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
22050 }
22051 }
22052
22053 void
22054 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
22055 {
22056 rtx (*gen_ashr3)(rtx, rtx, rtx)
22057 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
22058 rtx (*gen_shrd)(rtx, rtx, rtx);
22059 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22060
22061 rtx low[2], high[2];
22062 int count;
22063
22064 if (CONST_INT_P (operands[2]))
22065 {
22066 split_double_mode (mode, operands, 2, low, high);
22067 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22068
22069 if (count == GET_MODE_BITSIZE (mode) - 1)
22070 {
22071 emit_move_insn (high[0], high[1]);
22072 emit_insn (gen_ashr3 (high[0], high[0],
22073 GEN_INT (half_width - 1)));
22074 emit_move_insn (low[0], high[0]);
22075
22076 }
22077 else if (count >= half_width)
22078 {
22079 emit_move_insn (low[0], high[1]);
22080 emit_move_insn (high[0], low[0]);
22081 emit_insn (gen_ashr3 (high[0], high[0],
22082 GEN_INT (half_width - 1)));
22083
22084 if (count > half_width)
22085 emit_insn (gen_ashr3 (low[0], low[0],
22086 GEN_INT (count - half_width)));
22087 }
22088 else
22089 {
22090 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22091
22092 if (!rtx_equal_p (operands[0], operands[1]))
22093 emit_move_insn (operands[0], operands[1]);
22094
22095 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
22096 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
22097 }
22098 }
22099 else
22100 {
22101 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22102
22103 if (!rtx_equal_p (operands[0], operands[1]))
22104 emit_move_insn (operands[0], operands[1]);
22105
22106 split_double_mode (mode, operands, 1, low, high);
22107
22108 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22109 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
22110
22111 if (TARGET_CMOVE && scratch)
22112 {
22113 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22114 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22115
22116 emit_move_insn (scratch, high[0]);
22117 emit_insn (gen_ashr3 (scratch, scratch,
22118 GEN_INT (half_width - 1)));
22119 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22120 scratch));
22121 }
22122 else
22123 {
22124 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
22125 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
22126
22127 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
22128 }
22129 }
22130 }
22131
22132 void
22133 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
22134 {
22135 rtx (*gen_lshr3)(rtx, rtx, rtx)
22136 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
22137 rtx (*gen_shrd)(rtx, rtx, rtx);
22138 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22139
22140 rtx low[2], high[2];
22141 int count;
22142
22143 if (CONST_INT_P (operands[2]))
22144 {
22145 split_double_mode (mode, operands, 2, low, high);
22146 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22147
22148 if (count >= half_width)
22149 {
22150 emit_move_insn (low[0], high[1]);
22151 ix86_expand_clear (high[0]);
22152
22153 if (count > half_width)
22154 emit_insn (gen_lshr3 (low[0], low[0],
22155 GEN_INT (count - half_width)));
22156 }
22157 else
22158 {
22159 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22160
22161 if (!rtx_equal_p (operands[0], operands[1]))
22162 emit_move_insn (operands[0], operands[1]);
22163
22164 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
22165 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
22166 }
22167 }
22168 else
22169 {
22170 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22171
22172 if (!rtx_equal_p (operands[0], operands[1]))
22173 emit_move_insn (operands[0], operands[1]);
22174
22175 split_double_mode (mode, operands, 1, low, high);
22176
22177 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22178 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
22179
22180 if (TARGET_CMOVE && scratch)
22181 {
22182 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22183 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22184
22185 ix86_expand_clear (scratch);
22186 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22187 scratch));
22188 }
22189 else
22190 {
22191 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22192 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22193
22194 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
22195 }
22196 }
22197 }
22198
22199 /* Predict just emitted jump instruction to be taken with probability PROB. */
22200 static void
22201 predict_jump (int prob)
22202 {
22203 rtx insn = get_last_insn ();
22204 gcc_assert (JUMP_P (insn));
22205 add_int_reg_note (insn, REG_BR_PROB, prob);
22206 }
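/* Usage note (added for exposition): for example,
   predict_jump (REG_BR_PROB_BASE * 90 / 100) annotates the jump just
   emitted as being taken roughly 90% of the time, which later guides
   basic-block layout and other profile-driven decisions.  */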
22207
22208 /* Helper function for the string operations below. Test whether VARIABLE
22209 is aligned to VALUE bytes. If so, jump to the returned label. */
22210 static rtx
22211 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
22212 {
22213 rtx label = gen_label_rtx ();
22214 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
22215 if (GET_MODE (variable) == DImode)
22216 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
22217 else
22218 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
22219 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
22220 1, label);
22221 if (epilogue)
22222 predict_jump (REG_BR_PROB_BASE * 50 / 100);
22223 else
22224 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22225 return label;
22226 }
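/* Usage sketch (added for exposition); a typical caller looks like

       label = ix86_expand_aligntest (count, 4, true);
       ... emit code handling a 4-byte chunk ...
       emit_label (label);

   i.e. the chunk-handling code sits on the fall-through path and is
   skipped when the tested bit of COUNT is clear.  */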
22227
22228 /* Decrease COUNTREG by VALUE. */
22229 static void
22230 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
22231 {
22232 rtx (*gen_add)(rtx, rtx, rtx)
22233 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
22234
22235 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
22236 }
22237
22238 /* Zero extend the possibly-SImode EXP into a Pmode register. */
22239 rtx
22240 ix86_zero_extend_to_Pmode (rtx exp)
22241 {
22242 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
22243 }
22244
22245 /* Divide COUNTREG by SCALE. */
22246 static rtx
22247 scale_counter (rtx countreg, int scale)
22248 {
22249 rtx sc;
22250
22251 if (scale == 1)
22252 return countreg;
22253 if (CONST_INT_P (countreg))
22254 return GEN_INT (INTVAL (countreg) / scale);
22255 gcc_assert (REG_P (countreg));
22256
22257 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
22258 GEN_INT (exact_log2 (scale)),
22259 NULL, 1, OPTAB_DIRECT);
22260 return sc;
22261 }
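/* Example (added for exposition): scale_counter (count, 4) returns either
   the constant count / 4 or code computing count >> 2, which is what the
   "rep" based expansion below wants as its iteration count.  */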
22262
22263 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
22264 DImode for constant loop counts. */
22265
22266 static enum machine_mode
22267 counter_mode (rtx count_exp)
22268 {
22269 if (GET_MODE (count_exp) != VOIDmode)
22270 return GET_MODE (count_exp);
22271 if (!CONST_INT_P (count_exp))
22272 return Pmode;
22273 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
22274 return DImode;
22275 return SImode;
22276 }
22277
22278 /* Copy the address to a Pmode register. This is used for x32 to
22279 truncate DImode TLS address to a SImode register. */
22280
22281 static rtx
22282 ix86_copy_addr_to_reg (rtx addr)
22283 {
22284 if (GET_MODE (addr) == Pmode)
22285 return copy_addr_to_reg (addr);
22286 else
22287 {
22288 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
22289 return gen_rtx_SUBREG (SImode, copy_to_mode_reg (DImode, addr), 0);
22290 }
22291 }
22292
22293 /* When ISSETMEM is FALSE, output a simple loop moving the memory pointed to
22294 by SRCPTR to DESTPTR in chunks of MODE unrolled UNROLL times; the overall
22295 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
22296 equivalent loop setting memory to VALUE (which is expected to be in MODE).
22297
22298 The size is rounded down to a whole number of chunks moved at once.
22299 SRCMEM and DESTMEM provide the MEM rtxes used for proper aliasing info. */
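/* Illustrative sketch (added for exposition) of the code generated below
   for the move case with UNROLL == 1, in C-like pseudocode:

       size = count & -piece_size;
       iter = 0;
     top:
       dest[iter] = src[iter];          .. one chunk of MODE
       iter += piece_size;
       if (iter < size) goto top;
       destptr += size;  srcptr += size;
     out:

   The set case is the same with the load replaced by VALUE.  */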
22300
22301
22302 static void
22303 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
22304 rtx destptr, rtx srcptr, rtx value,
22305 rtx count, enum machine_mode mode, int unroll,
22306 int expected_size, bool issetmem)
22307 {
22308 rtx out_label, top_label, iter, tmp;
22309 enum machine_mode iter_mode = counter_mode (count);
22310 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
22311 rtx piece_size = GEN_INT (piece_size_n);
22312 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
22313 rtx size;
22314 int i;
22315
22316 top_label = gen_label_rtx ();
22317 out_label = gen_label_rtx ();
22318 iter = gen_reg_rtx (iter_mode);
22319
22320 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
22321 NULL, 1, OPTAB_DIRECT);
22322 /* Those two should combine. */
22323 if (piece_size == const1_rtx)
22324 {
22325 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
22326 true, out_label);
22327 predict_jump (REG_BR_PROB_BASE * 10 / 100);
22328 }
22329 emit_move_insn (iter, const0_rtx);
22330
22331 emit_label (top_label);
22332
22333 tmp = convert_modes (Pmode, iter_mode, iter, true);
22334
22335 /* This assert could be relaxed; in that case we would need to compute
22336 the smallest power of two that contains PIECE_SIZE_N and pass it to
22337 offset_address. */
22338 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
22339 destmem = offset_address (destmem, tmp, piece_size_n);
22340 destmem = adjust_address (destmem, mode, 0);
22341
22342 if (!issetmem)
22343 {
22344 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
22345 srcmem = adjust_address (srcmem, mode, 0);
22346
22347 /* When unrolling for chips that reorder memory reads and writes,
22348 we can save registers by using a single temporary.
22349 Also, using 4 temporaries is overkill in 32-bit mode. */
22350 if (!TARGET_64BIT && 0)
22351 {
22352 for (i = 0; i < unroll; i++)
22353 {
22354 if (i)
22355 {
22356 destmem =
22357 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22358 srcmem =
22359 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22360 }
22361 emit_move_insn (destmem, srcmem);
22362 }
22363 }
22364 else
22365 {
22366 rtx tmpreg[4];
22367 gcc_assert (unroll <= 4);
22368 for (i = 0; i < unroll; i++)
22369 {
22370 tmpreg[i] = gen_reg_rtx (mode);
22371 if (i)
22372 {
22373 srcmem =
22374 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22375 }
22376 emit_move_insn (tmpreg[i], srcmem);
22377 }
22378 for (i = 0; i < unroll; i++)
22379 {
22380 if (i)
22381 {
22382 destmem =
22383 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22384 }
22385 emit_move_insn (destmem, tmpreg[i]);
22386 }
22387 }
22388 }
22389 else
22390 for (i = 0; i < unroll; i++)
22391 {
22392 if (i)
22393 destmem =
22394 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22395 emit_move_insn (destmem, value);
22396 }
22397
22398 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
22399 true, OPTAB_LIB_WIDEN);
22400 if (tmp != iter)
22401 emit_move_insn (iter, tmp);
22402
22403 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
22404 true, top_label);
22405 if (expected_size != -1)
22406 {
22407 expected_size /= GET_MODE_SIZE (mode) * unroll;
22408 if (expected_size == 0)
22409 predict_jump (0);
22410 else if (expected_size > REG_BR_PROB_BASE)
22411 predict_jump (REG_BR_PROB_BASE - 1);
22412 else
22413 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
22414 }
22415 else
22416 predict_jump (REG_BR_PROB_BASE * 80 / 100);
22417 iter = ix86_zero_extend_to_Pmode (iter);
22418 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
22419 true, OPTAB_LIB_WIDEN);
22420 if (tmp != destptr)
22421 emit_move_insn (destptr, tmp);
22422 if (!issetmem)
22423 {
22424 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
22425 true, OPTAB_LIB_WIDEN);
22426 if (tmp != srcptr)
22427 emit_move_insn (srcptr, tmp);
22428 }
22429 emit_label (out_label);
22430 }
22431
22432 /* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM argument.
22433 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
22434 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
22435 In the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
22436 ORIG_VALUE is the original value passed to memset to fill the memory with.
22437 Other arguments have the same meaning as for the previous function. */
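/* Illustrative sketch (added for exposition): for a copy with MODE ==
   SImode the sequence emitted below is essentially

       countreg = count >> 2;
       rep movsd                .. pointers advance by 4 * countreg

   with DESTEXP/SRCEXP describing the final pointer values so that the
   rest of the compiler sees the side effects of the rep instruction.  */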
22438
22439 static void
22440 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
22441 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
22442 rtx count,
22443 enum machine_mode mode, bool issetmem)
22444 {
22445 rtx destexp;
22446 rtx srcexp;
22447 rtx countreg;
22448 HOST_WIDE_INT rounded_count;
22449
22450 /* If possible, it is shorter to use rep movs.
22451 TODO: Maybe it is better to move this logic to decide_alg. */
22452 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
22453 && (!issetmem || orig_value == const0_rtx))
22454 mode = SImode;
22455
22456 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22457 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22458
22459 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
22460 GET_MODE_SIZE (mode)));
22461 if (mode != QImode)
22462 {
22463 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22464 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22465 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22466 }
22467 else
22468 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22469 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
22470 {
22471 rounded_count = (INTVAL (count)
22472 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22473 destmem = shallow_copy_rtx (destmem);
22474 set_mem_size (destmem, rounded_count);
22475 }
22476 else if (MEM_SIZE_KNOWN_P (destmem))
22477 clear_mem_size (destmem);
22478
22479 if (issetmem)
22480 {
22481 value = force_reg (mode, gen_lowpart (mode, value));
22482 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
22483 }
22484 else
22485 {
22486 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
22487 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
22488 if (mode != QImode)
22489 {
22490 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
22491 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22492 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
22493 }
22494 else
22495 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
22496 if (CONST_INT_P (count))
22497 {
22498 rounded_count = (INTVAL (count)
22499 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22500 srcmem = shallow_copy_rtx (srcmem);
22501 set_mem_size (srcmem, rounded_count);
22502 }
22503 else
22504 {
22505 if (MEM_SIZE_KNOWN_P (srcmem))
22506 clear_mem_size (srcmem);
22507 }
22508 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
22509 destexp, srcexp));
22510 }
22511 }
22512
22513 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
22514 DESTMEM.
22515 SRCMEM is passed by pointer so that it can be updated on return.
22516 The return value is the updated DESTMEM. */
22517 static rtx
22518 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
22519 HOST_WIDE_INT size_to_move)
22520 {
22521 rtx dst = destmem, src = *srcmem, adjust, tempreg;
22522 enum insn_code code;
22523 enum machine_mode move_mode;
22524 int piece_size, i;
22525
22526 /* Find the widest mode in which we could perform moves.
22527 Start with the biggest power of 2 not larger than SIZE_TO_MOVE and halve
22528 it until a move of that size is supported. */
22529 piece_size = 1 << floor_log2 (size_to_move);
22530 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22531 code = optab_handler (mov_optab, move_mode);
22532 while (code == CODE_FOR_nothing && piece_size > 1)
22533 {
22534 piece_size >>= 1;
22535 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22536 code = optab_handler (mov_optab, move_mode);
22537 }
22538
22539 /* Find the corresponding vector mode with the same size as MOVE_MODE.
22540 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
22541 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
22542 {
22543 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
22544 move_mode = mode_for_vector (word_mode, nunits);
22545 code = optab_handler (mov_optab, move_mode);
22546 if (code == CODE_FOR_nothing)
22547 {
22548 move_mode = word_mode;
22549 piece_size = GET_MODE_SIZE (move_mode);
22550 code = optab_handler (mov_optab, move_mode);
22551 }
22552 }
22553 gcc_assert (code != CODE_FOR_nothing);
22554
22555 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22556 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
22557
22558 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
22559 gcc_assert (size_to_move % piece_size == 0);
22560 adjust = GEN_INT (piece_size);
22561 for (i = 0; i < size_to_move; i += piece_size)
22562 {
22563 /* We move from memory to memory, so we'll need to do it via
22564 a temporary register. */
22565 tempreg = gen_reg_rtx (move_mode);
22566 emit_insn (GEN_FCN (code) (tempreg, src));
22567 emit_insn (GEN_FCN (code) (dst, tempreg));
22568
22569 emit_move_insn (destptr,
22570 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22571 emit_move_insn (srcptr,
22572 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
22573
22574 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22575 piece_size);
22576 src = adjust_automodify_address_nv (src, move_mode, srcptr,
22577 piece_size);
22578 }
22579
22580 /* Update DST and SRC rtx. */
22581 *srcmem = src;
22582 return dst;
22583 }
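/* Example (added for exposition): emit_memmov with SIZE_TO_MOVE == 4
   loads one SImode chunk from *SRCMEM into a fresh pseudo, stores it to
   DESTMEM, and advances both DESTPTR and SRCPTR by 4; larger sizes simply
   repeat this with the widest mode that has a usable move pattern.  */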
22584
22585 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
22586 static void
22587 expand_movmem_epilogue (rtx destmem, rtx srcmem,
22588 rtx destptr, rtx srcptr, rtx count, int max_size)
22589 {
22590 rtx src, dest;
22591 if (CONST_INT_P (count))
22592 {
22593 HOST_WIDE_INT countval = INTVAL (count);
22594 HOST_WIDE_INT epilogue_size = countval % max_size;
22595 int i;
22596
22597 /* For now MAX_SIZE should be a power of 2. This assert could be
22598 relaxed, but it would require a somewhat more complicated epilogue
22599 expansion. */
22600 gcc_assert ((max_size & (max_size - 1)) == 0);
22601 for (i = max_size; i >= 1; i >>= 1)
22602 {
22603 if (epilogue_size & i)
22604 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22605 }
22606 return;
22607 }
22608 if (max_size > 8)
22609 {
22610 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
22611 count, 1, OPTAB_DIRECT);
22612 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22613 count, QImode, 1, 4, false);
22614 return;
22615 }
22616
22617 /* When single stringop instructions (TARGET_SINGLE_STRINGOP) are available,
22618 we can cheaply advance the dest and src pointers. Otherwise we save code
22619 size by maintaining an offset (zero is readily available from the preceding
22620 rep operation) and using x86 addressing modes. */
22621 if (TARGET_SINGLE_STRINGOP)
22622 {
22623 if (max_size > 4)
22624 {
22625 rtx label = ix86_expand_aligntest (count, 4, true);
22626 src = change_address (srcmem, SImode, srcptr);
22627 dest = change_address (destmem, SImode, destptr);
22628 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22629 emit_label (label);
22630 LABEL_NUSES (label) = 1;
22631 }
22632 if (max_size > 2)
22633 {
22634 rtx label = ix86_expand_aligntest (count, 2, true);
22635 src = change_address (srcmem, HImode, srcptr);
22636 dest = change_address (destmem, HImode, destptr);
22637 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22638 emit_label (label);
22639 LABEL_NUSES (label) = 1;
22640 }
22641 if (max_size > 1)
22642 {
22643 rtx label = ix86_expand_aligntest (count, 1, true);
22644 src = change_address (srcmem, QImode, srcptr);
22645 dest = change_address (destmem, QImode, destptr);
22646 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22647 emit_label (label);
22648 LABEL_NUSES (label) = 1;
22649 }
22650 }
22651 else
22652 {
22653 rtx offset = force_reg (Pmode, const0_rtx);
22654 rtx tmp;
22655
22656 if (max_size > 4)
22657 {
22658 rtx label = ix86_expand_aligntest (count, 4, true);
22659 src = change_address (srcmem, SImode, srcptr);
22660 dest = change_address (destmem, SImode, destptr);
22661 emit_move_insn (dest, src);
22662 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22663 true, OPTAB_LIB_WIDEN);
22664 if (tmp != offset)
22665 emit_move_insn (offset, tmp);
22666 emit_label (label);
22667 LABEL_NUSES (label) = 1;
22668 }
22669 if (max_size > 2)
22670 {
22671 rtx label = ix86_expand_aligntest (count, 2, true);
22672 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22673 src = change_address (srcmem, HImode, tmp);
22674 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22675 dest = change_address (destmem, HImode, tmp);
22676 emit_move_insn (dest, src);
22677 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22678 true, OPTAB_LIB_WIDEN);
22679 if (tmp != offset)
22680 emit_move_insn (offset, tmp);
22681 emit_label (label);
22682 LABEL_NUSES (label) = 1;
22683 }
22684 if (max_size > 1)
22685 {
22686 rtx label = ix86_expand_aligntest (count, 1, true);
22687 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22688 src = change_address (srcmem, QImode, tmp);
22689 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22690 dest = change_address (destmem, QImode, tmp);
22691 emit_move_insn (dest, src);
22692 emit_label (label);
22693 LABEL_NUSES (label) = 1;
22694 }
22695 }
22696 }
22697
22698 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
22699 with the value PROMOTED_VAL.
22700 Unlike emit_memmov above, there is no source operand to update.
22701 The return value is the updated DESTMEM. */
22702 static rtx
22703 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
22704 HOST_WIDE_INT size_to_move)
22705 {
22706 rtx dst = destmem, adjust;
22707 enum insn_code code;
22708 enum machine_mode move_mode;
22709 int piece_size, i;
22710
22711 /* Find the widest mode in which we could perform moves.
22712 Start with the mode of PROMOTED_VAL and reduce it if SIZE_TO_MOVE
22713 is smaller than a move of that size. */
22714 move_mode = GET_MODE (promoted_val);
22715 if (move_mode == VOIDmode)
22716 move_mode = QImode;
22717 if (size_to_move < GET_MODE_SIZE (move_mode))
22718 {
22719 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
22720 promoted_val = gen_lowpart (move_mode, promoted_val);
22721 }
22722 piece_size = GET_MODE_SIZE (move_mode);
22723 code = optab_handler (mov_optab, move_mode);
22724 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
22725
22726 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22727
22728 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
22729 gcc_assert (size_to_move % piece_size == 0);
22730 adjust = GEN_INT (piece_size);
22731 for (i = 0; i < size_to_move; i += piece_size)
22732 {
22733 if (piece_size <= GET_MODE_SIZE (word_mode))
22734 {
22735 emit_insn (gen_strset (destptr, dst, promoted_val));
22736 continue;
22737 }
22738
22739 emit_insn (GEN_FCN (code) (dst, promoted_val));
22740
22741 emit_move_insn (destptr,
22742 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22743
22744 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22745 piece_size);
22746 }
22747
22748 /* Update DST rtx. */
22749 return dst;
22750 }
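/* Example (added for exposition): with PROMOTED_VAL holding 0xabababab in
   SImode and SIZE_TO_MOVE == 8, the loop above emits two 4-byte strset
   stores and leaves DESTPTR advanced by 8.  */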
22751 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22752 static void
22753 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22754 rtx count, int max_size)
22755 {
22756 count =
22757 expand_simple_binop (counter_mode (count), AND, count,
22758 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22759 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22760 gen_lowpart (QImode, value), count, QImode,
22761 1, max_size / 2, true);
22762 }
22763
22764 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22765 static void
22766 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
22767 rtx count, int max_size)
22768 {
22769 rtx dest;
22770
22771 if (CONST_INT_P (count))
22772 {
22773 HOST_WIDE_INT countval = INTVAL (count);
22774 HOST_WIDE_INT epilogue_size = countval % max_size;
22775 int i;
22776
22777 /* For now MAX_SIZE should be a power of 2. This assert could be
22778 relaxed, but it would require a somewhat more complicated epilogue
22779 expansion. */
22780 gcc_assert ((max_size & (max_size - 1)) == 0);
22781 for (i = max_size; i >= 1; i >>= 1)
22782 {
22783 if (epilogue_size & i)
22784 {
22785 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
22786 destmem = emit_memset (destmem, destptr, vec_value, i);
22787 else
22788 destmem = emit_memset (destmem, destptr, value, i);
22789 }
22790 }
22791 return;
22792 }
22793 if (max_size > 32)
22794 {
22795 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22796 return;
22797 }
22798 if (max_size > 16)
22799 {
22800 rtx label = ix86_expand_aligntest (count, 16, true);
22801 if (TARGET_64BIT)
22802 {
22803 dest = change_address (destmem, DImode, destptr);
22804 emit_insn (gen_strset (destptr, dest, value));
22805 emit_insn (gen_strset (destptr, dest, value));
22806 }
22807 else
22808 {
22809 dest = change_address (destmem, SImode, destptr);
22810 emit_insn (gen_strset (destptr, dest, value));
22811 emit_insn (gen_strset (destptr, dest, value));
22812 emit_insn (gen_strset (destptr, dest, value));
22813 emit_insn (gen_strset (destptr, dest, value));
22814 }
22815 emit_label (label);
22816 LABEL_NUSES (label) = 1;
22817 }
22818 if (max_size > 8)
22819 {
22820 rtx label = ix86_expand_aligntest (count, 8, true);
22821 if (TARGET_64BIT)
22822 {
22823 dest = change_address (destmem, DImode, destptr);
22824 emit_insn (gen_strset (destptr, dest, value));
22825 }
22826 else
22827 {
22828 dest = change_address (destmem, SImode, destptr);
22829 emit_insn (gen_strset (destptr, dest, value));
22830 emit_insn (gen_strset (destptr, dest, value));
22831 }
22832 emit_label (label);
22833 LABEL_NUSES (label) = 1;
22834 }
22835 if (max_size > 4)
22836 {
22837 rtx label = ix86_expand_aligntest (count, 4, true);
22838 dest = change_address (destmem, SImode, destptr);
22839 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22840 emit_label (label);
22841 LABEL_NUSES (label) = 1;
22842 }
22843 if (max_size > 2)
22844 {
22845 rtx label = ix86_expand_aligntest (count, 2, true);
22846 dest = change_address (destmem, HImode, destptr);
22847 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22848 emit_label (label);
22849 LABEL_NUSES (label) = 1;
22850 }
22851 if (max_size > 1)
22852 {
22853 rtx label = ix86_expand_aligntest (count, 1, true);
22854 dest = change_address (destmem, QImode, destptr);
22855 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22856 emit_label (label);
22857 LABEL_NUSES (label) = 1;
22858 }
22859 }
22860
22861 /* Depending on ISSETMEM, either copy enough bytes from SRCMEM to DESTMEM, or
22862 store enough bytes into DESTMEM, to align it to DESIRED_ALIGNMENT. The
22863 original alignment is ALIGN. Depending on ISSETMEM, either arguments
22864 SRCMEM/SRCPTR or VALUE/VEC_VALUE are ignored.
22865 The return value is the updated DESTMEM. */
22866 static rtx
22867 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
22868 rtx destptr, rtx srcptr, rtx value,
22869 rtx vec_value, rtx count, int align,
22870 int desired_alignment, bool issetmem)
22871 {
22872 int i;
22873 for (i = 1; i < desired_alignment; i <<= 1)
22874 {
22875 if (align <= i)
22876 {
22877 rtx label = ix86_expand_aligntest (destptr, i, false);
22878 if (issetmem)
22879 {
22880 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
22881 destmem = emit_memset (destmem, destptr, vec_value, i);
22882 else
22883 destmem = emit_memset (destmem, destptr, value, i);
22884 }
22885 else
22886 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22887 ix86_adjust_counter (count, i);
22888 emit_label (label);
22889 LABEL_NUSES (label) = 1;
22890 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
22891 }
22892 }
22893 return destmem;
22894 }
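/* Illustrative sketch (added for exposition): with ALIGN == 1 and
   DESIRED_ALIGNMENT == 8 the loop above emits three guarded steps, each of
   the form

       if (destptr & N)       .. N = 1, 2, 4 in turn
         copy (or store) N bytes and decrease COUNT by N;

   after which DESTMEM is known to be 8-byte aligned.  */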
22895
22896 /* Test if COUNT&SIZE is nonzero and if so, expand a movmem
22897 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
22898 and jump to DONE_LABEL. */
22899 static void
22900 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
22901 rtx destptr, rtx srcptr,
22902 rtx value, rtx vec_value,
22903 rtx count, int size,
22904 rtx done_label, bool issetmem)
22905 {
22906 rtx label = ix86_expand_aligntest (count, size, false);
22907 enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
22908 rtx modesize;
22909 int n;
22910
22911 /* If we do not have a vector value to copy, we must reduce the size. */
22912 if (issetmem)
22913 {
22914 if (!vec_value)
22915 {
22916 if (GET_MODE (value) == VOIDmode && size > 8)
22917 mode = Pmode;
22918 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
22919 mode = GET_MODE (value);
22920 }
22921 else
22922 mode = GET_MODE (vec_value), value = vec_value;
22923 }
22924 else
22925 {
22926 /* Choose appropriate vector mode. */
22927 if (size >= 32)
22928 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
22929 else if (size >= 16)
22930 mode = TARGET_SSE ? V16QImode : DImode;
22931 srcmem = change_address (srcmem, mode, srcptr);
22932 }
22933 destmem = change_address (destmem, mode, destptr);
22934 modesize = GEN_INT (GET_MODE_SIZE (mode));
22935 gcc_assert (GET_MODE_SIZE (mode) <= size);
22936 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
22937 {
22938 if (issetmem)
22939 emit_move_insn (destmem, gen_lowpart (mode, value));
22940 else
22941 {
22942 emit_move_insn (destmem, srcmem);
22943 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
22944 }
22945 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
22946 }
22947
22948 destmem = offset_address (destmem, count, 1);
22949 destmem = offset_address (destmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
22950 GET_MODE_SIZE (mode));
22951 if (issetmem)
22952 emit_move_insn (destmem, gen_lowpart (mode, value));
22953 else
22954 {
22955 srcmem = offset_address (srcmem, count, 1);
22956 srcmem = offset_address (srcmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
22957 GET_MODE_SIZE (mode));
22958 emit_move_insn (destmem, srcmem);
22959 }
22960 emit_jump_insn (gen_jump (done_label));
22961 emit_barrier ();
22962
22963 emit_label (label);
22964 LABEL_NUSES (label) = 1;
22965 }
22966
22967 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2),
22968 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
22969 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that we can
22970 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
22971 DONE_LABEL is a label after the whole copying sequence. The label is created
22972 on demand if *DONE_LABEL is NULL.
22973 MIN_SIZE is the minimal size of the copied block. This value gets adjusted for the
22974 new bounds after the initial copies.
22975
22976 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
22977 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
22978 we will dispatch to a library call for large blocks.
22979
22980 In pseudocode we do:
22981
22982 if (COUNT < SIZE)
22983 {
22984 Assume that SIZE is 4. Bigger sizes are handled analogously
22985 if (COUNT & 4)
22986 {
22987 copy 4 bytes from SRCPTR to DESTPTR
22988 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
22989 goto done_label
22990 }
22991 if (!COUNT)
22992 goto done_label;
22993 copy 1 byte from SRCPTR to DESTPTR
22994 if (COUNT & 2)
22995 {
22996 copy 2 bytes from SRCPTR to DESTPTR
22997 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
22998 }
22999 }
23000 else
23001 {
23002 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
23003 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
23004
23005 OLD_DESTPTR = DESTPTR;
23006 Align DESTPTR up to DESIRED_ALIGN
23007 SRCPTR += DESTPTR - OLD_DESTPTR
23008 COUNT -= DESTPTR - OLD_DESTPTR
23009 if (DYNAMIC_CHECK)
23010 Round COUNT down to multiple of SIZE
23011 << optional caller supplied zero size guard is here >>
23012 << optional caller supplied dynamic check is here >>
23013 << caller supplied main copy loop is here >>
23014 }
23015 done_label:
23016 */
23017 static void
23018 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
23019 rtx *destptr, rtx *srcptr,
23020 enum machine_mode mode,
23021 rtx value, rtx vec_value,
23022 rtx *count,
23023 rtx *done_label,
23024 int size,
23025 int desired_align,
23026 int align,
23027 unsigned HOST_WIDE_INT *min_size,
23028 bool dynamic_check,
23029 bool issetmem)
23030 {
23031 rtx loop_label = NULL, label;
23032 int n;
23033 rtx modesize;
23034 int prolog_size = 0;
23035 rtx mode_value;
23036
23037 /* Choose the proper value to copy. */
23038 if (issetmem && VECTOR_MODE_P (mode))
23039 mode_value = vec_value;
23040 else
23041 mode_value = value;
23042 gcc_assert (GET_MODE_SIZE (mode) <= size);
23043
23044 /* See if block is big or small, handle small blocks. */
23045 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
23046 {
23047 int size2 = size;
23048 loop_label = gen_label_rtx ();
23049
23050 if (!*done_label)
23051 *done_label = gen_label_rtx ();
23052
23053 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
23054 1, loop_label);
23055 size2 >>= 1;
23056
23057 /* Handle sizes > 3. */
23058 for (;size2 > 2; size2 >>= 1)
23059 expand_small_movmem_or_setmem (destmem, srcmem,
23060 *destptr, *srcptr,
23061 value, vec_value,
23062 *count,
23063 size2, *done_label, issetmem);
23064 /* Nothing to copy? Jump to DONE_LABEL if so */
23065 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
23066 1, *done_label);
23067
23068 /* Do a byte copy. */
23069 destmem = change_address (destmem, QImode, *destptr);
23070 if (issetmem)
23071 emit_move_insn (destmem, gen_lowpart (QImode, value));
23072 else
23073 {
23074 srcmem = change_address (srcmem, QImode, *srcptr);
23075 emit_move_insn (destmem, srcmem);
23076 }
23077
23078 /* Handle sizes 2 and 3. */
23079 label = ix86_expand_aligntest (*count, 2, false);
23080 destmem = change_address (destmem, HImode, *destptr);
23081 destmem = offset_address (destmem, *count, 1);
23082 destmem = offset_address (destmem, GEN_INT (-2), 2);
23083 if (issetmem)
23084 emit_move_insn (destmem, gen_lowpart (HImode, value));
23085 else
23086 {
23087 srcmem = change_address (srcmem, HImode, *srcptr);
23088 srcmem = offset_address (srcmem, *count, 1);
23089 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
23090 emit_move_insn (destmem, srcmem);
23091 }
23092
23093 emit_label (label);
23094 LABEL_NUSES (label) = 1;
23095 emit_jump_insn (gen_jump (*done_label));
23096 emit_barrier ();
23097 }
23098 else
23099 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
23100 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
23101
23102 /* Start memcpy for COUNT >= SIZE. */
23103 if (loop_label)
23104 {
23105 emit_label (loop_label);
23106 LABEL_NUSES (loop_label) = 1;
23107 }
23108
23109 /* Copy first desired_align bytes. */
23110 if (!issetmem)
23111 srcmem = change_address (srcmem, mode, *srcptr);
23112 destmem = change_address (destmem, mode, *destptr);
23113 modesize = GEN_INT (GET_MODE_SIZE (mode));
23114 for (n = 0; prolog_size < desired_align - align; n++)
23115 {
23116 if (issetmem)
23117 emit_move_insn (destmem, mode_value);
23118 else
23119 {
23120 emit_move_insn (destmem, srcmem);
23121 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
23122 }
23123 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
23124 prolog_size += GET_MODE_SIZE (mode);
23125 }
23126
23127
23128 /* Copy last SIZE bytes. */
23129 destmem = offset_address (destmem, *count, 1);
23130 destmem = offset_address (destmem,
23131 GEN_INT (-size - prolog_size),
23132 1);
23133 if (issetmem)
23134 emit_move_insn (destmem, mode_value);
23135 else
23136 {
23137 srcmem = offset_address (srcmem, *count, 1);
23138 srcmem = offset_address (srcmem,
23139 GEN_INT (-size - prolog_size),
23140 1);
23141 emit_move_insn (destmem, srcmem);
23142 }
23143 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
23144 {
23145 destmem = offset_address (destmem, modesize, 1);
23146 if (issetmem)
23147 emit_move_insn (destmem, mode_value);
23148 else
23149 {
23150 srcmem = offset_address (srcmem, modesize, 1);
23151 emit_move_insn (destmem, srcmem);
23152 }
23153 }
23154
23155 /* Align destination. */
23156 if (desired_align > 1 && desired_align > align)
23157 {
23158 rtx saveddest = *destptr;
23159
23160 gcc_assert (desired_align <= size);
23161 /* Align destptr up, placing the result in a new register. */
23162 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
23163 GEN_INT (prolog_size),
23164 NULL_RTX, 1, OPTAB_DIRECT);
23165 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
23166 GEN_INT (-desired_align),
23167 *destptr, 1, OPTAB_DIRECT);
23168 /* See how many bytes we skipped. */
23169 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
23170 *destptr,
23171 saveddest, 1, OPTAB_DIRECT);
23172 /* Adjust srcptr and count. */
23173 if (!issetmem)
23174 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, saveddest,
23175 *srcptr, 1, OPTAB_DIRECT);
23176 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
23177 saveddest, *count, 1, OPTAB_DIRECT);
23178 /* We copied at most size + prolog_size. */
23179 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
23180 *min_size = (*min_size - size) & ~(unsigned HOST_WIDE_INT)(size - 1);
23181 else
23182 *min_size = 0;
23183
23184 /* Our loops always round down the block size, but for dispatch to a library
23185 call we need the precise value. */
23186 if (dynamic_check)
23187 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
23188 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
23189 }
23190 else
23191 {
23192 gcc_assert (prolog_size == 0);
23193 /* Decrease count, so we won't end up copying the last word twice. */
23194 if (!CONST_INT_P (*count))
23195 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
23196 constm1_rtx, *count, 1, OPTAB_DIRECT);
23197 else
23198 *count = GEN_INT ((UINTVAL (*count) - 1) & ~(unsigned HOST_WIDE_INT)(size - 1));
23199 if (*min_size)
23200 *min_size = (*min_size - 1) & ~(unsigned HOST_WIDE_INT)(size - 1);
23201 }
23202 }
23203
23204
23205 /* This function is like the previous one, except here we know how many bytes
23206 need to be copied. That allows us to update alignment not only of DST, which
23207 is returned, but also of SRC, which is passed as a pointer for that
23208 reason. */
23209 static rtx
23210 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
23211 rtx srcreg, rtx value, rtx vec_value,
23212 int desired_align, int align_bytes,
23213 bool issetmem)
23214 {
23215 rtx src = NULL;
23216 rtx orig_dst = dst;
23217 rtx orig_src = NULL;
23218 int piece_size = 1;
23219 int copied_bytes = 0;
23220
23221 if (!issetmem)
23222 {
23223 gcc_assert (srcp != NULL);
23224 src = *srcp;
23225 orig_src = src;
23226 }
23227
23228 for (piece_size = 1;
23229 piece_size <= desired_align && copied_bytes < align_bytes;
23230 piece_size <<= 1)
23231 {
23232 if (align_bytes & piece_size)
23233 {
23234 if (issetmem)
23235 {
23236 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
23237 dst = emit_memset (dst, destreg, vec_value, piece_size);
23238 else
23239 dst = emit_memset (dst, destreg, value, piece_size);
23240 }
23241 else
23242 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
23243 copied_bytes += piece_size;
23244 }
23245 }
23246 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
23247 set_mem_align (dst, desired_align * BITS_PER_UNIT);
23248 if (MEM_SIZE_KNOWN_P (orig_dst))
23249 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
23250
23251 if (!issetmem)
23252 {
23253 int src_align_bytes = get_mem_align_offset (src, desired_align
23254 * BITS_PER_UNIT);
23255 if (src_align_bytes >= 0)
23256 src_align_bytes = desired_align - src_align_bytes;
23257 if (src_align_bytes >= 0)
23258 {
23259 unsigned int src_align;
23260 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
23261 {
23262 if ((src_align_bytes & (src_align - 1))
23263 == (align_bytes & (src_align - 1)))
23264 break;
23265 }
23266 if (src_align > (unsigned int) desired_align)
23267 src_align = desired_align;
23268 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
23269 set_mem_align (src, src_align * BITS_PER_UNIT);
23270 }
23271 if (MEM_SIZE_KNOWN_P (orig_src))
23272 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
23273 *srcp = src;
23274 }
23275
23276 return dst;
23277 }
23278
23279 /* Return true if ALG can be used in current context.
23280 Assume we expand memset if MEMSET is true. */
23281 static bool
23282 alg_usable_p (enum stringop_alg alg, bool memset)
23283 {
23284 if (alg == no_stringop)
23285 return false;
23286 if (alg == vector_loop)
23287 return TARGET_SSE || TARGET_AVX;
23288 /* Algorithms using the rep prefix want at least edi and ecx;
23289 additionally, memset wants eax and memcpy wants esi. Don't
23290 consider such algorithms if the user has appropriated those
23291 registers for their own purposes. */
23292 if (alg == rep_prefix_1_byte
23293 || alg == rep_prefix_4_byte
23294 || alg == rep_prefix_8_byte)
23295 return !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
23296 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
23297 return true;
23298 }
23299
23300 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
23301 static enum stringop_alg
23302 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
23303 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
23304 bool memset, bool zero_memset, int *dynamic_check, bool *noalign)
23305 {
23306 const struct stringop_algs * algs;
23307 bool optimize_for_speed;
23308 int max = -1;
23309 const struct processor_costs *cost;
23310 int i;
23311 bool any_alg_usable_p = false;
23312
23313 *noalign = false;
23314 *dynamic_check = -1;
23315
23316 /* Even if the string operation call is cold, we still might spend a lot
23317 of time processing large blocks. */
23318 if (optimize_function_for_size_p (cfun)
23319 || (optimize_insn_for_size_p ()
23320 && (max_size < 256
23321 || (expected_size != -1 && expected_size < 256))))
23322 optimize_for_speed = false;
23323 else
23324 optimize_for_speed = true;
23325
23326 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
23327 if (memset)
23328 algs = &cost->memset[TARGET_64BIT != 0];
23329 else
23330 algs = &cost->memcpy[TARGET_64BIT != 0];
23331
23332 /* See maximal size for user defined algorithm. */
23333 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
23334 {
23335 enum stringop_alg candidate = algs->size[i].alg;
23336 bool usable = alg_usable_p (candidate, memset);
23337 any_alg_usable_p |= usable;
23338
23339 if (candidate != libcall && candidate && usable)
23340 max = algs->size[i].max;
23341 }
23342
23343 /* If the expected size is not known but the max size is small enough
23344 that the inline version is a win, set the expected size into
23345 the range. */
23346 if (max > 1 && (unsigned HOST_WIDE_INT)max >= max_size && expected_size == -1)
23347 expected_size = min_size / 2 + max_size / 2;
23348
23349 /* If the user specified the algorithm, honor it if possible. */
23350 if (ix86_stringop_alg != no_stringop
23351 && alg_usable_p (ix86_stringop_alg, memset))
23352 return ix86_stringop_alg;
23353 /* rep; movq or rep; movl is the smallest variant. */
23354 else if (!optimize_for_speed)
23355 {
23356 *noalign = true;
23357 if (!count || (count & 3) || (memset && !zero_memset))
23358 return alg_usable_p (rep_prefix_1_byte, memset)
23359 ? rep_prefix_1_byte : loop_1_byte;
23360 else
23361 return alg_usable_p (rep_prefix_4_byte, memset)
23362 ? rep_prefix_4_byte : loop;
23363 }
23364 /* Very tiny blocks are best handled via the loop; REP is expensive to
23365 set up. */
23366 else if (expected_size != -1 && expected_size < 4)
23367 return loop_1_byte;
23368 else if (expected_size != -1)
23369 {
23370 enum stringop_alg alg = libcall;
23371 bool alg_noalign = false;
23372 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
23373 {
23374 /* We get here if the algorithms that were not libcall-based
23375 were rep-prefix based and we are unable to use rep prefixes
23376 based on global register usage. Break out of the loop and
23377 use the heuristic below. */
23378 if (algs->size[i].max == 0)
23379 break;
23380 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
23381 {
23382 enum stringop_alg candidate = algs->size[i].alg;
23383
23384 if (candidate != libcall && alg_usable_p (candidate, memset))
23385 {
23386 alg = candidate;
23387 alg_noalign = algs->size[i].noalign;
23388 }
23389 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
23390 last non-libcall inline algorithm. */
23391 if (TARGET_INLINE_ALL_STRINGOPS)
23392 {
23393 /* When the current size is best handled by a libcall,
23394 but we are still forced to inline, run the heuristic below
23395 that will pick code for medium-sized blocks. */
23396 if (alg != libcall)
23397 {
23398 *noalign = alg_noalign;
23399 return alg;
23400 }
23401 break;
23402 }
23403 else if (alg_usable_p (candidate, memset))
23404 {
23405 *noalign = algs->size[i].noalign;
23406 return candidate;
23407 }
23408 }
23409 }
23410 }
23411 /* When asked to inline the call anyway, try to pick a meaningful choice.
23412 We look for the maximal size of block that is faster to copy by hand and
23413 take blocks of at most that size, guessing that the average size will
23414 be roughly half of the block.
23415
23416 If this turns out to be bad, we might simply specify the preferred
23417 choice in ix86_costs. */
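/* For example, when the cost tables give no usable bound (max == -1), the
   code below assumes 4096 and recurses with an expected size of 2048, i.e.
   roughly half of the largest block considered worth inlining.  */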
23418 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23419 && (algs->unknown_size == libcall
23420 || !alg_usable_p (algs->unknown_size, memset)))
23421 {
23422 enum stringop_alg alg;
23423
23424 /* If there aren't any usable algorithms, then recursing on
23425 smaller sizes isn't going to find anything. Just return the
23426 simple byte-at-a-time copy loop. */
23427 if (!any_alg_usable_p)
23428 {
23429 /* Pick something reasonable. */
23430 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23431 *dynamic_check = 128;
23432 return loop_1_byte;
23433 }
23434 if (max == -1)
23435 max = 4096;
23436 alg = decide_alg (count, max / 2, min_size, max_size, memset,
23437 zero_memset, dynamic_check, noalign);
23438 gcc_assert (*dynamic_check == -1);
23439 gcc_assert (alg != libcall);
23440 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23441 *dynamic_check = max;
23442 return alg;
23443 }
23444 return (alg_usable_p (algs->unknown_size, memset)
23445 ? algs->unknown_size : libcall);
23446 }
23447
23448 /* Decide on alignment. We know that the operand is already aligned to ALIGN
23449 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
23450 static int
23451 decide_alignment (int align,
23452 enum stringop_alg alg,
23453 int expected_size,
23454 enum machine_mode move_mode)
23455 {
23456 int desired_align = 0;
23457
23458 gcc_assert (alg != no_stringop);
23459
23460 if (alg == libcall)
23461 return 0;
23462 if (move_mode == VOIDmode)
23463 return 0;
23464
23465 desired_align = GET_MODE_SIZE (move_mode);
23466 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
23467 copying a whole cache line at once. */
23468 if (TARGET_PENTIUMPRO
23469 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
23470 desired_align = 8;
23471
23472 if (optimize_size)
23473 desired_align = 1;
23474 if (desired_align < align)
23475 desired_align = align;
23476 if (expected_size != -1 && expected_size < 4)
23477 desired_align = align;
23478
23479 return desired_align;
23480 }
23481
23482
23483 /* Helper function for memset. For a QImode value 0xXY produce
23484 0xXYXYXYXY of the width specified by MODE. This is essentially
23485 a * 0x01010101, but we can do slightly better than
23486 synth_mult by unwinding the sequence by hand on CPUs with
23487 a slow multiply. */
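/* For example, with MODE == SImode and VAL == 0x5A the constant path below
   computes 0x5A -> 0x5A5A -> 0x5A5A5A5A, and for DImode one more step gives
   0x5A5A5A5A5A5A5A5A.  The non-constant paths reach the same value either by
   multiplying by 0x01010101 (0x0101010101010101 for DImode) or by a short
   shift/or sequence.  */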
23488 static rtx
23489 promote_duplicated_reg (enum machine_mode mode, rtx val)
23490 {
23491 enum machine_mode valmode = GET_MODE (val);
23492 rtx tmp;
23493 int nops = mode == DImode ? 3 : 2;
23494
23495 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
23496 if (val == const0_rtx)
23497 return copy_to_mode_reg (mode, CONST0_RTX (mode));
23498 if (CONST_INT_P (val))
23499 {
23500 HOST_WIDE_INT v = INTVAL (val) & 255;
23501
23502 v |= v << 8;
23503 v |= v << 16;
23504 if (mode == DImode)
23505 v |= (v << 16) << 16;
23506 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
23507 }
23508
23509 if (valmode == VOIDmode)
23510 valmode = QImode;
23511 if (valmode != QImode)
23512 val = gen_lowpart (QImode, val);
23513 if (mode == QImode)
23514 return val;
23515 if (!TARGET_PARTIAL_REG_STALL)
23516 nops--;
23517 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
23518 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
23519 <= (ix86_cost->shift_const + ix86_cost->add) * nops
23520 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
23521 {
23522 rtx reg = convert_modes (mode, QImode, val, true);
23523 tmp = promote_duplicated_reg (mode, const1_rtx);
23524 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
23525 OPTAB_DIRECT);
23526 }
23527 else
23528 {
23529 rtx reg = convert_modes (mode, QImode, val, true);
23530
23531 if (!TARGET_PARTIAL_REG_STALL)
23532 if (mode == SImode)
23533 emit_insn (gen_movsi_insv_1 (reg, reg));
23534 else
23535 emit_insn (gen_movdi_insv_1 (reg, reg));
23536 else
23537 {
23538 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23539 NULL, 1, OPTAB_DIRECT);
23540 reg =
23541 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23542 }
23543 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23544 NULL, 1, OPTAB_DIRECT);
23545 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23546 if (mode == SImode)
23547 return reg;
23548 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23549 NULL, 1, OPTAB_DIRECT);
23550 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23551 return reg;
23552 }
23553 }
23554
23555 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
23556 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
23557 getting alignment from ALIGN to DESIRED_ALIGN. */
23558 static rtx
23559 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
23560 int align)
23561 {
23562 rtx promoted_val;
23563
23564 if (TARGET_64BIT
23565 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23566 promoted_val = promote_duplicated_reg (DImode, val);
23567 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23568 promoted_val = promote_duplicated_reg (SImode, val);
23569 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23570 promoted_val = promote_duplicated_reg (HImode, val);
23571 else
23572 promoted_val = val;
23573
23574 return promoted_val;
23575 }
23576
23577 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
23578 operations when profitable. The code depends upon architecture, block size
23579 and alignment, but always has one of the following overall structures:
23580
23581 Aligned move sequence:
23582
23583 1) Prologue guard: Conditional that jumps up to epilogues for small
23584 blocks that can be handled by epilogue alone. This is faster
23585 but also needed for correctness, since the prologue assumes the block
23586 is larger than the desired alignment.
23587
23588 Optional dynamic check for size and libcall for large
23589 blocks is emitted here too, with -minline-stringops-dynamically.
23590
23591 2) Prologue: copy first few bytes in order to get destination
23592 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
23593 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
23594 copied. We emit either a jump tree on power of two sized
23595 blocks, or a byte loop.
23596
23597 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
23598 with specified algorithm.
23599
23600 4) Epilogue: code copying tail of the block that is too small to be
23601 handled by main body (or up to size guarded by prologue guard).
23602
23603 Misaligned move sequence
23604
23605 1) Misaligned move prologue/epilogue containing:
23606 a) Prologue handling small memory blocks and jumping to done_label
23607 (skipped if blocks are known to be large enough)
23608 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
23609 needed, done by a single possibly misaligned move
23610 (skipped if alignment is not needed)
23611 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
23612
23613 2) Zero size guard dispatching to done_label, if needed
23614
23615 3) Dispatch to a library call, if needed.
23616
23617 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
23618 with the specified algorithm. */
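/* As an illustration only (one possible expansion, not the only one): a
   memcpy of a few kilobytes with unknown alignment might expand into a guard
   comparing the count against the epilogue size, a prologue aligning the
   destination, a rep-prefixed or unrolled main loop moving SIZE_NEEDED-byte
   chunks, and an epilogue handling the remaining tail bytes.  */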
23619 static bool
23620 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
23621 rtx align_exp, rtx expected_align_exp,
23622 rtx expected_size_exp, bool issetmem)
23623 {
23624 rtx destreg;
23625 rtx srcreg = NULL;
23626 rtx label = NULL;
23627 rtx tmp;
23628 rtx jump_around_label = NULL;
23629 HOST_WIDE_INT align = 1;
23630 unsigned HOST_WIDE_INT count = 0;
23631 HOST_WIDE_INT expected_size = -1;
23632 int size_needed = 0, epilogue_size_needed;
23633 int desired_align = 0, align_bytes = 0;
23634 enum stringop_alg alg;
23635 rtx promoted_val = NULL;
23636 rtx vec_promoted_val = NULL;
23637 bool force_loopy_epilogue = false;
23638 int dynamic_check;
23639 bool need_zero_guard = false;
23640 bool noalign;
23641 enum machine_mode move_mode = VOIDmode;
23642 int unroll_factor = 1;
23643 /* TODO: Once value ranges are available, fill in proper data. */
23644 unsigned HOST_WIDE_INT min_size = 0;
23645 unsigned HOST_WIDE_INT max_size = -1;
23646 bool misaligned_prologue_used = false;
23647
23648 if (CONST_INT_P (align_exp))
23649 align = INTVAL (align_exp);
23650 /* i386 can do misaligned access at a reasonably increased cost. */
23651 if (CONST_INT_P (expected_align_exp)
23652 && INTVAL (expected_align_exp) > align)
23653 align = INTVAL (expected_align_exp);
23654 /* ALIGN is the minimum of destination and source alignment, but we care here
23655 just about destination alignment. */
23656 else if (!issetmem
23657 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
23658 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
23659
23660 if (CONST_INT_P (count_exp))
23661 min_size = max_size = count = expected_size = INTVAL (count_exp);
23662 if (CONST_INT_P (expected_size_exp) && count == 0)
23663 expected_size = INTVAL (expected_size_exp);
23664
23665 /* Make sure we don't need to care about overflow later on. */
23666 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23667 return false;
23668
23669 /* Step 0: Decide on preferred algorithm, desired alignment and
23670 size of chunks to be copied by main loop. */
23671 alg = decide_alg (count, expected_size, min_size, max_size, issetmem,
23672 issetmem && val_exp == const0_rtx,
23673 &dynamic_check, &noalign);
23674 if (alg == libcall)
23675 return false;
23676 gcc_assert (alg != no_stringop);
23677
23678 /* For now the vector version of memset is generated only for memory zeroing, as
23679 creating the promoted vector value is very cheap in this case. */
23680 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
23681 alg = unrolled_loop;
23682
23683 if (!count)
23684 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
23685 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
23686 if (!issetmem)
23687 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
23688
23689 unroll_factor = 1;
23690 move_mode = word_mode;
23691 switch (alg)
23692 {
23693 case libcall:
23694 case no_stringop:
23695 case last_alg:
23696 gcc_unreachable ();
23697 case loop_1_byte:
23698 need_zero_guard = true;
23699 move_mode = QImode;
23700 break;
23701 case loop:
23702 need_zero_guard = true;
23703 break;
23704 case unrolled_loop:
23705 need_zero_guard = true;
23706 unroll_factor = (TARGET_64BIT ? 4 : 2);
23707 break;
23708 case vector_loop:
23709 need_zero_guard = true;
23710 unroll_factor = 4;
23711 /* Find the widest supported mode. */
23712 move_mode = word_mode;
23713 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
23714 != CODE_FOR_nothing)
23715 move_mode = GET_MODE_WIDER_MODE (move_mode);
23716
23717 /* Find the corresponding vector mode with the same size as MOVE_MODE.
23718 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
23719 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
23720 {
23721 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
23722 move_mode = mode_for_vector (word_mode, nunits);
23723 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
23724 move_mode = word_mode;
23725 }
23726 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
23727 break;
23728 case rep_prefix_8_byte:
23729 move_mode = DImode;
23730 break;
23731 case rep_prefix_4_byte:
23732 move_mode = SImode;
23733 break;
23734 case rep_prefix_1_byte:
23735 move_mode = QImode;
23736 break;
23737 }
23738 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
23739 epilogue_size_needed = size_needed;
23740
23741 desired_align = decide_alignment (align, alg, expected_size, move_mode);
23742 if (!TARGET_ALIGN_STRINGOPS || noalign)
23743 align = desired_align;
23744
23745 /* Step 1: Prologue guard. */
23746
23747 /* Alignment code needs count to be in register. */
23748 if (CONST_INT_P (count_exp) && desired_align > align)
23749 {
23750 if (INTVAL (count_exp) > desired_align
23751 && INTVAL (count_exp) > size_needed)
23752 {
23753 align_bytes
23754 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23755 if (align_bytes <= 0)
23756 align_bytes = 0;
23757 else
23758 align_bytes = desired_align - align_bytes;
23759 }
23760 if (align_bytes == 0)
23761 count_exp = force_reg (counter_mode (count_exp), count_exp);
23762 }
23763 gcc_assert (desired_align >= 1 && align >= 1);
23764
23765 /* Misaligned move sequences handle both prologues and epilogues at once.
23766 Default code generation results in smaller code for large alignments and
23767 also avoids redundant work when sizes are known precisely. */
23768 misaligned_prologue_used = (TARGET_MISALIGNED_MOVE_STRING_PROLOGUES
23769 && MAX (desired_align, epilogue_size_needed) <= 32
23770 && ((desired_align > align && !align_bytes)
23771 || (!count && epilogue_size_needed > 1)));
23772
23773 /* Do the cheap promotion to allow better CSE across the
23774 main loop and epilogue (i.e. one load of the big constant in
23775 front of all code).
23776 For now the misaligned move sequences do not have a fast path
23777 without broadcasting. */
23778 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
23779 {
23780 if (alg == vector_loop)
23781 {
23782 gcc_assert (val_exp == const0_rtx);
23783 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
23784 promoted_val = promote_duplicated_reg_to_size (val_exp,
23785 GET_MODE_SIZE (word_mode),
23786 desired_align, align);
23787 }
23788 else
23789 {
23790 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23791 desired_align, align);
23792 }
23793 }
23794 /* Misaligned move sequences handle both prologues and epilogues at once.
23795 Default code generation results in smaller code for large alignments and
23796 also avoids redundant work when sizes are known precisely. */
23797 if (misaligned_prologue_used)
23798 {
23799 /* The misaligned move prologue handles small blocks by itself. */
23800 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
23801 (dst, src, &destreg, &srcreg,
23802 move_mode, promoted_val, vec_promoted_val,
23803 &count_exp,
23804 &jump_around_label,
23805 desired_align < align
23806 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
23807 desired_align, align, &min_size, dynamic_check, issetmem);
23808 if (!issetmem)
23809 src = change_address (src, BLKmode, srcreg);
23810 dst = change_address (dst, BLKmode, destreg);
23811 set_mem_align (dst, desired_align * BITS_PER_UNIT);
23812 epilogue_size_needed = 0;
23813 if (need_zero_guard && !min_size)
23814 {
23815 /* It is possible that we copied enough so the main loop will not
23816 execute. */
23817 gcc_assert (size_needed > 1);
23818 if (jump_around_label == NULL_RTX)
23819 jump_around_label = gen_label_rtx ();
23820 emit_cmp_and_jump_insns (count_exp,
23821 GEN_INT (size_needed),
23822 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
23823 if (expected_size == -1
23824 || expected_size < (desired_align - align) / 2 + size_needed)
23825 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23826 else
23827 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23828 }
23829 }
23830 /* Ensure that alignment prologue won't copy past end of block. */
23831 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23832 {
23833 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23834 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
23835 Make sure it is power of 2. */
23836 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
23837
23838 /* To improve performance of small blocks, we jump around the VAL
23839 promoting code. This means that if the promoted VAL is not constant,
23840 we might not use it in the epilogue and have to use the byte
23841 loop variant. */
23842 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
23843 force_loopy_epilogue = true;
23844 if (count)
23845 {
23846 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23847 {
23848 /* If main algorithm works on QImode, no epilogue is needed.
23849 For small sizes just don't align anything. */
23850 if (size_needed == 1)
23851 desired_align = align;
23852 else
23853 goto epilogue;
23854 }
23855 }
23856 else if (min_size < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23857 {
23858 gcc_assert (max_size >= (unsigned HOST_WIDE_INT)epilogue_size_needed);
23859 label = gen_label_rtx ();
23860 emit_cmp_and_jump_insns (count_exp,
23861 GEN_INT (epilogue_size_needed),
23862 LTU, 0, counter_mode (count_exp), 1, label);
23863 if (expected_size == -1 || expected_size < epilogue_size_needed)
23864 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23865 else
23866 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23867 }
23868 }
23869
23870 /* Emit code to decide at runtime whether a library call or inline code should
23871 be used. */
23872 if (dynamic_check != -1)
23873 {
23874 if (!issetmem && CONST_INT_P (count_exp))
23875 {
23876 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
23877 {
23878 emit_block_move_via_libcall (dst, src, count_exp, false);
23879 count_exp = const0_rtx;
23880 goto epilogue;
23881 }
23882 }
23883 else
23884 {
23885 rtx hot_label = gen_label_rtx ();
23886 jump_around_label = gen_label_rtx ();
23887 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23888 LEU, 0, GET_MODE (count_exp), 1, hot_label);
23889 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23890 if (issetmem)
23891 set_storage_via_libcall (dst, count_exp, val_exp, false);
23892 else
23893 emit_block_move_via_libcall (dst, src, count_exp, false);
23894 emit_jump (jump_around_label);
23895 emit_label (hot_label);
23896 }
23897 }
23898
23899 /* Step 2: Alignment prologue. */
23900 /* Do the expensive promotion once we branched off the small blocks. */
23901 if (issetmem && !promoted_val)
23902 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23903 desired_align, align);
23904
23905 if (desired_align > align && !misaligned_prologue_used)
23906 {
23907 if (align_bytes == 0)
23908 {
23909 /* Except for the first move in the prologue, we no longer know
23910 the constant offset in aliasing info. It doesn't seem worth
23911 the pain to maintain it for the first move, so throw away
23912 the info early. */
23913 dst = change_address (dst, BLKmode, destreg);
23914 if (!issetmem)
23915 src = change_address (src, BLKmode, srcreg);
23916 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
23917 promoted_val, vec_promoted_val,
23918 count_exp, align, desired_align,
23919 issetmem);
23920 /* At most desired_align - align bytes are copied. */
23921 if (min_size < (unsigned)(desired_align - align))
23922 min_size = 0;
23923 else
23924 min_size -= desired_align - align;
23925 }
23926 else
23927 {
23928 /* If we know how many bytes need to be stored before dst is
23929 sufficiently aligned, maintain aliasing info accurately. */
23930 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
23931 srcreg,
23932 promoted_val,
23933 vec_promoted_val,
23934 desired_align,
23935 align_bytes,
23936 issetmem);
23937
23938 count_exp = plus_constant (counter_mode (count_exp),
23939 count_exp, -align_bytes);
23940 count -= align_bytes;
23941 min_size -= align_bytes;
23942 max_size -= align_bytes;
23943 }
23944 if (need_zero_guard
23945 && !min_size
23946 && (count < (unsigned HOST_WIDE_INT) size_needed
23947 || (align_bytes == 0
23948 && count < ((unsigned HOST_WIDE_INT) size_needed
23949 + desired_align - align))))
23950 {
23951 /* It is possible that we copied enough so the main loop will not
23952 execute. */
23953 gcc_assert (size_needed > 1);
23954 if (label == NULL_RTX)
23955 label = gen_label_rtx ();
23956 emit_cmp_and_jump_insns (count_exp,
23957 GEN_INT (size_needed),
23958 LTU, 0, counter_mode (count_exp), 1, label);
23959 if (expected_size == -1
23960 || expected_size < (desired_align - align) / 2 + size_needed)
23961 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23962 else
23963 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23964 }
23965 }
23966 if (label && size_needed == 1)
23967 {
23968 emit_label (label);
23969 LABEL_NUSES (label) = 1;
23970 label = NULL;
23971 epilogue_size_needed = 1;
23972 if (issetmem)
23973 promoted_val = val_exp;
23974 }
23975 else if (label == NULL_RTX && !misaligned_prologue_used)
23976 epilogue_size_needed = size_needed;
23977
23978 /* Step 3: Main loop. */
23979
23980 switch (alg)
23981 {
23982 case libcall:
23983 case no_stringop:
23984 case last_alg:
23985 gcc_unreachable ();
23986 case loop_1_byte:
23987 case loop:
23988 case unrolled_loop:
23989 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
23990 count_exp, move_mode, unroll_factor,
23991 expected_size, issetmem);
23992 break;
23993 case vector_loop:
23994 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
23995 vec_promoted_val, count_exp, move_mode,
23996 unroll_factor, expected_size, issetmem);
23997 break;
23998 case rep_prefix_8_byte:
23999 case rep_prefix_4_byte:
24000 case rep_prefix_1_byte:
24001 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
24002 val_exp, count_exp, move_mode, issetmem);
24003 break;
24004 }
24005 /* Adjust properly the offset of src and dest memory for aliasing. */
24006 if (CONST_INT_P (count_exp))
24007 {
24008 if (!issetmem)
24009 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
24010 (count / size_needed) * size_needed);
24011 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
24012 (count / size_needed) * size_needed);
24013 }
24014 else
24015 {
24016 if (!issetmem)
24017 src = change_address (src, BLKmode, srcreg);
24018 dst = change_address (dst, BLKmode, destreg);
24019 }
24020
24021 /* Step 4: Epilogue to copy the remaining bytes. */
24022 epilogue:
24023 if (label)
24024 {
24025 /* When the main loop is done, COUNT_EXP might hold original count,
24026 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
24027 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
24028 bytes. Compensate if needed. */
24029
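/* For example, with SIZE_NEEDED == 8 the mask below is 7, so COUNT_EXP is
   reduced to the 0..7 trailing bytes left for the epilogue.  */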
24030 if (size_needed < epilogue_size_needed)
24031 {
24032 tmp =
24033 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
24034 GEN_INT (size_needed - 1), count_exp, 1,
24035 OPTAB_DIRECT);
24036 if (tmp != count_exp)
24037 emit_move_insn (count_exp, tmp);
24038 }
24039 emit_label (label);
24040 LABEL_NUSES (label) = 1;
24041 }
24042
24043 if (count_exp != const0_rtx && epilogue_size_needed > 1)
24044 {
24045 if (force_loopy_epilogue)
24046 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
24047 epilogue_size_needed);
24048 else
24049 {
24050 if (issetmem)
24051 expand_setmem_epilogue (dst, destreg, promoted_val,
24052 vec_promoted_val, count_exp,
24053 epilogue_size_needed);
24054 else
24055 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
24056 epilogue_size_needed);
24057 }
24058 }
24059 if (jump_around_label)
24060 emit_label (jump_around_label);
24061 return true;
24062 }
24063
24064 /* Wrapper for ix86_expand_set_or_movmem for memcpy case. */
24065 bool
24066 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
24067 rtx expected_align_exp, rtx expected_size_exp)
24068 {
24069 return ix86_expand_set_or_movmem (dst, src, count_exp, NULL, align_exp,
24070 expected_align_exp, expected_size_exp, false);
24071 }
24072
24073 /* Wrapper for ix86_expand_set_or_movmem for memset case. */
24074 bool
24075 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
24076 rtx expected_align_exp, rtx expected_size_exp)
24077 {
24078 return ix86_expand_set_or_movmem (dst, NULL, count_exp, val_exp, align_exp,
24079 expected_align_exp, expected_size_exp, true);
24080 }
24081
24082
24083 /* Expand the appropriate insns for doing strlen if not just doing
24084 repnz; scasb
24085
24086 out = result, initialized with the start address
24087 align_rtx = alignment of the address.
24088 scratch = scratch register, initialized with the start address when
24089 not aligned, otherwise undefined
24090
24091 This is just the body. It needs the initializations mentioned above and
24092 some address computing at the end. These things are done in i386.md. */
24093
24094 static void
24095 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
24096 {
24097 int align;
24098 rtx tmp;
24099 rtx align_2_label = NULL_RTX;
24100 rtx align_3_label = NULL_RTX;
24101 rtx align_4_label = gen_label_rtx ();
24102 rtx end_0_label = gen_label_rtx ();
24103 rtx mem;
24104 rtx tmpreg = gen_reg_rtx (SImode);
24105 rtx scratch = gen_reg_rtx (SImode);
24106 rtx cmp;
24107
24108 align = 0;
24109 if (CONST_INT_P (align_rtx))
24110 align = INTVAL (align_rtx);
24111
24112 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
24113
24114 /* Is there a known alignment and is it less than 4? */
24115 if (align < 4)
24116 {
24117 rtx scratch1 = gen_reg_rtx (Pmode);
24118 emit_move_insn (scratch1, out);
24119 /* Is there a known alignment and is it not 2? */
24120 if (align != 2)
24121 {
24122 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
24123 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
24124
24125 /* Leave just the 3 lower bits. */
24126 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
24127 NULL_RTX, 0, OPTAB_WIDEN);
24128
24129 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
24130 Pmode, 1, align_4_label);
24131 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
24132 Pmode, 1, align_2_label);
24133 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
24134 Pmode, 1, align_3_label);
24135 }
24136 else
24137 {
24138 /* Since the alignment is 2, we have to check 2 or 0 bytes;
24139 check whether it is aligned to 4 bytes. */
24140
24141 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
24142 NULL_RTX, 0, OPTAB_WIDEN);
24143
24144 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
24145 Pmode, 1, align_4_label);
24146 }
24147
24148 mem = change_address (src, QImode, out);
24149
24150 /* Now compare the bytes. */
24151
24152 /* Compare the first n unaligned bytes on a byte-per-byte basis. */
24153 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
24154 QImode, 1, end_0_label);
24155
24156 /* Increment the address. */
24157 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24158
24159 /* Not needed with an alignment of 2 */
24160 if (align != 2)
24161 {
24162 emit_label (align_2_label);
24163
24164 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
24165 end_0_label);
24166
24167 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24168
24169 emit_label (align_3_label);
24170 }
24171
24172 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
24173 end_0_label);
24174
24175 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24176 }
24177
24178 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
24179 align this loop: it only makes the program larger and does not
24180 speed it up. */
24181 emit_label (align_4_label);
24182
24183 mem = change_address (src, SImode, out);
24184 emit_move_insn (scratch, mem);
24185 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
24186
24187 /* This formula yields a nonzero result iff one of the bytes is zero.
24188 This saves three branches inside the loop and many cycles. */
24189
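/* E.g. for scratch == 0x11223344 (no zero byte):
     (0x11223344 - 0x01010101) & ~0x11223344 & 0x80808080 == 0,
   while for scratch == 0x11002233 (a zero byte in it):
     (0x11002233 - 0x01010101) & ~0x11002233 & 0x80808080 == 0x00800000.  */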
24190 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
24191 emit_insn (gen_one_cmplsi2 (scratch, scratch));
24192 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
24193 emit_insn (gen_andsi3 (tmpreg, tmpreg,
24194 gen_int_mode (0x80808080, SImode)));
24195 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
24196 align_4_label);
24197
24198 if (TARGET_CMOVE)
24199 {
24200 rtx reg = gen_reg_rtx (SImode);
24201 rtx reg2 = gen_reg_rtx (Pmode);
24202 emit_move_insn (reg, tmpreg);
24203 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
24204
24205 /* If zero is not in the first two bytes, move two bytes forward. */
24206 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
24207 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24208 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
24209 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
24210 gen_rtx_IF_THEN_ELSE (SImode, tmp,
24211 reg,
24212 tmpreg)));
24213 /* Emit lea manually to avoid clobbering of flags. */
24214 emit_insn (gen_rtx_SET (SImode, reg2,
24215 gen_rtx_PLUS (Pmode, out, const2_rtx)));
24216
24217 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24218 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
24219 emit_insn (gen_rtx_SET (VOIDmode, out,
24220 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
24221 reg2,
24222 out)));
24223 }
24224 else
24225 {
24226 rtx end_2_label = gen_label_rtx ();
24227 /* Is zero in the first two bytes? */
24228
24229 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
24230 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24231 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
24232 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
24233 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
24234 pc_rtx);
24235 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
24236 JUMP_LABEL (tmp) = end_2_label;
24237
24238 /* Not in the first two. Move two bytes forward. */
24239 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
24240 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
24241
24242 emit_label (end_2_label);
24243
24244 }
24245
24246 /* Avoid branch in fixing the byte. */
24247 tmpreg = gen_lowpart (QImode, tmpreg);
24248 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
24249 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
24250 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
24251 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
24252
24253 emit_label (end_0_label);
24254 }
24255
24256 /* Expand strlen. */
24257
24258 bool
24259 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
24260 {
24261 rtx addr, scratch1, scratch2, scratch3, scratch4;
24262
24263 /* The generic case of the strlen expander is long. Avoid its
24264 expansion unless TARGET_INLINE_ALL_STRINGOPS. */
24265
24266 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
24267 && !TARGET_INLINE_ALL_STRINGOPS
24268 && !optimize_insn_for_size_p ()
24269 && (!CONST_INT_P (align) || INTVAL (align) < 4))
24270 return false;
24271
24272 addr = force_reg (Pmode, XEXP (src, 0));
24273 scratch1 = gen_reg_rtx (Pmode);
24274
24275 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
24276 && !optimize_insn_for_size_p ())
24277 {
24278 /* Well it seems that some optimizer does not combine a call like
24279 foo(strlen(bar), strlen(bar));
24280 when the move and the subtraction are done here. It does calculate
24281 the length just once when these instructions are done inside of
24282 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
24283 often used and I use one fewer register for the lifetime of
24284 output_strlen_unroll() this is better. */
24285
24286 emit_move_insn (out, addr);
24287
24288 ix86_expand_strlensi_unroll_1 (out, src, align);
24289
24290 /* strlensi_unroll_1 returns the address of the zero at the end of
24291 the string, like memchr(), so compute the length by subtracting
24292 the start address. */
24293 emit_insn (ix86_gen_sub3 (out, out, addr));
24294 }
24295 else
24296 {
24297 rtx unspec;
24298
24299 /* Can't use this if the user has appropriated eax, ecx, or edi. */
24300 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
24301 return false;
24302
24303 scratch2 = gen_reg_rtx (Pmode);
24304 scratch3 = gen_reg_rtx (Pmode);
24305 scratch4 = force_reg (Pmode, constm1_rtx);
24306
24307 emit_move_insn (scratch3, addr);
24308 eoschar = force_reg (QImode, eoschar);
24309
24310 src = replace_equiv_address_nv (src, scratch3);
24311
24312 /* If .md starts supporting :P, this can be done in .md. */
24313 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
24314 scratch4), UNSPEC_SCAS);
24315 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
24316 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
24317 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
24318 }
24319 return true;
24320 }
24321
24322 /* For a given symbol (function), construct code to compute the address of its
24323 PLT entry in the large x86-64 PIC model. */
24324 static rtx
24325 construct_plt_address (rtx symbol)
24326 {
24327 rtx tmp, unspec;
24328
24329 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
24330 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
24331 gcc_assert (Pmode == DImode);
24332
24333 tmp = gen_reg_rtx (Pmode);
24334 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
24335
24336 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
24337 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
24338 return tmp;
24339 }
24340
24341 rtx
24342 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
24343 rtx callarg2,
24344 rtx pop, bool sibcall)
24345 {
24346 unsigned int const cregs_size
24347 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
24348 rtx vec[3 + cregs_size];
24349 rtx use = NULL, call;
24350 unsigned int vec_len = 0;
24351
24352 if (pop == const0_rtx)
24353 pop = NULL;
24354 gcc_assert (!TARGET_64BIT || !pop);
24355
24356 if (TARGET_MACHO && !TARGET_64BIT)
24357 {
24358 #if TARGET_MACHO
24359 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
24360 fnaddr = machopic_indirect_call_target (fnaddr);
24361 #endif
24362 }
24363 else
24364 {
24365 /* Static functions and indirect calls don't need the pic register. */
24366 if (flag_pic
24367 && (!TARGET_64BIT
24368 || (ix86_cmodel == CM_LARGE_PIC
24369 && DEFAULT_ABI != MS_ABI))
24370 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24371 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
24372 use_reg (&use, pic_offset_table_rtx);
24373 }
24374
24375 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
24376 {
24377 rtx al = gen_rtx_REG (QImode, AX_REG);
24378 emit_move_insn (al, callarg2);
24379 use_reg (&use, al);
24380 }
24381
24382 if (ix86_cmodel == CM_LARGE_PIC
24383 && !TARGET_PECOFF
24384 && MEM_P (fnaddr)
24385 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24386 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
24387 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
24388 else if (sibcall
24389 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
24390 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
24391 {
24392 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
24393 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
24394 }
24395
24396 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
24397 if (retval)
24398 call = gen_rtx_SET (VOIDmode, retval, call);
24399 vec[vec_len++] = call;
24400
24401 if (pop)
24402 {
24403 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
24404 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
24405 vec[vec_len++] = pop;
24406 }
24407
24408 if (TARGET_64BIT_MS_ABI
24409 && (!callarg2 || INTVAL (callarg2) != -2))
24410 {
24411 unsigned i;
24412
24413 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
24414 UNSPEC_MS_TO_SYSV_CALL);
24415
24416 for (i = 0; i < cregs_size; i++)
24417 {
24418 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
24419 enum machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
24420
24421 vec[vec_len++]
24422 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (mode, regno));
24423 }
24424 }
24425
24426 if (vec_len > 1)
24427 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
24428 call = emit_call_insn (call);
24429 if (use)
24430 CALL_INSN_FUNCTION_USAGE (call) = use;
24431
24432 return call;
24433 }
24434
24435 /* Output the assembly for a call instruction. */
24436
24437 const char *
24438 ix86_output_call_insn (rtx insn, rtx call_op)
24439 {
24440 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
24441 bool seh_nop_p = false;
24442 const char *xasm;
24443
24444 if (SIBLING_CALL_P (insn))
24445 {
24446 if (direct_p)
24447 xasm = "%!jmp\t%P0";
24448 /* SEH epilogue detection requires the indirect branch case
24449 to include REX.W. */
24450 else if (TARGET_SEH)
24451 xasm = "%!rex.W jmp %A0";
24452 else
24453 xasm = "%!jmp\t%A0";
24454
24455 output_asm_insn (xasm, &call_op);
24456 return "";
24457 }
24458
24459 /* SEH unwinding can require an extra nop to be emitted in several
24460 circumstances. Determine if we have one of those. */
24461 if (TARGET_SEH)
24462 {
24463 rtx i;
24464
24465 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
24466 {
24467 /* If we get to another real insn, we don't need the nop. */
24468 if (INSN_P (i))
24469 break;
24470
24471 /* If we get to the epilogue note, prevent a catch region from
24472 being adjacent to the standard epilogue sequence. If non-
24473 call-exceptions, we'll have done this during epilogue emission. */
24474 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
24475 && !flag_non_call_exceptions
24476 && !can_throw_internal (insn))
24477 {
24478 seh_nop_p = true;
24479 break;
24480 }
24481 }
24482
24483 /* If we didn't find a real insn following the call, prevent the
24484 unwinder from looking into the next function. */
24485 if (i == NULL)
24486 seh_nop_p = true;
24487 }
24488
24489 if (direct_p)
24490 xasm = "%!call\t%P0";
24491 else
24492 xasm = "%!call\t%A0";
24493
24494 output_asm_insn (xasm, &call_op);
24495
24496 if (seh_nop_p)
24497 return "nop";
24498
24499 return "";
24500 }
24501 \f
24502 /* Clear stack slot assignments remembered from previous functions.
24503 This is called from INIT_EXPANDERS once before RTL is emitted for each
24504 function. */
24505
24506 static struct machine_function *
24507 ix86_init_machine_status (void)
24508 {
24509 struct machine_function *f;
24510
24511 f = ggc_alloc_cleared_machine_function ();
24512 f->use_fast_prologue_epilogue_nregs = -1;
24513 f->call_abi = ix86_abi;
24514
24515 return f;
24516 }
24517
24518 /* Return a MEM corresponding to a stack slot with mode MODE.
24519 Allocate a new slot if necessary.
24520
24521 The RTL for a function can have several slots available: N is
24522 which slot to use. */
24523
24524 rtx
24525 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
24526 {
24527 struct stack_local_entry *s;
24528
24529 gcc_assert (n < MAX_386_STACK_LOCALS);
24530
24531 for (s = ix86_stack_locals; s; s = s->next)
24532 if (s->mode == mode && s->n == n)
24533 return validize_mem (copy_rtx (s->rtl));
24534
24535 s = ggc_alloc_stack_local_entry ();
24536 s->n = n;
24537 s->mode = mode;
24538 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
24539
24540 s->next = ix86_stack_locals;
24541 ix86_stack_locals = s;
24542 return validize_mem (s->rtl);
24543 }
24544
24545 static void
24546 ix86_instantiate_decls (void)
24547 {
24548 struct stack_local_entry *s;
24549
24550 for (s = ix86_stack_locals; s; s = s->next)
24551 if (s->rtl != NULL_RTX)
24552 instantiate_decl_rtl (s->rtl);
24553 }
24554 \f
24555 /* Check whether x86 address PARTS is a pc-relative address. */
24556
24557 static bool
24558 rip_relative_addr_p (struct ix86_address *parts)
24559 {
24560 rtx base, index, disp;
24561
24562 base = parts->base;
24563 index = parts->index;
24564 disp = parts->disp;
24565
24566 if (disp && !base && !index)
24567 {
24568 if (TARGET_64BIT)
24569 {
24570 rtx symbol = disp;
24571
24572 if (GET_CODE (disp) == CONST)
24573 symbol = XEXP (disp, 0);
24574 if (GET_CODE (symbol) == PLUS
24575 && CONST_INT_P (XEXP (symbol, 1)))
24576 symbol = XEXP (symbol, 0);
24577
24578 if (GET_CODE (symbol) == LABEL_REF
24579 || (GET_CODE (symbol) == SYMBOL_REF
24580 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
24581 || (GET_CODE (symbol) == UNSPEC
24582 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
24583 || XINT (symbol, 1) == UNSPEC_PCREL
24584 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
24585 return true;
24586 }
24587 }
24588 return false;
24589 }
24590
24591 /* Calculate the length of the memory address in the instruction encoding.
24592 Includes addr32 prefix, does not include the one-byte modrm, opcode,
24593 or other prefixes. We never generate addr32 prefix for LEA insn. */
24594
24595 int
24596 memory_address_length (rtx addr, bool lea)
24597 {
24598 struct ix86_address parts;
24599 rtx base, index, disp;
24600 int len;
24601 int ok;
24602
24603 if (GET_CODE (addr) == PRE_DEC
24604 || GET_CODE (addr) == POST_INC
24605 || GET_CODE (addr) == PRE_MODIFY
24606 || GET_CODE (addr) == POST_MODIFY)
24607 return 0;
24608
24609 ok = ix86_decompose_address (addr, &parts);
24610 gcc_assert (ok);
24611
24612 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
24613
24614 /* If this is not LEA instruction, add the length of addr32 prefix. */
24615 if (TARGET_64BIT && !lea
24616 && (SImode_address_operand (addr, VOIDmode)
24617 || (parts.base && GET_MODE (parts.base) == SImode)
24618 || (parts.index && GET_MODE (parts.index) == SImode)))
24619 len++;
24620
24621 base = parts.base;
24622 index = parts.index;
24623 disp = parts.disp;
24624
24625 if (base && GET_CODE (base) == SUBREG)
24626 base = SUBREG_REG (base);
24627 if (index && GET_CODE (index) == SUBREG)
24628 index = SUBREG_REG (index);
24629
24630 gcc_assert (base == NULL_RTX || REG_P (base));
24631 gcc_assert (index == NULL_RTX || REG_P (index));
24632
24633 /* Rule of thumb:
24634 - esp as the base always wants an index,
24635 - ebp as the base always wants a displacement,
24636 - r12 as the base always wants an index,
24637 - r13 as the base always wants a displacement. */
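/* For example, (%esp) can only be encoded with a SIB byte (mod 00, r/m 100),
   and (%ebp) must be encoded as 0(%ebp) (mod 01 with a zero disp8), so each
   takes one byte more than a plain (%eax)-style operand.  */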
24638
24639 /* Register Indirect. */
24640 if (base && !index && !disp)
24641 {
24642 /* esp (for its index) and ebp (for its displacement) need
24643 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
24644 code. */
24645 if (base == arg_pointer_rtx
24646 || base == frame_pointer_rtx
24647 || REGNO (base) == SP_REG
24648 || REGNO (base) == BP_REG
24649 || REGNO (base) == R12_REG
24650 || REGNO (base) == R13_REG)
24651 len++;
24652 }
24653
24654 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
24655 is not disp32, but disp32(%rip), so for disp32
24656 SIB byte is needed, unless print_operand_address
24657 optimizes it into disp32(%rip) or (%rip) is implied
24658 by UNSPEC. */
24659 else if (disp && !base && !index)
24660 {
24661 len += 4;
24662 if (rip_relative_addr_p (&parts))
24663 len++;
24664 }
24665 else
24666 {
24667 /* Find the length of the displacement constant. */
24668 if (disp)
24669 {
24670 if (base && satisfies_constraint_K (disp))
24671 len += 1;
24672 else
24673 len += 4;
24674 }
24675 /* ebp always wants a displacement. Similarly r13. */
24676 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
24677 len++;
24678
24679 /* An index requires the two-byte modrm form.... */
24680 if (index
24681 /* ...like esp (or r12), which always wants an index. */
24682 || base == arg_pointer_rtx
24683 || base == frame_pointer_rtx
24684 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
24685 len++;
24686 }
24687
24688 return len;
24689 }
24690
24691 /* Compute default value for "length_immediate" attribute. When SHORTFORM
24692 is set, expect that the insn has an 8-bit immediate alternative. */
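/* For example, "add $5, %eax" fits the sign-extended 8-bit immediate form and
   contributes length 1, while "add $300, %eax" needs a full 32-bit immediate
   and contributes length 4.  */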
24693 int
24694 ix86_attr_length_immediate_default (rtx insn, bool shortform)
24695 {
24696 int len = 0;
24697 int i;
24698 extract_insn_cached (insn);
24699 for (i = recog_data.n_operands - 1; i >= 0; --i)
24700 if (CONSTANT_P (recog_data.operand[i]))
24701 {
24702 enum attr_mode mode = get_attr_mode (insn);
24703
24704 gcc_assert (!len);
24705 if (shortform && CONST_INT_P (recog_data.operand[i]))
24706 {
24707 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
24708 switch (mode)
24709 {
24710 case MODE_QI:
24711 len = 1;
24712 continue;
24713 case MODE_HI:
24714 ival = trunc_int_for_mode (ival, HImode);
24715 break;
24716 case MODE_SI:
24717 ival = trunc_int_for_mode (ival, SImode);
24718 break;
24719 default:
24720 break;
24721 }
24722 if (IN_RANGE (ival, -128, 127))
24723 {
24724 len = 1;
24725 continue;
24726 }
24727 }
24728 switch (mode)
24729 {
24730 case MODE_QI:
24731 len = 1;
24732 break;
24733 case MODE_HI:
24734 len = 2;
24735 break;
24736 case MODE_SI:
24737 len = 4;
24738 break;
24739 /* Immediates for DImode instructions are encoded
24740 as 32-bit sign-extended values. */
24741 case MODE_DI:
24742 len = 4;
24743 break;
24744 default:
24745 fatal_insn ("unknown insn mode", insn);
24746 }
24747 }
24748 return len;
24749 }
24750
24751 /* Compute default value for "length_address" attribute. */
24752 int
24753 ix86_attr_length_address_default (rtx insn)
24754 {
24755 int i;
24756
24757 if (get_attr_type (insn) == TYPE_LEA)
24758 {
24759 rtx set = PATTERN (insn), addr;
24760
24761 if (GET_CODE (set) == PARALLEL)
24762 set = XVECEXP (set, 0, 0);
24763
24764 gcc_assert (GET_CODE (set) == SET);
24765
24766 addr = SET_SRC (set);
24767
24768 return memory_address_length (addr, true);
24769 }
24770
24771 extract_insn_cached (insn);
24772 for (i = recog_data.n_operands - 1; i >= 0; --i)
24773 if (MEM_P (recog_data.operand[i]))
24774 {
24775 constrain_operands_cached (reload_completed);
24776 if (which_alternative != -1)
24777 {
24778 const char *constraints = recog_data.constraints[i];
24779 int alt = which_alternative;
24780
24781 while (*constraints == '=' || *constraints == '+')
24782 constraints++;
24783 while (alt-- > 0)
24784 while (*constraints++ != ',')
24785 ;
24786 /* Skip ignored operands. */
24787 if (*constraints == 'X')
24788 continue;
24789 }
24790 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24791 }
24792 return 0;
24793 }
24794
24795 /* Compute default value for "length_vex" attribute. It includes
24796 2 or 3 byte VEX prefix and 1 opcode byte. */
24797
24798 int
24799 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24800 {
24801 int i;
24802
24803 /* Only the 0f opcode map can use the 2-byte VEX prefix, and the VEX.W bit
24804 requires the 3-byte VEX prefix. */
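/* The 2-byte form (C5 xx) implies the 0F opcode map and cannot carry the
   VEX.W, VEX.X or VEX.B bits; anything needing those must use the 3-byte
   form (C4 xx xx).  */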
24805 if (!has_0f_opcode || has_vex_w)
24806 return 3 + 1;
24807
24808 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
24809 if (!TARGET_64BIT)
24810 return 2 + 1;
24811
24812 extract_insn_cached (insn);
24813
24814 for (i = recog_data.n_operands - 1; i >= 0; --i)
24815 if (REG_P (recog_data.operand[i]))
24816 {
24817 /* REX.W bit uses 3 byte VEX prefix. */
24818 if (GET_MODE (recog_data.operand[i]) == DImode
24819 && GENERAL_REG_P (recog_data.operand[i]))
24820 return 3 + 1;
24821 }
24822 else
24823 {
24824 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24825 if (MEM_P (recog_data.operand[i])
24826 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24827 return 3 + 1;
24828 }
24829
24830 return 2 + 1;
24831 }
24832 \f
24833 /* Return the maximum number of instructions a cpu can issue. */
24834
24835 static int
24836 ix86_issue_rate (void)
24837 {
24838 switch (ix86_tune)
24839 {
24840 case PROCESSOR_PENTIUM:
24841 case PROCESSOR_ATOM:
24842 case PROCESSOR_SLM:
24843 case PROCESSOR_K6:
24844 case PROCESSOR_BTVER2:
24845 case PROCESSOR_PENTIUM4:
24846 case PROCESSOR_NOCONA:
24847 return 2;
24848
24849 case PROCESSOR_PENTIUMPRO:
24850 case PROCESSOR_ATHLON:
24851 case PROCESSOR_K8:
24852 case PROCESSOR_AMDFAM10:
24853 case PROCESSOR_GENERIC:
24854 case PROCESSOR_BTVER1:
24855 return 3;
24856
24857 case PROCESSOR_BDVER1:
24858 case PROCESSOR_BDVER2:
24859 case PROCESSOR_BDVER3:
24860 case PROCESSOR_CORE2:
24861 case PROCESSOR_COREI7:
24862 case PROCESSOR_COREI7_AVX:
24863 case PROCESSOR_HASWELL:
24864 return 4;
24865
24866 default:
24867 return 1;
24868 }
24869 }
24870
24871 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags set
24872 by DEP_INSN and nothing else set by DEP_INSN. */
24873
24874 static bool
24875 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24876 {
24877 rtx set, set2;
24878
24879 /* Simplify the test for uninteresting insns. */
24880 if (insn_type != TYPE_SETCC
24881 && insn_type != TYPE_ICMOV
24882 && insn_type != TYPE_FCMOV
24883 && insn_type != TYPE_IBR)
24884 return false;
24885
24886 if ((set = single_set (dep_insn)) != 0)
24887 {
24888 set = SET_DEST (set);
24889 set2 = NULL_RTX;
24890 }
24891 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24892 && XVECLEN (PATTERN (dep_insn), 0) == 2
24893 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24894 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24895 {
24896 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24897 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24898 }
24899 else
24900 return false;
24901
24902 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24903 return false;
24904
24905 /* This test is true if the dependent insn reads the flags but
24906 not any other potentially set register. */
24907 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24908 return false;
24909
24910 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24911 return false;
24912
24913 return true;
24914 }
24915
24916 /* Return true iff USE_INSN has a memory address with operands set by
24917 SET_INSN. */
24918
24919 bool
24920 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24921 {
24922 int i;
24923 extract_insn_cached (use_insn);
24924 for (i = recog_data.n_operands - 1; i >= 0; --i)
24925 if (MEM_P (recog_data.operand[i]))
24926 {
24927 rtx addr = XEXP (recog_data.operand[i], 0);
24928 return modified_in_p (addr, set_insn) != 0;
24929 }
24930 return false;
24931 }
24932
24933 /* Helper function for exact_store_load_dependency.
24934 Return true if addr is found in insn. */
24935 static bool
24936 exact_dependency_1 (rtx addr, rtx insn)
24937 {
24938 enum rtx_code code;
24939 const char *format_ptr;
24940 int i, j;
24941
24942 code = GET_CODE (insn);
24943 switch (code)
24944 {
24945 case MEM:
24946 if (rtx_equal_p (addr, insn))
24947 return true;
24948 break;
24949 case REG:
24950 CASE_CONST_ANY:
24951 case SYMBOL_REF:
24952 case CODE_LABEL:
24953 case PC:
24954 case CC0:
24955 case EXPR_LIST:
24956 return false;
24957 default:
24958 break;
24959 }
24960
24961 format_ptr = GET_RTX_FORMAT (code);
24962 for (i = 0; i < GET_RTX_LENGTH (code); i++)
24963 {
24964 switch (*format_ptr++)
24965 {
24966 case 'e':
24967 if (exact_dependency_1 (addr, XEXP (insn, i)))
24968 return true;
24969 break;
24970 case 'E':
24971 for (j = 0; j < XVECLEN (insn, i); j++)
24972 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
24973 return true;
24974 break;
24975 }
24976 }
24977 return false;
24978 }
24979
24980 /* Return true if there exists an exact dependency between the store and
24981 the load, i.e. the same memory address is used in them. */
24982 static bool
24983 exact_store_load_dependency (rtx store, rtx load)
24984 {
24985 rtx set1, set2;
24986
24987 set1 = single_set (store);
24988 if (!set1)
24989 return false;
24990 if (!MEM_P (SET_DEST (set1)))
24991 return false;
24992 set2 = single_set (load);
24993 if (!set2)
24994 return false;
24995 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
24996 return true;
24997 return false;
24998 }
24999
25000 static int
25001 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
25002 {
25003 enum attr_type insn_type, dep_insn_type;
25004 enum attr_memory memory;
25005 rtx set, set2;
25006 int dep_insn_code_number;
25007
25008 /* Anti and output dependencies have zero cost on all CPUs. */
25009 if (REG_NOTE_KIND (link) != 0)
25010 return 0;
25011
25012 dep_insn_code_number = recog_memoized (dep_insn);
25013
25014 /* If we can't recognize the insns, we can't really do anything. */
25015 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
25016 return cost;
25017
25018 insn_type = get_attr_type (insn);
25019 dep_insn_type = get_attr_type (dep_insn);
25020
25021 switch (ix86_tune)
25022 {
25023 case PROCESSOR_PENTIUM:
25024 /* Address Generation Interlock adds a cycle of latency. */
25025 if (insn_type == TYPE_LEA)
25026 {
25027 rtx addr = PATTERN (insn);
25028
25029 if (GET_CODE (addr) == PARALLEL)
25030 addr = XVECEXP (addr, 0, 0);
25031
25032 gcc_assert (GET_CODE (addr) == SET);
25033
25034 addr = SET_SRC (addr);
25035 if (modified_in_p (addr, dep_insn))
25036 cost += 1;
25037 }
25038 else if (ix86_agi_dependent (dep_insn, insn))
25039 cost += 1;
25040
25041 /* ??? Compares pair with jump/setcc. */
25042 if (ix86_flags_dependent (insn, dep_insn, insn_type))
25043 cost = 0;
25044
25045 /* Floating point stores require value to be ready one cycle earlier. */
25046 if (insn_type == TYPE_FMOV
25047 && get_attr_memory (insn) == MEMORY_STORE
25048 && !ix86_agi_dependent (dep_insn, insn))
25049 cost += 1;
25050 break;
25051
25052 case PROCESSOR_PENTIUMPRO:
25053 memory = get_attr_memory (insn);
25054
25055 /* INT->FP conversion is expensive. */
25056 if (get_attr_fp_int_src (dep_insn))
25057 cost += 5;
25058
25059 /* There is one cycle extra latency between an FP op and a store. */
25060 if (insn_type == TYPE_FMOV
25061 && (set = single_set (dep_insn)) != NULL_RTX
25062 && (set2 = single_set (insn)) != NULL_RTX
25063 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
25064 && MEM_P (SET_DEST (set2)))
25065 cost += 1;
25066
25067 /* Show ability of reorder buffer to hide latency of load by executing
25068 in parallel with previous instruction in case
25069 previous instruction is not needed to compute the address. */
25070 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25071 && !ix86_agi_dependent (dep_insn, insn))
25072 {
25073 /* Claim moves to take one cycle, as the core can issue one load
25074 at a time and the next load can start a cycle later. */
25075 if (dep_insn_type == TYPE_IMOV
25076 || dep_insn_type == TYPE_FMOV)
25077 cost = 1;
25078 else if (cost > 1)
25079 cost--;
25080 }
25081 break;
25082
25083 case PROCESSOR_K6:
25084 memory = get_attr_memory (insn);
25085
25086 /* The esp dependency is resolved before the instruction is really
25087 finished. */
25088 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25089 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25090 return 1;
25091
25092 /* INT->FP conversion is expensive. */
25093 if (get_attr_fp_int_src (dep_insn))
25094 cost += 5;
25095
25096 /* Show ability of reorder buffer to hide latency of load by executing
25097 in parallel with previous instruction in case
25098 previous instruction is not needed to compute the address. */
25099 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25100 && !ix86_agi_dependent (dep_insn, insn))
25101 {
25102 /* Claim moves to take one cycle, as the core can issue one load
25103 at a time and the next load can start a cycle later. */
25104 if (dep_insn_type == TYPE_IMOV
25105 || dep_insn_type == TYPE_FMOV)
25106 cost = 1;
25107 else if (cost > 2)
25108 cost -= 2;
25109 else
25110 cost = 1;
25111 }
25112 break;
25113
25114 case PROCESSOR_ATHLON:
25115 case PROCESSOR_K8:
25116 case PROCESSOR_AMDFAM10:
25117 case PROCESSOR_BDVER1:
25118 case PROCESSOR_BDVER2:
25119 case PROCESSOR_BDVER3:
25120 case PROCESSOR_BTVER1:
25121 case PROCESSOR_BTVER2:
25122 case PROCESSOR_GENERIC:
25123 memory = get_attr_memory (insn);
25124
25125 /* The stack engine allows push&pop instructions to execute in parallel. */
25126 if (((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25127 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25128 && (ix86_tune != PROCESSOR_ATHLON && ix86_tune != PROCESSOR_K8))
25129 return 0;
25130
25131 /* Show ability of reorder buffer to hide latency of load by executing
25132 in parallel with previous instruction in case
25133 previous instruction is not needed to compute the address. */
25134 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25135 && !ix86_agi_dependent (dep_insn, insn))
25136 {
25137 enum attr_unit unit = get_attr_unit (insn);
25138 int loadcost = 3;
25139
25140 /* Because of the difference between the length of integer and
25141 floating unit pipeline preparation stages, the memory operands
25142 for floating point are cheaper.
25143
25144 ??? For Athlon the difference is most probably 2. */
25145 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
25146 loadcost = 3;
25147 else
25148 loadcost = TARGET_ATHLON ? 2 : 0;
25149
25150 if (cost >= loadcost)
25151 cost -= loadcost;
25152 else
25153 cost = 0;
25154 }
25155 break;
25156
25157 case PROCESSOR_CORE2:
25158 case PROCESSOR_COREI7:
25159 case PROCESSOR_COREI7_AVX:
25160 case PROCESSOR_HASWELL:
25161 memory = get_attr_memory (insn);
25162
25163 /* The stack engine allows push and pop instructions to execute in parallel. */
25164 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25165 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25166 return 0;
25167
25168 /* Show the ability of the reorder buffer to hide the latency of a load
25169 by executing it in parallel with the previous instruction, when the
25170 previous instruction is not needed to compute the address. */
25171 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25172 && !ix86_agi_dependent (dep_insn, insn))
25173 {
25174 if (cost >= 4)
25175 cost -= 4;
25176 else
25177 cost = 0;
25178 }
25179 break;
25180
25181 case PROCESSOR_SLM:
25182 if (!reload_completed)
25183 return cost;
25184
25185 /* Increase cost of integer loads. */
25186 memory = get_attr_memory (dep_insn);
25187 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25188 {
25189 enum attr_unit unit = get_attr_unit (dep_insn);
25190 if (unit == UNIT_INTEGER && cost == 1)
25191 {
25192 if (memory == MEMORY_LOAD)
25193 cost = 3;
25194 else
25195 {
25196 /* Increase the cost of ld/st for short int types only
25197 because of the store forwarding issue. */
25198 rtx set = single_set (dep_insn);
25199 if (set && (GET_MODE (SET_DEST (set)) == QImode
25200 || GET_MODE (SET_DEST (set)) == HImode))
25201 {
25202 /* Increase the cost of the store/load insn if an exact
25203 dependence exists and INSN is a load. */
25204 enum attr_memory insn_memory = get_attr_memory (insn);
25205 if (insn_memory == MEMORY_LOAD
25206 && exact_store_load_dependency (dep_insn, insn))
25207 cost = 3;
25208 }
25209 }
25210 }
25211 }
25212
25213 default:
25214 break;
25215 }
25216
25217 return cost;
25218 }
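/* Worked example for the Core 2/i7 case above (an illustrative sketch):
   a dependence on a load whose result is not needed for address generation
   has 4 cycles subtracted, so a dependence cost of 5 becomes 1 and a cost
   of 3 becomes 0, modelling the reorder buffer hiding most of the load
   latency behind the preceding instruction.  */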
25219
25220 /* How many alternative schedules to try. This should be as wide as the
25221 scheduling freedom in the DFA, but no wider. Making this value too
25222 large results in extra work for the scheduler. */
25223
25224 static int
25225 ia32_multipass_dfa_lookahead (void)
25226 {
25227 switch (ix86_tune)
25228 {
25229 case PROCESSOR_PENTIUM:
25230 return 2;
25231
25232 case PROCESSOR_PENTIUMPRO:
25233 case PROCESSOR_K6:
25234 return 1;
25235
25236 case PROCESSOR_BDVER1:
25237 case PROCESSOR_BDVER2:
25238 case PROCESSOR_BDVER3:
25239 /* We use lookahead value 4 for BD both before and after reload
25240 schedules. The plan is to include value 8 for -O3. */
25241 return 4;
25242
25243 case PROCESSOR_CORE2:
25244 case PROCESSOR_COREI7:
25245 case PROCESSOR_COREI7_AVX:
25246 case PROCESSOR_HASWELL:
25247 case PROCESSOR_ATOM:
25248 case PROCESSOR_SLM:
25249 /* Generally, we want haifa-sched:max_issue() to look ahead as far
25250 as the number of instructions that can be executed in one cycle, i.e.,
25251 issue_rate. I wonder why tuning for many CPUs does not do this. */
25252 if (reload_completed)
25253 return ix86_issue_rate ();
25254 /* Don't use lookahead for pre-reload schedule to save compile time. */
25255 return 0;
25256
25257 default:
25258 return 0;
25259 }
25260 }
25261
25262 /* Return true if target platform supports macro-fusion. */
25263
25264 static bool
25265 ix86_macro_fusion_p ()
25266 {
25267 return TARGET_FUSE_CMP_AND_BRANCH;
25268 }
25269
25270 /* Check whether the current microarchitecture supports macro fusion
25271 for insn pair "CONDGEN + CONDJMP". Refer to
25272 "Intel Architectures Optimization Reference Manual". */
25273
25274 static bool
25275 ix86_macro_fusion_pair_p (rtx condgen, rtx condjmp)
25276 {
25277 rtx src, dest;
25278 rtx single_set = single_set (condgen);
25279 enum rtx_code ccode;
25280 rtx compare_set = NULL_RTX, test_if, cond;
25281 rtx alu_set = NULL_RTX, addr = NULL_RTX;
25282
25283 if (get_attr_type (condgen) != TYPE_TEST
25284 && get_attr_type (condgen) != TYPE_ICMP
25285 && get_attr_type (condgen) != TYPE_INCDEC
25286 && get_attr_type (condgen) != TYPE_ALU)
25287 return false;
25288
25289 if (single_set == NULL_RTX
25290 && !TARGET_FUSE_ALU_AND_BRANCH)
25291 return false;
25292
25293 if (single_set != NULL_RTX)
25294 compare_set = single_set;
25295 else
25296 {
25297 int i;
25298 rtx pat = PATTERN (condgen);
25299 for (i = 0; i < XVECLEN (pat, 0); i++)
25300 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
25301 {
25302 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
25303 if (GET_CODE (set_src) == COMPARE)
25304 compare_set = XVECEXP (pat, 0, i);
25305 else
25306 alu_set = XVECEXP (pat, 0, i);
25307 }
25308 }
25309 if (compare_set == NULL_RTX)
25310 return false;
25311 src = SET_SRC (compare_set);
25312 if (GET_CODE (src) != COMPARE)
25313 return false;
25314
25315 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
25316 supported. */
25317 if ((MEM_P (XEXP (src, 0))
25318 && CONST_INT_P (XEXP (src, 1)))
25319 || (MEM_P (XEXP (src, 1))
25320 && CONST_INT_P (XEXP (src, 0))))
25321 return false;
25322
25323 /* No fusion for RIP-relative address. */
25324 if (MEM_P (XEXP (src, 0)))
25325 addr = XEXP (XEXP (src, 0), 0);
25326 else if (MEM_P (XEXP (src, 1)))
25327 addr = XEXP (XEXP (src, 1), 0);
25328
25329 if (addr) {
25330 ix86_address parts;
25331 int ok = ix86_decompose_address (addr, &parts);
25332 gcc_assert (ok);
25333
25334 if (rip_relative_addr_p (&parts))
25335 return false;
25336 }
25337
25338 test_if = SET_SRC (pc_set (condjmp));
25339 cond = XEXP (test_if, 0);
25340 ccode = GET_CODE (cond);
25341 /* Check whether the conditional jump uses the Sign or Overflow flags. */
25342 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
25343 && (ccode == GE
25344 || ccode == GT
25345 || ccode == LE
25346 || ccode == LT))
25347 return false;
25348
25349 /* Return true for TYPE_TEST and TYPE_ICMP. */
25350 if (get_attr_type (condgen) == TYPE_TEST
25351 || get_attr_type (condgen) == TYPE_ICMP)
25352 return true;
25353
25354 /* The following handles the case of macro-fusion for alu + jmp. */
25355 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
25356 return false;
25357
25358 /* No fusion for alu op with memory destination operand. */
25359 dest = SET_DEST (alu_set);
25360 if (MEM_P (dest))
25361 return false;
25362
25363 /* Macro-fusion for inc/dec + unsigned conditional jump is not
25364 supported. */
25365 if (get_attr_type (condgen) == TYPE_INCDEC
25366 && (ccode == GEU
25367 || ccode == GTU
25368 || ccode == LEU
25369 || ccode == LTU))
25370 return false;
25371
25372 return true;
25373 }
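/* Illustrative examples for the checks above (a sketch, assuming a CPU with
   TARGET_FUSE_CMP_AND_BRANCH):
     cmpl %esi, %edi ; jne .L2        -- fusible compare + branch
     cmpl $0, (%rdi) ; je .L2         -- rejected: memory-immediate compare
     incl %eax       ; ja .L2         -- rejected: inc/dec + unsigned jump  */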
25374
25375 /* Try to reorder the ready list to take advantage of Atom's pipelined IMUL
25376 execution. It is applied if
25377 (1) an IMUL instruction is at the top of the list;
25378 (2) there exists exactly one producer of an independent IMUL instruction in
25379 the ready list.
25380 Return the index of the IMUL producer if it was found and -1 otherwise. */
25381 static int
25382 do_reorder_for_imul (rtx *ready, int n_ready)
25383 {
25384 rtx insn, set, insn1, insn2;
25385 sd_iterator_def sd_it;
25386 dep_t dep;
25387 int index = -1;
25388 int i;
25389
25390 if (ix86_tune != PROCESSOR_ATOM)
25391 return index;
25392
25393 /* Check that IMUL instruction is on the top of ready list. */
25394 insn = ready[n_ready - 1];
25395 set = single_set (insn);
25396 if (!set)
25397 return index;
25398 if (!(GET_CODE (SET_SRC (set)) == MULT
25399 && GET_MODE (SET_SRC (set)) == SImode))
25400 return index;
25401
25402 /* Search for producer of independent IMUL instruction. */
25403 for (i = n_ready - 2; i >= 0; i--)
25404 {
25405 insn = ready[i];
25406 if (!NONDEBUG_INSN_P (insn))
25407 continue;
25408 /* Skip IMUL instruction. */
25409 insn2 = PATTERN (insn);
25410 if (GET_CODE (insn2) == PARALLEL)
25411 insn2 = XVECEXP (insn2, 0, 0);
25412 if (GET_CODE (insn2) == SET
25413 && GET_CODE (SET_SRC (insn2)) == MULT
25414 && GET_MODE (SET_SRC (insn2)) == SImode)
25415 continue;
25416
25417 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
25418 {
25419 rtx con;
25420 con = DEP_CON (dep);
25421 if (!NONDEBUG_INSN_P (con))
25422 continue;
25423 insn1 = PATTERN (con);
25424 if (GET_CODE (insn1) == PARALLEL)
25425 insn1 = XVECEXP (insn1, 0, 0);
25426
25427 if (GET_CODE (insn1) == SET
25428 && GET_CODE (SET_SRC (insn1)) == MULT
25429 && GET_MODE (SET_SRC (insn1)) == SImode)
25430 {
25431 sd_iterator_def sd_it1;
25432 dep_t dep1;
25433 /* Check that INSN is the only producer of the IMUL. */
25434 index = i;
25435 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
25436 {
25437 rtx pro;
25438 pro = DEP_PRO (dep1);
25439 if (!NONDEBUG_INSN_P (pro))
25440 continue;
25441 if (pro != insn)
25442 index = -1;
25443 }
25444 if (index >= 0)
25445 break;
25446 }
25447 }
25448 if (index >= 0)
25449 break;
25450 }
25451 return index;
25452 }
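/* Illustrative example: if the top of the ready list is an SImode imull and
   another ready insn is the sole producer feeding a different, independent
   imull, the index of that producer is returned so the caller can move it
   to the top of the list, keeping Atom's pipelined IMUL unit busy on
   consecutive cycles.  (A sketch of the intent; the exact schedule still
   depends on the DFA.)  */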
25453
25454 /* Try to find the best candidate for the top of the ready list if two insns
25455 have the same priority - a candidate is best if the insns it depends on were
25456 scheduled earlier. Applied to Silvermont only.
25457 Return true if the top 2 insns must be interchanged. */
25458 static bool
25459 swap_top_of_ready_list (rtx *ready, int n_ready)
25460 {
25461 rtx top = ready[n_ready - 1];
25462 rtx next = ready[n_ready - 2];
25463 rtx set;
25464 sd_iterator_def sd_it;
25465 dep_t dep;
25466 int clock1 = -1;
25467 int clock2 = -1;
25468 #define INSN_TICK(INSN) (HID (INSN)->tick)
25469
25470 if (ix86_tune != PROCESSOR_SLM)
25471 return false;
25472
25473 if (!NONDEBUG_INSN_P (top))
25474 return false;
25475 if (!NONJUMP_INSN_P (top))
25476 return false;
25477 if (!NONDEBUG_INSN_P (next))
25478 return false;
25479 if (!NONJUMP_INSN_P (next))
25480 return false;
25481 set = single_set (top);
25482 if (!set)
25483 return false;
25484 set = single_set (next);
25485 if (!set)
25486 return false;
25487
25488 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
25489 {
25490 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
25491 return false;
25492 /* Determine the winner more precisely. */
25493 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
25494 {
25495 rtx pro;
25496 pro = DEP_PRO (dep);
25497 if (!NONDEBUG_INSN_P (pro))
25498 continue;
25499 if (INSN_TICK (pro) > clock1)
25500 clock1 = INSN_TICK (pro);
25501 }
25502 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
25503 {
25504 rtx pro;
25505 pro = DEP_PRO (dep);
25506 if (!NONDEBUG_INSN_P (pro))
25507 continue;
25508 if (INSN_TICK (pro) > clock2)
25509 clock2 = INSN_TICK (pro);
25510 }
25511
25512 if (clock1 == clock2)
25513 {
25514 /* Determine winner - load must win. */
25515 enum attr_memory memory1, memory2;
25516 memory1 = get_attr_memory (top);
25517 memory2 = get_attr_memory (next);
25518 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
25519 return true;
25520 }
25521 return (bool) (clock2 < clock1);
25522 }
25523 return false;
25524 #undef INSN_TICK
25525 }
25526
25527 /* Perform possible reordering of the ready list, for Atom/Silvermont only.
25528 Return issue rate. */
25529 static int
25530 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
25531 int clock_var)
25532 {
25533 int issue_rate = -1;
25534 int n_ready = *pn_ready;
25535 int i;
25536 rtx insn;
25537 int index = -1;
25538
25539 /* Set up issue rate. */
25540 issue_rate = ix86_issue_rate ();
25541
25542 /* Do reordering for Atom/SLM only. */
25543 if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM)
25544 return issue_rate;
25545
25546 /* Nothing to do if ready list contains only 1 instruction. */
25547 if (n_ready <= 1)
25548 return issue_rate;
25549
25550 /* Do reordering for the post-reload scheduler only. */
25551 if (!reload_completed)
25552 return issue_rate;
25553
25554 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
25555 {
25556 if (sched_verbose > 1)
25557 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
25558 INSN_UID (ready[index]));
25559
25560 /* Put IMUL producer (ready[index]) at the top of ready list. */
25561 insn = ready[index];
25562 for (i = index; i < n_ready - 1; i++)
25563 ready[i] = ready[i + 1];
25564 ready[n_ready - 1] = insn;
25565 return issue_rate;
25566 }
25567 if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready))
25568 {
25569 if (sched_verbose > 1)
25570 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
25571 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
25572 /* Swap 2 top elements of ready list. */
25573 insn = ready[n_ready - 1];
25574 ready[n_ready - 1] = ready[n_ready - 2];
25575 ready[n_ready - 2] = insn;
25576 }
25577 return issue_rate;
25578 }
25579
25580 static bool
25581 ix86_class_likely_spilled_p (reg_class_t);
25582
25583 /* Return true if the LHS of INSN is a HW function argument register; set
25584 *IS_SPILLED to true if it is a likely-spilled HW register. */
25585 static bool
25586 insn_is_function_arg (rtx insn, bool* is_spilled)
25587 {
25588 rtx dst;
25589
25590 if (!NONDEBUG_INSN_P (insn))
25591 return false;
25592 /* Call instructions are not movable; ignore them. */
25593 if (CALL_P (insn))
25594 return false;
25595 insn = PATTERN (insn);
25596 if (GET_CODE (insn) == PARALLEL)
25597 insn = XVECEXP (insn, 0, 0);
25598 if (GET_CODE (insn) != SET)
25599 return false;
25600 dst = SET_DEST (insn);
25601 if (REG_P (dst) && HARD_REGISTER_P (dst)
25602 && ix86_function_arg_regno_p (REGNO (dst)))
25603 {
25604 /* Is it likely spilled HW register? */
25605 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
25606 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
25607 *is_spilled = true;
25608 return true;
25609 }
25610 return false;
25611 }
25612
25613 /* Add output dependencies for a chain of adjacent function arguments, but only
25614 if there is a move to a likely-spilled HW register. Return the first argument
25615 if at least one dependence was added, or NULL otherwise. */
25616 static rtx
25617 add_parameter_dependencies (rtx call, rtx head)
25618 {
25619 rtx insn;
25620 rtx last = call;
25621 rtx first_arg = NULL;
25622 bool is_spilled = false;
25623
25624 head = PREV_INSN (head);
25625
25626 /* Find the argument-passing instruction nearest to the call. */
25627 while (true)
25628 {
25629 last = PREV_INSN (last);
25630 if (last == head)
25631 return NULL;
25632 if (!NONDEBUG_INSN_P (last))
25633 continue;
25634 if (insn_is_function_arg (last, &is_spilled))
25635 break;
25636 return NULL;
25637 }
25638
25639 first_arg = last;
25640 while (true)
25641 {
25642 insn = PREV_INSN (last);
25643 if (!INSN_P (insn))
25644 break;
25645 if (insn == head)
25646 break;
25647 if (!NONDEBUG_INSN_P (insn))
25648 {
25649 last = insn;
25650 continue;
25651 }
25652 if (insn_is_function_arg (insn, &is_spilled))
25653 {
25654 /* Add an output dependence between two function arguments if the chain
25655 of output arguments contains likely-spilled HW registers. */
25656 if (is_spilled)
25657 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25658 first_arg = last = insn;
25659 }
25660 else
25661 break;
25662 }
25663 if (!is_spilled)
25664 return NULL;
25665 return first_arg;
25666 }
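/* Sketch of the intended effect (illustrative 64-bit example):
     movl $1, %edi
     movl $2, %esi
     call foo
   Here %edi/%esi belong to likely-spilled argument-register classes, so
   output dependencies are added along the chain of argument moves; the
   pre-reload scheduler then keeps their order and limits motion that would
   extend the live ranges of these likely-spilled registers.  */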
25667
25668 /* Add output or anti dependency from insn to first_arg to restrict its code
25669 motion. */
25670 static void
25671 avoid_func_arg_motion (rtx first_arg, rtx insn)
25672 {
25673 rtx set;
25674 rtx tmp;
25675
25676 set = single_set (insn);
25677 if (!set)
25678 return;
25679 tmp = SET_DEST (set);
25680 if (REG_P (tmp))
25681 {
25682 /* Add output dependency to the first function argument. */
25683 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25684 return;
25685 }
25686 /* Add anti dependency. */
25687 add_dependence (first_arg, insn, REG_DEP_ANTI);
25688 }
25689
25690 /* Avoid cross-block motion of a function argument by adding a dependency
25691 from the first non-jump instruction in BB. */
25692 static void
25693 add_dependee_for_func_arg (rtx arg, basic_block bb)
25694 {
25695 rtx insn = BB_END (bb);
25696
25697 while (insn)
25698 {
25699 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
25700 {
25701 rtx set = single_set (insn);
25702 if (set)
25703 {
25704 avoid_func_arg_motion (arg, insn);
25705 return;
25706 }
25707 }
25708 if (insn == BB_HEAD (bb))
25709 return;
25710 insn = PREV_INSN (insn);
25711 }
25712 }
25713
25714 /* Hook for pre-reload schedule - avoid motion of function arguments
25715 passed in likely spilled HW registers. */
25716 static void
25717 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
25718 {
25719 rtx insn;
25720 rtx first_arg = NULL;
25721 if (reload_completed)
25722 return;
25723 while (head != tail && DEBUG_INSN_P (head))
25724 head = NEXT_INSN (head);
25725 for (insn = tail; insn != head; insn = PREV_INSN (insn))
25726 if (INSN_P (insn) && CALL_P (insn))
25727 {
25728 first_arg = add_parameter_dependencies (insn, head);
25729 if (first_arg)
25730 {
25731 /* Add a dependee for the first argument to predecessors, but only if the
25732 region contains more than one block. */
25733 basic_block bb = BLOCK_FOR_INSN (insn);
25734 int rgn = CONTAINING_RGN (bb->index);
25735 int nr_blks = RGN_NR_BLOCKS (rgn);
25736 /* Skip trivial regions and region head blocks that can have
25737 predecessors outside of region. */
25738 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
25739 {
25740 edge e;
25741 edge_iterator ei;
25742 /* Assume that region is SCC, i.e. all immediate predecessors
25743 of non-head block are in the same region. */
25744 FOR_EACH_EDGE (e, ei, bb->preds)
25745 {
25746 /* Avoid creating loop-carried dependencies by
25747 using the topological ordering in the region. */
25748 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
25749 add_dependee_for_func_arg (first_arg, e->src);
25750 }
25751 }
25752 insn = first_arg;
25753 if (insn == head)
25754 break;
25755 }
25756 }
25757 else if (first_arg)
25758 avoid_func_arg_motion (first_arg, insn);
25759 }
25760
25761 /* Hook for pre-reload schedule - set priority of moves from likely spilled
25762 HW registers to the maximum, to schedule them as soon as possible. These are
25763 moves from function argument registers at the top of the function entry
25764 and moves from function return value registers after call. */
25765 static int
25766 ix86_adjust_priority (rtx insn, int priority)
25767 {
25768 rtx set;
25769
25770 if (reload_completed)
25771 return priority;
25772
25773 if (!NONDEBUG_INSN_P (insn))
25774 return priority;
25775
25776 set = single_set (insn);
25777 if (set)
25778 {
25779 rtx tmp = SET_SRC (set);
25780 if (REG_P (tmp)
25781 && HARD_REGISTER_P (tmp)
25782 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
25783 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
25784 return current_sched_info->sched_max_insns_priority;
25785 }
25786
25787 return priority;
25788 }
25789
25790 /* Model decoder of Core 2/i7.
25791 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
25792 track the instruction fetch block boundaries and make sure that long
25793 (9+ bytes) instructions are assigned to D0. */
25794
25795 /* Maximum length of an insn that can be handled by
25796 a secondary decoder unit. '8' for Core 2/i7. */
25797 static int core2i7_secondary_decoder_max_insn_size;
25798
25799 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
25800 '16' for Core 2/i7. */
25801 static int core2i7_ifetch_block_size;
25802
25803 /* Maximum number of instructions decoder can handle per cycle.
25804 '6' for Core 2/i7. */
25805 static int core2i7_ifetch_block_max_insns;
25806
25807 typedef struct ix86_first_cycle_multipass_data_ *
25808 ix86_first_cycle_multipass_data_t;
25809 typedef const struct ix86_first_cycle_multipass_data_ *
25810 const_ix86_first_cycle_multipass_data_t;
25811
25812 /* A variable to store target state across calls to max_issue within
25813 one cycle. */
25814 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
25815 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
25816
25817 /* Initialize DATA. */
25818 static void
25819 core2i7_first_cycle_multipass_init (void *_data)
25820 {
25821 ix86_first_cycle_multipass_data_t data
25822 = (ix86_first_cycle_multipass_data_t) _data;
25823
25824 data->ifetch_block_len = 0;
25825 data->ifetch_block_n_insns = 0;
25826 data->ready_try_change = NULL;
25827 data->ready_try_change_size = 0;
25828 }
25829
25830 /* Advancing the cycle; reset ifetch block counts. */
25831 static void
25832 core2i7_dfa_post_advance_cycle (void)
25833 {
25834 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
25835
25836 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25837
25838 data->ifetch_block_len = 0;
25839 data->ifetch_block_n_insns = 0;
25840 }
25841
25842 static int min_insn_size (rtx);
25843
25844 /* Filter out insns from ready_try that the core will not be able to issue
25845 on current cycle due to decoder. */
25846 static void
25847 core2i7_first_cycle_multipass_filter_ready_try
25848 (const_ix86_first_cycle_multipass_data_t data,
25849 char *ready_try, int n_ready, bool first_cycle_insn_p)
25850 {
25851 while (n_ready--)
25852 {
25853 rtx insn;
25854 int insn_size;
25855
25856 if (ready_try[n_ready])
25857 continue;
25858
25859 insn = get_ready_element (n_ready);
25860 insn_size = min_insn_size (insn);
25861
25862 if (/* If the insn is too long for a secondary decoder ... */
25863 (!first_cycle_insn_p
25864 && insn_size > core2i7_secondary_decoder_max_insn_size)
25865 /* ... or it would not fit into the ifetch block ... */
25866 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
25867 /* ... or the decoder is full already ... */
25868 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
25869 /* ... mask the insn out. */
25870 {
25871 ready_try[n_ready] = 1;
25872
25873 if (data->ready_try_change)
25874 bitmap_set_bit (data->ready_try_change, n_ready);
25875 }
25876 }
25877 }
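/* Worked example (a sketch, using the Core 2/i7 parameters set in
   ix86_sched_init_global: 16-byte ifetch block, at most 6 insns per block,
   8-byte limit for the secondary decoders): a 9-byte insn is masked out of
   ready_try unless it can be issued as the first insn of the cycle (only
   decoder D0 handles it), and once e.g. 12 bytes of the current block are
   consumed, any candidate longer than 4 bytes no longer fits and is masked
   out as well.  */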
25878
25879 /* Prepare for a new round of multipass lookahead scheduling. */
25880 static void
25881 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
25882 bool first_cycle_insn_p)
25883 {
25884 ix86_first_cycle_multipass_data_t data
25885 = (ix86_first_cycle_multipass_data_t) _data;
25886 const_ix86_first_cycle_multipass_data_t prev_data
25887 = ix86_first_cycle_multipass_data;
25888
25889 /* Restore the state from the end of the previous round. */
25890 data->ifetch_block_len = prev_data->ifetch_block_len;
25891 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
25892
25893 /* Filter instructions that cannot be issued on current cycle due to
25894 decoder restrictions. */
25895 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25896 first_cycle_insn_p);
25897 }
25898
25899 /* INSN is being issued in current solution. Account for its impact on
25900 the decoder model. */
25901 static void
25902 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
25903 rtx insn, const void *_prev_data)
25904 {
25905 ix86_first_cycle_multipass_data_t data
25906 = (ix86_first_cycle_multipass_data_t) _data;
25907 const_ix86_first_cycle_multipass_data_t prev_data
25908 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
25909
25910 int insn_size = min_insn_size (insn);
25911
25912 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
25913 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
25914 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
25915 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25916
25917 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
25918 if (!data->ready_try_change)
25919 {
25920 data->ready_try_change = sbitmap_alloc (n_ready);
25921 data->ready_try_change_size = n_ready;
25922 }
25923 else if (data->ready_try_change_size < n_ready)
25924 {
25925 data->ready_try_change = sbitmap_resize (data->ready_try_change,
25926 n_ready, 0);
25927 data->ready_try_change_size = n_ready;
25928 }
25929 bitmap_clear (data->ready_try_change);
25930
25931 /* Filter out insns from ready_try that the core will not be able to issue
25932 on current cycle due to decoder. */
25933 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25934 false);
25935 }
25936
25937 /* Revert the effect on ready_try. */
25938 static void
25939 core2i7_first_cycle_multipass_backtrack (const void *_data,
25940 char *ready_try,
25941 int n_ready ATTRIBUTE_UNUSED)
25942 {
25943 const_ix86_first_cycle_multipass_data_t data
25944 = (const_ix86_first_cycle_multipass_data_t) _data;
25945 unsigned int i = 0;
25946 sbitmap_iterator sbi;
25947
25948 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
25949 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
25950 {
25951 ready_try[i] = 0;
25952 }
25953 }
25954
25955 /* Save the result of multipass lookahead scheduling for the next round. */
25956 static void
25957 core2i7_first_cycle_multipass_end (const void *_data)
25958 {
25959 const_ix86_first_cycle_multipass_data_t data
25960 = (const_ix86_first_cycle_multipass_data_t) _data;
25961 ix86_first_cycle_multipass_data_t next_data
25962 = ix86_first_cycle_multipass_data;
25963
25964 if (data != NULL)
25965 {
25966 next_data->ifetch_block_len = data->ifetch_block_len;
25967 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
25968 }
25969 }
25970
25971 /* Deallocate target data. */
25972 static void
25973 core2i7_first_cycle_multipass_fini (void *_data)
25974 {
25975 ix86_first_cycle_multipass_data_t data
25976 = (ix86_first_cycle_multipass_data_t) _data;
25977
25978 if (data->ready_try_change)
25979 {
25980 sbitmap_free (data->ready_try_change);
25981 data->ready_try_change = NULL;
25982 data->ready_try_change_size = 0;
25983 }
25984 }
25985
25986 /* Prepare for scheduling pass. */
25987 static void
25988 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
25989 int verbose ATTRIBUTE_UNUSED,
25990 int max_uid ATTRIBUTE_UNUSED)
25991 {
25992 /* Install scheduling hooks for current CPU. Some of these hooks are used
25993 in time-critical parts of the scheduler, so we only set them up when
25994 they are actually used. */
25995 switch (ix86_tune)
25996 {
25997 case PROCESSOR_CORE2:
25998 case PROCESSOR_COREI7:
25999 case PROCESSOR_COREI7_AVX:
26000 case PROCESSOR_HASWELL:
26001 /* Do not perform multipass scheduling for pre-reload schedule
26002 to save compile time. */
26003 if (reload_completed)
26004 {
26005 targetm.sched.dfa_post_advance_cycle
26006 = core2i7_dfa_post_advance_cycle;
26007 targetm.sched.first_cycle_multipass_init
26008 = core2i7_first_cycle_multipass_init;
26009 targetm.sched.first_cycle_multipass_begin
26010 = core2i7_first_cycle_multipass_begin;
26011 targetm.sched.first_cycle_multipass_issue
26012 = core2i7_first_cycle_multipass_issue;
26013 targetm.sched.first_cycle_multipass_backtrack
26014 = core2i7_first_cycle_multipass_backtrack;
26015 targetm.sched.first_cycle_multipass_end
26016 = core2i7_first_cycle_multipass_end;
26017 targetm.sched.first_cycle_multipass_fini
26018 = core2i7_first_cycle_multipass_fini;
26019
26020 /* Set decoder parameters. */
26021 core2i7_secondary_decoder_max_insn_size = 8;
26022 core2i7_ifetch_block_size = 16;
26023 core2i7_ifetch_block_max_insns = 6;
26024 break;
26025 }
26026 /* ... Fall through ... */
26027 default:
26028 targetm.sched.dfa_post_advance_cycle = NULL;
26029 targetm.sched.first_cycle_multipass_init = NULL;
26030 targetm.sched.first_cycle_multipass_begin = NULL;
26031 targetm.sched.first_cycle_multipass_issue = NULL;
26032 targetm.sched.first_cycle_multipass_backtrack = NULL;
26033 targetm.sched.first_cycle_multipass_end = NULL;
26034 targetm.sched.first_cycle_multipass_fini = NULL;
26035 break;
26036 }
26037 }
26038
26039 \f
26040 /* Compute the alignment given to a constant that is being placed in memory.
26041 EXP is the constant and ALIGN is the alignment that the object would
26042 ordinarily have.
26043 The value of this function is used instead of that alignment to align
26044 the object. */
26045
26046 int
26047 ix86_constant_alignment (tree exp, int align)
26048 {
26049 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
26050 || TREE_CODE (exp) == INTEGER_CST)
26051 {
26052 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
26053 return 64;
26054 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
26055 return 128;
26056 }
26057 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
26058 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
26059 return BITS_PER_WORD;
26060
26061 return align;
26062 }
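/* Illustrative examples: a DFmode constant such as 3.0 that would otherwise
   get only 32-bit alignment is given 64-bit alignment here, a V4SF vector
   constant gets 128-bit alignment, and (when not optimizing for size) a
   string constant of 31 or more bytes is given BITS_PER_WORD alignment.  */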
26063
26064 /* Compute the alignment for a static variable.
26065 TYPE is the data type, and ALIGN is the alignment that
26066 the object would ordinarily have. The value of this function is used
26067 instead of that alignment to align the object. */
26068
26069 int
26070 ix86_data_alignment (tree type, int align, bool opt)
26071 {
26072 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
26073
26074 if (opt
26075 && AGGREGATE_TYPE_P (type)
26076 && TYPE_SIZE (type)
26077 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26078 && wi::geu_p (TYPE_SIZE (type), max_align)
26079 && align < max_align)
26080 align = max_align;
26081
26082 /* The x86-64 ABI requires arrays larger than 16 bytes to be aligned
26083 to a 16-byte boundary. */
26084 if (TARGET_64BIT)
26085 {
26086 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
26087 && TYPE_SIZE (type)
26088 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26089 && wi::geu_p (TYPE_SIZE (type), 128)
26090 && align < 128)
26091 return 128;
26092 }
26093
26094 if (!opt)
26095 return align;
26096
26097 if (TREE_CODE (type) == ARRAY_TYPE)
26098 {
26099 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
26100 return 64;
26101 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
26102 return 128;
26103 }
26104 else if (TREE_CODE (type) == COMPLEX_TYPE)
26105 {
26106
26107 if (TYPE_MODE (type) == DCmode && align < 64)
26108 return 64;
26109 if ((TYPE_MODE (type) == XCmode
26110 || TYPE_MODE (type) == TCmode) && align < 128)
26111 return 128;
26112 }
26113 else if ((TREE_CODE (type) == RECORD_TYPE
26114 || TREE_CODE (type) == UNION_TYPE
26115 || TREE_CODE (type) == QUAL_UNION_TYPE)
26116 && TYPE_FIELDS (type))
26117 {
26118 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
26119 return 64;
26120 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
26121 return 128;
26122 }
26123 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
26124 || TREE_CODE (type) == INTEGER_TYPE)
26125 {
26126 if (TYPE_MODE (type) == DFmode && align < 64)
26127 return 64;
26128 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
26129 return 128;
26130 }
26131
26132 return align;
26133 }
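/* Illustrative example: on x86-64 a static array such as
     static char buf[32];
   satisfies the >= 16-byte rule above and is aligned to 128 bits (16 bytes),
   allowing aligned SSE accesses, while a scalar double gets at least 64-bit
   alignment from the DFmode checks.  */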
26134
26135 /* Compute the alignment for a local variable or a stack slot. EXP is
26136 the data type or decl itself, MODE is the widest mode available and
26137 ALIGN is the alignment that the object would ordinarily have. The
26138 value of this macro is used instead of that alignment to align the
26139 object. */
26140
26141 unsigned int
26142 ix86_local_alignment (tree exp, enum machine_mode mode,
26143 unsigned int align)
26144 {
26145 tree type, decl;
26146
26147 if (exp && DECL_P (exp))
26148 {
26149 type = TREE_TYPE (exp);
26150 decl = exp;
26151 }
26152 else
26153 {
26154 type = exp;
26155 decl = NULL;
26156 }
26157
26158 /* Don't do dynamic stack realignment for long long objects with
26159 -mpreferred-stack-boundary=2. */
26160 if (!TARGET_64BIT
26161 && align == 64
26162 && ix86_preferred_stack_boundary < 64
26163 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
26164 && (!type || !TYPE_USER_ALIGN (type))
26165 && (!decl || !DECL_USER_ALIGN (decl)))
26166 align = 32;
26167
26168 /* If TYPE is NULL, we are allocating a stack slot for caller-save
26169 register in MODE. We will return the largest alignment of XF
26170 and DF. */
26171 if (!type)
26172 {
26173 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
26174 align = GET_MODE_ALIGNMENT (DFmode);
26175 return align;
26176 }
26177
26178 /* The x86-64 ABI requires arrays larger than 16 bytes to be aligned
26179 to a 16-byte boundary. The exact wording is:
26180
26181 An array uses the same alignment as its elements, except that a local or
26182 global array variable of length at least 16 bytes or
26183 a C99 variable-length array variable always has alignment of at least 16 bytes.
26184
26185 This was added to allow the use of aligned SSE instructions on arrays. The
26186 rule is meant for static storage (where the compiler cannot do the analysis
26187 by itself). We follow it for automatic variables only when convenient.
26188 We fully control everything in the function being compiled, and functions
26189 from other units cannot rely on the alignment.
26190
26191 Exclude the va_list type. It is the common case of a local array where
26192 we cannot benefit from the alignment.
26193
26194 TODO: Probably one should optimize for size only when the variable does not escape. */
26195 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
26196 && TARGET_SSE)
26197 {
26198 if (AGGREGATE_TYPE_P (type)
26199 && (va_list_type_node == NULL_TREE
26200 || (TYPE_MAIN_VARIANT (type)
26201 != TYPE_MAIN_VARIANT (va_list_type_node)))
26202 && TYPE_SIZE (type)
26203 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26204 && wi::geu_p (TYPE_SIZE (type), 16)
26205 && align < 128)
26206 return 128;
26207 }
26208 if (TREE_CODE (type) == ARRAY_TYPE)
26209 {
26210 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
26211 return 64;
26212 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
26213 return 128;
26214 }
26215 else if (TREE_CODE (type) == COMPLEX_TYPE)
26216 {
26217 if (TYPE_MODE (type) == DCmode && align < 64)
26218 return 64;
26219 if ((TYPE_MODE (type) == XCmode
26220 || TYPE_MODE (type) == TCmode) && align < 128)
26221 return 128;
26222 }
26223 else if ((TREE_CODE (type) == RECORD_TYPE
26224 || TREE_CODE (type) == UNION_TYPE
26225 || TREE_CODE (type) == QUAL_UNION_TYPE)
26226 && TYPE_FIELDS (type))
26227 {
26228 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
26229 return 64;
26230 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
26231 return 128;
26232 }
26233 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
26234 || TREE_CODE (type) == INTEGER_TYPE)
26235 {
26236
26237 if (TYPE_MODE (type) == DFmode && align < 64)
26238 return 64;
26239 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
26240 return 128;
26241 }
26242 return align;
26243 }
26244
26245 /* Compute the minimum required alignment for dynamic stack realignment
26246 purposes for a local variable, parameter or a stack slot. EXP is
26247 the data type or decl itself, MODE is its mode and ALIGN is the
26248 alignment that the object would ordinarily have. */
26249
26250 unsigned int
26251 ix86_minimum_alignment (tree exp, enum machine_mode mode,
26252 unsigned int align)
26253 {
26254 tree type, decl;
26255
26256 if (exp && DECL_P (exp))
26257 {
26258 type = TREE_TYPE (exp);
26259 decl = exp;
26260 }
26261 else
26262 {
26263 type = exp;
26264 decl = NULL;
26265 }
26266
26267 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
26268 return align;
26269
26270 /* Don't do dynamic stack realignment for long long objects with
26271 -mpreferred-stack-boundary=2. */
26272 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
26273 && (!type || !TYPE_USER_ALIGN (type))
26274 && (!decl || !DECL_USER_ALIGN (decl)))
26275 return 32;
26276
26277 return align;
26278 }
26279 \f
26280 /* Find a location for the static chain incoming to a nested function.
26281 This is a register, unless all free registers are used by arguments. */
26282
26283 static rtx
26284 ix86_static_chain (const_tree fndecl, bool incoming_p)
26285 {
26286 unsigned regno;
26287
26288 if (!DECL_STATIC_CHAIN (fndecl))
26289 return NULL;
26290
26291 if (TARGET_64BIT)
26292 {
26293 /* We always use R10 in 64-bit mode. */
26294 regno = R10_REG;
26295 }
26296 else
26297 {
26298 tree fntype;
26299 unsigned int ccvt;
26300
26301 /* By default in 32-bit mode we use ECX to pass the static chain. */
26302 regno = CX_REG;
26303
26304 fntype = TREE_TYPE (fndecl);
26305 ccvt = ix86_get_callcvt (fntype);
26306 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
26307 {
26308 /* Fastcall functions use ecx/edx for arguments, which leaves
26309 us with EAX for the static chain.
26310 Thiscall functions use ecx for arguments, which also
26311 leaves us with EAX for the static chain. */
26312 regno = AX_REG;
26313 }
26314 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
26315 {
26316 /* Thiscall functions use ecx for arguments, which leaves
26317 us with EAX and EDX for the static chain.
26318 We use EAX for ABI compatibility. */
26319 regno = AX_REG;
26320 }
26321 else if (ix86_function_regparm (fntype, fndecl) == 3)
26322 {
26323 /* For regparm 3, we have no free call-clobbered registers in
26324 which to store the static chain. In order to implement this,
26325 we have the trampoline push the static chain to the stack.
26326 However, we can't push a value below the return address when
26327 we call the nested function directly, so we have to use an
26328 alternate entry point. For this we use ESI, and have the
26329 alternate entry point push ESI, so that things appear the
26330 same once we're executing the nested function. */
26331 if (incoming_p)
26332 {
26333 if (fndecl == current_function_decl)
26334 ix86_static_chain_on_stack = true;
26335 return gen_frame_mem (SImode,
26336 plus_constant (Pmode,
26337 arg_pointer_rtx, -8));
26338 }
26339 regno = SI_REG;
26340 }
26341 }
26342
26343 return gen_rtx_REG (Pmode, regno);
26344 }
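/* Summary of the choices above (illustrative): 64-bit targets always use
   R10; plain 32-bit functions use ECX; fastcall and thiscall functions use
   EAX because ECX (and for fastcall also EDX) already carry arguments; and
   regparm(3) functions have no free call-clobbered register, so the chain
   is pushed on the stack by the trampoline and recovered through an
   alternate entry point that pushes ESI.  */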
26345
26346 /* Emit RTL insns to initialize the variable parts of a trampoline.
26347 FNDECL is the decl of the target address; M_TRAMP is a MEM for
26348 the trampoline, and CHAIN_VALUE is an RTX for the static chain
26349 to be passed to the target function. */
26350
26351 static void
26352 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
26353 {
26354 rtx mem, fnaddr;
26355 int opcode;
26356 int offset = 0;
26357
26358 fnaddr = XEXP (DECL_RTL (fndecl), 0);
26359
26360 if (TARGET_64BIT)
26361 {
26362 int size;
26363
26364 /* Load the function address to r11. Try to load address using
26365 the shorter movl instead of movabs. We may want to support
26366 movq for kernel mode, but the kernel does not use trampolines at
26367 the moment. FNADDR is a 32-bit address and may not be in
26368 DImode when ptr_mode == SImode. Always use movl in this
26369 case. */
26370 if (ptr_mode == SImode
26371 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
26372 {
26373 fnaddr = copy_addr_to_reg (fnaddr);
26374
26375 mem = adjust_address (m_tramp, HImode, offset);
26376 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
26377
26378 mem = adjust_address (m_tramp, SImode, offset + 2);
26379 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
26380 offset += 6;
26381 }
26382 else
26383 {
26384 mem = adjust_address (m_tramp, HImode, offset);
26385 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
26386
26387 mem = adjust_address (m_tramp, DImode, offset + 2);
26388 emit_move_insn (mem, fnaddr);
26389 offset += 10;
26390 }
26391
26392 /* Load static chain using movabs to r10. Use the shorter movl
26393 instead of movabs when ptr_mode == SImode. */
26394 if (ptr_mode == SImode)
26395 {
26396 opcode = 0xba41;
26397 size = 6;
26398 }
26399 else
26400 {
26401 opcode = 0xba49;
26402 size = 10;
26403 }
26404
26405 mem = adjust_address (m_tramp, HImode, offset);
26406 emit_move_insn (mem, gen_int_mode (opcode, HImode));
26407
26408 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
26409 emit_move_insn (mem, chain_value);
26410 offset += size;
26411
26412 /* Jump to r11; the last (unused) byte is a nop, only there to
26413 pad the write out to a single 32-bit store. */
26414 mem = adjust_address (m_tramp, SImode, offset);
26415 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
26416 offset += 4;
26417 }
26418 else
26419 {
26420 rtx disp, chain;
26421
26422 /* Depending on the static chain location, either load a register
26423 with a constant, or push the constant to the stack. All of the
26424 instructions are the same size. */
26425 chain = ix86_static_chain (fndecl, true);
26426 if (REG_P (chain))
26427 {
26428 switch (REGNO (chain))
26429 {
26430 case AX_REG:
26431 opcode = 0xb8; break;
26432 case CX_REG:
26433 opcode = 0xb9; break;
26434 default:
26435 gcc_unreachable ();
26436 }
26437 }
26438 else
26439 opcode = 0x68;
26440
26441 mem = adjust_address (m_tramp, QImode, offset);
26442 emit_move_insn (mem, gen_int_mode (opcode, QImode));
26443
26444 mem = adjust_address (m_tramp, SImode, offset + 1);
26445 emit_move_insn (mem, chain_value);
26446 offset += 5;
26447
26448 mem = adjust_address (m_tramp, QImode, offset);
26449 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
26450
26451 mem = adjust_address (m_tramp, SImode, offset + 1);
26452
26453 /* Compute offset from the end of the jmp to the target function.
26454 In the case in which the trampoline stores the static chain on
26455 the stack, we need to skip the first insn which pushes the
26456 (call-saved) register static chain; this push is 1 byte. */
26457 offset += 5;
26458 disp = expand_binop (SImode, sub_optab, fnaddr,
26459 plus_constant (Pmode, XEXP (m_tramp, 0),
26460 offset - (MEM_P (chain) ? 1 : 0)),
26461 NULL_RTX, 1, OPTAB_DIRECT);
26462 emit_move_insn (mem, disp);
26463 }
26464
26465 gcc_assert (offset <= TRAMPOLINE_SIZE);
26466
26467 #ifdef HAVE_ENABLE_EXECUTE_STACK
26468 #ifdef CHECK_EXECUTE_STACK_ENABLED
26469 if (CHECK_EXECUTE_STACK_ENABLED)
26470 #endif
26471 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
26472 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
26473 #endif
26474 }
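/* Byte layout emitted above for the common 64-bit case (a sketch, assuming
   ptr_mode == DImode and a function address that needs the full movabs):
     49 bb <8-byte fnaddr>    movabs $fnaddr, %r11
     49 ba <8-byte chain>     movabs $chain,  %r10
     49 ff e3 90              jmp *%r11 ; nop
   for a total of 24 bytes.  */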
26475 \f
26476 /* The following file contains several enumerations and data structures
26477 built from the definitions in i386-builtin-types.def. */
26478
26479 #include "i386-builtin-types.inc"
26480
26481 /* Table for the ix86 builtin non-function types. */
26482 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
26483
26484 /* Retrieve an element from the above table, building some of
26485 the types lazily. */
26486
26487 static tree
26488 ix86_get_builtin_type (enum ix86_builtin_type tcode)
26489 {
26490 unsigned int index;
26491 tree type, itype;
26492
26493 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
26494
26495 type = ix86_builtin_type_tab[(int) tcode];
26496 if (type != NULL)
26497 return type;
26498
26499 gcc_assert (tcode > IX86_BT_LAST_PRIM);
26500 if (tcode <= IX86_BT_LAST_VECT)
26501 {
26502 enum machine_mode mode;
26503
26504 index = tcode - IX86_BT_LAST_PRIM - 1;
26505 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
26506 mode = ix86_builtin_type_vect_mode[index];
26507
26508 type = build_vector_type_for_mode (itype, mode);
26509 }
26510 else
26511 {
26512 int quals;
26513
26514 index = tcode - IX86_BT_LAST_VECT - 1;
26515 if (tcode <= IX86_BT_LAST_PTR)
26516 quals = TYPE_UNQUALIFIED;
26517 else
26518 quals = TYPE_QUAL_CONST;
26519
26520 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
26521 if (quals != TYPE_UNQUALIFIED)
26522 itype = build_qualified_type (itype, quals);
26523
26524 type = build_pointer_type (itype);
26525 }
26526
26527 ix86_builtin_type_tab[(int) tcode] = type;
26528 return type;
26529 }
26530
26531 /* Table for the ix86 builtin function types. */
26532 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
26533
26534 /* Retrieve an element from the above table, building some of
26535 the types lazily. */
26536
26537 static tree
26538 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
26539 {
26540 tree type;
26541
26542 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
26543
26544 type = ix86_builtin_func_type_tab[(int) tcode];
26545 if (type != NULL)
26546 return type;
26547
26548 if (tcode <= IX86_BT_LAST_FUNC)
26549 {
26550 unsigned start = ix86_builtin_func_start[(int) tcode];
26551 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
26552 tree rtype, atype, args = void_list_node;
26553 unsigned i;
26554
26555 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
26556 for (i = after - 1; i > start; --i)
26557 {
26558 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
26559 args = tree_cons (NULL, atype, args);
26560 }
26561
26562 type = build_function_type (rtype, args);
26563 }
26564 else
26565 {
26566 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
26567 enum ix86_builtin_func_type icode;
26568
26569 icode = ix86_builtin_func_alias_base[index];
26570 type = ix86_get_builtin_func_type (icode);
26571 }
26572
26573 ix86_builtin_func_type_tab[(int) tcode] = type;
26574 return type;
26575 }
26576
26577
26578 /* Codes for all the SSE/MMX builtins. */
26579 enum ix86_builtins
26580 {
26581 IX86_BUILTIN_ADDPS,
26582 IX86_BUILTIN_ADDSS,
26583 IX86_BUILTIN_DIVPS,
26584 IX86_BUILTIN_DIVSS,
26585 IX86_BUILTIN_MULPS,
26586 IX86_BUILTIN_MULSS,
26587 IX86_BUILTIN_SUBPS,
26588 IX86_BUILTIN_SUBSS,
26589
26590 IX86_BUILTIN_CMPEQPS,
26591 IX86_BUILTIN_CMPLTPS,
26592 IX86_BUILTIN_CMPLEPS,
26593 IX86_BUILTIN_CMPGTPS,
26594 IX86_BUILTIN_CMPGEPS,
26595 IX86_BUILTIN_CMPNEQPS,
26596 IX86_BUILTIN_CMPNLTPS,
26597 IX86_BUILTIN_CMPNLEPS,
26598 IX86_BUILTIN_CMPNGTPS,
26599 IX86_BUILTIN_CMPNGEPS,
26600 IX86_BUILTIN_CMPORDPS,
26601 IX86_BUILTIN_CMPUNORDPS,
26602 IX86_BUILTIN_CMPEQSS,
26603 IX86_BUILTIN_CMPLTSS,
26604 IX86_BUILTIN_CMPLESS,
26605 IX86_BUILTIN_CMPNEQSS,
26606 IX86_BUILTIN_CMPNLTSS,
26607 IX86_BUILTIN_CMPNLESS,
26608 IX86_BUILTIN_CMPORDSS,
26609 IX86_BUILTIN_CMPUNORDSS,
26610
26611 IX86_BUILTIN_COMIEQSS,
26612 IX86_BUILTIN_COMILTSS,
26613 IX86_BUILTIN_COMILESS,
26614 IX86_BUILTIN_COMIGTSS,
26615 IX86_BUILTIN_COMIGESS,
26616 IX86_BUILTIN_COMINEQSS,
26617 IX86_BUILTIN_UCOMIEQSS,
26618 IX86_BUILTIN_UCOMILTSS,
26619 IX86_BUILTIN_UCOMILESS,
26620 IX86_BUILTIN_UCOMIGTSS,
26621 IX86_BUILTIN_UCOMIGESS,
26622 IX86_BUILTIN_UCOMINEQSS,
26623
26624 IX86_BUILTIN_CVTPI2PS,
26625 IX86_BUILTIN_CVTPS2PI,
26626 IX86_BUILTIN_CVTSI2SS,
26627 IX86_BUILTIN_CVTSI642SS,
26628 IX86_BUILTIN_CVTSS2SI,
26629 IX86_BUILTIN_CVTSS2SI64,
26630 IX86_BUILTIN_CVTTPS2PI,
26631 IX86_BUILTIN_CVTTSS2SI,
26632 IX86_BUILTIN_CVTTSS2SI64,
26633
26634 IX86_BUILTIN_MAXPS,
26635 IX86_BUILTIN_MAXSS,
26636 IX86_BUILTIN_MINPS,
26637 IX86_BUILTIN_MINSS,
26638
26639 IX86_BUILTIN_LOADUPS,
26640 IX86_BUILTIN_STOREUPS,
26641 IX86_BUILTIN_MOVSS,
26642
26643 IX86_BUILTIN_MOVHLPS,
26644 IX86_BUILTIN_MOVLHPS,
26645 IX86_BUILTIN_LOADHPS,
26646 IX86_BUILTIN_LOADLPS,
26647 IX86_BUILTIN_STOREHPS,
26648 IX86_BUILTIN_STORELPS,
26649
26650 IX86_BUILTIN_MASKMOVQ,
26651 IX86_BUILTIN_MOVMSKPS,
26652 IX86_BUILTIN_PMOVMSKB,
26653
26654 IX86_BUILTIN_MOVNTPS,
26655 IX86_BUILTIN_MOVNTQ,
26656
26657 IX86_BUILTIN_LOADDQU,
26658 IX86_BUILTIN_STOREDQU,
26659
26660 IX86_BUILTIN_PACKSSWB,
26661 IX86_BUILTIN_PACKSSDW,
26662 IX86_BUILTIN_PACKUSWB,
26663
26664 IX86_BUILTIN_PADDB,
26665 IX86_BUILTIN_PADDW,
26666 IX86_BUILTIN_PADDD,
26667 IX86_BUILTIN_PADDQ,
26668 IX86_BUILTIN_PADDSB,
26669 IX86_BUILTIN_PADDSW,
26670 IX86_BUILTIN_PADDUSB,
26671 IX86_BUILTIN_PADDUSW,
26672 IX86_BUILTIN_PSUBB,
26673 IX86_BUILTIN_PSUBW,
26674 IX86_BUILTIN_PSUBD,
26675 IX86_BUILTIN_PSUBQ,
26676 IX86_BUILTIN_PSUBSB,
26677 IX86_BUILTIN_PSUBSW,
26678 IX86_BUILTIN_PSUBUSB,
26679 IX86_BUILTIN_PSUBUSW,
26680
26681 IX86_BUILTIN_PAND,
26682 IX86_BUILTIN_PANDN,
26683 IX86_BUILTIN_POR,
26684 IX86_BUILTIN_PXOR,
26685
26686 IX86_BUILTIN_PAVGB,
26687 IX86_BUILTIN_PAVGW,
26688
26689 IX86_BUILTIN_PCMPEQB,
26690 IX86_BUILTIN_PCMPEQW,
26691 IX86_BUILTIN_PCMPEQD,
26692 IX86_BUILTIN_PCMPGTB,
26693 IX86_BUILTIN_PCMPGTW,
26694 IX86_BUILTIN_PCMPGTD,
26695
26696 IX86_BUILTIN_PMADDWD,
26697
26698 IX86_BUILTIN_PMAXSW,
26699 IX86_BUILTIN_PMAXUB,
26700 IX86_BUILTIN_PMINSW,
26701 IX86_BUILTIN_PMINUB,
26702
26703 IX86_BUILTIN_PMULHUW,
26704 IX86_BUILTIN_PMULHW,
26705 IX86_BUILTIN_PMULLW,
26706
26707 IX86_BUILTIN_PSADBW,
26708 IX86_BUILTIN_PSHUFW,
26709
26710 IX86_BUILTIN_PSLLW,
26711 IX86_BUILTIN_PSLLD,
26712 IX86_BUILTIN_PSLLQ,
26713 IX86_BUILTIN_PSRAW,
26714 IX86_BUILTIN_PSRAD,
26715 IX86_BUILTIN_PSRLW,
26716 IX86_BUILTIN_PSRLD,
26717 IX86_BUILTIN_PSRLQ,
26718 IX86_BUILTIN_PSLLWI,
26719 IX86_BUILTIN_PSLLDI,
26720 IX86_BUILTIN_PSLLQI,
26721 IX86_BUILTIN_PSRAWI,
26722 IX86_BUILTIN_PSRADI,
26723 IX86_BUILTIN_PSRLWI,
26724 IX86_BUILTIN_PSRLDI,
26725 IX86_BUILTIN_PSRLQI,
26726
26727 IX86_BUILTIN_PUNPCKHBW,
26728 IX86_BUILTIN_PUNPCKHWD,
26729 IX86_BUILTIN_PUNPCKHDQ,
26730 IX86_BUILTIN_PUNPCKLBW,
26731 IX86_BUILTIN_PUNPCKLWD,
26732 IX86_BUILTIN_PUNPCKLDQ,
26733
26734 IX86_BUILTIN_SHUFPS,
26735
26736 IX86_BUILTIN_RCPPS,
26737 IX86_BUILTIN_RCPSS,
26738 IX86_BUILTIN_RSQRTPS,
26739 IX86_BUILTIN_RSQRTPS_NR,
26740 IX86_BUILTIN_RSQRTSS,
26741 IX86_BUILTIN_RSQRTF,
26742 IX86_BUILTIN_SQRTPS,
26743 IX86_BUILTIN_SQRTPS_NR,
26744 IX86_BUILTIN_SQRTSS,
26745
26746 IX86_BUILTIN_UNPCKHPS,
26747 IX86_BUILTIN_UNPCKLPS,
26748
26749 IX86_BUILTIN_ANDPS,
26750 IX86_BUILTIN_ANDNPS,
26751 IX86_BUILTIN_ORPS,
26752 IX86_BUILTIN_XORPS,
26753
26754 IX86_BUILTIN_EMMS,
26755 IX86_BUILTIN_LDMXCSR,
26756 IX86_BUILTIN_STMXCSR,
26757 IX86_BUILTIN_SFENCE,
26758
26759 IX86_BUILTIN_FXSAVE,
26760 IX86_BUILTIN_FXRSTOR,
26761 IX86_BUILTIN_FXSAVE64,
26762 IX86_BUILTIN_FXRSTOR64,
26763
26764 IX86_BUILTIN_XSAVE,
26765 IX86_BUILTIN_XRSTOR,
26766 IX86_BUILTIN_XSAVE64,
26767 IX86_BUILTIN_XRSTOR64,
26768
26769 IX86_BUILTIN_XSAVEOPT,
26770 IX86_BUILTIN_XSAVEOPT64,
26771
26772 /* 3DNow! Original */
26773 IX86_BUILTIN_FEMMS,
26774 IX86_BUILTIN_PAVGUSB,
26775 IX86_BUILTIN_PF2ID,
26776 IX86_BUILTIN_PFACC,
26777 IX86_BUILTIN_PFADD,
26778 IX86_BUILTIN_PFCMPEQ,
26779 IX86_BUILTIN_PFCMPGE,
26780 IX86_BUILTIN_PFCMPGT,
26781 IX86_BUILTIN_PFMAX,
26782 IX86_BUILTIN_PFMIN,
26783 IX86_BUILTIN_PFMUL,
26784 IX86_BUILTIN_PFRCP,
26785 IX86_BUILTIN_PFRCPIT1,
26786 IX86_BUILTIN_PFRCPIT2,
26787 IX86_BUILTIN_PFRSQIT1,
26788 IX86_BUILTIN_PFRSQRT,
26789 IX86_BUILTIN_PFSUB,
26790 IX86_BUILTIN_PFSUBR,
26791 IX86_BUILTIN_PI2FD,
26792 IX86_BUILTIN_PMULHRW,
26793
26794 /* 3DNow! Athlon Extensions */
26795 IX86_BUILTIN_PF2IW,
26796 IX86_BUILTIN_PFNACC,
26797 IX86_BUILTIN_PFPNACC,
26798 IX86_BUILTIN_PI2FW,
26799 IX86_BUILTIN_PSWAPDSI,
26800 IX86_BUILTIN_PSWAPDSF,
26801
26802 /* SSE2 */
26803 IX86_BUILTIN_ADDPD,
26804 IX86_BUILTIN_ADDSD,
26805 IX86_BUILTIN_DIVPD,
26806 IX86_BUILTIN_DIVSD,
26807 IX86_BUILTIN_MULPD,
26808 IX86_BUILTIN_MULSD,
26809 IX86_BUILTIN_SUBPD,
26810 IX86_BUILTIN_SUBSD,
26811
26812 IX86_BUILTIN_CMPEQPD,
26813 IX86_BUILTIN_CMPLTPD,
26814 IX86_BUILTIN_CMPLEPD,
26815 IX86_BUILTIN_CMPGTPD,
26816 IX86_BUILTIN_CMPGEPD,
26817 IX86_BUILTIN_CMPNEQPD,
26818 IX86_BUILTIN_CMPNLTPD,
26819 IX86_BUILTIN_CMPNLEPD,
26820 IX86_BUILTIN_CMPNGTPD,
26821 IX86_BUILTIN_CMPNGEPD,
26822 IX86_BUILTIN_CMPORDPD,
26823 IX86_BUILTIN_CMPUNORDPD,
26824 IX86_BUILTIN_CMPEQSD,
26825 IX86_BUILTIN_CMPLTSD,
26826 IX86_BUILTIN_CMPLESD,
26827 IX86_BUILTIN_CMPNEQSD,
26828 IX86_BUILTIN_CMPNLTSD,
26829 IX86_BUILTIN_CMPNLESD,
26830 IX86_BUILTIN_CMPORDSD,
26831 IX86_BUILTIN_CMPUNORDSD,
26832
26833 IX86_BUILTIN_COMIEQSD,
26834 IX86_BUILTIN_COMILTSD,
26835 IX86_BUILTIN_COMILESD,
26836 IX86_BUILTIN_COMIGTSD,
26837 IX86_BUILTIN_COMIGESD,
26838 IX86_BUILTIN_COMINEQSD,
26839 IX86_BUILTIN_UCOMIEQSD,
26840 IX86_BUILTIN_UCOMILTSD,
26841 IX86_BUILTIN_UCOMILESD,
26842 IX86_BUILTIN_UCOMIGTSD,
26843 IX86_BUILTIN_UCOMIGESD,
26844 IX86_BUILTIN_UCOMINEQSD,
26845
26846 IX86_BUILTIN_MAXPD,
26847 IX86_BUILTIN_MAXSD,
26848 IX86_BUILTIN_MINPD,
26849 IX86_BUILTIN_MINSD,
26850
26851 IX86_BUILTIN_ANDPD,
26852 IX86_BUILTIN_ANDNPD,
26853 IX86_BUILTIN_ORPD,
26854 IX86_BUILTIN_XORPD,
26855
26856 IX86_BUILTIN_SQRTPD,
26857 IX86_BUILTIN_SQRTSD,
26858
26859 IX86_BUILTIN_UNPCKHPD,
26860 IX86_BUILTIN_UNPCKLPD,
26861
26862 IX86_BUILTIN_SHUFPD,
26863
26864 IX86_BUILTIN_LOADUPD,
26865 IX86_BUILTIN_STOREUPD,
26866 IX86_BUILTIN_MOVSD,
26867
26868 IX86_BUILTIN_LOADHPD,
26869 IX86_BUILTIN_LOADLPD,
26870
26871 IX86_BUILTIN_CVTDQ2PD,
26872 IX86_BUILTIN_CVTDQ2PS,
26873
26874 IX86_BUILTIN_CVTPD2DQ,
26875 IX86_BUILTIN_CVTPD2PI,
26876 IX86_BUILTIN_CVTPD2PS,
26877 IX86_BUILTIN_CVTTPD2DQ,
26878 IX86_BUILTIN_CVTTPD2PI,
26879
26880 IX86_BUILTIN_CVTPI2PD,
26881 IX86_BUILTIN_CVTSI2SD,
26882 IX86_BUILTIN_CVTSI642SD,
26883
26884 IX86_BUILTIN_CVTSD2SI,
26885 IX86_BUILTIN_CVTSD2SI64,
26886 IX86_BUILTIN_CVTSD2SS,
26887 IX86_BUILTIN_CVTSS2SD,
26888 IX86_BUILTIN_CVTTSD2SI,
26889 IX86_BUILTIN_CVTTSD2SI64,
26890
26891 IX86_BUILTIN_CVTPS2DQ,
26892 IX86_BUILTIN_CVTPS2PD,
26893 IX86_BUILTIN_CVTTPS2DQ,
26894
26895 IX86_BUILTIN_MOVNTI,
26896 IX86_BUILTIN_MOVNTI64,
26897 IX86_BUILTIN_MOVNTPD,
26898 IX86_BUILTIN_MOVNTDQ,
26899
26900 IX86_BUILTIN_MOVQ128,
26901
26902 /* SSE2 MMX */
26903 IX86_BUILTIN_MASKMOVDQU,
26904 IX86_BUILTIN_MOVMSKPD,
26905 IX86_BUILTIN_PMOVMSKB128,
26906
26907 IX86_BUILTIN_PACKSSWB128,
26908 IX86_BUILTIN_PACKSSDW128,
26909 IX86_BUILTIN_PACKUSWB128,
26910
26911 IX86_BUILTIN_PADDB128,
26912 IX86_BUILTIN_PADDW128,
26913 IX86_BUILTIN_PADDD128,
26914 IX86_BUILTIN_PADDQ128,
26915 IX86_BUILTIN_PADDSB128,
26916 IX86_BUILTIN_PADDSW128,
26917 IX86_BUILTIN_PADDUSB128,
26918 IX86_BUILTIN_PADDUSW128,
26919 IX86_BUILTIN_PSUBB128,
26920 IX86_BUILTIN_PSUBW128,
26921 IX86_BUILTIN_PSUBD128,
26922 IX86_BUILTIN_PSUBQ128,
26923 IX86_BUILTIN_PSUBSB128,
26924 IX86_BUILTIN_PSUBSW128,
26925 IX86_BUILTIN_PSUBUSB128,
26926 IX86_BUILTIN_PSUBUSW128,
26927
26928 IX86_BUILTIN_PAND128,
26929 IX86_BUILTIN_PANDN128,
26930 IX86_BUILTIN_POR128,
26931 IX86_BUILTIN_PXOR128,
26932
26933 IX86_BUILTIN_PAVGB128,
26934 IX86_BUILTIN_PAVGW128,
26935
26936 IX86_BUILTIN_PCMPEQB128,
26937 IX86_BUILTIN_PCMPEQW128,
26938 IX86_BUILTIN_PCMPEQD128,
26939 IX86_BUILTIN_PCMPGTB128,
26940 IX86_BUILTIN_PCMPGTW128,
26941 IX86_BUILTIN_PCMPGTD128,
26942
26943 IX86_BUILTIN_PMADDWD128,
26944
26945 IX86_BUILTIN_PMAXSW128,
26946 IX86_BUILTIN_PMAXUB128,
26947 IX86_BUILTIN_PMINSW128,
26948 IX86_BUILTIN_PMINUB128,
26949
26950 IX86_BUILTIN_PMULUDQ,
26951 IX86_BUILTIN_PMULUDQ128,
26952 IX86_BUILTIN_PMULHUW128,
26953 IX86_BUILTIN_PMULHW128,
26954 IX86_BUILTIN_PMULLW128,
26955
26956 IX86_BUILTIN_PSADBW128,
26957 IX86_BUILTIN_PSHUFHW,
26958 IX86_BUILTIN_PSHUFLW,
26959 IX86_BUILTIN_PSHUFD,
26960
26961 IX86_BUILTIN_PSLLDQI128,
26962 IX86_BUILTIN_PSLLWI128,
26963 IX86_BUILTIN_PSLLDI128,
26964 IX86_BUILTIN_PSLLQI128,
26965 IX86_BUILTIN_PSRAWI128,
26966 IX86_BUILTIN_PSRADI128,
26967 IX86_BUILTIN_PSRLDQI128,
26968 IX86_BUILTIN_PSRLWI128,
26969 IX86_BUILTIN_PSRLDI128,
26970 IX86_BUILTIN_PSRLQI128,
26971
26972 IX86_BUILTIN_PSLLDQ128,
26973 IX86_BUILTIN_PSLLW128,
26974 IX86_BUILTIN_PSLLD128,
26975 IX86_BUILTIN_PSLLQ128,
26976 IX86_BUILTIN_PSRAW128,
26977 IX86_BUILTIN_PSRAD128,
26978 IX86_BUILTIN_PSRLW128,
26979 IX86_BUILTIN_PSRLD128,
26980 IX86_BUILTIN_PSRLQ128,
26981
26982 IX86_BUILTIN_PUNPCKHBW128,
26983 IX86_BUILTIN_PUNPCKHWD128,
26984 IX86_BUILTIN_PUNPCKHDQ128,
26985 IX86_BUILTIN_PUNPCKHQDQ128,
26986 IX86_BUILTIN_PUNPCKLBW128,
26987 IX86_BUILTIN_PUNPCKLWD128,
26988 IX86_BUILTIN_PUNPCKLDQ128,
26989 IX86_BUILTIN_PUNPCKLQDQ128,
26990
26991 IX86_BUILTIN_CLFLUSH,
26992 IX86_BUILTIN_MFENCE,
26993 IX86_BUILTIN_LFENCE,
26994 IX86_BUILTIN_PAUSE,
26995
26996 IX86_BUILTIN_FNSTENV,
26997 IX86_BUILTIN_FLDENV,
26998 IX86_BUILTIN_FNSTSW,
26999 IX86_BUILTIN_FNCLEX,
27000
27001 IX86_BUILTIN_BSRSI,
27002 IX86_BUILTIN_BSRDI,
27003 IX86_BUILTIN_RDPMC,
27004 IX86_BUILTIN_RDTSC,
27005 IX86_BUILTIN_RDTSCP,
27006 IX86_BUILTIN_ROLQI,
27007 IX86_BUILTIN_ROLHI,
27008 IX86_BUILTIN_RORQI,
27009 IX86_BUILTIN_RORHI,
27010
27011 /* SSE3. */
27012 IX86_BUILTIN_ADDSUBPS,
27013 IX86_BUILTIN_HADDPS,
27014 IX86_BUILTIN_HSUBPS,
27015 IX86_BUILTIN_MOVSHDUP,
27016 IX86_BUILTIN_MOVSLDUP,
27017 IX86_BUILTIN_ADDSUBPD,
27018 IX86_BUILTIN_HADDPD,
27019 IX86_BUILTIN_HSUBPD,
27020 IX86_BUILTIN_LDDQU,
27021
27022 IX86_BUILTIN_MONITOR,
27023 IX86_BUILTIN_MWAIT,
27024
27025 /* SSSE3. */
27026 IX86_BUILTIN_PHADDW,
27027 IX86_BUILTIN_PHADDD,
27028 IX86_BUILTIN_PHADDSW,
27029 IX86_BUILTIN_PHSUBW,
27030 IX86_BUILTIN_PHSUBD,
27031 IX86_BUILTIN_PHSUBSW,
27032 IX86_BUILTIN_PMADDUBSW,
27033 IX86_BUILTIN_PMULHRSW,
27034 IX86_BUILTIN_PSHUFB,
27035 IX86_BUILTIN_PSIGNB,
27036 IX86_BUILTIN_PSIGNW,
27037 IX86_BUILTIN_PSIGND,
27038 IX86_BUILTIN_PALIGNR,
27039 IX86_BUILTIN_PABSB,
27040 IX86_BUILTIN_PABSW,
27041 IX86_BUILTIN_PABSD,
27042
27043 IX86_BUILTIN_PHADDW128,
27044 IX86_BUILTIN_PHADDD128,
27045 IX86_BUILTIN_PHADDSW128,
27046 IX86_BUILTIN_PHSUBW128,
27047 IX86_BUILTIN_PHSUBD128,
27048 IX86_BUILTIN_PHSUBSW128,
27049 IX86_BUILTIN_PMADDUBSW128,
27050 IX86_BUILTIN_PMULHRSW128,
27051 IX86_BUILTIN_PSHUFB128,
27052 IX86_BUILTIN_PSIGNB128,
27053 IX86_BUILTIN_PSIGNW128,
27054 IX86_BUILTIN_PSIGND128,
27055 IX86_BUILTIN_PALIGNR128,
27056 IX86_BUILTIN_PABSB128,
27057 IX86_BUILTIN_PABSW128,
27058 IX86_BUILTIN_PABSD128,
27059
27060 /* AMDFAM10 - SSE4A New Instructions. */
27061 IX86_BUILTIN_MOVNTSD,
27062 IX86_BUILTIN_MOVNTSS,
27063 IX86_BUILTIN_EXTRQI,
27064 IX86_BUILTIN_EXTRQ,
27065 IX86_BUILTIN_INSERTQI,
27066 IX86_BUILTIN_INSERTQ,
27067
27068 /* SSE4.1. */
27069 IX86_BUILTIN_BLENDPD,
27070 IX86_BUILTIN_BLENDPS,
27071 IX86_BUILTIN_BLENDVPD,
27072 IX86_BUILTIN_BLENDVPS,
27073 IX86_BUILTIN_PBLENDVB128,
27074 IX86_BUILTIN_PBLENDW128,
27075
27076 IX86_BUILTIN_DPPD,
27077 IX86_BUILTIN_DPPS,
27078
27079 IX86_BUILTIN_INSERTPS128,
27080
27081 IX86_BUILTIN_MOVNTDQA,
27082 IX86_BUILTIN_MPSADBW128,
27083 IX86_BUILTIN_PACKUSDW128,
27084 IX86_BUILTIN_PCMPEQQ,
27085 IX86_BUILTIN_PHMINPOSUW128,
27086
27087 IX86_BUILTIN_PMAXSB128,
27088 IX86_BUILTIN_PMAXSD128,
27089 IX86_BUILTIN_PMAXUD128,
27090 IX86_BUILTIN_PMAXUW128,
27091
27092 IX86_BUILTIN_PMINSB128,
27093 IX86_BUILTIN_PMINSD128,
27094 IX86_BUILTIN_PMINUD128,
27095 IX86_BUILTIN_PMINUW128,
27096
27097 IX86_BUILTIN_PMOVSXBW128,
27098 IX86_BUILTIN_PMOVSXBD128,
27099 IX86_BUILTIN_PMOVSXBQ128,
27100 IX86_BUILTIN_PMOVSXWD128,
27101 IX86_BUILTIN_PMOVSXWQ128,
27102 IX86_BUILTIN_PMOVSXDQ128,
27103
27104 IX86_BUILTIN_PMOVZXBW128,
27105 IX86_BUILTIN_PMOVZXBD128,
27106 IX86_BUILTIN_PMOVZXBQ128,
27107 IX86_BUILTIN_PMOVZXWD128,
27108 IX86_BUILTIN_PMOVZXWQ128,
27109 IX86_BUILTIN_PMOVZXDQ128,
27110
27111 IX86_BUILTIN_PMULDQ128,
27112 IX86_BUILTIN_PMULLD128,
27113
27114 IX86_BUILTIN_ROUNDSD,
27115 IX86_BUILTIN_ROUNDSS,
27116
27117 IX86_BUILTIN_ROUNDPD,
27118 IX86_BUILTIN_ROUNDPS,
27119
27120 IX86_BUILTIN_FLOORPD,
27121 IX86_BUILTIN_CEILPD,
27122 IX86_BUILTIN_TRUNCPD,
27123 IX86_BUILTIN_RINTPD,
27124 IX86_BUILTIN_ROUNDPD_AZ,
27125
27126 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
27127 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
27128 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
27129
27130 IX86_BUILTIN_FLOORPS,
27131 IX86_BUILTIN_CEILPS,
27132 IX86_BUILTIN_TRUNCPS,
27133 IX86_BUILTIN_RINTPS,
27134 IX86_BUILTIN_ROUNDPS_AZ,
27135
27136 IX86_BUILTIN_FLOORPS_SFIX,
27137 IX86_BUILTIN_CEILPS_SFIX,
27138 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
27139
27140 IX86_BUILTIN_PTESTZ,
27141 IX86_BUILTIN_PTESTC,
27142 IX86_BUILTIN_PTESTNZC,
27143
27144 IX86_BUILTIN_VEC_INIT_V2SI,
27145 IX86_BUILTIN_VEC_INIT_V4HI,
27146 IX86_BUILTIN_VEC_INIT_V8QI,
27147 IX86_BUILTIN_VEC_EXT_V2DF,
27148 IX86_BUILTIN_VEC_EXT_V2DI,
27149 IX86_BUILTIN_VEC_EXT_V4SF,
27150 IX86_BUILTIN_VEC_EXT_V4SI,
27151 IX86_BUILTIN_VEC_EXT_V8HI,
27152 IX86_BUILTIN_VEC_EXT_V2SI,
27153 IX86_BUILTIN_VEC_EXT_V4HI,
27154 IX86_BUILTIN_VEC_EXT_V16QI,
27155 IX86_BUILTIN_VEC_SET_V2DI,
27156 IX86_BUILTIN_VEC_SET_V4SF,
27157 IX86_BUILTIN_VEC_SET_V4SI,
27158 IX86_BUILTIN_VEC_SET_V8HI,
27159 IX86_BUILTIN_VEC_SET_V4HI,
27160 IX86_BUILTIN_VEC_SET_V16QI,
27161
27162 IX86_BUILTIN_VEC_PACK_SFIX,
27163 IX86_BUILTIN_VEC_PACK_SFIX256,
27164
27165 /* SSE4.2. */
27166 IX86_BUILTIN_CRC32QI,
27167 IX86_BUILTIN_CRC32HI,
27168 IX86_BUILTIN_CRC32SI,
27169 IX86_BUILTIN_CRC32DI,
27170
27171 IX86_BUILTIN_PCMPESTRI128,
27172 IX86_BUILTIN_PCMPESTRM128,
27173 IX86_BUILTIN_PCMPESTRA128,
27174 IX86_BUILTIN_PCMPESTRC128,
27175 IX86_BUILTIN_PCMPESTRO128,
27176 IX86_BUILTIN_PCMPESTRS128,
27177 IX86_BUILTIN_PCMPESTRZ128,
27178 IX86_BUILTIN_PCMPISTRI128,
27179 IX86_BUILTIN_PCMPISTRM128,
27180 IX86_BUILTIN_PCMPISTRA128,
27181 IX86_BUILTIN_PCMPISTRC128,
27182 IX86_BUILTIN_PCMPISTRO128,
27183 IX86_BUILTIN_PCMPISTRS128,
27184 IX86_BUILTIN_PCMPISTRZ128,
27185
27186 IX86_BUILTIN_PCMPGTQ,
27187
27188 /* AES instructions */
27189 IX86_BUILTIN_AESENC128,
27190 IX86_BUILTIN_AESENCLAST128,
27191 IX86_BUILTIN_AESDEC128,
27192 IX86_BUILTIN_AESDECLAST128,
27193 IX86_BUILTIN_AESIMC128,
27194 IX86_BUILTIN_AESKEYGENASSIST128,
27195
27196 /* PCLMUL instruction */
27197 IX86_BUILTIN_PCLMULQDQ128,
27198
27199 /* AVX */
27200 IX86_BUILTIN_ADDPD256,
27201 IX86_BUILTIN_ADDPS256,
27202 IX86_BUILTIN_ADDSUBPD256,
27203 IX86_BUILTIN_ADDSUBPS256,
27204 IX86_BUILTIN_ANDPD256,
27205 IX86_BUILTIN_ANDPS256,
27206 IX86_BUILTIN_ANDNPD256,
27207 IX86_BUILTIN_ANDNPS256,
27208 IX86_BUILTIN_BLENDPD256,
27209 IX86_BUILTIN_BLENDPS256,
27210 IX86_BUILTIN_BLENDVPD256,
27211 IX86_BUILTIN_BLENDVPS256,
27212 IX86_BUILTIN_DIVPD256,
27213 IX86_BUILTIN_DIVPS256,
27214 IX86_BUILTIN_DPPS256,
27215 IX86_BUILTIN_HADDPD256,
27216 IX86_BUILTIN_HADDPS256,
27217 IX86_BUILTIN_HSUBPD256,
27218 IX86_BUILTIN_HSUBPS256,
27219 IX86_BUILTIN_MAXPD256,
27220 IX86_BUILTIN_MAXPS256,
27221 IX86_BUILTIN_MINPD256,
27222 IX86_BUILTIN_MINPS256,
27223 IX86_BUILTIN_MULPD256,
27224 IX86_BUILTIN_MULPS256,
27225 IX86_BUILTIN_ORPD256,
27226 IX86_BUILTIN_ORPS256,
27227 IX86_BUILTIN_SHUFPD256,
27228 IX86_BUILTIN_SHUFPS256,
27229 IX86_BUILTIN_SUBPD256,
27230 IX86_BUILTIN_SUBPS256,
27231 IX86_BUILTIN_XORPD256,
27232 IX86_BUILTIN_XORPS256,
27233 IX86_BUILTIN_CMPSD,
27234 IX86_BUILTIN_CMPSS,
27235 IX86_BUILTIN_CMPPD,
27236 IX86_BUILTIN_CMPPS,
27237 IX86_BUILTIN_CMPPD256,
27238 IX86_BUILTIN_CMPPS256,
27239 IX86_BUILTIN_CVTDQ2PD256,
27240 IX86_BUILTIN_CVTDQ2PS256,
27241 IX86_BUILTIN_CVTPD2PS256,
27242 IX86_BUILTIN_CVTPS2DQ256,
27243 IX86_BUILTIN_CVTPS2PD256,
27244 IX86_BUILTIN_CVTTPD2DQ256,
27245 IX86_BUILTIN_CVTPD2DQ256,
27246 IX86_BUILTIN_CVTTPS2DQ256,
27247 IX86_BUILTIN_EXTRACTF128PD256,
27248 IX86_BUILTIN_EXTRACTF128PS256,
27249 IX86_BUILTIN_EXTRACTF128SI256,
27250 IX86_BUILTIN_VZEROALL,
27251 IX86_BUILTIN_VZEROUPPER,
27252 IX86_BUILTIN_VPERMILVARPD,
27253 IX86_BUILTIN_VPERMILVARPS,
27254 IX86_BUILTIN_VPERMILVARPD256,
27255 IX86_BUILTIN_VPERMILVARPS256,
27256 IX86_BUILTIN_VPERMILPD,
27257 IX86_BUILTIN_VPERMILPS,
27258 IX86_BUILTIN_VPERMILPD256,
27259 IX86_BUILTIN_VPERMILPS256,
27260 IX86_BUILTIN_VPERMIL2PD,
27261 IX86_BUILTIN_VPERMIL2PS,
27262 IX86_BUILTIN_VPERMIL2PD256,
27263 IX86_BUILTIN_VPERMIL2PS256,
27264 IX86_BUILTIN_VPERM2F128PD256,
27265 IX86_BUILTIN_VPERM2F128PS256,
27266 IX86_BUILTIN_VPERM2F128SI256,
27267 IX86_BUILTIN_VBROADCASTSS,
27268 IX86_BUILTIN_VBROADCASTSD256,
27269 IX86_BUILTIN_VBROADCASTSS256,
27270 IX86_BUILTIN_VBROADCASTPD256,
27271 IX86_BUILTIN_VBROADCASTPS256,
27272 IX86_BUILTIN_VINSERTF128PD256,
27273 IX86_BUILTIN_VINSERTF128PS256,
27274 IX86_BUILTIN_VINSERTF128SI256,
27275 IX86_BUILTIN_LOADUPD256,
27276 IX86_BUILTIN_LOADUPS256,
27277 IX86_BUILTIN_STOREUPD256,
27278 IX86_BUILTIN_STOREUPS256,
27279 IX86_BUILTIN_LDDQU256,
27280 IX86_BUILTIN_MOVNTDQ256,
27281 IX86_BUILTIN_MOVNTPD256,
27282 IX86_BUILTIN_MOVNTPS256,
27283 IX86_BUILTIN_LOADDQU256,
27284 IX86_BUILTIN_STOREDQU256,
27285 IX86_BUILTIN_MASKLOADPD,
27286 IX86_BUILTIN_MASKLOADPS,
27287 IX86_BUILTIN_MASKSTOREPD,
27288 IX86_BUILTIN_MASKSTOREPS,
27289 IX86_BUILTIN_MASKLOADPD256,
27290 IX86_BUILTIN_MASKLOADPS256,
27291 IX86_BUILTIN_MASKSTOREPD256,
27292 IX86_BUILTIN_MASKSTOREPS256,
27293 IX86_BUILTIN_MOVSHDUP256,
27294 IX86_BUILTIN_MOVSLDUP256,
27295 IX86_BUILTIN_MOVDDUP256,
27296
27297 IX86_BUILTIN_SQRTPD256,
27298 IX86_BUILTIN_SQRTPS256,
27299 IX86_BUILTIN_SQRTPS_NR256,
27300 IX86_BUILTIN_RSQRTPS256,
27301 IX86_BUILTIN_RSQRTPS_NR256,
27302
27303 IX86_BUILTIN_RCPPS256,
27304
27305 IX86_BUILTIN_ROUNDPD256,
27306 IX86_BUILTIN_ROUNDPS256,
27307
27308 IX86_BUILTIN_FLOORPD256,
27309 IX86_BUILTIN_CEILPD256,
27310 IX86_BUILTIN_TRUNCPD256,
27311 IX86_BUILTIN_RINTPD256,
27312 IX86_BUILTIN_ROUNDPD_AZ256,
27313
27314 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
27315 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
27316 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
27317
27318 IX86_BUILTIN_FLOORPS256,
27319 IX86_BUILTIN_CEILPS256,
27320 IX86_BUILTIN_TRUNCPS256,
27321 IX86_BUILTIN_RINTPS256,
27322 IX86_BUILTIN_ROUNDPS_AZ256,
27323
27324 IX86_BUILTIN_FLOORPS_SFIX256,
27325 IX86_BUILTIN_CEILPS_SFIX256,
27326 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
27327
27328 IX86_BUILTIN_UNPCKHPD256,
27329 IX86_BUILTIN_UNPCKLPD256,
27330 IX86_BUILTIN_UNPCKHPS256,
27331 IX86_BUILTIN_UNPCKLPS256,
27332
27333 IX86_BUILTIN_SI256_SI,
27334 IX86_BUILTIN_PS256_PS,
27335 IX86_BUILTIN_PD256_PD,
27336 IX86_BUILTIN_SI_SI256,
27337 IX86_BUILTIN_PS_PS256,
27338 IX86_BUILTIN_PD_PD256,
27339
27340 IX86_BUILTIN_VTESTZPD,
27341 IX86_BUILTIN_VTESTCPD,
27342 IX86_BUILTIN_VTESTNZCPD,
27343 IX86_BUILTIN_VTESTZPS,
27344 IX86_BUILTIN_VTESTCPS,
27345 IX86_BUILTIN_VTESTNZCPS,
27346 IX86_BUILTIN_VTESTZPD256,
27347 IX86_BUILTIN_VTESTCPD256,
27348 IX86_BUILTIN_VTESTNZCPD256,
27349 IX86_BUILTIN_VTESTZPS256,
27350 IX86_BUILTIN_VTESTCPS256,
27351 IX86_BUILTIN_VTESTNZCPS256,
27352 IX86_BUILTIN_PTESTZ256,
27353 IX86_BUILTIN_PTESTC256,
27354 IX86_BUILTIN_PTESTNZC256,
27355
27356 IX86_BUILTIN_MOVMSKPD256,
27357 IX86_BUILTIN_MOVMSKPS256,
27358
27359 /* AVX2 */
27360 IX86_BUILTIN_MPSADBW256,
27361 IX86_BUILTIN_PABSB256,
27362 IX86_BUILTIN_PABSW256,
27363 IX86_BUILTIN_PABSD256,
27364 IX86_BUILTIN_PACKSSDW256,
27365 IX86_BUILTIN_PACKSSWB256,
27366 IX86_BUILTIN_PACKUSDW256,
27367 IX86_BUILTIN_PACKUSWB256,
27368 IX86_BUILTIN_PADDB256,
27369 IX86_BUILTIN_PADDW256,
27370 IX86_BUILTIN_PADDD256,
27371 IX86_BUILTIN_PADDQ256,
27372 IX86_BUILTIN_PADDSB256,
27373 IX86_BUILTIN_PADDSW256,
27374 IX86_BUILTIN_PADDUSB256,
27375 IX86_BUILTIN_PADDUSW256,
27376 IX86_BUILTIN_PALIGNR256,
27377 IX86_BUILTIN_AND256I,
27378 IX86_BUILTIN_ANDNOT256I,
27379 IX86_BUILTIN_PAVGB256,
27380 IX86_BUILTIN_PAVGW256,
27381 IX86_BUILTIN_PBLENDVB256,
27382 IX86_BUILTIN_PBLENDVW256,
27383 IX86_BUILTIN_PCMPEQB256,
27384 IX86_BUILTIN_PCMPEQW256,
27385 IX86_BUILTIN_PCMPEQD256,
27386 IX86_BUILTIN_PCMPEQQ256,
27387 IX86_BUILTIN_PCMPGTB256,
27388 IX86_BUILTIN_PCMPGTW256,
27389 IX86_BUILTIN_PCMPGTD256,
27390 IX86_BUILTIN_PCMPGTQ256,
27391 IX86_BUILTIN_PHADDW256,
27392 IX86_BUILTIN_PHADDD256,
27393 IX86_BUILTIN_PHADDSW256,
27394 IX86_BUILTIN_PHSUBW256,
27395 IX86_BUILTIN_PHSUBD256,
27396 IX86_BUILTIN_PHSUBSW256,
27397 IX86_BUILTIN_PMADDUBSW256,
27398 IX86_BUILTIN_PMADDWD256,
27399 IX86_BUILTIN_PMAXSB256,
27400 IX86_BUILTIN_PMAXSW256,
27401 IX86_BUILTIN_PMAXSD256,
27402 IX86_BUILTIN_PMAXUB256,
27403 IX86_BUILTIN_PMAXUW256,
27404 IX86_BUILTIN_PMAXUD256,
27405 IX86_BUILTIN_PMINSB256,
27406 IX86_BUILTIN_PMINSW256,
27407 IX86_BUILTIN_PMINSD256,
27408 IX86_BUILTIN_PMINUB256,
27409 IX86_BUILTIN_PMINUW256,
27410 IX86_BUILTIN_PMINUD256,
27411 IX86_BUILTIN_PMOVMSKB256,
27412 IX86_BUILTIN_PMOVSXBW256,
27413 IX86_BUILTIN_PMOVSXBD256,
27414 IX86_BUILTIN_PMOVSXBQ256,
27415 IX86_BUILTIN_PMOVSXWD256,
27416 IX86_BUILTIN_PMOVSXWQ256,
27417 IX86_BUILTIN_PMOVSXDQ256,
27418 IX86_BUILTIN_PMOVZXBW256,
27419 IX86_BUILTIN_PMOVZXBD256,
27420 IX86_BUILTIN_PMOVZXBQ256,
27421 IX86_BUILTIN_PMOVZXWD256,
27422 IX86_BUILTIN_PMOVZXWQ256,
27423 IX86_BUILTIN_PMOVZXDQ256,
27424 IX86_BUILTIN_PMULDQ256,
27425 IX86_BUILTIN_PMULHRSW256,
27426 IX86_BUILTIN_PMULHUW256,
27427 IX86_BUILTIN_PMULHW256,
27428 IX86_BUILTIN_PMULLW256,
27429 IX86_BUILTIN_PMULLD256,
27430 IX86_BUILTIN_PMULUDQ256,
27431 IX86_BUILTIN_POR256,
27432 IX86_BUILTIN_PSADBW256,
27433 IX86_BUILTIN_PSHUFB256,
27434 IX86_BUILTIN_PSHUFD256,
27435 IX86_BUILTIN_PSHUFHW256,
27436 IX86_BUILTIN_PSHUFLW256,
27437 IX86_BUILTIN_PSIGNB256,
27438 IX86_BUILTIN_PSIGNW256,
27439 IX86_BUILTIN_PSIGND256,
27440 IX86_BUILTIN_PSLLDQI256,
27441 IX86_BUILTIN_PSLLWI256,
27442 IX86_BUILTIN_PSLLW256,
27443 IX86_BUILTIN_PSLLDI256,
27444 IX86_BUILTIN_PSLLD256,
27445 IX86_BUILTIN_PSLLQI256,
27446 IX86_BUILTIN_PSLLQ256,
27447 IX86_BUILTIN_PSRAWI256,
27448 IX86_BUILTIN_PSRAW256,
27449 IX86_BUILTIN_PSRADI256,
27450 IX86_BUILTIN_PSRAD256,
27451 IX86_BUILTIN_PSRLDQI256,
27452 IX86_BUILTIN_PSRLWI256,
27453 IX86_BUILTIN_PSRLW256,
27454 IX86_BUILTIN_PSRLDI256,
27455 IX86_BUILTIN_PSRLD256,
27456 IX86_BUILTIN_PSRLQI256,
27457 IX86_BUILTIN_PSRLQ256,
27458 IX86_BUILTIN_PSUBB256,
27459 IX86_BUILTIN_PSUBW256,
27460 IX86_BUILTIN_PSUBD256,
27461 IX86_BUILTIN_PSUBQ256,
27462 IX86_BUILTIN_PSUBSB256,
27463 IX86_BUILTIN_PSUBSW256,
27464 IX86_BUILTIN_PSUBUSB256,
27465 IX86_BUILTIN_PSUBUSW256,
27466 IX86_BUILTIN_PUNPCKHBW256,
27467 IX86_BUILTIN_PUNPCKHWD256,
27468 IX86_BUILTIN_PUNPCKHDQ256,
27469 IX86_BUILTIN_PUNPCKHQDQ256,
27470 IX86_BUILTIN_PUNPCKLBW256,
27471 IX86_BUILTIN_PUNPCKLWD256,
27472 IX86_BUILTIN_PUNPCKLDQ256,
27473 IX86_BUILTIN_PUNPCKLQDQ256,
27474 IX86_BUILTIN_PXOR256,
27475 IX86_BUILTIN_MOVNTDQA256,
27476 IX86_BUILTIN_VBROADCASTSS_PS,
27477 IX86_BUILTIN_VBROADCASTSS_PS256,
27478 IX86_BUILTIN_VBROADCASTSD_PD256,
27479 IX86_BUILTIN_VBROADCASTSI256,
27480 IX86_BUILTIN_PBLENDD256,
27481 IX86_BUILTIN_PBLENDD128,
27482 IX86_BUILTIN_PBROADCASTB256,
27483 IX86_BUILTIN_PBROADCASTW256,
27484 IX86_BUILTIN_PBROADCASTD256,
27485 IX86_BUILTIN_PBROADCASTQ256,
27486 IX86_BUILTIN_PBROADCASTB128,
27487 IX86_BUILTIN_PBROADCASTW128,
27488 IX86_BUILTIN_PBROADCASTD128,
27489 IX86_BUILTIN_PBROADCASTQ128,
27490 IX86_BUILTIN_VPERMVARSI256,
27491 IX86_BUILTIN_VPERMDF256,
27492 IX86_BUILTIN_VPERMVARSF256,
27493 IX86_BUILTIN_VPERMDI256,
27494 IX86_BUILTIN_VPERMTI256,
27495 IX86_BUILTIN_VEXTRACT128I256,
27496 IX86_BUILTIN_VINSERT128I256,
27497 IX86_BUILTIN_MASKLOADD,
27498 IX86_BUILTIN_MASKLOADQ,
27499 IX86_BUILTIN_MASKLOADD256,
27500 IX86_BUILTIN_MASKLOADQ256,
27501 IX86_BUILTIN_MASKSTORED,
27502 IX86_BUILTIN_MASKSTOREQ,
27503 IX86_BUILTIN_MASKSTORED256,
27504 IX86_BUILTIN_MASKSTOREQ256,
27505 IX86_BUILTIN_PSLLVV4DI,
27506 IX86_BUILTIN_PSLLVV2DI,
27507 IX86_BUILTIN_PSLLVV8SI,
27508 IX86_BUILTIN_PSLLVV4SI,
27509 IX86_BUILTIN_PSRAVV8SI,
27510 IX86_BUILTIN_PSRAVV4SI,
27511 IX86_BUILTIN_PSRLVV4DI,
27512 IX86_BUILTIN_PSRLVV2DI,
27513 IX86_BUILTIN_PSRLVV8SI,
27514 IX86_BUILTIN_PSRLVV4SI,
27515
27516 IX86_BUILTIN_GATHERSIV2DF,
27517 IX86_BUILTIN_GATHERSIV4DF,
27518 IX86_BUILTIN_GATHERDIV2DF,
27519 IX86_BUILTIN_GATHERDIV4DF,
27520 IX86_BUILTIN_GATHERSIV4SF,
27521 IX86_BUILTIN_GATHERSIV8SF,
27522 IX86_BUILTIN_GATHERDIV4SF,
27523 IX86_BUILTIN_GATHERDIV8SF,
27524 IX86_BUILTIN_GATHERSIV2DI,
27525 IX86_BUILTIN_GATHERSIV4DI,
27526 IX86_BUILTIN_GATHERDIV2DI,
27527 IX86_BUILTIN_GATHERDIV4DI,
27528 IX86_BUILTIN_GATHERSIV4SI,
27529 IX86_BUILTIN_GATHERSIV8SI,
27530 IX86_BUILTIN_GATHERDIV4SI,
27531 IX86_BUILTIN_GATHERDIV8SI,
27532
27533 /* Alternate 4 element gather for the vectorizer where
27534 all operands are 32-byte wide. */
27535 IX86_BUILTIN_GATHERALTSIV4DF,
27536 IX86_BUILTIN_GATHERALTDIV8SF,
27537 IX86_BUILTIN_GATHERALTSIV4DI,
27538 IX86_BUILTIN_GATHERALTDIV8SI,
27539
27540 /* TFmode support builtins. */
27541 IX86_BUILTIN_INFQ,
27542 IX86_BUILTIN_HUGE_VALQ,
27543 IX86_BUILTIN_FABSQ,
27544 IX86_BUILTIN_COPYSIGNQ,
27545
27546 /* Vectorizer support builtins. */
27547 IX86_BUILTIN_CPYSGNPS,
27548 IX86_BUILTIN_CPYSGNPD,
27549 IX86_BUILTIN_CPYSGNPS256,
27550 IX86_BUILTIN_CPYSGNPD256,
27551
27552 /* FMA4 instructions. */
27553 IX86_BUILTIN_VFMADDSS,
27554 IX86_BUILTIN_VFMADDSD,
27555 IX86_BUILTIN_VFMADDPS,
27556 IX86_BUILTIN_VFMADDPD,
27557 IX86_BUILTIN_VFMADDPS256,
27558 IX86_BUILTIN_VFMADDPD256,
27559 IX86_BUILTIN_VFMADDSUBPS,
27560 IX86_BUILTIN_VFMADDSUBPD,
27561 IX86_BUILTIN_VFMADDSUBPS256,
27562 IX86_BUILTIN_VFMADDSUBPD256,
27563
27564 /* FMA3 instructions. */
27565 IX86_BUILTIN_VFMADDSS3,
27566 IX86_BUILTIN_VFMADDSD3,
27567
27568 /* XOP instructions. */
27569 IX86_BUILTIN_VPCMOV,
27570 IX86_BUILTIN_VPCMOV_V2DI,
27571 IX86_BUILTIN_VPCMOV_V4SI,
27572 IX86_BUILTIN_VPCMOV_V8HI,
27573 IX86_BUILTIN_VPCMOV_V16QI,
27574 IX86_BUILTIN_VPCMOV_V4SF,
27575 IX86_BUILTIN_VPCMOV_V2DF,
27576 IX86_BUILTIN_VPCMOV256,
27577 IX86_BUILTIN_VPCMOV_V4DI256,
27578 IX86_BUILTIN_VPCMOV_V8SI256,
27579 IX86_BUILTIN_VPCMOV_V16HI256,
27580 IX86_BUILTIN_VPCMOV_V32QI256,
27581 IX86_BUILTIN_VPCMOV_V8SF256,
27582 IX86_BUILTIN_VPCMOV_V4DF256,
27583
27584 IX86_BUILTIN_VPPERM,
27585
27586 IX86_BUILTIN_VPMACSSWW,
27587 IX86_BUILTIN_VPMACSWW,
27588 IX86_BUILTIN_VPMACSSWD,
27589 IX86_BUILTIN_VPMACSWD,
27590 IX86_BUILTIN_VPMACSSDD,
27591 IX86_BUILTIN_VPMACSDD,
27592 IX86_BUILTIN_VPMACSSDQL,
27593 IX86_BUILTIN_VPMACSSDQH,
27594 IX86_BUILTIN_VPMACSDQL,
27595 IX86_BUILTIN_VPMACSDQH,
27596 IX86_BUILTIN_VPMADCSSWD,
27597 IX86_BUILTIN_VPMADCSWD,
27598
27599 IX86_BUILTIN_VPHADDBW,
27600 IX86_BUILTIN_VPHADDBD,
27601 IX86_BUILTIN_VPHADDBQ,
27602 IX86_BUILTIN_VPHADDWD,
27603 IX86_BUILTIN_VPHADDWQ,
27604 IX86_BUILTIN_VPHADDDQ,
27605 IX86_BUILTIN_VPHADDUBW,
27606 IX86_BUILTIN_VPHADDUBD,
27607 IX86_BUILTIN_VPHADDUBQ,
27608 IX86_BUILTIN_VPHADDUWD,
27609 IX86_BUILTIN_VPHADDUWQ,
27610 IX86_BUILTIN_VPHADDUDQ,
27611 IX86_BUILTIN_VPHSUBBW,
27612 IX86_BUILTIN_VPHSUBWD,
27613 IX86_BUILTIN_VPHSUBDQ,
27614
27615 IX86_BUILTIN_VPROTB,
27616 IX86_BUILTIN_VPROTW,
27617 IX86_BUILTIN_VPROTD,
27618 IX86_BUILTIN_VPROTQ,
27619 IX86_BUILTIN_VPROTB_IMM,
27620 IX86_BUILTIN_VPROTW_IMM,
27621 IX86_BUILTIN_VPROTD_IMM,
27622 IX86_BUILTIN_VPROTQ_IMM,
27623
27624 IX86_BUILTIN_VPSHLB,
27625 IX86_BUILTIN_VPSHLW,
27626 IX86_BUILTIN_VPSHLD,
27627 IX86_BUILTIN_VPSHLQ,
27628 IX86_BUILTIN_VPSHAB,
27629 IX86_BUILTIN_VPSHAW,
27630 IX86_BUILTIN_VPSHAD,
27631 IX86_BUILTIN_VPSHAQ,
27632
27633 IX86_BUILTIN_VFRCZSS,
27634 IX86_BUILTIN_VFRCZSD,
27635 IX86_BUILTIN_VFRCZPS,
27636 IX86_BUILTIN_VFRCZPD,
27637 IX86_BUILTIN_VFRCZPS256,
27638 IX86_BUILTIN_VFRCZPD256,
27639
27640 IX86_BUILTIN_VPCOMEQUB,
27641 IX86_BUILTIN_VPCOMNEUB,
27642 IX86_BUILTIN_VPCOMLTUB,
27643 IX86_BUILTIN_VPCOMLEUB,
27644 IX86_BUILTIN_VPCOMGTUB,
27645 IX86_BUILTIN_VPCOMGEUB,
27646 IX86_BUILTIN_VPCOMFALSEUB,
27647 IX86_BUILTIN_VPCOMTRUEUB,
27648
27649 IX86_BUILTIN_VPCOMEQUW,
27650 IX86_BUILTIN_VPCOMNEUW,
27651 IX86_BUILTIN_VPCOMLTUW,
27652 IX86_BUILTIN_VPCOMLEUW,
27653 IX86_BUILTIN_VPCOMGTUW,
27654 IX86_BUILTIN_VPCOMGEUW,
27655 IX86_BUILTIN_VPCOMFALSEUW,
27656 IX86_BUILTIN_VPCOMTRUEUW,
27657
27658 IX86_BUILTIN_VPCOMEQUD,
27659 IX86_BUILTIN_VPCOMNEUD,
27660 IX86_BUILTIN_VPCOMLTUD,
27661 IX86_BUILTIN_VPCOMLEUD,
27662 IX86_BUILTIN_VPCOMGTUD,
27663 IX86_BUILTIN_VPCOMGEUD,
27664 IX86_BUILTIN_VPCOMFALSEUD,
27665 IX86_BUILTIN_VPCOMTRUEUD,
27666
27667 IX86_BUILTIN_VPCOMEQUQ,
27668 IX86_BUILTIN_VPCOMNEUQ,
27669 IX86_BUILTIN_VPCOMLTUQ,
27670 IX86_BUILTIN_VPCOMLEUQ,
27671 IX86_BUILTIN_VPCOMGTUQ,
27672 IX86_BUILTIN_VPCOMGEUQ,
27673 IX86_BUILTIN_VPCOMFALSEUQ,
27674 IX86_BUILTIN_VPCOMTRUEUQ,
27675
27676 IX86_BUILTIN_VPCOMEQB,
27677 IX86_BUILTIN_VPCOMNEB,
27678 IX86_BUILTIN_VPCOMLTB,
27679 IX86_BUILTIN_VPCOMLEB,
27680 IX86_BUILTIN_VPCOMGTB,
27681 IX86_BUILTIN_VPCOMGEB,
27682 IX86_BUILTIN_VPCOMFALSEB,
27683 IX86_BUILTIN_VPCOMTRUEB,
27684
27685 IX86_BUILTIN_VPCOMEQW,
27686 IX86_BUILTIN_VPCOMNEW,
27687 IX86_BUILTIN_VPCOMLTW,
27688 IX86_BUILTIN_VPCOMLEW,
27689 IX86_BUILTIN_VPCOMGTW,
27690 IX86_BUILTIN_VPCOMGEW,
27691 IX86_BUILTIN_VPCOMFALSEW,
27692 IX86_BUILTIN_VPCOMTRUEW,
27693
27694 IX86_BUILTIN_VPCOMEQD,
27695 IX86_BUILTIN_VPCOMNED,
27696 IX86_BUILTIN_VPCOMLTD,
27697 IX86_BUILTIN_VPCOMLED,
27698 IX86_BUILTIN_VPCOMGTD,
27699 IX86_BUILTIN_VPCOMGED,
27700 IX86_BUILTIN_VPCOMFALSED,
27701 IX86_BUILTIN_VPCOMTRUED,
27702
27703 IX86_BUILTIN_VPCOMEQQ,
27704 IX86_BUILTIN_VPCOMNEQ,
27705 IX86_BUILTIN_VPCOMLTQ,
27706 IX86_BUILTIN_VPCOMLEQ,
27707 IX86_BUILTIN_VPCOMGTQ,
27708 IX86_BUILTIN_VPCOMGEQ,
27709 IX86_BUILTIN_VPCOMFALSEQ,
27710 IX86_BUILTIN_VPCOMTRUEQ,
27711
27712 /* LWP instructions. */
27713 IX86_BUILTIN_LLWPCB,
27714 IX86_BUILTIN_SLWPCB,
27715 IX86_BUILTIN_LWPVAL32,
27716 IX86_BUILTIN_LWPVAL64,
27717 IX86_BUILTIN_LWPINS32,
27718 IX86_BUILTIN_LWPINS64,
27719
27720 IX86_BUILTIN_CLZS,
27721
27722 /* RTM */
27723 IX86_BUILTIN_XBEGIN,
27724 IX86_BUILTIN_XEND,
27725 IX86_BUILTIN_XABORT,
27726 IX86_BUILTIN_XTEST,
27727
27728 /* BMI instructions. */
27729 IX86_BUILTIN_BEXTR32,
27730 IX86_BUILTIN_BEXTR64,
27731 IX86_BUILTIN_CTZS,
27732
27733 /* TBM instructions. */
27734 IX86_BUILTIN_BEXTRI32,
27735 IX86_BUILTIN_BEXTRI64,
27736
27737 /* BMI2 instructions. */
27738 IX86_BUILTIN_BZHI32,
27739 IX86_BUILTIN_BZHI64,
27740 IX86_BUILTIN_PDEP32,
27741 IX86_BUILTIN_PDEP64,
27742 IX86_BUILTIN_PEXT32,
27743 IX86_BUILTIN_PEXT64,
27744
27745 /* ADX instructions. */
27746 IX86_BUILTIN_ADDCARRYX32,
27747 IX86_BUILTIN_ADDCARRYX64,
27748
27749 /* FSGSBASE instructions. */
27750 IX86_BUILTIN_RDFSBASE32,
27751 IX86_BUILTIN_RDFSBASE64,
27752 IX86_BUILTIN_RDGSBASE32,
27753 IX86_BUILTIN_RDGSBASE64,
27754 IX86_BUILTIN_WRFSBASE32,
27755 IX86_BUILTIN_WRFSBASE64,
27756 IX86_BUILTIN_WRGSBASE32,
27757 IX86_BUILTIN_WRGSBASE64,
27758
27759 /* RDRND instructions. */
27760 IX86_BUILTIN_RDRAND16_STEP,
27761 IX86_BUILTIN_RDRAND32_STEP,
27762 IX86_BUILTIN_RDRAND64_STEP,
27763
27764 /* RDSEED instructions. */
27765 IX86_BUILTIN_RDSEED16_STEP,
27766 IX86_BUILTIN_RDSEED32_STEP,
27767 IX86_BUILTIN_RDSEED64_STEP,
27768
27769 /* F16C instructions. */
27770 IX86_BUILTIN_CVTPH2PS,
27771 IX86_BUILTIN_CVTPH2PS256,
27772 IX86_BUILTIN_CVTPS2PH,
27773 IX86_BUILTIN_CVTPS2PH256,
27774
27775 /* CFString built-in for darwin */
27776 IX86_BUILTIN_CFSTRING,
27777
27778 /* Builtins to get CPU type and supported features. */
27779 IX86_BUILTIN_CPU_INIT,
27780 IX86_BUILTIN_CPU_IS,
27781 IX86_BUILTIN_CPU_SUPPORTS,
27782
27783 IX86_BUILTIN_MAX
27784 };
27785
27786 /* Table for the ix86 builtin decls. */
27787 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
27788
27789 /* Table of all of the builtin functions that are possible with different ISAs
27790    but are waiting to be built until a function is declared to use that
27791    ISA.  */
27792 struct builtin_isa {
27793 const char *name; /* function name */
27794 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
27795 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
27796 bool const_p; /* true if the declaration is constant */
27797 bool set_and_not_built_p;
27798 };
27799
27800 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
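
/* A minimal sketch, not part of the original file: once builtins are
   registered, the decl for a given code is looked up directly by its enum
   value, roughly

     tree decl = ix86_builtins[(int) IX86_BUILTIN_PADDB128];
     if (decl == NULL_TREE)
       ;  -- still deferred; see ix86_add_new_builtins below

   ix86_builtins_isa keeps just enough information (name, type code, ISA
   mask) to create such a decl later, once the required ISA is enabled.  */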
27801
27802
27803 /* Add an ix86 target builtin function with CODE, NAME and TYPE.  Save the MASK
27804    of which isa_flags to use in the ix86_builtins_isa array.  Store the
27805    function decl in the ix86_builtins array.  Return the function decl, or
27806    NULL_TREE if the builtin was not added.
27807
27808    If the front end has a special hook for builtin functions, delay adding
27809    builtin functions that aren't in the current ISA until the ISA is changed
27810    with function specific optimization.  Doing so can save about 300K for the
27811    default compiler.  When the builtin is expanded, check at that time whether
27812    it is valid.
27813
27814    If the front end doesn't have a special hook, record all builtins, even
27815    those whose instruction set isn't in the current ISA, in case the user uses
27816    function specific options for a different ISA, so that we don't get scope
27817    errors if a builtin is added in the middle of a function scope.  */
27818
27819 static inline tree
27820 def_builtin (HOST_WIDE_INT mask, const char *name,
27821 enum ix86_builtin_func_type tcode,
27822 enum ix86_builtins code)
27823 {
27824 tree decl = NULL_TREE;
27825
27826 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
27827 {
27828 ix86_builtins_isa[(int) code].isa = mask;
27829
27830 mask &= ~OPTION_MASK_ISA_64BIT;
27831 if (mask == 0
27832 || (mask & ix86_isa_flags) != 0
27833 || (lang_hooks.builtin_function
27834 == lang_hooks.builtin_function_ext_scope))
27836 {
27837 tree type = ix86_get_builtin_func_type (tcode);
27838 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
27839 NULL, NULL_TREE);
27840 ix86_builtins[(int) code] = decl;
27841 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
27842 }
27843 else
27844 {
27845 ix86_builtins[(int) code] = NULL_TREE;
27846 ix86_builtins_isa[(int) code].tcode = tcode;
27847 ix86_builtins_isa[(int) code].name = name;
27848 ix86_builtins_isa[(int) code].const_p = false;
27849 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
27850 }
27851 }
27852
27853 return decl;
27854 }
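
/* Illustrative only (hypothetical name and code): a single registration
   through def_builtin looks roughly like

     def_builtin (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_example",
                  V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_EXAMPLE);

   If OPTION_MASK_ISA_SSE4_1 is not in ix86_isa_flags and the front end
   supports extern-scope builtins, only the ix86_builtins_isa slot is filled
   in and the decl itself is created later by ix86_add_new_builtins.  */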
27855
27856 /* Like def_builtin, but also marks the function decl "const". */
27857
27858 static inline tree
27859 def_builtin_const (HOST_WIDE_INT mask, const char *name,
27860 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
27861 {
27862 tree decl = def_builtin (mask, name, tcode, code);
27863 if (decl)
27864 TREE_READONLY (decl) = 1;
27865 else
27866 ix86_builtins_isa[(int) code].const_p = true;
27867
27868 return decl;
27869 }
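
/* Illustrative only: the arithmetic builtins in the enum above are mostly
   registered through this wrapper from a table walk, the moral equivalent of

     def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_paddb128",
                        V16QI_FTYPE_V16QI_V16QI, IX86_BUILTIN_PADDB128);

   which marks the decl TREE_READONLY so calls to it can be CSEd.  When the
   decl is deferred instead, const_p records that it should be marked
   read-only once it is finally built.  */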
27870
27871 /* Add any new builtin functions for a given ISA that may not have been
27872    declared yet.  This saves a bit of space compared to adding all of the
27873    declarations to the tree up front, even when they are never used.  */
27874
27875 static void
27876 ix86_add_new_builtins (HOST_WIDE_INT isa)
27877 {
27878 int i;
27879
27880   for (i = 0; i < (int) IX86_BUILTIN_MAX; i++)
27881 {
27882 if ((ix86_builtins_isa[i].isa & isa) != 0
27883 && ix86_builtins_isa[i].set_and_not_built_p)
27884 {
27885 tree decl, type;
27886
27887 /* Don't define the builtin again. */
27888 ix86_builtins_isa[i].set_and_not_built_p = false;
27889
27890 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
27891 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
27892 type, i, BUILT_IN_MD, NULL,
27893 NULL_TREE);
27894
27895 ix86_builtins[i] = decl;
27896 if (ix86_builtins_isa[i].const_p)
27897 TREE_READONLY (decl) = 1;
27898 }
27899 }
27900 }
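
/* Illustrative only: this is invoked when the effective ISA set grows, for
   example when a definition such as

     __attribute__ ((target ("avx2"))) void f (void);

   switches ix86_isa_flags for that function; a call along the lines of

     ix86_add_new_builtins (ix86_isa_flags);

   then materializes any deferred builtin decls so they are usable there.  */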
27901
27902 /* Bits for builtin_description.flag. */
27903
27904 /* Set when we don't support the comparison natively, and should
27905    swap the comparison operands in order to support it.  */
27906 #define BUILTIN_DESC_SWAP_OPERANDS 1
27907
27908 struct builtin_description
27909 {
27910 const HOST_WIDE_INT mask;
27911 const enum insn_code icode;
27912 const char *const name;
27913 const enum ix86_builtins code;
27914 const enum rtx_code comparison;
27915 const int flag;
27916 };
27917
27918 static const struct builtin_description bdesc_comi[] =
27919 {
27920 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
27921 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
27922 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
27923 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
27924 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
27925 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
27926 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
27927 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
27928 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
27929 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
27930 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
27931 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
27932 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
27933 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
27934 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
27935 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
27936 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
27937 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
27938 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
27939 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
27940 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
27941 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
27942 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
27943 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
27944 };
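
/* User-level view, for reference only: each bdesc_comi row names the insn
   pattern and the rtx comparison code used when the builtin is expanded, so
   <xmmintrin.h> can implement _mm_comieq_ss roughly as

     int equal = __builtin_ia32_comieq (a, b);   -- a, b of type __v4sf

   with the UNEQ code above selecting the condition tested on the flags set
   by the COMISS instruction.  */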
27945
27946 static const struct builtin_description bdesc_pcmpestr[] =
27947 {
27948 /* SSE4.2 */
27949 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
27950 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
27951 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
27952 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
27953 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
27954 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
27955 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
27956 };
27957
27958 static const struct builtin_description bdesc_pcmpistr[] =
27959 {
27960 /* SSE4.2 */
27961 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
27962 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
27963 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
27964 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
27965 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
27966 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
27967 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
27968 };
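
/* Note, illustrative only: for the pcmpestr/pcmpistr rows the flag field
   carries a CC mode instead of BUILTIN_DESC_SWAP_OPERANDS; e.g. the CCCmode
   entry lets the expander return the carry flag of the string compare, which
   is roughly how <smmintrin.h> implements _mm_cmpistrc:

     int carry = __builtin_ia32_pcmpistric128 (a, b, 0);   -- a, b: __v16qi
   */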
27969
27970 /* Special builtins with a variable number of arguments.  */
27971 static const struct builtin_description bdesc_special_args[] =
27972 {
27973 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
27974 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
27975 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
27976
27977 /* 80387 (for use internally for atomic compound assignment). */
27978 { 0, CODE_FOR_fnstenv, "__builtin_ia32_fnstenv", IX86_BUILTIN_FNSTENV, UNKNOWN, (int) VOID_FTYPE_PVOID },
27979 { 0, CODE_FOR_fldenv, "__builtin_ia32_fldenv", IX86_BUILTIN_FLDENV, UNKNOWN, (int) VOID_FTYPE_PCVOID },
27980 { 0, CODE_FOR_fnstsw, "__builtin_ia32_fnstsw", IX86_BUILTIN_FNSTSW, UNKNOWN, (int) VOID_FTYPE_PUSHORT },
27981 { 0, CODE_FOR_fnclex, "__builtin_ia32_fnclex", IX86_BUILTIN_FNCLEX, UNKNOWN, (int) VOID_FTYPE_VOID },
27982
27983 /* MMX */
27984 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
27985
27986 /* 3DNow! */
27987 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
27988
27989 /* FXSR, XSAVE and XSAVEOPT */
27990 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
27991 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
27992 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27993 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27994 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27995
27996 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
27997 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
27998 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27999 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28000 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28001
28002 /* SSE */
28003 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
28004 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
28005 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
28006
28007 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
28008 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
28009 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
28010 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
28011
28012 /* SSE or 3DNow!A */
28013 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
28014 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
28015
28016 /* SSE2 */
28017 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
28018 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
28019 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
28020 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedquv16qi, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
28021 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
28022 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
28023 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
28024 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
28025 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
28026 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddquv16qi, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
28027
28028 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
28029 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
28030
28031 /* SSE3 */
28032 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
28033
28034 /* SSE4.1 */
28035 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
28036
28037 /* SSE4A */
28038 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
28039 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
28040
28041 /* AVX */
28042 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
28043 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
28044
28045 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
28046 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
28047 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
28048 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
28049 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
28050
28051 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
28052 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
28053 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
28054 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
28055 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddquv32qi, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
28056 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedquv32qi, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
28057 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
28058
28059 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
28060 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
28061 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
28062
28063 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
28064 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
28065 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
28066 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
28067 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
28068 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
28069 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
28070 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
28071
28072 /* AVX2 */
28073 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
28074 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
28075 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
28076 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
28077 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
28078 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
28079 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
28080 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
28081 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
28082
28083 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
28084 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
28085 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
28086 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
28087 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
28088 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
28089
28090 /* FSGSBASE */
28091 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28092 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
28093 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28094 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
28095 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
28096 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
28097 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
28098 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
28099
28100 /* RTM */
28101 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28102 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
28103 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
28104 };
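
/* User-level view, for reference only: the "special" builtins above touch
   memory or have other side effects, so their signatures involve pointers or
   void, e.g.

     float dst[4] __attribute__ ((aligned (16)));
     __builtin_ia32_movntps (dst, v);    -- v is a __v4sf value
     __builtin_ia32_sfence ();

   which is roughly what <xmmintrin.h> expands _mm_stream_ps and _mm_sfence
   to.  */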
28105
28106 /* Builtins with a variable number of arguments.  */
28107 static const struct builtin_description bdesc_args[] =
28108 {
28109 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
28110 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
28111 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
28112 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
28113 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
28114 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
28115 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
28116
28117 /* MMX */
28118 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28119 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28120 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28121 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28122 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28123 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28124
28125 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28126 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28127 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28128 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28129 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28130 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28131 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28132 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28133
28134 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28135 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28136
28137 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28138 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28139 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28140 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28141
28142 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28143 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28144 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28145 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28146 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28147 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28148
28149 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28150 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28151 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28152 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28153 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
28154 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
28155
28156 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
28157 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
28158 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
28159
28160 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
28161
28162 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28163 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28164 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
28165 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28166 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28167 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
28168
28169 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28170 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28171 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
28172 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28173 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28174 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
28175
28176 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28177 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28178 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28179 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28180
28181 /* 3DNow! */
28182 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
28183 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
28184 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28185 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28186
28187 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28188 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28189 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28190 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28191 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28192 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28193 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28194 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28195 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28196 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28197 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28198 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28199 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28200 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28201 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28202
28203 /* 3DNow!A */
28204 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
28205 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
28206 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
28207 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28208 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28209 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28210
28211 /* SSE */
28212 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
28213 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28214 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28215 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28216 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28217 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28218 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
28219 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
28220 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
28221 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
28222 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
28223 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
28224
28225 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28226
28227 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28228 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28229 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28230 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28231 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28232 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28233 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28234 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28235
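/* Note: the packed-compare rows below all expand through the single
   sse_maskcmpv4sf3 pattern; the rtx_code field picks the condition.
   The GT/GE builtins reuse LT/LE with a *_SWAP type (operands swapped),
   and the negated forms use the unordered-or codes UNGE/UNGT.  */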
28236 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
28237 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
28238 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
28239 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28240 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28241 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28242 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
28243 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
28244 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
28245 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28246 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28247 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28248 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
28249 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
28250 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
28251 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28252 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
28253 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
28254 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
28255 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28256
28257 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28258 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28259 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28260 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28261
28262 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28263 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28264 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28265 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28266
28267 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28268
28269 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28270 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28271 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28272 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28273 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28274
28275 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
28276 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
28277 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
28278
28279 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
28280
28281 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28282 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28283 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28284
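/* Note: rows with a null name (these two and the AES/PCLMUL rows further
   down) are only used on the expansion side; their user-visible builtin
   declarations are registered separately with their own ISA requirements.  */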
28285 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
28286 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
28287
28288 /* SSE MMX or 3DNow!A */
28289 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28290 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28291 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28292
28293 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28294 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28295 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28296 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28297
28298 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
28299 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
28300
28301 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
28302
28303 /* SSE2 */
28304 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28305
28306 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
28307 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
28308 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
28309 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
28310 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
28311
28312 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
28313 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
28314 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
28315 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
28316 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
28317
28318 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
28319
28320 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
28321 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
28322 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
28323 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
28324
28325 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_fix_notruncv4sfv4si, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28326 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
28327 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28328
28329 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28330 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28331 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28332 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28333 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28334 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28335 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28336 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28337
28338 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
28339 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
28340 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
28341 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28342 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28343 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28344 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
28345 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
28346 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
28347 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28348 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28349 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28350 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
28351 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
28352 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
28353 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28354 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
28355 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
28356 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
28357 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28358
28359 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28360 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28361 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28362 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28363
28364 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28365 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28366 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28367 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28368
28369 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28370
28371 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28372 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28373 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28374
28375 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
28376
28377 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28378 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28379 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28380 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28381 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28382 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28383 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28384 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28385
28386 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28387 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28388 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28389 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28390 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28391 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28392 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28393 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28394
28395 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28396 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28397
28398 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28399 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28400 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28401 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28402
28403 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28404 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28405
28406 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28407 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28408 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28409 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28410 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28411 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28412
28413 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28414 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28415 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28416 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28417
28418 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28419 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28420 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28421 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28422 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28423 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28424 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28425 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28426
28427 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
28428 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28429 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
28430
28431 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28432 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
28433
28434 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
28435 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28436
28437 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
28438
28439 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
28440 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
28441 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
28442 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
28443
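/* Note: for the shift builtins below, the *_SI_COUNT types take the shift
   count as a plain integer, the *_Vn_COUNT types take it in the low part of
   a vector operand, and the whole-register byte shifts (pslldqi/psrldqi)
   use the *_INT_CONVERT forms.  */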
28444 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
28445 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28446 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28447 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
28448 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28449 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28450 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
28451
28452 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
28453 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28454 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28455 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
28456 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28457 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28458 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
28459
28460 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28461 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28462 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28463 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28464
28465 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
28466 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
28467 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
28468
28469 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
28470
28471 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28472
28473 /* SSE2 MMX */
28474 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
28475 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
28476
28477 /* SSE3 */
28478 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF},
28479 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28480
28481 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28482 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28483 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28484 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28485 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28486 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28487
28488 /* SSSE3 */
28489 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28490 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
28491 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28492 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
28493 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28494 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
28495
28496 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28497 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28498 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28499 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28500 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28501 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28502 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28503 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28504 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28505 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28506 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28507 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28508 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
28509 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
28510 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28511 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28512 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28513 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28514 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28515 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28516 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28517 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28518 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28519 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28520
28521 /* SSSE3 palignr */
28522 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
28523 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
28524
28525 /* SSE4.1 */
28526 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28527 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28528 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
28529 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
28530 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28531 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28532 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28533 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
28534 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
28535 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
28536
28537 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28538 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28539 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28540 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28541 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28542 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28543 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28544 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28545 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28546 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28547 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28548 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28549 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28550
28551 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28552 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28553 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28554 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28555 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28556 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28557 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28558 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28559 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28560 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28561 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28562 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28563
28564 /* SSE4.1 (rounding and ptest) */
28565 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28566 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28567 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28568 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28569
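/* Note: for the floor/ceil/trunc/rint wrappers below, the comparison-code
   slot is reused (hence the enum rtx_code casts) to carry the ROUND_*
   constant that supplies the rounding-mode immediate of the underlying
   round pattern.  */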
28570 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
28571 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
28572 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
28573 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
28574
28575 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28576 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28577
28578 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
28579 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
28580
28581 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
28582 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
28583 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
28584 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
28585
28586 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
28587 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
28588
28589 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28590 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28591
28592 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28593 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28594 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28595
28596 /* SSE4.2 */
28597 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28598 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
28599 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
28600 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28601 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28602
28603 /* SSE4A */
28604 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
28605 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
28606 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
28607 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28608
28609 /* AES */
28610 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
28611 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28612
28613 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28614 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28615 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28616 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28617
28618 /* PCLMUL */
28619 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
28620
28621 /* AVX */
28622 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28623 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28624 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28625 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28626 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28627 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28628 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28629 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28630 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28631 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28632 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28633 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28634 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28635 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28636 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28637 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28638 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28639 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28640 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28641 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28642 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28643 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28644 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28645 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28646 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28647 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28648
28649 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
28650 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
28651 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
28652 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28653
28654 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28655 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28656 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
28657 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
28658 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28659 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28660 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28661 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28662 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28663 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28664 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28665 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28666 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28667 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
28668 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
28669 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
28670 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
28671 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
28672 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
28673 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_fix_notruncv8sfv8si, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28674 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
28675 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28676 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28677 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28678 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28679 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28680 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28681 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28682 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28683 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28684 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28685 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
28686 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
28687 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
28688
28689 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28690 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28691 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28692
28693 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28694 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28695 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28696 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28697 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28698
28699 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28700
28701 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28702 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28703
28704 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
28705 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
28706 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
28707 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
28708
28709 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28710 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28711
28712 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28713 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28714
28715 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
28716 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
28717 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
28718 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
28719
28720 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
28721 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
28722
28723 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28724 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28725
28726 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28727 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28728 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28729 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28730
28731 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28732 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28733 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28734 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
28735 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
28736 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
28737
28738 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28739 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28740 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28741 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28742 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28743 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28744 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28745 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28746 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28747 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28748 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28749 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28750 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28751 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28752 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28753
28754 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
28755 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
28756
28757 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28758 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28759
28760 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28761
28762 /* AVX2 */
28763 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
28764 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
28765 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
28766 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
28767 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28768 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28769 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28770 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28771 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28772 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28773 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28774 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28775 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28776 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28777 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28778 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28779 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
28780 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28781 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28782 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28783 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28784 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
28785 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
28786 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28787 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28788 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28789 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28790 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28791 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28792 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28793 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28794 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28795 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28796 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28797 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28798 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28799 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28800 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28801 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
28802 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28803 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28804 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28805 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28806 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28807 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28808 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28809 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28810 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28811 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28812 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28813 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28814 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
28815 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28816 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28817 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28818 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28819 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28820 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28821 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28822 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28823 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28824 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28825 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28826 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28827 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28828 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28829 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28830 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28831 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28832 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28833 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28834 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28835 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28836 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28837 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
28838 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28839 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28840 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28841 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28842 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28843 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28844 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28845 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28846 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28847 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28848 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28849 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28850 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28851 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28852 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28853 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28854 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28855 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28856 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28857 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28858 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28859 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28860 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28861 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28862 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28863 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28864 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28865 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28866 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28867 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28868 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28869 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28870 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28871 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28872 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28873 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28874 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28875 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28876 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28877 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28878 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28879 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28880 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28881 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28882 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
28883 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28884 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
28885 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
28886 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28887 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28888 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28889 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28890 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28891 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28892 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28893 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28894 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28895 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
28896 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
28897 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
28898 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
28899 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28900 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28901 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28902 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28903 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28904 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28905 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28906 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28907 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28908 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28909
28910 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
28911
28912 /* BMI */
28913 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28914 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28915 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
28916
28917 /* TBM */
28918 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28919 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28920
28921 /* F16C */
28922 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
28923 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
28924 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
28925 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
28926
28927 /* BMI2 */
28928 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28929 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28930 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28931 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28932 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28933 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28934 };
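/* A minimal illustrative sketch of how one row of the table above is meant
   to be read: each initializer pairs an ISA option mask, an insn pattern,
   the user-visible builtin name, its IX86_BUILTIN_* code, an optional rtx
   comparison code, and a prototype index.  Taking the BMI row for
   "__builtin_ia32_bextr_u32" (UINT_FTYPE_UINT_UINT) as an example, user
   code compiled with -mbmi could call it directly; the function name
   extract_field below is hypothetical:

     unsigned int
     extract_field (unsigned int src, unsigned int ctrl)
     {
       return __builtin_ia32_bextr_u32 (src, ctrl);
     }

   The field description here follows the initializer order and is
   illustrative, not a definitive statement of the struct layout.  */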
28935
28936 /* FMA4 and XOP. */
28937 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
28938 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
28939 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
28940 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
28941 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
28942 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
28943 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
28944 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
28945 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
28946 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
28947 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
28948 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
28949 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
28950 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
28951 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
28952 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
28953 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
28954 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
28955 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
28956 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
28957 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
28958 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
28959 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
28960 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
28961 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
28962 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
28963 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
28964 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
28965 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
28966 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
28967 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
28968 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
28969 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
28970 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
28971 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
28972 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
28973 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
28974 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
28975 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
28976 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
28977 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
28978 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
28979 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
28980 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
28981 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
28982 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
28983 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
28984 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
28985 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
28986 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
28987 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
28988 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
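/* The MULTI_ARG_* names above are shorthand aliases for the V*_FTYPE_*
   prototype enumerators, kept short so the FMA4/XOP table below stays
   readable.  For instance, MULTI_ARG_3_SF stands for
   V4SF_FTYPE_V4SF_V4SF_V4SF, so a builtin tagged with it, such as
   "__builtin_ia32_vfmaddss", takes three V4SF operands and returns a V4SF.
   A hypothetical user-level call, assuming -mfma4 and a local vector
   typedef, would look like:

     typedef float v4sf __attribute__ ((__vector_size__ (16)));

     v4sf
     fmadd_ss (v4sf a, v4sf b, v4sf c)
     {
       return __builtin_ia32_vfmaddss (a, b, c);
     }

   This is only a sketch of how the prototype macro maps to a call; the
   wrappers actually exposed to users live in the intrinsic headers.  */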
28989
28990 static const struct builtin_description bdesc_multi_arg[] =
28991 {
28992 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
28993 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
28994 UNKNOWN, (int)MULTI_ARG_3_SF },
28995 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
28996 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
28997 UNKNOWN, (int)MULTI_ARG_3_DF },
28998
28999 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
29000 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
29001 UNKNOWN, (int)MULTI_ARG_3_SF },
29002 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
29003 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
29004 UNKNOWN, (int)MULTI_ARG_3_DF },
29005
29006 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
29007 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
29008 UNKNOWN, (int)MULTI_ARG_3_SF },
29009 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
29010 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
29011 UNKNOWN, (int)MULTI_ARG_3_DF },
29012 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
29013 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
29014 UNKNOWN, (int)MULTI_ARG_3_SF2 },
29015 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
29016 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
29017 UNKNOWN, (int)MULTI_ARG_3_DF2 },
29018
29019 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
29020 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
29021 UNKNOWN, (int)MULTI_ARG_3_SF },
29022 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
29023 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
29024 UNKNOWN, (int)MULTI_ARG_3_DF },
29025 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
29026 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
29027 UNKNOWN, (int)MULTI_ARG_3_SF2 },
29028 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
29029 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
29030 UNKNOWN, (int)MULTI_ARG_3_DF2 },
29031
29032 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
29033 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
29034 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
29035 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
29036 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
29037 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
29038 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
29039
29040 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
29041 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
29042 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
29043 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
29044 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
29045 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
29046 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
29047
29048 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
29049
29050 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
29051 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
29052 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29053 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29054 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
29055 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
29056 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29057 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29058 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29059 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29060 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29061 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29062
29063 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29064 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
29065 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
29066 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
29067 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
29068 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
29069 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
29070 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
29071 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29072 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
29073 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
29074 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
29075 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29076 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
29077 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
29078 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
29079
29080 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
29081 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
29082 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
29083 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
29084 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
29085 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
29086
29087 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29088 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
29089 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
29090 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29091 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
29092 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29093 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29094 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
29095 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
29096 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29097 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
29098 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29099 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29100 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29101 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29102
29103 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
29104 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
29105 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
29106 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
29107 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
29108 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
29109 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
29110
29111 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
29112 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
29113 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
29114 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
29115 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
29116 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
29117 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
29118
29119 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
29120 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
29121 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
29122 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
29123 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
29124 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
29125 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
29126
29127 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
29128 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
29129 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
29130 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
29131 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
29132 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
29133 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
29134
29135 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
29136 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
29137 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
29138 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
29139 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
29140 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
29141 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
29142
29143 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
29144 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
29145 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
29146 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
29147 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
29148 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
29149 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
29150
29151 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
29152 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
29153 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
29154 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
29155 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
29156 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
29157 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
29158
29159 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
29160 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
29161 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
29162 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
29163 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
29164 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
29165 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
29166
29167 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
29168 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
29169 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
29170 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
29171 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub", IX86_BUILTIN_VPCOMFALSEUB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
29172 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw", IX86_BUILTIN_VPCOMFALSEUW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
29173 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud", IX86_BUILTIN_VPCOMFALSEUD, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
29174 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq", IX86_BUILTIN_VPCOMFALSEUQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
29175
29176 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
29177 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
29178 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
29179 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
29180 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
29181 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
29182 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
29183 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
29184
29185 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
29186 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
29187 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
29188 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
29189
29190 };
29191 \f
29192 /* TM vector builtins. */
29193
29194 /* Reuse the existing x86-specific `struct builtin_description' because
29195 we're lazy. Add casts to make them fit. */
29196 static const struct builtin_description bdesc_tm[] =
29197 {
29198 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29199 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29200 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29201 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29202 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29203 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29204 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29205
29206 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29207 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29208 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29209 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29210 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29211 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29212 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29213
29214 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29215 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29216 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29217 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29218 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29219 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29220 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29221
29222 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
29223 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
29224 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
29225 };
29226
29227 /* TM callbacks. */
29228
29229 /* Return the builtin decl needed to load a vector of TYPE. */
29230
29231 static tree
29232 ix86_builtin_tm_load (tree type)
29233 {
29234 if (TREE_CODE (type) == VECTOR_TYPE)
29235 {
29236 switch (tree_to_uhwi (TYPE_SIZE (type)))
29237 {
29238 case 64:
29239 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
29240 case 128:
29241 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
29242 case 256:
29243 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
29244 }
29245 }
29246 return NULL_TREE;
29247 }
29248
29249 /* Return the builtin decl needed to store a vector of TYPE. */
29250
29251 static tree
29252 ix86_builtin_tm_store (tree type)
29253 {
29254 if (TREE_CODE (type) == VECTOR_TYPE)
29255 {
29256 switch (tree_to_uhwi (TYPE_SIZE (type)))
29257 {
29258 case 64:
29259 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
29260 case 128:
29261 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
29262 case 256:
29263 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
29264 }
29265 }
29266 return NULL_TREE;
29267 }
29268 \f
29269 /* Initialize the transactional memory vector load/store builtins. */
29270
29271 static void
29272 ix86_init_tm_builtins (void)
29273 {
29274 enum ix86_builtin_func_type ftype;
29275 const struct builtin_description *d;
29276 size_t i;
29277 tree decl;
29278 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
29279 tree attrs_log, attrs_type_log;
29280
29281 if (!flag_tm)
29282 return;
29283
29284 /* If there are no builtins defined, we must be compiling in a
29285 language without trans-mem support. */
29286 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
29287 return;
29288
29289 /* Use whatever attributes a normal TM load has. */
29290 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
29291 attrs_load = DECL_ATTRIBUTES (decl);
29292 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29293 /* Use whatever attributes a normal TM store has. */
29294 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
29295 attrs_store = DECL_ATTRIBUTES (decl);
29296 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29297 /* Use whatever attributes a normal TM log has. */
29298 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
29299 attrs_log = DECL_ATTRIBUTES (decl);
29300 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29301
29302 for (i = 0, d = bdesc_tm;
29303 i < ARRAY_SIZE (bdesc_tm);
29304 i++, d++)
29305 {
29306 if ((d->mask & ix86_isa_flags) != 0
29307 || (lang_hooks.builtin_function
29308 == lang_hooks.builtin_function_ext_scope))
29309 {
29310 tree type, attrs, attrs_type;
29311 enum built_in_function code = (enum built_in_function) d->code;
29312
29313 ftype = (enum ix86_builtin_func_type) d->flag;
29314 type = ix86_get_builtin_func_type (ftype);
29315
29316 if (BUILTIN_TM_LOAD_P (code))
29317 {
29318 attrs = attrs_load;
29319 attrs_type = attrs_type_load;
29320 }
29321 else if (BUILTIN_TM_STORE_P (code))
29322 {
29323 attrs = attrs_store;
29324 attrs_type = attrs_type_store;
29325 }
29326 else
29327 {
29328 attrs = attrs_log;
29329 attrs_type = attrs_type_log;
29330 }
29331 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
29332 /* The builtin without the prefix for
29333 calling it directly. */
29334 d->name + strlen ("__builtin_"),
29335 attrs);
29336 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
29337 set the TYPE_ATTRIBUTES. */
29338 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
29339
29340 set_builtin_decl (code, decl, false);
29341 }
29342 }
29343 }
29344
29345 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
29346 not in the current target ISA, to allow the user to compile particular
29347 modules with target-specific options that differ from the command line
29348 options. */
29349 static void
29350 ix86_init_mmx_sse_builtins (void)
29351 {
29352 const struct builtin_description * d;
29353 enum ix86_builtin_func_type ftype;
29354 size_t i;
29355
29356 /* Add all special builtins with variable number of operands. */
29357 for (i = 0, d = bdesc_special_args;
29358 i < ARRAY_SIZE (bdesc_special_args);
29359 i++, d++)
29360 {
29361 if (d->name == 0)
29362 continue;
29363
29364 ftype = (enum ix86_builtin_func_type) d->flag;
29365 def_builtin (d->mask, d->name, ftype, d->code);
29366 }
29367
29368 /* Add all builtins with variable number of operands. */
29369 for (i = 0, d = bdesc_args;
29370 i < ARRAY_SIZE (bdesc_args);
29371 i++, d++)
29372 {
29373 if (d->name == 0)
29374 continue;
29375
29376 ftype = (enum ix86_builtin_func_type) d->flag;
29377 def_builtin_const (d->mask, d->name, ftype, d->code);
29378 }
29379
29380 /* pcmpestr[im] insns. */
29381 for (i = 0, d = bdesc_pcmpestr;
29382 i < ARRAY_SIZE (bdesc_pcmpestr);
29383 i++, d++)
29384 {
29385 if (d->code == IX86_BUILTIN_PCMPESTRM128)
29386 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
29387 else
29388 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
29389 def_builtin_const (d->mask, d->name, ftype, d->code);
29390 }
29391
29392 /* pcmpistr[im] insns. */
29393 for (i = 0, d = bdesc_pcmpistr;
29394 i < ARRAY_SIZE (bdesc_pcmpistr);
29395 i++, d++)
29396 {
29397 if (d->code == IX86_BUILTIN_PCMPISTRM128)
29398 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
29399 else
29400 ftype = INT_FTYPE_V16QI_V16QI_INT;
29401 def_builtin_const (d->mask, d->name, ftype, d->code);
29402 }
29403
29404 /* comi/ucomi insns. */
29405 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29406 {
29407 if (d->mask == OPTION_MASK_ISA_SSE2)
29408 ftype = INT_FTYPE_V2DF_V2DF;
29409 else
29410 ftype = INT_FTYPE_V4SF_V4SF;
29411 def_builtin_const (d->mask, d->name, ftype, d->code);
29412 }
29413
29414 /* SSE */
29415 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
29416 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
29417 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
29418 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
29419
29420 /* SSE or 3DNow!A */
29421 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29422 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
29423 IX86_BUILTIN_MASKMOVQ);
29424
29425 /* SSE2 */
29426 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
29427 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
29428
29429 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
29430 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
29431 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
29432 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
29433
29434 /* SSE3. */
29435 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
29436 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
29437 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
29438 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
29439
29440 /* AES */
29441 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
29442 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
29443 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
29444 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
29445 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
29446 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
29447 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
29448 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
29449 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
29450 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
29451 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
29452 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
29453
29454 /* PCLMUL */
29455 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
29456 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
29457
29458 /* RDRND */
29459 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
29460 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
29461 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
29462 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
29463 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
29464 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
29465 IX86_BUILTIN_RDRAND64_STEP);
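 /* Illustrative use (not part of this file): the RDRAND step builtins
 defined just above are what the _rdrand{16,32,64}_step wrappers in
 <immintrin.h> expand to. A minimal user-level sketch, assuming code
 compiled with -mrdrnd; use() and retry_or_fallback() are hypothetical
 helper names introduced only for illustration:

 unsigned int r;
 if (__builtin_ia32_rdrand32_step (&r))
 use (r); // r now holds a hardware random value
 else
 retry_or_fallback (); // the instruction can transiently fail

 The builtin returns nonzero when the hardware produced a value. */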
29466
29467 /* AVX2 */
29468 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
29469 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
29470 IX86_BUILTIN_GATHERSIV2DF);
29471
29472 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
29473 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
29474 IX86_BUILTIN_GATHERSIV4DF);
29475
29476 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
29477 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
29478 IX86_BUILTIN_GATHERDIV2DF);
29479
29480 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
29481 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
29482 IX86_BUILTIN_GATHERDIV4DF);
29483
29484 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
29485 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
29486 IX86_BUILTIN_GATHERSIV4SF);
29487
29488 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
29489 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
29490 IX86_BUILTIN_GATHERSIV8SF);
29491
29492 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
29493 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
29494 IX86_BUILTIN_GATHERDIV4SF);
29495
29496 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
29497 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
29498 IX86_BUILTIN_GATHERDIV8SF);
29499
29500 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
29501 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
29502 IX86_BUILTIN_GATHERSIV2DI);
29503
29504 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
29505 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
29506 IX86_BUILTIN_GATHERSIV4DI);
29507
29508 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
29509 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
29510 IX86_BUILTIN_GATHERDIV2DI);
29511
29512 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
29513 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
29514 IX86_BUILTIN_GATHERDIV4DI);
29515
29516 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
29517 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
29518 IX86_BUILTIN_GATHERSIV4SI);
29519
29520 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
29521 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
29522 IX86_BUILTIN_GATHERSIV8SI);
29523
29524 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
29525 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
29526 IX86_BUILTIN_GATHERDIV4SI);
29527
29528 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
29529 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
29530 IX86_BUILTIN_GATHERDIV8SI);
29531
29532 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
29533 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
29534 IX86_BUILTIN_GATHERALTSIV4DF);
29535
29536 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
29537 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
29538 IX86_BUILTIN_GATHERALTDIV8SF);
29539
29540 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
29541 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
29542 IX86_BUILTIN_GATHERALTSIV4DI);
29543
29544 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
29545 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
29546 IX86_BUILTIN_GATHERALTDIV8SI);
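 /* Illustrative use (not part of this file): the AVX2 gather builtins
 registered above back the gather intrinsics in <immintrin.h>. A hedged
 user-level sketch; gather4() is a hypothetical wrapper name:

 #include <immintrin.h>
 __m128 gather4 (const float *base, __m128i idx)
 {
 // Loads base[idx[0..3]]; a scale of 4 selects float-sized elements.
 return _mm_i32gather_ps (base, idx, 4);
 }

 Internally the intrinsic expands to __builtin_ia32_gathersiv4sf with an
 all-ones mask; the exact wrapper details live in avx2intrin.h. */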
29547
29548 /* RTM. */
29549 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
29550 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
29551
29552 /* MMX access to the vec_init patterns. */
29553 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
29554 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
29555
29556 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
29557 V4HI_FTYPE_HI_HI_HI_HI,
29558 IX86_BUILTIN_VEC_INIT_V4HI);
29559
29560 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
29561 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
29562 IX86_BUILTIN_VEC_INIT_V8QI);
29563
29564 /* Access to the vec_extract patterns. */
29565 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
29566 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
29567 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
29568 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
29569 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
29570 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
29571 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
29572 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
29573 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
29574 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
29575
29576 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29577 "__builtin_ia32_vec_ext_v4hi",
29578 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
29579
29580 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
29581 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
29582
29583 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
29584 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
29585
29586 /* Access to the vec_set patterns. */
29587 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
29588 "__builtin_ia32_vec_set_v2di",
29589 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
29590
29591 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
29592 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
29593
29594 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
29595 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
29596
29597 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
29598 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
29599
29600 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29601 "__builtin_ia32_vec_set_v4hi",
29602 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
29603
29604 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
29605 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
29606
29607 /* RDSEED */
29608 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
29609 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
29610 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
29611 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
29612 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
29613 "__builtin_ia32_rdseed_di_step",
29614 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
29615
29616 /* ADCX */
29617 def_builtin (0, "__builtin_ia32_addcarryx_u32",
29618 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
29619 def_builtin (OPTION_MASK_ISA_64BIT,
29620 "__builtin_ia32_addcarryx_u64",
29621 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
29622 IX86_BUILTIN_ADDCARRYX64);
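 /* Illustrative use (not part of this file): a sketch of multi-word
 addition with the carry-propagating builtin defined just above.
 a_lo, a_hi, b_lo and b_hi are hypothetical operand names:

 unsigned int lo, hi;
 unsigned char c;
 c = __builtin_ia32_addcarryx_u32 (0, a_lo, b_lo, &lo); // carry out in c
 c = __builtin_ia32_addcarryx_u32 (c, a_hi, b_hi, &hi); // add with carry in

 The builtin returns the carry out of each 32-bit addition. */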
29623
29624 /* Add FMA4 multi-arg argument instructions */
29625 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29626 {
29627 if (d->name == 0)
29628 continue;
29629
29630 ftype = (enum ix86_builtin_func_type) d->flag;
29631 def_builtin_const (d->mask, d->name, ftype, d->code);
29632 }
29633 }
29634
29635 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
29636 to return a pointer to VERSION_DECL if the outcome of the expression
29637 formed by PREDICATE_CHAIN is true. This function will be called during
29638 version dispatch to decide which function version to execute. It returns
29639 the basic block at the end, to which more conditions can be added. */
29640
29641 static basic_block
29642 add_condition_to_bb (tree function_decl, tree version_decl,
29643 tree predicate_chain, basic_block new_bb)
29644 {
29645 gimple return_stmt;
29646 tree convert_expr, result_var;
29647 gimple convert_stmt;
29648 gimple call_cond_stmt;
29649 gimple if_else_stmt;
29650
29651 basic_block bb1, bb2, bb3;
29652 edge e12, e23;
29653
29654 tree cond_var, and_expr_var = NULL_TREE;
29655 gimple_seq gseq;
29656
29657 tree predicate_decl, predicate_arg;
29658
29659 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
29660
29661 gcc_assert (new_bb != NULL);
29662 gseq = bb_seq (new_bb);
29663
29664
29665 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
29666 build_fold_addr_expr (version_decl));
29667 result_var = create_tmp_var (ptr_type_node, NULL);
29668 convert_stmt = gimple_build_assign (result_var, convert_expr);
29669 return_stmt = gimple_build_return (result_var);
29670
29671 if (predicate_chain == NULL_TREE)
29672 {
29673 gimple_seq_add_stmt (&gseq, convert_stmt);
29674 gimple_seq_add_stmt (&gseq, return_stmt);
29675 set_bb_seq (new_bb, gseq);
29676 gimple_set_bb (convert_stmt, new_bb);
29677 gimple_set_bb (return_stmt, new_bb);
29678 pop_cfun ();
29679 return new_bb;
29680 }
29681
29682 while (predicate_chain != NULL)
29683 {
29684 cond_var = create_tmp_var (integer_type_node, NULL);
29685 predicate_decl = TREE_PURPOSE (predicate_chain);
29686 predicate_arg = TREE_VALUE (predicate_chain);
29687 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
29688 gimple_call_set_lhs (call_cond_stmt, cond_var);
29689
29690 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
29691 gimple_set_bb (call_cond_stmt, new_bb);
29692 gimple_seq_add_stmt (&gseq, call_cond_stmt);
29693
29694 predicate_chain = TREE_CHAIN (predicate_chain);
29695
29696 if (and_expr_var == NULL)
29697 and_expr_var = cond_var;
29698 else
29699 {
29700 gimple assign_stmt;
29701 /* Use MIN_EXPR to check whether any integer is zero:
29702 and_expr_var = min_expr <cond_var, and_expr_var>. */
29703 assign_stmt = gimple_build_assign (and_expr_var,
29704 build2 (MIN_EXPR, integer_type_node,
29705 cond_var, and_expr_var));
29706
29707 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
29708 gimple_set_bb (assign_stmt, new_bb);
29709 gimple_seq_add_stmt (&gseq, assign_stmt);
29710 }
29711 }
29712
29713 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
29714 integer_zero_node,
29715 NULL_TREE, NULL_TREE);
29716 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
29717 gimple_set_bb (if_else_stmt, new_bb);
29718 gimple_seq_add_stmt (&gseq, if_else_stmt);
29719
29720 gimple_seq_add_stmt (&gseq, convert_stmt);
29721 gimple_seq_add_stmt (&gseq, return_stmt);
29722 set_bb_seq (new_bb, gseq);
29723
29724 bb1 = new_bb;
29725 e12 = split_block (bb1, if_else_stmt);
29726 bb2 = e12->dest;
29727 e12->flags &= ~EDGE_FALLTHRU;
29728 e12->flags |= EDGE_TRUE_VALUE;
29729
29730 e23 = split_block (bb2, return_stmt);
29731
29732 gimple_set_bb (convert_stmt, bb2);
29733 gimple_set_bb (return_stmt, bb2);
29734
29735 bb3 = e23->dest;
29736 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
29737
29738 remove_edge (e23);
29739 make_edge (bb2, EXIT_BLOCK_PTR, 0);
29740
29741 pop_cfun ();
29742
29743 return bb3;
29744 }
29745
29746 /* This parses the attribute arguments to target in DECL and determines
29747 the right builtin to use to match the platform specification.
29748 It returns the priority value for this version decl. If PREDICATE_LIST
29749 is not NULL, it stores the list of cpu features that need to be checked
29750 before dispatching this function. */
29751
29752 static unsigned int
29753 get_builtin_code_for_version (tree decl, tree *predicate_list)
29754 {
29755 tree attrs;
29756 struct cl_target_option cur_target;
29757 tree target_node;
29758 struct cl_target_option *new_target;
29759 const char *arg_str = NULL;
29760 const char *attrs_str = NULL;
29761 char *tok_str = NULL;
29762 char *token;
29763
29764 /* Priority of i386 features, greater value is higher priority. This is
29765 used to decide the order in which function dispatch must happen. For
29766 instance, a version specialized for SSE4.2 should be checked for dispatch
29767 before a version for SSE3, as SSE4.2 implies SSE3. */
29768 enum feature_priority
29769 {
29770 P_ZERO = 0,
29771 P_MMX,
29772 P_SSE,
29773 P_SSE2,
29774 P_SSE3,
29775 P_SSSE3,
29776 P_PROC_SSSE3,
29777 P_SSE4_a,
29778 P_PROC_SSE4_a,
29779 P_SSE4_1,
29780 P_SSE4_2,
29781 P_PROC_SSE4_2,
29782 P_POPCNT,
29783 P_AVX,
29784 P_AVX2,
29785 P_FMA,
29786 P_PROC_FMA
29787 };
29788
29789 enum feature_priority priority = P_ZERO;
29790
29791 /* These are the target attribute strings for which a dispatcher is
29792 available, from fold_builtin_cpu. */
29793
29794 static struct _feature_list
29795 {
29796 const char *const name;
29797 const enum feature_priority priority;
29798 }
29799 const feature_list[] =
29800 {
29801 {"mmx", P_MMX},
29802 {"sse", P_SSE},
29803 {"sse2", P_SSE2},
29804 {"sse3", P_SSE3},
29805 {"ssse3", P_SSSE3},
29806 {"sse4.1", P_SSE4_1},
29807 {"sse4.2", P_SSE4_2},
29808 {"popcnt", P_POPCNT},
29809 {"avx", P_AVX},
29810 {"avx2", P_AVX2}
29811 };
29812
29813
29814 static unsigned int NUM_FEATURES
29815 = sizeof (feature_list) / sizeof (struct _feature_list);
29816
29817 unsigned int i;
29818
29819 tree predicate_chain = NULL_TREE;
29820 tree predicate_decl, predicate_arg;
29821
29822 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29823 gcc_assert (attrs != NULL);
29824
29825 attrs = TREE_VALUE (TREE_VALUE (attrs));
29826
29827 gcc_assert (TREE_CODE (attrs) == STRING_CST);
29828 attrs_str = TREE_STRING_POINTER (attrs);
29829
29830 /* Return priority zero for default function. */
29831 if (strcmp (attrs_str, "default") == 0)
29832 return 0;
29833
29834 /* Handle arch= if specified. For priority, set it to be 1 more than
29835 the best instruction set the processor can handle. For instance, if
29836 there is a version for atom and a version for ssse3 (the highest ISA
29837 priority for atom), the atom version must be checked for dispatch
29838 before the ssse3 version. */
29839 if (strstr (attrs_str, "arch=") != NULL)
29840 {
29841 cl_target_option_save (&cur_target, &global_options);
29842 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
29843 &global_options_set);
29844
29845 gcc_assert (target_node);
29846 new_target = TREE_TARGET_OPTION (target_node);
29847 gcc_assert (new_target);
29848
29849 if (new_target->arch_specified && new_target->arch > 0)
29850 {
29851 switch (new_target->arch)
29852 {
29853 case PROCESSOR_CORE2:
29854 arg_str = "core2";
29855 priority = P_PROC_SSSE3;
29856 break;
29857 case PROCESSOR_COREI7:
29858 arg_str = "corei7";
29859 priority = P_PROC_SSE4_2;
29860 break;
29861 case PROCESSOR_COREI7_AVX:
29862 arg_str = "corei7-avx";
29863 priority = P_PROC_SSE4_2;
29864 break;
29865 case PROCESSOR_ATOM:
29866 arg_str = "atom";
29867 priority = P_PROC_SSSE3;
29868 break;
29869 case PROCESSOR_AMDFAM10:
29870 arg_str = "amdfam10h";
29871 priority = P_PROC_SSE4_a;
29872 break;
29873 case PROCESSOR_BDVER1:
29874 arg_str = "bdver1";
29875 priority = P_PROC_FMA;
29876 break;
29877 case PROCESSOR_BDVER2:
29878 arg_str = "bdver2";
29879 priority = P_PROC_FMA;
29880 break;
29881 }
29882 }
29883
29884 cl_target_option_restore (&global_options, &cur_target);
29885
29886 if (predicate_list && arg_str == NULL)
29887 {
29888 error_at (DECL_SOURCE_LOCATION (decl),
29889 "No dispatcher found for the versioning attributes");
29890 return 0;
29891 }
29892
29893 if (predicate_list)
29894 {
29895 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
29896 /* For a C string literal the length includes the trailing NULL. */
29897 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
29898 predicate_chain = tree_cons (predicate_decl, predicate_arg,
29899 predicate_chain);
29900 }
29901 }
29902
29903 /* Process feature name. */
29904 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
29905 strcpy (tok_str, attrs_str);
29906 token = strtok (tok_str, ",");
29907 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
29908
29909 while (token != NULL)
29910 {
29911 /* Do not process "arch=" */
29912 if (strncmp (token, "arch=", 5) == 0)
29913 {
29914 token = strtok (NULL, ",");
29915 continue;
29916 }
29917 for (i = 0; i < NUM_FEATURES; ++i)
29918 {
29919 if (strcmp (token, feature_list[i].name) == 0)
29920 {
29921 if (predicate_list)
29922 {
29923 predicate_arg = build_string_literal (
29924 strlen (feature_list[i].name) + 1,
29925 feature_list[i].name);
29926 predicate_chain = tree_cons (predicate_decl, predicate_arg,
29927 predicate_chain);
29928 }
29929 /* Find the maximum priority feature. */
29930 if (feature_list[i].priority > priority)
29931 priority = feature_list[i].priority;
29932
29933 break;
29934 }
29935 }
29936 if (predicate_list && i == NUM_FEATURES)
29937 {
29938 error_at (DECL_SOURCE_LOCATION (decl),
29939 "No dispatcher found for %s", token);
29940 return 0;
29941 }
29942 token = strtok (NULL, ",");
29943 }
29944 free (tok_str);
29945
29946 if (predicate_list && predicate_chain == NULL_TREE)
29947 {
29948 error_at (DECL_SOURCE_LOCATION (decl),
29949 "No dispatcher found for the versioning attributes : %s",
29950 attrs_str);
29951 return 0;
29952 }
29953 else if (predicate_list)
29954 {
29955 predicate_chain = nreverse (predicate_chain);
29956 *predicate_list = predicate_chain;
29957 }
29958
29959 return priority;
29960 }
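 /* For example (a hedged illustration of the logic above):

 __attribute__ ((target ("arch=corei7"))) int foo (void); // priority P_PROC_SSE4_2,
 // predicate __builtin_cpu_is ("corei7")
 __attribute__ ((target ("avx2"))) int foo (void); // priority P_AVX2,
 // predicate __builtin_cpu_supports ("avx2")

 Since P_AVX2 is greater than P_PROC_SSE4_2 in the enum above, the avx2
 version is checked before the corei7 version at dispatch time. */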
29961
29962 /* This compares the priority of target features in function DECL1
29963 and DECL2. It returns positive value if DECL1 is higher priority,
29964 negative value if DECL2 is higher priority and 0 if they are the
29965 same. */
29966
29967 static int
29968 ix86_compare_version_priority (tree decl1, tree decl2)
29969 {
29970 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
29971 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
29972
29973 return (int)priority1 - (int)priority2;
29974 }
29975
29976 /* V1 and V2 point to function versions with different priorities
29977 based on the target ISA. This function compares their priorities. */
29978
29979 static int
29980 feature_compare (const void *v1, const void *v2)
29981 {
29982 typedef struct _function_version_info
29983 {
29984 tree version_decl;
29985 tree predicate_chain;
29986 unsigned int dispatch_priority;
29987 } function_version_info;
29988
29989 const function_version_info c1 = *(const function_version_info *)v1;
29990 const function_version_info c2 = *(const function_version_info *)v2;
29991 return (c2.dispatch_priority - c1.dispatch_priority);
29992 }
29993
29994 /* This function generates the dispatch function for
29995 multi-versioned functions. DISPATCH_DECL is the function which will
29996 contain the dispatch logic. FNDECLS holds the function choices for
29997 dispatch (it is passed as a vector of decls). EMPTY_BB is the basic block
29998 pointer in DISPATCH_DECL in which the dispatch code is generated. */
29999
30000 static int
30001 dispatch_function_versions (tree dispatch_decl,
30002 void *fndecls_p,
30003 basic_block *empty_bb)
30004 {
30005 tree default_decl;
30006 gimple ifunc_cpu_init_stmt;
30007 gimple_seq gseq;
30008 int ix;
30009 tree ele;
30010 vec<tree> *fndecls;
30011 unsigned int num_versions = 0;
30012 unsigned int actual_versions = 0;
30013 unsigned int i;
30014
30015 struct _function_version_info
30016 {
30017 tree version_decl;
30018 tree predicate_chain;
30019 unsigned int dispatch_priority;
30020 } *function_version_info;
30021
30022 gcc_assert (dispatch_decl != NULL
30023 && fndecls_p != NULL
30024 && empty_bb != NULL);
30025
30026 /* fndecls_p is actually a vector. */
30027 fndecls = static_cast<vec<tree> *> (fndecls_p);
30028
30029 /* At least one more version other than the default. */
30030 num_versions = fndecls->length ();
30031 gcc_assert (num_versions >= 2);
30032
30033 function_version_info = (struct _function_version_info *)
30034 XNEWVEC (struct _function_version_info, (num_versions - 1));
30035
30036 /* The first version in the vector is the default decl. */
30037 default_decl = (*fndecls)[0];
30038
30039 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
30040
30041 gseq = bb_seq (*empty_bb);
30042 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
30043 constructors, so explicitly call __builtin_cpu_init here. */
30044 ifunc_cpu_init_stmt = gimple_build_call_vec (
30045 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
30046 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
30047 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
30048 set_bb_seq (*empty_bb, gseq);
30049
30050 pop_cfun ();
30051
30052
30053 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
30054 {
30055 tree version_decl = ele;
30056 tree predicate_chain = NULL_TREE;
30057 unsigned int priority;
30058 /* Get attribute string, parse it and find the right predicate decl.
30059 The predicate function could be a lengthy combination of many
30060 features, like arch-type and various isa-variants. */
30061 priority = get_builtin_code_for_version (version_decl,
30062 &predicate_chain);
30063
30064 if (predicate_chain == NULL_TREE)
30065 continue;
30066
30067 function_version_info [actual_versions].version_decl = version_decl;
30068 function_version_info [actual_versions].predicate_chain
30069 = predicate_chain;
30070 function_version_info [actual_versions].dispatch_priority = priority;
30071 actual_versions++;
30072 }
30073
30074 /* Sort the versions according to descending order of dispatch priority. The
30075 priority is based on the ISA. This is not a perfect solution. There
30076 could still be ambiguity. If more than one function version is suitable
30077 to execute, which one should be dispatched? In future, allow the user
30078 to specify a dispatch priority next to the version. */
30079 qsort (function_version_info, actual_versions,
30080 sizeof (struct _function_version_info), feature_compare);
30081
30082 for (i = 0; i < actual_versions; ++i)
30083 *empty_bb = add_condition_to_bb (dispatch_decl,
30084 function_version_info[i].version_decl,
30085 function_version_info[i].predicate_chain,
30086 *empty_bb);
30087
30088 /* Dispatch the default version at the end. */
30089 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
30090 NULL, *empty_bb);
30091
30092 free (function_version_info);
30093 return 0;
30094 }
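 /* A hedged sketch of the resolver body this function builds (written
 here in C rather than GIMPLE), assuming two non-default versions plus a
 default; foo_resolver, foo_avx2, foo_corei7 and foo_default are
 hypothetical names used only for illustration:

 void *foo_resolver (void)
 {
 __builtin_cpu_init ();
 if (__builtin_cpu_supports ("avx2"))
 return foo_avx2;
 if (__builtin_cpu_is ("corei7"))
 return foo_corei7;
 return foo_default;
 }

 Each test comes from one call to add_condition_to_bb above; the default
 version is appended last with a NULL predicate chain. */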
30095
30096 /* Comparator function to be used in qsort routine to sort attribute
30097 specification strings to "target". */
30098
30099 static int
30100 attr_strcmp (const void *v1, const void *v2)
30101 {
30102 const char *c1 = *(char *const*)v1;
30103 const char *c2 = *(char *const*)v2;
30104 return strcmp (c1, c2);
30105 }
30106
30107 /* ARGLIST is the argument to the target attribute. This function tokenizes
30108 the comma-separated arguments, sorts them, and returns a string which
30109 is a unique identifier for the comma-separated arguments. It also
30110 replaces non-identifier characters "=,-" with "_". */
30111
30112 static char *
30113 sorted_attr_string (tree arglist)
30114 {
30115 tree arg;
30116 size_t str_len_sum = 0;
30117 char **args = NULL;
30118 char *attr_str, *ret_str;
30119 char *attr = NULL;
30120 unsigned int argnum = 1;
30121 unsigned int i;
30122
30123 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
30124 {
30125 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
30126 size_t len = strlen (str);
30127 str_len_sum += len + 1;
30128 if (arg != arglist)
30129 argnum++;
30130 for (i = 0; i < strlen (str); i++)
30131 if (str[i] == ',')
30132 argnum++;
30133 }
30134
30135 attr_str = XNEWVEC (char, str_len_sum);
30136 str_len_sum = 0;
30137 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
30138 {
30139 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
30140 size_t len = strlen (str);
30141 memcpy (attr_str + str_len_sum, str, len);
30142 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
30143 str_len_sum += len + 1;
30144 }
30145
30146 /* Replace "=,-" with "_". */
30147 for (i = 0; i < strlen (attr_str); i++)
30148 if (attr_str[i] == '=' || attr_str[i]== '-')
30149 attr_str[i] = '_';
30150
30151 if (argnum == 1)
30152 return attr_str;
30153
30154 args = XNEWVEC (char *, argnum);
30155
30156 i = 0;
30157 attr = strtok (attr_str, ",");
30158 while (attr != NULL)
30159 {
30160 args[i] = attr;
30161 i++;
30162 attr = strtok (NULL, ",");
30163 }
30164
30165 qsort (args, argnum, sizeof (char *), attr_strcmp);
30166
30167 ret_str = XNEWVEC (char, str_len_sum);
30168 str_len_sum = 0;
30169 for (i = 0; i < argnum; i++)
30170 {
30171 size_t len = strlen (args[i]);
30172 memcpy (ret_str + str_len_sum, args[i], len);
30173 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
30174 str_len_sum += len + 1;
30175 }
30176
30177 XDELETEVEC (args);
30178 XDELETEVEC (attr_str);
30179 return ret_str;
30180 }
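 /* For example (illustrative): for __attribute__ ((target ("avx,arch=corei7")))
 the argument string "avx,arch=corei7" first has '=' rewritten to '_',
 giving "avx,arch_corei7"; the comma-separated tokens are then sorted and
 joined with '_', producing "arch_corei7_avx". */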
30181
30182 /* This function changes the assembler name for functions that are
30183 versions. If DECL is a function version and has a "target"
30184 attribute, it appends the attribute string to its assembler name. */
30185
30186 static tree
30187 ix86_mangle_function_version_assembler_name (tree decl, tree id)
30188 {
30189 tree version_attr;
30190 const char *orig_name, *version_string;
30191 char *attr_str, *assembler_name;
30192
30193 if (DECL_DECLARED_INLINE_P (decl)
30194 && lookup_attribute ("gnu_inline",
30195 DECL_ATTRIBUTES (decl)))
30196 error_at (DECL_SOURCE_LOCATION (decl),
30197 "Function versions cannot be marked as gnu_inline,"
30198 " bodies have to be generated");
30199
30200 if (DECL_VIRTUAL_P (decl)
30201 || DECL_VINDEX (decl))
30202 sorry ("Virtual function multiversioning not supported");
30203
30204 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
30205
30206 /* target attribute string cannot be NULL. */
30207 gcc_assert (version_attr != NULL_TREE);
30208
30209 orig_name = IDENTIFIER_POINTER (id);
30210 version_string
30211 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
30212
30213 if (strcmp (version_string, "default") == 0)
30214 return id;
30215
30216 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
30217 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
30218
30219 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
30220
30221 /* Allow assembler name to be modified if already set. */
30222 if (DECL_ASSEMBLER_NAME_SET_P (decl))
30223 SET_DECL_RTL (decl, NULL);
30224
30225 tree ret = get_identifier (assembler_name);
30226 XDELETEVEC (attr_str);
30227 XDELETEVEC (assembler_name);
30228 return ret;
30229 }
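 /* For example (illustrative): a version of foo declared with
 __attribute__ ((target ("avx"))) gets the assembler name "foo.avx"
 ("_Z3foov.avx" under C++ mangling), while the "default" version keeps
 its original assembler name unchanged. */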
30230
30231 /* This function returns true if FN1 and FN2 are versions of the same function,
30232 that is, the target strings of the function decls are different. This assumes
30233 that FN1 and FN2 have the same signature. */
30234
30235 static bool
30236 ix86_function_versions (tree fn1, tree fn2)
30237 {
30238 tree attr1, attr2;
30239 char *target1, *target2;
30240 bool result;
30241
30242 if (TREE_CODE (fn1) != FUNCTION_DECL
30243 || TREE_CODE (fn2) != FUNCTION_DECL)
30244 return false;
30245
30246 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
30247 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
30248
30249 /* At least one function decl should have the target attribute specified. */
30250 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
30251 return false;
30252
30253 /* Diagnose missing target attribute if one of the decls is already
30254 multi-versioned. */
30255 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
30256 {
30257 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
30258 {
30259 if (attr2 != NULL_TREE)
30260 {
30261 tree tem = fn1;
30262 fn1 = fn2;
30263 fn2 = tem;
30264 attr1 = attr2;
30265 }
30266 error_at (DECL_SOURCE_LOCATION (fn2),
30267 "missing %<target%> attribute for multi-versioned %D",
30268 fn2);
30269 inform (DECL_SOURCE_LOCATION (fn1),
30270 "previous declaration of %D", fn1);
30271 /* Prevent diagnosing of the same error multiple times. */
30272 DECL_ATTRIBUTES (fn2)
30273 = tree_cons (get_identifier ("target"),
30274 copy_node (TREE_VALUE (attr1)),
30275 DECL_ATTRIBUTES (fn2));
30276 }
30277 return false;
30278 }
30279
30280 target1 = sorted_attr_string (TREE_VALUE (attr1));
30281 target2 = sorted_attr_string (TREE_VALUE (attr2));
30282
30283 /* The sorted target strings must be different for fn1 and fn2
30284 to be versions. */
30285 if (strcmp (target1, target2) == 0)
30286 result = false;
30287 else
30288 result = true;
30289
30290 XDELETEVEC (target1);
30291 XDELETEVEC (target2);
30292
30293 return result;
30294 }
30295
30296 static tree
30297 ix86_mangle_decl_assembler_name (tree decl, tree id)
30298 {
30299 /* For function version, add the target suffix to the assembler name. */
30300 if (TREE_CODE (decl) == FUNCTION_DECL
30301 && DECL_FUNCTION_VERSIONED (decl))
30302 id = ix86_mangle_function_version_assembler_name (decl, id);
30303 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
30304 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
30305 #endif
30306
30307 return id;
30308 }
30309
30310 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
30311 is true, append the full path name of the source file. */
30312
30313 static char *
30314 make_name (tree decl, const char *suffix, bool make_unique)
30315 {
30316 char *global_var_name;
30317 int name_len;
30318 const char *name;
30319 const char *unique_name = NULL;
30320
30321 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
30322
30323 /* Get a unique name that can be used globally without any chances
30324 of collision at link time. */
30325 if (make_unique)
30326 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
30327
30328 name_len = strlen (name) + strlen (suffix) + 2;
30329
30330 if (make_unique)
30331 name_len += strlen (unique_name) + 1;
30332 global_var_name = XNEWVEC (char, name_len);
30333
30334 /* Use '.' to concatenate names as it is demangler friendly. */
30335 if (make_unique)
30336 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
30337 suffix);
30338 else
30339 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
30340
30341 return global_var_name;
30342 }
30343
30344 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
30345
30346 /* Make a dispatcher declaration for the multi-versioned function DECL.
30347 Calls to DECL function will be replaced with calls to the dispatcher
30348 by the front-end. Return the decl created. */
30349
30350 static tree
30351 make_dispatcher_decl (const tree decl)
30352 {
30353 tree func_decl;
30354 char *func_name;
30355 tree fn_type, func_type;
30356 bool is_uniq = false;
30357
30358 if (TREE_PUBLIC (decl) == 0)
30359 is_uniq = true;
30360
30361 func_name = make_name (decl, "ifunc", is_uniq);
30362
30363 fn_type = TREE_TYPE (decl);
30364 func_type = build_function_type (TREE_TYPE (fn_type),
30365 TYPE_ARG_TYPES (fn_type));
30366
30367 func_decl = build_fn_decl (func_name, func_type);
30368 XDELETEVEC (func_name);
30369 TREE_USED (func_decl) = 1;
30370 DECL_CONTEXT (func_decl) = NULL_TREE;
30371 DECL_INITIAL (func_decl) = error_mark_node;
30372 DECL_ARTIFICIAL (func_decl) = 1;
30373 /* Mark this function as external; the resolver will flip it again if
30374 it gets generated. */
30375 DECL_EXTERNAL (func_decl) = 1;
30376 /* This will be an IFUNC, and IFUNCs have to be externally visible. */
30377 TREE_PUBLIC (func_decl) = 1;
30378
30379 return func_decl;
30380 }
30381
30382 #endif
30383
30384 /* Returns true if DECL is multi-versioned and is the default function,
30385 that is, it is not tagged with a target-specific optimization. */
30386
30387 static bool
30388 is_function_default_version (const tree decl)
30389 {
30390 if (TREE_CODE (decl) != FUNCTION_DECL
30391 || !DECL_FUNCTION_VERSIONED (decl))
30392 return false;
30393 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
30394 gcc_assert (attr);
30395 attr = TREE_VALUE (TREE_VALUE (attr));
30396 return (TREE_CODE (attr) == STRING_CST
30397 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
30398 }
30399
30400 /* Make a dispatcher declaration for the multi-versioned function DECL.
30401 Calls to DECL function will be replaced with calls to the dispatcher
30402 by the front-end. Returns the decl of the dispatcher function. */
30403
30404 static tree
30405 ix86_get_function_versions_dispatcher (void *decl)
30406 {
30407 tree fn = (tree) decl;
30408 struct cgraph_node *node = NULL;
30409 struct cgraph_node *default_node = NULL;
30410 struct cgraph_function_version_info *node_v = NULL;
30411 struct cgraph_function_version_info *first_v = NULL;
30412
30413 tree dispatch_decl = NULL;
30414
30415 struct cgraph_function_version_info *default_version_info = NULL;
30416
30417 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
30418
30419 node = cgraph_get_node (fn);
30420 gcc_assert (node != NULL);
30421
30422 node_v = get_cgraph_node_version (node);
30423 gcc_assert (node_v != NULL);
30424
30425 if (node_v->dispatcher_resolver != NULL)
30426 return node_v->dispatcher_resolver;
30427
30428 /* Find the default version and make it the first node. */
30429 first_v = node_v;
30430 /* Go to the beginning of the chain. */
30431 while (first_v->prev != NULL)
30432 first_v = first_v->prev;
30433 default_version_info = first_v;
30434 while (default_version_info != NULL)
30435 {
30436 if (is_function_default_version
30437 (default_version_info->this_node->decl))
30438 break;
30439 default_version_info = default_version_info->next;
30440 }
30441
30442 /* If there is no default node, just return NULL. */
30443 if (default_version_info == NULL)
30444 return NULL;
30445
30446 /* Make default info the first node. */
30447 if (first_v != default_version_info)
30448 {
30449 default_version_info->prev->next = default_version_info->next;
30450 if (default_version_info->next)
30451 default_version_info->next->prev = default_version_info->prev;
30452 first_v->prev = default_version_info;
30453 default_version_info->next = first_v;
30454 default_version_info->prev = NULL;
30455 }
30456
30457 default_node = default_version_info->this_node;
30458
30459 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
30460 if (targetm.has_ifunc_p ())
30461 {
30462 struct cgraph_function_version_info *it_v = NULL;
30463 struct cgraph_node *dispatcher_node = NULL;
30464 struct cgraph_function_version_info *dispatcher_version_info = NULL;
30465
30466 /* Right now, the dispatching is done via ifunc. */
30467 dispatch_decl = make_dispatcher_decl (default_node->decl);
30468
30469 dispatcher_node = cgraph_get_create_node (dispatch_decl);
30470 gcc_assert (dispatcher_node != NULL);
30471 dispatcher_node->dispatcher_function = 1;
30472 dispatcher_version_info
30473 = insert_new_cgraph_node_version (dispatcher_node);
30474 dispatcher_version_info->next = default_version_info;
30475 dispatcher_node->definition = 1;
30476
30477 /* Set the dispatcher for all the versions. */
30478 it_v = default_version_info;
30479 while (it_v != NULL)
30480 {
30481 it_v->dispatcher_resolver = dispatch_decl;
30482 it_v = it_v->next;
30483 }
30484 }
30485 else
30486 #endif
30487 {
30488 error_at (DECL_SOURCE_LOCATION (default_node->decl),
30489 "multiversioning needs ifunc which is not supported "
30490 "on this target");
30491 }
30492
30493 return dispatch_decl;
30494 }
30495
30496 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
30497 it to CHAIN. */
30498
30499 static tree
30500 make_attribute (const char *name, const char *arg_name, tree chain)
30501 {
30502 tree attr_name;
30503 tree attr_arg_name;
30504 tree attr_args;
30505 tree attr;
30506
30507 attr_name = get_identifier (name);
30508 attr_arg_name = build_string (strlen (arg_name), arg_name);
30509 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
30510 attr = tree_cons (attr_name, attr_args, chain);
30511 return attr;
30512 }
30513
30514 /* Make the resolver function decl to dispatch the versions of
30515 a multi-versioned function, DEFAULT_DECL. Create an
30516 empty basic block in the resolver and store the pointer in
30517 EMPTY_BB. Return the decl of the resolver function. */
30518
30519 static tree
30520 make_resolver_func (const tree default_decl,
30521 const tree dispatch_decl,
30522 basic_block *empty_bb)
30523 {
30524 char *resolver_name;
30525 tree decl, type, decl_name, t;
30526 bool is_uniq = false;
30527
30528 /* IFUNCs have to be globally visible. So, if the default_decl is
30529 not, then the name of the IFUNC should be made unique. */
30530 if (TREE_PUBLIC (default_decl) == 0)
30531 is_uniq = true;
30532
30533 /* Append the filename to the resolver function if the versions are
30534 not externally visible. This is because the resolver function has
30535 to be externally visible for the loader to find it. So, appending
30536 the filename will prevent conflicts with a resolver function from
30537 another module which is based on the same version name. */
30538 resolver_name = make_name (default_decl, "resolver", is_uniq);
30539
30540 /* The resolver function should return a (void *). */
30541 type = build_function_type_list (ptr_type_node, NULL_TREE);
30542
30543 decl = build_fn_decl (resolver_name, type);
30544 decl_name = get_identifier (resolver_name);
30545 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
30546
30547 DECL_NAME (decl) = decl_name;
30548 TREE_USED (decl) = 1;
30549 DECL_ARTIFICIAL (decl) = 1;
30550 DECL_IGNORED_P (decl) = 0;
30551 /* IFUNC resolvers have to be externally visible. */
30552 TREE_PUBLIC (decl) = 1;
30553 DECL_UNINLINABLE (decl) = 1;
30554
30555 /* Resolver is not external, body is generated. */
30556 DECL_EXTERNAL (decl) = 0;
30557 DECL_EXTERNAL (dispatch_decl) = 0;
30558
30559 DECL_CONTEXT (decl) = NULL_TREE;
30560 DECL_INITIAL (decl) = make_node (BLOCK);
30561 DECL_STATIC_CONSTRUCTOR (decl) = 0;
30562
30563 if (DECL_COMDAT_GROUP (default_decl)
30564 || TREE_PUBLIC (default_decl))
30565 {
30566 /* In this case, each translation unit with a call to this
30567 versioned function will put out a resolver. Ensure it
30568 is comdat to keep just one copy. */
30569 DECL_COMDAT (decl) = 1;
30570 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
30571 }
30572 /* Build result decl and add to function_decl. */
30573 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
30574 DECL_ARTIFICIAL (t) = 1;
30575 DECL_IGNORED_P (t) = 1;
30576 DECL_RESULT (decl) = t;
30577
30578 gimplify_function_tree (decl);
30579 push_cfun (DECL_STRUCT_FUNCTION (decl));
30580 *empty_bb = init_lowered_empty_function (decl, false);
30581
30582 cgraph_add_new_function (decl, true);
30583 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
30584
30585 pop_cfun ();
30586
30587 gcc_assert (dispatch_decl != NULL);
30588 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
30589 DECL_ATTRIBUTES (dispatch_decl)
30590 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
30591
30592 /* Create the alias for dispatch to resolver here. */
30593 /*cgraph_create_function_alias (dispatch_decl, decl);*/
30594 cgraph_same_body_alias (NULL, dispatch_decl, decl);
30595 XDELETEVEC (resolver_name);
30596 return decl;
30597 }
30598
30599 /* Generate the dispatching code body to dispatch multi-versioned function
30600 DECL. The target hook is called to process the "target" attributes and
30601 provide the code to dispatch the right function at run-time. NODE points
30602 to the dispatcher decl whose body will be created. */
30603
30604 static tree
30605 ix86_generate_version_dispatcher_body (void *node_p)
30606 {
30607 tree resolver_decl;
30608 basic_block empty_bb;
30609 vec<tree> fn_ver_vec = vNULL;
30610 tree default_ver_decl;
30611 struct cgraph_node *versn;
30612 struct cgraph_node *node;
30613
30614 struct cgraph_function_version_info *node_version_info = NULL;
30615 struct cgraph_function_version_info *versn_info = NULL;
30616
30617 node = (cgraph_node *)node_p;
30618
30619 node_version_info = get_cgraph_node_version (node);
30620 gcc_assert (node->dispatcher_function
30621 && node_version_info != NULL);
30622
30623 if (node_version_info->dispatcher_resolver)
30624 return node_version_info->dispatcher_resolver;
30625
30626 /* The first version in the chain corresponds to the default version. */
30627 default_ver_decl = node_version_info->next->this_node->decl;
30628
30629 /* node is going to be an alias, so remove the finalized bit. */
30630 node->definition = false;
30631
30632 resolver_decl = make_resolver_func (default_ver_decl,
30633 node->decl, &empty_bb);
30634
30635 node_version_info->dispatcher_resolver = resolver_decl;
30636
30637 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
30638
30639 fn_ver_vec.create (2);
30640
30641 for (versn_info = node_version_info->next; versn_info;
30642 versn_info = versn_info->next)
30643 {
30644 versn = versn_info->this_node;
30645 /* Check for virtual functions here again, as by this time it should
30646 have been determined if this function needs a vtable index or
30647 not. This happens for methods in derived classes that override
30648 virtual methods in base classes but are not explicitly marked as
30649 virtual. */
30650 if (DECL_VINDEX (versn->decl))
30651 sorry ("Virtual function multiversioning not supported");
30652
30653 fn_ver_vec.safe_push (versn->decl);
30654 }
30655
30656 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
30657 fn_ver_vec.release ();
30658 rebuild_cgraph_edges ();
30659 pop_cfun ();
30660 return resolver_decl;
30661 }
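 /* A hedged source-level sketch of the feature the dispatcher machinery
 above implements (function multiversioning via the "target" attribute;
 in this GCC era it is supported for C++). The function names are only
 an example:

 __attribute__ ((target ("default"))) int foo () { return 0; }
 __attribute__ ((target ("sse4.2"))) int foo () { return 1; }
 __attribute__ ((target ("avx2"))) int foo () { return 2; }

 int main () { return foo (); } // calls the best version for the CPU

 The call to foo is routed through an IFUNC whose resolver body is built
 by ix86_generate_version_dispatcher_body above. */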
30662 /* This builds the processor_model struct type defined in
30663 libgcc/config/i386/cpuinfo.c */
30664
30665 static tree
30666 build_processor_model_struct (void)
30667 {
30668 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
30669 "__cpu_features"};
30670 tree field = NULL_TREE, field_chain = NULL_TREE;
30671 int i;
30672 tree type = make_node (RECORD_TYPE);
30673
30674 /* The first 3 fields are unsigned int. */
30675 for (i = 0; i < 3; ++i)
30676 {
30677 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30678 get_identifier (field_name[i]), unsigned_type_node);
30679 if (field_chain != NULL_TREE)
30680 DECL_CHAIN (field) = field_chain;
30681 field_chain = field;
30682 }
30683
30684 /* The last field is an array of unsigned integers of size one. */
30685 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30686 get_identifier (field_name[3]),
30687 build_array_type (unsigned_type_node,
30688 build_index_type (size_one_node)));
30689 if (field_chain != NULL_TREE)
30690 DECL_CHAIN (field) = field_chain;
30691 field_chain = field;
30692
30693 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
30694 return type;
30695 }
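 /* The record built above mirrors (roughly, as a sketch derived from the
 fields created in this function) the struct in
 libgcc/config/i386/cpuinfo.c:

 struct __processor_model
 {
 unsigned int __cpu_vendor;
 unsigned int __cpu_type;
 unsigned int __cpu_subtype;
 unsigned int __cpu_features[1];
 };
 */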
30696
30697 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
30698
30699 static tree
30700 make_var_decl (tree type, const char *name)
30701 {
30702 tree new_decl;
30703
30704 new_decl = build_decl (UNKNOWN_LOCATION,
30705 VAR_DECL,
30706 get_identifier(name),
30707 type);
30708
30709 DECL_EXTERNAL (new_decl) = 1;
30710 TREE_STATIC (new_decl) = 1;
30711 TREE_PUBLIC (new_decl) = 1;
30712 DECL_INITIAL (new_decl) = 0;
30713 DECL_ARTIFICIAL (new_decl) = 0;
30714 DECL_PRESERVE_P (new_decl) = 1;
30715
30716 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
30717 assemble_variable (new_decl, 0, 0, 0);
30718
30719 return new_decl;
30720 }
30721
30722 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
30723 into an integer defined in libgcc/config/i386/cpuinfo.c */
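/* A rough illustration of the folding performed below, using entries from
   the tables that follow:

     __builtin_cpu_is ("corei7")
       -> (int) (__cpu_model.__cpu_type == M_INTEL_COREI7 - M_CPU_TYPE_START)

     __builtin_cpu_supports ("sse4.2")
       -> (int) (__cpu_model.__cpu_features[0] & (1 << F_SSE4_2))  */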
30724
30725 static tree
30726 fold_builtin_cpu (tree fndecl, tree *args)
30727 {
30728 unsigned int i;
30729 enum ix86_builtins fn_code = (enum ix86_builtins)
30730 DECL_FUNCTION_CODE (fndecl);
30731 tree param_string_cst = NULL;
30732
30733 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
30734 enum processor_features
30735 {
30736 F_CMOV = 0,
30737 F_MMX,
30738 F_POPCNT,
30739 F_SSE,
30740 F_SSE2,
30741 F_SSE3,
30742 F_SSSE3,
30743 F_SSE4_1,
30744 F_SSE4_2,
30745 F_AVX,
30746 F_AVX2,
30747 F_MAX
30748 };
30749
30750 /* These are the values for vendor types and for CPU types and subtypes
30751 in cpuinfo.c. CPU type and subtype values are offset by the
30752 corresponding start value, which must be subtracted before use. */
30753 enum processor_model
30754 {
30755 M_INTEL = 1,
30756 M_AMD,
30757 M_CPU_TYPE_START,
30758 M_INTEL_ATOM,
30759 M_INTEL_CORE2,
30760 M_INTEL_COREI7,
30761 M_AMDFAM10H,
30762 M_AMDFAM15H,
30763 M_INTEL_SLM,
30764 M_CPU_SUBTYPE_START,
30765 M_INTEL_COREI7_NEHALEM,
30766 M_INTEL_COREI7_WESTMERE,
30767 M_INTEL_COREI7_SANDYBRIDGE,
30768 M_AMDFAM10H_BARCELONA,
30769 M_AMDFAM10H_SHANGHAI,
30770 M_AMDFAM10H_ISTANBUL,
30771 M_AMDFAM15H_BDVER1,
30772 M_AMDFAM15H_BDVER2,
30773 M_AMDFAM15H_BDVER3
30774 };
30775
30776 static struct _arch_names_table
30777 {
30778 const char *const name;
30779 const enum processor_model model;
30780 }
30781 const arch_names_table[] =
30782 {
30783 {"amd", M_AMD},
30784 {"intel", M_INTEL},
30785 {"atom", M_INTEL_ATOM},
30786 {"slm", M_INTEL_SLM},
30787 {"core2", M_INTEL_CORE2},
30788 {"corei7", M_INTEL_COREI7},
30789 {"nehalem", M_INTEL_COREI7_NEHALEM},
30790 {"westmere", M_INTEL_COREI7_WESTMERE},
30791 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
30792 {"amdfam10h", M_AMDFAM10H},
30793 {"barcelona", M_AMDFAM10H_BARCELONA},
30794 {"shanghai", M_AMDFAM10H_SHANGHAI},
30795 {"istanbul", M_AMDFAM10H_ISTANBUL},
30796 {"amdfam15h", M_AMDFAM15H},
30797 {"bdver1", M_AMDFAM15H_BDVER1},
30798 {"bdver2", M_AMDFAM15H_BDVER2},
30799 {"bdver3", M_AMDFAM15H_BDVER3},
30800 };
30801
30802 static struct _isa_names_table
30803 {
30804 const char *const name;
30805 const enum processor_features feature;
30806 }
30807 const isa_names_table[] =
30808 {
30809 {"cmov", F_CMOV},
30810 {"mmx", F_MMX},
30811 {"popcnt", F_POPCNT},
30812 {"sse", F_SSE},
30813 {"sse2", F_SSE2},
30814 {"sse3", F_SSE3},
30815 {"ssse3", F_SSSE3},
30816 {"sse4.1", F_SSE4_1},
30817 {"sse4.2", F_SSE4_2},
30818 {"avx", F_AVX},
30819 {"avx2", F_AVX2}
30820 };
30821
30822 tree __processor_model_type = build_processor_model_struct ();
30823 tree __cpu_model_var = make_var_decl (__processor_model_type,
30824 "__cpu_model");
30825
30826
30827 varpool_add_new_variable (__cpu_model_var);
30828
30829 gcc_assert ((args != NULL) && (*args != NULL));
30830
30831 param_string_cst = *args;
30832 while (param_string_cst
30833 && TREE_CODE (param_string_cst) != STRING_CST)
30834 {
30835 /* *args must be an expr that can contain other EXPRs leading to a
30836 STRING_CST. */
30837 if (!EXPR_P (param_string_cst))
30838 {
30839 error ("Parameter to builtin must be a string constant or literal");
30840 return integer_zero_node;
30841 }
30842 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
30843 }
30844
30845 gcc_assert (param_string_cst);
30846
30847 if (fn_code == IX86_BUILTIN_CPU_IS)
30848 {
30849 tree ref;
30850 tree field;
30851 tree final;
30852
30853 unsigned int field_val = 0;
30854 unsigned int NUM_ARCH_NAMES
30855 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
30856
30857 for (i = 0; i < NUM_ARCH_NAMES; i++)
30858 if (strcmp (arch_names_table[i].name,
30859 TREE_STRING_POINTER (param_string_cst)) == 0)
30860 break;
30861
30862 if (i == NUM_ARCH_NAMES)
30863 {
30864 error ("Parameter to builtin not valid: %s",
30865 TREE_STRING_POINTER (param_string_cst));
30866 return integer_zero_node;
30867 }
30868
30869 field = TYPE_FIELDS (__processor_model_type);
30870 field_val = arch_names_table[i].model;
30871
30872 /* CPU types are stored in the next field. */
30873 if (field_val > M_CPU_TYPE_START
30874 && field_val < M_CPU_SUBTYPE_START)
30875 {
30876 field = DECL_CHAIN (field);
30877 field_val -= M_CPU_TYPE_START;
30878 }
30879
30880 /* CPU subtypes are stored in the next field. */
30881 if (field_val > M_CPU_SUBTYPE_START)
30882 {
30883 field = DECL_CHAIN ( DECL_CHAIN (field));
30884 field_val -= M_CPU_SUBTYPE_START;
30885 }
30886
30887 /* Get the appropriate field in __cpu_model. */
30888 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30889 field, NULL_TREE);
30890
30891 /* Check the value. */
30892 final = build2 (EQ_EXPR, unsigned_type_node, ref,
30893 build_int_cstu (unsigned_type_node, field_val));
30894 return build1 (CONVERT_EXPR, integer_type_node, final);
30895 }
30896 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
30897 {
30898 tree ref;
30899 tree array_elt;
30900 tree field;
30901 tree final;
30902
30903 unsigned int field_val = 0;
30904 unsigned int NUM_ISA_NAMES
30905 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
30906
30907 for (i = 0; i < NUM_ISA_NAMES; i++)
30908 if (strcmp (isa_names_table[i].name,
30909 TREE_STRING_POINTER (param_string_cst)) == 0)
30910 break;
30911
30912 if (i == NUM_ISA_NAMES)
30913 {
30914 error ("Parameter to builtin not valid: %s",
30915 TREE_STRING_POINTER (param_string_cst));
30916 return integer_zero_node;
30917 }
30918
30919 field = TYPE_FIELDS (__processor_model_type);
30920 /* Get the last field, which is __cpu_features. */
30921 while (DECL_CHAIN (field))
30922 field = DECL_CHAIN (field);
30923
30924 /* Get the appropriate field: __cpu_model.__cpu_features */
30925 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30926 field, NULL_TREE);
30927
30928 /* Access the 0th element of __cpu_features array. */
30929 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
30930 integer_zero_node, NULL_TREE, NULL_TREE);
30931
30932 field_val = (1 << isa_names_table[i].feature);
30933 /* Return __cpu_model.__cpu_features[0] & field_val */
30934 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
30935 build_int_cstu (unsigned_type_node, field_val));
30936 return build1 (CONVERT_EXPR, integer_type_node, final);
30937 }
30938 gcc_unreachable ();
30939 }
30940
30941 static tree
30942 ix86_fold_builtin (tree fndecl, int n_args,
30943 tree *args, bool ignore ATTRIBUTE_UNUSED)
30944 {
30945 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
30946 {
30947 enum ix86_builtins fn_code = (enum ix86_builtins)
30948 DECL_FUNCTION_CODE (fndecl);
30949 if (fn_code == IX86_BUILTIN_CPU_IS
30950 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
30951 {
30952 gcc_assert (n_args == 1);
30953 return fold_builtin_cpu (fndecl, args);
30954 }
30955 }
30956
30957 #ifdef SUBTARGET_FOLD_BUILTIN
30958 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
30959 #endif
30960
30961 return NULL_TREE;
30962 }
30963
30964 /* Make builtins to detect cpu type and features supported. NAME is
30965 the builtin name, CODE is the builtin code, and FTYPE is the function
30966 type of the builtin. */
30967
30968 static void
30969 make_cpu_type_builtin (const char* name, int code,
30970 enum ix86_builtin_func_type ftype, bool is_const)
30971 {
30972 tree decl;
30973 tree type;
30974
30975 type = ix86_get_builtin_func_type (ftype);
30976 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30977 NULL, NULL_TREE);
30978 gcc_assert (decl != NULL_TREE);
30979 ix86_builtins[(int) code] = decl;
30980 TREE_READONLY (decl) = is_const;
30981 }
30982
30983 /* Make builtins to get CPU type and features supported. The created
30984 builtins are:
30985
30986 __builtin_cpu_init (), to detect cpu type and features,
30987 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
30988 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
30989 */
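/* A minimal usage sketch (illustrative only; do_avx2 and do_corei7 are
   hypothetical user functions):

     __builtin_cpu_init ();
     if (__builtin_cpu_supports ("avx2"))
       do_avx2 ();
     else if (__builtin_cpu_is ("corei7"))
       do_corei7 ();  */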
30990
30991 static void
30992 ix86_init_platform_type_builtins (void)
30993 {
30994 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
30995 INT_FTYPE_VOID, false);
30996 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
30997 INT_FTYPE_PCCHAR, true);
30998 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
30999 INT_FTYPE_PCCHAR, true);
31000 }
31001
31002 /* Internal method for ix86_init_builtins. */
31003
31004 static void
31005 ix86_init_builtins_va_builtins_abi (void)
31006 {
31007 tree ms_va_ref, sysv_va_ref;
31008 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
31009 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
31010 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
31011 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
31012
31013 if (!TARGET_64BIT)
31014 return;
31015 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
31016 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
31017 ms_va_ref = build_reference_type (ms_va_list_type_node);
31018 sysv_va_ref =
31019 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
31020
31021 fnvoid_va_end_ms =
31022 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
31023 fnvoid_va_start_ms =
31024 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
31025 fnvoid_va_end_sysv =
31026 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
31027 fnvoid_va_start_sysv =
31028 build_varargs_function_type_list (void_type_node, sysv_va_ref,
31029 NULL_TREE);
31030 fnvoid_va_copy_ms =
31031 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
31032 NULL_TREE);
31033 fnvoid_va_copy_sysv =
31034 build_function_type_list (void_type_node, sysv_va_ref,
31035 sysv_va_ref, NULL_TREE);
31036
31037 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
31038 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
31039 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
31040 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
31041 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
31042 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
31043 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
31044 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
31045 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
31046 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
31047 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
31048 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
31049 }
31050
31051 static void
31052 ix86_init_builtin_types (void)
31053 {
31054 tree float128_type_node, float80_type_node;
31055
31056 /* The __float80 type. */
31057 float80_type_node = long_double_type_node;
31058 if (TYPE_MODE (float80_type_node) != XFmode)
31059 {
31060 /* The __float80 type. */
31061 float80_type_node = make_node (REAL_TYPE);
31062
31063 TYPE_PRECISION (float80_type_node) = 80;
31064 layout_type (float80_type_node);
31065 }
31066 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
31067
31068 /* The __float128 type. */
31069 float128_type_node = make_node (REAL_TYPE);
31070 TYPE_PRECISION (float128_type_node) = 128;
31071 layout_type (float128_type_node);
31072 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
31073
31074 /* This macro is built by i386-builtin-types.awk. */
31075 DEFINE_BUILTIN_PRIMITIVE_TYPES;
31076 }
31077
31078 static void
31079 ix86_init_builtins (void)
31080 {
31081 tree t;
31082
31083 ix86_init_builtin_types ();
31084
31085 /* Builtins to get CPU type and features. */
31086 ix86_init_platform_type_builtins ();
31087
31088 /* TFmode support builtins. */
31089 def_builtin_const (0, "__builtin_infq",
31090 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
31091 def_builtin_const (0, "__builtin_huge_valq",
31092 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
31093
31094 /* We will expand them to a normal call if SSE isn't available since
31095 they are used by libgcc. */
31096 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
31097 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
31098 BUILT_IN_MD, "__fabstf2", NULL_TREE);
31099 TREE_READONLY (t) = 1;
31100 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
31101
31102 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
31103 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
31104 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
31105 TREE_READONLY (t) = 1;
31106 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
31107
31108 ix86_init_tm_builtins ();
31109 ix86_init_mmx_sse_builtins ();
31110
31111 if (TARGET_LP64)
31112 ix86_init_builtins_va_builtins_abi ();
31113
31114 #ifdef SUBTARGET_INIT_BUILTINS
31115 SUBTARGET_INIT_BUILTINS;
31116 #endif
31117 }
31118
31119 /* Return the ix86 builtin for CODE. */
31120
31121 static tree
31122 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
31123 {
31124 if (code >= IX86_BUILTIN_MAX)
31125 return error_mark_node;
31126
31127 return ix86_builtins[code];
31128 }
31129
31130 /* Errors in the source file can cause expand_expr to return const0_rtx
31131 where we expect a vector. To avoid crashing, use one of the vector
31132 clear instructions. */
31133 static rtx
31134 safe_vector_operand (rtx x, enum machine_mode mode)
31135 {
31136 if (x == const0_rtx)
31137 x = CONST0_RTX (mode);
31138 return x;
31139 }
31140
31141 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
31142
31143 static rtx
31144 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
31145 {
31146 rtx pat;
31147 tree arg0 = CALL_EXPR_ARG (exp, 0);
31148 tree arg1 = CALL_EXPR_ARG (exp, 1);
31149 rtx op0 = expand_normal (arg0);
31150 rtx op1 = expand_normal (arg1);
31151 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31152 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
31153 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
31154
31155 if (VECTOR_MODE_P (mode0))
31156 op0 = safe_vector_operand (op0, mode0);
31157 if (VECTOR_MODE_P (mode1))
31158 op1 = safe_vector_operand (op1, mode1);
31159
31160 if (optimize || !target
31161 || GET_MODE (target) != tmode
31162 || !insn_data[icode].operand[0].predicate (target, tmode))
31163 target = gen_reg_rtx (tmode);
31164
31165 if (GET_MODE (op1) == SImode && mode1 == TImode)
31166 {
31167 rtx x = gen_reg_rtx (V4SImode);
31168 emit_insn (gen_sse2_loadd (x, op1));
31169 op1 = gen_lowpart (TImode, x);
31170 }
31171
31172 if (!insn_data[icode].operand[1].predicate (op0, mode0))
31173 op0 = copy_to_mode_reg (mode0, op0);
31174 if (!insn_data[icode].operand[2].predicate (op1, mode1))
31175 op1 = copy_to_mode_reg (mode1, op1);
31176
31177 pat = GEN_FCN (icode) (target, op0, op1);
31178 if (! pat)
31179 return 0;
31180
31181 emit_insn (pat);
31182
31183 return target;
31184 }
31185
31186 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
31187
31188 static rtx
31189 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
31190 enum ix86_builtin_func_type m_type,
31191 enum rtx_code sub_code)
31192 {
31193 rtx pat;
31194 int i;
31195 int nargs;
31196 bool comparison_p = false;
31197 bool tf_p = false;
31198 bool last_arg_constant = false;
31199 int num_memory = 0;
31200 struct {
31201 rtx op;
31202 enum machine_mode mode;
31203 } args[4];
31204
31205 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31206
31207 switch (m_type)
31208 {
31209 case MULTI_ARG_4_DF2_DI_I:
31210 case MULTI_ARG_4_DF2_DI_I1:
31211 case MULTI_ARG_4_SF2_SI_I:
31212 case MULTI_ARG_4_SF2_SI_I1:
31213 nargs = 4;
31214 last_arg_constant = true;
31215 break;
31216
31217 case MULTI_ARG_3_SF:
31218 case MULTI_ARG_3_DF:
31219 case MULTI_ARG_3_SF2:
31220 case MULTI_ARG_3_DF2:
31221 case MULTI_ARG_3_DI:
31222 case MULTI_ARG_3_SI:
31223 case MULTI_ARG_3_SI_DI:
31224 case MULTI_ARG_3_HI:
31225 case MULTI_ARG_3_HI_SI:
31226 case MULTI_ARG_3_QI:
31227 case MULTI_ARG_3_DI2:
31228 case MULTI_ARG_3_SI2:
31229 case MULTI_ARG_3_HI2:
31230 case MULTI_ARG_3_QI2:
31231 nargs = 3;
31232 break;
31233
31234 case MULTI_ARG_2_SF:
31235 case MULTI_ARG_2_DF:
31236 case MULTI_ARG_2_DI:
31237 case MULTI_ARG_2_SI:
31238 case MULTI_ARG_2_HI:
31239 case MULTI_ARG_2_QI:
31240 nargs = 2;
31241 break;
31242
31243 case MULTI_ARG_2_DI_IMM:
31244 case MULTI_ARG_2_SI_IMM:
31245 case MULTI_ARG_2_HI_IMM:
31246 case MULTI_ARG_2_QI_IMM:
31247 nargs = 2;
31248 last_arg_constant = true;
31249 break;
31250
31251 case MULTI_ARG_1_SF:
31252 case MULTI_ARG_1_DF:
31253 case MULTI_ARG_1_SF2:
31254 case MULTI_ARG_1_DF2:
31255 case MULTI_ARG_1_DI:
31256 case MULTI_ARG_1_SI:
31257 case MULTI_ARG_1_HI:
31258 case MULTI_ARG_1_QI:
31259 case MULTI_ARG_1_SI_DI:
31260 case MULTI_ARG_1_HI_DI:
31261 case MULTI_ARG_1_HI_SI:
31262 case MULTI_ARG_1_QI_DI:
31263 case MULTI_ARG_1_QI_SI:
31264 case MULTI_ARG_1_QI_HI:
31265 nargs = 1;
31266 break;
31267
31268 case MULTI_ARG_2_DI_CMP:
31269 case MULTI_ARG_2_SI_CMP:
31270 case MULTI_ARG_2_HI_CMP:
31271 case MULTI_ARG_2_QI_CMP:
31272 nargs = 2;
31273 comparison_p = true;
31274 break;
31275
31276 case MULTI_ARG_2_SF_TF:
31277 case MULTI_ARG_2_DF_TF:
31278 case MULTI_ARG_2_DI_TF:
31279 case MULTI_ARG_2_SI_TF:
31280 case MULTI_ARG_2_HI_TF:
31281 case MULTI_ARG_2_QI_TF:
31282 nargs = 2;
31283 tf_p = true;
31284 break;
31285
31286 default:
31287 gcc_unreachable ();
31288 }
31289
31290 if (optimize || !target
31291 || GET_MODE (target) != tmode
31292 || !insn_data[icode].operand[0].predicate (target, tmode))
31293 target = gen_reg_rtx (tmode);
31294
31295 gcc_assert (nargs <= 4);
31296
31297 for (i = 0; i < nargs; i++)
31298 {
31299 tree arg = CALL_EXPR_ARG (exp, i);
31300 rtx op = expand_normal (arg);
31301 int adjust = (comparison_p) ? 1 : 0;
31302 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
31303
31304 if (last_arg_constant && i == nargs - 1)
31305 {
31306 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
31307 {
31308 enum insn_code new_icode = icode;
31309 switch (icode)
31310 {
31311 case CODE_FOR_xop_vpermil2v2df3:
31312 case CODE_FOR_xop_vpermil2v4sf3:
31313 case CODE_FOR_xop_vpermil2v4df3:
31314 case CODE_FOR_xop_vpermil2v8sf3:
31315 error ("the last argument must be a 2-bit immediate");
31316 return gen_reg_rtx (tmode);
31317 case CODE_FOR_xop_rotlv2di3:
31318 new_icode = CODE_FOR_rotlv2di3;
31319 goto xop_rotl;
31320 case CODE_FOR_xop_rotlv4si3:
31321 new_icode = CODE_FOR_rotlv4si3;
31322 goto xop_rotl;
31323 case CODE_FOR_xop_rotlv8hi3:
31324 new_icode = CODE_FOR_rotlv8hi3;
31325 goto xop_rotl;
31326 case CODE_FOR_xop_rotlv16qi3:
31327 new_icode = CODE_FOR_rotlv16qi3;
31328 xop_rotl:
31329 if (CONST_INT_P (op))
31330 {
31331 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
31332 op = GEN_INT (INTVAL (op) & mask);
31333 gcc_checking_assert
31334 (insn_data[icode].operand[i + 1].predicate (op, mode));
31335 }
31336 else
31337 {
31338 gcc_checking_assert
31339 (nargs == 2
31340 && insn_data[new_icode].operand[0].mode == tmode
31341 && insn_data[new_icode].operand[1].mode == tmode
31342 && insn_data[new_icode].operand[2].mode == mode
31343 && insn_data[new_icode].operand[0].predicate
31344 == insn_data[icode].operand[0].predicate
31345 && insn_data[new_icode].operand[1].predicate
31346 == insn_data[icode].operand[1].predicate);
31347 icode = new_icode;
31348 goto non_constant;
31349 }
31350 break;
31351 default:
31352 gcc_unreachable ();
31353 }
31354 }
31355 }
31356 else
31357 {
31358 non_constant:
31359 if (VECTOR_MODE_P (mode))
31360 op = safe_vector_operand (op, mode);
31361
31362 /* If we aren't optimizing, only allow one memory operand to be
31363 generated. */
31364 if (memory_operand (op, mode))
31365 num_memory++;
31366
31367 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
31368
31369 if (optimize
31370 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
31371 || num_memory > 1)
31372 op = force_reg (mode, op);
31373 }
31374
31375 args[i].op = op;
31376 args[i].mode = mode;
31377 }
31378
31379 switch (nargs)
31380 {
31381 case 1:
31382 pat = GEN_FCN (icode) (target, args[0].op);
31383 break;
31384
31385 case 2:
31386 if (tf_p)
31387 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
31388 GEN_INT ((int)sub_code));
31389 else if (! comparison_p)
31390 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31391 else
31392 {
31393 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
31394 args[0].op,
31395 args[1].op);
31396
31397 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
31398 }
31399 break;
31400
31401 case 3:
31402 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31403 break;
31404
31405 case 4:
31406 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
31407 break;
31408
31409 default:
31410 gcc_unreachable ();
31411 }
31412
31413 if (! pat)
31414 return 0;
31415
31416 emit_insn (pat);
31417 return target;
31418 }
31419
31420 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
31421 insns with vec_merge. */
31422
31423 static rtx
31424 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
31425 rtx target)
31426 {
31427 rtx pat;
31428 tree arg0 = CALL_EXPR_ARG (exp, 0);
31429 rtx op1, op0 = expand_normal (arg0);
31430 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31431 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
31432
31433 if (optimize || !target
31434 || GET_MODE (target) != tmode
31435 || !insn_data[icode].operand[0].predicate (target, tmode))
31436 target = gen_reg_rtx (tmode);
31437
31438 if (VECTOR_MODE_P (mode0))
31439 op0 = safe_vector_operand (op0, mode0);
31440
31441 if ((optimize && !register_operand (op0, mode0))
31442 || !insn_data[icode].operand[1].predicate (op0, mode0))
31443 op0 = copy_to_mode_reg (mode0, op0);
31444
31445 op1 = op0;
31446 if (!insn_data[icode].operand[2].predicate (op1, mode0))
31447 op1 = copy_to_mode_reg (mode0, op1);
31448
31449 pat = GEN_FCN (icode) (target, op0, op1);
31450 if (! pat)
31451 return 0;
31452 emit_insn (pat);
31453 return target;
31454 }
31455
31456 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
31457
31458 static rtx
31459 ix86_expand_sse_compare (const struct builtin_description *d,
31460 tree exp, rtx target, bool swap)
31461 {
31462 rtx pat;
31463 tree arg0 = CALL_EXPR_ARG (exp, 0);
31464 tree arg1 = CALL_EXPR_ARG (exp, 1);
31465 rtx op0 = expand_normal (arg0);
31466 rtx op1 = expand_normal (arg1);
31467 rtx op2;
31468 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31469 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31470 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31471 enum rtx_code comparison = d->comparison;
31472
31473 if (VECTOR_MODE_P (mode0))
31474 op0 = safe_vector_operand (op0, mode0);
31475 if (VECTOR_MODE_P (mode1))
31476 op1 = safe_vector_operand (op1, mode1);
31477
31478 /* Swap operands if we have a comparison that isn't available in
31479 hardware. */
31480 if (swap)
31481 {
31482 rtx tmp = gen_reg_rtx (mode1);
31483 emit_move_insn (tmp, op1);
31484 op1 = op0;
31485 op0 = tmp;
31486 }
31487
31488 if (optimize || !target
31489 || GET_MODE (target) != tmode
31490 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31491 target = gen_reg_rtx (tmode);
31492
31493 if ((optimize && !register_operand (op0, mode0))
31494 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
31495 op0 = copy_to_mode_reg (mode0, op0);
31496 if ((optimize && !register_operand (op1, mode1))
31497 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
31498 op1 = copy_to_mode_reg (mode1, op1);
31499
31500 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
31501 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31502 if (! pat)
31503 return 0;
31504 emit_insn (pat);
31505 return target;
31506 }
31507
31508 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
31509
31510 static rtx
31511 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
31512 rtx target)
31513 {
31514 rtx pat;
31515 tree arg0 = CALL_EXPR_ARG (exp, 0);
31516 tree arg1 = CALL_EXPR_ARG (exp, 1);
31517 rtx op0 = expand_normal (arg0);
31518 rtx op1 = expand_normal (arg1);
31519 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31520 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31521 enum rtx_code comparison = d->comparison;
31522
31523 if (VECTOR_MODE_P (mode0))
31524 op0 = safe_vector_operand (op0, mode0);
31525 if (VECTOR_MODE_P (mode1))
31526 op1 = safe_vector_operand (op1, mode1);
31527
31528 /* Swap operands if we have a comparison that isn't available in
31529 hardware. */
31530 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
31531 {
31532 rtx tmp = op1;
31533 op1 = op0;
31534 op0 = tmp;
31535 }
31536
31537 target = gen_reg_rtx (SImode);
31538 emit_move_insn (target, const0_rtx);
31539 target = gen_rtx_SUBREG (QImode, target, 0);
31540
31541 if ((optimize && !register_operand (op0, mode0))
31542 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31543 op0 = copy_to_mode_reg (mode0, op0);
31544 if ((optimize && !register_operand (op1, mode1))
31545 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31546 op1 = copy_to_mode_reg (mode1, op1);
31547
31548 pat = GEN_FCN (d->icode) (op0, op1);
31549 if (! pat)
31550 return 0;
31551 emit_insn (pat);
31552 emit_insn (gen_rtx_SET (VOIDmode,
31553 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31554 gen_rtx_fmt_ee (comparison, QImode,
31555 SET_DEST (pat),
31556 const0_rtx)));
31557
31558 return SUBREG_REG (target);
31559 }
31560
31561 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
31562
31563 static rtx
31564 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
31565 rtx target)
31566 {
31567 rtx pat;
31568 tree arg0 = CALL_EXPR_ARG (exp, 0);
31569 rtx op1, op0 = expand_normal (arg0);
31570 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31571 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31572
31573 if (optimize || target == 0
31574 || GET_MODE (target) != tmode
31575 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31576 target = gen_reg_rtx (tmode);
31577
31578 if (VECTOR_MODE_P (mode0))
31579 op0 = safe_vector_operand (op0, mode0);
31580
31581 if ((optimize && !register_operand (op0, mode0))
31582 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31583 op0 = copy_to_mode_reg (mode0, op0);
31584
31585 op1 = GEN_INT (d->comparison);
31586
31587 pat = GEN_FCN (d->icode) (target, op0, op1);
31588 if (! pat)
31589 return 0;
31590 emit_insn (pat);
31591 return target;
31592 }
31593
31594 static rtx
31595 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
31596 tree exp, rtx target)
31597 {
31598 rtx pat;
31599 tree arg0 = CALL_EXPR_ARG (exp, 0);
31600 tree arg1 = CALL_EXPR_ARG (exp, 1);
31601 rtx op0 = expand_normal (arg0);
31602 rtx op1 = expand_normal (arg1);
31603 rtx op2;
31604 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31605 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31606 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31607
31608 if (optimize || target == 0
31609 || GET_MODE (target) != tmode
31610 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31611 target = gen_reg_rtx (tmode);
31612
31613 op0 = safe_vector_operand (op0, mode0);
31614 op1 = safe_vector_operand (op1, mode1);
31615
31616 if ((optimize && !register_operand (op0, mode0))
31617 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31618 op0 = copy_to_mode_reg (mode0, op0);
31619 if ((optimize && !register_operand (op1, mode1))
31620 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31621 op1 = copy_to_mode_reg (mode1, op1);
31622
31623 op2 = GEN_INT (d->comparison);
31624
31625 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31626 if (! pat)
31627 return 0;
31628 emit_insn (pat);
31629 return target;
31630 }
31631
31632 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
31633
31634 static rtx
31635 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
31636 rtx target)
31637 {
31638 rtx pat;
31639 tree arg0 = CALL_EXPR_ARG (exp, 0);
31640 tree arg1 = CALL_EXPR_ARG (exp, 1);
31641 rtx op0 = expand_normal (arg0);
31642 rtx op1 = expand_normal (arg1);
31643 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31644 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31645 enum rtx_code comparison = d->comparison;
31646
31647 if (VECTOR_MODE_P (mode0))
31648 op0 = safe_vector_operand (op0, mode0);
31649 if (VECTOR_MODE_P (mode1))
31650 op1 = safe_vector_operand (op1, mode1);
31651
31652 target = gen_reg_rtx (SImode);
31653 emit_move_insn (target, const0_rtx);
31654 target = gen_rtx_SUBREG (QImode, target, 0);
31655
31656 if ((optimize && !register_operand (op0, mode0))
31657 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31658 op0 = copy_to_mode_reg (mode0, op0);
31659 if ((optimize && !register_operand (op1, mode1))
31660 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31661 op1 = copy_to_mode_reg (mode1, op1);
31662
31663 pat = GEN_FCN (d->icode) (op0, op1);
31664 if (! pat)
31665 return 0;
31666 emit_insn (pat);
31667 emit_insn (gen_rtx_SET (VOIDmode,
31668 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31669 gen_rtx_fmt_ee (comparison, QImode,
31670 SET_DEST (pat),
31671 const0_rtx)));
31672
31673 return SUBREG_REG (target);
31674 }
31675
31676 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
31677
31678 static rtx
31679 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
31680 tree exp, rtx target)
31681 {
31682 rtx pat;
31683 tree arg0 = CALL_EXPR_ARG (exp, 0);
31684 tree arg1 = CALL_EXPR_ARG (exp, 1);
31685 tree arg2 = CALL_EXPR_ARG (exp, 2);
31686 tree arg3 = CALL_EXPR_ARG (exp, 3);
31687 tree arg4 = CALL_EXPR_ARG (exp, 4);
31688 rtx scratch0, scratch1;
31689 rtx op0 = expand_normal (arg0);
31690 rtx op1 = expand_normal (arg1);
31691 rtx op2 = expand_normal (arg2);
31692 rtx op3 = expand_normal (arg3);
31693 rtx op4 = expand_normal (arg4);
31694 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
31695
31696 tmode0 = insn_data[d->icode].operand[0].mode;
31697 tmode1 = insn_data[d->icode].operand[1].mode;
31698 modev2 = insn_data[d->icode].operand[2].mode;
31699 modei3 = insn_data[d->icode].operand[3].mode;
31700 modev4 = insn_data[d->icode].operand[4].mode;
31701 modei5 = insn_data[d->icode].operand[5].mode;
31702 modeimm = insn_data[d->icode].operand[6].mode;
31703
31704 if (VECTOR_MODE_P (modev2))
31705 op0 = safe_vector_operand (op0, modev2);
31706 if (VECTOR_MODE_P (modev4))
31707 op2 = safe_vector_operand (op2, modev4);
31708
31709 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31710 op0 = copy_to_mode_reg (modev2, op0);
31711 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
31712 op1 = copy_to_mode_reg (modei3, op1);
31713 if ((optimize && !register_operand (op2, modev4))
31714 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
31715 op2 = copy_to_mode_reg (modev4, op2);
31716 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
31717 op3 = copy_to_mode_reg (modei5, op3);
31718
31719 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
31720 {
31721 error ("the fifth argument must be an 8-bit immediate");
31722 return const0_rtx;
31723 }
31724
31725 if (d->code == IX86_BUILTIN_PCMPESTRI128)
31726 {
31727 if (optimize || !target
31728 || GET_MODE (target) != tmode0
31729 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31730 target = gen_reg_rtx (tmode0);
31731
31732 scratch1 = gen_reg_rtx (tmode1);
31733
31734 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
31735 }
31736 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
31737 {
31738 if (optimize || !target
31739 || GET_MODE (target) != tmode1
31740 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31741 target = gen_reg_rtx (tmode1);
31742
31743 scratch0 = gen_reg_rtx (tmode0);
31744
31745 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
31746 }
31747 else
31748 {
31749 gcc_assert (d->flag);
31750
31751 scratch0 = gen_reg_rtx (tmode0);
31752 scratch1 = gen_reg_rtx (tmode1);
31753
31754 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
31755 }
31756
31757 if (! pat)
31758 return 0;
31759
31760 emit_insn (pat);
31761
31762 if (d->flag)
31763 {
31764 target = gen_reg_rtx (SImode);
31765 emit_move_insn (target, const0_rtx);
31766 target = gen_rtx_SUBREG (QImode, target, 0);
31767
31768 emit_insn
31769 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31770 gen_rtx_fmt_ee (EQ, QImode,
31771 gen_rtx_REG ((enum machine_mode) d->flag,
31772 FLAGS_REG),
31773 const0_rtx)));
31774 return SUBREG_REG (target);
31775 }
31776 else
31777 return target;
31778 }
31779
31780
31781 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
31782
31783 static rtx
31784 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
31785 tree exp, rtx target)
31786 {
31787 rtx pat;
31788 tree arg0 = CALL_EXPR_ARG (exp, 0);
31789 tree arg1 = CALL_EXPR_ARG (exp, 1);
31790 tree arg2 = CALL_EXPR_ARG (exp, 2);
31791 rtx scratch0, scratch1;
31792 rtx op0 = expand_normal (arg0);
31793 rtx op1 = expand_normal (arg1);
31794 rtx op2 = expand_normal (arg2);
31795 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
31796
31797 tmode0 = insn_data[d->icode].operand[0].mode;
31798 tmode1 = insn_data[d->icode].operand[1].mode;
31799 modev2 = insn_data[d->icode].operand[2].mode;
31800 modev3 = insn_data[d->icode].operand[3].mode;
31801 modeimm = insn_data[d->icode].operand[4].mode;
31802
31803 if (VECTOR_MODE_P (modev2))
31804 op0 = safe_vector_operand (op0, modev2);
31805 if (VECTOR_MODE_P (modev3))
31806 op1 = safe_vector_operand (op1, modev3);
31807
31808 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31809 op0 = copy_to_mode_reg (modev2, op0);
31810 if ((optimize && !register_operand (op1, modev3))
31811 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
31812 op1 = copy_to_mode_reg (modev3, op1);
31813
31814 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
31815 {
31816 error ("the third argument must be an 8-bit immediate");
31817 return const0_rtx;
31818 }
31819
31820 if (d->code == IX86_BUILTIN_PCMPISTRI128)
31821 {
31822 if (optimize || !target
31823 || GET_MODE (target) != tmode0
31824 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31825 target = gen_reg_rtx (tmode0);
31826
31827 scratch1 = gen_reg_rtx (tmode1);
31828
31829 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
31830 }
31831 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
31832 {
31833 if (optimize || !target
31834 || GET_MODE (target) != tmode1
31835 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31836 target = gen_reg_rtx (tmode1);
31837
31838 scratch0 = gen_reg_rtx (tmode0);
31839
31840 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
31841 }
31842 else
31843 {
31844 gcc_assert (d->flag);
31845
31846 scratch0 = gen_reg_rtx (tmode0);
31847 scratch1 = gen_reg_rtx (tmode1);
31848
31849 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
31850 }
31851
31852 if (! pat)
31853 return 0;
31854
31855 emit_insn (pat);
31856
31857 if (d->flag)
31858 {
31859 target = gen_reg_rtx (SImode);
31860 emit_move_insn (target, const0_rtx);
31861 target = gen_rtx_SUBREG (QImode, target, 0);
31862
31863 emit_insn
31864 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31865 gen_rtx_fmt_ee (EQ, QImode,
31866 gen_rtx_REG ((enum machine_mode) d->flag,
31867 FLAGS_REG),
31868 const0_rtx)));
31869 return SUBREG_REG (target);
31870 }
31871 else
31872 return target;
31873 }
31874
31875 /* Subroutine of ix86_expand_builtin to take care of insns with
31876 variable number of operands. */
31877
31878 static rtx
31879 ix86_expand_args_builtin (const struct builtin_description *d,
31880 tree exp, rtx target)
31881 {
31882 rtx pat, real_target;
31883 unsigned int i, nargs;
31884 unsigned int nargs_constant = 0;
31885 int num_memory = 0;
31886 struct
31887 {
31888 rtx op;
31889 enum machine_mode mode;
31890 } args[4];
31891 bool last_arg_count = false;
31892 enum insn_code icode = d->icode;
31893 const struct insn_data_d *insn_p = &insn_data[icode];
31894 enum machine_mode tmode = insn_p->operand[0].mode;
31895 enum machine_mode rmode = VOIDmode;
31896 bool swap = false;
31897 enum rtx_code comparison = d->comparison;
31898
31899 switch ((enum ix86_builtin_func_type) d->flag)
31900 {
31901 case V2DF_FTYPE_V2DF_ROUND:
31902 case V4DF_FTYPE_V4DF_ROUND:
31903 case V4SF_FTYPE_V4SF_ROUND:
31904 case V8SF_FTYPE_V8SF_ROUND:
31905 case V4SI_FTYPE_V4SF_ROUND:
31906 case V8SI_FTYPE_V8SF_ROUND:
31907 return ix86_expand_sse_round (d, exp, target);
31908 case V4SI_FTYPE_V2DF_V2DF_ROUND:
31909 case V8SI_FTYPE_V4DF_V4DF_ROUND:
31910 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
31911 case INT_FTYPE_V8SF_V8SF_PTEST:
31912 case INT_FTYPE_V4DI_V4DI_PTEST:
31913 case INT_FTYPE_V4DF_V4DF_PTEST:
31914 case INT_FTYPE_V4SF_V4SF_PTEST:
31915 case INT_FTYPE_V2DI_V2DI_PTEST:
31916 case INT_FTYPE_V2DF_V2DF_PTEST:
31917 return ix86_expand_sse_ptest (d, exp, target);
31918 case FLOAT128_FTYPE_FLOAT128:
31919 case FLOAT_FTYPE_FLOAT:
31920 case INT_FTYPE_INT:
31921 case UINT64_FTYPE_INT:
31922 case UINT16_FTYPE_UINT16:
31923 case INT64_FTYPE_INT64:
31924 case INT64_FTYPE_V4SF:
31925 case INT64_FTYPE_V2DF:
31926 case INT_FTYPE_V16QI:
31927 case INT_FTYPE_V8QI:
31928 case INT_FTYPE_V8SF:
31929 case INT_FTYPE_V4DF:
31930 case INT_FTYPE_V4SF:
31931 case INT_FTYPE_V2DF:
31932 case INT_FTYPE_V32QI:
31933 case V16QI_FTYPE_V16QI:
31934 case V8SI_FTYPE_V8SF:
31935 case V8SI_FTYPE_V4SI:
31936 case V8HI_FTYPE_V8HI:
31937 case V8HI_FTYPE_V16QI:
31938 case V8QI_FTYPE_V8QI:
31939 case V8SF_FTYPE_V8SF:
31940 case V8SF_FTYPE_V8SI:
31941 case V8SF_FTYPE_V4SF:
31942 case V8SF_FTYPE_V8HI:
31943 case V4SI_FTYPE_V4SI:
31944 case V4SI_FTYPE_V16QI:
31945 case V4SI_FTYPE_V4SF:
31946 case V4SI_FTYPE_V8SI:
31947 case V4SI_FTYPE_V8HI:
31948 case V4SI_FTYPE_V4DF:
31949 case V4SI_FTYPE_V2DF:
31950 case V4HI_FTYPE_V4HI:
31951 case V4DF_FTYPE_V4DF:
31952 case V4DF_FTYPE_V4SI:
31953 case V4DF_FTYPE_V4SF:
31954 case V4DF_FTYPE_V2DF:
31955 case V4SF_FTYPE_V4SF:
31956 case V4SF_FTYPE_V4SI:
31957 case V4SF_FTYPE_V8SF:
31958 case V4SF_FTYPE_V4DF:
31959 case V4SF_FTYPE_V8HI:
31960 case V4SF_FTYPE_V2DF:
31961 case V2DI_FTYPE_V2DI:
31962 case V2DI_FTYPE_V16QI:
31963 case V2DI_FTYPE_V8HI:
31964 case V2DI_FTYPE_V4SI:
31965 case V2DF_FTYPE_V2DF:
31966 case V2DF_FTYPE_V4SI:
31967 case V2DF_FTYPE_V4DF:
31968 case V2DF_FTYPE_V4SF:
31969 case V2DF_FTYPE_V2SI:
31970 case V2SI_FTYPE_V2SI:
31971 case V2SI_FTYPE_V4SF:
31972 case V2SI_FTYPE_V2SF:
31973 case V2SI_FTYPE_V2DF:
31974 case V2SF_FTYPE_V2SF:
31975 case V2SF_FTYPE_V2SI:
31976 case V32QI_FTYPE_V32QI:
31977 case V32QI_FTYPE_V16QI:
31978 case V16HI_FTYPE_V16HI:
31979 case V16HI_FTYPE_V8HI:
31980 case V8SI_FTYPE_V8SI:
31981 case V16HI_FTYPE_V16QI:
31982 case V8SI_FTYPE_V16QI:
31983 case V4DI_FTYPE_V16QI:
31984 case V8SI_FTYPE_V8HI:
31985 case V4DI_FTYPE_V8HI:
31986 case V4DI_FTYPE_V4SI:
31987 case V4DI_FTYPE_V2DI:
31988 nargs = 1;
31989 break;
31990 case V4SF_FTYPE_V4SF_VEC_MERGE:
31991 case V2DF_FTYPE_V2DF_VEC_MERGE:
31992 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
31993 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
31994 case V16QI_FTYPE_V16QI_V16QI:
31995 case V16QI_FTYPE_V8HI_V8HI:
31996 case V8QI_FTYPE_V8QI_V8QI:
31997 case V8QI_FTYPE_V4HI_V4HI:
31998 case V8HI_FTYPE_V8HI_V8HI:
31999 case V8HI_FTYPE_V16QI_V16QI:
32000 case V8HI_FTYPE_V4SI_V4SI:
32001 case V8SF_FTYPE_V8SF_V8SF:
32002 case V8SF_FTYPE_V8SF_V8SI:
32003 case V4SI_FTYPE_V4SI_V4SI:
32004 case V4SI_FTYPE_V8HI_V8HI:
32005 case V4SI_FTYPE_V4SF_V4SF:
32006 case V4SI_FTYPE_V2DF_V2DF:
32007 case V4HI_FTYPE_V4HI_V4HI:
32008 case V4HI_FTYPE_V8QI_V8QI:
32009 case V4HI_FTYPE_V2SI_V2SI:
32010 case V4DF_FTYPE_V4DF_V4DF:
32011 case V4DF_FTYPE_V4DF_V4DI:
32012 case V4SF_FTYPE_V4SF_V4SF:
32013 case V4SF_FTYPE_V4SF_V4SI:
32014 case V4SF_FTYPE_V4SF_V2SI:
32015 case V4SF_FTYPE_V4SF_V2DF:
32016 case V4SF_FTYPE_V4SF_DI:
32017 case V4SF_FTYPE_V4SF_SI:
32018 case V2DI_FTYPE_V2DI_V2DI:
32019 case V2DI_FTYPE_V16QI_V16QI:
32020 case V2DI_FTYPE_V4SI_V4SI:
32021 case V2UDI_FTYPE_V4USI_V4USI:
32022 case V2DI_FTYPE_V2DI_V16QI:
32023 case V2DI_FTYPE_V2DF_V2DF:
32024 case V2SI_FTYPE_V2SI_V2SI:
32025 case V2SI_FTYPE_V4HI_V4HI:
32026 case V2SI_FTYPE_V2SF_V2SF:
32027 case V2DF_FTYPE_V2DF_V2DF:
32028 case V2DF_FTYPE_V2DF_V4SF:
32029 case V2DF_FTYPE_V2DF_V2DI:
32030 case V2DF_FTYPE_V2DF_DI:
32031 case V2DF_FTYPE_V2DF_SI:
32032 case V2SF_FTYPE_V2SF_V2SF:
32033 case V1DI_FTYPE_V1DI_V1DI:
32034 case V1DI_FTYPE_V8QI_V8QI:
32035 case V1DI_FTYPE_V2SI_V2SI:
32036 case V32QI_FTYPE_V16HI_V16HI:
32037 case V16HI_FTYPE_V8SI_V8SI:
32038 case V32QI_FTYPE_V32QI_V32QI:
32039 case V16HI_FTYPE_V32QI_V32QI:
32040 case V16HI_FTYPE_V16HI_V16HI:
32041 case V8SI_FTYPE_V4DF_V4DF:
32042 case V8SI_FTYPE_V8SI_V8SI:
32043 case V8SI_FTYPE_V16HI_V16HI:
32044 case V4DI_FTYPE_V4DI_V4DI:
32045 case V4DI_FTYPE_V8SI_V8SI:
32046 case V4UDI_FTYPE_V8USI_V8USI:
32047 if (comparison == UNKNOWN)
32048 return ix86_expand_binop_builtin (icode, exp, target);
32049 nargs = 2;
32050 break;
32051 case V4SF_FTYPE_V4SF_V4SF_SWAP:
32052 case V2DF_FTYPE_V2DF_V2DF_SWAP:
32053 gcc_assert (comparison != UNKNOWN);
32054 nargs = 2;
32055 swap = true;
32056 break;
32057 case V16HI_FTYPE_V16HI_V8HI_COUNT:
32058 case V16HI_FTYPE_V16HI_SI_COUNT:
32059 case V8SI_FTYPE_V8SI_V4SI_COUNT:
32060 case V8SI_FTYPE_V8SI_SI_COUNT:
32061 case V4DI_FTYPE_V4DI_V2DI_COUNT:
32062 case V4DI_FTYPE_V4DI_INT_COUNT:
32063 case V8HI_FTYPE_V8HI_V8HI_COUNT:
32064 case V8HI_FTYPE_V8HI_SI_COUNT:
32065 case V4SI_FTYPE_V4SI_V4SI_COUNT:
32066 case V4SI_FTYPE_V4SI_SI_COUNT:
32067 case V4HI_FTYPE_V4HI_V4HI_COUNT:
32068 case V4HI_FTYPE_V4HI_SI_COUNT:
32069 case V2DI_FTYPE_V2DI_V2DI_COUNT:
32070 case V2DI_FTYPE_V2DI_SI_COUNT:
32071 case V2SI_FTYPE_V2SI_V2SI_COUNT:
32072 case V2SI_FTYPE_V2SI_SI_COUNT:
32073 case V1DI_FTYPE_V1DI_V1DI_COUNT:
32074 case V1DI_FTYPE_V1DI_SI_COUNT:
32075 nargs = 2;
32076 last_arg_count = true;
32077 break;
32078 case UINT64_FTYPE_UINT64_UINT64:
32079 case UINT_FTYPE_UINT_UINT:
32080 case UINT_FTYPE_UINT_USHORT:
32081 case UINT_FTYPE_UINT_UCHAR:
32082 case UINT16_FTYPE_UINT16_INT:
32083 case UINT8_FTYPE_UINT8_INT:
32084 nargs = 2;
32085 break;
32086 case V2DI_FTYPE_V2DI_INT_CONVERT:
32087 nargs = 2;
32088 rmode = V1TImode;
32089 nargs_constant = 1;
32090 break;
32091 case V4DI_FTYPE_V4DI_INT_CONVERT:
32092 nargs = 2;
32093 rmode = V2TImode;
32094 nargs_constant = 1;
32095 break;
32096 case V8HI_FTYPE_V8HI_INT:
32097 case V8HI_FTYPE_V8SF_INT:
32098 case V8HI_FTYPE_V4SF_INT:
32099 case V8SF_FTYPE_V8SF_INT:
32100 case V4SI_FTYPE_V4SI_INT:
32101 case V4SI_FTYPE_V8SI_INT:
32102 case V4HI_FTYPE_V4HI_INT:
32103 case V4DF_FTYPE_V4DF_INT:
32104 case V4SF_FTYPE_V4SF_INT:
32105 case V4SF_FTYPE_V8SF_INT:
32106 case V2DI_FTYPE_V2DI_INT:
32107 case V2DF_FTYPE_V2DF_INT:
32108 case V2DF_FTYPE_V4DF_INT:
32109 case V16HI_FTYPE_V16HI_INT:
32110 case V8SI_FTYPE_V8SI_INT:
32111 case V4DI_FTYPE_V4DI_INT:
32112 case V2DI_FTYPE_V4DI_INT:
32113 nargs = 2;
32114 nargs_constant = 1;
32115 break;
32116 case V16QI_FTYPE_V16QI_V16QI_V16QI:
32117 case V8SF_FTYPE_V8SF_V8SF_V8SF:
32118 case V4DF_FTYPE_V4DF_V4DF_V4DF:
32119 case V4SF_FTYPE_V4SF_V4SF_V4SF:
32120 case V2DF_FTYPE_V2DF_V2DF_V2DF:
32121 case V32QI_FTYPE_V32QI_V32QI_V32QI:
32122 nargs = 3;
32123 break;
32124 case V32QI_FTYPE_V32QI_V32QI_INT:
32125 case V16HI_FTYPE_V16HI_V16HI_INT:
32126 case V16QI_FTYPE_V16QI_V16QI_INT:
32127 case V4DI_FTYPE_V4DI_V4DI_INT:
32128 case V8HI_FTYPE_V8HI_V8HI_INT:
32129 case V8SI_FTYPE_V8SI_V8SI_INT:
32130 case V8SI_FTYPE_V8SI_V4SI_INT:
32131 case V8SF_FTYPE_V8SF_V8SF_INT:
32132 case V8SF_FTYPE_V8SF_V4SF_INT:
32133 case V4SI_FTYPE_V4SI_V4SI_INT:
32134 case V4DF_FTYPE_V4DF_V4DF_INT:
32135 case V4DF_FTYPE_V4DF_V2DF_INT:
32136 case V4SF_FTYPE_V4SF_V4SF_INT:
32137 case V2DI_FTYPE_V2DI_V2DI_INT:
32138 case V4DI_FTYPE_V4DI_V2DI_INT:
32139 case V2DF_FTYPE_V2DF_V2DF_INT:
32140 nargs = 3;
32141 nargs_constant = 1;
32142 break;
32143 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
32144 nargs = 3;
32145 rmode = V4DImode;
32146 nargs_constant = 1;
32147 break;
32148 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
32149 nargs = 3;
32150 rmode = V2DImode;
32151 nargs_constant = 1;
32152 break;
32153 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
32154 nargs = 3;
32155 rmode = DImode;
32156 nargs_constant = 1;
32157 break;
32158 case V2DI_FTYPE_V2DI_UINT_UINT:
32159 nargs = 3;
32160 nargs_constant = 2;
32161 break;
32162 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
32163 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
32164 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
32165 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
32166 nargs = 4;
32167 nargs_constant = 1;
32168 break;
32169 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
32170 nargs = 4;
32171 nargs_constant = 2;
32172 break;
32173 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
32174 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
32175 nargs = 4;
32176 break;
32177 default:
32178 gcc_unreachable ();
32179 }
32180
32181 gcc_assert (nargs <= ARRAY_SIZE (args));
32182
32183 if (comparison != UNKNOWN)
32184 {
32185 gcc_assert (nargs == 2);
32186 return ix86_expand_sse_compare (d, exp, target, swap);
32187 }
32188
32189 if (rmode == VOIDmode || rmode == tmode)
32190 {
32191 if (optimize
32192 || target == 0
32193 || GET_MODE (target) != tmode
32194 || !insn_p->operand[0].predicate (target, tmode))
32195 target = gen_reg_rtx (tmode);
32196 real_target = target;
32197 }
32198 else
32199 {
32200 real_target = gen_reg_rtx (tmode);
32201 target = simplify_gen_subreg (rmode, real_target, tmode, 0);
32202 }
32203
32204 for (i = 0; i < nargs; i++)
32205 {
32206 tree arg = CALL_EXPR_ARG (exp, i);
32207 rtx op = expand_normal (arg);
32208 enum machine_mode mode = insn_p->operand[i + 1].mode;
32209 bool match = insn_p->operand[i + 1].predicate (op, mode);
32210
32211 if (last_arg_count && (i + 1) == nargs)
32212 {
32213 /* SIMD shift insns take either an 8-bit immediate or a
32214 register as the count. But the builtin functions take an int as
32215 the count. If the count doesn't match, we put it in a register. */
32216 if (!match)
32217 {
32218 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
32219 if (!insn_p->operand[i + 1].predicate (op, mode))
32220 op = copy_to_reg (op);
32221 }
32222 }
32223 else if ((nargs - i) <= nargs_constant)
32224 {
32225 if (!match)
32226 switch (icode)
32227 {
32228 case CODE_FOR_avx2_inserti128:
32229 case CODE_FOR_avx2_extracti128:
32230 error ("the last argument must be an 1-bit immediate");
32231 return const0_rtx;
32232
32233 case CODE_FOR_sse4_1_roundsd:
32234 case CODE_FOR_sse4_1_roundss:
32235
32236 case CODE_FOR_sse4_1_roundpd:
32237 case CODE_FOR_sse4_1_roundps:
32238 case CODE_FOR_avx_roundpd256:
32239 case CODE_FOR_avx_roundps256:
32240
32241 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
32242 case CODE_FOR_sse4_1_roundps_sfix:
32243 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
32244 case CODE_FOR_avx_roundps_sfix256:
32245
32246 case CODE_FOR_sse4_1_blendps:
32247 case CODE_FOR_avx_blendpd256:
32248 case CODE_FOR_avx_vpermilv4df:
32249 error ("the last argument must be a 4-bit immediate");
32250 return const0_rtx;
32251
32252 case CODE_FOR_sse4_1_blendpd:
32253 case CODE_FOR_avx_vpermilv2df:
32254 case CODE_FOR_xop_vpermil2v2df3:
32255 case CODE_FOR_xop_vpermil2v4sf3:
32256 case CODE_FOR_xop_vpermil2v4df3:
32257 case CODE_FOR_xop_vpermil2v8sf3:
32258 error ("the last argument must be a 2-bit immediate");
32259 return const0_rtx;
32260
32261 case CODE_FOR_avx_vextractf128v4df:
32262 case CODE_FOR_avx_vextractf128v8sf:
32263 case CODE_FOR_avx_vextractf128v8si:
32264 case CODE_FOR_avx_vinsertf128v4df:
32265 case CODE_FOR_avx_vinsertf128v8sf:
32266 case CODE_FOR_avx_vinsertf128v8si:
32267 error ("the last argument must be a 1-bit immediate");
32268 return const0_rtx;
32269
32270 case CODE_FOR_avx_vmcmpv2df3:
32271 case CODE_FOR_avx_vmcmpv4sf3:
32272 case CODE_FOR_avx_cmpv2df3:
32273 case CODE_FOR_avx_cmpv4sf3:
32274 case CODE_FOR_avx_cmpv4df3:
32275 case CODE_FOR_avx_cmpv8sf3:
32276 error ("the last argument must be a 5-bit immediate");
32277 return const0_rtx;
32278
32279 default:
32280 switch (nargs_constant)
32281 {
32282 case 2:
32283 if ((nargs - i) == nargs_constant)
32284 {
32285 error ("the next to last argument must be an 8-bit immediate");
32286 break;
32287 }
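		      /* FALLTHRU */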
32288 case 1:
32289 error ("the last argument must be an 8-bit immediate");
32290 break;
32291 default:
32292 gcc_unreachable ();
32293 }
32294 return const0_rtx;
32295 }
32296 }
32297 else
32298 {
32299 if (VECTOR_MODE_P (mode))
32300 op = safe_vector_operand (op, mode);
32301
32302 /* If we aren't optimizing, only allow one memory operand to
32303 be generated. */
32304 if (memory_operand (op, mode))
32305 num_memory++;
32306
32307 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
32308 {
32309 if (optimize || !match || num_memory > 1)
32310 op = copy_to_mode_reg (mode, op);
32311 }
32312 else
32313 {
32314 op = copy_to_reg (op);
32315 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
32316 }
32317 }
32318
32319 args[i].op = op;
32320 args[i].mode = mode;
32321 }
32322
32323 switch (nargs)
32324 {
32325 case 1:
32326 pat = GEN_FCN (icode) (real_target, args[0].op);
32327 break;
32328 case 2:
32329 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
32330 break;
32331 case 3:
32332 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
32333 args[2].op);
32334 break;
32335 case 4:
32336 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
32337 args[2].op, args[3].op);
32338 break;
32339 default:
32340 gcc_unreachable ();
32341 }
32342
32343 if (! pat)
32344 return 0;
32345
32346 emit_insn (pat);
32347 return target;
32348 }
32349
32350 /* Subroutine of ix86_expand_builtin to take care of special insns
32351 with variable number of operands. */
32352
32353 static rtx
32354 ix86_expand_special_args_builtin (const struct builtin_description *d,
32355 tree exp, rtx target)
32356 {
32357 tree arg;
32358 rtx pat, op;
32359 unsigned int i, nargs, arg_adjust, memory;
32360 struct
32361 {
32362 rtx op;
32363 enum machine_mode mode;
32364 } args[3];
32365 enum insn_code icode = d->icode;
32366 bool last_arg_constant = false;
32367 const struct insn_data_d *insn_p = &insn_data[icode];
32368 enum machine_mode tmode = insn_p->operand[0].mode;
32369 enum { load, store } klass;
32370
32371 switch ((enum ix86_builtin_func_type) d->flag)
32372 {
32373 case VOID_FTYPE_VOID:
32374 emit_insn (GEN_FCN (icode) (target));
32375 return 0;
32376 case VOID_FTYPE_UINT64:
32377 case VOID_FTYPE_UNSIGNED:
32378 nargs = 0;
32379 klass = store;
32380 memory = 0;
32381 break;
32382
32383 case INT_FTYPE_VOID:
32384 case UINT64_FTYPE_VOID:
32385 case UNSIGNED_FTYPE_VOID:
32386 nargs = 0;
32387 klass = load;
32388 memory = 0;
32389 break;
32390 case UINT64_FTYPE_PUNSIGNED:
32391 case V2DI_FTYPE_PV2DI:
32392 case V4DI_FTYPE_PV4DI:
32393 case V32QI_FTYPE_PCCHAR:
32394 case V16QI_FTYPE_PCCHAR:
32395 case V8SF_FTYPE_PCV4SF:
32396 case V8SF_FTYPE_PCFLOAT:
32397 case V4SF_FTYPE_PCFLOAT:
32398 case V4DF_FTYPE_PCV2DF:
32399 case V4DF_FTYPE_PCDOUBLE:
32400 case V2DF_FTYPE_PCDOUBLE:
32401 case VOID_FTYPE_PVOID:
32402 nargs = 1;
32403 klass = load;
32404 memory = 0;
32405 break;
32406 case VOID_FTYPE_PV2SF_V4SF:
32407 case VOID_FTYPE_PV4DI_V4DI:
32408 case VOID_FTYPE_PV2DI_V2DI:
32409 case VOID_FTYPE_PCHAR_V32QI:
32410 case VOID_FTYPE_PCHAR_V16QI:
32411 case VOID_FTYPE_PFLOAT_V8SF:
32412 case VOID_FTYPE_PFLOAT_V4SF:
32413 case VOID_FTYPE_PDOUBLE_V4DF:
32414 case VOID_FTYPE_PDOUBLE_V2DF:
32415 case VOID_FTYPE_PLONGLONG_LONGLONG:
32416 case VOID_FTYPE_PULONGLONG_ULONGLONG:
32417 case VOID_FTYPE_PINT_INT:
32418 nargs = 1;
32419 klass = store;
32420 /* Reserve memory operand for target. */
32421 memory = ARRAY_SIZE (args);
32422 break;
32423 case V4SF_FTYPE_V4SF_PCV2SF:
32424 case V2DF_FTYPE_V2DF_PCDOUBLE:
32425 nargs = 2;
32426 klass = load;
32427 memory = 1;
32428 break;
32429 case V8SF_FTYPE_PCV8SF_V8SI:
32430 case V4DF_FTYPE_PCV4DF_V4DI:
32431 case V4SF_FTYPE_PCV4SF_V4SI:
32432 case V2DF_FTYPE_PCV2DF_V2DI:
32433 case V8SI_FTYPE_PCV8SI_V8SI:
32434 case V4DI_FTYPE_PCV4DI_V4DI:
32435 case V4SI_FTYPE_PCV4SI_V4SI:
32436 case V2DI_FTYPE_PCV2DI_V2DI:
32437 nargs = 2;
32438 klass = load;
32439 memory = 0;
32440 break;
32441 case VOID_FTYPE_PV8SF_V8SI_V8SF:
32442 case VOID_FTYPE_PV4DF_V4DI_V4DF:
32443 case VOID_FTYPE_PV4SF_V4SI_V4SF:
32444 case VOID_FTYPE_PV2DF_V2DI_V2DF:
32445 case VOID_FTYPE_PV8SI_V8SI_V8SI:
32446 case VOID_FTYPE_PV4DI_V4DI_V4DI:
32447 case VOID_FTYPE_PV4SI_V4SI_V4SI:
32448 case VOID_FTYPE_PV2DI_V2DI_V2DI:
32449 nargs = 2;
32450 klass = store;
32451 /* Reserve memory operand for target. */
32452 memory = ARRAY_SIZE (args);
32453 break;
32454 case VOID_FTYPE_UINT_UINT_UINT:
32455 case VOID_FTYPE_UINT64_UINT_UINT:
32456 case UCHAR_FTYPE_UINT_UINT_UINT:
32457 case UCHAR_FTYPE_UINT64_UINT_UINT:
32458 nargs = 3;
32459 klass = load;
32460 memory = ARRAY_SIZE (args);
32461 last_arg_constant = true;
32462 break;
32463 default:
32464 gcc_unreachable ();
32465 }
32466
32467 gcc_assert (nargs <= ARRAY_SIZE (args));
32468
32469 if (klass == store)
32470 {
32471 arg = CALL_EXPR_ARG (exp, 0);
32472 op = expand_normal (arg);
32473 gcc_assert (target == 0);
32474 if (memory)
32475 {
32476 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
32477 target = gen_rtx_MEM (tmode, op);
32478 }
32479 else
32480 target = force_reg (tmode, op);
32481 arg_adjust = 1;
32482 }
32483 else
32484 {
32485 arg_adjust = 0;
32486 if (optimize
32487 || target == 0
32488 || !register_operand (target, tmode)
32489 || GET_MODE (target) != tmode)
32490 target = gen_reg_rtx (tmode);
32491 }
32492
32493 for (i = 0; i < nargs; i++)
32494 {
32495 enum machine_mode mode = insn_p->operand[i + 1].mode;
32496 bool match;
32497
32498 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
32499 op = expand_normal (arg);
32500 match = insn_p->operand[i + 1].predicate (op, mode);
32501
32502 if (last_arg_constant && (i + 1) == nargs)
32503 {
32504 if (!match)
32505 {
32506 if (icode == CODE_FOR_lwp_lwpvalsi3
32507 || icode == CODE_FOR_lwp_lwpinssi3
32508 || icode == CODE_FOR_lwp_lwpvaldi3
32509 || icode == CODE_FOR_lwp_lwpinsdi3)
32510 error ("the last argument must be a 32-bit immediate");
32511 else
32512 error ("the last argument must be an 8-bit immediate");
32513 return const0_rtx;
32514 }
32515 }
32516 else
32517 {
32518 if (i == memory)
32519 {
32520 /* This must be the memory operand. */
32521 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
32522 op = gen_rtx_MEM (mode, op);
32523 gcc_assert (GET_MODE (op) == mode
32524 || GET_MODE (op) == VOIDmode);
32525 }
32526 else
32527 {
32528 /* This must be a register. */
32529 if (VECTOR_MODE_P (mode))
32530 op = safe_vector_operand (op, mode);
32531
32532 gcc_assert (GET_MODE (op) == mode
32533 || GET_MODE (op) == VOIDmode);
32534 op = copy_to_mode_reg (mode, op);
32535 }
32536 }
32537
32538 args[i].op = op;
32539 args[i].mode = mode;
32540 }
32541
32542 switch (nargs)
32543 {
32544 case 0:
32545 pat = GEN_FCN (icode) (target);
32546 break;
32547 case 1:
32548 pat = GEN_FCN (icode) (target, args[0].op);
32549 break;
32550 case 2:
32551 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
32552 break;
32553 case 3:
32554 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
32555 break;
32556 default:
32557 gcc_unreachable ();
32558 }
32559
32560 if (! pat)
32561 return 0;
32562 emit_insn (pat);
32563 return klass == store ? 0 : target;
32564 }
32565
32566 /* Return the integer constant in ARG. Constrain it to be in the range
32567 of the subparts of VEC_TYPE; issue an error if not. */
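/* For example, with a V4SF vector type the only valid selectors are 0
   through 3; anything outside that range triggers the error below.  */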
32568
32569 static int
32570 get_element_number (tree vec_type, tree arg)
32571 {
32572 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
32573
32574 if (!tree_fits_uhwi_p (arg)
32575 || (elt = tree_to_uhwi (arg), elt > max))
32576 {
32577 error ("selector must be an integer constant in the range 0..%wi", max);
32578 return 0;
32579 }
32580
32581 return elt;
32582 }
32583
32584 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32585 ix86_expand_vector_init. We DO have language-level syntax for this, in
32586 the form of (type){ init-list }. Except that since we can't place emms
32587 instructions from inside the compiler, we can't allow the use of MMX
32588 registers unless the user explicitly asks for it. So we do *not* define
32589 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
32590 we have builtins invoked by mmintrin.h that give us license to emit
32591 these sorts of instructions. */
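/* As an example of the path described above, _mm_set_pi32 in mmintrin.h
   is expected to expand to __builtin_ia32_vec_init_v2si, which is routed
   through ix86_expand_vec_init_builtin below rather than through a
   vec_init pattern in mmx.md.  */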
32592
32593 static rtx
32594 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
32595 {
32596 enum machine_mode tmode = TYPE_MODE (type);
32597 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
32598 int i, n_elt = GET_MODE_NUNITS (tmode);
32599 rtvec v = rtvec_alloc (n_elt);
32600
32601 gcc_assert (VECTOR_MODE_P (tmode));
32602 gcc_assert (call_expr_nargs (exp) == n_elt);
32603
32604 for (i = 0; i < n_elt; ++i)
32605 {
32606 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
32607 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
32608 }
32609
32610 if (!target || !register_operand (target, tmode))
32611 target = gen_reg_rtx (tmode);
32612
32613 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
32614 return target;
32615 }
32616
32617 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32618 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
32619 had a language-level syntax for referencing vector elements. */
32620
32621 static rtx
32622 ix86_expand_vec_ext_builtin (tree exp, rtx target)
32623 {
32624 enum machine_mode tmode, mode0;
32625 tree arg0, arg1;
32626 int elt;
32627 rtx op0;
32628
32629 arg0 = CALL_EXPR_ARG (exp, 0);
32630 arg1 = CALL_EXPR_ARG (exp, 1);
32631
32632 op0 = expand_normal (arg0);
32633 elt = get_element_number (TREE_TYPE (arg0), arg1);
32634
32635 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32636 mode0 = TYPE_MODE (TREE_TYPE (arg0));
32637 gcc_assert (VECTOR_MODE_P (mode0));
32638
32639 op0 = force_reg (mode0, op0);
32640
32641 if (optimize || !target || !register_operand (target, tmode))
32642 target = gen_reg_rtx (tmode);
32643
32644 ix86_expand_vector_extract (true, target, op0, elt);
32645
32646 return target;
32647 }
32648
32649 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32650 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
32651 a language-level syntax for referencing vector elements. */
32652
32653 static rtx
32654 ix86_expand_vec_set_builtin (tree exp)
32655 {
32656 enum machine_mode tmode, mode1;
32657 tree arg0, arg1, arg2;
32658 int elt;
32659 rtx op0, op1, target;
32660
32661 arg0 = CALL_EXPR_ARG (exp, 0);
32662 arg1 = CALL_EXPR_ARG (exp, 1);
32663 arg2 = CALL_EXPR_ARG (exp, 2);
32664
32665 tmode = TYPE_MODE (TREE_TYPE (arg0));
32666 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32667 gcc_assert (VECTOR_MODE_P (tmode));
32668
32669 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
32670 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
32671 elt = get_element_number (TREE_TYPE (arg0), arg2);
32672
32673 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
32674 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
32675
32676 op0 = force_reg (tmode, op0);
32677 op1 = force_reg (mode1, op1);
32678
32679 /* OP0 is the source of these builtin functions and shouldn't be
32680 modified. Create a copy, use it and return it as target. */
32681 target = gen_reg_rtx (tmode);
32682 emit_move_insn (target, op0);
32683 ix86_expand_vector_set (true, target, op1, elt);
32684
32685 return target;
32686 }
32687
32688 /* Expand an expression EXP that calls a built-in function,
32689 with result going to TARGET if that's convenient
32690 (and in mode MODE if that's convenient).
32691 SUBTARGET may be used as the target for computing one of EXP's operands.
32692 IGNORE is nonzero if the value is to be ignored. */
32693
32694 static rtx
32695 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
32696 enum machine_mode mode, int ignore)
32697 {
32698 const struct builtin_description *d;
32699 size_t i;
32700 enum insn_code icode;
32701 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
32702 tree arg0, arg1, arg2, arg3, arg4;
32703 rtx op0, op1, op2, op3, op4, pat, insn;
32704 enum machine_mode mode0, mode1, mode2, mode3, mode4;
32705 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
32706
32707 /* For CPU builtins that can be folded, fold first and expand the fold. */
32708 switch (fcode)
32709 {
32710 case IX86_BUILTIN_CPU_INIT:
32711 {
32712 /* Make it call __cpu_indicator_init in libgcc. */
32713 tree call_expr, fndecl, type;
32714 type = build_function_type_list (integer_type_node, NULL_TREE);
32715 fndecl = build_fn_decl ("__cpu_indicator_init", type);
32716 call_expr = build_call_expr (fndecl, 0);
32717 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
32718 }
32719 case IX86_BUILTIN_CPU_IS:
32720 case IX86_BUILTIN_CPU_SUPPORTS:
32721 {
32722 tree arg0 = CALL_EXPR_ARG (exp, 0);
32723 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
32724 gcc_assert (fold_expr != NULL_TREE);
32725 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
32726 }
32727 }
32728
32729 /* Determine whether the builtin function is available under the current ISA.
32730 Originally the builtin was not created if it wasn't applicable to the
32731 current ISA based on the command line switches. With function specific
32732 options, we need to check in the context of the function making the call
32733 whether it is supported. */
32734 if (ix86_builtins_isa[fcode].isa
32735 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
32736 {
32737 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
32738 NULL, (enum fpmath_unit) 0, false);
32739
32740 if (!opts)
32741 error ("%qE needs unknown isa option", fndecl);
32742 else
32743 {
32744 gcc_assert (opts != NULL);
32745 error ("%qE needs isa option %s", fndecl, opts);
32746 free (opts);
32747 }
32748 return const0_rtx;
32749 }
32750
32751 switch (fcode)
32752 {
32753 case IX86_BUILTIN_MASKMOVQ:
32754 case IX86_BUILTIN_MASKMOVDQU:
32755 icode = (fcode == IX86_BUILTIN_MASKMOVQ
32756 ? CODE_FOR_mmx_maskmovq
32757 : CODE_FOR_sse2_maskmovdqu);
32758 /* Note the arg order is different from the operand order. */
32759 arg1 = CALL_EXPR_ARG (exp, 0);
32760 arg2 = CALL_EXPR_ARG (exp, 1);
32761 arg0 = CALL_EXPR_ARG (exp, 2);
32762 op0 = expand_normal (arg0);
32763 op1 = expand_normal (arg1);
32764 op2 = expand_normal (arg2);
32765 mode0 = insn_data[icode].operand[0].mode;
32766 mode1 = insn_data[icode].operand[1].mode;
32767 mode2 = insn_data[icode].operand[2].mode;
32768
32769 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32770 op0 = gen_rtx_MEM (mode1, op0);
32771
32772 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32773 op0 = copy_to_mode_reg (mode0, op0);
32774 if (!insn_data[icode].operand[1].predicate (op1, mode1))
32775 op1 = copy_to_mode_reg (mode1, op1);
32776 if (!insn_data[icode].operand[2].predicate (op2, mode2))
32777 op2 = copy_to_mode_reg (mode2, op2);
32778 pat = GEN_FCN (icode) (op0, op1, op2);
32779 if (! pat)
32780 return 0;
32781 emit_insn (pat);
32782 return 0;
32783
32784 case IX86_BUILTIN_LDMXCSR:
32785 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
32786 target = assign_386_stack_local (SImode, SLOT_TEMP);
32787 emit_move_insn (target, op0);
32788 emit_insn (gen_sse_ldmxcsr (target));
32789 return 0;
32790
32791 case IX86_BUILTIN_STMXCSR:
32792 target = assign_386_stack_local (SImode, SLOT_TEMP);
32793 emit_insn (gen_sse_stmxcsr (target));
32794 return copy_to_mode_reg (SImode, target);
32795
32796 case IX86_BUILTIN_CLFLUSH:
32797 arg0 = CALL_EXPR_ARG (exp, 0);
32798 op0 = expand_normal (arg0);
32799 icode = CODE_FOR_sse2_clflush;
32800 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32801 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32802
32803 emit_insn (gen_sse2_clflush (op0));
32804 return 0;
32805
32806 case IX86_BUILTIN_MONITOR:
32807 arg0 = CALL_EXPR_ARG (exp, 0);
32808 arg1 = CALL_EXPR_ARG (exp, 1);
32809 arg2 = CALL_EXPR_ARG (exp, 2);
32810 op0 = expand_normal (arg0);
32811 op1 = expand_normal (arg1);
32812 op2 = expand_normal (arg2);
32813 if (!REG_P (op0))
32814 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32815 if (!REG_P (op1))
32816 op1 = copy_to_mode_reg (SImode, op1);
32817 if (!REG_P (op2))
32818 op2 = copy_to_mode_reg (SImode, op2);
32819 emit_insn (ix86_gen_monitor (op0, op1, op2));
32820 return 0;
32821
32822 case IX86_BUILTIN_MWAIT:
32823 arg0 = CALL_EXPR_ARG (exp, 0);
32824 arg1 = CALL_EXPR_ARG (exp, 1);
32825 op0 = expand_normal (arg0);
32826 op1 = expand_normal (arg1);
32827 if (!REG_P (op0))
32828 op0 = copy_to_mode_reg (SImode, op0);
32829 if (!REG_P (op1))
32830 op1 = copy_to_mode_reg (SImode, op1);
32831 emit_insn (gen_sse3_mwait (op0, op1));
32832 return 0;
32833
32834 case IX86_BUILTIN_VEC_INIT_V2SI:
32835 case IX86_BUILTIN_VEC_INIT_V4HI:
32836 case IX86_BUILTIN_VEC_INIT_V8QI:
32837 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
32838
32839 case IX86_BUILTIN_VEC_EXT_V2DF:
32840 case IX86_BUILTIN_VEC_EXT_V2DI:
32841 case IX86_BUILTIN_VEC_EXT_V4SF:
32842 case IX86_BUILTIN_VEC_EXT_V4SI:
32843 case IX86_BUILTIN_VEC_EXT_V8HI:
32844 case IX86_BUILTIN_VEC_EXT_V2SI:
32845 case IX86_BUILTIN_VEC_EXT_V4HI:
32846 case IX86_BUILTIN_VEC_EXT_V16QI:
32847 return ix86_expand_vec_ext_builtin (exp, target);
32848
32849 case IX86_BUILTIN_VEC_SET_V2DI:
32850 case IX86_BUILTIN_VEC_SET_V4SF:
32851 case IX86_BUILTIN_VEC_SET_V4SI:
32852 case IX86_BUILTIN_VEC_SET_V8HI:
32853 case IX86_BUILTIN_VEC_SET_V4HI:
32854 case IX86_BUILTIN_VEC_SET_V16QI:
32855 return ix86_expand_vec_set_builtin (exp);
32856
32857 case IX86_BUILTIN_INFQ:
32858 case IX86_BUILTIN_HUGE_VALQ:
32859 {
32860 REAL_VALUE_TYPE inf;
32861 rtx tmp;
32862
32863 real_inf (&inf);
32864 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
32865
32866 tmp = validize_mem (force_const_mem (mode, tmp));
32867
32868 if (target == 0)
32869 target = gen_reg_rtx (mode);
32870
32871 emit_move_insn (target, tmp);
32872 return target;
32873 }
32874
32875 case IX86_BUILTIN_RDPMC:
32876 case IX86_BUILTIN_RDTSC:
32877 case IX86_BUILTIN_RDTSCP:
32878
32879 op0 = gen_reg_rtx (DImode);
32880 op1 = gen_reg_rtx (DImode);
32881
32882 if (fcode == IX86_BUILTIN_RDPMC)
32883 {
32884 arg0 = CALL_EXPR_ARG (exp, 0);
32885 op2 = expand_normal (arg0);
32886 if (!register_operand (op2, SImode))
32887 op2 = copy_to_mode_reg (SImode, op2);
32888
32889 insn = (TARGET_64BIT
32890 ? gen_rdpmc_rex64 (op0, op1, op2)
32891 : gen_rdpmc (op0, op2));
32892 emit_insn (insn);
32893 }
32894 else if (fcode == IX86_BUILTIN_RDTSC)
32895 {
32896 insn = (TARGET_64BIT
32897 ? gen_rdtsc_rex64 (op0, op1)
32898 : gen_rdtsc (op0));
32899 emit_insn (insn);
32900 }
32901 else
32902 {
32903 op2 = gen_reg_rtx (SImode);
32904
32905 insn = (TARGET_64BIT
32906 ? gen_rdtscp_rex64 (op0, op1, op2)
32907 : gen_rdtscp (op0, op2));
32908 emit_insn (insn);
32909
32910 arg0 = CALL_EXPR_ARG (exp, 0);
32911 op4 = expand_normal (arg0);
32912 if (!address_operand (op4, VOIDmode))
32913 {
32914 op4 = convert_memory_address (Pmode, op4);
32915 op4 = copy_addr_to_reg (op4);
32916 }
32917 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
32918 }
32919
32920 if (target == 0)
32921 {
32922 	  /* mode is VOIDmode if __builtin_rd* has been called
32923 	     without an lhs.  */
32924 if (mode == VOIDmode)
32925 return target;
32926 target = gen_reg_rtx (mode);
32927 }
32928
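      /* RDTSC/RDTSCP/RDPMC return the counter in two 32-bit halves (op1:op0)
	 on 64-bit targets; combine them below as op0 = (op1 << 32) | op0.  */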
32929 if (TARGET_64BIT)
32930 {
32931 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
32932 op1, 1, OPTAB_DIRECT);
32933 op0 = expand_simple_binop (DImode, IOR, op0, op1,
32934 op0, 1, OPTAB_DIRECT);
32935 }
32936
32937 emit_move_insn (target, op0);
32938 return target;
32939
32940 case IX86_BUILTIN_FXSAVE:
32941 case IX86_BUILTIN_FXRSTOR:
32942 case IX86_BUILTIN_FXSAVE64:
32943 case IX86_BUILTIN_FXRSTOR64:
32944 case IX86_BUILTIN_FNSTENV:
32945 case IX86_BUILTIN_FLDENV:
32946 case IX86_BUILTIN_FNSTSW:
32947 mode0 = BLKmode;
32948 switch (fcode)
32949 {
32950 case IX86_BUILTIN_FXSAVE:
32951 icode = CODE_FOR_fxsave;
32952 break;
32953 case IX86_BUILTIN_FXRSTOR:
32954 icode = CODE_FOR_fxrstor;
32955 break;
32956 case IX86_BUILTIN_FXSAVE64:
32957 icode = CODE_FOR_fxsave64;
32958 break;
32959 case IX86_BUILTIN_FXRSTOR64:
32960 icode = CODE_FOR_fxrstor64;
32961 break;
32962 case IX86_BUILTIN_FNSTENV:
32963 icode = CODE_FOR_fnstenv;
32964 break;
32965 case IX86_BUILTIN_FLDENV:
32966 icode = CODE_FOR_fldenv;
32967 break;
32968 case IX86_BUILTIN_FNSTSW:
32969 icode = CODE_FOR_fnstsw;
32970 mode0 = HImode;
32971 break;
32972 default:
32973 gcc_unreachable ();
32974 }
32975
32976 arg0 = CALL_EXPR_ARG (exp, 0);
32977 op0 = expand_normal (arg0);
32978
32979 if (!address_operand (op0, VOIDmode))
32980 {
32981 op0 = convert_memory_address (Pmode, op0);
32982 op0 = copy_addr_to_reg (op0);
32983 }
32984 op0 = gen_rtx_MEM (mode0, op0);
32985
32986 pat = GEN_FCN (icode) (op0);
32987 if (pat)
32988 emit_insn (pat);
32989 return 0;
32990
32991 case IX86_BUILTIN_XSAVE:
32992 case IX86_BUILTIN_XRSTOR:
32993 case IX86_BUILTIN_XSAVE64:
32994 case IX86_BUILTIN_XRSTOR64:
32995 case IX86_BUILTIN_XSAVEOPT:
32996 case IX86_BUILTIN_XSAVEOPT64:
32997 arg0 = CALL_EXPR_ARG (exp, 0);
32998 arg1 = CALL_EXPR_ARG (exp, 1);
32999 op0 = expand_normal (arg0);
33000 op1 = expand_normal (arg1);
33001
33002 if (!address_operand (op0, VOIDmode))
33003 {
33004 op0 = convert_memory_address (Pmode, op0);
33005 op0 = copy_addr_to_reg (op0);
33006 }
33007 op0 = gen_rtx_MEM (BLKmode, op0);
33008
33009 op1 = force_reg (DImode, op1);
33010
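      /* The 64-bit x?save/x?rstor patterns take the DImode feature mask
	 split into two SImode halves (EDX:EAX); extract the high half with
	 a logical right shift by 32.  */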
33011 if (TARGET_64BIT)
33012 {
33013 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
33014 NULL, 1, OPTAB_DIRECT);
33015 switch (fcode)
33016 {
33017 case IX86_BUILTIN_XSAVE:
33018 icode = CODE_FOR_xsave_rex64;
33019 break;
33020 case IX86_BUILTIN_XRSTOR:
33021 icode = CODE_FOR_xrstor_rex64;
33022 break;
33023 case IX86_BUILTIN_XSAVE64:
33024 icode = CODE_FOR_xsave64;
33025 break;
33026 case IX86_BUILTIN_XRSTOR64:
33027 icode = CODE_FOR_xrstor64;
33028 break;
33029 case IX86_BUILTIN_XSAVEOPT:
33030 icode = CODE_FOR_xsaveopt_rex64;
33031 break;
33032 case IX86_BUILTIN_XSAVEOPT64:
33033 icode = CODE_FOR_xsaveopt64;
33034 break;
33035 default:
33036 gcc_unreachable ();
33037 }
33038
33039 op2 = gen_lowpart (SImode, op2);
33040 op1 = gen_lowpart (SImode, op1);
33041 pat = GEN_FCN (icode) (op0, op1, op2);
33042 }
33043 else
33044 {
33045 switch (fcode)
33046 {
33047 case IX86_BUILTIN_XSAVE:
33048 icode = CODE_FOR_xsave;
33049 break;
33050 case IX86_BUILTIN_XRSTOR:
33051 icode = CODE_FOR_xrstor;
33052 break;
33053 case IX86_BUILTIN_XSAVEOPT:
33054 icode = CODE_FOR_xsaveopt;
33055 break;
33056 default:
33057 gcc_unreachable ();
33058 }
33059 pat = GEN_FCN (icode) (op0, op1);
33060 }
33061
33062 if (pat)
33063 emit_insn (pat);
33064 return 0;
33065
33066 case IX86_BUILTIN_LLWPCB:
33067 arg0 = CALL_EXPR_ARG (exp, 0);
33068 op0 = expand_normal (arg0);
33069 icode = CODE_FOR_lwp_llwpcb;
33070 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
33071 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
33072 emit_insn (gen_lwp_llwpcb (op0));
33073 return 0;
33074
33075 case IX86_BUILTIN_SLWPCB:
33076 icode = CODE_FOR_lwp_slwpcb;
33077 if (!target
33078 || !insn_data[icode].operand[0].predicate (target, Pmode))
33079 target = gen_reg_rtx (Pmode);
33080 emit_insn (gen_lwp_slwpcb (target));
33081 return target;
33082
33083 case IX86_BUILTIN_BEXTRI32:
33084 case IX86_BUILTIN_BEXTRI64:
33085 arg0 = CALL_EXPR_ARG (exp, 0);
33086 arg1 = CALL_EXPR_ARG (exp, 1);
33087 op0 = expand_normal (arg0);
33088 op1 = expand_normal (arg1);
33089 icode = (fcode == IX86_BUILTIN_BEXTRI32
33090 ? CODE_FOR_tbm_bextri_si
33091 : CODE_FOR_tbm_bextri_di);
33092 if (!CONST_INT_P (op1))
33093 {
33094 error ("last argument must be an immediate");
33095 return const0_rtx;
33096 }
33097 else
33098 {
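	  /* The BEXTRI control operand packs the bit-field length in bits
	     15:8 and the starting bit index in bits 7:0; e.g. (illustrative)
	     a control value of 0x0804 extracts 8 bits starting at bit 4.  */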
33099 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
33100 unsigned char lsb_index = INTVAL (op1) & 0xFF;
33101 op1 = GEN_INT (length);
33102 op2 = GEN_INT (lsb_index);
33103 pat = GEN_FCN (icode) (target, op0, op1, op2);
33104 if (pat)
33105 emit_insn (pat);
33106 return target;
33107 }
33108
33109 case IX86_BUILTIN_RDRAND16_STEP:
33110 icode = CODE_FOR_rdrandhi_1;
33111 mode0 = HImode;
33112 goto rdrand_step;
33113
33114 case IX86_BUILTIN_RDRAND32_STEP:
33115 icode = CODE_FOR_rdrandsi_1;
33116 mode0 = SImode;
33117 goto rdrand_step;
33118
33119 case IX86_BUILTIN_RDRAND64_STEP:
33120 icode = CODE_FOR_rdranddi_1;
33121 mode0 = DImode;
33122
33123 rdrand_step:
33124 op0 = gen_reg_rtx (mode0);
33125 emit_insn (GEN_FCN (icode) (op0));
33126
33127 arg0 = CALL_EXPR_ARG (exp, 0);
33128 op1 = expand_normal (arg0);
33129 if (!address_operand (op1, VOIDmode))
33130 {
33131 op1 = convert_memory_address (Pmode, op1);
33132 op1 = copy_addr_to_reg (op1);
33133 }
33134 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
33135
33136 op1 = gen_reg_rtx (SImode);
33137 emit_move_insn (op1, CONST1_RTX (SImode));
33138
33139 /* Emit SImode conditional move. */
33140 if (mode0 == HImode)
33141 {
33142 op2 = gen_reg_rtx (SImode);
33143 emit_insn (gen_zero_extendhisi2 (op2, op0));
33144 }
33145 else if (mode0 == SImode)
33146 op2 = op0;
33147 else
33148 op2 = gen_rtx_SUBREG (SImode, op0, 0);
33149
33150 if (target == 0)
33151 target = gen_reg_rtx (SImode);
33152
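      /* rdrand zeroes the destination when it fails and sets CF on success,
	 so selecting the (possibly zero-extended) result when CF is clear and
	 the constant 1 when CF is set yields the 0/1 success flag.  */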
33153 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
33154 const0_rtx);
33155 emit_insn (gen_rtx_SET (VOIDmode, target,
33156 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
33157 return target;
33158
33159 case IX86_BUILTIN_RDSEED16_STEP:
33160 icode = CODE_FOR_rdseedhi_1;
33161 mode0 = HImode;
33162 goto rdseed_step;
33163
33164 case IX86_BUILTIN_RDSEED32_STEP:
33165 icode = CODE_FOR_rdseedsi_1;
33166 mode0 = SImode;
33167 goto rdseed_step;
33168
33169 case IX86_BUILTIN_RDSEED64_STEP:
33170 icode = CODE_FOR_rdseeddi_1;
33171 mode0 = DImode;
33172
33173 rdseed_step:
33174 op0 = gen_reg_rtx (mode0);
33175 emit_insn (GEN_FCN (icode) (op0));
33176
33177 arg0 = CALL_EXPR_ARG (exp, 0);
33178 op1 = expand_normal (arg0);
33179 if (!address_operand (op1, VOIDmode))
33180 {
33181 op1 = convert_memory_address (Pmode, op1);
33182 op1 = copy_addr_to_reg (op1);
33183 }
33184 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
33185
33186 op2 = gen_reg_rtx (QImode);
33187
33188 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
33189 const0_rtx);
33190 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
33191
33192 if (target == 0)
33193 target = gen_reg_rtx (SImode);
33194
33195 emit_insn (gen_zero_extendqisi2 (target, op2));
33196 return target;
33197
33198 case IX86_BUILTIN_ADDCARRYX32:
33199 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
33200 mode0 = SImode;
33201 goto addcarryx;
33202
33203 case IX86_BUILTIN_ADDCARRYX64:
33204 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
33205 mode0 = DImode;
33206
33207 addcarryx:
33208 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
33209 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
33210 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
33211 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
33212
33213 op0 = gen_reg_rtx (QImode);
33214
33215 /* Generate CF from input operand. */
33216 op1 = expand_normal (arg0);
33217 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
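      /* Adding -1 (0xff) to the carry-in byte sets CF exactly when the
	 byte is nonzero, recreating the incoming carry flag.  */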
33218 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
33219
33220       /* Generate an ADCX (or ADC) instruction to compute X+Y+CF.  */
33221 op2 = expand_normal (arg1);
33222 op3 = expand_normal (arg2);
33223
33224 if (!REG_P (op2))
33225 op2 = copy_to_mode_reg (mode0, op2);
33226 if (!REG_P (op3))
33227 op3 = copy_to_mode_reg (mode0, op3);
33228
33229 op0 = gen_reg_rtx (mode0);
33230
33231 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
33232 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
33233 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
33234
33235 /* Store the result. */
33236 op4 = expand_normal (arg3);
33237 if (!address_operand (op4, VOIDmode))
33238 {
33239 op4 = convert_memory_address (Pmode, op4);
33240 op4 = copy_addr_to_reg (op4);
33241 }
33242 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
33243
33244 /* Return current CF value. */
33245 if (target == 0)
33246 target = gen_reg_rtx (QImode);
33247
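      /* Reuse the LTU-on-CCCmode comparison (i.e. the carry flag) as the
	 QImode carry-out value returned by the builtin.  */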
33248 PUT_MODE (pat, QImode);
33249 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
33250 return target;
33251
33252 case IX86_BUILTIN_GATHERSIV2DF:
33253 icode = CODE_FOR_avx2_gathersiv2df;
33254 goto gather_gen;
33255 case IX86_BUILTIN_GATHERSIV4DF:
33256 icode = CODE_FOR_avx2_gathersiv4df;
33257 goto gather_gen;
33258 case IX86_BUILTIN_GATHERDIV2DF:
33259 icode = CODE_FOR_avx2_gatherdiv2df;
33260 goto gather_gen;
33261 case IX86_BUILTIN_GATHERDIV4DF:
33262 icode = CODE_FOR_avx2_gatherdiv4df;
33263 goto gather_gen;
33264 case IX86_BUILTIN_GATHERSIV4SF:
33265 icode = CODE_FOR_avx2_gathersiv4sf;
33266 goto gather_gen;
33267 case IX86_BUILTIN_GATHERSIV8SF:
33268 icode = CODE_FOR_avx2_gathersiv8sf;
33269 goto gather_gen;
33270 case IX86_BUILTIN_GATHERDIV4SF:
33271 icode = CODE_FOR_avx2_gatherdiv4sf;
33272 goto gather_gen;
33273 case IX86_BUILTIN_GATHERDIV8SF:
33274 icode = CODE_FOR_avx2_gatherdiv8sf;
33275 goto gather_gen;
33276 case IX86_BUILTIN_GATHERSIV2DI:
33277 icode = CODE_FOR_avx2_gathersiv2di;
33278 goto gather_gen;
33279 case IX86_BUILTIN_GATHERSIV4DI:
33280 icode = CODE_FOR_avx2_gathersiv4di;
33281 goto gather_gen;
33282 case IX86_BUILTIN_GATHERDIV2DI:
33283 icode = CODE_FOR_avx2_gatherdiv2di;
33284 goto gather_gen;
33285 case IX86_BUILTIN_GATHERDIV4DI:
33286 icode = CODE_FOR_avx2_gatherdiv4di;
33287 goto gather_gen;
33288 case IX86_BUILTIN_GATHERSIV4SI:
33289 icode = CODE_FOR_avx2_gathersiv4si;
33290 goto gather_gen;
33291 case IX86_BUILTIN_GATHERSIV8SI:
33292 icode = CODE_FOR_avx2_gathersiv8si;
33293 goto gather_gen;
33294 case IX86_BUILTIN_GATHERDIV4SI:
33295 icode = CODE_FOR_avx2_gatherdiv4si;
33296 goto gather_gen;
33297 case IX86_BUILTIN_GATHERDIV8SI:
33298 icode = CODE_FOR_avx2_gatherdiv8si;
33299 goto gather_gen;
33300 case IX86_BUILTIN_GATHERALTSIV4DF:
33301 icode = CODE_FOR_avx2_gathersiv4df;
33302 goto gather_gen;
33303 case IX86_BUILTIN_GATHERALTDIV8SF:
33304 icode = CODE_FOR_avx2_gatherdiv8sf;
33305 goto gather_gen;
33306 case IX86_BUILTIN_GATHERALTSIV4DI:
33307 icode = CODE_FOR_avx2_gathersiv4di;
33308 goto gather_gen;
33309 case IX86_BUILTIN_GATHERALTDIV8SI:
33310 icode = CODE_FOR_avx2_gatherdiv8si;
33311 goto gather_gen;
33312
33313 gather_gen:
33314 arg0 = CALL_EXPR_ARG (exp, 0);
33315 arg1 = CALL_EXPR_ARG (exp, 1);
33316 arg2 = CALL_EXPR_ARG (exp, 2);
33317 arg3 = CALL_EXPR_ARG (exp, 3);
33318 arg4 = CALL_EXPR_ARG (exp, 4);
33319 op0 = expand_normal (arg0);
33320 op1 = expand_normal (arg1);
33321 op2 = expand_normal (arg2);
33322 op3 = expand_normal (arg3);
33323 op4 = expand_normal (arg4);
33324 /* Note the arg order is different from the operand order. */
33325 mode0 = insn_data[icode].operand[1].mode;
33326 mode2 = insn_data[icode].operand[3].mode;
33327 mode3 = insn_data[icode].operand[4].mode;
33328 mode4 = insn_data[icode].operand[5].mode;
33329
33330 if (target == NULL_RTX
33331 || GET_MODE (target) != insn_data[icode].operand[0].mode)
33332 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
33333 else
33334 subtarget = target;
33335
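      /* The GATHERALT variants pair a wider index or result vector with a
	 narrower gather pattern; extract the low half of the over-wide
	 operand(s) first so the chosen icode's operand modes match.  */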
33336 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
33337 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
33338 {
33339 rtx half = gen_reg_rtx (V4SImode);
33340 if (!nonimmediate_operand (op2, V8SImode))
33341 op2 = copy_to_mode_reg (V8SImode, op2);
33342 emit_insn (gen_vec_extract_lo_v8si (half, op2));
33343 op2 = half;
33344 }
33345 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
33346 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
33347 {
33348 rtx (*gen) (rtx, rtx);
33349 rtx half = gen_reg_rtx (mode0);
33350 if (mode0 == V4SFmode)
33351 gen = gen_vec_extract_lo_v8sf;
33352 else
33353 gen = gen_vec_extract_lo_v8si;
33354 if (!nonimmediate_operand (op0, GET_MODE (op0)))
33355 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
33356 emit_insn (gen (half, op0));
33357 op0 = half;
33358 if (!nonimmediate_operand (op3, GET_MODE (op3)))
33359 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
33360 emit_insn (gen (half, op3));
33361 op3 = half;
33362 }
33363
33364       /* Force the memory operand to use only a base register here.  We
33365 	 don't want to do this for the memory operands of other builtin
33366 	 functions.  */
33367 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
33368
33369 if (!insn_data[icode].operand[1].predicate (op0, mode0))
33370 op0 = copy_to_mode_reg (mode0, op0);
33371 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
33372 op1 = copy_to_mode_reg (Pmode, op1);
33373 if (!insn_data[icode].operand[3].predicate (op2, mode2))
33374 op2 = copy_to_mode_reg (mode2, op2);
33375 if (!insn_data[icode].operand[4].predicate (op3, mode3))
33376 op3 = copy_to_mode_reg (mode3, op3);
33377 if (!insn_data[icode].operand[5].predicate (op4, mode4))
33378 {
33379 error ("last argument must be scale 1, 2, 4, 8");
33380 return const0_rtx;
33381 }
33382
33383 /* Optimize. If mask is known to have all high bits set,
33384 replace op0 with pc_rtx to signal that the instruction
33385 overwrites the whole destination and doesn't use its
33386 previous contents. */
33387 if (optimize)
33388 {
33389 if (TREE_CODE (arg3) == VECTOR_CST)
33390 {
33391 unsigned int negative = 0;
33392 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
33393 {
33394 tree cst = VECTOR_CST_ELT (arg3, i);
33395 if (TREE_CODE (cst) == INTEGER_CST
33396 && tree_int_cst_sign_bit (cst))
33397 negative++;
33398 else if (TREE_CODE (cst) == REAL_CST
33399 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
33400 negative++;
33401 }
33402 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
33403 op0 = pc_rtx;
33404 }
33405 else if (TREE_CODE (arg3) == SSA_NAME)
33406 {
33407 /* Recognize also when mask is like:
33408 __v2df src = _mm_setzero_pd ();
33409 __v2df mask = _mm_cmpeq_pd (src, src);
33410 or
33411 __v8sf src = _mm256_setzero_ps ();
33412 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
33413 as that is a cheaper way to load all ones into
33414 a register than having to load a constant from
33415 memory. */
33416 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
33417 if (is_gimple_call (def_stmt))
33418 {
33419 tree fndecl = gimple_call_fndecl (def_stmt);
33420 if (fndecl
33421 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33422 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
33423 {
33424 case IX86_BUILTIN_CMPPD:
33425 case IX86_BUILTIN_CMPPS:
33426 case IX86_BUILTIN_CMPPD256:
33427 case IX86_BUILTIN_CMPPS256:
33428 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
33429 break;
33430 /* FALLTHRU */
33431 case IX86_BUILTIN_CMPEQPD:
33432 case IX86_BUILTIN_CMPEQPS:
33433 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
33434 && initializer_zerop (gimple_call_arg (def_stmt,
33435 1)))
33436 op0 = pc_rtx;
33437 break;
33438 default:
33439 break;
33440 }
33441 }
33442 }
33443 }
33444
33445 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
33446 if (! pat)
33447 return const0_rtx;
33448 emit_insn (pat);
33449
33450 if (fcode == IX86_BUILTIN_GATHERDIV8SF
33451 || fcode == IX86_BUILTIN_GATHERDIV8SI)
33452 {
33453 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
33454 ? V4SFmode : V4SImode;
33455 if (target == NULL_RTX)
33456 target = gen_reg_rtx (tmode);
33457 if (tmode == V4SFmode)
33458 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
33459 else
33460 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
33461 }
33462 else
33463 target = subtarget;
33464
33465 return target;
33466
33467 case IX86_BUILTIN_XABORT:
33468 icode = CODE_FOR_xabort;
33469 arg0 = CALL_EXPR_ARG (exp, 0);
33470 op0 = expand_normal (arg0);
33471 mode0 = insn_data[icode].operand[0].mode;
33472 if (!insn_data[icode].operand[0].predicate (op0, mode0))
33473 {
33474 error ("the xabort's argument must be an 8-bit immediate");
33475 return const0_rtx;
33476 }
33477 emit_insn (gen_xabort (op0));
33478 return 0;
33479
33480 default:
33481 break;
33482 }
33483
33484 for (i = 0, d = bdesc_special_args;
33485 i < ARRAY_SIZE (bdesc_special_args);
33486 i++, d++)
33487 if (d->code == fcode)
33488 return ix86_expand_special_args_builtin (d, exp, target);
33489
33490 for (i = 0, d = bdesc_args;
33491 i < ARRAY_SIZE (bdesc_args);
33492 i++, d++)
33493 if (d->code == fcode)
33494 switch (fcode)
33495 {
33496 case IX86_BUILTIN_FABSQ:
33497 case IX86_BUILTIN_COPYSIGNQ:
33498 if (!TARGET_SSE)
33499 /* Emit a normal call if SSE isn't available. */
33500 return expand_call (exp, target, ignore);
33501 default:
33502 return ix86_expand_args_builtin (d, exp, target);
33503 }
33504
33505 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
33506 if (d->code == fcode)
33507 return ix86_expand_sse_comi (d, exp, target);
33508
33509 for (i = 0, d = bdesc_pcmpestr;
33510 i < ARRAY_SIZE (bdesc_pcmpestr);
33511 i++, d++)
33512 if (d->code == fcode)
33513 return ix86_expand_sse_pcmpestr (d, exp, target);
33514
33515 for (i = 0, d = bdesc_pcmpistr;
33516 i < ARRAY_SIZE (bdesc_pcmpistr);
33517 i++, d++)
33518 if (d->code == fcode)
33519 return ix86_expand_sse_pcmpistr (d, exp, target);
33520
33521 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33522 if (d->code == fcode)
33523 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
33524 (enum ix86_builtin_func_type)
33525 d->flag, d->comparison);
33526
33527 gcc_unreachable ();
33528 }
33529
33530 /* Returns a function decl for a vectorized version of the builtin function
33531 FNDECL, with result vector type TYPE_OUT and input vector type TYPE_IN,
33532 or NULL_TREE if it is not available.  */
33533
33534 static tree
33535 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
33536 tree type_in)
33537 {
33538 enum machine_mode in_mode, out_mode;
33539 int in_n, out_n;
33540 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
33541
33542 if (TREE_CODE (type_out) != VECTOR_TYPE
33543 || TREE_CODE (type_in) != VECTOR_TYPE
33544 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
33545 return NULL_TREE;
33546
33547 out_mode = TYPE_MODE (TREE_TYPE (type_out));
33548 out_n = TYPE_VECTOR_SUBPARTS (type_out);
33549 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33550 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33551
33552 switch (fn)
33553 {
33554 case BUILT_IN_SQRT:
33555 if (out_mode == DFmode && in_mode == DFmode)
33556 {
33557 if (out_n == 2 && in_n == 2)
33558 return ix86_builtins[IX86_BUILTIN_SQRTPD];
33559 else if (out_n == 4 && in_n == 4)
33560 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
33561 }
33562 break;
33563
33564 case BUILT_IN_SQRTF:
33565 if (out_mode == SFmode && in_mode == SFmode)
33566 {
33567 if (out_n == 4 && in_n == 4)
33568 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
33569 else if (out_n == 8 && in_n == 8)
33570 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
33571 }
33572 break;
33573
33574 case BUILT_IN_IFLOOR:
33575 case BUILT_IN_LFLOOR:
33576 case BUILT_IN_LLFLOOR:
33577 /* The round insn does not trap on denormals. */
33578 if (flag_trapping_math || !TARGET_ROUND)
33579 break;
33580
33581 if (out_mode == SImode && in_mode == DFmode)
33582 {
33583 if (out_n == 4 && in_n == 2)
33584 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
33585 else if (out_n == 8 && in_n == 4)
33586 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
33587 }
33588 break;
33589
33590 case BUILT_IN_IFLOORF:
33591 case BUILT_IN_LFLOORF:
33592 case BUILT_IN_LLFLOORF:
33593 /* The round insn does not trap on denormals. */
33594 if (flag_trapping_math || !TARGET_ROUND)
33595 break;
33596
33597 if (out_mode == SImode && in_mode == SFmode)
33598 {
33599 if (out_n == 4 && in_n == 4)
33600 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
33601 else if (out_n == 8 && in_n == 8)
33602 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
33603 }
33604 break;
33605
33606 case BUILT_IN_ICEIL:
33607 case BUILT_IN_LCEIL:
33608 case BUILT_IN_LLCEIL:
33609 /* The round insn does not trap on denormals. */
33610 if (flag_trapping_math || !TARGET_ROUND)
33611 break;
33612
33613 if (out_mode == SImode && in_mode == DFmode)
33614 {
33615 if (out_n == 4 && in_n == 2)
33616 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
33617 else if (out_n == 8 && in_n == 4)
33618 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
33619 }
33620 break;
33621
33622 case BUILT_IN_ICEILF:
33623 case BUILT_IN_LCEILF:
33624 case BUILT_IN_LLCEILF:
33625 /* The round insn does not trap on denormals. */
33626 if (flag_trapping_math || !TARGET_ROUND)
33627 break;
33628
33629 if (out_mode == SImode && in_mode == SFmode)
33630 {
33631 if (out_n == 4 && in_n == 4)
33632 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
33633 else if (out_n == 8 && in_n == 8)
33634 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
33635 }
33636 break;
33637
33638 case BUILT_IN_IRINT:
33639 case BUILT_IN_LRINT:
33640 case BUILT_IN_LLRINT:
33641 if (out_mode == SImode && in_mode == DFmode)
33642 {
33643 if (out_n == 4 && in_n == 2)
33644 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
33645 else if (out_n == 8 && in_n == 4)
33646 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
33647 }
33648 break;
33649
33650 case BUILT_IN_IRINTF:
33651 case BUILT_IN_LRINTF:
33652 case BUILT_IN_LLRINTF:
33653 if (out_mode == SImode && in_mode == SFmode)
33654 {
33655 if (out_n == 4 && in_n == 4)
33656 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
33657 else if (out_n == 8 && in_n == 8)
33658 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
33659 }
33660 break;
33661
33662 case BUILT_IN_IROUND:
33663 case BUILT_IN_LROUND:
33664 case BUILT_IN_LLROUND:
33665 /* The round insn does not trap on denormals. */
33666 if (flag_trapping_math || !TARGET_ROUND)
33667 break;
33668
33669 if (out_mode == SImode && in_mode == DFmode)
33670 {
33671 if (out_n == 4 && in_n == 2)
33672 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
33673 else if (out_n == 8 && in_n == 4)
33674 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
33675 }
33676 break;
33677
33678 case BUILT_IN_IROUNDF:
33679 case BUILT_IN_LROUNDF:
33680 case BUILT_IN_LLROUNDF:
33681 /* The round insn does not trap on denormals. */
33682 if (flag_trapping_math || !TARGET_ROUND)
33683 break;
33684
33685 if (out_mode == SImode && in_mode == SFmode)
33686 {
33687 if (out_n == 4 && in_n == 4)
33688 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
33689 else if (out_n == 8 && in_n == 8)
33690 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
33691 }
33692 break;
33693
33694 case BUILT_IN_COPYSIGN:
33695 if (out_mode == DFmode && in_mode == DFmode)
33696 {
33697 if (out_n == 2 && in_n == 2)
33698 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
33699 else if (out_n == 4 && in_n == 4)
33700 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
33701 }
33702 break;
33703
33704 case BUILT_IN_COPYSIGNF:
33705 if (out_mode == SFmode && in_mode == SFmode)
33706 {
33707 if (out_n == 4 && in_n == 4)
33708 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
33709 else if (out_n == 8 && in_n == 8)
33710 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
33711 }
33712 break;
33713
33714 case BUILT_IN_FLOOR:
33715 /* The round insn does not trap on denormals. */
33716 if (flag_trapping_math || !TARGET_ROUND)
33717 break;
33718
33719 if (out_mode == DFmode && in_mode == DFmode)
33720 {
33721 if (out_n == 2 && in_n == 2)
33722 return ix86_builtins[IX86_BUILTIN_FLOORPD];
33723 else if (out_n == 4 && in_n == 4)
33724 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
33725 }
33726 break;
33727
33728 case BUILT_IN_FLOORF:
33729 /* The round insn does not trap on denormals. */
33730 if (flag_trapping_math || !TARGET_ROUND)
33731 break;
33732
33733 if (out_mode == SFmode && in_mode == SFmode)
33734 {
33735 if (out_n == 4 && in_n == 4)
33736 return ix86_builtins[IX86_BUILTIN_FLOORPS];
33737 else if (out_n == 8 && in_n == 8)
33738 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
33739 }
33740 break;
33741
33742 case BUILT_IN_CEIL:
33743 /* The round insn does not trap on denormals. */
33744 if (flag_trapping_math || !TARGET_ROUND)
33745 break;
33746
33747 if (out_mode == DFmode && in_mode == DFmode)
33748 {
33749 if (out_n == 2 && in_n == 2)
33750 return ix86_builtins[IX86_BUILTIN_CEILPD];
33751 else if (out_n == 4 && in_n == 4)
33752 return ix86_builtins[IX86_BUILTIN_CEILPD256];
33753 }
33754 break;
33755
33756 case BUILT_IN_CEILF:
33757 /* The round insn does not trap on denormals. */
33758 if (flag_trapping_math || !TARGET_ROUND)
33759 break;
33760
33761 if (out_mode == SFmode && in_mode == SFmode)
33762 {
33763 if (out_n == 4 && in_n == 4)
33764 return ix86_builtins[IX86_BUILTIN_CEILPS];
33765 else if (out_n == 8 && in_n == 8)
33766 return ix86_builtins[IX86_BUILTIN_CEILPS256];
33767 }
33768 break;
33769
33770 case BUILT_IN_TRUNC:
33771 /* The round insn does not trap on denormals. */
33772 if (flag_trapping_math || !TARGET_ROUND)
33773 break;
33774
33775 if (out_mode == DFmode && in_mode == DFmode)
33776 {
33777 if (out_n == 2 && in_n == 2)
33778 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
33779 else if (out_n == 4 && in_n == 4)
33780 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
33781 }
33782 break;
33783
33784 case BUILT_IN_TRUNCF:
33785 /* The round insn does not trap on denormals. */
33786 if (flag_trapping_math || !TARGET_ROUND)
33787 break;
33788
33789 if (out_mode == SFmode && in_mode == SFmode)
33790 {
33791 if (out_n == 4 && in_n == 4)
33792 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
33793 else if (out_n == 8 && in_n == 8)
33794 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
33795 }
33796 break;
33797
33798 case BUILT_IN_RINT:
33799 /* The round insn does not trap on denormals. */
33800 if (flag_trapping_math || !TARGET_ROUND)
33801 break;
33802
33803 if (out_mode == DFmode && in_mode == DFmode)
33804 {
33805 if (out_n == 2 && in_n == 2)
33806 return ix86_builtins[IX86_BUILTIN_RINTPD];
33807 else if (out_n == 4 && in_n == 4)
33808 return ix86_builtins[IX86_BUILTIN_RINTPD256];
33809 }
33810 break;
33811
33812 case BUILT_IN_RINTF:
33813 /* The round insn does not trap on denormals. */
33814 if (flag_trapping_math || !TARGET_ROUND)
33815 break;
33816
33817 if (out_mode == SFmode && in_mode == SFmode)
33818 {
33819 if (out_n == 4 && in_n == 4)
33820 return ix86_builtins[IX86_BUILTIN_RINTPS];
33821 else if (out_n == 8 && in_n == 8)
33822 return ix86_builtins[IX86_BUILTIN_RINTPS256];
33823 }
33824 break;
33825
33826 case BUILT_IN_ROUND:
33827 /* The round insn does not trap on denormals. */
33828 if (flag_trapping_math || !TARGET_ROUND)
33829 break;
33830
33831 if (out_mode == DFmode && in_mode == DFmode)
33832 {
33833 if (out_n == 2 && in_n == 2)
33834 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
33835 else if (out_n == 4 && in_n == 4)
33836 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
33837 }
33838 break;
33839
33840 case BUILT_IN_ROUNDF:
33841 /* The round insn does not trap on denormals. */
33842 if (flag_trapping_math || !TARGET_ROUND)
33843 break;
33844
33845 if (out_mode == SFmode && in_mode == SFmode)
33846 {
33847 if (out_n == 4 && in_n == 4)
33848 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
33849 else if (out_n == 8 && in_n == 8)
33850 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
33851 }
33852 break;
33853
33854 case BUILT_IN_FMA:
33855 if (out_mode == DFmode && in_mode == DFmode)
33856 {
33857 if (out_n == 2 && in_n == 2)
33858 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
33859 if (out_n == 4 && in_n == 4)
33860 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
33861 }
33862 break;
33863
33864 case BUILT_IN_FMAF:
33865 if (out_mode == SFmode && in_mode == SFmode)
33866 {
33867 if (out_n == 4 && in_n == 4)
33868 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
33869 if (out_n == 8 && in_n == 8)
33870 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
33871 }
33872 break;
33873
33874 default:
33875 break;
33876 }
33877
33878 /* Dispatch to a handler for a vectorization library. */
33879 if (ix86_veclib_handler)
33880 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
33881 type_in);
33882
33883 return NULL_TREE;
33884 }
33885
33886 /* Handler for an SVML-style interface to
33887 a library with vectorized intrinsics. */
33888
33889 static tree
33890 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
33891 {
33892 char name[20];
33893 tree fntype, new_fndecl, args;
33894 unsigned arity;
33895 const char *bname;
33896 enum machine_mode el_mode, in_mode;
33897 int n, in_n;
33898
33899   /* The SVML library is suitable for unsafe math only.  */
33900 if (!flag_unsafe_math_optimizations)
33901 return NULL_TREE;
33902
33903 el_mode = TYPE_MODE (TREE_TYPE (type_out));
33904 n = TYPE_VECTOR_SUBPARTS (type_out);
33905 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33906 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33907 if (el_mode != in_mode
33908 || n != in_n)
33909 return NULL_TREE;
33910
33911 switch (fn)
33912 {
33913 case BUILT_IN_EXP:
33914 case BUILT_IN_LOG:
33915 case BUILT_IN_LOG10:
33916 case BUILT_IN_POW:
33917 case BUILT_IN_TANH:
33918 case BUILT_IN_TAN:
33919 case BUILT_IN_ATAN:
33920 case BUILT_IN_ATAN2:
33921 case BUILT_IN_ATANH:
33922 case BUILT_IN_CBRT:
33923 case BUILT_IN_SINH:
33924 case BUILT_IN_SIN:
33925 case BUILT_IN_ASINH:
33926 case BUILT_IN_ASIN:
33927 case BUILT_IN_COSH:
33928 case BUILT_IN_COS:
33929 case BUILT_IN_ACOSH:
33930 case BUILT_IN_ACOS:
33931 if (el_mode != DFmode || n != 2)
33932 return NULL_TREE;
33933 break;
33934
33935 case BUILT_IN_EXPF:
33936 case BUILT_IN_LOGF:
33937 case BUILT_IN_LOG10F:
33938 case BUILT_IN_POWF:
33939 case BUILT_IN_TANHF:
33940 case BUILT_IN_TANF:
33941 case BUILT_IN_ATANF:
33942 case BUILT_IN_ATAN2F:
33943 case BUILT_IN_ATANHF:
33944 case BUILT_IN_CBRTF:
33945 case BUILT_IN_SINHF:
33946 case BUILT_IN_SINF:
33947 case BUILT_IN_ASINHF:
33948 case BUILT_IN_ASINF:
33949 case BUILT_IN_COSHF:
33950 case BUILT_IN_COSF:
33951 case BUILT_IN_ACOSHF:
33952 case BUILT_IN_ACOSF:
33953 if (el_mode != SFmode || n != 4)
33954 return NULL_TREE;
33955 break;
33956
33957 default:
33958 return NULL_TREE;
33959 }
33960
33961 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
33962
33963 if (fn == BUILT_IN_LOGF)
33964 strcpy (name, "vmlsLn4");
33965 else if (fn == BUILT_IN_LOG)
33966 strcpy (name, "vmldLn2");
33967 else if (n == 4)
33968 {
33969 sprintf (name, "vmls%s", bname+10);
33970 name[strlen (name)-1] = '4';
33971 }
33972 else
33973 sprintf (name, "vmld%s2", bname+10);
33974
33975 /* Convert to uppercase. */
33976 name[4] &= ~0x20;
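  /* For example, BUILT_IN_SINF ("__builtin_sinf") becomes "vmlssinf",
     the trailing character is replaced by the vector width to give
     "vmlssin4", and uppercasing name[4] yields the SVML entry point
     "vmlsSin4".  */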
33977
33978 arity = 0;
33979 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
33980 args;
33981 args = TREE_CHAIN (args))
33982 arity++;
33983
33984 if (arity == 1)
33985 fntype = build_function_type_list (type_out, type_in, NULL);
33986 else
33987 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
33988
33989 /* Build a function declaration for the vectorized function. */
33990 new_fndecl = build_decl (BUILTINS_LOCATION,
33991 FUNCTION_DECL, get_identifier (name), fntype);
33992 TREE_PUBLIC (new_fndecl) = 1;
33993 DECL_EXTERNAL (new_fndecl) = 1;
33994 DECL_IS_NOVOPS (new_fndecl) = 1;
33995 TREE_READONLY (new_fndecl) = 1;
33996
33997 return new_fndecl;
33998 }
33999
34000 /* Handler for an ACML-style interface to
34001 a library with vectorized intrinsics. */
34002
34003 static tree
34004 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
34005 {
34006 char name[20] = "__vr.._";
34007 tree fntype, new_fndecl, args;
34008 unsigned arity;
34009 const char *bname;
34010 enum machine_mode el_mode, in_mode;
34011 int n, in_n;
34012
34013   /* The ACML is 64-bit only and suitable for unsafe math only, as
34014      it does not correctly support parts of IEEE (such as denormals)
34015      with the required precision.  */
34016 if (!TARGET_64BIT
34017 || !flag_unsafe_math_optimizations)
34018 return NULL_TREE;
34019
34020 el_mode = TYPE_MODE (TREE_TYPE (type_out));
34021 n = TYPE_VECTOR_SUBPARTS (type_out);
34022 in_mode = TYPE_MODE (TREE_TYPE (type_in));
34023 in_n = TYPE_VECTOR_SUBPARTS (type_in);
34024 if (el_mode != in_mode
34025 || n != in_n)
34026 return NULL_TREE;
34027
34028 switch (fn)
34029 {
34030 case BUILT_IN_SIN:
34031 case BUILT_IN_COS:
34032 case BUILT_IN_EXP:
34033 case BUILT_IN_LOG:
34034 case BUILT_IN_LOG2:
34035 case BUILT_IN_LOG10:
34036 name[4] = 'd';
34037 name[5] = '2';
34038 if (el_mode != DFmode
34039 || n != 2)
34040 return NULL_TREE;
34041 break;
34042
34043 case BUILT_IN_SINF:
34044 case BUILT_IN_COSF:
34045 case BUILT_IN_EXPF:
34046 case BUILT_IN_POWF:
34047 case BUILT_IN_LOGF:
34048 case BUILT_IN_LOG2F:
34049 case BUILT_IN_LOG10F:
34050 name[4] = 's';
34051 name[5] = '4';
34052 if (el_mode != SFmode
34053 || n != 4)
34054 return NULL_TREE;
34055 break;
34056
34057 default:
34058 return NULL_TREE;
34059 }
34060
34061 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
34062 sprintf (name + 7, "%s", bname+10);
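  /* For example, BUILT_IN_SINF yields the ACML entry point "__vrs4_sinf"
     and BUILT_IN_SIN yields "__vrd2_sin".  */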
34063
34064 arity = 0;
34065 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
34066 args;
34067 args = TREE_CHAIN (args))
34068 arity++;
34069
34070 if (arity == 1)
34071 fntype = build_function_type_list (type_out, type_in, NULL);
34072 else
34073 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
34074
34075 /* Build a function declaration for the vectorized function. */
34076 new_fndecl = build_decl (BUILTINS_LOCATION,
34077 FUNCTION_DECL, get_identifier (name), fntype);
34078 TREE_PUBLIC (new_fndecl) = 1;
34079 DECL_EXTERNAL (new_fndecl) = 1;
34080 DECL_IS_NOVOPS (new_fndecl) = 1;
34081 TREE_READONLY (new_fndecl) = 1;
34082
34083 return new_fndecl;
34084 }
34085
34086 /* Returns a decl of a function that implements a gather load with
34087 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
34088 Return NULL_TREE if it is not available.  */
34089
34090 static tree
34091 ix86_vectorize_builtin_gather (const_tree mem_vectype,
34092 const_tree index_type, int scale)
34093 {
34094 bool si;
34095 enum ix86_builtins code;
34096
34097 if (! TARGET_AVX2)
34098 return NULL_TREE;
34099
34100 if ((TREE_CODE (index_type) != INTEGER_TYPE
34101 && !POINTER_TYPE_P (index_type))
34102 || (TYPE_MODE (index_type) != SImode
34103 && TYPE_MODE (index_type) != DImode))
34104 return NULL_TREE;
34105
34106 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
34107 return NULL_TREE;
34108
34109 /* v*gather* insn sign extends index to pointer mode. */
34110 if (TYPE_PRECISION (index_type) < POINTER_SIZE
34111 && TYPE_UNSIGNED (index_type))
34112 return NULL_TREE;
34113
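  /* The hardware scale factor must be 1, 2, 4 or 8, i.e. a power of two
     no larger than 8.  */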
34114 if (scale <= 0
34115 || scale > 8
34116 || (scale & (scale - 1)) != 0)
34117 return NULL_TREE;
34118
34119 si = TYPE_MODE (index_type) == SImode;
34120 switch (TYPE_MODE (mem_vectype))
34121 {
34122 case V2DFmode:
34123 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
34124 break;
34125 case V4DFmode:
34126 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
34127 break;
34128 case V2DImode:
34129 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
34130 break;
34131 case V4DImode:
34132 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
34133 break;
34134 case V4SFmode:
34135 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
34136 break;
34137 case V8SFmode:
34138 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
34139 break;
34140 case V4SImode:
34141 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
34142 break;
34143 case V8SImode:
34144 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
34145 break;
34146 default:
34147 return NULL_TREE;
34148 }
34149
34150 return ix86_builtins[code];
34151 }
34152
34153 /* Returns a decl for a target-specific builtin that implements the
34154 reciprocal of the function FN, or NULL_TREE if not available.  */
34155
34156 static tree
34157 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
34158 bool sqrt ATTRIBUTE_UNUSED)
34159 {
34160 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
34161 && flag_finite_math_only && !flag_trapping_math
34162 && flag_unsafe_math_optimizations))
34163 return NULL_TREE;
34164
34165 if (md_fn)
34166 /* Machine dependent builtins. */
34167 switch (fn)
34168 {
34169 /* Vectorized version of sqrt to rsqrt conversion. */
34170 case IX86_BUILTIN_SQRTPS_NR:
34171 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
34172
34173 case IX86_BUILTIN_SQRTPS_NR256:
34174 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
34175
34176 default:
34177 return NULL_TREE;
34178 }
34179 else
34180 /* Normal builtins. */
34181 switch (fn)
34182 {
34183 /* Sqrt to rsqrt conversion. */
34184 case BUILT_IN_SQRTF:
34185 return ix86_builtins[IX86_BUILTIN_RSQRTF];
34186
34187 default:
34188 return NULL_TREE;
34189 }
34190 }
34191 \f
34192 /* Helper for avx_vpermilps256_operand et al. This is also used by
34193 the expansion functions to turn the parallel back into a mask.
34194 The return value is 0 for no match and the imm8+1 for a match. */
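/* For example (V4SFmode), each selector occupies two bits of the immediate,
   so the parallel (0 3 2 1) reconstructs imm8 0x6c and this function
   returns 0x6d.  */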
34195
34196 int
34197 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
34198 {
34199 unsigned i, nelt = GET_MODE_NUNITS (mode);
34200 unsigned mask = 0;
34201 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
34202
34203 if (XVECLEN (par, 0) != (int) nelt)
34204 return 0;
34205
34206 /* Validate that all of the elements are constants, and not totally
34207 out of range. Copy the data into an integral array to make the
34208 subsequent checks easier. */
34209 for (i = 0; i < nelt; ++i)
34210 {
34211 rtx er = XVECEXP (par, 0, i);
34212 unsigned HOST_WIDE_INT ei;
34213
34214 if (!CONST_INT_P (er))
34215 return 0;
34216 ei = INTVAL (er);
34217 if (ei >= nelt)
34218 return 0;
34219 ipar[i] = ei;
34220 }
34221
34222 switch (mode)
34223 {
34224 case V4DFmode:
34225 /* In the 256-bit DFmode case, we can only move elements within
34226 a 128-bit lane. */
34227 for (i = 0; i < 2; ++i)
34228 {
34229 if (ipar[i] >= 2)
34230 return 0;
34231 mask |= ipar[i] << i;
34232 }
34233 for (i = 2; i < 4; ++i)
34234 {
34235 if (ipar[i] < 2)
34236 return 0;
34237 mask |= (ipar[i] - 2) << i;
34238 }
34239 break;
34240
34241 case V8SFmode:
34242 /* In the 256-bit SFmode case, we have full freedom of movement
34243 within the low 128-bit lane, but the high 128-bit lane must
34244 mirror the exact same pattern. */
34245 for (i = 0; i < 4; ++i)
34246 if (ipar[i] + 4 != ipar[i + 4])
34247 return 0;
34248 nelt = 4;
34249 /* FALLTHRU */
34250
34251 case V2DFmode:
34252 case V4SFmode:
34253 /* In the 128-bit case, we've full freedom in the placement of
34254 the elements from the source operand. */
34255 for (i = 0; i < nelt; ++i)
34256 mask |= ipar[i] << (i * (nelt / 2));
34257 break;
34258
34259 default:
34260 gcc_unreachable ();
34261 }
34262
34263 /* Make sure success has a non-zero value by adding one. */
34264 return mask + 1;
34265 }
34266
34267 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
34268 the expansion functions to turn the parallel back into a mask.
34269 The return value is 0 for no match and the imm8+1 for a match. */
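/* For example (V4DFmode), the parallel (2 3 4 5) selects the high lane of
   the first operand and the low lane of the second, reconstructing imm8
   0x21; this function then returns 0x22.  */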
34270
34271 int
34272 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
34273 {
34274 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
34275 unsigned mask = 0;
34276 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
34277
34278 if (XVECLEN (par, 0) != (int) nelt)
34279 return 0;
34280
34281 /* Validate that all of the elements are constants, and not totally
34282 out of range. Copy the data into an integral array to make the
34283 subsequent checks easier. */
34284 for (i = 0; i < nelt; ++i)
34285 {
34286 rtx er = XVECEXP (par, 0, i);
34287 unsigned HOST_WIDE_INT ei;
34288
34289 if (!CONST_INT_P (er))
34290 return 0;
34291 ei = INTVAL (er);
34292 if (ei >= 2 * nelt)
34293 return 0;
34294 ipar[i] = ei;
34295 }
34296
34297   /* Validate that each half of the permute consists of consecutive elements.  */
34298 for (i = 0; i < nelt2 - 1; ++i)
34299 if (ipar[i] + 1 != ipar[i + 1])
34300 return 0;
34301 for (i = nelt2; i < nelt - 1; ++i)
34302 if (ipar[i] + 1 != ipar[i + 1])
34303 return 0;
34304
34305 /* Reconstruct the mask. */
34306 for (i = 0; i < 2; ++i)
34307 {
34308 unsigned e = ipar[i * nelt2];
34309 if (e % nelt2)
34310 return 0;
34311 e /= nelt2;
34312 mask |= e << (i * 4);
34313 }
34314
34315 /* Make sure success has a non-zero value by adding one. */
34316 return mask + 1;
34317 }
34318 \f
34319 /* Store OPERAND to the memory after reload is completed. This means
34320 that we can't easily use assign_stack_local. */
34321 rtx
34322 ix86_force_to_memory (enum machine_mode mode, rtx operand)
34323 {
34324 rtx result;
34325
34326 gcc_assert (reload_completed);
34327 if (ix86_using_red_zone ())
34328 {
34329 result = gen_rtx_MEM (mode,
34330 gen_rtx_PLUS (Pmode,
34331 stack_pointer_rtx,
34332 GEN_INT (-RED_ZONE_SIZE)));
34333 emit_move_insn (result, operand);
34334 }
34335 else if (TARGET_64BIT)
34336 {
34337 switch (mode)
34338 {
34339 case HImode:
34340 case SImode:
34341 operand = gen_lowpart (DImode, operand);
34342 /* FALLTHRU */
34343 case DImode:
34344 emit_insn (
34345 gen_rtx_SET (VOIDmode,
34346 gen_rtx_MEM (DImode,
34347 gen_rtx_PRE_DEC (DImode,
34348 stack_pointer_rtx)),
34349 operand));
34350 break;
34351 default:
34352 gcc_unreachable ();
34353 }
34354 result = gen_rtx_MEM (mode, stack_pointer_rtx);
34355 }
34356 else
34357 {
34358 switch (mode)
34359 {
34360 case DImode:
34361 {
34362 rtx operands[2];
34363 split_double_mode (mode, &operand, 1, operands, operands + 1);
34364 emit_insn (
34365 gen_rtx_SET (VOIDmode,
34366 gen_rtx_MEM (SImode,
34367 gen_rtx_PRE_DEC (Pmode,
34368 stack_pointer_rtx)),
34369 operands[1]));
34370 emit_insn (
34371 gen_rtx_SET (VOIDmode,
34372 gen_rtx_MEM (SImode,
34373 gen_rtx_PRE_DEC (Pmode,
34374 stack_pointer_rtx)),
34375 operands[0]));
34376 }
34377 break;
34378 case HImode:
34379 /* Store HImodes as SImodes. */
34380 operand = gen_lowpart (SImode, operand);
34381 /* FALLTHRU */
34382 case SImode:
34383 emit_insn (
34384 gen_rtx_SET (VOIDmode,
34385 gen_rtx_MEM (GET_MODE (operand),
34386 gen_rtx_PRE_DEC (SImode,
34387 stack_pointer_rtx)),
34388 operand));
34389 break;
34390 default:
34391 gcc_unreachable ();
34392 }
34393 result = gen_rtx_MEM (mode, stack_pointer_rtx);
34394 }
34395 return result;
34396 }
34397
34398 /* Free the operand from memory.  */
34399 void
34400 ix86_free_from_memory (enum machine_mode mode)
34401 {
34402 if (!ix86_using_red_zone ())
34403 {
34404 int size;
34405
34406 if (mode == DImode || TARGET_64BIT)
34407 size = 8;
34408 else
34409 size = 4;
34410       /* Use LEA to deallocate stack space.  In peephole2 it will be converted
34411 	 to a pop or add instruction if registers are available.  */
34412 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
34413 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
34414 GEN_INT (size))));
34415 }
34416 }
34417
34418 /* Return a register priority for hard reg HARD_REGNO.  */
34419 static int
34420 ix86_register_priority (int hard_regno)
34421 {
34422   /* ebp and r13 as the base always want a displacement, and r12 as the
34423      base always wants an index, so discourage their use in an
34424      address.  */
34425 if (hard_regno == R12_REG || hard_regno == R13_REG)
34426 return 0;
34427 if (hard_regno == BP_REG)
34428 return 1;
34429 /* New x86-64 int registers result in bigger code size. Discourage
34430 them. */
34431 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
34432 return 2;
34433 /* New x86-64 SSE registers result in bigger code size. Discourage
34434 them. */
34435 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
34436 return 2;
34437   /* Use of the AX register results in smaller code.  Prefer it.  */
34438 if (hard_regno == 0)
34439 return 4;
34440 return 3;
34441 }
34442
34443 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
34444
34445 Put float CONST_DOUBLE in the constant pool instead of fp regs.
34446 QImode must go into class Q_REGS.
34447 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
34448 movdf to do mem-to-mem moves through integer regs. */
34449
34450 static reg_class_t
34451 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
34452 {
34453 enum machine_mode mode = GET_MODE (x);
34454
34455 /* We're only allowed to return a subclass of CLASS. Many of the
34456 following checks fail for NO_REGS, so eliminate that early. */
34457 if (regclass == NO_REGS)
34458 return NO_REGS;
34459
34460 /* All classes can load zeros. */
34461 if (x == CONST0_RTX (mode))
34462 return regclass;
34463
34464 /* Force constants into memory if we are loading a (nonzero) constant into
34465 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
34466 instructions to load from a constant. */
34467 if (CONSTANT_P (x)
34468 && (MAYBE_MMX_CLASS_P (regclass)
34469 || MAYBE_SSE_CLASS_P (regclass)
34470 || MAYBE_MASK_CLASS_P (regclass)))
34471 return NO_REGS;
34472
34473 /* Prefer SSE regs only, if we can use them for math. */
34474 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
34475 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
34476
34477 /* Floating-point constants need more complex checks. */
34478 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
34479 {
34480 /* General regs can load everything. */
34481 if (reg_class_subset_p (regclass, GENERAL_REGS))
34482 return regclass;
34483
34484 /* Floats can load 0 and 1 plus some others. Note that we eliminated
34485 zero above. We only want to wind up preferring 80387 registers if
34486 we plan on doing computation with them. */
34487 if (TARGET_80387
34488 && standard_80387_constant_p (x) > 0)
34489 {
34490 /* Limit class to non-sse. */
34491 if (regclass == FLOAT_SSE_REGS)
34492 return FLOAT_REGS;
34493 if (regclass == FP_TOP_SSE_REGS)
34494 return FP_TOP_REG;
34495 if (regclass == FP_SECOND_SSE_REGS)
34496 return FP_SECOND_REG;
34497 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
34498 return regclass;
34499 }
34500
34501 return NO_REGS;
34502 }
34503
34504   /* Generally when we see PLUS here, it's the function invariant
34505      (plus soft-fp const_int), which can only be computed into general
34506      regs.  */
34507 if (GET_CODE (x) == PLUS)
34508 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
34509
34510 /* QImode constants are easy to load, but non-constant QImode data
34511 must go into Q_REGS. */
34512 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
34513 {
34514 if (reg_class_subset_p (regclass, Q_REGS))
34515 return regclass;
34516 if (reg_class_subset_p (Q_REGS, regclass))
34517 return Q_REGS;
34518 return NO_REGS;
34519 }
34520
34521 return regclass;
34522 }
34523
34524 /* Discourage putting floating-point values in SSE registers unless
34525 SSE math is being used, and likewise for the 387 registers. */
34526 static reg_class_t
34527 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
34528 {
34529 enum machine_mode mode = GET_MODE (x);
34530
34531 /* Restrict the output reload class to the register bank that we are doing
34532 math on. If we would like not to return a subset of CLASS, reject this
34533 alternative: if reload cannot do this, it will still use its choice. */
34535 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
34536 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
34537
34538 if (X87_FLOAT_MODE_P (mode))
34539 {
34540 if (regclass == FP_TOP_SSE_REGS)
34541 return FP_TOP_REG;
34542 else if (regclass == FP_SECOND_SSE_REGS)
34543 return FP_SECOND_REG;
34544 else
34545 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
34546 }
34547
34548 return regclass;
34549 }
34550
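/* Implement TARGET_SECONDARY_RELOAD.  Return the register class required
   for a secondary reload of a value of mode MODE in RCLASS, or NO_REGS;
   when a special reload pattern handles the case instead, record its icode
   and extra cost in SRI.  */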
34551 static reg_class_t
34552 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
34553 enum machine_mode mode, secondary_reload_info *sri)
34554 {
34555 /* Double-word spills from general registers to non-offsettable memory
34556 references (zero-extended addresses) require special handling. */
34557 if (TARGET_64BIT
34558 && MEM_P (x)
34559 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
34560 && INTEGER_CLASS_P (rclass)
34561 && !offsettable_memref_p (x))
34562 {
34563 sri->icode = (in_p
34564 ? CODE_FOR_reload_noff_load
34565 : CODE_FOR_reload_noff_store);
34566 /* Add the cost of moving address to a temporary. */
34567 sri->extra_cost = 1;
34568
34569 return NO_REGS;
34570 }
34571
34572   /* QImode spills from non-QI registers require an
34573      intermediate register on 32-bit targets.  */
34574 if (mode == QImode
34575 && (MAYBE_MASK_CLASS_P (rclass)
34576 || (!TARGET_64BIT && !in_p
34577 && INTEGER_CLASS_P (rclass)
34578 && MAYBE_NON_Q_CLASS_P (rclass))))
34579 {
34580 int regno;
34581
34582 if (REG_P (x))
34583 regno = REGNO (x);
34584 else
34585 regno = -1;
34586
34587 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
34588 regno = true_regnum (x);
34589
34590 /* Return Q_REGS if the operand is in memory. */
34591 if (regno == -1)
34592 return Q_REGS;
34593 }
34594
34595   /* This condition handles the corner case where an expression involving
34596 pointers gets vectorized. We're trying to use the address of a
34597 stack slot as a vector initializer.
34598
34599 (set (reg:V2DI 74 [ vect_cst_.2 ])
34600 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
34601
34602 Eventually frame gets turned into sp+offset like this:
34603
34604 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34605 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34606 (const_int 392 [0x188]))))
34607
34608 That later gets turned into:
34609
34610 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34611 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34612 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
34613
34614 We'll have the following reload recorded:
34615
34616 Reload 0: reload_in (DI) =
34617 (plus:DI (reg/f:DI 7 sp)
34618 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
34619 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34620 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
34621 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
34622 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34623 reload_reg_rtx: (reg:V2DI 22 xmm1)
34624
34625      This isn't going to work since SSE instructions can't handle scalar
34626      additions.  Returning GENERAL_REGS forces the addition into an integer
34627      register, and reload can then handle the subsequent reloads without problems.  */
34628
34629 if (in_p && GET_CODE (x) == PLUS
34630 && SSE_CLASS_P (rclass)
34631 && SCALAR_INT_MODE_P (mode))
34632 return GENERAL_REGS;
34633
34634 return NO_REGS;
34635 }
34636
34637 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
34638
34639 static bool
34640 ix86_class_likely_spilled_p (reg_class_t rclass)
34641 {
34642 switch (rclass)
34643 {
34644 case AREG:
34645 case DREG:
34646 case CREG:
34647 case BREG:
34648 case AD_REGS:
34649 case SIREG:
34650 case DIREG:
34651 case SSE_FIRST_REG:
34652 case FP_TOP_REG:
34653 case FP_SECOND_REG:
34654 case BND_REGS:
34655 return true;
34656
34657 default:
34658 break;
34659 }
34660
34661 return false;
34662 }
34663
34664 /* If we are copying between general and FP registers, we need a memory
34665 location. The same is true for SSE and MMX registers.
34666
34667    To keep register_move_cost fast, an inline variant is provided.
34668
34669    The macro can't work reliably when one of the CLASSES is a class containing
34670    registers from multiple units (SSE, MMX, integer).  We avoid this by never
34671    combining those units in a single alternative in the machine description.
34672    Ensure that this constraint holds to avoid unexpected surprises.
34673
34674 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
34675 enforce these sanity checks. */
34676
34677 static inline bool
34678 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34679 enum machine_mode mode, int strict)
34680 {
34681 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
34682 return false;
34683 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
34684 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
34685 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
34686 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
34687 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
34688 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
34689 {
34690 gcc_assert (!strict || lra_in_progress);
34691 return true;
34692 }
34693
34694 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
34695 return true;
34696
34697   /* ??? This is a lie.  We do have moves between mmx/general, and between
34698      mmx/sse2.  But by saying we need secondary memory we discourage the
34699      register allocator from using the mmx registers unless needed.  */
34700 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
34701 return true;
34702
34703 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34704 {
34705 /* SSE1 doesn't have any direct moves from other classes. */
34706 if (!TARGET_SSE2)
34707 return true;
34708
34709 /* If the target says that inter-unit moves are more expensive
34710 than moving through memory, then don't generate them. */
34711 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
34712 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
34713 return true;
34714
34715 /* Between SSE and general, we have moves no larger than word size. */
34716 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34717 return true;
34718 }
34719
34720 return false;
34721 }
34722
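/* Out-of-line wrapper around inline_secondary_memory_needed, for callers
   outside this file.  */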
34723 bool
34724 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34725 enum machine_mode mode, int strict)
34726 {
34727 return inline_secondary_memory_needed (class1, class2, mode, strict);
34728 }
34729
34730 /* Implement the TARGET_CLASS_MAX_NREGS hook.
34731
34732 On the 80386, this is the size of MODE in words,
34733 except in the FP regs, where a single reg is always enough. */
34734
34735 static unsigned char
34736 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
34737 {
34738 if (MAYBE_INTEGER_CLASS_P (rclass))
34739 {
34740 if (mode == XFmode)
34741 return (TARGET_64BIT ? 2 : 3);
34742 else if (mode == XCmode)
34743 return (TARGET_64BIT ? 4 : 6);
34744 else
34745 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
34746 }
34747 else
34748 {
34749 if (COMPLEX_MODE_P (mode))
34750 return 2;
34751 else
34752 return 1;
34753 }
34754 }
34755
34756 /* Return true if the registers in REGCLASS cannot represent a change from
34757    mode FROM to mode TO.  */
34758
34759 bool
34760 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
34761 enum reg_class regclass)
34762 {
34763 if (from == to)
34764 return false;
34765
34766 /* x87 registers can't do subreg at all, as all values are reformatted
34767 to extended precision. */
34768 if (MAYBE_FLOAT_CLASS_P (regclass))
34769 return true;
34770
34771 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
34772 {
34773 /* Vector registers do not support QI or HImode loads. If we don't
34774 disallow a change to these modes, reload will assume it's ok to
34775 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
34776 the vec_dupv4hi pattern. */
34777 if (GET_MODE_SIZE (from) < 4)
34778 return true;
34779
34780 /* Vector registers do not support subreg with nonzero offsets, which
34781 are otherwise valid for integer registers. Since we can't see
34782 whether we have a nonzero offset from here, prohibit all
34783 nonparadoxical subregs changing size. */
34784 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
34785 return true;
34786 }
34787
34788 return false;
34789 }
34790
34791 /* Return the cost of moving data of mode M between a
34792 register and memory. A value of 2 is the default; this cost is
34793 relative to those in `REGISTER_MOVE_COST'.
34794
34795    This function is used extensively by register_move_cost, which is used to
34796    build tables at startup, so keep it inline in that case.
34797    When IN is 2, return the maximum of the in and out move costs.
34798
34799    If moving between registers and memory is more expensive than
34800    between two registers, you should define this macro to express the
34801    relative cost.
34802
34803    Also model the increased cost of moving QImode values through registers
34804    outside the Q_REGS class.
34805  */
34806 static inline int
34807 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
34808 int in)
34809 {
34810 int cost;
34811 if (FLOAT_CLASS_P (regclass))
34812 {
34813 int index;
34814 switch (mode)
34815 {
34816 case SFmode:
34817 index = 0;
34818 break;
34819 case DFmode:
34820 index = 1;
34821 break;
34822 case XFmode:
34823 index = 2;
34824 break;
34825 default:
34826 return 100;
34827 }
34828 if (in == 2)
34829 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
34830 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
34831 }
34832 if (SSE_CLASS_P (regclass))
34833 {
34834 int index;
34835 switch (GET_MODE_SIZE (mode))
34836 {
34837 case 4:
34838 index = 0;
34839 break;
34840 case 8:
34841 index = 1;
34842 break;
34843 case 16:
34844 index = 2;
34845 break;
34846 default:
34847 return 100;
34848 }
34849 if (in == 2)
34850 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
34851 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
34852 }
34853 if (MMX_CLASS_P (regclass))
34854 {
34855 int index;
34856 switch (GET_MODE_SIZE (mode))
34857 {
34858 case 4:
34859 index = 0;
34860 break;
34861 case 8:
34862 index = 1;
34863 break;
34864 default:
34865 return 100;
34866 }
34867 	  if (in == 2)
34868 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
34869 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
34870 }
34871 switch (GET_MODE_SIZE (mode))
34872 {
34873 case 1:
34874 if (Q_CLASS_P (regclass) || TARGET_64BIT)
34875 {
34876 if (!in)
34877 return ix86_cost->int_store[0];
34878 if (TARGET_PARTIAL_REG_DEPENDENCY
34879 && optimize_function_for_speed_p (cfun))
34880 cost = ix86_cost->movzbl_load;
34881 else
34882 cost = ix86_cost->int_load[0];
34883 if (in == 2)
34884 return MAX (cost, ix86_cost->int_store[0]);
34885 return cost;
34886 }
34887 else
34888 {
34889 if (in == 2)
34890 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
34891 if (in)
34892 return ix86_cost->movzbl_load;
34893 else
34894 return ix86_cost->int_store[0] + 4;
34895 }
34896 break;
34897 case 2:
34898 if (in == 2)
34899 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
34900 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
34901 default:
34902 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
34903 if (mode == TFmode)
34904 mode = XFmode;
34905 if (in == 2)
34906 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
34907 else if (in)
34908 cost = ix86_cost->int_load[2];
34909 else
34910 cost = ix86_cost->int_store[2];
34911 return (cost * (((int) GET_MODE_SIZE (mode)
34912 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
34913 }
34914 }
34915
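/* Implement TARGET_MEMORY_MOVE_COST.  A thin out-of-line wrapper around
   inline_memory_move_cost; IN is mapped to 1 for loads and 0 for stores.  */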
34916 static int
34917 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
34918 bool in)
34919 {
34920 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
34921 }
34922
34923
34924 /* Return the cost of moving data from a register in class CLASS1 to
34925 one in class CLASS2.
34926
34927 It is not required that the cost always equal 2 when FROM is the same as TO;
34928 on some machines it is expensive to move between registers if they are not
34929 general registers. */
34930
34931 static int
34932 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
34933 reg_class_t class2_i)
34934 {
34935 enum reg_class class1 = (enum reg_class) class1_i;
34936 enum reg_class class2 = (enum reg_class) class2_i;
34937
34938   /* In case we require secondary memory, compute the cost of the store followed
34939      by the load.  In order to avoid bad register allocation choices, this needs
34940      to be *at least* as high as the symmetric MEMORY_MOVE_COST.  */
34941
34942 if (inline_secondary_memory_needed (class1, class2, mode, 0))
34943 {
34944 int cost = 1;
34945
34946 cost += inline_memory_move_cost (mode, class1, 2);
34947 cost += inline_memory_move_cost (mode, class2, 2);
34948
34949       /* When copying from a general purpose register we may emit multiple
34950          stores followed by a single load, causing a memory size mismatch stall.
34951          Count this as an arbitrarily high cost of 20.  */
34952 if (targetm.class_max_nregs (class1, mode)
34953 > targetm.class_max_nregs (class2, mode))
34954 cost += 20;
34955
34956 /* In the case of FP/MMX moves, the registers actually overlap, and we
34957 have to switch modes in order to treat them differently. */
34958 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
34959 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
34960 cost += 20;
34961
34962 return cost;
34963 }
34964
34965 /* Moves between SSE/MMX and integer unit are expensive. */
34966 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
34967 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34968
34969     /* ??? By keeping the returned value relatively high, we limit the number
34970        of moves between integer and MMX/SSE registers for all targets.
34971        Additionally, a high value prevents a problem with x86_modes_tieable_p (),
34972        where integer modes in MMX/SSE registers are not tieable
34973        because of missing QImode and HImode moves to, from or between
34974        MMX/SSE registers.  */
34975 return MAX (8, ix86_cost->mmxsse_to_integer);
34976
34977 if (MAYBE_FLOAT_CLASS_P (class1))
34978 return ix86_cost->fp_move;
34979 if (MAYBE_SSE_CLASS_P (class1))
34980 return ix86_cost->sse_move;
34981 if (MAYBE_MMX_CLASS_P (class1))
34982 return ix86_cost->mmx_move;
34983 return 2;
34984 }
34985
34986 /* Return TRUE if hard register REGNO can hold a value of machine-mode
34987 MODE. */
34988
34989 bool
34990 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
34991 {
34992   /* The flags register, and only the flags register, can hold CCmode values.  */
34993 if (CC_REGNO_P (regno))
34994 return GET_MODE_CLASS (mode) == MODE_CC;
34995 if (GET_MODE_CLASS (mode) == MODE_CC
34996 || GET_MODE_CLASS (mode) == MODE_RANDOM
34997 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
34998 return false;
34999 if (STACK_REGNO_P (regno))
35000 return VALID_FP_MODE_P (mode);
35001 if (MASK_REGNO_P (regno))
35002 return VALID_MASK_REG_MODE (mode);
35003 if (BND_REGNO_P (regno))
35004 return VALID_BND_REG_MODE (mode);
35005 if (SSE_REGNO_P (regno))
35006 {
35007 /* We implement the move patterns for all vector modes into and
35008 out of SSE registers, even when no operation instructions
35009 are available. */
35010
35011 /* For AVX-512 we allow, regardless of regno:
35012 - XI mode
35013 	 - any 512-bit wide vector mode
35014 - any scalar mode. */
35015 if (TARGET_AVX512F
35016 && (mode == XImode
35017 || VALID_AVX512F_REG_MODE (mode)
35018 || VALID_AVX512F_SCALAR_MODE (mode)))
35019 return true;
35020
35021 /* xmm16-xmm31 are only available for AVX-512. */
35022 if (EXT_REX_SSE_REGNO_P (regno))
35023 return false;
35024
35025 /* OImode move is available only when AVX is enabled. */
35026 return ((TARGET_AVX && mode == OImode)
35027 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
35028 || VALID_SSE_REG_MODE (mode)
35029 || VALID_SSE2_REG_MODE (mode)
35030 || VALID_MMX_REG_MODE (mode)
35031 || VALID_MMX_REG_MODE_3DNOW (mode));
35032 }
35033 if (MMX_REGNO_P (regno))
35034 {
35035 /* We implement the move patterns for 3DNOW modes even in MMX mode,
35036 so if the register is available at all, then we can move data of
35037 the given mode into or out of it. */
35038 return (VALID_MMX_REG_MODE (mode)
35039 || VALID_MMX_REG_MODE_3DNOW (mode));
35040 }
35041
35042 if (mode == QImode)
35043 {
35044       /* Take care with QImode values - they can live in non-QI regs,
35045 	 but then they do cause partial register stalls.  */
35046 if (ANY_QI_REGNO_P (regno))
35047 return true;
35048 if (!TARGET_PARTIAL_REG_STALL)
35049 return true;
35050 /* LRA checks if the hard register is OK for the given mode.
35051 QImode values can live in non-QI regs, so we allow all
35052 registers here. */
35053 if (lra_in_progress)
35054 return true;
35055 return !can_create_pseudo_p ();
35056 }
35057   /* We handle both integers and floats in the general purpose registers.  */
35058 else if (VALID_INT_MODE_P (mode))
35059 return true;
35060 else if (VALID_FP_MODE_P (mode))
35061 return true;
35062 else if (VALID_DFP_MODE_P (mode))
35063 return true;
35064 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
35065 on to use that value in smaller contexts, this can easily force a
35066 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
35067 supporting DImode, allow it. */
35068 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
35069 return true;
35070
35071 return false;
35072 }
35073
35074 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
35075 tieable integer mode. */
35076
35077 static bool
35078 ix86_tieable_integer_mode_p (enum machine_mode mode)
35079 {
35080 switch (mode)
35081 {
35082 case HImode:
35083 case SImode:
35084 return true;
35085
35086 case QImode:
35087 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
35088
35089 case DImode:
35090 return TARGET_64BIT;
35091
35092 default:
35093 return false;
35094 }
35095 }
35096
35097 /* Return true if MODE1 is accessible in a register that can hold MODE2
35098 without copying. That is, all register classes that can hold MODE2
35099 can also hold MODE1. */
35100
35101 bool
35102 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
35103 {
35104 if (mode1 == mode2)
35105 return true;
35106
35107 if (ix86_tieable_integer_mode_p (mode1)
35108 && ix86_tieable_integer_mode_p (mode2))
35109 return true;
35110
35111 /* MODE2 being XFmode implies fp stack or general regs, which means we
35112 can tie any smaller floating point modes to it. Note that we do not
35113 tie this with TFmode. */
35114 if (mode2 == XFmode)
35115 return mode1 == SFmode || mode1 == DFmode;
35116
35117 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
35118 that we can tie it with SFmode. */
35119 if (mode2 == DFmode)
35120 return mode1 == SFmode;
35121
35122 /* If MODE2 is only appropriate for an SSE register, then tie with
35123 any other mode acceptable to SSE registers. */
35124 if (GET_MODE_SIZE (mode2) == 32
35125 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
35126 return (GET_MODE_SIZE (mode1) == 32
35127 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
35128 if (GET_MODE_SIZE (mode2) == 16
35129 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
35130 return (GET_MODE_SIZE (mode1) == 16
35131 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
35132
35133 /* If MODE2 is appropriate for an MMX register, then tie
35134 with any other mode acceptable to MMX registers. */
35135 if (GET_MODE_SIZE (mode2) == 8
35136 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
35137 return (GET_MODE_SIZE (mode1) == 8
35138 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
35139
35140 return false;
35141 }
35142
35143 /* Return the cost of moving between two registers of mode MODE. */
35144
35145 static int
35146 ix86_set_reg_reg_cost (enum machine_mode mode)
35147 {
35148 unsigned int units = UNITS_PER_WORD;
35149
35150 switch (GET_MODE_CLASS (mode))
35151 {
35152 default:
35153 break;
35154
35155 case MODE_CC:
35156 units = GET_MODE_SIZE (CCmode);
35157 break;
35158
35159 case MODE_FLOAT:
35160 if ((TARGET_SSE && mode == TFmode)
35161 || (TARGET_80387 && mode == XFmode)
35162 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
35163 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
35164 units = GET_MODE_SIZE (mode);
35165 break;
35166
35167 case MODE_COMPLEX_FLOAT:
35168 if ((TARGET_SSE && mode == TCmode)
35169 || (TARGET_80387 && mode == XCmode)
35170 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
35171 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
35172 units = GET_MODE_SIZE (mode);
35173 break;
35174
35175 case MODE_VECTOR_INT:
35176 case MODE_VECTOR_FLOAT:
35177 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
35178 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
35179 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
35180 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
35181 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
35182 units = GET_MODE_SIZE (mode);
35183 }
35184
35185 /* Return the cost of moving between two registers of mode MODE,
35186 assuming that the move will be in pieces of at most UNITS bytes. */
35187 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
35188 }
35189
35190 /* Compute a (partial) cost for rtx X. Return true if the complete
35191 cost has been computed, and false if subexpressions should be
35192 scanned. In either case, *TOTAL contains the cost result. */
35193
35194 static bool
35195 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
35196 bool speed)
35197 {
35198 enum rtx_code code = (enum rtx_code) code_i;
35199 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
35200 enum machine_mode mode = GET_MODE (x);
35201 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
35202
35203 switch (code)
35204 {
35205 case SET:
35206 if (register_operand (SET_DEST (x), VOIDmode)
35207 && reg_or_0_operand (SET_SRC (x), VOIDmode))
35208 {
35209 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
35210 return true;
35211 }
35212 return false;
35213
35214 case CONST_INT:
35215 case CONST:
35216 case LABEL_REF:
35217 case SYMBOL_REF:
35218 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
35219 *total = 3;
35220 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
35221 *total = 2;
35222 else if (flag_pic && SYMBOLIC_CONST (x)
35223 && (!TARGET_64BIT
35224 	       || (GET_CODE (x) != LABEL_REF
35225 && (GET_CODE (x) != SYMBOL_REF
35226 || !SYMBOL_REF_LOCAL_P (x)))))
35227 *total = 1;
35228 else
35229 *total = 0;
35230 return true;
35231
35232 case CONST_DOUBLE:
35233 if (mode == VOIDmode)
35234 {
35235 *total = 0;
35236 return true;
35237 }
35238 switch (standard_80387_constant_p (x))
35239 {
35240 case 1: /* 0.0 */
35241 *total = 1;
35242 return true;
35243 default: /* Other constants */
35244 *total = 2;
35245 return true;
35246 case 0:
35247 case -1:
35248 break;
35249 }
35250 if (SSE_FLOAT_MODE_P (mode))
35251 {
35252 case CONST_VECTOR:
35253 switch (standard_sse_constant_p (x))
35254 {
35255 case 0:
35256 break;
35257 case 1: /* 0: xor eliminates false dependency */
35258 *total = 0;
35259 return true;
35260 default: /* -1: cmp contains false dependency */
35261 *total = 1;
35262 return true;
35263 }
35264 }
35265 /* Fall back to (MEM (SYMBOL_REF)), since that's where
35266 it'll probably end up. Add a penalty for size. */
35267 *total = (COSTS_N_INSNS (1)
35268 + (flag_pic != 0 && !TARGET_64BIT)
35269 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
35270 return true;
35271
35272 case ZERO_EXTEND:
35273       /* The zero extension is often completely free on x86_64, so make
35274 	 it as cheap as possible.  */
35275 if (TARGET_64BIT && mode == DImode
35276 && GET_MODE (XEXP (x, 0)) == SImode)
35277 *total = 1;
35278 else if (TARGET_ZERO_EXTEND_WITH_AND)
35279 *total = cost->add;
35280 else
35281 *total = cost->movzx;
35282 return false;
35283
35284 case SIGN_EXTEND:
35285 *total = cost->movsx;
35286 return false;
35287
35288 case ASHIFT:
35289 if (SCALAR_INT_MODE_P (mode)
35290 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
35291 && CONST_INT_P (XEXP (x, 1)))
35292 {
35293 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
35294 if (value == 1)
35295 {
35296 *total = cost->add;
35297 return false;
35298 }
35299 if ((value == 2 || value == 3)
35300 && cost->lea <= cost->shift_const)
35301 {
35302 *total = cost->lea;
35303 return false;
35304 }
35305 }
35306 /* FALLTHRU */
35307
35308 case ROTATE:
35309 case ASHIFTRT:
35310 case LSHIFTRT:
35311 case ROTATERT:
35312 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35313 {
35314 /* ??? Should be SSE vector operation cost. */
35315 /* At least for published AMD latencies, this really is the same
35316 as the latency for a simple fpu operation like fabs. */
35317 /* V*QImode is emulated with 1-11 insns. */
35318 if (mode == V16QImode || mode == V32QImode)
35319 {
35320 int count = 11;
35321 if (TARGET_XOP && mode == V16QImode)
35322 {
35323 		  /* For XOP we use vpshab, which requires a broadcast of the
35324 		     value to the variable shift insn.  For constants this
35325 		     means a V16QImode constant in memory; even when we can perform
35326 		     the shift with one insn, set the cost to prefer paddb.  */
35327 if (CONSTANT_P (XEXP (x, 1)))
35328 {
35329 *total = (cost->fabs
35330 + rtx_cost (XEXP (x, 0), code, 0, speed)
35331 + (speed ? 2 : COSTS_N_BYTES (16)));
35332 return true;
35333 }
35334 count = 3;
35335 }
35336 else if (TARGET_SSSE3)
35337 count = 7;
35338 *total = cost->fabs * count;
35339 }
35340 else
35341 *total = cost->fabs;
35342 }
35343 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35344 {
35345 if (CONST_INT_P (XEXP (x, 1)))
35346 {
35347 if (INTVAL (XEXP (x, 1)) > 32)
35348 *total = cost->shift_const + COSTS_N_INSNS (2);
35349 else
35350 *total = cost->shift_const * 2;
35351 }
35352 else
35353 {
35354 if (GET_CODE (XEXP (x, 1)) == AND)
35355 *total = cost->shift_var * 2;
35356 else
35357 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
35358 }
35359 }
35360 else
35361 {
35362 if (CONST_INT_P (XEXP (x, 1)))
35363 *total = cost->shift_const;
35364 else if (GET_CODE (XEXP (x, 1)) == SUBREG
35365 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
35366 {
35367 /* Return the cost after shift-and truncation. */
35368 *total = cost->shift_var;
35369 return true;
35370 }
35371 else
35372 *total = cost->shift_var;
35373 }
35374 return false;
35375
35376 case FMA:
35377 {
35378 rtx sub;
35379
35380 gcc_assert (FLOAT_MODE_P (mode));
35381 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
35382
35383 /* ??? SSE scalar/vector cost should be used here. */
35384 /* ??? Bald assumption that fma has the same cost as fmul. */
35385 *total = cost->fmul;
35386 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
35387
35388 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
35389 sub = XEXP (x, 0);
35390 if (GET_CODE (sub) == NEG)
35391 sub = XEXP (sub, 0);
35392 *total += rtx_cost (sub, FMA, 0, speed);
35393
35394 sub = XEXP (x, 2);
35395 if (GET_CODE (sub) == NEG)
35396 sub = XEXP (sub, 0);
35397 *total += rtx_cost (sub, FMA, 2, speed);
35398 return true;
35399 }
35400
35401 case MULT:
35402 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35403 {
35404 /* ??? SSE scalar cost should be used here. */
35405 *total = cost->fmul;
35406 return false;
35407 }
35408 else if (X87_FLOAT_MODE_P (mode))
35409 {
35410 *total = cost->fmul;
35411 return false;
35412 }
35413 else if (FLOAT_MODE_P (mode))
35414 {
35415 /* ??? SSE vector cost should be used here. */
35416 *total = cost->fmul;
35417 return false;
35418 }
35419 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35420 {
35421 /* V*QImode is emulated with 7-13 insns. */
35422 if (mode == V16QImode || mode == V32QImode)
35423 {
35424 int extra = 11;
35425 if (TARGET_XOP && mode == V16QImode)
35426 extra = 5;
35427 else if (TARGET_SSSE3)
35428 extra = 6;
35429 *total = cost->fmul * 2 + cost->fabs * extra;
35430 }
35431 /* V*DImode is emulated with 5-8 insns. */
35432 else if (mode == V2DImode || mode == V4DImode)
35433 {
35434 if (TARGET_XOP && mode == V2DImode)
35435 *total = cost->fmul * 2 + cost->fabs * 3;
35436 else
35437 *total = cost->fmul * 3 + cost->fabs * 5;
35438 }
35439 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
35440 insns, including two PMULUDQ. */
35441 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
35442 *total = cost->fmul * 2 + cost->fabs * 5;
35443 else
35444 *total = cost->fmul;
35445 return false;
35446 }
35447 else
35448 {
35449 rtx op0 = XEXP (x, 0);
35450 rtx op1 = XEXP (x, 1);
35451 int nbits;
35452 if (CONST_INT_P (XEXP (x, 1)))
35453 {
35454 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
35455 for (nbits = 0; value != 0; value &= value - 1)
35456 nbits++;
35457 }
35458 else
35459 /* This is arbitrary. */
35460 nbits = 7;
35461
35462 /* Compute costs correctly for widening multiplication. */
35463 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
35464 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
35465 == GET_MODE_SIZE (mode))
35466 {
35467 int is_mulwiden = 0;
35468 enum machine_mode inner_mode = GET_MODE (op0);
35469
35470 if (GET_CODE (op0) == GET_CODE (op1))
35471 is_mulwiden = 1, op1 = XEXP (op1, 0);
35472 else if (CONST_INT_P (op1))
35473 {
35474 if (GET_CODE (op0) == SIGN_EXTEND)
35475 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
35476 == INTVAL (op1);
35477 else
35478 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
35479 }
35480
35481 if (is_mulwiden)
35482 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
35483 }
35484
35485 *total = (cost->mult_init[MODE_INDEX (mode)]
35486 + nbits * cost->mult_bit
35487 + rtx_cost (op0, outer_code, opno, speed)
35488 + rtx_cost (op1, outer_code, opno, speed));
35489
35490 return true;
35491 }
35492
35493 case DIV:
35494 case UDIV:
35495 case MOD:
35496 case UMOD:
35497 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35498 /* ??? SSE cost should be used here. */
35499 *total = cost->fdiv;
35500 else if (X87_FLOAT_MODE_P (mode))
35501 *total = cost->fdiv;
35502 else if (FLOAT_MODE_P (mode))
35503 /* ??? SSE vector cost should be used here. */
35504 *total = cost->fdiv;
35505 else
35506 *total = cost->divide[MODE_INDEX (mode)];
35507 return false;
35508
35509 case PLUS:
35510 if (GET_MODE_CLASS (mode) == MODE_INT
35511 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
35512 {
35513 if (GET_CODE (XEXP (x, 0)) == PLUS
35514 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
35515 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
35516 && CONSTANT_P (XEXP (x, 1)))
35517 {
35518 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
35519 if (val == 2 || val == 4 || val == 8)
35520 {
35521 *total = cost->lea;
35522 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35523 outer_code, opno, speed);
35524 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
35525 outer_code, opno, speed);
35526 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35527 return true;
35528 }
35529 }
35530 else if (GET_CODE (XEXP (x, 0)) == MULT
35531 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
35532 {
35533 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
35534 if (val == 2 || val == 4 || val == 8)
35535 {
35536 *total = cost->lea;
35537 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35538 outer_code, opno, speed);
35539 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35540 return true;
35541 }
35542 }
35543 else if (GET_CODE (XEXP (x, 0)) == PLUS)
35544 {
35545 *total = cost->lea;
35546 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35547 outer_code, opno, speed);
35548 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35549 outer_code, opno, speed);
35550 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35551 return true;
35552 }
35553 }
35554 /* FALLTHRU */
35555
35556 case MINUS:
35557 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35558 {
35559 /* ??? SSE cost should be used here. */
35560 *total = cost->fadd;
35561 return false;
35562 }
35563 else if (X87_FLOAT_MODE_P (mode))
35564 {
35565 *total = cost->fadd;
35566 return false;
35567 }
35568 else if (FLOAT_MODE_P (mode))
35569 {
35570 /* ??? SSE vector cost should be used here. */
35571 *total = cost->fadd;
35572 return false;
35573 }
35574 /* FALLTHRU */
35575
35576 case AND:
35577 case IOR:
35578 case XOR:
35579 if (GET_MODE_CLASS (mode) == MODE_INT
35580 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35581 {
35582 *total = (cost->add * 2
35583 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
35584 << (GET_MODE (XEXP (x, 0)) != DImode))
35585 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
35586 << (GET_MODE (XEXP (x, 1)) != DImode)));
35587 return true;
35588 }
35589 /* FALLTHRU */
35590
35591 case NEG:
35592 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35593 {
35594 /* ??? SSE cost should be used here. */
35595 *total = cost->fchs;
35596 return false;
35597 }
35598 else if (X87_FLOAT_MODE_P (mode))
35599 {
35600 *total = cost->fchs;
35601 return false;
35602 }
35603 else if (FLOAT_MODE_P (mode))
35604 {
35605 /* ??? SSE vector cost should be used here. */
35606 *total = cost->fchs;
35607 return false;
35608 }
35609 /* FALLTHRU */
35610
35611 case NOT:
35612 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35613 {
35614 /* ??? Should be SSE vector operation cost. */
35615 /* At least for published AMD latencies, this really is the same
35616 as the latency for a simple fpu operation like fabs. */
35617 *total = cost->fabs;
35618 }
35619 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35620 *total = cost->add * 2;
35621 else
35622 *total = cost->add;
35623 return false;
35624
35625 case COMPARE:
35626 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
35627 && XEXP (XEXP (x, 0), 1) == const1_rtx
35628 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
35629 && XEXP (x, 1) == const0_rtx)
35630 {
35631 /* This kind of construct is implemented using test[bwl].
35632 Treat it as if we had an AND. */
35633 *total = (cost->add
35634 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
35635 + rtx_cost (const1_rtx, outer_code, opno, speed));
35636 return true;
35637 }
35638 return false;
35639
35640 case FLOAT_EXTEND:
35641 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
35642 *total = 0;
35643 return false;
35644
35645 case ABS:
35646 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35647 /* ??? SSE cost should be used here. */
35648 *total = cost->fabs;
35649 else if (X87_FLOAT_MODE_P (mode))
35650 *total = cost->fabs;
35651 else if (FLOAT_MODE_P (mode))
35652 /* ??? SSE vector cost should be used here. */
35653 *total = cost->fabs;
35654 return false;
35655
35656 case SQRT:
35657 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35658 /* ??? SSE cost should be used here. */
35659 *total = cost->fsqrt;
35660 else if (X87_FLOAT_MODE_P (mode))
35661 *total = cost->fsqrt;
35662 else if (FLOAT_MODE_P (mode))
35663 /* ??? SSE vector cost should be used here. */
35664 *total = cost->fsqrt;
35665 return false;
35666
35667 case UNSPEC:
35668 if (XINT (x, 1) == UNSPEC_TP)
35669 *total = 0;
35670 return false;
35671
35672 case VEC_SELECT:
35673 case VEC_CONCAT:
35674 case VEC_MERGE:
35675 case VEC_DUPLICATE:
35676       /* ??? Assume all of these vector manipulation patterns are
35677 	 recognizable, in which case they all pretty much have the
35678 	 same cost.  */
35679 *total = cost->fabs;
35680 return true;
35681
35682 default:
35683 return false;
35684 }
35685 }
35686
35687 #if TARGET_MACHO
35688
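/* Counter used to generate unique labels for the Mach-O stubs emitted by
   machopic_output_stub below.  */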
35689 static int current_machopic_label_num;
35690
35691 /* Given a symbol name and its associated stub, write out the
35692 definition of the stub. */
35693
35694 void
35695 machopic_output_stub (FILE *file, const char *symb, const char *stub)
35696 {
35697 unsigned int length;
35698 char *binder_name, *symbol_name, lazy_ptr_name[32];
35699 int label = ++current_machopic_label_num;
35700
35701 /* For 64-bit we shouldn't get here. */
35702 gcc_assert (!TARGET_64BIT);
35703
35704 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
35705 symb = targetm.strip_name_encoding (symb);
35706
35707 length = strlen (stub);
35708 binder_name = XALLOCAVEC (char, length + 32);
35709 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
35710
35711 length = strlen (symb);
35712 symbol_name = XALLOCAVEC (char, length + 32);
35713 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
35714
35715 sprintf (lazy_ptr_name, "L%d$lz", label);
35716
35717 if (MACHOPIC_ATT_STUB)
35718 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
35719 else if (MACHOPIC_PURE)
35720 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
35721 else
35722 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
35723
35724 fprintf (file, "%s:\n", stub);
35725 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35726
35727 if (MACHOPIC_ATT_STUB)
35728 {
35729 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
35730 }
35731 else if (MACHOPIC_PURE)
35732 {
35733 /* PIC stub. */
35734 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35735 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
35736 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
35737 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
35738 label, lazy_ptr_name, label);
35739 fprintf (file, "\tjmp\t*%%ecx\n");
35740 }
35741 else
35742 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
35743
35744 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
35745 it needs no stub-binding-helper. */
35746 if (MACHOPIC_ATT_STUB)
35747 return;
35748
35749 fprintf (file, "%s:\n", binder_name);
35750
35751 if (MACHOPIC_PURE)
35752 {
35753 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
35754 fprintf (file, "\tpushl\t%%ecx\n");
35755 }
35756 else
35757 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
35758
35759 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
35760
35761 /* N.B. Keep the correspondence of these
35762 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
35763 old-pic/new-pic/non-pic stubs; altering this will break
35764 compatibility with existing dylibs. */
35765 if (MACHOPIC_PURE)
35766 {
35767 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35768 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
35769 }
35770 else
35771 /* 16-byte -mdynamic-no-pic stub. */
35772 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
35773
35774 fprintf (file, "%s:\n", lazy_ptr_name);
35775 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35776 fprintf (file, ASM_LONG "%s\n", binder_name);
35777 }
35778 #endif /* TARGET_MACHO */
35779
35780 /* Order the registers for the register allocator.  */
35781
35782 void
35783 x86_order_regs_for_local_alloc (void)
35784 {
35785 int pos = 0;
35786 int i;
35787
35788 /* First allocate the local general purpose registers. */
35789 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35790 if (GENERAL_REGNO_P (i) && call_used_regs[i])
35791 reg_alloc_order [pos++] = i;
35792
35793 /* Global general purpose registers. */
35794 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35795 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
35796 reg_alloc_order [pos++] = i;
35797
35798 /* x87 registers come first in case we are doing FP math
35799 using them. */
35800 if (!TARGET_SSE_MATH)
35801 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35802 reg_alloc_order [pos++] = i;
35803
35804 /* SSE registers. */
35805 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
35806 reg_alloc_order [pos++] = i;
35807 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
35808 reg_alloc_order [pos++] = i;
35809
35810 /* Extended REX SSE registers. */
35811 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
35812 reg_alloc_order [pos++] = i;
35813
35814   /* Mask registers.  */
35815 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
35816 reg_alloc_order [pos++] = i;
35817
35818 /* MPX bound registers. */
35819 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
35820 reg_alloc_order [pos++] = i;
35821
35822 /* x87 registers. */
35823 if (TARGET_SSE_MATH)
35824 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35825 reg_alloc_order [pos++] = i;
35826
35827 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
35828 reg_alloc_order [pos++] = i;
35829
35830   /* Initialize the rest of the array, as we do not allocate some registers
35831      at all.  */
35832 while (pos < FIRST_PSEUDO_REGISTER)
35833 reg_alloc_order [pos++] = 0;
35834 }
35835
35836 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
35837 in struct attribute_spec handler. */
35838 static tree
35839 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
35840 tree args,
35841 int flags ATTRIBUTE_UNUSED,
35842 bool *no_add_attrs)
35843 {
35844 if (TREE_CODE (*node) != FUNCTION_TYPE
35845 && TREE_CODE (*node) != METHOD_TYPE
35846 && TREE_CODE (*node) != FIELD_DECL
35847 && TREE_CODE (*node) != TYPE_DECL)
35848 {
35849 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35850 name);
35851 *no_add_attrs = true;
35852 return NULL_TREE;
35853 }
35854 if (TARGET_64BIT)
35855 {
35856 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
35857 name);
35858 *no_add_attrs = true;
35859 return NULL_TREE;
35860 }
35861 if (is_attribute_p ("callee_pop_aggregate_return", name))
35862 {
35863 tree cst;
35864
35865 cst = TREE_VALUE (args);
35866 if (TREE_CODE (cst) != INTEGER_CST)
35867 {
35868 warning (OPT_Wattributes,
35869 "%qE attribute requires an integer constant argument",
35870 name);
35871 *no_add_attrs = true;
35872 }
35873 else if (compare_tree_int (cst, 0) != 0
35874 && compare_tree_int (cst, 1) != 0)
35875 {
35876 warning (OPT_Wattributes,
35877 "argument to %qE attribute is neither zero, nor one",
35878 name);
35879 *no_add_attrs = true;
35880 }
35881
35882 return NULL_TREE;
35883 }
35884
35885 return NULL_TREE;
35886 }
35887
35888 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
35889 struct attribute_spec.handler. */
35890 static tree
35891 ix86_handle_abi_attribute (tree *node, tree name,
35892 tree args ATTRIBUTE_UNUSED,
35893 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35894 {
35895 if (TREE_CODE (*node) != FUNCTION_TYPE
35896 && TREE_CODE (*node) != METHOD_TYPE
35897 && TREE_CODE (*node) != FIELD_DECL
35898 && TREE_CODE (*node) != TYPE_DECL)
35899 {
35900 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35901 name);
35902 *no_add_attrs = true;
35903 return NULL_TREE;
35904 }
35905
35906   /* The ms_abi and sysv_abi attributes are mutually exclusive.  */
35907 if (is_attribute_p ("ms_abi", name))
35908 {
35909 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
35910 {
35911 error ("ms_abi and sysv_abi attributes are not compatible");
35912 }
35913
35914 return NULL_TREE;
35915 }
35916 else if (is_attribute_p ("sysv_abi", name))
35917 {
35918 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
35919 {
35920 error ("ms_abi and sysv_abi attributes are not compatible");
35921 }
35922
35923 return NULL_TREE;
35924 }
35925
35926 return NULL_TREE;
35927 }
35928
35929 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
35930 struct attribute_spec.handler. */
35931 static tree
35932 ix86_handle_struct_attribute (tree *node, tree name,
35933 tree args ATTRIBUTE_UNUSED,
35934 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35935 {
35936 tree *type = NULL;
35937 if (DECL_P (*node))
35938 {
35939 if (TREE_CODE (*node) == TYPE_DECL)
35940 type = &TREE_TYPE (*node);
35941 }
35942 else
35943 type = node;
35944
35945 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
35946 {
35947 warning (OPT_Wattributes, "%qE attribute ignored",
35948 name);
35949 *no_add_attrs = true;
35950 }
35951
35952 else if ((is_attribute_p ("ms_struct", name)
35953 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
35954 || ((is_attribute_p ("gcc_struct", name)
35955 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
35956 {
35957 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
35958 name);
35959 *no_add_attrs = true;
35960 }
35961
35962 return NULL_TREE;
35963 }
35964
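/* Handle an attribute that is only valid on function declarations;
   arguments as in struct attribute_spec.handler.  */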
35965 static tree
35966 ix86_handle_fndecl_attribute (tree *node, tree name,
35967 tree args ATTRIBUTE_UNUSED,
35968 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35969 {
35970 if (TREE_CODE (*node) != FUNCTION_DECL)
35971 {
35972 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35973 name);
35974 *no_add_attrs = true;
35975 }
35976 return NULL_TREE;
35977 }
35978
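/* Return true if bitfields in RECORD_TYPE should be laid out with the
   Microsoft (ms_struct) rules rather than the GCC ones.  */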
35979 static bool
35980 ix86_ms_bitfield_layout_p (const_tree record_type)
35981 {
35982 return ((TARGET_MS_BITFIELD_LAYOUT
35983 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
35984 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
35985 }
35986
35987 /* Return an expression indicating where the `this' parameter is
35988    located on entry to FUNCTION.  */
35989
35990 static rtx
35991 x86_this_parameter (tree function)
35992 {
35993 tree type = TREE_TYPE (function);
35994 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
35995 int nregs;
35996
35997 if (TARGET_64BIT)
35998 {
35999 const int *parm_regs;
36000
36001 if (ix86_function_type_abi (type) == MS_ABI)
36002 parm_regs = x86_64_ms_abi_int_parameter_registers;
36003 else
36004 parm_regs = x86_64_int_parameter_registers;
36005 return gen_rtx_REG (Pmode, parm_regs[aggr]);
36006 }
36007
36008 nregs = ix86_function_regparm (type, function);
36009
36010 if (nregs > 0 && !stdarg_p (type))
36011 {
36012 int regno;
36013 unsigned int ccvt = ix86_get_callcvt (type);
36014
36015 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
36016 regno = aggr ? DX_REG : CX_REG;
36017 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
36018 {
36019 regno = CX_REG;
36020 if (aggr)
36021 return gen_rtx_MEM (SImode,
36022 plus_constant (Pmode, stack_pointer_rtx, 4));
36023 }
36024 else
36025 {
36026 regno = AX_REG;
36027 if (aggr)
36028 {
36029 regno = DX_REG;
36030 if (nregs == 1)
36031 return gen_rtx_MEM (SImode,
36032 plus_constant (Pmode,
36033 stack_pointer_rtx, 4));
36034 }
36035 }
36036 return gen_rtx_REG (SImode, regno);
36037 }
36038
36039 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
36040 aggr ? 8 : 4));
36041 }
36042
36043 /* Determine whether x86_output_mi_thunk can succeed. */
36044
36045 static bool
36046 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
36047 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
36048 HOST_WIDE_INT vcall_offset, const_tree function)
36049 {
36050 /* 64-bit can handle anything. */
36051 if (TARGET_64BIT)
36052 return true;
36053
36054 /* For 32-bit, everything's fine if we have one free register. */
36055 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
36056 return true;
36057
36058 /* Need a free register for vcall_offset. */
36059 if (vcall_offset)
36060 return false;
36061
36062 /* Need a free register for GOT references. */
36063 if (flag_pic && !targetm.binds_local_p (function))
36064 return false;
36065
36066 /* Otherwise ok. */
36067 return true;
36068 }
36069
36070 /* Output the assembler code for a thunk function. THUNK_DECL is the
36071 declaration for the thunk function itself, FUNCTION is the decl for
36072 the target function. DELTA is an immediate constant offset to be
36073 added to THIS. If VCALL_OFFSET is nonzero, the word at
36074 *(*this + vcall_offset) should be added to THIS. */
36075
36076 static void
36077 x86_output_mi_thunk (FILE *file,
36078 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
36079 HOST_WIDE_INT vcall_offset, tree function)
36080 {
36081 rtx this_param = x86_this_parameter (function);
36082 rtx this_reg, tmp, fnaddr;
36083 unsigned int tmp_regno;
36084
36085 if (TARGET_64BIT)
36086 tmp_regno = R10_REG;
36087 else
36088 {
36089 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
36090 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
36091 tmp_regno = AX_REG;
36092 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
36093 tmp_regno = DX_REG;
36094 else
36095 tmp_regno = CX_REG;
36096 }
36097
36098 emit_note (NOTE_INSN_PROLOGUE_END);
36099
36100 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
36101 pull it in now and let DELTA benefit. */
36102 if (REG_P (this_param))
36103 this_reg = this_param;
36104 else if (vcall_offset)
36105 {
36106 /* Put the this parameter into %eax. */
36107 this_reg = gen_rtx_REG (Pmode, AX_REG);
36108 emit_move_insn (this_reg, this_param);
36109 }
36110 else
36111 this_reg = NULL_RTX;
36112
36113 /* Adjust the this parameter by a fixed constant. */
36114 if (delta)
36115 {
36116 rtx delta_rtx = GEN_INT (delta);
36117 rtx delta_dst = this_reg ? this_reg : this_param;
36118
36119 if (TARGET_64BIT)
36120 {
36121 if (!x86_64_general_operand (delta_rtx, Pmode))
36122 {
36123 tmp = gen_rtx_REG (Pmode, tmp_regno);
36124 emit_move_insn (tmp, delta_rtx);
36125 delta_rtx = tmp;
36126 }
36127 }
36128
36129 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
36130 }
36131
36132 /* Adjust the this parameter by a value stored in the vtable. */
36133 if (vcall_offset)
36134 {
36135 rtx vcall_addr, vcall_mem, this_mem;
36136
36137 tmp = gen_rtx_REG (Pmode, tmp_regno);
36138
36139 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
36140 if (Pmode != ptr_mode)
36141 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
36142 emit_move_insn (tmp, this_mem);
36143
36144 /* Adjust the this parameter. */
36145 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
36146 if (TARGET_64BIT
36147 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
36148 {
36149 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
36150 emit_move_insn (tmp2, GEN_INT (vcall_offset));
36151 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
36152 }
36153
36154 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
36155 if (Pmode != ptr_mode)
36156 emit_insn (gen_addsi_1_zext (this_reg,
36157 gen_rtx_REG (ptr_mode,
36158 REGNO (this_reg)),
36159 vcall_mem));
36160 else
36161 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
36162 }
36163
36164 /* If necessary, drop THIS back to its stack slot. */
36165 if (this_reg && this_reg != this_param)
36166 emit_move_insn (this_param, this_reg);
36167
36168 fnaddr = XEXP (DECL_RTL (function), 0);
36169 if (TARGET_64BIT)
36170 {
36171 if (!flag_pic || targetm.binds_local_p (function)
36172 || TARGET_PECOFF)
36173 ;
36174 else
36175 {
36176 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
36177 tmp = gen_rtx_CONST (Pmode, tmp);
36178 fnaddr = gen_rtx_MEM (Pmode, tmp);
36179 }
36180 }
36181 else
36182 {
36183 if (!flag_pic || targetm.binds_local_p (function))
36184 ;
36185 #if TARGET_MACHO
36186 else if (TARGET_MACHO)
36187 {
36188 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
36189 fnaddr = XEXP (fnaddr, 0);
36190 }
36191 #endif /* TARGET_MACHO */
36192 else
36193 {
36194 tmp = gen_rtx_REG (Pmode, CX_REG);
36195 output_set_got (tmp, NULL_RTX);
36196
36197 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
36198 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
36199 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
36200 }
36201 }
36202
36203 /* Our sibling call patterns do not allow memories, because we have no
36204 predicate that can distinguish between frame and non-frame memory.
36205 For our purposes here, we can get away with (ab)using a jump pattern,
36206 because we're going to do no optimization. */
36207 if (MEM_P (fnaddr))
36208 emit_jump_insn (gen_indirect_jump (fnaddr));
36209 else
36210 {
36211 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
36212 fnaddr = legitimize_pic_address (fnaddr,
36213 gen_rtx_REG (Pmode, tmp_regno));
36214
36215 if (!sibcall_insn_operand (fnaddr, word_mode))
36216 {
36217 tmp = gen_rtx_REG (word_mode, tmp_regno);
36218 if (GET_MODE (fnaddr) != word_mode)
36219 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
36220 emit_move_insn (tmp, fnaddr);
36221 fnaddr = tmp;
36222 }
36223
36224 tmp = gen_rtx_MEM (QImode, fnaddr);
36225 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
36226 tmp = emit_call_insn (tmp);
36227 SIBLING_CALL_P (tmp) = 1;
36228 }
36229 emit_barrier ();
36230
36231 /* Emit just enough of rest_of_compilation to get the insns emitted.
36232 Note that use_thunk calls assemble_start_function et al. */
36233 tmp = get_insns ();
36234 shorten_branches (tmp);
36235 final_start_function (tmp, file, 1);
36236 final (tmp, file, 1);
36237 final_end_function ();
36238 }
36239
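/* Emit the assembly that belongs at the start of every output file:
   the Darwin prologue on Mach-O, the optional .version and __fltused
   directives, and .intel_syntax when -masm=intel is selected.  */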
36240 static void
36241 x86_file_start (void)
36242 {
36243 default_file_start ();
36244 #if TARGET_MACHO
36245 darwin_file_start ();
36246 #endif
36247 if (X86_FILE_START_VERSION_DIRECTIVE)
36248 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
36249 if (X86_FILE_START_FLTUSED)
36250 fputs ("\t.global\t__fltused\n", asm_out_file);
36251 if (ix86_asm_dialect == ASM_INTEL)
36252 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
36253 }
36254
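/* Return the alignment in bits to use for FIELD, where COMPUTED is the
   alignment computed so far.  On 32-bit targets without -malign-double,
   cap DFmode, DCmode and integer-class fields at 32-bit alignment.  */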
36255 int
36256 x86_field_alignment (tree field, int computed)
36257 {
36258 enum machine_mode mode;
36259 tree type = TREE_TYPE (field);
36260
36261 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
36262 return computed;
36263 mode = TYPE_MODE (strip_array_types (type));
36264 if (mode == DFmode || mode == DCmode
36265 || GET_MODE_CLASS (mode) == MODE_INT
36266 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
36267 return MIN (32, computed);
36268 return computed;
36269 }
36270
36271 /* Output assembler code to FILE to increment profiler label # LABELNO
36272 for profiling a function entry. */
36273 void
36274 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
36275 {
36276 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
36277 : MCOUNT_NAME);
36278
36279 if (TARGET_64BIT)
36280 {
36281 #ifndef NO_PROFILE_COUNTERS
36282 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
36283 #endif
36284
36285 if (!TARGET_PECOFF && flag_pic)
36286 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
36287 else
36288 fprintf (file, "\tcall\t%s\n", mcount_name);
36289 }
36290 else if (flag_pic)
36291 {
36292 #ifndef NO_PROFILE_COUNTERS
36293 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
36294 LPREFIX, labelno);
36295 #endif
36296 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
36297 }
36298 else
36299 {
36300 #ifndef NO_PROFILE_COUNTERS
36301 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
36302 LPREFIX, labelno);
36303 #endif
36304 fprintf (file, "\tcall\t%s\n", mcount_name);
36305 }
36306 }
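
/* Illustrative example of the output above (not part of the original
   sources), assuming a target that does not define NO_PROFILE_COUNTERS and
   where LPREFIX is ".L", PROFILE_COUNT_REGISTER is "edx" and MCOUNT_NAME is
   "mcount"; with -m32 -pg -fPIC the body emitted per function would be
   roughly:

       leal	.LP0@GOTOFF(%ebx),%edx
       call	*mcount@GOT(%ebx)
   */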
36307
36308 /* We don't have exact information about the insn sizes, but we may assume
36309 quite safely that we are informed about all 1 byte insns and memory
36310 address sizes. This is enough to eliminate unnecessary padding in
36311 99% of cases. */
36312
36313 static int
36314 min_insn_size (rtx insn)
36315 {
36316 int l = 0, len;
36317
36318 if (!INSN_P (insn) || !active_insn_p (insn))
36319 return 0;
36320
36321 /* Discard the alignments we've emitted, and jump instructions. */
36322 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
36323 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
36324 return 0;
36325
36326 /* Important case - calls are always 5 bytes.
36327 It is common to have many calls in a row. */
36328 if (CALL_P (insn)
36329 && symbolic_reference_mentioned_p (PATTERN (insn))
36330 && !SIBLING_CALL_P (insn))
36331 return 5;
36332 len = get_attr_length (insn);
36333 if (len <= 1)
36334 return 1;
36335
36336 /* For normal instructions we rely on get_attr_length being exact,
36337 with a few exceptions. */
36338 if (!JUMP_P (insn))
36339 {
36340 enum attr_type type = get_attr_type (insn);
36341
36342 switch (type)
36343 {
36344 case TYPE_MULTI:
36345 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
36346 || asm_noperands (PATTERN (insn)) >= 0)
36347 return 0;
36348 break;
36349 case TYPE_OTHER:
36350 case TYPE_FCMP:
36351 break;
36352 default:
36353 /* Otherwise trust get_attr_length. */
36354 return len;
36355 }
36356
36357 l = get_attr_length_address (insn);
36358 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
36359 l = 4;
36360 }
36361 if (l)
36362 return 1+l;
36363 else
36364 return 2;
36365 }
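
/* Illustrative lower-bound figures from the routine above (not part of the
   original sources): a non-sibling call through a symbol is counted as
   5 bytes (opcode plus rel32); an insn whose address part is estimated at
   3 bytes and which mentions no symbol is counted as 1 + 3 = 4 bytes; and
   anything get_attr_length reports as 0 or 1 byte is counted as 1 byte.  */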
36366
36367 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36368
36369 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
36370 window. */
36371
36372 static void
36373 ix86_avoid_jump_mispredicts (void)
36374 {
36375 rtx insn, start = get_insns ();
36376 int nbytes = 0, njumps = 0;
36377 int isjump = 0;
36378
36379 /* Look for all minimal intervals of instructions containing 4 jumps.
36380 The intervals are bounded by START and INSN. NBYTES is the total
36381 size of the instructions in the interval, including INSN and not including
36382 START. When NBYTES is smaller than 16 bytes, it is possible
36383 that the ends of START and INSN land in the same 16-byte page.
36384
36385 The smallest offset in the page at which INSN can start is the case where
36386 START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
36387 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
36388 */
36389 for (insn = start; insn; insn = NEXT_INSN (insn))
36390 {
36391 int min_size;
36392
36393 if (LABEL_P (insn))
36394 {
36395 int align = label_to_alignment (insn);
36396 int max_skip = label_to_max_skip (insn);
36397
36398 if (max_skip > 15)
36399 max_skip = 15;
36400 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
36401 already in the current 16 byte page, because otherwise
36402 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
36403 bytes to reach 16 byte boundary. */
36404 if (align <= 0
36405 || (align <= 3 && max_skip != (1 << align) - 1))
36406 max_skip = 0;
36407 if (dump_file)
36408 fprintf (dump_file, "Label %i with max_skip %i\n",
36409 INSN_UID (insn), max_skip);
36410 if (max_skip)
36411 {
36412 while (nbytes + max_skip >= 16)
36413 {
36414 start = NEXT_INSN (start);
36415 if (JUMP_P (start) || CALL_P (start))
36416 njumps--, isjump = 1;
36417 else
36418 isjump = 0;
36419 nbytes -= min_insn_size (start);
36420 }
36421 }
36422 continue;
36423 }
36424
36425 min_size = min_insn_size (insn);
36426 nbytes += min_size;
36427 if (dump_file)
36428 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
36429 INSN_UID (insn), min_size);
36430 if (JUMP_P (insn) || CALL_P (insn))
36431 njumps++;
36432 else
36433 continue;
36434
36435 while (njumps > 3)
36436 {
36437 start = NEXT_INSN (start);
36438 if (JUMP_P (start) || CALL_P (start))
36439 njumps--, isjump = 1;
36440 else
36441 isjump = 0;
36442 nbytes -= min_insn_size (start);
36443 }
36444 gcc_assert (njumps >= 0);
36445 if (dump_file)
36446 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
36447 INSN_UID (start), INSN_UID (insn), nbytes);
36448
36449 if (njumps == 3 && isjump && nbytes < 16)
36450 {
36451 int padsize = 15 - nbytes + min_insn_size (insn);
36452
36453 if (dump_file)
36454 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
36455 INSN_UID (insn), padsize);
36456 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
36457 }
36458 }
36459 }
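
/* Worked example of the padding arithmetic above (illustrative only):
   with NBYTES == 14 and min_insn_size (insn) == 2, the pad emitted before
   INSN is 15 - 14 + 2 = 3 bytes. That is just enough so that, even in the
   worst case where START ends at offset 0 of a 16-byte window, INSN can no
   longer be the fourth jump decoded from that same window.  */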
36460 #endif
36461
36462 /* AMD Athlon works faster
36463 when RET is not the destination of a conditional jump or directly preceded
36464 by another jump instruction. We avoid the penalty by inserting a NOP just
36465 before the RET instruction in such cases. */
36466 static void
36467 ix86_pad_returns (void)
36468 {
36469 edge e;
36470 edge_iterator ei;
36471
36472 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36473 {
36474 basic_block bb = e->src;
36475 rtx ret = BB_END (bb);
36476 rtx prev;
36477 bool replace = false;
36478
36479 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
36480 || optimize_bb_for_size_p (bb))
36481 continue;
36482 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
36483 if (active_insn_p (prev) || LABEL_P (prev))
36484 break;
36485 if (prev && LABEL_P (prev))
36486 {
36487 edge e;
36488 edge_iterator ei;
36489
36490 FOR_EACH_EDGE (e, ei, bb->preds)
36491 if (EDGE_FREQUENCY (e) && e->src->index >= 0
36492 && !(e->flags & EDGE_FALLTHRU))
36493 {
36494 replace = true;
36495 break;
36496 }
36497 }
36498 if (!replace)
36499 {
36500 prev = prev_active_insn (ret);
36501 if (prev
36502 && ((JUMP_P (prev) && any_condjump_p (prev))
36503 || CALL_P (prev)))
36504 replace = true;
36505 /* Empty functions get a branch mispredict even when
36506 the jump destination is not visible to us. */
36507 if (!prev && !optimize_function_for_size_p (cfun))
36508 replace = true;
36509 }
36510 if (replace)
36511 {
36512 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
36513 delete_insn (ret);
36514 }
36515 }
36516 }
36517
36518 /* Count the minimum number of instructions in BB. Return 4 if the
36519 number of instructions >= 4. */
36520
36521 static int
36522 ix86_count_insn_bb (basic_block bb)
36523 {
36524 rtx insn;
36525 int insn_count = 0;
36526
36527 /* Count number of instructions in this block. Return 4 if the number
36528 of instructions >= 4. */
36529 FOR_BB_INSNS (bb, insn)
36530 {
36531 /* This can only happen in exit blocks. */
36532 if (JUMP_P (insn)
36533 && ANY_RETURN_P (PATTERN (insn)))
36534 break;
36535
36536 if (NONDEBUG_INSN_P (insn)
36537 && GET_CODE (PATTERN (insn)) != USE
36538 && GET_CODE (PATTERN (insn)) != CLOBBER)
36539 {
36540 insn_count++;
36541 if (insn_count >= 4)
36542 return insn_count;
36543 }
36544 }
36545
36546 return insn_count;
36547 }
36548
36549
36550 /* Count the minimum number of instructions on a code path ending in BB.
36551 Return 4 if the number of instructions >= 4. */
36552
36553 static int
36554 ix86_count_insn (basic_block bb)
36555 {
36556 edge e;
36557 edge_iterator ei;
36558 int min_prev_count;
36559
36560 /* Only bother counting instructions along paths with no
36561 more than 2 basic blocks between entry and exit. Given
36562 that BB has an edge to exit, determine if a predecessor
36563 of BB has an edge from entry. If so, compute the number
36564 of instructions in the predecessor block. If there
36565 happen to be multiple such blocks, compute the minimum. */
36566 min_prev_count = 4;
36567 FOR_EACH_EDGE (e, ei, bb->preds)
36568 {
36569 edge prev_e;
36570 edge_iterator prev_ei;
36571
36572 if (e->src == ENTRY_BLOCK_PTR)
36573 {
36574 min_prev_count = 0;
36575 break;
36576 }
36577 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
36578 {
36579 if (prev_e->src == ENTRY_BLOCK_PTR)
36580 {
36581 int count = ix86_count_insn_bb (e->src);
36582 if (count < min_prev_count)
36583 min_prev_count = count;
36584 break;
36585 }
36586 }
36587 }
36588
36589 if (min_prev_count < 4)
36590 min_prev_count += ix86_count_insn_bb (bb);
36591
36592 return min_prev_count;
36593 }
36594
36595 /* Pad short functions to 4 instructions. */
36596
36597 static void
36598 ix86_pad_short_function (void)
36599 {
36600 edge e;
36601 edge_iterator ei;
36602
36603 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36604 {
36605 rtx ret = BB_END (e->src);
36606 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
36607 {
36608 int insn_count = ix86_count_insn (e->src);
36609
36610 /* Pad short function. */
36611 if (insn_count < 4)
36612 {
36613 rtx insn = ret;
36614
36615 /* Find epilogue. */
36616 while (insn
36617 && (!NOTE_P (insn)
36618 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
36619 insn = PREV_INSN (insn);
36620
36621 if (!insn)
36622 insn = ret;
36623
36624 /* Two NOPs count as one instruction. */
36625 insn_count = 2 * (4 - insn_count);
36626 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
36627 }
36628 }
36629 }
36630 }
36631
36632 /* Fix up a Windows system unwinder issue. If an EH region falls through into
36633 the epilogue, the Windows system unwinder will apply epilogue logic and
36634 produce incorrect offsets. This can be avoided by adding a nop between
36635 the last insn that can throw and the first insn of the epilogue. */
36636
36637 static void
36638 ix86_seh_fixup_eh_fallthru (void)
36639 {
36640 edge e;
36641 edge_iterator ei;
36642
36643 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36644 {
36645 rtx insn, next;
36646
36647 /* Find the beginning of the epilogue. */
36648 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
36649 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
36650 break;
36651 if (insn == NULL)
36652 continue;
36653
36654 /* We only care about preceding insns that can throw. */
36655 insn = prev_active_insn (insn);
36656 if (insn == NULL || !can_throw_internal (insn))
36657 continue;
36658
36659 /* Do not separate calls from their debug information. */
36660 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
36661 if (NOTE_P (next)
36662 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
36663 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
36664 insn = next;
36665 else
36666 break;
36667
36668 emit_insn_after (gen_nops (const1_rtx), insn);
36669 }
36670 }
36671
36672 /* Implement machine-specific optimizations. We implement padding of returns
36673 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
36674 static void
36675 ix86_reorg (void)
36676 {
36677 /* We are freeing block_for_insn in the toplev to keep compatibility
36678 with old MDEP_REORGS that are not CFG based. Recompute it now. */
36679 compute_bb_for_insn ();
36680
36681 if (TARGET_SEH && current_function_has_exception_handlers ())
36682 ix86_seh_fixup_eh_fallthru ();
36683
36684 if (optimize && optimize_function_for_speed_p (cfun))
36685 {
36686 if (TARGET_PAD_SHORT_FUNCTION)
36687 ix86_pad_short_function ();
36688 else if (TARGET_PAD_RETURNS)
36689 ix86_pad_returns ();
36690 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36691 if (TARGET_FOUR_JUMP_LIMIT)
36692 ix86_avoid_jump_mispredicts ();
36693 #endif
36694 }
36695 }
36696
36697 /* Return nonzero when a QImode register that must be represented via a REX
36698 prefix is used. */
36699 bool
36700 x86_extended_QIreg_mentioned_p (rtx insn)
36701 {
36702 int i;
36703 extract_insn_cached (insn);
36704 for (i = 0; i < recog_data.n_operands; i++)
36705 if (GENERAL_REG_P (recog_data.operand[i])
36706 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
36707 return true;
36708 return false;
36709 }
36710
36711 /* Return nonzero when P points to a register encoded via a REX prefix.
36712 Called via for_each_rtx. */
36713 static int
36714 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
36715 {
36716 unsigned int regno;
36717 if (!REG_P (*p))
36718 return 0;
36719 regno = REGNO (*p);
36720 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
36721 }
36722
36723 /* Return true when INSN mentions a register that must be encoded using a
36724 REX prefix. */
36725 bool
36726 x86_extended_reg_mentioned_p (rtx insn)
36727 {
36728 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
36729 extended_reg_mentioned_1, NULL);
36730 }
36731
36732 /* If profitable, negate (without causing overflow) the integer constant
36733 of mode MODE at location LOC. Return true if it was negated. */
36734 bool
36735 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
36736 {
36737 HOST_WIDE_INT val;
36738
36739 if (!CONST_INT_P (*loc))
36740 return false;
36741
36742 switch (mode)
36743 {
36744 case DImode:
36745 /* DImode x86_64 constants must fit in 32 bits. */
36746 gcc_assert (x86_64_immediate_operand (*loc, mode));
36747
36748 mode = SImode;
36749 break;
36750
36751 case SImode:
36752 case HImode:
36753 case QImode:
36754 break;
36755
36756 default:
36757 gcc_unreachable ();
36758 }
36759
36760 /* Avoid overflows. */
36761 if (mode_signbit_p (mode, *loc))
36762 return false;
36763
36764 val = INTVAL (*loc);
36765
36766 /* Make things pretty: emit `subl $4,%eax' rather than `addl $-4,%eax'.
36767 Exception: -128 encodes smaller than 128, so swap the sign and the op. */
36768 if ((val < 0 && val != -128)
36769 || val == 128)
36770 {
36771 *loc = GEN_INT (-val);
36772 return true;
36773 }
36774
36775 return false;
36776 }
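
/* Illustrative examples of the transformation the routine above enables
   (not part of the original sources); the caller swaps the add/sub
   operation when true is returned:

       addl	$-4, %eax	->	subl	$4, %eax
       addl	$128, %eax	->	subl	$-128, %eax

   The second form wins because -128 fits in a sign-extended 8-bit
   immediate while 128 does not.  */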
36777
36778 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
36779 optabs would emit if we didn't have TFmode patterns. */
36780
36781 void
36782 x86_emit_floatuns (rtx operands[2])
36783 {
36784 rtx neglab, donelab, i0, i1, f0, in, out;
36785 enum machine_mode mode, inmode;
36786
36787 inmode = GET_MODE (operands[1]);
36788 gcc_assert (inmode == SImode || inmode == DImode);
36789
36790 out = operands[0];
36791 in = force_reg (inmode, operands[1]);
36792 mode = GET_MODE (out);
36793 neglab = gen_label_rtx ();
36794 donelab = gen_label_rtx ();
36795 f0 = gen_reg_rtx (mode);
36796
36797 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
36798
36799 expand_float (out, in, 0);
36800
36801 emit_jump_insn (gen_jump (donelab));
36802 emit_barrier ();
36803
36804 emit_label (neglab);
36805
36806 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
36807 1, OPTAB_DIRECT);
36808 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
36809 1, OPTAB_DIRECT);
36810 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
36811
36812 expand_float (f0, i0, 0);
36813
36814 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
36815
36816 emit_label (donelab);
36817 }
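
/* The expansion above corresponds to the usual unsigned-to-float idiom.
   An equivalent C sketch (illustrative only, assuming a DImode input and a
   DFmode result) is:

       double u64_to_double (unsigned long long x)
       {
         if ((long long) x >= 0)
           return (double) (long long) x;
         // Halve, keeping the low bit so rounding is unaffected,
         // convert as signed, then double the result.
         return 2.0 * (double) (long long) ((x >> 1) | (x & 1));
       }
   */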
36818 \f
36819 /* AVX512F supports 64-byte integer vector operations,
36820 so the longest vector we are faced with is V64QImode. */
36821 #define MAX_VECT_LEN 64
36822
36823 struct expand_vec_perm_d
36824 {
36825 rtx target, op0, op1;
36826 unsigned char perm[MAX_VECT_LEN];
36827 enum machine_mode vmode;
36828 unsigned char nelt;
36829 bool one_operand_p;
36830 bool testing_p;
36831 };
36832
36833 static bool canonicalize_perm (struct expand_vec_perm_d *d);
36834 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
36835 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
36836
36837 /* Get a vector mode of the same size as the original but with elements
36838 twice as wide. This is only guaranteed to apply to integral vectors. */
36839
36840 static inline enum machine_mode
36841 get_mode_wider_vector (enum machine_mode o)
36842 {
36843 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
36844 enum machine_mode n = GET_MODE_WIDER_MODE (o);
36845 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
36846 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
36847 return n;
36848 }
36849
36850 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36851 with all elements equal to VAR. Return true if successful. */
36852
36853 static bool
36854 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
36855 rtx target, rtx val)
36856 {
36857 bool ok;
36858
36859 switch (mode)
36860 {
36861 case V2SImode:
36862 case V2SFmode:
36863 if (!mmx_ok)
36864 return false;
36865 /* FALLTHRU */
36866
36867 case V4DFmode:
36868 case V4DImode:
36869 case V8SFmode:
36870 case V8SImode:
36871 case V2DFmode:
36872 case V2DImode:
36873 case V4SFmode:
36874 case V4SImode:
36875 {
36876 rtx insn, dup;
36877
36878 /* First attempt to recognize VAL as-is. */
36879 dup = gen_rtx_VEC_DUPLICATE (mode, val);
36880 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
36881 if (recog_memoized (insn) < 0)
36882 {
36883 rtx seq;
36884 /* If that fails, force VAL into a register. */
36885
36886 start_sequence ();
36887 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
36888 seq = get_insns ();
36889 end_sequence ();
36890 if (seq)
36891 emit_insn_before (seq, insn);
36892
36893 ok = recog_memoized (insn) >= 0;
36894 gcc_assert (ok);
36895 }
36896 }
36897 return true;
36898
36899 case V4HImode:
36900 if (!mmx_ok)
36901 return false;
36902 if (TARGET_SSE || TARGET_3DNOW_A)
36903 {
36904 rtx x;
36905
36906 val = gen_lowpart (SImode, val);
36907 x = gen_rtx_TRUNCATE (HImode, val);
36908 x = gen_rtx_VEC_DUPLICATE (mode, x);
36909 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36910 return true;
36911 }
36912 goto widen;
36913
36914 case V8QImode:
36915 if (!mmx_ok)
36916 return false;
36917 goto widen;
36918
36919 case V8HImode:
36920 if (TARGET_SSE2)
36921 {
36922 struct expand_vec_perm_d dperm;
36923 rtx tmp1, tmp2;
36924
36925 permute:
36926 memset (&dperm, 0, sizeof (dperm));
36927 dperm.target = target;
36928 dperm.vmode = mode;
36929 dperm.nelt = GET_MODE_NUNITS (mode);
36930 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
36931 dperm.one_operand_p = true;
36932
36933 /* Extend to SImode using a paradoxical SUBREG. */
36934 tmp1 = gen_reg_rtx (SImode);
36935 emit_move_insn (tmp1, gen_lowpart (SImode, val));
36936
36937 /* Insert the SImode value as low element of a V4SImode vector. */
36938 tmp2 = gen_reg_rtx (V4SImode);
36939 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
36940 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
36941
36942 ok = (expand_vec_perm_1 (&dperm)
36943 || expand_vec_perm_broadcast_1 (&dperm));
36944 gcc_assert (ok);
36945 return ok;
36946 }
36947 goto widen;
36948
36949 case V16QImode:
36950 if (TARGET_SSE2)
36951 goto permute;
36952 goto widen;
36953
36954 widen:
36955 /* Replicate the value once into the next wider mode and recurse. */
36956 {
36957 enum machine_mode smode, wsmode, wvmode;
36958 rtx x;
36959
36960 smode = GET_MODE_INNER (mode);
36961 wvmode = get_mode_wider_vector (mode);
36962 wsmode = GET_MODE_INNER (wvmode);
36963
36964 val = convert_modes (wsmode, smode, val, true);
36965 x = expand_simple_binop (wsmode, ASHIFT, val,
36966 GEN_INT (GET_MODE_BITSIZE (smode)),
36967 NULL_RTX, 1, OPTAB_LIB_WIDEN);
36968 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
36969
36970 x = gen_reg_rtx (wvmode);
36971 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
36972 gcc_assert (ok);
36973 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
36974 return ok;
36975 }
36976
36977 case V16HImode:
36978 case V32QImode:
36979 {
36980 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
36981 rtx x = gen_reg_rtx (hvmode);
36982
36983 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
36984 gcc_assert (ok);
36985
36986 x = gen_rtx_VEC_CONCAT (mode, x, x);
36987 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36988 }
36989 return true;
36990
36991 default:
36992 return false;
36993 }
36994 }
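
/* Note on the "widen" strategy above (an illustrative summary, not part of
   the original sources): for a vector mode with no direct broadcast, the
   scalar is first replicated into the next wider scalar mode with a shift
   and an IOR, e.g. broadcasting the byte 0xAB for V8QImode:

       0x00AB << 8   =  0xAB00
       0xAB00 | 0xAB =  0xABAB	(an HImode value)

   and the routine then recurses with V4HImode, halving the element count
   each time until a mode with a known duplicate pattern is reached.  */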
36995
36996 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36997 whose ONE_VAR element is VAR, and other elements are zero. Return true
36998 if successful. */
36999
37000 static bool
37001 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
37002 rtx target, rtx var, int one_var)
37003 {
37004 enum machine_mode vsimode;
37005 rtx new_target;
37006 rtx x, tmp;
37007 bool use_vector_set = false;
37008
37009 switch (mode)
37010 {
37011 case V2DImode:
37012 /* For SSE4.1, we normally use vector set. But if the second
37013 element is zero and inter-unit moves are OK, we use movq
37014 instead. */
37015 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
37016 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
37017 && one_var == 0));
37018 break;
37019 case V16QImode:
37020 case V4SImode:
37021 case V4SFmode:
37022 use_vector_set = TARGET_SSE4_1;
37023 break;
37024 case V8HImode:
37025 use_vector_set = TARGET_SSE2;
37026 break;
37027 case V4HImode:
37028 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
37029 break;
37030 case V32QImode:
37031 case V16HImode:
37032 case V8SImode:
37033 case V8SFmode:
37034 case V4DFmode:
37035 use_vector_set = TARGET_AVX;
37036 break;
37037 case V4DImode:
37038 /* Use ix86_expand_vector_set in 64bit mode only. */
37039 use_vector_set = TARGET_AVX && TARGET_64BIT;
37040 break;
37041 default:
37042 break;
37043 }
37044
37045 if (use_vector_set)
37046 {
37047 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
37048 var = force_reg (GET_MODE_INNER (mode), var);
37049 ix86_expand_vector_set (mmx_ok, target, var, one_var);
37050 return true;
37051 }
37052
37053 switch (mode)
37054 {
37055 case V2SFmode:
37056 case V2SImode:
37057 if (!mmx_ok)
37058 return false;
37059 /* FALLTHRU */
37060
37061 case V2DFmode:
37062 case V2DImode:
37063 if (one_var != 0)
37064 return false;
37065 var = force_reg (GET_MODE_INNER (mode), var);
37066 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
37067 emit_insn (gen_rtx_SET (VOIDmode, target, x));
37068 return true;
37069
37070 case V4SFmode:
37071 case V4SImode:
37072 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
37073 new_target = gen_reg_rtx (mode);
37074 else
37075 new_target = target;
37076 var = force_reg (GET_MODE_INNER (mode), var);
37077 x = gen_rtx_VEC_DUPLICATE (mode, var);
37078 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
37079 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
37080 if (one_var != 0)
37081 {
37082 /* We need to shuffle the value to the correct position, so
37083 create a new pseudo to store the intermediate result. */
37084
37085 /* With SSE2, we can use the integer shuffle insns. */
37086 if (mode != V4SFmode && TARGET_SSE2)
37087 {
37088 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
37089 const1_rtx,
37090 GEN_INT (one_var == 1 ? 0 : 1),
37091 GEN_INT (one_var == 2 ? 0 : 1),
37092 GEN_INT (one_var == 3 ? 0 : 1)));
37093 if (target != new_target)
37094 emit_move_insn (target, new_target);
37095 return true;
37096 }
37097
37098 /* Otherwise convert the intermediate result to V4SFmode and
37099 use the SSE1 shuffle instructions. */
37100 if (mode != V4SFmode)
37101 {
37102 tmp = gen_reg_rtx (V4SFmode);
37103 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
37104 }
37105 else
37106 tmp = new_target;
37107
37108 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
37109 const1_rtx,
37110 GEN_INT (one_var == 1 ? 0 : 1),
37111 GEN_INT (one_var == 2 ? 0+4 : 1+4),
37112 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
37113
37114 if (mode != V4SFmode)
37115 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
37116 else if (tmp != target)
37117 emit_move_insn (target, tmp);
37118 }
37119 else if (target != new_target)
37120 emit_move_insn (target, new_target);
37121 return true;
37122
37123 case V8HImode:
37124 case V16QImode:
37125 vsimode = V4SImode;
37126 goto widen;
37127 case V4HImode:
37128 case V8QImode:
37129 if (!mmx_ok)
37130 return false;
37131 vsimode = V2SImode;
37132 goto widen;
37133 widen:
37134 if (one_var != 0)
37135 return false;
37136
37137 /* Zero extend the variable element to SImode and recurse. */
37138 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
37139
37140 x = gen_reg_rtx (vsimode);
37141 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
37142 var, one_var))
37143 gcc_unreachable ();
37144
37145 emit_move_insn (target, gen_lowpart (mode, x));
37146 return true;
37147
37148 default:
37149 return false;
37150 }
37151 }
37152
37153 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
37154 consisting of the values in VALS. It is known that all elements
37155 except ONE_VAR are constants. Return true if successful. */
37156
37157 static bool
37158 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
37159 rtx target, rtx vals, int one_var)
37160 {
37161 rtx var = XVECEXP (vals, 0, one_var);
37162 enum machine_mode wmode;
37163 rtx const_vec, x;
37164
37165 const_vec = copy_rtx (vals);
37166 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
37167 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
37168
37169 switch (mode)
37170 {
37171 case V2DFmode:
37172 case V2DImode:
37173 case V2SFmode:
37174 case V2SImode:
37175 /* For the two element vectors, it's just as easy to use
37176 the general case. */
37177 return false;
37178
37179 case V4DImode:
37180 /* Use ix86_expand_vector_set in 64bit mode only. */
37181 if (!TARGET_64BIT)
37182 return false;
37183 case V4DFmode:
37184 case V8SFmode:
37185 case V8SImode:
37186 case V16HImode:
37187 case V32QImode:
37188 case V4SFmode:
37189 case V4SImode:
37190 case V8HImode:
37191 case V4HImode:
37192 break;
37193
37194 case V16QImode:
37195 if (TARGET_SSE4_1)
37196 break;
37197 wmode = V8HImode;
37198 goto widen;
37199 case V8QImode:
37200 wmode = V4HImode;
37201 goto widen;
37202 widen:
37203 /* There's no way to set one QImode entry easily. Combine
37204 the variable value with its adjacent constant value, and
37205 promote to an HImode set. */
37206 x = XVECEXP (vals, 0, one_var ^ 1);
37207 if (one_var & 1)
37208 {
37209 var = convert_modes (HImode, QImode, var, true);
37210 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
37211 NULL_RTX, 1, OPTAB_LIB_WIDEN);
37212 x = GEN_INT (INTVAL (x) & 0xff);
37213 }
37214 else
37215 {
37216 var = convert_modes (HImode, QImode, var, true);
37217 x = gen_int_mode (INTVAL (x) << 8, HImode);
37218 }
37219 if (x != const0_rtx)
37220 var = expand_simple_binop (HImode, IOR, var, x, var,
37221 1, OPTAB_LIB_WIDEN);
37222
37223 x = gen_reg_rtx (wmode);
37224 emit_move_insn (x, gen_lowpart (wmode, const_vec));
37225 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
37226
37227 emit_move_insn (target, gen_lowpart (mode, x));
37228 return true;
37229
37230 default:
37231 return false;
37232 }
37233
37234 emit_move_insn (target, const_vec);
37235 ix86_expand_vector_set (mmx_ok, target, var, one_var);
37236 return true;
37237 }
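
/* Worked example for the QImode "widen" path above (illustrative only):
   initializing a V8QImode vector whose single variable element is element
   5 pairs it with constant element 4 (= 5 ^ 1).  The variable byte is
   zero-extended to HImode and shifted left by 8, the low 8 bits of the
   constant neighbour are IORed in, and the combined HImode value is then
   stored into element 2 (= 5 >> 1) of the V4HImode view of the constant
   vector.  */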
37238
37239 /* A subroutine of ix86_expand_vector_init_general. Use vector
37240 concatenate to handle the most general case: all values variable,
37241 and none identical. */
37242
37243 static void
37244 ix86_expand_vector_init_concat (enum machine_mode mode,
37245 rtx target, rtx *ops, int n)
37246 {
37247 enum machine_mode cmode, hmode = VOIDmode;
37248 rtx first[8], second[4];
37249 rtvec v;
37250 int i, j;
37251
37252 switch (n)
37253 {
37254 case 2:
37255 switch (mode)
37256 {
37257 case V8SImode:
37258 cmode = V4SImode;
37259 break;
37260 case V8SFmode:
37261 cmode = V4SFmode;
37262 break;
37263 case V4DImode:
37264 cmode = V2DImode;
37265 break;
37266 case V4DFmode:
37267 cmode = V2DFmode;
37268 break;
37269 case V4SImode:
37270 cmode = V2SImode;
37271 break;
37272 case V4SFmode:
37273 cmode = V2SFmode;
37274 break;
37275 case V2DImode:
37276 cmode = DImode;
37277 break;
37278 case V2SImode:
37279 cmode = SImode;
37280 break;
37281 case V2DFmode:
37282 cmode = DFmode;
37283 break;
37284 case V2SFmode:
37285 cmode = SFmode;
37286 break;
37287 default:
37288 gcc_unreachable ();
37289 }
37290
37291 if (!register_operand (ops[1], cmode))
37292 ops[1] = force_reg (cmode, ops[1]);
37293 if (!register_operand (ops[0], cmode))
37294 ops[0] = force_reg (cmode, ops[0]);
37295 emit_insn (gen_rtx_SET (VOIDmode, target,
37296 gen_rtx_VEC_CONCAT (mode, ops[0],
37297 ops[1])));
37298 break;
37299
37300 case 4:
37301 switch (mode)
37302 {
37303 case V4DImode:
37304 cmode = V2DImode;
37305 break;
37306 case V4DFmode:
37307 cmode = V2DFmode;
37308 break;
37309 case V4SImode:
37310 cmode = V2SImode;
37311 break;
37312 case V4SFmode:
37313 cmode = V2SFmode;
37314 break;
37315 default:
37316 gcc_unreachable ();
37317 }
37318 goto half;
37319
37320 case 8:
37321 switch (mode)
37322 {
37323 case V8SImode:
37324 cmode = V2SImode;
37325 hmode = V4SImode;
37326 break;
37327 case V8SFmode:
37328 cmode = V2SFmode;
37329 hmode = V4SFmode;
37330 break;
37331 default:
37332 gcc_unreachable ();
37333 }
37334 goto half;
37335
37336 half:
37337 /* FIXME: We process inputs backward to help RA. PR 36222. */
37338 i = n - 1;
37339 j = (n >> 1) - 1;
37340 for (; i > 0; i -= 2, j--)
37341 {
37342 first[j] = gen_reg_rtx (cmode);
37343 v = gen_rtvec (2, ops[i - 1], ops[i]);
37344 ix86_expand_vector_init (false, first[j],
37345 gen_rtx_PARALLEL (cmode, v));
37346 }
37347
37348 n >>= 1;
37349 if (n > 2)
37350 {
37351 gcc_assert (hmode != VOIDmode);
37352 for (i = j = 0; i < n; i += 2, j++)
37353 {
37354 second[j] = gen_reg_rtx (hmode);
37355 ix86_expand_vector_init_concat (hmode, second [j],
37356 &first [i], 2);
37357 }
37358 n >>= 1;
37359 ix86_expand_vector_init_concat (mode, target, second, n);
37360 }
37361 else
37362 ix86_expand_vector_init_concat (mode, target, first, n);
37363 break;
37364
37365 default:
37366 gcc_unreachable ();
37367 }
37368 }
37369
37370 /* A subroutine of ix86_expand_vector_init_general. Use vector
37371 interleave to handle the most general case: all values variable,
37372 and none identical. */
37373
37374 static void
37375 ix86_expand_vector_init_interleave (enum machine_mode mode,
37376 rtx target, rtx *ops, int n)
37377 {
37378 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
37379 int i, j;
37380 rtx op0, op1;
37381 rtx (*gen_load_even) (rtx, rtx, rtx);
37382 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
37383 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
37384
37385 switch (mode)
37386 {
37387 case V8HImode:
37388 gen_load_even = gen_vec_setv8hi;
37389 gen_interleave_first_low = gen_vec_interleave_lowv4si;
37390 gen_interleave_second_low = gen_vec_interleave_lowv2di;
37391 inner_mode = HImode;
37392 first_imode = V4SImode;
37393 second_imode = V2DImode;
37394 third_imode = VOIDmode;
37395 break;
37396 case V16QImode:
37397 gen_load_even = gen_vec_setv16qi;
37398 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
37399 gen_interleave_second_low = gen_vec_interleave_lowv4si;
37400 inner_mode = QImode;
37401 first_imode = V8HImode;
37402 second_imode = V4SImode;
37403 third_imode = V2DImode;
37404 break;
37405 default:
37406 gcc_unreachable ();
37407 }
37408
37409 for (i = 0; i < n; i++)
37410 {
37411 /* Extend the odd element to SImode using a paradoxical SUBREG. */
37412 op0 = gen_reg_rtx (SImode);
37413 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
37414
37415 /* Insert the SImode value as low element of V4SImode vector. */
37416 op1 = gen_reg_rtx (V4SImode);
37417 op0 = gen_rtx_VEC_MERGE (V4SImode,
37418 gen_rtx_VEC_DUPLICATE (V4SImode,
37419 op0),
37420 CONST0_RTX (V4SImode),
37421 const1_rtx);
37422 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
37423
37424 /* Cast the V4SImode vector back to a vector in the original mode. */
37425 op0 = gen_reg_rtx (mode);
37426 emit_move_insn (op0, gen_lowpart (mode, op1));
37427
37428 /* Load even elements into the second position. */
37429 emit_insn (gen_load_even (op0,
37430 force_reg (inner_mode,
37431 ops [i + i + 1]),
37432 const1_rtx));
37433
37434 /* Cast vector to FIRST_IMODE vector. */
37435 ops[i] = gen_reg_rtx (first_imode);
37436 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
37437 }
37438
37439 /* Interleave low FIRST_IMODE vectors. */
37440 for (i = j = 0; i < n; i += 2, j++)
37441 {
37442 op0 = gen_reg_rtx (first_imode);
37443 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
37444
37445 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
37446 ops[j] = gen_reg_rtx (second_imode);
37447 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
37448 }
37449
37450 /* Interleave low SECOND_IMODE vectors. */
37451 switch (second_imode)
37452 {
37453 case V4SImode:
37454 for (i = j = 0; i < n / 2; i += 2, j++)
37455 {
37456 op0 = gen_reg_rtx (second_imode);
37457 emit_insn (gen_interleave_second_low (op0, ops[i],
37458 ops[i + 1]));
37459
37460 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
37461 vector. */
37462 ops[j] = gen_reg_rtx (third_imode);
37463 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
37464 }
37465 second_imode = V2DImode;
37466 gen_interleave_second_low = gen_vec_interleave_lowv2di;
37467 /* FALLTHRU */
37468
37469 case V2DImode:
37470 op0 = gen_reg_rtx (second_imode);
37471 emit_insn (gen_interleave_second_low (op0, ops[0],
37472 ops[1]));
37473
37474 /* Cast the SECOND_IMODE vector back to a vector in the original
37475 mode. */
37476 emit_insn (gen_rtx_SET (VOIDmode, target,
37477 gen_lowpart (mode, op0)));
37478 break;
37479
37480 default:
37481 gcc_unreachable ();
37482 }
37483 }
37484
37485 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
37486 all values variable, and none identical. */
37487
37488 static void
37489 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
37490 rtx target, rtx vals)
37491 {
37492 rtx ops[32], op0, op1;
37493 enum machine_mode half_mode = VOIDmode;
37494 int n, i;
37495
37496 switch (mode)
37497 {
37498 case V2SFmode:
37499 case V2SImode:
37500 if (!mmx_ok && !TARGET_SSE)
37501 break;
37502 /* FALLTHRU */
37503
37504 case V8SFmode:
37505 case V8SImode:
37506 case V4DFmode:
37507 case V4DImode:
37508 case V4SFmode:
37509 case V4SImode:
37510 case V2DFmode:
37511 case V2DImode:
37512 n = GET_MODE_NUNITS (mode);
37513 for (i = 0; i < n; i++)
37514 ops[i] = XVECEXP (vals, 0, i);
37515 ix86_expand_vector_init_concat (mode, target, ops, n);
37516 return;
37517
37518 case V32QImode:
37519 half_mode = V16QImode;
37520 goto half;
37521
37522 case V16HImode:
37523 half_mode = V8HImode;
37524 goto half;
37525
37526 half:
37527 n = GET_MODE_NUNITS (mode);
37528 for (i = 0; i < n; i++)
37529 ops[i] = XVECEXP (vals, 0, i);
37530 op0 = gen_reg_rtx (half_mode);
37531 op1 = gen_reg_rtx (half_mode);
37532 ix86_expand_vector_init_interleave (half_mode, op0, ops,
37533 n >> 2);
37534 ix86_expand_vector_init_interleave (half_mode, op1,
37535 &ops [n >> 1], n >> 2);
37536 emit_insn (gen_rtx_SET (VOIDmode, target,
37537 gen_rtx_VEC_CONCAT (mode, op0, op1)));
37538 return;
37539
37540 case V16QImode:
37541 if (!TARGET_SSE4_1)
37542 break;
37543 /* FALLTHRU */
37544
37545 case V8HImode:
37546 if (!TARGET_SSE2)
37547 break;
37548
37549 /* Don't use ix86_expand_vector_init_interleave if we can't
37550 move from GPR to SSE register directly. */
37551 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
37552 break;
37553
37554 n = GET_MODE_NUNITS (mode);
37555 for (i = 0; i < n; i++)
37556 ops[i] = XVECEXP (vals, 0, i);
37557 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
37558 return;
37559
37560 case V4HImode:
37561 case V8QImode:
37562 break;
37563
37564 default:
37565 gcc_unreachable ();
37566 }
37567
37568 {
37569 int i, j, n_elts, n_words, n_elt_per_word;
37570 enum machine_mode inner_mode;
37571 rtx words[4], shift;
37572
37573 inner_mode = GET_MODE_INNER (mode);
37574 n_elts = GET_MODE_NUNITS (mode);
37575 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
37576 n_elt_per_word = n_elts / n_words;
37577 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
37578
37579 for (i = 0; i < n_words; ++i)
37580 {
37581 rtx word = NULL_RTX;
37582
37583 for (j = 0; j < n_elt_per_word; ++j)
37584 {
37585 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
37586 elt = convert_modes (word_mode, inner_mode, elt, true);
37587
37588 if (j == 0)
37589 word = elt;
37590 else
37591 {
37592 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
37593 word, 1, OPTAB_LIB_WIDEN);
37594 word = expand_simple_binop (word_mode, IOR, word, elt,
37595 word, 1, OPTAB_LIB_WIDEN);
37596 }
37597 }
37598
37599 words[i] = word;
37600 }
37601
37602 if (n_words == 1)
37603 emit_move_insn (target, gen_lowpart (mode, words[0]));
37604 else if (n_words == 2)
37605 {
37606 rtx tmp = gen_reg_rtx (mode);
37607 emit_clobber (tmp);
37608 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
37609 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
37610 emit_move_insn (target, tmp);
37611 }
37612 else if (n_words == 4)
37613 {
37614 rtx tmp = gen_reg_rtx (V4SImode);
37615 gcc_assert (word_mode == SImode);
37616 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
37617 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
37618 emit_move_insn (target, gen_lowpart (mode, tmp));
37619 }
37620 else
37621 gcc_unreachable ();
37622 }
37623 }
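
/* Illustrative sketch of the generic word-building fallback above (not
   part of the original sources): for a V4HImode vector {a, b, c, d} with
   a 32-bit word_mode, the elements are packed low-element-first into two
   SImode words,

       word0 = (b << 16) | a	word1 = (d << 16) | c

   which are then written into the low and high halves of the destination
   via gen_lowpart/gen_highpart.  */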
37624
37625 /* Initialize vector TARGET via VALS. Suppress the use of MMX
37626 instructions unless MMX_OK is true. */
37627
37628 void
37629 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
37630 {
37631 enum machine_mode mode = GET_MODE (target);
37632 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37633 int n_elts = GET_MODE_NUNITS (mode);
37634 int n_var = 0, one_var = -1;
37635 bool all_same = true, all_const_zero = true;
37636 int i;
37637 rtx x;
37638
37639 for (i = 0; i < n_elts; ++i)
37640 {
37641 x = XVECEXP (vals, 0, i);
37642 if (!(CONST_INT_P (x)
37643 || GET_CODE (x) == CONST_DOUBLE
37644 || GET_CODE (x) == CONST_FIXED))
37645 n_var++, one_var = i;
37646 else if (x != CONST0_RTX (inner_mode))
37647 all_const_zero = false;
37648 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
37649 all_same = false;
37650 }
37651
37652 /* Constants are best loaded from the constant pool. */
37653 if (n_var == 0)
37654 {
37655 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
37656 return;
37657 }
37658
37659 /* If all values are identical, broadcast the value. */
37660 if (all_same
37661 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
37662 XVECEXP (vals, 0, 0)))
37663 return;
37664
37665 /* Values where only one field is non-constant are best loaded from
37666 the pool and overwritten via move later. */
37667 if (n_var == 1)
37668 {
37669 if (all_const_zero
37670 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
37671 XVECEXP (vals, 0, one_var),
37672 one_var))
37673 return;
37674
37675 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
37676 return;
37677 }
37678
37679 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
37680 }
37681
37682 void
37683 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
37684 {
37685 enum machine_mode mode = GET_MODE (target);
37686 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37687 enum machine_mode half_mode;
37688 bool use_vec_merge = false;
37689 rtx tmp;
37690 static rtx (*gen_extract[6][2]) (rtx, rtx)
37691 = {
37692 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
37693 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
37694 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
37695 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
37696 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
37697 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
37698 };
37699 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
37700 = {
37701 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
37702 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
37703 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
37704 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
37705 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
37706 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
37707 };
37708 int i, j, n;
37709
37710 switch (mode)
37711 {
37712 case V2SFmode:
37713 case V2SImode:
37714 if (mmx_ok)
37715 {
37716 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37717 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
37718 if (elt == 0)
37719 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37720 else
37721 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37722 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37723 return;
37724 }
37725 break;
37726
37727 case V2DImode:
37728 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
37729 if (use_vec_merge)
37730 break;
37731
37732 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37733 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
37734 if (elt == 0)
37735 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37736 else
37737 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37738 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37739 return;
37740
37741 case V2DFmode:
37742 {
37743 rtx op0, op1;
37744
37745 /* For the two element vectors, we implement a VEC_CONCAT with
37746 the extraction of the other element. */
37747
37748 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
37749 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
37750
37751 if (elt == 0)
37752 op0 = val, op1 = tmp;
37753 else
37754 op0 = tmp, op1 = val;
37755
37756 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
37757 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37758 }
37759 return;
37760
37761 case V4SFmode:
37762 use_vec_merge = TARGET_SSE4_1;
37763 if (use_vec_merge)
37764 break;
37765
37766 switch (elt)
37767 {
37768 case 0:
37769 use_vec_merge = true;
37770 break;
37771
37772 case 1:
37773 /* tmp = target = A B C D */
37774 tmp = copy_to_reg (target);
37775 /* target = A A B B */
37776 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
37777 /* target = X A B B */
37778 ix86_expand_vector_set (false, target, val, 0);
37779 /* target = A X C D */
37780 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37781 const1_rtx, const0_rtx,
37782 GEN_INT (2+4), GEN_INT (3+4)));
37783 return;
37784
37785 case 2:
37786 /* tmp = target = A B C D */
37787 tmp = copy_to_reg (target);
37788 /* tmp = X B C D */
37789 ix86_expand_vector_set (false, tmp, val, 0);
37790 /* target = A B X D */
37791 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37792 const0_rtx, const1_rtx,
37793 GEN_INT (0+4), GEN_INT (3+4)));
37794 return;
37795
37796 case 3:
37797 /* tmp = target = A B C D */
37798 tmp = copy_to_reg (target);
37799 /* tmp = X B C D */
37800 ix86_expand_vector_set (false, tmp, val, 0);
37801 /* target = A B C X */
37802 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37803 const0_rtx, const1_rtx,
37804 GEN_INT (2+4), GEN_INT (0+4)));
37805 return;
37806
37807 default:
37808 gcc_unreachable ();
37809 }
37810 break;
37811
37812 case V4SImode:
37813 use_vec_merge = TARGET_SSE4_1;
37814 if (use_vec_merge)
37815 break;
37816
37817 /* Element 0 handled by vec_merge below. */
37818 if (elt == 0)
37819 {
37820 use_vec_merge = true;
37821 break;
37822 }
37823
37824 if (TARGET_SSE2)
37825 {
37826 /* With SSE2, use integer shuffles to swap element 0 and ELT,
37827 store into element 0, then shuffle them back. */
37828
37829 rtx order[4];
37830
37831 order[0] = GEN_INT (elt);
37832 order[1] = const1_rtx;
37833 order[2] = const2_rtx;
37834 order[3] = GEN_INT (3);
37835 order[elt] = const0_rtx;
37836
37837 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37838 order[1], order[2], order[3]));
37839
37840 ix86_expand_vector_set (false, target, val, 0);
37841
37842 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37843 order[1], order[2], order[3]));
37844 }
37845 else
37846 {
37847 /* For SSE1, we have to reuse the V4SF code. */
37848 rtx t = gen_reg_rtx (V4SFmode);
37849 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
37850 emit_move_insn (target, gen_lowpart (mode, t));
37851 }
37852 return;
37853
37854 case V8HImode:
37855 use_vec_merge = TARGET_SSE2;
37856 break;
37857 case V4HImode:
37858 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37859 break;
37860
37861 case V16QImode:
37862 use_vec_merge = TARGET_SSE4_1;
37863 break;
37864
37865 case V8QImode:
37866 break;
37867
37868 case V32QImode:
37869 half_mode = V16QImode;
37870 j = 0;
37871 n = 16;
37872 goto half;
37873
37874 case V16HImode:
37875 half_mode = V8HImode;
37876 j = 1;
37877 n = 8;
37878 goto half;
37879
37880 case V8SImode:
37881 half_mode = V4SImode;
37882 j = 2;
37883 n = 4;
37884 goto half;
37885
37886 case V4DImode:
37887 half_mode = V2DImode;
37888 j = 3;
37889 n = 2;
37890 goto half;
37891
37892 case V8SFmode:
37893 half_mode = V4SFmode;
37894 j = 4;
37895 n = 4;
37896 goto half;
37897
37898 case V4DFmode:
37899 half_mode = V2DFmode;
37900 j = 5;
37901 n = 2;
37902 goto half;
37903
37904 half:
37905 /* Compute offset. */
37906 i = elt / n;
37907 elt %= n;
37908
37909 gcc_assert (i <= 1);
37910
37911 /* Extract the half. */
37912 tmp = gen_reg_rtx (half_mode);
37913 emit_insn (gen_extract[j][i] (tmp, target));
37914
37915 /* Put val in tmp at elt. */
37916 ix86_expand_vector_set (false, tmp, val, elt);
37917
37918 /* Put it back. */
37919 emit_insn (gen_insert[j][i] (target, target, tmp));
37920 return;
37921
37922 default:
37923 break;
37924 }
37925
37926 if (use_vec_merge)
37927 {
37928 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
37929 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
37930 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37931 }
37932 else
37933 {
37934 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
37935
37936 emit_move_insn (mem, target);
37937
37938 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
37939 emit_move_insn (tmp, val);
37940
37941 emit_move_insn (target, mem);
37942 }
37943 }
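
/* Worked example of the SSE2 V4SImode path above (illustrative only):
   setting element 2 of {A, B, C, D} to X without SSE4.1 uses the shuffle
   order {2, 1, 0, 3}:

       pshufd with order {2,1,0,3}   ->  C B A D
       set element 0 to X            ->  X B A D
       pshufd with the same order    ->  A B X D

   i.e. the element to be replaced is swapped into lane 0, overwritten
   there, and swapped back.  */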
37944
37945 void
37946 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
37947 {
37948 enum machine_mode mode = GET_MODE (vec);
37949 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37950 bool use_vec_extr = false;
37951 rtx tmp;
37952
37953 switch (mode)
37954 {
37955 case V2SImode:
37956 case V2SFmode:
37957 if (!mmx_ok)
37958 break;
37959 /* FALLTHRU */
37960
37961 case V2DFmode:
37962 case V2DImode:
37963 use_vec_extr = true;
37964 break;
37965
37966 case V4SFmode:
37967 use_vec_extr = TARGET_SSE4_1;
37968 if (use_vec_extr)
37969 break;
37970
37971 switch (elt)
37972 {
37973 case 0:
37974 tmp = vec;
37975 break;
37976
37977 case 1:
37978 case 3:
37979 tmp = gen_reg_rtx (mode);
37980 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
37981 GEN_INT (elt), GEN_INT (elt),
37982 GEN_INT (elt+4), GEN_INT (elt+4)));
37983 break;
37984
37985 case 2:
37986 tmp = gen_reg_rtx (mode);
37987 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
37988 break;
37989
37990 default:
37991 gcc_unreachable ();
37992 }
37993 vec = tmp;
37994 use_vec_extr = true;
37995 elt = 0;
37996 break;
37997
37998 case V4SImode:
37999 use_vec_extr = TARGET_SSE4_1;
38000 if (use_vec_extr)
38001 break;
38002
38003 if (TARGET_SSE2)
38004 {
38005 switch (elt)
38006 {
38007 case 0:
38008 tmp = vec;
38009 break;
38010
38011 case 1:
38012 case 3:
38013 tmp = gen_reg_rtx (mode);
38014 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
38015 GEN_INT (elt), GEN_INT (elt),
38016 GEN_INT (elt), GEN_INT (elt)));
38017 break;
38018
38019 case 2:
38020 tmp = gen_reg_rtx (mode);
38021 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
38022 break;
38023
38024 default:
38025 gcc_unreachable ();
38026 }
38027 vec = tmp;
38028 use_vec_extr = true;
38029 elt = 0;
38030 }
38031 else
38032 {
38033 /* For SSE1, we have to reuse the V4SF code. */
38034 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
38035 gen_lowpart (V4SFmode, vec), elt);
38036 return;
38037 }
38038 break;
38039
38040 case V8HImode:
38041 use_vec_extr = TARGET_SSE2;
38042 break;
38043 case V4HImode:
38044 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
38045 break;
38046
38047 case V16QImode:
38048 use_vec_extr = TARGET_SSE4_1;
38049 break;
38050
38051 case V8SFmode:
38052 if (TARGET_AVX)
38053 {
38054 tmp = gen_reg_rtx (V4SFmode);
38055 if (elt < 4)
38056 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
38057 else
38058 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
38059 ix86_expand_vector_extract (false, target, tmp, elt & 3);
38060 return;
38061 }
38062 break;
38063
38064 case V4DFmode:
38065 if (TARGET_AVX)
38066 {
38067 tmp = gen_reg_rtx (V2DFmode);
38068 if (elt < 2)
38069 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
38070 else
38071 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
38072 ix86_expand_vector_extract (false, target, tmp, elt & 1);
38073 return;
38074 }
38075 break;
38076
38077 case V32QImode:
38078 if (TARGET_AVX)
38079 {
38080 tmp = gen_reg_rtx (V16QImode);
38081 if (elt < 16)
38082 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
38083 else
38084 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
38085 ix86_expand_vector_extract (false, target, tmp, elt & 15);
38086 return;
38087 }
38088 break;
38089
38090 case V16HImode:
38091 if (TARGET_AVX)
38092 {
38093 tmp = gen_reg_rtx (V8HImode);
38094 if (elt < 8)
38095 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
38096 else
38097 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
38098 ix86_expand_vector_extract (false, target, tmp, elt & 7);
38099 return;
38100 }
38101 break;
38102
38103 case V8SImode:
38104 if (TARGET_AVX)
38105 {
38106 tmp = gen_reg_rtx (V4SImode);
38107 if (elt < 4)
38108 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
38109 else
38110 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
38111 ix86_expand_vector_extract (false, target, tmp, elt & 3);
38112 return;
38113 }
38114 break;
38115
38116 case V4DImode:
38117 if (TARGET_AVX)
38118 {
38119 tmp = gen_reg_rtx (V2DImode);
38120 if (elt < 2)
38121 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
38122 else
38123 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
38124 ix86_expand_vector_extract (false, target, tmp, elt & 1);
38125 return;
38126 }
38127 break;
38128
38129 case V8QImode:
38130 /* ??? Could extract the appropriate HImode element and shift. */
38131 default:
38132 break;
38133 }
38134
38135 if (use_vec_extr)
38136 {
38137 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
38138 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
38139
38140 /* Let the rtl optimizers know about the zero extension performed. */
38141 if (inner_mode == QImode || inner_mode == HImode)
38142 {
38143 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
38144 target = gen_lowpart (SImode, target);
38145 }
38146
38147 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
38148 }
38149 else
38150 {
38151 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
38152
38153 emit_move_insn (mem, vec);
38154
38155 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
38156 emit_move_insn (target, tmp);
38157 }
38158 }
38159
38160 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
38161 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
38162 The upper bits of DEST are undefined, though they shouldn't cause
38163 exceptions (some bits from src or all zeros are ok). */
38164
38165 static void
38166 emit_reduc_half (rtx dest, rtx src, int i)
38167 {
38168 rtx tem, d = dest;
38169 switch (GET_MODE (src))
38170 {
38171 case V4SFmode:
38172 if (i == 128)
38173 tem = gen_sse_movhlps (dest, src, src);
38174 else
38175 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
38176 GEN_INT (1 + 4), GEN_INT (1 + 4));
38177 break;
38178 case V2DFmode:
38179 tem = gen_vec_interleave_highv2df (dest, src, src);
38180 break;
38181 case V16QImode:
38182 case V8HImode:
38183 case V4SImode:
38184 case V2DImode:
38185 d = gen_reg_rtx (V1TImode);
38186 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
38187 GEN_INT (i / 2));
38188 break;
38189 case V8SFmode:
38190 if (i == 256)
38191 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
38192 else
38193 tem = gen_avx_shufps256 (dest, src, src,
38194 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
38195 break;
38196 case V4DFmode:
38197 if (i == 256)
38198 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
38199 else
38200 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
38201 break;
38202 case V32QImode:
38203 case V16HImode:
38204 case V8SImode:
38205 case V4DImode:
38206 if (i == 256)
38207 {
38208 if (GET_MODE (dest) != V4DImode)
38209 d = gen_reg_rtx (V4DImode);
38210 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
38211 gen_lowpart (V4DImode, src),
38212 const1_rtx);
38213 }
38214 else
38215 {
38216 d = gen_reg_rtx (V2TImode);
38217 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
38218 GEN_INT (i / 2));
38219 }
38220 break;
38221 default:
38222 gcc_unreachable ();
38223 }
38224 emit_insn (tem);
38225 if (d != dest)
38226 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
38227 }
38228
38229 /* Expand a vector reduction. FN is the binary pattern to reduce;
38230 DEST is the destination; IN is the input vector. */
38231
38232 void
38233 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
38234 {
38235 rtx half, dst, vec = in;
38236 enum machine_mode mode = GET_MODE (in);
38237 int i;
38238
38239 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
38240 if (TARGET_SSE4_1
38241 && mode == V8HImode
38242 && fn == gen_uminv8hi3)
38243 {
38244 emit_insn (gen_sse4_1_phminposuw (dest, in));
38245 return;
38246 }
38247
38248 for (i = GET_MODE_BITSIZE (mode);
38249 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
38250 i >>= 1)
38251 {
38252 half = gen_reg_rtx (mode);
38253 emit_reduc_half (half, vec, i);
38254 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
38255 dst = dest;
38256 else
38257 dst = gen_reg_rtx (mode);
38258 emit_insn (fn (dst, half, vec));
38259 vec = dst;
38260 }
38261 }
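
/* Illustrative trace of the reduction loop above (not part of the original
   sources) for a V8HImode input and FN = an addition pattern:

       i = 128:  half = vec  >> 64 bits;  tmp1 = half + vec
       i =  64:  half = tmp1 >> 32 bits;  tmp2 = half + tmp1
       i =  32:  half = tmp2 >> 16 bits;  dest = half + tmp2

   After log2 (8) = 3 steps, lane 0 of DEST holds the value reduced over
   all eight original elements.  */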
38262 \f
38263 /* Target hook for scalar_mode_supported_p. */
38264 static bool
38265 ix86_scalar_mode_supported_p (enum machine_mode mode)
38266 {
38267 if (DECIMAL_FLOAT_MODE_P (mode))
38268 return default_decimal_float_supported_p ();
38269 else if (mode == TFmode)
38270 return true;
38271 else
38272 return default_scalar_mode_supported_p (mode);
38273 }
38274
38275 /* Implements target hook vector_mode_supported_p. */
38276 static bool
38277 ix86_vector_mode_supported_p (enum machine_mode mode)
38278 {
38279 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
38280 return true;
38281 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
38282 return true;
38283 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
38284 return true;
38285 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
38286 return true;
38287 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
38288 return true;
38289 return false;
38290 }
38291
38292 /* Target hook for c_mode_for_suffix. */
38293 static enum machine_mode
38294 ix86_c_mode_for_suffix (char suffix)
38295 {
38296 if (suffix == 'q')
38297 return TFmode;
38298 if (suffix == 'w')
38299 return XFmode;
38300
38301 return VOIDmode;
38302 }
38303
38304 /* Worker function for TARGET_MD_ASM_CLOBBERS.
38305
38306 We do this in the new i386 backend to maintain source compatibility
38307 with the old cc0-based compiler. */
38308
38309 static tree
38310 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
38311 tree inputs ATTRIBUTE_UNUSED,
38312 tree clobbers)
38313 {
38314 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
38315 clobbers);
38316 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
38317 clobbers);
38318 return clobbers;
38319 }
38320
38321 /* Implements target vector targetm.asm.encode_section_info. */
38322
38323 static void ATTRIBUTE_UNUSED
38324 ix86_encode_section_info (tree decl, rtx rtl, int first)
38325 {
38326 default_encode_section_info (decl, rtl, first);
38327
38328 if (TREE_CODE (decl) == VAR_DECL
38329 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
38330 && ix86_in_large_data_p (decl))
38331 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
38332 }
38333
38334 /* Worker function for REVERSE_CONDITION. */
38335
38336 enum rtx_code
38337 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
38338 {
38339 return (mode != CCFPmode && mode != CCFPUmode
38340 ? reverse_condition (code)
38341 : reverse_condition_maybe_unordered (code));
38342 }
38343
38344 /* Output code to perform an x87 FP register move, from OPERANDS[1]
38345 to OPERANDS[0]. */
38346
38347 const char *
38348 output_387_reg_move (rtx insn, rtx *operands)
38349 {
38350 if (REG_P (operands[0]))
38351 {
38352 if (REG_P (operands[1])
38353 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
38354 {
38355 if (REGNO (operands[0]) == FIRST_STACK_REG)
38356 return output_387_ffreep (operands, 0);
38357 return "fstp\t%y0";
38358 }
38359 if (STACK_TOP_P (operands[0]))
38360 return "fld%Z1\t%y1";
38361 return "fst\t%y0";
38362 }
38363 else if (MEM_P (operands[0]))
38364 {
38365 gcc_assert (REG_P (operands[1]));
38366 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
38367 return "fstp%Z0\t%y0";
38368 else
38369 {
38370 /* There is no non-popping store to memory for XFmode.
38371 So if we need one, follow the store with a load. */
38372 if (GET_MODE (operands[0]) == XFmode)
38373 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
38374 else
38375 return "fst%Z0\t%y0";
38376 }
38377 }
38378 else
38379 gcc_unreachable();
38380 }
38381
38382 /* Output code to perform a conditional jump to LABEL if the C2 flag in
38383 the FP status register is set. */
38384
38385 void
38386 ix86_emit_fp_unordered_jump (rtx label)
38387 {
38388 rtx reg = gen_reg_rtx (HImode);
38389 rtx temp;
38390
38391 emit_insn (gen_x86_fnstsw_1 (reg));
38392
38393 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
38394 {
38395 emit_insn (gen_x86_sahf_1 (reg));
38396
38397 temp = gen_rtx_REG (CCmode, FLAGS_REG);
38398 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
38399 }
38400 else
38401 {
38402 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
38403
38404 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
38405 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
38406 }
38407
38408 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
38409 gen_rtx_LABEL_REF (VOIDmode, label),
38410 pc_rtx);
38411 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
38412
38413 emit_jump_insn (temp);
38414 predict_jump (REG_BR_PROB_BASE * 10 / 100);
38415 }
38416
38417 /* Output code to perform a log1p XFmode calculation. */
38418
38419 void ix86_emit_i387_log1p (rtx op0, rtx op1)
38420 {
38421 rtx label1 = gen_label_rtx ();
38422 rtx label2 = gen_label_rtx ();
38423
38424 rtx tmp = gen_reg_rtx (XFmode);
38425 rtx tmp2 = gen_reg_rtx (XFmode);
38426 rtx test;
38427
38428 emit_insn (gen_absxf2 (tmp, op1));
38429 test = gen_rtx_GE (VOIDmode, tmp,
38430 CONST_DOUBLE_FROM_REAL_VALUE (
38431 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
38432 XFmode));
38433 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
38434
38435 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
38436 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
38437 emit_jump (label2);
38438
38439 emit_label (label1);
38440 emit_move_insn (tmp, CONST1_RTX (XFmode));
38441 emit_insn (gen_addxf3 (tmp, op1, tmp));
38442 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
38443 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
38444
38445 emit_label (label2);
38446 }
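/* Illustrative sketch of the case split above in plain C (not compiled;
   the helper name and the use of libm functions are stand-ins for the
   x87 sequences actually emitted).  fyl2xp1 is only accurate for small
   arguments, so inputs at or above the threshold (approximately
   1 - sqrt(2)/2) take the ordinary log path on 1 + x.  */
#if 0
#include <math.h>

static long double
i387_log1p_sketch (long double x)
{
  const long double limit = 0.29289321881345247561810596348408353L;

  if (fabsl (x) >= limit)
    return logl (1.0L + x);     /* fldln2; fyl2x on (1 + x) */
  else
    return log1pl (x);          /* fldln2; fyl2xp1 on x */
}
#endif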
38447
38448 /* Emit x87 code computing round (OP1) into OP0: round to nearest, halfway cases away from zero. */
38449 void ix86_emit_i387_round (rtx op0, rtx op1)
38450 {
38451 enum machine_mode inmode = GET_MODE (op1);
38452 enum machine_mode outmode = GET_MODE (op0);
38453 rtx e1, e2, res, tmp, tmp1, half;
38454 rtx scratch = gen_reg_rtx (HImode);
38455 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
38456 rtx jump_label = gen_label_rtx ();
38457 rtx insn;
38458 rtx (*gen_abs) (rtx, rtx);
38459 rtx (*gen_neg) (rtx, rtx);
38460
38461 switch (inmode)
38462 {
38463 case SFmode:
38464 gen_abs = gen_abssf2;
38465 break;
38466 case DFmode:
38467 gen_abs = gen_absdf2;
38468 break;
38469 case XFmode:
38470 gen_abs = gen_absxf2;
38471 break;
38472 default:
38473 gcc_unreachable ();
38474 }
38475
38476 switch (outmode)
38477 {
38478 case SFmode:
38479 gen_neg = gen_negsf2;
38480 break;
38481 case DFmode:
38482 gen_neg = gen_negdf2;
38483 break;
38484 case XFmode:
38485 gen_neg = gen_negxf2;
38486 break;
38487 case HImode:
38488 gen_neg = gen_neghi2;
38489 break;
38490 case SImode:
38491 gen_neg = gen_negsi2;
38492 break;
38493 case DImode:
38494 gen_neg = gen_negdi2;
38495 break;
38496 default:
38497 gcc_unreachable ();
38498 }
38499
38500 e1 = gen_reg_rtx (inmode);
38501 e2 = gen_reg_rtx (inmode);
38502 res = gen_reg_rtx (outmode);
38503
38504 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
38505
38506 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
38507
38508 /* scratch = fxam(op1) */
38509 emit_insn (gen_rtx_SET (VOIDmode, scratch,
38510 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
38511 UNSPEC_FXAM)));
38512 /* e1 = fabs(op1) */
38513 emit_insn (gen_abs (e1, op1));
38514
38515 /* e2 = e1 + 0.5 */
38516 half = force_reg (inmode, half);
38517 emit_insn (gen_rtx_SET (VOIDmode, e2,
38518 gen_rtx_PLUS (inmode, e1, half)));
38519
38520 /* res = floor(e2) */
38521 if (inmode != XFmode)
38522 {
38523 tmp1 = gen_reg_rtx (XFmode);
38524
38525 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
38526 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
38527 }
38528 else
38529 tmp1 = e2;
38530
38531 switch (outmode)
38532 {
38533 case SFmode:
38534 case DFmode:
38535 {
38536 rtx tmp0 = gen_reg_rtx (XFmode);
38537
38538 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
38539
38540 emit_insn (gen_rtx_SET (VOIDmode, res,
38541 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
38542 UNSPEC_TRUNC_NOOP)));
38543 }
38544 break;
38545 case XFmode:
38546 emit_insn (gen_frndintxf2_floor (res, tmp1));
38547 break;
38548 case HImode:
38549 emit_insn (gen_lfloorxfhi2 (res, tmp1));
38550 break;
38551 case SImode:
38552 emit_insn (gen_lfloorxfsi2 (res, tmp1));
38553 break;
38554 case DImode:
38555 emit_insn (gen_lfloorxfdi2 (res, tmp1));
38556 break;
38557 default:
38558 gcc_unreachable ();
38559 }
38560
38561 /* flags = signbit(a) */
38562 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
38563
38564 /* if (flags) then res = -res */
38565 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
38566 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
38567 gen_rtx_LABEL_REF (VOIDmode, jump_label),
38568 pc_rtx);
38569 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38570 predict_jump (REG_BR_PROB_BASE * 50 / 100);
38571 JUMP_LABEL (insn) = jump_label;
38572
38573 emit_insn (gen_neg (res, res));
38574
38575 emit_label (jump_label);
38576 LABEL_NUSES (jump_label) = 1;
38577
38578 emit_move_insn (op0, res);
38579 }
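/* Illustrative sketch of the identity implemented above, written with
   libm calls (not compiled; the real expander keeps everything in x87
   registers and reads the sign from fxam instead of signbit).  */
#if 0
#include <math.h>

static double
i387_round_sketch (double a)
{
  /* round (a) = sgn (a) * floor (fabs (a) + 0.5) */
  double r = floor (fabs (a) + 0.5);
  return signbit (a) ? -r : r;
}
#endif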
38580
38581 /* Output code to perform a Newton-Raphson approximation of a single precision
38582 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
38583
38584 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
38585 {
38586 rtx x0, x1, e0, e1;
38587
38588 x0 = gen_reg_rtx (mode);
38589 e0 = gen_reg_rtx (mode);
38590 e1 = gen_reg_rtx (mode);
38591 x1 = gen_reg_rtx (mode);
38592
38593 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
38594
38595 b = force_reg (mode, b);
38596
38597 /* x0 = rcp(b) estimate */
38598 emit_insn (gen_rtx_SET (VOIDmode, x0,
38599 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
38600 UNSPEC_RCP)));
38601 /* e0 = x0 * b */
38602 emit_insn (gen_rtx_SET (VOIDmode, e0,
38603 gen_rtx_MULT (mode, x0, b)));
38604
38605 /* e0 = x0 * e0 */
38606 emit_insn (gen_rtx_SET (VOIDmode, e0,
38607 gen_rtx_MULT (mode, x0, e0)));
38608
38609 /* e1 = x0 + x0 */
38610 emit_insn (gen_rtx_SET (VOIDmode, e1,
38611 gen_rtx_PLUS (mode, x0, x0)));
38612
38613 /* x1 = e1 - e0 */
38614 emit_insn (gen_rtx_SET (VOIDmode, x1,
38615 gen_rtx_MINUS (mode, e1, e0)));
38616
38617 /* res = a * x1 */
38618 emit_insn (gen_rtx_SET (VOIDmode, res,
38619 gen_rtx_MULT (mode, a, x1)));
38620 }
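/* Illustrative sketch of the same refinement with SSE intrinsics (not
   compiled; the function name is made up).  rcpss delivers roughly a
   12-bit reciprocal estimate; the single Newton-Raphson step below
   roughly doubles the number of correct bits.  */
#if 0
#include <xmmintrin.h>

static float
swdiv_sketch (float a, float b)
{
  float x0, e0, e1, x1;

  x0 = _mm_cvtss_f32 (_mm_rcp_ss (_mm_set_ss (b)));   /* x0 = rcp (b) */
  e0 = x0 * b;                                        /* e0 = x0 * b  */
  e0 = x0 * e0;                                       /* e0 = x0 * e0 */
  e1 = x0 + x0;                                       /* e1 = x0 + x0 */
  x1 = e1 - e0;                                       /* x1 = e1 - e0 */
  return a * x1;                                      /* res = a * x1 */
}
#endif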
38621
38622 /* Output code to perform a Newton-Raphson approximation of a
38623 single precision floating point [reciprocal] square root. */
38624
38625 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
38626 bool recip)
38627 {
38628 rtx x0, e0, e1, e2, e3, mthree, mhalf;
38629 REAL_VALUE_TYPE r;
38630
38631 x0 = gen_reg_rtx (mode);
38632 e0 = gen_reg_rtx (mode);
38633 e1 = gen_reg_rtx (mode);
38634 e2 = gen_reg_rtx (mode);
38635 e3 = gen_reg_rtx (mode);
38636
38637 real_from_integer (&r, VOIDmode, -3, SIGNED);
38638 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38639
38640 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
38641 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38642
38643 if (VECTOR_MODE_P (mode))
38644 {
38645 mthree = ix86_build_const_vector (mode, true, mthree);
38646 mhalf = ix86_build_const_vector (mode, true, mhalf);
38647 }
38648
38649 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
38650 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
38651
38652 a = force_reg (mode, a);
38653
38654 /* x0 = rsqrt(a) estimate */
38655 emit_insn (gen_rtx_SET (VOIDmode, x0,
38656 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
38657 UNSPEC_RSQRT)));
38658
38659 /* If a == 0.0, mask out the infinite estimate so sqrt (0.0) does not become 0 * inf = NaN. */
38660 if (!recip)
38661 {
38662 rtx zero, mask;
38663
38664 zero = gen_reg_rtx (mode);
38665 mask = gen_reg_rtx (mode);
38666
38667 zero = force_reg (mode, CONST0_RTX(mode));
38668 emit_insn (gen_rtx_SET (VOIDmode, mask,
38669 gen_rtx_NE (mode, zero, a)));
38670
38671 emit_insn (gen_rtx_SET (VOIDmode, x0,
38672 gen_rtx_AND (mode, x0, mask)));
38673 }
38674
38675 /* e0 = x0 * a */
38676 emit_insn (gen_rtx_SET (VOIDmode, e0,
38677 gen_rtx_MULT (mode, x0, a)));
38678 /* e1 = e0 * x0 */
38679 emit_insn (gen_rtx_SET (VOIDmode, e1,
38680 gen_rtx_MULT (mode, e0, x0)));
38681
38682 /* e2 = e1 - 3. */
38683 mthree = force_reg (mode, mthree);
38684 emit_insn (gen_rtx_SET (VOIDmode, e2,
38685 gen_rtx_PLUS (mode, e1, mthree)));
38686
38687 mhalf = force_reg (mode, mhalf);
38688 if (recip)
38689 /* e3 = -.5 * x0 */
38690 emit_insn (gen_rtx_SET (VOIDmode, e3,
38691 gen_rtx_MULT (mode, x0, mhalf)));
38692 else
38693 /* e3 = -.5 * e0 */
38694 emit_insn (gen_rtx_SET (VOIDmode, e3,
38695 gen_rtx_MULT (mode, e0, mhalf)));
38696 /* ret = e2 * e3 */
38697 emit_insn (gen_rtx_SET (VOIDmode, res,
38698 gen_rtx_MULT (mode, e2, e3)));
38699 }
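/* Illustrative sketch of the formulas above with the rsqrtss estimate
   spelled out via intrinsics (not compiled; the name is made up and the
   zero-input masking done above for the sqrt case is omitted here).  */
#if 0
#include <xmmintrin.h>

static float
swsqrt_sketch (float a, int recip)
{
  float x0, e0, e1, e2, e3;

  x0 = _mm_cvtss_f32 (_mm_rsqrt_ss (_mm_set_ss (a)));  /* rsqrt estimate */
  e0 = x0 * a;                                         /* e0 = x0 * a   */
  e1 = e0 * x0;                                        /* e1 = e0 * x0  */
  e2 = e1 - 3.0f;                                      /* e2 = e1 - 3   */
  e3 = (recip ? x0 : e0) * -0.5f;                      /* e3 = -.5 * x0 (or e0) */
  return e2 * e3;                                      /* sqrt (a) resp. 1/sqrt (a) */
}
#endif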
38700
38701 #ifdef TARGET_SOLARIS
38702 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
38703
38704 static void
38705 i386_solaris_elf_named_section (const char *name, unsigned int flags,
38706 tree decl)
38707 {
38708 /* With Binutils 2.15, the "@unwind" marker must be specified on
38709 every occurrence of the ".eh_frame" section, not just the first
38710 one. */
38711 if (TARGET_64BIT
38712 && strcmp (name, ".eh_frame") == 0)
38713 {
38714 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
38715 flags & SECTION_WRITE ? "aw" : "a");
38716 return;
38717 }
38718
38719 #ifndef USE_GAS
38720 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
38721 {
38722 solaris_elf_asm_comdat_section (name, flags, decl);
38723 return;
38724 }
38725 #endif
38726
38727 default_elf_asm_named_section (name, flags, decl);
38728 }
38729 #endif /* TARGET_SOLARIS */
38730
38731 /* Return the mangling of TYPE if it is an extended fundamental type. */
38732
38733 static const char *
38734 ix86_mangle_type (const_tree type)
38735 {
38736 type = TYPE_MAIN_VARIANT (type);
38737
38738 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
38739 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
38740 return NULL;
38741
38742 switch (TYPE_MODE (type))
38743 {
38744 case TFmode:
38745 /* __float128 is "g". */
38746 return "g";
38747 case XFmode:
38748 /* "long double" or __float80 is "e". */
38749 return "e";
38750 default:
38751 return NULL;
38752 }
38753 }
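/* For example, under the Itanium C++ ABI (with the default 80-bit
   long double):

     void f (__float128);   mangles as _Z1fg
     void f (long double);  mangles as _Z1fe

   and the two overloads differ only in this one-letter code.  */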
38754
38755 /* For 32-bit code we can avoid the PIC register setup by using the
38756 hidden function __stack_chk_fail_local instead of calling
38757 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
38758 register, so it is better to call __stack_chk_fail directly. */
38759
38760 static tree ATTRIBUTE_UNUSED
38761 ix86_stack_protect_fail (void)
38762 {
38763 return TARGET_64BIT
38764 ? default_external_stack_protect_fail ()
38765 : default_hidden_stack_protect_fail ();
38766 }
38767
38768 /* Select a format to encode pointers in exception handling data. CODE
38769 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
38770 true if the symbol may be affected by dynamic relocations.
38771
38772 ??? All x86 object file formats are capable of representing this.
38773 After all, the relocation needed is the same as for the call insn.
38774 Whether or not a particular assembler allows us to enter such, I
38775 guess we'll have to see. */
38776 int
38777 asm_preferred_eh_data_format (int code, int global)
38778 {
38779 if (flag_pic)
38780 {
38781 int type = DW_EH_PE_sdata8;
38782 if (!TARGET_64BIT
38783 || ix86_cmodel == CM_SMALL_PIC
38784 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
38785 type = DW_EH_PE_sdata4;
38786 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
38787 }
38788 if (ix86_cmodel == CM_SMALL
38789 || (ix86_cmodel == CM_MEDIUM && code))
38790 return DW_EH_PE_udata4;
38791 return DW_EH_PE_absptr;
38792 }
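/* For example, 32-bit PIC code describes a global symbol with
   DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4
   (0x80 | 0x10 | 0x0b = 0x9b), while 64-bit non-PIC code with the small
   code model uses plain DW_EH_PE_udata4 (0x03).  */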
38793 \f
38794 /* Expand copysign: copy the sign of SIGN onto the positive value
38795 ABS_VALUE, storing the result in RESULT. If MASK is non-null, it is
38796 the mask used to mask out the sign bit. */
38797 static void
38798 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
38799 {
38800 enum machine_mode mode = GET_MODE (sign);
38801 rtx sgn = gen_reg_rtx (mode);
38802 if (mask == NULL_RTX)
38803 {
38804 enum machine_mode vmode;
38805
38806 if (mode == SFmode)
38807 vmode = V4SFmode;
38808 else if (mode == DFmode)
38809 vmode = V2DFmode;
38810 else
38811 vmode = mode;
38812
38813 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
38814 if (!VECTOR_MODE_P (mode))
38815 {
38816 /* We need to generate a scalar mode mask in this case. */
38817 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38818 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38819 mask = gen_reg_rtx (mode);
38820 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38821 }
38822 }
38823 else
38824 mask = gen_rtx_NOT (mode, mask);
38825 emit_insn (gen_rtx_SET (VOIDmode, sgn,
38826 gen_rtx_AND (mode, mask, sign)));
38827 emit_insn (gen_rtx_SET (VOIDmode, result,
38828 gen_rtx_IOR (mode, abs_value, sgn)));
38829 }
38830
38831 /* Expand fabs (OP0) and return a new rtx that holds the result. The
38832 mask for masking out the sign-bit is stored in *SMASK, if that is
38833 non-null. */
38834 static rtx
38835 ix86_expand_sse_fabs (rtx op0, rtx *smask)
38836 {
38837 enum machine_mode vmode, mode = GET_MODE (op0);
38838 rtx xa, mask;
38839
38840 xa = gen_reg_rtx (mode);
38841 if (mode == SFmode)
38842 vmode = V4SFmode;
38843 else if (mode == DFmode)
38844 vmode = V2DFmode;
38845 else
38846 vmode = mode;
38847 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
38848 if (!VECTOR_MODE_P (mode))
38849 {
38850 /* We need to generate a scalar mode mask in this case. */
38851 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38852 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38853 mask = gen_reg_rtx (mode);
38854 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38855 }
38856 emit_insn (gen_rtx_SET (VOIDmode, xa,
38857 gen_rtx_AND (mode, op0, mask)));
38858
38859 if (smask)
38860 *smask = mask;
38861
38862 return xa;
38863 }
38864
38865 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
38866 swapping the operands if SWAP_OPERANDS is true. The expanded
38867 code is a forward jump to a newly created label in case the
38868 comparison is true. The generated label rtx is returned. */
38869 static rtx
38870 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
38871 bool swap_operands)
38872 {
38873 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
38874 rtx label, tmp;
38875
38876 if (swap_operands)
38877 {
38878 tmp = op0;
38879 op0 = op1;
38880 op1 = tmp;
38881 }
38882
38883 label = gen_label_rtx ();
38884 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
38885 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38886 gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
38887 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
38888 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
38889 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
38890 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38891 JUMP_LABEL (tmp) = label;
38892
38893 return label;
38894 }
38895
38896 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
38897 using comparison code CODE. Operands are swapped for the comparison if
38898 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
38899 static rtx
38900 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
38901 bool swap_operands)
38902 {
38903 rtx (*insn)(rtx, rtx, rtx, rtx);
38904 enum machine_mode mode = GET_MODE (op0);
38905 rtx mask = gen_reg_rtx (mode);
38906
38907 if (swap_operands)
38908 {
38909 rtx tmp = op0;
38910 op0 = op1;
38911 op1 = tmp;
38912 }
38913
38914 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
38915
38916 emit_insn (insn (mask, op0, op1,
38917 gen_rtx_fmt_ee (code, mode, op0, op1)));
38918 return mask;
38919 }
38920
38921 /* Generate and return a rtx of mode MODE for 2**n where n is the number
38922 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
38923 static rtx
38924 ix86_gen_TWO52 (enum machine_mode mode)
38925 {
38926 REAL_VALUE_TYPE TWO52r;
38927 rtx TWO52;
38928
38929 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
38930 TWO52 = const_double_from_real_value (TWO52r, mode);
38931 TWO52 = force_reg (mode, TWO52);
38932
38933 return TWO52;
38934 }
38935
38936 /* Expand SSE sequence for computing lround from OP1 storing
38937 into OP0. */
38938 void
38939 ix86_expand_lround (rtx op0, rtx op1)
38940 {
38941 /* C code for the stuff we're doing below:
38942 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
38943 return (long)tmp;
38944 */
38945 enum machine_mode mode = GET_MODE (op1);
38946 const struct real_format *fmt;
38947 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38948 rtx adj;
38949
38950 /* load nextafter (0.5, 0.0) */
38951 fmt = REAL_MODE_FORMAT (mode);
38952 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38953 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38954
38955 /* adj = copysign (0.5, op1) */
38956 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
38957 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
38958
38959 /* adj = op1 + adj */
38960 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
38961
38962 /* op0 = (imode)adj */
38963 expand_fix (op0, adj, 0);
38964 }
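/* Illustrative sketch of the sequence above in plain C (not compiled;
   the name is made up).  Adding copysign (nextafter (0.5, 0.0), x)
   instead of +/-0.5 keeps inputs just below the half-way point from
   being bumped to the next integer by the addition itself.  */
#if 0
#include <math.h>

static long
lround_sketch (double x)
{
  double adj = copysign (nextafter (0.5, 0.0), x);
  return (long) (x + adj);      /* truncating conversion, as expand_fix */
}
#endif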
38965
38966 /* Expand SSE2 sequence for computing lfloor or lceil (depending on
38967 DO_FLOOR) from OP1, storing the result into OP0. */
38968 void
38969 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
38970 {
38971 /* C code for the stuff we're doing below (for do_floor):
38972 xi = (long)op1;
38973 xi -= (double)xi > op1 ? 1 : 0;
38974 return xi;
38975 */
38976 enum machine_mode fmode = GET_MODE (op1);
38977 enum machine_mode imode = GET_MODE (op0);
38978 rtx ireg, freg, label, tmp;
38979
38980 /* reg = (long)op1 */
38981 ireg = gen_reg_rtx (imode);
38982 expand_fix (ireg, op1, 0);
38983
38984 /* freg = (double)reg */
38985 freg = gen_reg_rtx (fmode);
38986 expand_float (freg, ireg, 0);
38987
38988 /* ireg = (freg > op1) ? ireg - 1 : ireg */
38989 label = ix86_expand_sse_compare_and_jump (UNLE,
38990 freg, op1, !do_floor);
38991 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
38992 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
38993 emit_move_insn (ireg, tmp);
38994
38995 emit_label (label);
38996 LABEL_NUSES (label) = 1;
38997
38998 emit_move_insn (op0, ireg);
38999 }
39000
39001 /* Expand rint, rounding OPERAND1 to an integer in the current rounding
39002 mode (round to nearest by default) and storing the result in OPERAND0. */
39003 void
39004 ix86_expand_rint (rtx operand0, rtx operand1)
39005 {
39006 /* C code for the stuff we're doing below:
39007 xa = fabs (operand1);
39008 if (!isless (xa, 2**52))
39009 return operand1;
39010 xa = xa + 2**52 - 2**52;
39011 return copysign (xa, operand1);
39012 */
39013 enum machine_mode mode = GET_MODE (operand0);
39014 rtx res, xa, label, TWO52, mask;
39015
39016 res = gen_reg_rtx (mode);
39017 emit_move_insn (res, operand1);
39018
39019 /* xa = abs (operand1) */
39020 xa = ix86_expand_sse_fabs (res, &mask);
39021
39022 /* if (!isless (xa, TWO52)) goto label; */
39023 TWO52 = ix86_gen_TWO52 (mode);
39024 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39025
39026 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39027 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
39028
39029 ix86_sse_copysign_to_positive (res, xa, res, mask);
39030
39031 emit_label (label);
39032 LABEL_NUSES (label) = 1;
39033
39034 emit_move_insn (operand0, res);
39035 }
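/* Illustrative sketch of the 2**52 trick used above (not compiled; the
   name is made up).  For 0 <= xa < 2**52 the sum xa + 2**52 has no
   fraction bits, so the addition rounds xa to an integer in the current
   rounding mode and the subtraction recovers that integer.  Note that
   -ffast-math would fold the add/sub pair away.  */
#if 0
#include <math.h>

static double
rint_sketch (double x)
{
  const double two52 = 4503599627370496.0;    /* 2**52 */
  double xa = fabs (x);

  if (!(xa < two52))            /* also true for NaN */
    return x;                   /* already integral, or NaN/Inf */

  xa = (xa + two52) - two52;
  return copysign (xa, x);      /* restore the sign, keeping -0.0 */
}
#endif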
39036
39037 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
39038 into OPERAND0. */
39039 void
39040 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
39041 {
39042 /* C code for the stuff we expand below.
39043 double xa = fabs (x), x2;
39044 if (!isless (xa, TWO52))
39045 return x;
39046 xa = xa + TWO52 - TWO52;
39047 x2 = copysign (xa, x);
39048 Compensate. Floor:
39049 if (x2 > x)
39050 x2 -= 1;
39051 Compensate. Ceil:
39052 if (x2 < x)
39053 x2 -= -1;
39054 return x2;
39055 */
39056 enum machine_mode mode = GET_MODE (operand0);
39057 rtx xa, TWO52, tmp, label, one, res, mask;
39058
39059 TWO52 = ix86_gen_TWO52 (mode);
39060
39061 /* Temporary for holding the result, initialized to the input
39062 operand to ease control flow. */
39063 res = gen_reg_rtx (mode);
39064 emit_move_insn (res, operand1);
39065
39066 /* xa = abs (operand1) */
39067 xa = ix86_expand_sse_fabs (res, &mask);
39068
39069 /* if (!isless (xa, TWO52)) goto label; */
39070 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39071
39072 /* xa = xa + TWO52 - TWO52; */
39073 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39074 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
39075
39076 /* xa = copysign (xa, operand1) */
39077 ix86_sse_copysign_to_positive (xa, xa, res, mask);
39078
39079 /* generate 1.0 or -1.0 */
39080 one = force_reg (mode,
39081 const_double_from_real_value (do_floor
39082 ? dconst1 : dconstm1, mode));
39083
39084 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
39085 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
39086 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39087 gen_rtx_AND (mode, one, tmp)));
39088 /* We always need to subtract here to preserve signed zero. */
39089 tmp = expand_simple_binop (mode, MINUS,
39090 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39091 emit_move_insn (res, tmp);
39092
39093 emit_label (label);
39094 LABEL_NUSES (label) = 1;
39095
39096 emit_move_insn (operand0, res);
39097 }
39098
39099 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
39100 into OPERAND0. */
39101 void
39102 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
39103 {
39104 /* C code for the stuff we expand below.
39105 double xa = fabs (x), x2;
39106 if (!isless (xa, TWO52))
39107 return x;
39108 x2 = (double)(long)x;
39109 Compensate. Floor:
39110 if (x2 > x)
39111 x2 -= 1;
39112 Compensate. Ceil:
39113 if (x2 < x)
39114 x2 += 1;
39115 if (HONOR_SIGNED_ZEROS (mode))
39116 return copysign (x2, x);
39117 return x2;
39118 */
39119 enum machine_mode mode = GET_MODE (operand0);
39120 rtx xa, xi, TWO52, tmp, label, one, res, mask;
39121
39122 TWO52 = ix86_gen_TWO52 (mode);
39123
39124 /* Temporary for holding the result, initialized to the input
39125 operand to ease control flow. */
39126 res = gen_reg_rtx (mode);
39127 emit_move_insn (res, operand1);
39128
39129 /* xa = abs (operand1) */
39130 xa = ix86_expand_sse_fabs (res, &mask);
39131
39132 /* if (!isless (xa, TWO52)) goto label; */
39133 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39134
39135 /* xa = (double)(long)x */
39136 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39137 expand_fix (xi, res, 0);
39138 expand_float (xa, xi, 0);
39139
39140 /* generate 1.0 */
39141 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
39142
39143 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
39144 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
39145 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39146 gen_rtx_AND (mode, one, tmp)));
39147 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
39148 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39149 emit_move_insn (res, tmp);
39150
39151 if (HONOR_SIGNED_ZEROS (mode))
39152 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
39153
39154 emit_label (label);
39155 LABEL_NUSES (label) = 1;
39156
39157 emit_move_insn (operand0, res);
39158 }
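/* Illustrative sketch of the do_floor case above (not compiled; the name
   is made up).  The fix/float pair truncates toward zero, so negative
   non-integral inputs come back one too high and are compensated; the
   final copysign corresponds to the HONOR_SIGNED_ZEROS path.  */
#if 0
#include <math.h>

static double
floor_sketch (double x)
{
  const double two52 = 4503599627370496.0;    /* 2**52 */
  double xa = fabs (x), x2;

  if (!(xa < two52))
    return x;                           /* already integral, or NaN/Inf */

  x2 = (double) (long long) x;          /* truncate toward zero */
  if (x2 > x)                           /* e.g. x = -2.5 gives x2 = -2.0 */
    x2 -= 1.0;
  return copysign (x2, x);              /* preserve the sign of -0.0 */
}
#endif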
39159
39160 /* Expand SSE sequence for computing round from OPERAND1 storing
39161 into OPERAND0. This sequence works without relying on DImode truncation
39162 via cvttsd2siq, which is only available on 64-bit targets. */
39163 void
39164 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
39165 {
39166 /* C code for the stuff we expand below.
39167 double xa = fabs (x), xa2, x2;
39168 if (!isless (xa, TWO52))
39169 return x;
39170 Using the absolute value and copying back sign makes
39171 -0.0 -> -0.0 correct.
39172 xa2 = xa + TWO52 - TWO52;
39173 Compensate.
39174 dxa = xa2 - xa;
39175 if (dxa <= -0.5)
39176 xa2 += 1;
39177 else if (dxa > 0.5)
39178 xa2 -= 1;
39179 x2 = copysign (xa2, x);
39180 return x2;
39181 */
39182 enum machine_mode mode = GET_MODE (operand0);
39183 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
39184
39185 TWO52 = ix86_gen_TWO52 (mode);
39186
39187 /* Temporary for holding the result, initialized to the input
39188 operand to ease control flow. */
39189 res = gen_reg_rtx (mode);
39190 emit_move_insn (res, operand1);
39191
39192 /* xa = abs (operand1) */
39193 xa = ix86_expand_sse_fabs (res, &mask);
39194
39195 /* if (!isless (xa, TWO52)) goto label; */
39196 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39197
39198 /* xa2 = xa + TWO52 - TWO52; */
39199 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39200 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
39201
39202 /* dxa = xa2 - xa; */
39203 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
39204
39205 /* generate 0.5, 1.0 and -0.5 */
39206 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
39207 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
39208 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
39209 0, OPTAB_DIRECT);
39210
39211 /* Compensate. */
39212 tmp = gen_reg_rtx (mode);
39213 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
39214 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
39215 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39216 gen_rtx_AND (mode, one, tmp)));
39217 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39218 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
39219 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
39220 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39221 gen_rtx_AND (mode, one, tmp)));
39222 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39223
39224 /* res = copysign (xa2, operand1) */
39225 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
39226
39227 emit_label (label);
39228 LABEL_NUSES (label) = 1;
39229
39230 emit_move_insn (operand0, res);
39231 }
39232
39233 /* Expand SSE sequence for computing trunc from OPERAND1 storing
39234 into OPERAND0. */
39235 void
39236 ix86_expand_trunc (rtx operand0, rtx operand1)
39237 {
39238 /* C code for SSE variant we expand below.
39239 double xa = fabs (x), x2;
39240 if (!isless (xa, TWO52))
39241 return x;
39242 x2 = (double)(long)x;
39243 if (HONOR_SIGNED_ZEROS (mode))
39244 return copysign (x2, x);
39245 return x2;
39246 */
39247 enum machine_mode mode = GET_MODE (operand0);
39248 rtx xa, xi, TWO52, label, res, mask;
39249
39250 TWO52 = ix86_gen_TWO52 (mode);
39251
39252 /* Temporary for holding the result, initialized to the input
39253 operand to ease control flow. */
39254 res = gen_reg_rtx (mode);
39255 emit_move_insn (res, operand1);
39256
39257 /* xa = abs (operand1) */
39258 xa = ix86_expand_sse_fabs (res, &mask);
39259
39260 /* if (!isless (xa, TWO52)) goto label; */
39261 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39262
39263 /* x = (double)(long)x */
39264 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39265 expand_fix (xi, res, 0);
39266 expand_float (res, xi, 0);
39267
39268 if (HONOR_SIGNED_ZEROS (mode))
39269 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
39270
39271 emit_label (label);
39272 LABEL_NUSES (label) = 1;
39273
39274 emit_move_insn (operand0, res);
39275 }
39276
39277 /* Expand SSE sequence for computing trunc from OPERAND1 storing into
39278 OPERAND0, without relying on DImode truncation (usable on 32-bit targets). */
39279 void
39280 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
39281 {
39282 enum machine_mode mode = GET_MODE (operand0);
39283 rtx xa, mask, TWO52, label, one, res, smask, tmp;
39284
39285 /* C code for SSE variant we expand below.
39286 double xa = fabs (x), x2;
39287 if (!isless (xa, TWO52))
39288 return x;
39289 xa2 = xa + TWO52 - TWO52;
39290 Compensate:
39291 if (xa2 > xa)
39292 xa2 -= 1.0;
39293 x2 = copysign (xa2, x);
39294 return x2;
39295 */
39296
39297 TWO52 = ix86_gen_TWO52 (mode);
39298
39299 /* Temporary for holding the result, initialized to the input
39300 operand to ease control flow. */
39301 res = gen_reg_rtx (mode);
39302 emit_move_insn (res, operand1);
39303
39304 /* xa = abs (operand1) */
39305 xa = ix86_expand_sse_fabs (res, &smask);
39306
39307 /* if (!isless (xa, TWO52)) goto label; */
39308 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39309
39310 /* res = xa + TWO52 - TWO52; */
39311 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39312 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
39313 emit_move_insn (res, tmp);
39314
39315 /* generate 1.0 */
39316 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
39317
39318 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
39319 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
39320 emit_insn (gen_rtx_SET (VOIDmode, mask,
39321 gen_rtx_AND (mode, mask, one)));
39322 tmp = expand_simple_binop (mode, MINUS,
39323 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
39324 emit_move_insn (res, tmp);
39325
39326 /* res = copysign (res, operand1) */
39327 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
39328
39329 emit_label (label);
39330 LABEL_NUSES (label) = 1;
39331
39332 emit_move_insn (operand0, res);
39333 }
39334
39335 /* Expand SSE sequence for computing round from OPERAND1 storing
39336 into OPERAND0. */
39337 void
39338 ix86_expand_round (rtx operand0, rtx operand1)
39339 {
39340 /* C code for the stuff we're doing below:
39341 double xa = fabs (x);
39342 if (!isless (xa, TWO52))
39343 return x;
39344 xa = (double)(long)(xa + nextafter (0.5, 0.0));
39345 return copysign (xa, x);
39346 */
39347 enum machine_mode mode = GET_MODE (operand0);
39348 rtx res, TWO52, xa, label, xi, half, mask;
39349 const struct real_format *fmt;
39350 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39351
39352 /* Temporary for holding the result, initialized to the input
39353 operand to ease control flow. */
39354 res = gen_reg_rtx (mode);
39355 emit_move_insn (res, operand1);
39356
39357 TWO52 = ix86_gen_TWO52 (mode);
39358 xa = ix86_expand_sse_fabs (res, &mask);
39359 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39360
39361 /* load nextafter (0.5, 0.0) */
39362 fmt = REAL_MODE_FORMAT (mode);
39363 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39364 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39365
39366 /* xa = xa + 0.5 */
39367 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
39368 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
39369
39370 /* xa = (double)(int64_t)xa */
39371 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39372 expand_fix (xi, xa, 0);
39373 expand_float (xa, xi, 0);
39374
39375 /* res = copysign (xa, operand1) */
39376 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
39377
39378 emit_label (label);
39379 LABEL_NUSES (label) = 1;
39380
39381 emit_move_insn (operand0, res);
39382 }
39383
39384 /* Expand SSE sequence for computing round
39385 from OP1 storing into OP0 using the SSE4.1 round insn. */
39386 void
39387 ix86_expand_round_sse4 (rtx op0, rtx op1)
39388 {
39389 enum machine_mode mode = GET_MODE (op0);
39390 rtx e1, e2, res, half;
39391 const struct real_format *fmt;
39392 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39393 rtx (*gen_copysign) (rtx, rtx, rtx);
39394 rtx (*gen_round) (rtx, rtx, rtx);
39395
39396 switch (mode)
39397 {
39398 case SFmode:
39399 gen_copysign = gen_copysignsf3;
39400 gen_round = gen_sse4_1_roundsf2;
39401 break;
39402 case DFmode:
39403 gen_copysign = gen_copysigndf3;
39404 gen_round = gen_sse4_1_rounddf2;
39405 break;
39406 default:
39407 gcc_unreachable ();
39408 }
39409
39410 /* round (a) = trunc (a + copysign (0.5, a)) */
39411
39412 /* load nextafter (0.5, 0.0) */
39413 fmt = REAL_MODE_FORMAT (mode);
39414 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39415 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39416 half = const_double_from_real_value (pred_half, mode);
39417
39418 /* e1 = copysign (0.5, op1) */
39419 e1 = gen_reg_rtx (mode);
39420 emit_insn (gen_copysign (e1, half, op1));
39421
39422 /* e2 = op1 + e1 */
39423 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
39424
39425 /* res = trunc (e2) */
39426 res = gen_reg_rtx (mode);
39427 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
39428
39429 emit_move_insn (op0, res);
39430 }
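/* Illustrative sketch of the identity used above (not compiled; the name
   is made up and trunc () stands in for the ROUND_TRUNC form of the
   SSE4.1 round instruction): round (a) = trunc (a + copysign (0.5, a)),
   with 0.5 replaced by nextafter (0.5, 0.0) so that inputs just below
   the half-way point are not rounded up by the addition.  */
#if 0
#include <math.h>

static double
round_sse4_sketch (double a)
{
  double half = nextafter (0.5, 0.0);
  return trunc (a + copysign (half, a));
}
#endif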
39431 \f
39432
39433 /* Table of valid machine attributes. */
39434 static const struct attribute_spec ix86_attribute_table[] =
39435 {
39436 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
39437 affects_type_identity } */
39438 /* Stdcall attribute says callee is responsible for popping arguments
39439 if they are not variable. */
39440 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39441 true },
39442 /* Fastcall attribute says callee is responsible for popping arguments
39443 if they are not variable. */
39444 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39445 true },
39446 /* Thiscall attribute says callee is responsible for popping arguments
39447 if they are not variable. */
39448 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39449 true },
39450 /* Cdecl attribute says the callee is a normal C declaration */
39451 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39452 true },
39453 /* Regparm attribute specifies how many integer arguments are to be
39454 passed in registers. */
39455 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
39456 true },
39457 /* Sseregparm attribute says we are using x86_64 calling conventions
39458 for FP arguments. */
39459 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39460 true },
39461 /* The transactional memory builtins are implicitly regparm or fastcall
39462 depending on the ABI. Override the generic do-nothing attribute that
39463 these builtins were declared with. */
39464 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
39465 true },
39466 /* force_align_arg_pointer says this function realigns the stack at entry. */
39467 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
39468 false, true, true, ix86_handle_cconv_attribute, false },
39469 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39470 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
39471 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
39472 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
39473 false },
39474 #endif
39475 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
39476 false },
39477 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
39478 false },
39479 #ifdef SUBTARGET_ATTRIBUTE_TABLE
39480 SUBTARGET_ATTRIBUTE_TABLE,
39481 #endif
39482 /* ms_abi and sysv_abi calling convention function attributes. */
39483 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
39484 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
39485 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
39486 false },
39487 { "callee_pop_aggregate_return", 1, 1, false, true, true,
39488 ix86_handle_callee_pop_aggregate_return, true },
39489 /* End element. */
39490 { NULL, 0, 0, false, false, false, NULL, false }
39491 };
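/* Illustrative uses of a few of the attributes registered above:

     int __attribute__ ((fastcall)) f (int a, int b);
        (first two integer arguments passed in ECX and EDX)

     int __attribute__ ((regparm (3))) g (int, int, int);
        (up to three integer arguments passed in EAX, EDX and ECX)

     void __attribute__ ((ms_abi)) h (void);
        (called with the Microsoft x86-64 calling convention)  */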
39492
39493 /* Implement targetm.vectorize.builtin_vectorization_cost. */
39494 static int
39495 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
39496 tree vectype,
39497 int misalign ATTRIBUTE_UNUSED)
39498 {
39499 unsigned elements;
39500
39501 switch (type_of_cost)
39502 {
39503 case scalar_stmt:
39504 return ix86_cost->scalar_stmt_cost;
39505
39506 case scalar_load:
39507 return ix86_cost->scalar_load_cost;
39508
39509 case scalar_store:
39510 return ix86_cost->scalar_store_cost;
39511
39512 case vector_stmt:
39513 return ix86_cost->vec_stmt_cost;
39514
39515 case vector_load:
39516 return ix86_cost->vec_align_load_cost;
39517
39518 case vector_store:
39519 return ix86_cost->vec_store_cost;
39520
39521 case vec_to_scalar:
39522 return ix86_cost->vec_to_scalar_cost;
39523
39524 case scalar_to_vec:
39525 return ix86_cost->scalar_to_vec_cost;
39526
39527 case unaligned_load:
39528 case unaligned_store:
39529 return ix86_cost->vec_unalign_load_cost;
39530
39531 case cond_branch_taken:
39532 return ix86_cost->cond_taken_branch_cost;
39533
39534 case cond_branch_not_taken:
39535 return ix86_cost->cond_not_taken_branch_cost;
39536
39537 case vec_perm:
39538 case vec_promote_demote:
39539 return ix86_cost->vec_stmt_cost;
39540
39541 case vec_construct:
39542 elements = TYPE_VECTOR_SUBPARTS (vectype);
39543 return elements / 2 + 1;
39544
39545 default:
39546 gcc_unreachable ();
39547 }
39548 }
39549
39550 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
39551 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
39552 insn every time. */
39553
39554 static GTY(()) rtx vselect_insn;
39555
39556 /* Initialize vselect_insn. */
39557
39558 static void
39559 init_vselect_insn (void)
39560 {
39561 unsigned i;
39562 rtx x;
39563
39564 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
39565 for (i = 0; i < MAX_VECT_LEN; ++i)
39566 XVECEXP (x, 0, i) = const0_rtx;
39567 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
39568 const0_rtx), x);
39569 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
39570 start_sequence ();
39571 vselect_insn = emit_insn (x);
39572 end_sequence ();
39573 }
39574
39575 /* Construct (set target (vec_select op0 (parallel perm))) and
39576 return true if that's a valid instruction in the active ISA. */
39577
39578 static bool
39579 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
39580 unsigned nelt, bool testing_p)
39581 {
39582 unsigned int i;
39583 rtx x, save_vconcat;
39584 int icode;
39585
39586 if (vselect_insn == NULL_RTX)
39587 init_vselect_insn ();
39588
39589 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
39590 PUT_NUM_ELEM (XVEC (x, 0), nelt);
39591 for (i = 0; i < nelt; ++i)
39592 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
39593 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39594 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
39595 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
39596 SET_DEST (PATTERN (vselect_insn)) = target;
39597 icode = recog_memoized (vselect_insn);
39598
39599 if (icode >= 0 && !testing_p)
39600 emit_insn (copy_rtx (PATTERN (vselect_insn)));
39601
39602 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
39603 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
39604 INSN_CODE (vselect_insn) = -1;
39605
39606 return icode >= 0;
39607 }
39608
39609 /* Similar, but generate a vec_concat from op0 and op1 as well. */
39610
39611 static bool
39612 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
39613 const unsigned char *perm, unsigned nelt,
39614 bool testing_p)
39615 {
39616 enum machine_mode v2mode;
39617 rtx x;
39618 bool ok;
39619
39620 if (vselect_insn == NULL_RTX)
39621 init_vselect_insn ();
39622
39623 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
39624 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39625 PUT_MODE (x, v2mode);
39626 XEXP (x, 0) = op0;
39627 XEXP (x, 1) = op1;
39628 ok = expand_vselect (target, x, perm, nelt, testing_p);
39629 XEXP (x, 0) = const0_rtx;
39630 XEXP (x, 1) = const0_rtx;
39631 return ok;
39632 }
39633
39634 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39635 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
39636
39637 static bool
39638 expand_vec_perm_blend (struct expand_vec_perm_d *d)
39639 {
39640 enum machine_mode vmode = d->vmode;
39641 unsigned i, mask, nelt = d->nelt;
39642 rtx target, op0, op1, x;
39643 rtx rperm[32], vperm;
39644
39645 if (d->one_operand_p)
39646 return false;
39647 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
39648 ;
39649 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
39650 ;
39651 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
39652 ;
39653 else
39654 return false;
39655
39656 /* This is a blend, not a permute. Elements must stay in their
39657 respective lanes. */
39658 for (i = 0; i < nelt; ++i)
39659 {
39660 unsigned e = d->perm[i];
39661 if (!(e == i || e == i + nelt))
39662 return false;
39663 }
39664
39665 if (d->testing_p)
39666 return true;
39667
39668 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
39669 decision should be extracted elsewhere, so that we only try that
39670 sequence once all budget==3 options have been tried. */
39671 target = d->target;
39672 op0 = d->op0;
39673 op1 = d->op1;
39674 mask = 0;
39675
39676 switch (vmode)
39677 {
39678 case V4DFmode:
39679 case V8SFmode:
39680 case V2DFmode:
39681 case V4SFmode:
39682 case V8HImode:
39683 case V8SImode:
39684 for (i = 0; i < nelt; ++i)
39685 mask |= (d->perm[i] >= nelt) << i;
39686 break;
39687
39688 case V2DImode:
39689 for (i = 0; i < 2; ++i)
39690 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
39691 vmode = V8HImode;
39692 goto do_subreg;
39693
39694 case V4SImode:
39695 for (i = 0; i < 4; ++i)
39696 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39697 vmode = V8HImode;
39698 goto do_subreg;
39699
39700 case V16QImode:
39701 /* See if bytes move in pairs so we can use pblendw with
39702 an immediate argument, rather than pblendvb with a vector
39703 argument. */
39704 for (i = 0; i < 16; i += 2)
39705 if (d->perm[i] + 1 != d->perm[i + 1])
39706 {
39707 use_pblendvb:
39708 for (i = 0; i < nelt; ++i)
39709 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
39710
39711 finish_pblendvb:
39712 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
39713 vperm = force_reg (vmode, vperm);
39714
39715 if (GET_MODE_SIZE (vmode) == 16)
39716 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
39717 else
39718 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
39719 if (target != d->target)
39720 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39721 return true;
39722 }
39723
39724 for (i = 0; i < 8; ++i)
39725 mask |= (d->perm[i * 2] >= 16) << i;
39726 vmode = V8HImode;
39727 /* FALLTHRU */
39728
39729 do_subreg:
39730 target = gen_reg_rtx (vmode);
39731 op0 = gen_lowpart (vmode, op0);
39732 op1 = gen_lowpart (vmode, op1);
39733 break;
39734
39735 case V32QImode:
39736 /* See if bytes move in pairs. If not, vpblendvb must be used. */
39737 for (i = 0; i < 32; i += 2)
39738 if (d->perm[i] + 1 != d->perm[i + 1])
39739 goto use_pblendvb;
39740 /* See if bytes move in quadruplets. If yes, vpblendd
39741 with immediate can be used. */
39742 for (i = 0; i < 32; i += 4)
39743 if (d->perm[i] + 2 != d->perm[i + 2])
39744 break;
39745 if (i < 32)
39746 {
39747 /* See if bytes move the same in both lanes. If yes,
39748 vpblendw with immediate can be used. */
39749 for (i = 0; i < 16; i += 2)
39750 if (d->perm[i] + 16 != d->perm[i + 16])
39751 goto use_pblendvb;
39752
39753 /* Use vpblendw. */
39754 for (i = 0; i < 16; ++i)
39755 mask |= (d->perm[i * 2] >= 32) << i;
39756 vmode = V16HImode;
39757 goto do_subreg;
39758 }
39759
39760 /* Use vpblendd. */
39761 for (i = 0; i < 8; ++i)
39762 mask |= (d->perm[i * 4] >= 32) << i;
39763 vmode = V8SImode;
39764 goto do_subreg;
39765
39766 case V16HImode:
39767 /* See if words move in pairs. If yes, vpblendd can be used. */
39768 for (i = 0; i < 16; i += 2)
39769 if (d->perm[i] + 1 != d->perm[i + 1])
39770 break;
39771 if (i < 16)
39772 {
39773 /* See if words move the same in both lanes. If not,
39774 vpblendvb must be used. */
39775 for (i = 0; i < 8; i++)
39776 if (d->perm[i] + 8 != d->perm[i + 8])
39777 {
39778 /* Use vpblendvb. */
39779 for (i = 0; i < 32; ++i)
39780 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
39781
39782 vmode = V32QImode;
39783 nelt = 32;
39784 target = gen_reg_rtx (vmode);
39785 op0 = gen_lowpart (vmode, op0);
39786 op1 = gen_lowpart (vmode, op1);
39787 goto finish_pblendvb;
39788 }
39789
39790 /* Use vpblendw. */
39791 for (i = 0; i < 16; ++i)
39792 mask |= (d->perm[i] >= 16) << i;
39793 break;
39794 }
39795
39796 /* Use vpblendd. */
39797 for (i = 0; i < 8; ++i)
39798 mask |= (d->perm[i * 2] >= 16) << i;
39799 vmode = V8SImode;
39800 goto do_subreg;
39801
39802 case V4DImode:
39803 /* Use vpblendd. */
39804 for (i = 0; i < 4; ++i)
39805 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39806 vmode = V8SImode;
39807 goto do_subreg;
39808
39809 default:
39810 gcc_unreachable ();
39811 }
39812
39813 /* This matches five different patterns with the different modes. */
39814 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
39815 x = gen_rtx_SET (VOIDmode, target, x);
39816 emit_insn (x);
39817 if (target != d->target)
39818 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39819
39820 return true;
39821 }
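/* Worked example for the mask construction above: a V4SFmode blend with
   perm = { 0, 5, 2, 7 } takes elements 1 and 3 from op1, so the loop
   yields mask = binary 1010 = 0xa, which becomes the blendps/vblendps
   immediate in the VEC_MERGE pattern.  */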
39822
39823 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39824 in terms of the variable form of vpermilps.
39825
39826 Note that we will have already failed the immediate input vpermilps,
39827 which requires that the high and low part shuffle be identical; the
39828 variable form doesn't require that. */
39829
39830 static bool
39831 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
39832 {
39833 rtx rperm[8], vperm;
39834 unsigned i;
39835
39836 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
39837 return false;
39838
39839 /* We can only permute within the 128-bit lane. */
39840 for (i = 0; i < 8; ++i)
39841 {
39842 unsigned e = d->perm[i];
39843 if (i < 4 ? e >= 4 : e < 4)
39844 return false;
39845 }
39846
39847 if (d->testing_p)
39848 return true;
39849
39850 for (i = 0; i < 8; ++i)
39851 {
39852 unsigned e = d->perm[i];
39853
39854 /* Within each 128-bit lane, the elements of op0 are numbered
39855 from 0 and the elements of op1 are numbered from 4. */
39856 if (e >= 8 + 4)
39857 e -= 8;
39858 else if (e >= 4)
39859 e -= 4;
39860
39861 rperm[i] = GEN_INT (e);
39862 }
39863
39864 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
39865 vperm = force_reg (V8SImode, vperm);
39866 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
39867
39868 return true;
39869 }
39870
39871 /* Return true if permutation D can be performed as VMODE permutation
39872 instead. */
39873
39874 static bool
39875 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
39876 {
39877 unsigned int i, j, chunk;
39878
39879 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
39880 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
39881 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
39882 return false;
39883
39884 if (GET_MODE_NUNITS (vmode) >= d->nelt)
39885 return true;
39886
39887 chunk = d->nelt / GET_MODE_NUNITS (vmode);
39888 for (i = 0; i < d->nelt; i += chunk)
39889 if (d->perm[i] & (chunk - 1))
39890 return false;
39891 else
39892 for (j = 1; j < chunk; ++j)
39893 if (d->perm[i] + j != d->perm[i + j])
39894 return false;
39895
39896 return true;
39897 }
39898
39899 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39900 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
39901
39902 static bool
39903 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
39904 {
39905 unsigned i, nelt, eltsz, mask;
39906 unsigned char perm[32];
39907 enum machine_mode vmode = V16QImode;
39908 rtx rperm[32], vperm, target, op0, op1;
39909
39910 nelt = d->nelt;
39911
39912 if (!d->one_operand_p)
39913 {
39914 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
39915 {
39916 if (TARGET_AVX2
39917 && valid_perm_using_mode_p (V2TImode, d))
39918 {
39919 if (d->testing_p)
39920 return true;
39921
39922 /* Use vperm2i128 insn. The pattern uses
39923 V4DImode instead of V2TImode. */
39924 target = d->target;
39925 if (d->vmode != V4DImode)
39926 target = gen_reg_rtx (V4DImode);
39927 op0 = gen_lowpart (V4DImode, d->op0);
39928 op1 = gen_lowpart (V4DImode, d->op1);
39929 rperm[0]
39930 = GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
39931 || ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
39932 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
39933 if (target != d->target)
39934 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39935 return true;
39936 }
39937 return false;
39938 }
39939 }
39940 else
39941 {
39942 if (GET_MODE_SIZE (d->vmode) == 16)
39943 {
39944 if (!TARGET_SSSE3)
39945 return false;
39946 }
39947 else if (GET_MODE_SIZE (d->vmode) == 32)
39948 {
39949 if (!TARGET_AVX2)
39950 return false;
39951
39952 /* V4DImode should already have been handled through
39953 expand_vselect by the vpermq instruction. */
39954 gcc_assert (d->vmode != V4DImode);
39955
39956 vmode = V32QImode;
39957 if (d->vmode == V8SImode
39958 || d->vmode == V16HImode
39959 || d->vmode == V32QImode)
39960 {
39961 /* First see if vpermq can be used for
39962 V8SImode/V16HImode/V32QImode. */
39963 if (valid_perm_using_mode_p (V4DImode, d))
39964 {
39965 for (i = 0; i < 4; i++)
39966 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
39967 if (d->testing_p)
39968 return true;
39969 target = gen_reg_rtx (V4DImode);
39970 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
39971 perm, 4, false))
39972 {
39973 emit_move_insn (d->target,
39974 gen_lowpart (d->vmode, target));
39975 return true;
39976 }
39977 return false;
39978 }
39979
39980 /* Next see if vpermd can be used. */
39981 if (valid_perm_using_mode_p (V8SImode, d))
39982 vmode = V8SImode;
39983 }
39984 /* Or if vpermps can be used. */
39985 else if (d->vmode == V8SFmode)
39986 vmode = V8SImode;
39987
39988 if (vmode == V32QImode)
39989 {
39990 /* vpshufb only works within 128-bit lanes; it cannot
39991 shuffle bytes between the lanes. */
39992 for (i = 0; i < nelt; ++i)
39993 if ((d->perm[i] ^ i) & (nelt / 2))
39994 return false;
39995 }
39996 }
39997 else
39998 return false;
39999 }
40000
40001 if (d->testing_p)
40002 return true;
40003
40004 if (vmode == V8SImode)
40005 for (i = 0; i < 8; ++i)
40006 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
40007 else
40008 {
40009 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40010 if (!d->one_operand_p)
40011 mask = 2 * nelt - 1;
40012 else if (vmode == V16QImode)
40013 mask = nelt - 1;
40014 else
40015 mask = nelt / 2 - 1;
40016
40017 for (i = 0; i < nelt; ++i)
40018 {
40019 unsigned j, e = d->perm[i] & mask;
40020 for (j = 0; j < eltsz; ++j)
40021 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
40022 }
40023 }
40024
40025 vperm = gen_rtx_CONST_VECTOR (vmode,
40026 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
40027 vperm = force_reg (vmode, vperm);
40028
40029 target = d->target;
40030 if (d->vmode != vmode)
40031 target = gen_reg_rtx (vmode);
40032 op0 = gen_lowpart (vmode, d->op0);
40033 if (d->one_operand_p)
40034 {
40035 if (vmode == V16QImode)
40036 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
40037 else if (vmode == V32QImode)
40038 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
40039 else if (vmode == V8SFmode)
40040 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
40041 else
40042 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
40043 }
40044 else
40045 {
40046 op1 = gen_lowpart (vmode, d->op1);
40047 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
40048 }
40049 if (target != d->target)
40050 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
40051
40052 return true;
40053 }
40054
40055 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
40056 in a single instruction. */
40057
40058 static bool
40059 expand_vec_perm_1 (struct expand_vec_perm_d *d)
40060 {
40061 unsigned i, nelt = d->nelt;
40062 unsigned char perm2[MAX_VECT_LEN];
40063
40064 /* Check plain VEC_SELECT first, because AVX has instructions that could
40065 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
40066 input where SEL+CONCAT may not. */
40067 if (d->one_operand_p)
40068 {
40069 int mask = nelt - 1;
40070 bool identity_perm = true;
40071 bool broadcast_perm = true;
40072
40073 for (i = 0; i < nelt; i++)
40074 {
40075 perm2[i] = d->perm[i] & mask;
40076 if (perm2[i] != i)
40077 identity_perm = false;
40078 if (perm2[i])
40079 broadcast_perm = false;
40080 }
40081
40082 if (identity_perm)
40083 {
40084 if (!d->testing_p)
40085 emit_move_insn (d->target, d->op0);
40086 return true;
40087 }
40088 else if (broadcast_perm && TARGET_AVX2)
40089 {
40090 /* Use vpbroadcast{b,w,d}. */
40091 rtx (*gen) (rtx, rtx) = NULL;
40092 switch (d->vmode)
40093 {
40094 case V32QImode:
40095 gen = gen_avx2_pbroadcastv32qi_1;
40096 break;
40097 case V16HImode:
40098 gen = gen_avx2_pbroadcastv16hi_1;
40099 break;
40100 case V8SImode:
40101 gen = gen_avx2_pbroadcastv8si_1;
40102 break;
40103 case V16QImode:
40104 gen = gen_avx2_pbroadcastv16qi;
40105 break;
40106 case V8HImode:
40107 gen = gen_avx2_pbroadcastv8hi;
40108 break;
40109 case V8SFmode:
40110 gen = gen_avx2_vec_dupv8sf_1;
40111 break;
40112 /* For other modes prefer other shuffles this function creates. */
40113 default: break;
40114 }
40115 if (gen != NULL)
40116 {
40117 if (!d->testing_p)
40118 emit_insn (gen (d->target, d->op0));
40119 return true;
40120 }
40121 }
40122
40123 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
40124 return true;
40125
40126 /* There are plenty of patterns in sse.md that are written for
40127 SEL+CONCAT and are not replicated for a single op. Perhaps
40128 that should be changed, to avoid the nastiness here. */
40129
40130 /* Recognize interleave style patterns, which means incrementing
40131 every other permutation operand. */
40132 for (i = 0; i < nelt; i += 2)
40133 {
40134 perm2[i] = d->perm[i] & mask;
40135 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
40136 }
40137 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
40138 d->testing_p))
40139 return true;
40140
40141 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
40142 if (nelt >= 4)
40143 {
40144 for (i = 0; i < nelt; i += 4)
40145 {
40146 perm2[i + 0] = d->perm[i + 0] & mask;
40147 perm2[i + 1] = d->perm[i + 1] & mask;
40148 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
40149 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
40150 }
40151
40152 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
40153 d->testing_p))
40154 return true;
40155 }
40156 }
40157
40158 /* Finally, try the fully general two operand permute. */
40159 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
40160 d->testing_p))
40161 return true;
40162
40163 /* Recognize interleave style patterns with reversed operands. */
40164 if (!d->one_operand_p)
40165 {
40166 for (i = 0; i < nelt; ++i)
40167 {
40168 unsigned e = d->perm[i];
40169 if (e >= nelt)
40170 e -= nelt;
40171 else
40172 e += nelt;
40173 perm2[i] = e;
40174 }
40175
40176 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
40177 d->testing_p))
40178 return true;
40179 }
40180
40181 /* Try the SSE4.1 blend variable merge instructions. */
40182 if (expand_vec_perm_blend (d))
40183 return true;
40184
40185 /* Try one of the AVX vpermil variable permutations. */
40186 if (expand_vec_perm_vpermil (d))
40187 return true;
40188
40189 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
40190 vpshufb, vpermd, vpermps or vpermq variable permutation. */
40191 if (expand_vec_perm_pshufb (d))
40192 return true;
40193
40194 return false;
40195 }
40196
40197 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
40198 in terms of a pair of pshuflw + pshufhw instructions. */
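/* Worked example (editorial addition, not in the original sources): a
   V8HImode selector such as { 2, 0, 3, 1, 5, 7, 4, 6 } keeps its first
   four indices below 4 and its last four at or above 4, so the routine
   below emits pshuflw for { 2, 0, 3, 1, 4, 5, 6, 7 } followed by pshufhw
   for { 0, 1, 2, 3, 5, 7, 4, 6 } on the intermediate result.  */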
40199
40200 static bool
40201 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
40202 {
40203 unsigned char perm2[MAX_VECT_LEN];
40204 unsigned i;
40205 bool ok;
40206
40207 if (d->vmode != V8HImode || !d->one_operand_p)
40208 return false;
40209
40210 /* The two permutations only operate in 64-bit lanes. */
40211 for (i = 0; i < 4; ++i)
40212 if (d->perm[i] >= 4)
40213 return false;
40214 for (i = 4; i < 8; ++i)
40215 if (d->perm[i] < 4)
40216 return false;
40217
40218 if (d->testing_p)
40219 return true;
40220
40221 /* Emit the pshuflw. */
40222 memcpy (perm2, d->perm, 4);
40223 for (i = 4; i < 8; ++i)
40224 perm2[i] = i;
40225 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
40226 gcc_assert (ok);
40227
40228 /* Emit the pshufhw. */
40229 memcpy (perm2 + 4, d->perm + 4, 4);
40230 for (i = 0; i < 4; ++i)
40231 perm2[i] = i;
40232 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
40233 gcc_assert (ok);
40234
40235 return true;
40236 }
40237
40238 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
40239 the permutation using the SSSE3 palignr instruction. This succeeds
40240 when all of the elements in PERM fit within one vector and we merely
40241 need to shift them down so that a single vector permutation has a
40242 chance to succeed. */
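/* Worked example (editorial addition): for V16QImode with the selector
   { 5, 6, ..., 20 }, min is 5 and max is 20, so max - min < nelt.  A
   single palignr by 5 bytes on the op1:op0 concatenation already puts
   elements 5..20 in order, the adjusted permutation becomes the identity
   and the in_order shortcut below simply moves the result into place.  */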
40243
40244 static bool
40245 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
40246 {
40247 unsigned i, nelt = d->nelt;
40248 unsigned min, max;
40249 bool in_order, ok;
40250 rtx shift, target;
40251 struct expand_vec_perm_d dcopy;
40252
40253 /* Even with AVX, palignr only operates on 128-bit vectors. */
40254 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
40255 return false;
40256
40257 min = nelt, max = 0;
40258 for (i = 0; i < nelt; ++i)
40259 {
40260 unsigned e = d->perm[i];
40261 if (e < min)
40262 min = e;
40263 if (e > max)
40264 max = e;
40265 }
40266 if (min == 0 || max - min >= nelt)
40267 return false;
40268
40269 /* Given that we have SSSE3, we know we'll be able to implement the
40270 single operand permutation after the palignr with pshufb. */
40271 if (d->testing_p)
40272 return true;
40273
40274 dcopy = *d;
40275 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
40276 target = gen_reg_rtx (TImode);
40277 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
40278 gen_lowpart (TImode, d->op0), shift));
40279
40280 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
40281 dcopy.one_operand_p = true;
40282
40283 in_order = true;
40284 for (i = 0; i < nelt; ++i)
40285 {
40286 unsigned e = dcopy.perm[i] - min;
40287 if (e != i)
40288 in_order = false;
40289 dcopy.perm[i] = e;
40290 }
40291
40292 /* Test for the degenerate case where the alignment by itself
40293 produces the desired permutation. */
40294 if (in_order)
40295 {
40296 emit_move_insn (d->target, dcopy.op0);
40297 return true;
40298 }
40299
40300 ok = expand_vec_perm_1 (&dcopy);
40301 gcc_assert (ok);
40302
40303 return ok;
40304 }
40305
40306 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
40307
40308 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
40309 a two vector permutation into a single vector permutation by using
40310 an interleave operation to merge the vectors. */
40311
40312 static bool
40313 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
40314 {
40315 struct expand_vec_perm_d dremap, dfinal;
40316 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
40317 unsigned HOST_WIDE_INT contents;
40318 unsigned char remap[2 * MAX_VECT_LEN];
40319 rtx seq;
40320 bool ok, same_halves = false;
40321
40322 if (GET_MODE_SIZE (d->vmode) == 16)
40323 {
40324 if (d->one_operand_p)
40325 return false;
40326 }
40327 else if (GET_MODE_SIZE (d->vmode) == 32)
40328 {
40329 if (!TARGET_AVX)
40330 return false;
40331 /* For 32-byte modes allow even d->one_operand_p.
40332 The lack of cross-lane shuffling in some instructions
40333 might prevent a single insn shuffle. */
40334 dfinal = *d;
40335 dfinal.testing_p = true;
40336 /* If expand_vec_perm_interleave3 can expand this into
40337 a 3 insn sequence, give up and let it be expanded as
40338 a 3 insn sequence. While that is one insn longer, it
40339 doesn't need a memory operand, and in the common case
40340 where the interleave low and interleave high permutations
40341 with the same operands are adjacent, both together need
40342 only 4 insns after CSE. */
40343 if (expand_vec_perm_interleave3 (&dfinal))
40344 return false;
40345 }
40346 else
40347 return false;
40348
40349 /* Examine where the elements come from. */
40350 contents = 0;
40351 for (i = 0; i < nelt; ++i)
40352 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
40353
40354 memset (remap, 0xff, sizeof (remap));
40355 dremap = *d;
40356
40357 if (GET_MODE_SIZE (d->vmode) == 16)
40358 {
40359 unsigned HOST_WIDE_INT h1, h2, h3, h4;
40360
40361 /* Split the two input vectors into 4 halves. */
40362 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
40363 h2 = h1 << nelt2;
40364 h3 = h2 << nelt2;
40365 h4 = h3 << nelt2;
40366
40367 /* If the elements all come from the low halves, use interleave low;
40368 similarly, interleave high if they all come from the high halves.
40369 For mis-matched halves we can use shufps for V4SF/V4SI or do a DImode shuffle. */
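/* Worked example (editorial addition): for V4SImode and the selector
   { 1, 5, 0, 4 } every index lies in a low half (h1 | h3), so dremap
   becomes the punpckldq interleave { 0, 4, 1, 5 } and the final
   one-operand shuffle { 2, 3, 0, 1 } is a single pshufd.  */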
40370 if ((contents & (h1 | h3)) == contents)
40371 {
40372 /* punpckl* */
40373 for (i = 0; i < nelt2; ++i)
40374 {
40375 remap[i] = i * 2;
40376 remap[i + nelt] = i * 2 + 1;
40377 dremap.perm[i * 2] = i;
40378 dremap.perm[i * 2 + 1] = i + nelt;
40379 }
40380 if (!TARGET_SSE2 && d->vmode == V4SImode)
40381 dremap.vmode = V4SFmode;
40382 }
40383 else if ((contents & (h2 | h4)) == contents)
40384 {
40385 /* punpckh* */
40386 for (i = 0; i < nelt2; ++i)
40387 {
40388 remap[i + nelt2] = i * 2;
40389 remap[i + nelt + nelt2] = i * 2 + 1;
40390 dremap.perm[i * 2] = i + nelt2;
40391 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
40392 }
40393 if (!TARGET_SSE2 && d->vmode == V4SImode)
40394 dremap.vmode = V4SFmode;
40395 }
40396 else if ((contents & (h1 | h4)) == contents)
40397 {
40398 /* shufps */
40399 for (i = 0; i < nelt2; ++i)
40400 {
40401 remap[i] = i;
40402 remap[i + nelt + nelt2] = i + nelt2;
40403 dremap.perm[i] = i;
40404 dremap.perm[i + nelt2] = i + nelt + nelt2;
40405 }
40406 if (nelt != 4)
40407 {
40408 /* shufpd */
40409 dremap.vmode = V2DImode;
40410 dremap.nelt = 2;
40411 dremap.perm[0] = 0;
40412 dremap.perm[1] = 3;
40413 }
40414 }
40415 else if ((contents & (h2 | h3)) == contents)
40416 {
40417 /* shufps */
40418 for (i = 0; i < nelt2; ++i)
40419 {
40420 remap[i + nelt2] = i;
40421 remap[i + nelt] = i + nelt2;
40422 dremap.perm[i] = i + nelt2;
40423 dremap.perm[i + nelt2] = i + nelt;
40424 }
40425 if (nelt != 4)
40426 {
40427 /* shufpd */
40428 dremap.vmode = V2DImode;
40429 dremap.nelt = 2;
40430 dremap.perm[0] = 1;
40431 dremap.perm[1] = 2;
40432 }
40433 }
40434 else
40435 return false;
40436 }
40437 else
40438 {
40439 unsigned int nelt4 = nelt / 4, nzcnt = 0;
40440 unsigned HOST_WIDE_INT q[8];
40441 unsigned int nonzero_halves[4];
40442
40443 /* Split the two input vectors into 8 quarters. */
40444 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
40445 for (i = 1; i < 8; ++i)
40446 q[i] = q[0] << (nelt4 * i);
40447 for (i = 0; i < 4; ++i)
40448 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
40449 {
40450 nonzero_halves[nzcnt] = i;
40451 ++nzcnt;
40452 }
40453
40454 if (nzcnt == 1)
40455 {
40456 gcc_assert (d->one_operand_p);
40457 nonzero_halves[1] = nonzero_halves[0];
40458 same_halves = true;
40459 }
40460 else if (d->one_operand_p)
40461 {
40462 gcc_assert (nonzero_halves[0] == 0);
40463 gcc_assert (nonzero_halves[1] == 1);
40464 }
40465
40466 if (nzcnt <= 2)
40467 {
40468 if (d->perm[0] / nelt2 == nonzero_halves[1])
40469 {
40470 /* Attempt to increase the likelihood that dfinal
40471 shuffle will be intra-lane. */
40472 char tmph = nonzero_halves[0];
40473 nonzero_halves[0] = nonzero_halves[1];
40474 nonzero_halves[1] = tmph;
40475 }
40476
40477 /* vperm2f128 or vperm2i128. */
40478 for (i = 0; i < nelt2; ++i)
40479 {
40480 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
40481 remap[i + nonzero_halves[0] * nelt2] = i;
40482 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
40483 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
40484 }
40485
40486 if (d->vmode != V8SFmode
40487 && d->vmode != V4DFmode
40488 && d->vmode != V8SImode)
40489 {
40490 dremap.vmode = V8SImode;
40491 dremap.nelt = 8;
40492 for (i = 0; i < 4; ++i)
40493 {
40494 dremap.perm[i] = i + nonzero_halves[0] * 4;
40495 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
40496 }
40497 }
40498 }
40499 else if (d->one_operand_p)
40500 return false;
40501 else if (TARGET_AVX2
40502 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
40503 {
40504 /* vpunpckl* */
40505 for (i = 0; i < nelt4; ++i)
40506 {
40507 remap[i] = i * 2;
40508 remap[i + nelt] = i * 2 + 1;
40509 remap[i + nelt2] = i * 2 + nelt2;
40510 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
40511 dremap.perm[i * 2] = i;
40512 dremap.perm[i * 2 + 1] = i + nelt;
40513 dremap.perm[i * 2 + nelt2] = i + nelt2;
40514 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
40515 }
40516 }
40517 else if (TARGET_AVX2
40518 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
40519 {
40520 /* vpunpckh* */
40521 for (i = 0; i < nelt4; ++i)
40522 {
40523 remap[i + nelt4] = i * 2;
40524 remap[i + nelt + nelt4] = i * 2 + 1;
40525 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
40526 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
40527 dremap.perm[i * 2] = i + nelt4;
40528 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
40529 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
40530 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
40531 }
40532 }
40533 else
40534 return false;
40535 }
40536
40537 /* Use the remapping array set up above to move the elements from their
40538 swizzled locations into their final destinations. */
40539 dfinal = *d;
40540 for (i = 0; i < nelt; ++i)
40541 {
40542 unsigned e = remap[d->perm[i]];
40543 gcc_assert (e < nelt);
40544 /* If same_halves is true, both halves of the remapped vector are the
40545 same. Avoid cross-lane accesses if possible. */
40546 if (same_halves && i >= nelt2)
40547 {
40548 gcc_assert (e < nelt2);
40549 dfinal.perm[i] = e + nelt2;
40550 }
40551 else
40552 dfinal.perm[i] = e;
40553 }
40554 dremap.target = gen_reg_rtx (dremap.vmode);
40555 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
40556 dfinal.op1 = dfinal.op0;
40557 dfinal.one_operand_p = true;
40558
40559 /* Test if the final remap can be done with a single insn. For V4SFmode or
40560 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
40561 start_sequence ();
40562 ok = expand_vec_perm_1 (&dfinal);
40563 seq = get_insns ();
40564 end_sequence ();
40565
40566 if (!ok)
40567 return false;
40568
40569 if (d->testing_p)
40570 return true;
40571
40572 if (dremap.vmode != dfinal.vmode)
40573 {
40574 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
40575 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
40576 }
40577
40578 ok = expand_vec_perm_1 (&dremap);
40579 gcc_assert (ok);
40580
40581 emit_insn (seq);
40582 return true;
40583 }
40584
40585 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
40586 a single vector cross-lane permutation into vpermq followed
40587 by any of the single insn permutations. */
40588
40589 static bool
40590 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
40591 {
40592 struct expand_vec_perm_d dremap, dfinal;
40593 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
40594 unsigned contents[2];
40595 bool ok;
40596
40597 if (!(TARGET_AVX2
40598 && (d->vmode == V32QImode || d->vmode == V16HImode)
40599 && d->one_operand_p))
40600 return false;
40601
40602 contents[0] = 0;
40603 contents[1] = 0;
40604 for (i = 0; i < nelt2; ++i)
40605 {
40606 contents[0] |= 1u << (d->perm[i] / nelt4);
40607 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
40608 }
40609
40610 for (i = 0; i < 2; ++i)
40611 {
40612 unsigned int cnt = 0;
40613 for (j = 0; j < 4; ++j)
40614 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
40615 return false;
40616 }
40617
40618 if (d->testing_p)
40619 return true;
40620
40621 dremap = *d;
40622 dremap.vmode = V4DImode;
40623 dremap.nelt = 4;
40624 dremap.target = gen_reg_rtx (V4DImode);
40625 dremap.op0 = gen_lowpart (V4DImode, d->op0);
40626 dremap.op1 = dremap.op0;
40627 dremap.one_operand_p = true;
40628 for (i = 0; i < 2; ++i)
40629 {
40630 unsigned int cnt = 0;
40631 for (j = 0; j < 4; ++j)
40632 if ((contents[i] & (1u << j)) != 0)
40633 dremap.perm[2 * i + cnt++] = j;
40634 for (; cnt < 2; ++cnt)
40635 dremap.perm[2 * i + cnt] = 0;
40636 }
40637
40638 dfinal = *d;
40639 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
40640 dfinal.op1 = dfinal.op0;
40641 dfinal.one_operand_p = true;
40642 for (i = 0, j = 0; i < nelt; ++i)
40643 {
40644 if (i == nelt2)
40645 j = 2;
40646 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
40647 if ((d->perm[i] / nelt4) == dremap.perm[j])
40648 ;
40649 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
40650 dfinal.perm[i] |= nelt4;
40651 else
40652 gcc_unreachable ();
40653 }
40654
40655 ok = expand_vec_perm_1 (&dremap);
40656 gcc_assert (ok);
40657
40658 ok = expand_vec_perm_1 (&dfinal);
40659 gcc_assert (ok);
40660
40661 return true;
40662 }
40663
40664 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
40665 a vector permutation using two instructions, vperm2f128 resp.
40666 vperm2i128 followed by any single in-lane permutation. */
40667
40668 static bool
40669 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
40670 {
40671 struct expand_vec_perm_d dfirst, dsecond;
40672 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
40673 bool ok;
40674
40675 if (!TARGET_AVX
40676 || GET_MODE_SIZE (d->vmode) != 32
40677 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
40678 return false;
40679
40680 dsecond = *d;
40681 dsecond.one_operand_p = false;
40682 dsecond.testing_p = true;
40683
40684 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
40685 immediate. For perm < 16 the second permutation uses
40686 d->op0 as first operand, for perm >= 16 it uses d->op1
40687 as first operand. The second operand is the result of
40688 vperm2[fi]128. */
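/* Worked example (editorial addition): for perm == 0x9 the immediate is
   ((0x9 << 2) | 0x9) & 0x33 == 0x21, i.e. the low lane of the
   vperm2[fi]128 result is the high lane of d->op0 and its high lane is
   the low lane of d->op1; since perm < 16, the second shuffle then
   combines d->op0 with that result.  */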
40689 for (perm = 0; perm < 32; perm++)
40690 {
40691 /* Ignore permutations which do not move anything cross-lane. */
40692 if (perm < 16)
40693 {
40694 /* The second shuffle for e.g. V4DFmode has
40695 0123 and ABCD operands.
40696 Ignore AB23, as 23 is already in the second lane
40697 of the first operand. */
40698 if ((perm & 0xc) == (1 << 2)) continue;
40699 /* And 01CD, as 01 is in the first lane of the first
40700 operand. */
40701 if ((perm & 3) == 0) continue;
40702 /* And 4567, as then the vperm2[fi]128 doesn't change
40703 anything on the original 4567 second operand. */
40704 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
40705 }
40706 else
40707 {
40708 /* The second shuffle for e.g. V4DFmode has
40709 4567 and ABCD operands.
40710 Ignore AB67, as 67 is already in the second lane
40711 of the first operand. */
40712 if ((perm & 0xc) == (3 << 2)) continue;
40713 /* And 45CD, as 45 is in the first lane of the first
40714 operand. */
40715 if ((perm & 3) == 2) continue;
40716 /* And 0123, as then the vperm2[fi]128 doesn't change
40717 anything on the original 0123 first operand. */
40718 if ((perm & 0xf) == (1 << 2)) continue;
40719 }
40720
40721 for (i = 0; i < nelt; i++)
40722 {
40723 j = d->perm[i] / nelt2;
40724 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
40725 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
40726 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
40727 dsecond.perm[i] = d->perm[i] & (nelt - 1);
40728 else
40729 break;
40730 }
40731
40732 if (i == nelt)
40733 {
40734 start_sequence ();
40735 ok = expand_vec_perm_1 (&dsecond);
40736 end_sequence ();
40737 }
40738 else
40739 ok = false;
40740
40741 if (ok)
40742 {
40743 if (d->testing_p)
40744 return true;
40745
40746 /* Found a usable second shuffle. dfirst will be
40747 vperm2f128 on d->op0 and d->op1. */
40748 dsecond.testing_p = false;
40749 dfirst = *d;
40750 dfirst.target = gen_reg_rtx (d->vmode);
40751 for (i = 0; i < nelt; i++)
40752 dfirst.perm[i] = (i & (nelt2 - 1))
40753 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
40754
40755 ok = expand_vec_perm_1 (&dfirst);
40756 gcc_assert (ok);
40757
40758 /* And dsecond is some single insn shuffle, taking
40759 d->op0 and result of vperm2f128 (if perm < 16) or
40760 d->op1 and result of vperm2f128 (otherwise). */
40761 dsecond.op1 = dfirst.target;
40762 if (perm >= 16)
40763 dsecond.op0 = dfirst.op1;
40764
40765 ok = expand_vec_perm_1 (&dsecond);
40766 gcc_assert (ok);
40767
40768 return true;
40769 }
40770
40771 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
40772 if (d->one_operand_p)
40773 return false;
40774 }
40775
40776 return false;
40777 }
40778
40779 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
40780 a two vector permutation using 2 intra-lane interleave insns
40781 and cross-lane shuffle for 32-byte vectors. */
40782
40783 static bool
40784 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
40785 {
40786 unsigned i, nelt;
40787 rtx (*gen) (rtx, rtx, rtx);
40788
40789 if (d->one_operand_p)
40790 return false;
40791 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
40792 ;
40793 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
40794 ;
40795 else
40796 return false;
40797
40798 nelt = d->nelt;
40799 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
40800 return false;
40801 for (i = 0; i < nelt; i += 2)
40802 if (d->perm[i] != d->perm[0] + i / 2
40803 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
40804 return false;
40805
40806 if (d->testing_p)
40807 return true;
40808
40809 switch (d->vmode)
40810 {
40811 case V32QImode:
40812 if (d->perm[0])
40813 gen = gen_vec_interleave_highv32qi;
40814 else
40815 gen = gen_vec_interleave_lowv32qi;
40816 break;
40817 case V16HImode:
40818 if (d->perm[0])
40819 gen = gen_vec_interleave_highv16hi;
40820 else
40821 gen = gen_vec_interleave_lowv16hi;
40822 break;
40823 case V8SImode:
40824 if (d->perm[0])
40825 gen = gen_vec_interleave_highv8si;
40826 else
40827 gen = gen_vec_interleave_lowv8si;
40828 break;
40829 case V4DImode:
40830 if (d->perm[0])
40831 gen = gen_vec_interleave_highv4di;
40832 else
40833 gen = gen_vec_interleave_lowv4di;
40834 break;
40835 case V8SFmode:
40836 if (d->perm[0])
40837 gen = gen_vec_interleave_highv8sf;
40838 else
40839 gen = gen_vec_interleave_lowv8sf;
40840 break;
40841 case V4DFmode:
40842 if (d->perm[0])
40843 gen = gen_vec_interleave_highv4df;
40844 else
40845 gen = gen_vec_interleave_lowv4df;
40846 break;
40847 default:
40848 gcc_unreachable ();
40849 }
40850
40851 emit_insn (gen (d->target, d->op0, d->op1));
40852 return true;
40853 }
40854
40855 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
40856 a single vector permutation using a single intra-lane vector
40857 permutation, vperm2f128 swapping the lanes and vblend* insn blending
40858 the non-swapped and swapped vectors together. */
40859
40860 static bool
40861 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
40862 {
40863 struct expand_vec_perm_d dfirst, dsecond;
40864 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
40865 rtx seq;
40866 bool ok;
40867 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
40868
40869 if (!TARGET_AVX
40870 || TARGET_AVX2
40871 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
40872 || !d->one_operand_p)
40873 return false;
40874
40875 dfirst = *d;
40876 for (i = 0; i < nelt; i++)
40877 dfirst.perm[i] = 0xff;
40878 for (i = 0, msk = 0; i < nelt; i++)
40879 {
40880 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
40881 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
40882 return false;
40883 dfirst.perm[j] = d->perm[i];
40884 if (j != i)
40885 msk |= (1 << i);
40886 }
40887 for (i = 0; i < nelt; i++)
40888 if (dfirst.perm[i] == 0xff)
40889 dfirst.perm[i] = i;
40890
40891 if (!d->testing_p)
40892 dfirst.target = gen_reg_rtx (dfirst.vmode);
40893
40894 start_sequence ();
40895 ok = expand_vec_perm_1 (&dfirst);
40896 seq = get_insns ();
40897 end_sequence ();
40898
40899 if (!ok)
40900 return false;
40901
40902 if (d->testing_p)
40903 return true;
40904
40905 emit_insn (seq);
40906
40907 dsecond = *d;
40908 dsecond.op0 = dfirst.target;
40909 dsecond.op1 = dfirst.target;
40910 dsecond.one_operand_p = true;
40911 dsecond.target = gen_reg_rtx (dsecond.vmode);
40912 for (i = 0; i < nelt; i++)
40913 dsecond.perm[i] = i ^ nelt2;
40914
40915 ok = expand_vec_perm_1 (&dsecond);
40916 gcc_assert (ok);
40917
40918 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
40919 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
40920 return true;
40921 }
40922
40923 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
40924 permutation using two vperm2f128, followed by a vshufpd insn blending
40925 the two vectors together. */
40926
40927 static bool
40928 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
40929 {
40930 struct expand_vec_perm_d dfirst, dsecond, dthird;
40931 bool ok;
40932
40933 if (!TARGET_AVX || (d->vmode != V4DFmode))
40934 return false;
40935
40936 if (d->testing_p)
40937 return true;
40938
40939 dfirst = *d;
40940 dsecond = *d;
40941 dthird = *d;
40942
40943 dfirst.perm[0] = (d->perm[0] & ~1);
40944 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
40945 dfirst.perm[2] = (d->perm[2] & ~1);
40946 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
40947 dsecond.perm[0] = (d->perm[1] & ~1);
40948 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
40949 dsecond.perm[2] = (d->perm[3] & ~1);
40950 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
40951 dthird.perm[0] = (d->perm[0] % 2);
40952 dthird.perm[1] = (d->perm[1] % 2) + 4;
40953 dthird.perm[2] = (d->perm[2] % 2) + 2;
40954 dthird.perm[3] = (d->perm[3] % 2) + 6;
40955
40956 dfirst.target = gen_reg_rtx (dfirst.vmode);
40957 dsecond.target = gen_reg_rtx (dsecond.vmode);
40958 dthird.op0 = dfirst.target;
40959 dthird.op1 = dsecond.target;
40960 dthird.one_operand_p = false;
40961
40962 canonicalize_perm (&dfirst);
40963 canonicalize_perm (&dsecond);
40964
40965 ok = expand_vec_perm_1 (&dfirst)
40966 && expand_vec_perm_1 (&dsecond)
40967 && expand_vec_perm_1 (&dthird);
40968
40969 gcc_assert (ok);
40970
40971 return true;
40972 }
40973
40974 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
40975 permutation with two pshufb insns and an ior. We should have already
40976 failed all two instruction sequences. */
40977
40978 static bool
40979 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
40980 {
40981 rtx rperm[2][16], vperm, l, h, op, m128;
40982 unsigned int i, nelt, eltsz;
40983
40984 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
40985 return false;
40986 gcc_assert (!d->one_operand_p);
40987
40988 nelt = d->nelt;
40989 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40990
40991 /* Generate two permutation masks. If the required element is within
40992 the given vector it is shuffled into the proper lane. If the required
40993 element is in the other vector, force a zero into the lane by setting
40994 bit 7 in the permutation mask. */
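/* Worked example (editorial addition): for V16QImode, if d->perm[0] is 20
   then element 4 of op1 is wanted in slot 0, so the op0 mask gets -128
   (forcing a zero) in byte 0 while the op1 mask gets 4 there; the two
   pshufb results are combined with por below.  */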
40995 m128 = GEN_INT (-128);
40996 for (i = 0; i < nelt; ++i)
40997 {
40998 unsigned j, e = d->perm[i];
40999 unsigned which = (e >= nelt);
41000 if (e >= nelt)
41001 e -= nelt;
41002
41003 for (j = 0; j < eltsz; ++j)
41004 {
41005 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
41006 rperm[1-which][i*eltsz + j] = m128;
41007 }
41008 }
41009
41010 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
41011 vperm = force_reg (V16QImode, vperm);
41012
41013 l = gen_reg_rtx (V16QImode);
41014 op = gen_lowpart (V16QImode, d->op0);
41015 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
41016
41017 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
41018 vperm = force_reg (V16QImode, vperm);
41019
41020 h = gen_reg_rtx (V16QImode);
41021 op = gen_lowpart (V16QImode, d->op1);
41022 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
41023
41024 op = d->target;
41025 if (d->vmode != V16QImode)
41026 op = gen_reg_rtx (V16QImode);
41027 emit_insn (gen_iorv16qi3 (op, l, h));
41028 if (op != d->target)
41029 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41030
41031 return true;
41032 }
41033
41034 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
41035 with two vpshufb insns, vpermq and vpor. We should have already failed
41036 all two or three instruction sequences. */
41037
41038 static bool
41039 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
41040 {
41041 rtx rperm[2][32], vperm, l, h, hp, op, m128;
41042 unsigned int i, nelt, eltsz;
41043
41044 if (!TARGET_AVX2
41045 || !d->one_operand_p
41046 || (d->vmode != V32QImode && d->vmode != V16HImode))
41047 return false;
41048
41049 if (d->testing_p)
41050 return true;
41051
41052 nelt = d->nelt;
41053 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41054
41055 /* Generate two permutation masks. If the required element is within
41056 the same lane, it is shuffled in. If the required element is from the
41057 other lane, force a zero by setting bit 7 in the permutation mask.
41058 The other mask has a non-negative element wherever an element is
41059 requested from the other lane; that element is also moved to the
41060 other lane, so that the result of vpshufb can have its two V2TImode
41061 halves swapped. */
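/* Worked example (editorial addition): for V32QImode, if d->perm[0] is 17
   the wanted byte lives in the other 128-bit lane, so the second mask
   gets index 1 at byte 16 (the corresponding byte of the opposite lane)
   and the first mask gets -128 at byte 0; the vpermq lane swap below then
   brings that byte back to position 0 before the final vpor.  */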
41062 m128 = GEN_INT (-128);
41063 for (i = 0; i < nelt; ++i)
41064 {
41065 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41066 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
41067
41068 for (j = 0; j < eltsz; ++j)
41069 {
41070 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
41071 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
41072 }
41073 }
41074
41075 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
41076 vperm = force_reg (V32QImode, vperm);
41077
41078 h = gen_reg_rtx (V32QImode);
41079 op = gen_lowpart (V32QImode, d->op0);
41080 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
41081
41082 /* Swap the 128-bit lanes of h into hp. */
41083 hp = gen_reg_rtx (V4DImode);
41084 op = gen_lowpart (V4DImode, h);
41085 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
41086 const1_rtx));
41087
41088 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
41089 vperm = force_reg (V32QImode, vperm);
41090
41091 l = gen_reg_rtx (V32QImode);
41092 op = gen_lowpart (V32QImode, d->op0);
41093 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
41094
41095 op = d->target;
41096 if (d->vmode != V32QImode)
41097 op = gen_reg_rtx (V32QImode);
41098 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
41099 if (op != d->target)
41100 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41101
41102 return true;
41103 }
41104
41105 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
41106 and extract-odd permutations of two V32QImode or V16HImode operands
41107 with two vpshufb insns, vpor and vpermq. We should have already
41108 failed all two or three instruction sequences. */
41109
41110 static bool
41111 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
41112 {
41113 rtx rperm[2][32], vperm, l, h, ior, op, m128;
41114 unsigned int i, nelt, eltsz;
41115
41116 if (!TARGET_AVX2
41117 || d->one_operand_p
41118 || (d->vmode != V32QImode && d->vmode != V16HImode))
41119 return false;
41120
41121 for (i = 0; i < d->nelt; ++i)
41122 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
41123 return false;
41124
41125 if (d->testing_p)
41126 return true;
41127
41128 nelt = d->nelt;
41129 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41130
41131 /* Generate two permutation masks. In the first permutation mask
41132 the first quarter will contain indexes for the first half
41133 of the op0, the second quarter will contain bit 7 set, third quarter
41134 will contain indexes for the second half of the op0 and the
41135 last quarter bit 7 set. In the second permutation mask
41136 the first quarter will contain bit 7 set, the second quarter
41137 indexes for the first half of the op1, the third quarter bit 7 set
41138 and last quarter indexes for the second half of the op1.
41139 I.e. the first mask e.g. for V32QImode extract even will be:
41140 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
41141 (all values masked with 0xf except for -128) and second mask
41142 for extract even will be
41143 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
41144 m128 = GEN_INT (-128);
41145 for (i = 0; i < nelt; ++i)
41146 {
41147 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41148 unsigned which = d->perm[i] >= nelt;
41149 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
41150
41151 for (j = 0; j < eltsz; ++j)
41152 {
41153 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
41154 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
41155 }
41156 }
41157
41158 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
41159 vperm = force_reg (V32QImode, vperm);
41160
41161 l = gen_reg_rtx (V32QImode);
41162 op = gen_lowpart (V32QImode, d->op0);
41163 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
41164
41165 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
41166 vperm = force_reg (V32QImode, vperm);
41167
41168 h = gen_reg_rtx (V32QImode);
41169 op = gen_lowpart (V32QImode, d->op1);
41170 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
41171
41172 ior = gen_reg_rtx (V32QImode);
41173 emit_insn (gen_iorv32qi3 (ior, l, h));
41174
41175 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
41176 op = gen_reg_rtx (V4DImode);
41177 ior = gen_lowpart (V4DImode, ior);
41178 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
41179 const1_rtx, GEN_INT (3)));
41180 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41181
41182 return true;
41183 }
41184
41185 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
41186 and extract-odd permutations. */
41187
41188 static bool
41189 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
41190 {
41191 rtx t1, t2, t3, t4, t5;
41192
41193 switch (d->vmode)
41194 {
41195 case V4DFmode:
41196 t1 = gen_reg_rtx (V4DFmode);
41197 t2 = gen_reg_rtx (V4DFmode);
41198
41199 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
41200 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
41201 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
41202
41203 /* Now an unpck[lh]pd will produce the result required. */
41204 if (odd)
41205 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
41206 else
41207 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
41208 emit_insn (t3);
41209 break;
41210
41211 case V8SFmode:
41212 {
41213 int mask = odd ? 0xdd : 0x88;
41214
41215 t1 = gen_reg_rtx (V8SFmode);
41216 t2 = gen_reg_rtx (V8SFmode);
41217 t3 = gen_reg_rtx (V8SFmode);
41218
41219 /* Shuffle within the 128-bit lanes to produce:
41220 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
41221 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
41222 GEN_INT (mask)));
41223
41224 /* Shuffle the lanes around to produce:
41225 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
41226 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
41227 GEN_INT (0x3)));
41228
41229 /* Shuffle within the 128-bit lanes to produce:
41230 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
41231 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
41232
41233 /* Shuffle within the 128-bit lanes to produce:
41234 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
41235 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
41236
41237 /* Shuffle the lanes around to produce:
41238 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
41239 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
41240 GEN_INT (0x20)));
41241 }
41242 break;
41243
41244 case V2DFmode:
41245 case V4SFmode:
41246 case V2DImode:
41247 case V4SImode:
41248 /* These are always directly implementable by expand_vec_perm_1. */
41249 gcc_unreachable ();
41250
41251 case V8HImode:
41252 if (TARGET_SSSE3)
41253 return expand_vec_perm_pshufb2 (d);
41254 else
41255 {
41256 /* We need 2*log2(N)-1 operations to achieve odd/even
41257 with interleave. */
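/* Editorial note: for V8HImode this is 2 * log2 (8) - 1 == 5, matching
   the five interleave insns emitted below.  */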
41258 t1 = gen_reg_rtx (V8HImode);
41259 t2 = gen_reg_rtx (V8HImode);
41260 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
41261 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
41262 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
41263 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
41264 if (odd)
41265 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
41266 else
41267 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
41268 emit_insn (t3);
41269 }
41270 break;
41271
41272 case V16QImode:
41273 if (TARGET_SSSE3)
41274 return expand_vec_perm_pshufb2 (d);
41275 else
41276 {
41277 t1 = gen_reg_rtx (V16QImode);
41278 t2 = gen_reg_rtx (V16QImode);
41279 t3 = gen_reg_rtx (V16QImode);
41280 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
41281 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
41282 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
41283 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
41284 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
41285 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
41286 if (odd)
41287 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
41288 else
41289 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
41290 emit_insn (t3);
41291 }
41292 break;
41293
41294 case V16HImode:
41295 case V32QImode:
41296 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
41297
41298 case V4DImode:
41299 if (!TARGET_AVX2)
41300 {
41301 struct expand_vec_perm_d d_copy = *d;
41302 d_copy.vmode = V4DFmode;
41303 d_copy.target = gen_reg_rtx (V4DFmode);
41304 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
41305 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
41306 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
41307 {
41308 if (!d->testing_p)
41309 emit_move_insn (d->target,
41310 gen_lowpart (V4DImode, d_copy.target));
41311 return true;
41312 }
41313 return false;
41314 }
41315
41316 t1 = gen_reg_rtx (V4DImode);
41317 t2 = gen_reg_rtx (V4DImode);
41318
41319 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
41320 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
41321 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
41322
41323 /* Now an vpunpck[lh]qdq will produce the result required. */
41324 if (odd)
41325 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
41326 else
41327 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
41328 emit_insn (t3);
41329 break;
41330
41331 case V8SImode:
41332 if (!TARGET_AVX2)
41333 {
41334 struct expand_vec_perm_d d_copy = *d;
41335 d_copy.vmode = V8SFmode;
41336 d_copy.target = gen_reg_rtx (V8SFmode);
41337 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
41338 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
41339 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
41340 {
41341 if (!d->testing_p)
41342 emit_move_insn (d->target,
41343 gen_lowpart (V8SImode, d_copy.target));
41344 return true;
41345 }
41346 return false;
41347 }
41348
41349 t1 = gen_reg_rtx (V8SImode);
41350 t2 = gen_reg_rtx (V8SImode);
41351 t3 = gen_reg_rtx (V4DImode);
41352 t4 = gen_reg_rtx (V4DImode);
41353 t5 = gen_reg_rtx (V4DImode);
41354
41355 /* Shuffle the lanes around into
41356 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
41357 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
41358 gen_lowpart (V4DImode, d->op1),
41359 GEN_INT (0x20)));
41360 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
41361 gen_lowpart (V4DImode, d->op1),
41362 GEN_INT (0x31)));
41363
41364 /* Swap the 2nd and 3rd position in each lane into
41365 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
41366 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
41367 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
41368 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
41369 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
41370
41371 /* Now an vpunpck[lh]qdq will produce
41372 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
41373 if (odd)
41374 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
41375 gen_lowpart (V4DImode, t2));
41376 else
41377 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
41378 gen_lowpart (V4DImode, t2));
41379 emit_insn (t3);
41380 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
41381 break;
41382
41383 default:
41384 gcc_unreachable ();
41385 }
41386
41387 return true;
41388 }
41389
41390 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
41391 extract-even and extract-odd permutations. */
41392
41393 static bool
41394 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
41395 {
41396 unsigned i, odd, nelt = d->nelt;
41397
41398 odd = d->perm[0];
41399 if (odd != 0 && odd != 1)
41400 return false;
41401
41402 for (i = 1; i < nelt; ++i)
41403 if (d->perm[i] != 2 * i + odd)
41404 return false;
41405
41406 return expand_vec_perm_even_odd_1 (d, odd);
41407 }
41408
41409 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
41410 permutations. We assume that expand_vec_perm_1 has already failed. */
41411
41412 static bool
41413 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
41414 {
41415 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
41416 enum machine_mode vmode = d->vmode;
41417 unsigned char perm2[4];
41418 rtx op0 = d->op0, dest;
41419 bool ok;
41420
41421 switch (vmode)
41422 {
41423 case V4DFmode:
41424 case V8SFmode:
41425 /* These are special-cased in sse.md so that we can optionally
41426 use the vbroadcast instruction. They expand to two insns
41427 if the input happens to be in a register. */
41428 gcc_unreachable ();
41429
41430 case V2DFmode:
41431 case V2DImode:
41432 case V4SFmode:
41433 case V4SImode:
41434 /* These are always implementable using standard shuffle patterns. */
41435 gcc_unreachable ();
41436
41437 case V8HImode:
41438 case V16QImode:
41439 /* These can be implemented via interleave. We save one insn by
41440 stopping once we have promoted to V4SImode and then use pshufd. */
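/* Worked example (editorial addition): broadcasting byte 5 of a V16QImode
   vector does punpcklbw (the byte becomes V8HImode element 5), then
   punpckhwd (it becomes V4SImode element 1), and finishes with a pshufd
   using { 1, 1, 1, 1 }.  */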
41441 do
41442 {
41443 rtx dest;
41444 rtx (*gen) (rtx, rtx, rtx)
41445 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
41446 : gen_vec_interleave_lowv8hi;
41447
41448 if (elt >= nelt2)
41449 {
41450 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
41451 : gen_vec_interleave_highv8hi;
41452 elt -= nelt2;
41453 }
41454 nelt2 /= 2;
41455
41456 dest = gen_reg_rtx (vmode);
41457 emit_insn (gen (dest, op0, op0));
41458 vmode = get_mode_wider_vector (vmode);
41459 op0 = gen_lowpart (vmode, dest);
41460 }
41461 while (vmode != V4SImode);
41462
41463 memset (perm2, elt, 4);
41464 dest = gen_reg_rtx (V4SImode);
41465 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
41466 gcc_assert (ok);
41467 if (!d->testing_p)
41468 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
41469 return true;
41470
41471 case V32QImode:
41472 case V16HImode:
41473 case V8SImode:
41474 case V4DImode:
41475 /* For AVX2 broadcasts of the first element vpbroadcast* or
41476 vpermq should be used by expand_vec_perm_1. */
41477 gcc_assert (!TARGET_AVX2 || d->perm[0]);
41478 return false;
41479
41480 default:
41481 gcc_unreachable ();
41482 }
41483 }
41484
41485 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
41486 broadcast permutations. */
41487
41488 static bool
41489 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
41490 {
41491 unsigned i, elt, nelt = d->nelt;
41492
41493 if (!d->one_operand_p)
41494 return false;
41495
41496 elt = d->perm[0];
41497 for (i = 1; i < nelt; ++i)
41498 if (d->perm[i] != elt)
41499 return false;
41500
41501 return expand_vec_perm_broadcast_1 (d);
41502 }
41503
41504 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
41505 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
41506 all the shorter instruction sequences. */
41507
41508 static bool
41509 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
41510 {
41511 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
41512 unsigned int i, nelt, eltsz;
41513 bool used[4];
41514
41515 if (!TARGET_AVX2
41516 || d->one_operand_p
41517 || (d->vmode != V32QImode && d->vmode != V16HImode))
41518 return false;
41519
41520 if (d->testing_p)
41521 return true;
41522
41523 nelt = d->nelt;
41524 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41525
41526 /* Generate 4 permutation masks. If the required element is within
41527 the same lane, it is shuffled in. If the required element is from the
41528 other lane, force a zero by setting bit 7 in the permutation mask.
41529 The other mask has a non-negative element wherever an element is
41530 requested from the other lane; that element is also moved to the
41531 other lane, so that the result of vpshufb can have its two V2TImode
41532 halves swapped. */
41533 m128 = GEN_INT (-128);
41534 for (i = 0; i < 32; ++i)
41535 {
41536 rperm[0][i] = m128;
41537 rperm[1][i] = m128;
41538 rperm[2][i] = m128;
41539 rperm[3][i] = m128;
41540 }
41541 used[0] = false;
41542 used[1] = false;
41543 used[2] = false;
41544 used[3] = false;
41545 for (i = 0; i < nelt; ++i)
41546 {
41547 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41548 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
41549 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
41550
41551 for (j = 0; j < eltsz; ++j)
41552 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
41553 used[which] = true;
41554 }
41555
41556 for (i = 0; i < 2; ++i)
41557 {
41558 if (!used[2 * i + 1])
41559 {
41560 h[i] = NULL_RTX;
41561 continue;
41562 }
41563 vperm = gen_rtx_CONST_VECTOR (V32QImode,
41564 gen_rtvec_v (32, rperm[2 * i + 1]));
41565 vperm = force_reg (V32QImode, vperm);
41566 h[i] = gen_reg_rtx (V32QImode);
41567 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41568 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
41569 }
41570
41571 /* Swap the 128-bit lanes of h[X]. */
41572 for (i = 0; i < 2; ++i)
41573 {
41574 if (h[i] == NULL_RTX)
41575 continue;
41576 op = gen_reg_rtx (V4DImode);
41577 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
41578 const2_rtx, GEN_INT (3), const0_rtx,
41579 const1_rtx));
41580 h[i] = gen_lowpart (V32QImode, op);
41581 }
41582
41583 for (i = 0; i < 2; ++i)
41584 {
41585 if (!used[2 * i])
41586 {
41587 l[i] = NULL_RTX;
41588 continue;
41589 }
41590 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
41591 vperm = force_reg (V32QImode, vperm);
41592 l[i] = gen_reg_rtx (V32QImode);
41593 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41594 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
41595 }
41596
41597 for (i = 0; i < 2; ++i)
41598 {
41599 if (h[i] && l[i])
41600 {
41601 op = gen_reg_rtx (V32QImode);
41602 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
41603 l[i] = op;
41604 }
41605 else if (h[i])
41606 l[i] = h[i];
41607 }
41608
41609 gcc_assert (l[0] && l[1]);
41610 op = d->target;
41611 if (d->vmode != V32QImode)
41612 op = gen_reg_rtx (V32QImode);
41613 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
41614 if (op != d->target)
41615 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41616 return true;
41617 }
41618
41619 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
41620 With all of the interface bits taken care of, perform the expansion
41621 in D and return true on success. */
41622
41623 static bool
41624 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
41625 {
41626 /* Try a single instruction expansion. */
41627 if (expand_vec_perm_1 (d))
41628 return true;
41629
41630 /* Try sequences of two instructions. */
41631
41632 if (expand_vec_perm_pshuflw_pshufhw (d))
41633 return true;
41634
41635 if (expand_vec_perm_palignr (d))
41636 return true;
41637
41638 if (expand_vec_perm_interleave2 (d))
41639 return true;
41640
41641 if (expand_vec_perm_broadcast (d))
41642 return true;
41643
41644 if (expand_vec_perm_vpermq_perm_1 (d))
41645 return true;
41646
41647 if (expand_vec_perm_vperm2f128 (d))
41648 return true;
41649
41650 /* Try sequences of three instructions. */
41651
41652 if (expand_vec_perm_2vperm2f128_vshuf (d))
41653 return true;
41654
41655 if (expand_vec_perm_pshufb2 (d))
41656 return true;
41657
41658 if (expand_vec_perm_interleave3 (d))
41659 return true;
41660
41661 if (expand_vec_perm_vperm2f128_vblend (d))
41662 return true;
41663
41664 /* Try sequences of four instructions. */
41665
41666 if (expand_vec_perm_vpshufb2_vpermq (d))
41667 return true;
41668
41669 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
41670 return true;
41671
41672 /* ??? Look for narrow permutations whose element orderings would
41673 allow the promotion to a wider mode. */
41674
41675 /* ??? Look for sequences of interleave or a wider permute that place
41676 the data into the correct lanes for a half-vector shuffle like
41677 pshuf[lh]w or vpermilps. */
41678
41679 /* ??? Look for sequences of interleave that produce the desired results.
41680 The combinatorics of punpck[lh] get pretty ugly... */
41681
41682 if (expand_vec_perm_even_odd (d))
41683 return true;
41684
41685 /* Even longer sequences. */
41686 if (expand_vec_perm_vpshufb4_vpermq2 (d))
41687 return true;
41688
41689 return false;
41690 }
41691
41692 /* If a permutation only uses one operand, make it clear. Returns true
41693 if the permutation references both operands. */
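/* Worked example (editorial addition): for a V4SImode selector
   { 4, 5, 6, 7 } only the second operand is referenced (which == 2), so
   the indices are folded to { 0, 1, 2, 3 }, op0 is replaced by op1,
   one_operand_p stays set and false is returned.  */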
41694
41695 static bool
41696 canonicalize_perm (struct expand_vec_perm_d *d)
41697 {
41698 int i, which, nelt = d->nelt;
41699
41700 for (i = which = 0; i < nelt; ++i)
41701 which |= (d->perm[i] < nelt ? 1 : 2);
41702
41703 d->one_operand_p = true;
41704 switch (which)
41705 {
41706 default:
41707 gcc_unreachable();
41708
41709 case 3:
41710 if (!rtx_equal_p (d->op0, d->op1))
41711 {
41712 d->one_operand_p = false;
41713 break;
41714 }
41715 /* The elements of PERM do not suggest that only the first operand
41716 is used, but both operands are identical. Allow easier matching
41717 of the permutation by folding the permutation into the single
41718 input vector. */
41719 /* FALLTHRU */
41720
41721 case 2:
41722 for (i = 0; i < nelt; ++i)
41723 d->perm[i] &= nelt - 1;
41724 d->op0 = d->op1;
41725 break;
41726
41727 case 1:
41728 d->op1 = d->op0;
41729 break;
41730 }
41731
41732 return (which == 3);
41733 }
41734
41735 bool
41736 ix86_expand_vec_perm_const (rtx operands[4])
41737 {
41738 struct expand_vec_perm_d d;
41739 unsigned char perm[MAX_VECT_LEN];
41740 int i, nelt;
41741 bool two_args;
41742 rtx sel;
41743
41744 d.target = operands[0];
41745 d.op0 = operands[1];
41746 d.op1 = operands[2];
41747 sel = operands[3];
41748
41749 d.vmode = GET_MODE (d.target);
41750 gcc_assert (VECTOR_MODE_P (d.vmode));
41751 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41752 d.testing_p = false;
41753
41754 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
41755 gcc_assert (XVECLEN (sel, 0) == nelt);
41756 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
41757
41758 for (i = 0; i < nelt; ++i)
41759 {
41760 rtx e = XVECEXP (sel, 0, i);
41761 int ei = INTVAL (e) & (2 * nelt - 1);
41762 d.perm[i] = ei;
41763 perm[i] = ei;
41764 }
41765
41766 two_args = canonicalize_perm (&d);
41767
41768 if (ix86_expand_vec_perm_const_1 (&d))
41769 return true;
41770
41771 /* If the selector says both arguments are needed, but the operands are the
41772 same, the above tried to expand with one_operand_p and flattened selector.
41773 If that didn't work, retry without one_operand_p; we succeeded with that
41774 during testing. */
41775 if (two_args && d.one_operand_p)
41776 {
41777 d.one_operand_p = false;
41778 memcpy (d.perm, perm, sizeof (perm));
41779 return ix86_expand_vec_perm_const_1 (&d);
41780 }
41781
41782 return false;
41783 }
41784
41785 /* Implement targetm.vectorize.vec_perm_const_ok. */
41786
41787 static bool
41788 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
41789 const unsigned char *sel)
41790 {
41791 struct expand_vec_perm_d d;
41792 unsigned int i, nelt, which;
41793 bool ret;
41794
41795 d.vmode = vmode;
41796 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41797 d.testing_p = true;
41798
41799 /* Given sufficient ISA support we can just return true here
41800 for selected vector modes. */
41801 if (GET_MODE_SIZE (d.vmode) == 16)
41802 {
41803 /* All implementable with a single vpperm insn. */
41804 if (TARGET_XOP)
41805 return true;
41806 /* All implementable with 2 pshufb + 1 ior. */
41807 if (TARGET_SSSE3)
41808 return true;
41809 /* All implementable with shufpd or unpck[lh]pd. */
41810 if (d.nelt == 2)
41811 return true;
41812 }
41813
41814 /* Extract the values from the vector CST into the permutation
41815 array in D. */
41816 memcpy (d.perm, sel, nelt);
41817 for (i = which = 0; i < nelt; ++i)
41818 {
41819 unsigned char e = d.perm[i];
41820 gcc_assert (e < 2 * nelt);
41821 which |= (e < nelt ? 1 : 2);
41822 }
41823
41824 /* For all elements from second vector, fold the elements to first. */
41825 if (which == 2)
41826 for (i = 0; i < nelt; ++i)
41827 d.perm[i] -= nelt;
41828
41829 /* Check whether the mask can be applied to the vector type. */
41830 d.one_operand_p = (which != 3);
41831
41832 /* Implementable with shufps or pshufd. */
41833 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
41834 return true;
41835
41836 /* Otherwise we have to go through the motions and see if we can
41837 figure out how to generate the requested permutation. */
41838 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
41839 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
41840 if (!d.one_operand_p)
41841 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
41842
41843 start_sequence ();
41844 ret = ix86_expand_vec_perm_const_1 (&d);
41845 end_sequence ();
41846
41847 return ret;
41848 }
41849
41850 void
41851 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
41852 {
41853 struct expand_vec_perm_d d;
41854 unsigned i, nelt;
41855
41856 d.target = targ;
41857 d.op0 = op0;
41858 d.op1 = op1;
41859 d.vmode = GET_MODE (targ);
41860 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41861 d.one_operand_p = false;
41862 d.testing_p = false;
41863
41864 for (i = 0; i < nelt; ++i)
41865 d.perm[i] = i * 2 + odd;
41866
41867 /* We'll either be able to implement the permutation directly... */
41868 if (expand_vec_perm_1 (&d))
41869 return;
41870
41871 /* ... or we use the special-case patterns. */
41872 expand_vec_perm_even_odd_1 (&d, odd);
41873 }
41874
41875 static void
41876 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
41877 {
41878 struct expand_vec_perm_d d;
41879 unsigned i, nelt, base;
41880 bool ok;
41881
41882 d.target = targ;
41883 d.op0 = op0;
41884 d.op1 = op1;
41885 d.vmode = GET_MODE (targ);
41886 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41887 d.one_operand_p = false;
41888 d.testing_p = false;
41889
41890 base = high_p ? nelt / 2 : 0;
41891 for (i = 0; i < nelt / 2; ++i)
41892 {
41893 d.perm[i * 2] = i + base;
41894 d.perm[i * 2 + 1] = i + base + nelt;
41895 }
41896
41897 /* Note that for AVX this isn't one instruction. */
41898 ok = ix86_expand_vec_perm_const_1 (&d);
41899 gcc_assert (ok);
41900 }
41901
41902
41903 /* Expand a vector operation CODE for a V*QImode in terms of the
41904 same operation on V*HImode. */
41905
41906 void
41907 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
41908 {
41909 enum machine_mode qimode = GET_MODE (dest);
41910 enum machine_mode himode;
41911 rtx (*gen_il) (rtx, rtx, rtx);
41912 rtx (*gen_ih) (rtx, rtx, rtx);
41913 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
41914 struct expand_vec_perm_d d;
41915 bool ok, full_interleave;
41916 bool uns_p = false;
41917 int i;
41918
41919 switch (qimode)
41920 {
41921 case V16QImode:
41922 himode = V8HImode;
41923 gen_il = gen_vec_interleave_lowv16qi;
41924 gen_ih = gen_vec_interleave_highv16qi;
41925 break;
41926 case V32QImode:
41927 himode = V16HImode;
41928 gen_il = gen_avx2_interleave_lowv32qi;
41929 gen_ih = gen_avx2_interleave_highv32qi;
41930 break;
41931 default:
41932 gcc_unreachable ();
41933 }
41934
41935 op2_l = op2_h = op2;
41936 switch (code)
41937 {
41938 case MULT:
41939 /* Unpack data such that we've got a source byte in each low byte of
41940 each word. We don't care what goes into the high byte of each word.
41941 Rather than trying to get zero in there, the most convenient choice
41942 is to let it be a copy of the low byte. */
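/* Editorial note: only the low byte of each word product matters,
   since (x * y) mod 256 depends only on x mod 256 and y mod 256; the
   even-element extraction at the end of this function picks exactly
   those low bytes, which are the V*QImode multiply results.  */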
41943 op2_l = gen_reg_rtx (qimode);
41944 op2_h = gen_reg_rtx (qimode);
41945 emit_insn (gen_il (op2_l, op2, op2));
41946 emit_insn (gen_ih (op2_h, op2, op2));
41947 /* FALLTHRU */
41948
41949 op1_l = gen_reg_rtx (qimode);
41950 op1_h = gen_reg_rtx (qimode);
41951 emit_insn (gen_il (op1_l, op1, op1));
41952 emit_insn (gen_ih (op1_h, op1, op1));
41953 full_interleave = qimode == V16QImode;
41954 break;
41955
41956 case ASHIFT:
41957 case LSHIFTRT:
41958 uns_p = true;
41959 /* FALLTHRU */
41960 case ASHIFTRT:
41961 op1_l = gen_reg_rtx (himode);
41962 op1_h = gen_reg_rtx (himode);
41963 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
41964 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
41965 full_interleave = true;
41966 break;
41967 default:
41968 gcc_unreachable ();
41969 }
41970
41971 /* Perform the operation. */
41972 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
41973 1, OPTAB_DIRECT);
41974 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
41975 1, OPTAB_DIRECT);
41976 gcc_assert (res_l && res_h);
41977
41978 /* Merge the data back into the right place. */
41979 d.target = dest;
41980 d.op0 = gen_lowpart (qimode, res_l);
41981 d.op1 = gen_lowpart (qimode, res_h);
41982 d.vmode = qimode;
41983 d.nelt = GET_MODE_NUNITS (qimode);
41984 d.one_operand_p = false;
41985 d.testing_p = false;
41986
41987 if (full_interleave)
41988 {
41989 /* For SSE2, we used a full interleave, so the desired
41990 results are in the even elements. */
41991 for (i = 0; i < 32; ++i)
41992 d.perm[i] = i * 2;
41993 }
41994 else
41995 {
41996 /* For AVX, the interleave used above was not cross-lane, so we want
41997 the even elements but with the second and third quarters swapped.
41998 Happily, that is even one insn shorter than plain even extraction. */
41999 for (i = 0; i < 32; ++i)
42000 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
42001 }
42002
42003 ok = ix86_expand_vec_perm_const_1 (&d);
42004 gcc_assert (ok);
42005
42006 set_unique_reg_note (get_last_insn (), REG_EQUAL,
42007 gen_rtx_fmt_ee (code, qimode, op1, op2));
42008 }
42009
42010 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
42011 if op is CONST_VECTOR with all odd elements equal to their
42012 preceding element. */
42013
42014 static bool
42015 const_vector_equal_evenodd_p (rtx op)
42016 {
42017 enum machine_mode mode = GET_MODE (op);
42018 int i, nunits = GET_MODE_NUNITS (mode);
42019 if (GET_CODE (op) != CONST_VECTOR
42020 || nunits != CONST_VECTOR_NUNITS (op))
42021 return false;
42022 for (i = 0; i < nunits; i += 2)
42023 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
42024 return false;
42025 return true;
42026 }
42027
42028 void
42029 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
42030 bool uns_p, bool odd_p)
42031 {
42032 enum machine_mode mode = GET_MODE (op1);
42033 enum machine_mode wmode = GET_MODE (dest);
42034 rtx x;
42035 rtx orig_op1 = op1, orig_op2 = op2;
42036
42037 if (!nonimmediate_operand (op1, mode))
42038 op1 = force_reg (mode, op1);
42039 if (!nonimmediate_operand (op2, mode))
42040 op2 = force_reg (mode, op2);
42041
42042 /* We only play even/odd games with vectors of SImode. */
42043 gcc_assert (mode == V4SImode || mode == V8SImode);
42044
42045 /* If we're looking for the odd results, shift those members down to
42046 the even slots. For some cpus this is faster than a PSHUFD. */
42047 if (odd_p)
42048 {
42049 /* For XOP use vpmacsdqh, but only for smult, as it is only
42050 signed. */
42051 if (TARGET_XOP && mode == V4SImode && !uns_p)
42052 {
42053 x = force_reg (wmode, CONST0_RTX (wmode));
42054 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
42055 return;
42056 }
42057
42058 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
42059 if (!const_vector_equal_evenodd_p (orig_op1))
42060 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
42061 x, NULL, 1, OPTAB_DIRECT);
42062 if (!const_vector_equal_evenodd_p (orig_op2))
42063 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
42064 x, NULL, 1, OPTAB_DIRECT);
42065 op1 = gen_lowpart (mode, op1);
42066 op2 = gen_lowpart (mode, op2);
42067 }
42068
42069 if (mode == V8SImode)
42070 {
42071 if (uns_p)
42072 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
42073 else
42074 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
42075 }
42076 else if (uns_p)
42077 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
42078 else if (TARGET_SSE4_1)
42079 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
42080 else
42081 {
42082 rtx s1, s2, t0, t1, t2;
42083
42084 /* The easiest way to implement this without PMULDQ is to go through
42085 the motions as if we are performing a full 64-bit multiply, except
42086 that we need to do less shuffling of the elements. */
42087
42088 /* Compute the sign-extension, aka highparts, of the two operands. */
42089 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
42090 op1, pc_rtx, pc_rtx);
42091 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
42092 op2, pc_rtx, pc_rtx);
42093
42094 /* Multiply LO(A) * HI(B), and vice-versa. */
42095 t1 = gen_reg_rtx (wmode);
42096 t2 = gen_reg_rtx (wmode);
42097 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
42098 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
42099
42100 /* Multiply LO(A) * LO(B). */
42101 t0 = gen_reg_rtx (wmode);
42102 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
42103
42104 /* Combine and shift the highparts into place. */
42105 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
42106 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
42107 1, OPTAB_DIRECT);
42108
42109 /* Combine high and low parts. */
42110 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
42111 return;
42112 }
42113 emit_insn (x);
42114 }
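
/* Illustrative sketch (not part of GCC): the PMULDQ-less fallback above
   relies on the identity
     (int64) a * b == (uint64) (uint32) a * (uint32) b
                      - ((a < 0 ? (uint64) (uint32) b : 0) << 32)
                      - ((b < 0 ? (uint64) (uint32) a : 0) << 32)  (mod 2^64),
   computing the sign corrections with a compare mask and PMULUDQ.  A
   scalar check of the same identity, with hypothetical names:  */
#if 0
#include <assert.h>
#include <stdint.h>

static uint64_t
mul_widen_signed (int32_t a, int32_t b)
{
  uint64_t lo = (uint64_t) (uint32_t) a * (uint32_t) b;   /* PMULUDQ */
  uint32_t hi = (a < 0 ? (uint32_t) b : 0)
                + (b < 0 ? (uint32_t) a : 0);             /* sign fixups */
  return lo - ((uint64_t) hi << 32);
}

int
main (void)
{
  assert (mul_widen_signed (-7, 9) == (uint64_t) ((int64_t) -7 * 9));
  assert (mul_widen_signed (-123456, -654321)
          == (uint64_t) ((int64_t) -123456 * -654321));
  return 0;
}
#endif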
42115
42116 void
42117 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
42118 bool uns_p, bool high_p)
42119 {
42120 enum machine_mode wmode = GET_MODE (dest);
42121 enum machine_mode mode = GET_MODE (op1);
42122 rtx t1, t2, t3, t4, mask;
42123
42124 switch (mode)
42125 {
42126 case V4SImode:
42127 t1 = gen_reg_rtx (mode);
42128 t2 = gen_reg_rtx (mode);
42129 if (TARGET_XOP && !uns_p)
42130 {
42131 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
42132 shuffle the elements once so that all elements are in the right
42133 place for immediate use: { A C B D }. */
42134 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
42135 const1_rtx, GEN_INT (3)));
42136 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
42137 const1_rtx, GEN_INT (3)));
42138 }
42139 else
42140 {
42141 /* Put the elements into place for the multiply. */
42142 ix86_expand_vec_interleave (t1, op1, op1, high_p);
42143 ix86_expand_vec_interleave (t2, op2, op2, high_p);
42144 high_p = false;
42145 }
42146 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
42147 break;
42148
42149 case V8SImode:
42150 /* Shuffle the elements between the lanes. After this we
42151 have { A B E F | C D G H } for each operand. */
42152 t1 = gen_reg_rtx (V4DImode);
42153 t2 = gen_reg_rtx (V4DImode);
42154 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
42155 const0_rtx, const2_rtx,
42156 const1_rtx, GEN_INT (3)));
42157 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
42158 const0_rtx, const2_rtx,
42159 const1_rtx, GEN_INT (3)));
42160
42161 /* Shuffle the elements within the lanes. After this we
42162 have { A A B B | C C D D } or { E E F F | G G H H }. */
42163 t3 = gen_reg_rtx (V8SImode);
42164 t4 = gen_reg_rtx (V8SImode);
42165 mask = GEN_INT (high_p
42166 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
42167 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
42168 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
42169 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
42170
42171 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
42172 break;
42173
42174 case V8HImode:
42175 case V16HImode:
42176 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
42177 uns_p, OPTAB_DIRECT);
42178 t2 = expand_binop (mode,
42179 uns_p ? umul_highpart_optab : smul_highpart_optab,
42180 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
42181 gcc_assert (t1 && t2);
42182
42183 t3 = gen_reg_rtx (mode);
42184 ix86_expand_vec_interleave (t3, t1, t2, high_p);
42185 emit_move_insn (dest, gen_lowpart (wmode, t3));
42186 break;
42187
42188 case V16QImode:
42189 case V32QImode:
42190 t1 = gen_reg_rtx (wmode);
42191 t2 = gen_reg_rtx (wmode);
42192 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
42193 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
42194
42195 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
42196 break;
42197
42198 default:
42199 gcc_unreachable ();
42200 }
42201 }
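
/* Illustrative sketch (not part of GCC): the V8HImode path above assembles
   each 32-bit product by interleaving the PMULLW lowpart with the
   PMULHW/PMULHUW highpart.  A minimal scalar equivalent for the unsigned
   case, with a hypothetical function name:  */
#if 0
#include <assert.h>
#include <stdint.h>

static uint32_t
mul_widen_u16 (uint16_t a, uint16_t b)
{
  uint16_t lo = (uint16_t) ((uint32_t) a * b);          /* PMULLW  */
  uint16_t hi = (uint16_t) (((uint32_t) a * b) >> 16);  /* PMULHUW */
  return ((uint32_t) hi << 16) | lo;                    /* interleave */
}

int
main (void)
{
  assert (mul_widen_u16 (50000, 3) == 150000u);
  assert (mul_widen_u16 (65535, 65535) == 65535u * 65535u);
  return 0;
}
#endif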
42202
42203 void
42204 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
42205 {
42206 rtx res_1, res_2, res_3, res_4;
42207
42208 res_1 = gen_reg_rtx (V4SImode);
42209 res_2 = gen_reg_rtx (V4SImode);
42210 res_3 = gen_reg_rtx (V2DImode);
42211 res_4 = gen_reg_rtx (V2DImode);
42212 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
42213 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
42214
42215 /* Move the results in element 2 down to element 1; we don't care
42216 what goes in elements 2 and 3. Then we can merge the parts
42217 back together with an interleave.
42218
42219 Note that two other sequences were tried:
42220 (1) Use interleaves at the start instead of psrldq, which allows
42221 us to use a single shufps to merge things back at the end.
42222 (2) Use shufps here to combine the two vectors, then pshufd to
42223 put the elements in the correct order.
42224 In both cases the cost of the reformatting stall was too high
42225 and the overall sequence slower. */
42226
42227 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
42228 const0_rtx, const2_rtx,
42229 const0_rtx, const0_rtx));
42230 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
42231 const0_rtx, const2_rtx,
42232 const0_rtx, const0_rtx));
42233 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
42234
42235 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
42236 }
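
/* Illustrative sketch (not part of GCC): essentially the same SSE2 sequence
   written with intrinsics -- PMULUDQ on the even and odd lanes, PSHUFD to
   compact the low dwords, then an interleave to merge them.  The function
   name is hypothetical; it needs only -msse2.  */
#if 0
#include <emmintrin.h>

static __m128i
mulv4si_sse2 (__m128i a, __m128i b)
{
  __m128i even = _mm_mul_epu32 (a, b);                      /* lanes 0,2 */
  __m128i odd  = _mm_mul_epu32 (_mm_srli_epi64 (a, 32),
                                _mm_srli_epi64 (b, 32));    /* lanes 1,3 */
  even = _mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0));
  odd  = _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0));
  return _mm_unpacklo_epi32 (even, odd);
}
#endif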
42237
42238 void
42239 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
42240 {
42241 enum machine_mode mode = GET_MODE (op0);
42242 rtx t1, t2, t3, t4, t5, t6;
42243
42244 if (TARGET_XOP && mode == V2DImode)
42245 {
42246 /* op1: A,B,C,D, op2: E,F,G,H */
42247 op1 = gen_lowpart (V4SImode, op1);
42248 op2 = gen_lowpart (V4SImode, op2);
42249
42250 t1 = gen_reg_rtx (V4SImode);
42251 t2 = gen_reg_rtx (V4SImode);
42252 t3 = gen_reg_rtx (V2DImode);
42253 t4 = gen_reg_rtx (V2DImode);
42254
42255 /* t1: B,A,D,C */
42256 emit_insn (gen_sse2_pshufd_1 (t1, op1,
42257 GEN_INT (1),
42258 GEN_INT (0),
42259 GEN_INT (3),
42260 GEN_INT (2)));
42261
42262 /* t2: (B*E),(A*F),(D*G),(C*H) */
42263 emit_insn (gen_mulv4si3 (t2, t1, op2));
42264
42265 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
42266 emit_insn (gen_xop_phadddq (t3, t2));
42267
42268 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
42269 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
42270
42271 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
42272 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
42273 }
42274 else
42275 {
42276 enum machine_mode nmode;
42277 rtx (*umul) (rtx, rtx, rtx);
42278
42279 if (mode == V2DImode)
42280 {
42281 umul = gen_vec_widen_umult_even_v4si;
42282 nmode = V4SImode;
42283 }
42284 else if (mode == V4DImode)
42285 {
42286 umul = gen_vec_widen_umult_even_v8si;
42287 nmode = V8SImode;
42288 }
42289 else
42290 gcc_unreachable ();
42291
42292
42293 /* Multiply low parts. */
42294 t1 = gen_reg_rtx (mode);
42295 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
42296
42297 /* Shift input vectors right 32 bits so we can multiply high parts. */
42298 t6 = GEN_INT (32);
42299 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
42300 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
42301
42302 /* Multiply high parts by low parts. */
42303 t4 = gen_reg_rtx (mode);
42304 t5 = gen_reg_rtx (mode);
42305 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
42306 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
42307
42308 /* Combine and shift the highparts back. */
42309 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
42310 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
42311
42312 /* Combine high and low parts. */
42313 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
42314 }
42315
42316 set_unique_reg_note (get_last_insn (), REG_EQUAL,
42317 gen_rtx_MULT (mode, op1, op2));
42318 }
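
/* Illustrative sketch (not part of GCC): the generic (non-XOP) path above
   is the classic lowpart decomposition
     a * b == lo(a)*lo(b) + ((hi(a)*lo(b) + hi(b)*lo(a)) << 32)  (mod 2^64),
   with each 32x32->64 piece done by PMULUDQ.  A scalar check, with a
   hypothetical function name:  */
#if 0
#include <assert.h>
#include <stdint.h>

static uint64_t
mul64_from_32 (uint64_t a, uint64_t b)
{
  uint64_t mask = 0xffffffffu;
  uint64_t lo = (a & mask) * (b & mask);              /* lo(a)*lo(b) */
  uint64_t cross = (a >> 32) * (b & mask)             /* hi(a)*lo(b) */
                   + (b >> 32) * (a & mask);          /* hi(b)*lo(a) */
  return lo + (cross << 32);
}

int
main (void)
{
  uint64_t a = 0x123456789abcdef0u, b = 0x0fedcba987654321u;
  assert (mul64_from_32 (a, b) == a * b);
  return 0;
}
#endif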
42319
42320 /* Return 1 if control transfer instruction INSN
42321 should be encoded with bnd prefix.
42322 If insn is NULL then return 1 when control
42323 transfer instructions should be prefixed with
42324 bnd by default for the current function. */
42325
42326 bool
42327 ix86_bnd_prefixed_insn_p (rtx insn ATTRIBUTE_UNUSED)
42328 {
42329 return false;
42330 }
42331
42332 /* Calculate integer abs() using only SSE2 instructions. */
42333
42334 void
42335 ix86_expand_sse2_abs (rtx target, rtx input)
42336 {
42337 enum machine_mode mode = GET_MODE (target);
42338 rtx tmp0, tmp1, x;
42339
42340 switch (mode)
42341 {
42342 /* For 32-bit signed integer X, the best way to calculate the absolute
42343 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
42344 case V4SImode:
42345 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
42346 GEN_INT (GET_MODE_BITSIZE
42347 (GET_MODE_INNER (mode)) - 1),
42348 NULL, 0, OPTAB_DIRECT);
42349 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
42350 NULL, 0, OPTAB_DIRECT);
42351 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
42352 target, 0, OPTAB_DIRECT);
42353 break;
42354
42355 /* For 16-bit signed integer X, the best way to calculate the absolute
42356 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
42357 case V8HImode:
42358 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
42359
42360 x = expand_simple_binop (mode, SMAX, tmp0, input,
42361 target, 0, OPTAB_DIRECT);
42362 break;
42363
42364 /* For 8-bit signed integer X, the best way to calculate the absolute
42365 value of X is min ((unsigned char) X, (unsigned char) (-X)),
42366 as SSE2 provides the PMINUB insn. */
42367 case V16QImode:
42368 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
42369
42370 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
42371 target, 0, OPTAB_DIRECT);
42372 break;
42373
42374 default:
42375 gcc_unreachable ();
42376 }
42377
42378 if (x != target)
42379 emit_move_insn (target, x);
42380 }
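
/* Illustrative sketch (not part of GCC): the V4SImode case above is the
   branch-free abs() identity (x ^ (x >> 31)) - (x >> 31); the arithmetic
   shift yields 0 or -1, so the XOR/subtract pair performs a conditional
   two's-complement negation.  Assumes arithmetic right shift of negative
   values, as x86 compilers provide.  The function name is hypothetical.  */
#if 0
#include <assert.h>
#include <stdint.h>

static int32_t
abs32 (int32_t x)
{
  int32_t sign = x >> 31;        /* 0 if x >= 0, -1 if x < 0 */
  return (x ^ sign) - sign;
}

int
main (void)
{
  assert (abs32 (5) == 5 && abs32 (-5) == 5 && abs32 (0) == 0);
  return 0;
}
#endif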
42381
42382 /* Expand an insert into a vector register through pinsr insn.
42383 Return true if successful. */
42384
42385 bool
42386 ix86_expand_pinsr (rtx *operands)
42387 {
42388 rtx dst = operands[0];
42389 rtx src = operands[3];
42390
42391 unsigned int size = INTVAL (operands[1]);
42392 unsigned int pos = INTVAL (operands[2]);
42393
42394 if (GET_CODE (dst) == SUBREG)
42395 {
42396 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
42397 dst = SUBREG_REG (dst);
42398 }
42399
42400 if (GET_CODE (src) == SUBREG)
42401 src = SUBREG_REG (src);
42402
42403 switch (GET_MODE (dst))
42404 {
42405 case V16QImode:
42406 case V8HImode:
42407 case V4SImode:
42408 case V2DImode:
42409 {
42410 enum machine_mode srcmode, dstmode;
42411 rtx (*pinsr)(rtx, rtx, rtx, rtx);
42412
42413 srcmode = mode_for_size (size, MODE_INT, 0);
42414
42415 switch (srcmode)
42416 {
42417 case QImode:
42418 if (!TARGET_SSE4_1)
42419 return false;
42420 dstmode = V16QImode;
42421 pinsr = gen_sse4_1_pinsrb;
42422 break;
42423
42424 case HImode:
42425 if (!TARGET_SSE2)
42426 return false;
42427 dstmode = V8HImode;
42428 pinsr = gen_sse2_pinsrw;
42429 break;
42430
42431 case SImode:
42432 if (!TARGET_SSE4_1)
42433 return false;
42434 dstmode = V4SImode;
42435 pinsr = gen_sse4_1_pinsrd;
42436 break;
42437
42438 case DImode:
42439 gcc_assert (TARGET_64BIT);
42440 if (!TARGET_SSE4_1)
42441 return false;
42442 dstmode = V2DImode;
42443 pinsr = gen_sse4_1_pinsrq;
42444 break;
42445
42446 default:
42447 return false;
42448 }
42449
42450 rtx d = dst;
42451 if (GET_MODE (dst) != dstmode)
42452 d = gen_reg_rtx (dstmode);
42453 src = gen_lowpart (srcmode, src);
42454
42455 pos /= size;
42456
42457 emit_insn (pinsr (d, gen_lowpart (dstmode, dst), src,
42458 GEN_INT (1 << pos)));
42459 if (d != dst)
42460 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
42461 return true;
42462 }
42463
42464 default:
42465 return false;
42466 }
42467 }
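
/* Illustrative sketch (not part of GCC): the pinsr family this expander
   maps to, written as intrinsics.  _mm_insert_epi16 needs SSE2;
   _mm_insert_epi32 needs SSE4.1, mirroring the TARGET_* checks above.
   The wrapper names are hypothetical.  */
#if 0
#include <emmintrin.h>
#include <smmintrin.h>

static __m128i
set_word_lane2 (__m128i v, short x)
{
  return _mm_insert_epi16 (v, x, 2);   /* pinsrw $2 */
}

static __m128i
set_dword_lane1 (__m128i v, int x)
{
  return _mm_insert_epi32 (v, x, 1);   /* pinsrd $1 (SSE4.1) */
}
#endif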
42468 \f
42469 /* This function returns the calling abi specific va_list type node.
42470 It returns the FNDECL specific va_list type. */
42471
42472 static tree
42473 ix86_fn_abi_va_list (tree fndecl)
42474 {
42475 if (!TARGET_64BIT)
42476 return va_list_type_node;
42477 gcc_assert (fndecl != NULL_TREE);
42478
42479 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
42480 return ms_va_list_type_node;
42481 else
42482 return sysv_va_list_type_node;
42483 }
42484
42485 /* Returns the canonical va_list type specified by TYPE. If there
42486 is no valid TYPE provided, it returns NULL_TREE. */
42487
42488 static tree
42489 ix86_canonical_va_list_type (tree type)
42490 {
42491 tree wtype, htype;
42492
42493 /* Resolve references and pointers to va_list type. */
42494 if (TREE_CODE (type) == MEM_REF)
42495 type = TREE_TYPE (type);
42496 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
42497 type = TREE_TYPE (type);
42498 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
42499 type = TREE_TYPE (type);
42500
42501 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
42502 {
42503 wtype = va_list_type_node;
42504 gcc_assert (wtype != NULL_TREE);
42505 htype = type;
42506 if (TREE_CODE (wtype) == ARRAY_TYPE)
42507 {
42508 /* If va_list is an array type, the argument may have decayed
42509 to a pointer type, e.g. by being passed to another function.
42510 In that case, unwrap both types so that we can compare the
42511 underlying records. */
42512 if (TREE_CODE (htype) == ARRAY_TYPE
42513 || POINTER_TYPE_P (htype))
42514 {
42515 wtype = TREE_TYPE (wtype);
42516 htype = TREE_TYPE (htype);
42517 }
42518 }
42519 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42520 return va_list_type_node;
42521 wtype = sysv_va_list_type_node;
42522 gcc_assert (wtype != NULL_TREE);
42523 htype = type;
42524 if (TREE_CODE (wtype) == ARRAY_TYPE)
42525 {
42526 /* If va_list is an array type, the argument may have decayed
42527 to a pointer type, e.g. by being passed to another function.
42528 In that case, unwrap both types so that we can compare the
42529 underlying records. */
42530 if (TREE_CODE (htype) == ARRAY_TYPE
42531 || POINTER_TYPE_P (htype))
42532 {
42533 wtype = TREE_TYPE (wtype);
42534 htype = TREE_TYPE (htype);
42535 }
42536 }
42537 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42538 return sysv_va_list_type_node;
42539 wtype = ms_va_list_type_node;
42540 gcc_assert (wtype != NULL_TREE);
42541 htype = type;
42542 if (TREE_CODE (wtype) == ARRAY_TYPE)
42543 {
42544 /* If va_list is an array type, the argument may have decayed
42545 to a pointer type, e.g. by being passed to another function.
42546 In that case, unwrap both types so that we can compare the
42547 underlying records. */
42548 if (TREE_CODE (htype) == ARRAY_TYPE
42549 || POINTER_TYPE_P (htype))
42550 {
42551 wtype = TREE_TYPE (wtype);
42552 htype = TREE_TYPE (htype);
42553 }
42554 }
42555 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42556 return ms_va_list_type_node;
42557 return NULL_TREE;
42558 }
42559 return std_canonical_va_list_type (type);
42560 }
42561
42562 /* Iterate through the target-specific builtin types for va_list.
42563 IDX denotes the iterator, *PTREE is set to the result type of
42564 the va_list builtin, and *PNAME to its internal type.
42565 Returns zero if there is no element for this index, otherwise
42566 IDX should be increased upon the next call.
42567 Note, do not iterate a base builtin's name like __builtin_va_list.
42568 Used from c_common_nodes_and_builtins. */
42569
42570 static int
42571 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
42572 {
42573 if (TARGET_64BIT)
42574 {
42575 switch (idx)
42576 {
42577 default:
42578 break;
42579
42580 case 0:
42581 *ptree = ms_va_list_type_node;
42582 *pname = "__builtin_ms_va_list";
42583 return 1;
42584
42585 case 1:
42586 *ptree = sysv_va_list_type_node;
42587 *pname = "__builtin_sysv_va_list";
42588 return 1;
42589 }
42590 }
42591
42592 return 0;
42593 }
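
/* Illustrative sketch (not part of GCC): how the per-ABI va_list machinery
   above is reached from user code on x86-64.  An ms_abi function can walk
   its arguments with the MS-style list enumerated here via the
   __builtin_ms_va_* builtins registered by the i386 backend.  The function
   name is hypothetical.  */
#if 0
__attribute__ ((ms_abi)) static int
sum_ms (int n, ...)
{
  __builtin_ms_va_list ap;
  int i, s = 0;

  __builtin_ms_va_start (ap, n);
  for (i = 0; i < n; i++)
    s += __builtin_va_arg (ap, int);
  __builtin_ms_va_end (ap);
  return s;
}
#endif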
42594
42595 #undef TARGET_SCHED_DISPATCH
42596 #define TARGET_SCHED_DISPATCH has_dispatch
42597 #undef TARGET_SCHED_DISPATCH_DO
42598 #define TARGET_SCHED_DISPATCH_DO do_dispatch
42599 #undef TARGET_SCHED_REASSOCIATION_WIDTH
42600 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
42601 #undef TARGET_SCHED_REORDER
42602 #define TARGET_SCHED_REORDER ix86_sched_reorder
42603 #undef TARGET_SCHED_ADJUST_PRIORITY
42604 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
42605 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
42606 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
42607 ix86_dependencies_evaluation_hook
42608
42609 /* The size of the dispatch window is the total number of bytes of
42610 object code allowed in a window. */
42611 #define DISPATCH_WINDOW_SIZE 16
42612
42613 /* Number of dispatch windows considered for scheduling. */
42614 #define MAX_DISPATCH_WINDOWS 3
42615
42616 /* Maximum number of instructions in a window. */
42617 #define MAX_INSN 4
42618
42619 /* Maximum number of immediate operands in a window. */
42620 #define MAX_IMM 4
42621
42622 /* Maximum number of immediate bits allowed in a window. */
42623 #define MAX_IMM_SIZE 128
42624
42625 /* Maximum number of 32 bit immediates allowed in a window. */
42626 #define MAX_IMM_32 4
42627
42628 /* Maximum number of 64 bit immediates allowed in a window. */
42629 #define MAX_IMM_64 2
42630
42631 /* Maximum total of loads or prefetches allowed in a window. */
42632 #define MAX_LOAD 2
42633
42634 /* Maximum total of stores allowed in a window. */
42635 #define MAX_STORE 1
42636
42637 #undef BIG
42638 #define BIG 100
42639
42640
42641 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
42642 enum dispatch_group {
42643 disp_no_group = 0,
42644 disp_load,
42645 disp_store,
42646 disp_load_store,
42647 disp_prefetch,
42648 disp_imm,
42649 disp_imm_32,
42650 disp_imm_64,
42651 disp_branch,
42652 disp_cmp,
42653 disp_jcc,
42654 disp_last
42655 };
42656
42657 /* Number of allowable groups in a dispatch window. It is an array
42658 indexed by dispatch_group enum. 100 is used as a big number,
42659 because the number of these kinds of operations does not have any
42660 effect in a dispatch window, but we need them for other reasons in
42661 the table. */
42662 static unsigned int num_allowable_groups[disp_last] = {
42663 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
42664 };
42665
42666 char group_name[disp_last + 1][16] = {
42667 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
42668 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
42669 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
42670 };
42671
42672 /* Instruction path. */
42673 enum insn_path {
42674 no_path = 0,
42675 path_single, /* Single micro op. */
42676 path_double, /* Double micro op. */
42677 path_multi, /* Instructions with more than 2 micro ops. */
42678 last_path
42679 };
42680
42681 /* sched_insn_info defines a window to the instructions scheduled in
42682 the basic block. It contains a pointer to the insn_info table and
42683 the instruction scheduled.
42684
42685 Windows are allocated for each basic block and are linked
42686 together. */
42687 typedef struct sched_insn_info_s {
42688 rtx insn;
42689 enum dispatch_group group;
42690 enum insn_path path;
42691 int byte_len;
42692 int imm_bytes;
42693 } sched_insn_info;
42694
42695 /* Linked list of dispatch windows. This is a two-way list of
42696 dispatch windows of a basic block. It contains information about
42697 the number of uops in the window and the total number of
42698 instructions and of bytes in the object code for this dispatch
42699 window. */
42700 typedef struct dispatch_windows_s {
42701 int num_insn; /* Number of insn in the window. */
42702 int num_uops; /* Number of uops in the window. */
42703 int window_size; /* Number of bytes in the window. */
42704 int window_num; /* Window number, either 0 or 1. */
42705 int num_imm; /* Number of immediates in an insn. */
42706 int num_imm_32; /* Number of 32 bit immediates in an insn. */
42707 int num_imm_64; /* Number of 64 bit immediates in an insn. */
42708 int imm_size; /* Total immediates in the window. */
42709 int num_loads; /* Total memory loads in the window. */
42710 int num_stores; /* Total memory stores in the window. */
42711 int violation; /* Violation exists in window. */
42712 sched_insn_info *window; /* Pointer to the window. */
42713 struct dispatch_windows_s *next;
42714 struct dispatch_windows_s *prev;
42715 } dispatch_windows;
42716
42717 /* Immediate values used in an insn. */
42718 typedef struct imm_info_s
42719 {
42720 int imm;
42721 int imm32;
42722 int imm64;
42723 } imm_info;
42724
42725 static dispatch_windows *dispatch_window_list;
42726 static dispatch_windows *dispatch_window_list1;
42727
42728 /* Get dispatch group of insn. */
42729
42730 static enum dispatch_group
42731 get_mem_group (rtx insn)
42732 {
42733 enum attr_memory memory;
42734
42735 if (INSN_CODE (insn) < 0)
42736 return disp_no_group;
42737 memory = get_attr_memory (insn);
42738 if (memory == MEMORY_STORE)
42739 return disp_store;
42740
42741 if (memory == MEMORY_LOAD)
42742 return disp_load;
42743
42744 if (memory == MEMORY_BOTH)
42745 return disp_load_store;
42746
42747 return disp_no_group;
42748 }
42749
42750 /* Return true if insn is a compare instruction. */
42751
42752 static bool
42753 is_cmp (rtx insn)
42754 {
42755 enum attr_type type;
42756
42757 type = get_attr_type (insn);
42758 return (type == TYPE_TEST
42759 || type == TYPE_ICMP
42760 || type == TYPE_FCMP
42761 || GET_CODE (PATTERN (insn)) == COMPARE);
42762 }
42763
42764 /* Return true if a dispatch violation was encountered. */
42765
42766 static bool
42767 dispatch_violation (void)
42768 {
42769 if (dispatch_window_list->next)
42770 return dispatch_window_list->next->violation;
42771 return dispatch_window_list->violation;
42772 }
42773
42774 /* Return true if insn is a branch instruction. */
42775
42776 static bool
42777 is_branch (rtx insn)
42778 {
42779 return (CALL_P (insn) || JUMP_P (insn));
42780 }
42781
42782 /* Return true if insn is a prefetch instruction. */
42783
42784 static bool
42785 is_prefetch (rtx insn)
42786 {
42787 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
42788 }
42789
42790 /* This function initializes a dispatch window and the list container holding a
42791 pointer to the window. */
42792
42793 static void
42794 init_window (int window_num)
42795 {
42796 int i;
42797 dispatch_windows *new_list;
42798
42799 if (window_num == 0)
42800 new_list = dispatch_window_list;
42801 else
42802 new_list = dispatch_window_list1;
42803
42804 new_list->num_insn = 0;
42805 new_list->num_uops = 0;
42806 new_list->window_size = 0;
42807 new_list->next = NULL;
42808 new_list->prev = NULL;
42809 new_list->window_num = window_num;
42810 new_list->num_imm = 0;
42811 new_list->num_imm_32 = 0;
42812 new_list->num_imm_64 = 0;
42813 new_list->imm_size = 0;
42814 new_list->num_loads = 0;
42815 new_list->num_stores = 0;
42816 new_list->violation = false;
42817
42818 for (i = 0; i < MAX_INSN; i++)
42819 {
42820 new_list->window[i].insn = NULL;
42821 new_list->window[i].group = disp_no_group;
42822 new_list->window[i].path = no_path;
42823 new_list->window[i].byte_len = 0;
42824 new_list->window[i].imm_bytes = 0;
42825 }
42826 return;
42827 }
42828
42829 /* This function allocates and initializes a dispatch window and the
42830 list container holding a pointer to the window. */
42831
42832 static dispatch_windows *
42833 allocate_window (void)
42834 {
42835 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
42836 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
42837
42838 return new_list;
42839 }
42840
42841 /* This routine initializes the dispatch scheduling information. It
42842 initiates building dispatch scheduler tables and constructs the
42843 first dispatch window. */
42844
42845 static void
42846 init_dispatch_sched (void)
42847 {
42848 /* Allocate a dispatch list and a window. */
42849 dispatch_window_list = allocate_window ();
42850 dispatch_window_list1 = allocate_window ();
42851 init_window (0);
42852 init_window (1);
42853 }
42854
42855 /* This function returns true if a branch is detected. End of a basic block
42856 does not have to be a branch, but here we assume only branches end a
42857 window. */
42858
42859 static bool
42860 is_end_basic_block (enum dispatch_group group)
42861 {
42862 return group == disp_branch;
42863 }
42864
42865 /* This function is called when the end of a window processing is reached. */
42866
42867 static void
42868 process_end_window (void)
42869 {
42870 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
42871 if (dispatch_window_list->next)
42872 {
42873 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
42874 gcc_assert (dispatch_window_list->window_size
42875 + dispatch_window_list1->window_size <= 48);
42876 init_window (1);
42877 }
42878 init_window (0);
42879 }
42880
42881 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
42882 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
42883 for 48 bytes of instructions. Note that these windows are not dispatch
42884 windows whose sizes are DISPATCH_WINDOW_SIZE. */
42885
42886 static dispatch_windows *
42887 allocate_next_window (int window_num)
42888 {
42889 if (window_num == 0)
42890 {
42891 if (dispatch_window_list->next)
42892 init_window (1);
42893 init_window (0);
42894 return dispatch_window_list;
42895 }
42896
42897 dispatch_window_list->next = dispatch_window_list1;
42898 dispatch_window_list1->prev = dispatch_window_list;
42899
42900 return dispatch_window_list1;
42901 }
42902
42903 /* Increment the number of immediate operands of an instruction. */
42904
42905 static int
42906 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
42907 {
42908 if (*in_rtx == 0)
42909 return 0;
42910
42911 switch (GET_CODE (*in_rtx))
42912 {
42913 case CONST:
42914 case SYMBOL_REF:
42915 case CONST_INT:
42916 (imm_values->imm)++;
42917 if (x86_64_immediate_operand (*in_rtx, SImode))
42918 (imm_values->imm32)++;
42919 else
42920 (imm_values->imm64)++;
42921 break;
42922
42923 case CONST_DOUBLE:
42924 (imm_values->imm)++;
42925 (imm_values->imm64)++;
42926 break;
42927
42928 case CODE_LABEL:
42929 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
42930 {
42931 (imm_values->imm)++;
42932 (imm_values->imm32)++;
42933 }
42934 break;
42935
42936 default:
42937 break;
42938 }
42939
42940 return 0;
42941 }
42942
42943 /* Compute number of immediate operands of an instruction. */
42944
42945 static void
42946 find_constant (rtx in_rtx, imm_info *imm_values)
42947 {
42948 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
42949 (rtx_function) find_constant_1, (void *) imm_values);
42950 }
42951
42952 /* Return total size of immediate operands of an instruction along with the number
42953 of corresponding immediate operands. It initializes its parameters to zero
42954 before calling FIND_CONSTANT.
42955 INSN is the input instruction. IMM is the total of immediates.
42956 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
42957 bit immediates. */
42958
42959 static int
42960 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
42961 {
42962 imm_info imm_values = {0, 0, 0};
42963
42964 find_constant (insn, &imm_values);
42965 *imm = imm_values.imm;
42966 *imm32 = imm_values.imm32;
42967 *imm64 = imm_values.imm64;
42968 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
42969 }
42970
42971 /* This function indicates whether an instruction has an immediate
42972 operand. */
42973
42974 static bool
42975 has_immediate (rtx insn)
42976 {
42977 int num_imm_operand;
42978 int num_imm32_operand;
42979 int num_imm64_operand;
42980
42981 if (insn)
42982 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42983 &num_imm64_operand);
42984 return false;
42985 }
42986
42987 /* Return single or double path for instructions. */
42988
42989 static enum insn_path
42990 get_insn_path (rtx insn)
42991 {
42992 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
42993
42994 if ((int)path == 0)
42995 return path_single;
42996
42997 if ((int)path == 1)
42998 return path_double;
42999
43000 return path_multi;
43001 }
43002
43003 /* Return insn dispatch group. */
43004
43005 static enum dispatch_group
43006 get_insn_group (rtx insn)
43007 {
43008 enum dispatch_group group = get_mem_group (insn);
43009 if (group)
43010 return group;
43011
43012 if (is_branch (insn))
43013 return disp_branch;
43014
43015 if (is_cmp (insn))
43016 return disp_cmp;
43017
43018 if (has_immediate (insn))
43019 return disp_imm;
43020
43021 if (is_prefetch (insn))
43022 return disp_prefetch;
43023
43024 return disp_no_group;
43025 }
43026
43027 /* Count number of GROUP restricted instructions in a dispatch
43028 window WINDOW_LIST. */
43029
43030 static int
43031 count_num_restricted (rtx insn, dispatch_windows *window_list)
43032 {
43033 enum dispatch_group group = get_insn_group (insn);
43034 int imm_size;
43035 int num_imm_operand;
43036 int num_imm32_operand;
43037 int num_imm64_operand;
43038
43039 if (group == disp_no_group)
43040 return 0;
43041
43042 if (group == disp_imm)
43043 {
43044 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43045 &num_imm64_operand);
43046 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
43047 || num_imm_operand + window_list->num_imm > MAX_IMM
43048 || (num_imm32_operand > 0
43049 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
43050 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
43051 || (num_imm64_operand > 0
43052 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
43053 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
43054 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
43055 && num_imm64_operand > 0
43056 && ((window_list->num_imm_64 > 0
43057 && window_list->num_insn >= 2)
43058 || window_list->num_insn >= 3)))
43059 return BIG;
43060
43061 return 1;
43062 }
43063
43064 if ((group == disp_load_store
43065 && (window_list->num_loads >= MAX_LOAD
43066 || window_list->num_stores >= MAX_STORE))
43067 || ((group == disp_load
43068 || group == disp_prefetch)
43069 && window_list->num_loads >= MAX_LOAD)
43070 || (group == disp_store
43071 && window_list->num_stores >= MAX_STORE))
43072 return BIG;
43073
43074 return 1;
43075 }
43076
43077 /* This function returns true if insn satisfies dispatch rules on the
43078 last window scheduled. */
43079
43080 static bool
43081 fits_dispatch_window (rtx insn)
43082 {
43083 dispatch_windows *window_list = dispatch_window_list;
43084 dispatch_windows *window_list_next = dispatch_window_list->next;
43085 unsigned int num_restrict;
43086 enum dispatch_group group = get_insn_group (insn);
43087 enum insn_path path = get_insn_path (insn);
43088 int sum;
43089
43090 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
43091 instructions should be given the lowest priority in the
43092 scheduling process in the Haifa scheduler to make sure they will be
43093 scheduled in the same dispatch window as the reference to them. */
43094 if (group == disp_jcc || group == disp_cmp)
43095 return false;
43096
43097 /* Check nonrestricted. */
43098 if (group == disp_no_group || group == disp_branch)
43099 return true;
43100
43101 /* Get last dispatch window. */
43102 if (window_list_next)
43103 window_list = window_list_next;
43104
43105 if (window_list->window_num == 1)
43106 {
43107 sum = window_list->prev->window_size + window_list->window_size;
43108
43109 if (sum == 32
43110 || (min_insn_size (insn) + sum) >= 48)
43111 /* Window 1 is full. Go for next window. */
43112 return true;
43113 }
43114
43115 num_restrict = count_num_restricted (insn, window_list);
43116
43117 if (num_restrict > num_allowable_groups[group])
43118 return false;
43119
43120 /* See if it fits in the first window. */
43121 if (window_list->window_num == 0)
43122 {
43123 /* The first window should have only single and double path
43124 uops. */
43125 if (path == path_double
43126 && (window_list->num_uops + 2) > MAX_INSN)
43127 return false;
43128 else if (path != path_single)
43129 return false;
43130 }
43131 return true;
43132 }
43133
43134 /* Add an instruction INSN with NUM_UOPS micro-operations to the
43135 dispatch window WINDOW_LIST. */
43136
43137 static void
43138 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
43139 {
43140 int byte_len = min_insn_size (insn);
43141 int num_insn = window_list->num_insn;
43142 int imm_size;
43143 sched_insn_info *window = window_list->window;
43144 enum dispatch_group group = get_insn_group (insn);
43145 enum insn_path path = get_insn_path (insn);
43146 int num_imm_operand;
43147 int num_imm32_operand;
43148 int num_imm64_operand;
43149
43150 if (!window_list->violation && group != disp_cmp
43151 && !fits_dispatch_window (insn))
43152 window_list->violation = true;
43153
43154 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43155 &num_imm64_operand);
43156
43157 /* Initialize window with new instruction. */
43158 window[num_insn].insn = insn;
43159 window[num_insn].byte_len = byte_len;
43160 window[num_insn].group = group;
43161 window[num_insn].path = path;
43162 window[num_insn].imm_bytes = imm_size;
43163
43164 window_list->window_size += byte_len;
43165 window_list->num_insn = num_insn + 1;
43166 window_list->num_uops = window_list->num_uops + num_uops;
43167 window_list->imm_size += imm_size;
43168 window_list->num_imm += num_imm_operand;
43169 window_list->num_imm_32 += num_imm32_operand;
43170 window_list->num_imm_64 += num_imm64_operand;
43171
43172 if (group == disp_store)
43173 window_list->num_stores += 1;
43174 else if (group == disp_load
43175 || group == disp_prefetch)
43176 window_list->num_loads += 1;
43177 else if (group == disp_load_store)
43178 {
43179 window_list->num_stores += 1;
43180 window_list->num_loads += 1;
43181 }
43182 }
43183
43184 /* Adds a scheduled instruction, INSN, to the current dispatch window.
43185 If the total bytes of instructions or the number of instructions in
43186 the window exceed the allowable limits, it allocates a new window. */
43187
43188 static void
43189 add_to_dispatch_window (rtx insn)
43190 {
43191 int byte_len;
43192 dispatch_windows *window_list;
43193 dispatch_windows *next_list;
43194 dispatch_windows *window0_list;
43195 enum insn_path path;
43196 enum dispatch_group insn_group;
43197 bool insn_fits;
43198 int num_insn;
43199 int num_uops;
43200 int window_num;
43201 int insn_num_uops;
43202 int sum;
43203
43204 if (INSN_CODE (insn) < 0)
43205 return;
43206
43207 byte_len = min_insn_size (insn);
43208 window_list = dispatch_window_list;
43209 next_list = window_list->next;
43210 path = get_insn_path (insn);
43211 insn_group = get_insn_group (insn);
43212
43213 /* Get the last dispatch window. */
43214 if (next_list)
43215 window_list = dispatch_window_list->next;
43216
43217 if (path == path_single)
43218 insn_num_uops = 1;
43219 else if (path == path_double)
43220 insn_num_uops = 2;
43221 else
43222 insn_num_uops = (int) path;
43223
43224 /* If current window is full, get a new window.
43225 Window number zero is full if MAX_INSN uops are scheduled in it.
43226 Window number one is full if window zero's bytes plus window
43227 one's bytes is 32, or if adding the bytes of the new instruction
43228 makes the total 48 or more, or if it already has MAX_INSN
43229 instructions in it.
43230 num_insn = window_list->num_insn;
43231 num_uops = window_list->num_uops;
43232 window_num = window_list->window_num;
43233 insn_fits = fits_dispatch_window (insn);
43234
43235 if (num_insn >= MAX_INSN
43236 || num_uops + insn_num_uops > MAX_INSN
43237 || !(insn_fits))
43238 {
43239 window_num = ~window_num & 1;
43240 window_list = allocate_next_window (window_num);
43241 }
43242
43243 if (window_num == 0)
43244 {
43245 add_insn_window (insn, window_list, insn_num_uops);
43246 if (window_list->num_insn >= MAX_INSN
43247 && insn_group == disp_branch)
43248 {
43249 process_end_window ();
43250 return;
43251 }
43252 }
43253 else if (window_num == 1)
43254 {
43255 window0_list = window_list->prev;
43256 sum = window0_list->window_size + window_list->window_size;
43257 if (sum == 32
43258 || (byte_len + sum) >= 48)
43259 {
43260 process_end_window ();
43261 window_list = dispatch_window_list;
43262 }
43263
43264 add_insn_window (insn, window_list, insn_num_uops);
43265 }
43266 else
43267 gcc_unreachable ();
43268
43269 if (is_end_basic_block (insn_group))
43270 {
43271 /* End of basic block is reached; do end-basic-block processing. */
43272 process_end_window ();
43273 return;
43274 }
43275 }
43276
43277 /* Print the dispatch window, WINDOW_NUM, to FILE. */
43278
43279 DEBUG_FUNCTION static void
43280 debug_dispatch_window_file (FILE *file, int window_num)
43281 {
43282 dispatch_windows *list;
43283 int i;
43284
43285 if (window_num == 0)
43286 list = dispatch_window_list;
43287 else
43288 list = dispatch_window_list1;
43289
43290 fprintf (file, "Window #%d:\n", list->window_num);
43291 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
43292 list->num_insn, list->num_uops, list->window_size);
43293 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
43294 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
43295
43296 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
43297 list->num_stores);
43298 fprintf (file, " insn info:\n");
43299
43300 for (i = 0; i < MAX_INSN; i++)
43301 {
43302 if (!list->window[i].insn)
43303 break;
43304 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
43305 i, group_name[list->window[i].group],
43306 i, (void *)list->window[i].insn,
43307 i, list->window[i].path,
43308 i, list->window[i].byte_len,
43309 i, list->window[i].imm_bytes);
43310 }
43311 }
43312
43313 /* Print to stdout a dispatch window. */
43314
43315 DEBUG_FUNCTION void
43316 debug_dispatch_window (int window_num)
43317 {
43318 debug_dispatch_window_file (stdout, window_num);
43319 }
43320
43321 /* Print INSN dispatch information to FILE. */
43322
43323 DEBUG_FUNCTION static void
43324 debug_insn_dispatch_info_file (FILE *file, rtx insn)
43325 {
43326 int byte_len;
43327 enum insn_path path;
43328 enum dispatch_group group;
43329 int imm_size;
43330 int num_imm_operand;
43331 int num_imm32_operand;
43332 int num_imm64_operand;
43333
43334 if (INSN_CODE (insn) < 0)
43335 return;
43336
43337 byte_len = min_insn_size (insn);
43338 path = get_insn_path (insn);
43339 group = get_insn_group (insn);
43340 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43341 &num_imm64_operand);
43342
43343 fprintf (file, " insn info:\n");
43344 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
43345 group_name[group], path, byte_len);
43346 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
43347 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
43348 }
43349
43350 /* Print to STDERR the status of the ready list with respect to
43351 dispatch windows. */
43352
43353 DEBUG_FUNCTION void
43354 debug_ready_dispatch (void)
43355 {
43356 int i;
43357 int no_ready = number_in_ready ();
43358
43359 fprintf (stdout, "Number of ready: %d\n", no_ready);
43360
43361 for (i = 0; i < no_ready; i++)
43362 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
43363 }
43364
43365 /* This routine is the driver of the dispatch scheduler. */
43366
43367 static void
43368 do_dispatch (rtx insn, int mode)
43369 {
43370 if (mode == DISPATCH_INIT)
43371 init_dispatch_sched ();
43372 else if (mode == ADD_TO_DISPATCH_WINDOW)
43373 add_to_dispatch_window (insn);
43374 }
43375
43376 /* Return TRUE if Dispatch Scheduling is supported. */
43377
43378 static bool
43379 has_dispatch (rtx insn, int action)
43380 {
43381 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
43382 && flag_dispatch_scheduler)
43383 switch (action)
43384 {
43385 default:
43386 return false;
43387
43388 case IS_DISPATCH_ON:
43389 return true;
43390 break;
43391
43392 case IS_CMP:
43393 return is_cmp (insn);
43394
43395 case DISPATCH_VIOLATION:
43396 return dispatch_violation ();
43397
43398 case FITS_DISPATCH_WINDOW:
43399 return fits_dispatch_window (insn);
43400 }
43401
43402 return false;
43403 }
43404
43405 /* Implementation of reassociation_width target hook used by
43406 reassoc phase to identify parallelism level in reassociated
43407 tree. Statements tree_code is passed in OPC. Arguments type
43408 is passed in MODE.
43409
43410 Currently parallel reassociation is enabled for Atom
43411 processors only and we set reassociation width to be 2
43412 because Atom may issue up to 2 instructions per cycle.
43413
43414 Return value should be fixed if parallel reassociation is
43415 enabled for other processors. */
43416
43417 static int
43418 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
43419 enum machine_mode mode)
43420 {
43421 int res = 1;
43422
43423 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
43424 res = 2;
43425 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
43426 res = 2;
43427
43428 return res;
43429 }
43430
43431 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
43432 place emms and femms instructions. */
43433
43434 static enum machine_mode
43435 ix86_preferred_simd_mode (enum machine_mode mode)
43436 {
43437 if (!TARGET_SSE)
43438 return word_mode;
43439
43440 switch (mode)
43441 {
43442 case QImode:
43443 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
43444 case HImode:
43445 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
43446 case SImode:
43447 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
43448 case DImode:
43449 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
43450
43451 case SFmode:
43452 if (TARGET_AVX && !TARGET_PREFER_AVX128)
43453 return V8SFmode;
43454 else
43455 return V4SFmode;
43456
43457 case DFmode:
43458 if (!TARGET_VECTORIZE_DOUBLE)
43459 return word_mode;
43460 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
43461 return V4DFmode;
43462 else if (TARGET_SSE2)
43463 return V2DFmode;
43464 /* FALLTHRU */
43465
43466 default:
43467 return word_mode;
43468 }
43469 }
43470
43471 /* If AVX is enabled then try vectorizing with both 256-bit and 128-bit
43472 vectors. */
43473
43474 static unsigned int
43475 ix86_autovectorize_vector_sizes (void)
43476 {
43477 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
43478 }
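
/* Illustrative sketch (not part of GCC): a loop for which the vectorizer
   consults the two hooks above.  With -O3 -mavx it is vectorized using
   V8SFmode, and 16-byte vectors are also tried per
   ix86_autovectorize_vector_sizes; with -mprefer-avx128 (or plain SSE)
   V4SFmode is chosen instead.  The function name is hypothetical.  */
#if 0
static void
saxpy (float *restrict y, const float *restrict x, float a, int n)
{
  int i;

  for (i = 0; i < n; i++)
    y[i] = a * x[i] + y[i];
}
#endif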
43479
43480 \f
43481
43482 /* Return class of registers which could be used for pseudo of MODE
43483 and of class RCLASS for spilling instead of memory. Return NO_REGS
43484 if it is not possible or not profitable. */
43485 static reg_class_t
43486 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
43487 {
43488 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
43489 && (mode == SImode || (TARGET_64BIT && mode == DImode))
43490 && INTEGER_CLASS_P (rclass))
43491 return ALL_SSE_REGS;
43492 return NO_REGS;
43493 }
43494
43495 /* Implement targetm.vectorize.init_cost. */
43496
43497 static void *
43498 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
43499 {
43500 unsigned *cost = XNEWVEC (unsigned, 3);
43501 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
43502 return cost;
43503 }
43504
43505 /* Implement targetm.vectorize.add_stmt_cost. */
43506
43507 static unsigned
43508 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
43509 struct _stmt_vec_info *stmt_info, int misalign,
43510 enum vect_cost_model_location where)
43511 {
43512 unsigned *cost = (unsigned *) data;
43513 unsigned retval = 0;
43514
43515 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
43516 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
43517
43518 /* Statements in an inner loop relative to the loop being
43519 vectorized are weighted more heavily. The value here is
43520 arbitrary and could potentially be improved with analysis. */
43521 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
43522 count *= 50; /* FIXME. */
43523
43524 retval = (unsigned) (count * stmt_cost);
43525 cost[where] += retval;
43526
43527 return retval;
43528 }
43529
43530 /* Implement targetm.vectorize.finish_cost. */
43531
43532 static void
43533 ix86_finish_cost (void *data, unsigned *prologue_cost,
43534 unsigned *body_cost, unsigned *epilogue_cost)
43535 {
43536 unsigned *cost = (unsigned *) data;
43537 *prologue_cost = cost[vect_prologue];
43538 *body_cost = cost[vect_body];
43539 *epilogue_cost = cost[vect_epilogue];
43540 }
43541
43542 /* Implement targetm.vectorize.destroy_cost_data. */
43543
43544 static void
43545 ix86_destroy_cost_data (void *data)
43546 {
43547 free (data);
43548 }
43549
43550 /* Validate target specific memory model bits in VAL. */
43551
43552 static unsigned HOST_WIDE_INT
43553 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
43554 {
43555 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
43556 bool strong;
43557
43558 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
43559 |MEMMODEL_MASK)
43560 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
43561 {
43562 warning (OPT_Winvalid_memory_model,
43563 "Unknown architecture specific memory model");
43564 return MEMMODEL_SEQ_CST;
43565 }
43566 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
43567 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
43568 {
43569 warning (OPT_Winvalid_memory_model,
43570 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
43571 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
43572 }
43573 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
43574 {
43575 warning (OPT_Winvalid_memory_model,
43576 "HLE_RELEASE not used with RELEASE or stronger memory model");
43577 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
43578 }
43579 return val;
43580 }
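
/* Illustrative sketch (not part of GCC): the HLE bits validated above, as
   used from the __atomic builtins.  HLE_ACQUIRE has to be combined with an
   acquire-or-stronger model and HLE_RELEASE with a release-or-stronger one;
   the __ATOMIC_HLE_* macros are available when HLE is enabled (e.g. -mhle).
   The names below are hypothetical.  */
#if 0
static int lock_var;

static void
hle_lock (void)
{
  while (__atomic_exchange_n (&lock_var, 1,
                              __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
    ;
}

static void
hle_unlock (void)
{
  __atomic_store_n (&lock_var, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);
}
#endif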
43581
43582 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
43583
43584 static bool
43585 ix86_float_exceptions_rounding_supported_p (void)
43586 {
43587 /* For x87 floating point with standard excess precision handling,
43588 there is no adddf3 pattern (since x87 floating point only has
43589 XFmode operations) so the default hook implementation gets this
43590 wrong. */
43591 return TARGET_80387 || TARGET_SSE_MATH;
43592 }
43593
43594 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
43595
43596 static void
43597 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
43598 {
43599 if (!TARGET_80387 && !TARGET_SSE_MATH)
43600 return;
43601 tree exceptions_var = create_tmp_var (integer_type_node, NULL);
43602 if (TARGET_80387)
43603 {
43604 tree fenv_index_type = build_index_type (size_int (6));
43605 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
43606 tree fenv_var = create_tmp_var (fenv_type, NULL);
43607 mark_addressable (fenv_var);
43608 tree fenv_ptr = build_pointer_type (fenv_type);
43609 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
43610 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
43611 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
43612 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
43613 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
43614 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
43615 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
43616 tree hold_fnclex = build_call_expr (fnclex, 0);
43617 *hold = build2 (COMPOUND_EXPR, void_type_node, hold_fnstenv,
43618 hold_fnclex);
43619 *clear = build_call_expr (fnclex, 0);
43620 tree sw_var = create_tmp_var (short_unsigned_type_node, NULL);
43621 mark_addressable (sw_var);
43622 tree su_ptr = build_pointer_type (short_unsigned_type_node);
43623 tree sw_addr = build1 (ADDR_EXPR, su_ptr, sw_var);
43624 tree fnstsw_call = build_call_expr (fnstsw, 1, sw_addr);
43625 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
43626 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
43627 exceptions_var, exceptions_x87);
43628 *update = build2 (COMPOUND_EXPR, integer_type_node,
43629 fnstsw_call, update_mod);
43630 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
43631 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
43632 }
43633 if (TARGET_SSE_MATH)
43634 {
43635 tree mxcsr_orig_var = create_tmp_var (unsigned_type_node, NULL);
43636 tree mxcsr_mod_var = create_tmp_var (unsigned_type_node, NULL);
43637 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
43638 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
43639 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
43640 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
43641 mxcsr_orig_var, stmxcsr_hold_call);
43642 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
43643 mxcsr_orig_var,
43644 build_int_cst (unsigned_type_node, 0x1f80));
43645 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
43646 build_int_cst (unsigned_type_node, 0xffffffc0));
43647 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
43648 mxcsr_mod_var, hold_mod_val);
43649 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
43650 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
43651 hold_assign_orig, hold_assign_mod);
43652 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
43653 ldmxcsr_hold_call);
43654 if (*hold)
43655 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
43656 else
43657 *hold = hold_all;
43658 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
43659 if (*clear)
43660 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
43661 ldmxcsr_clear_call);
43662 else
43663 *clear = ldmxcsr_clear_call;
43664 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
43665 tree exceptions_sse = fold_convert (integer_type_node,
43666 stxmcsr_update_call);
43667 if (*update)
43668 {
43669 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
43670 exceptions_var, exceptions_sse);
43671 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
43672 exceptions_var, exceptions_mod);
43673 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
43674 exceptions_assign);
43675 }
43676 else
43677 *update = build2 (MODIFY_EXPR, integer_type_node,
43678 exceptions_var, exceptions_sse);
43679 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
43680 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
43681 ldmxcsr_update_call);
43682 }
43683 tree atomic_feraiseexcept
43684 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
43685 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
43686 1, exceptions_var);
43687 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
43688 atomic_feraiseexcept_call);
43689 }
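
/* Illustrative sketch (not part of GCC): the kind of C11 source the
   hold/clear/update sequences built above are for.  A compound assignment
   to an _Atomic floating-point object compiles to a compare-exchange loop,
   and the saved/restored FP environment keeps exceptions raised by failed
   iterations from leaking; only the final, successful iteration's
   exceptions are raised (via __atomic_feraiseexcept).  The names below are
   hypothetical.  */
#if 0
#include <stdatomic.h>

static _Atomic double total;

static void
accumulate (double x)
{
  total += x;   /* expanded with TARGET_ATOMIC_ASSIGN_EXPAND_FENV */
}
#endif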
43690
43691 /* Initialize the GCC target structure. */
43692 #undef TARGET_RETURN_IN_MEMORY
43693 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
43694
43695 #undef TARGET_LEGITIMIZE_ADDRESS
43696 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
43697
43698 #undef TARGET_ATTRIBUTE_TABLE
43699 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
43700 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
43701 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
43702 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
43703 # undef TARGET_MERGE_DECL_ATTRIBUTES
43704 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
43705 #endif
43706
43707 #undef TARGET_COMP_TYPE_ATTRIBUTES
43708 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
43709
43710 #undef TARGET_INIT_BUILTINS
43711 #define TARGET_INIT_BUILTINS ix86_init_builtins
43712 #undef TARGET_BUILTIN_DECL
43713 #define TARGET_BUILTIN_DECL ix86_builtin_decl
43714 #undef TARGET_EXPAND_BUILTIN
43715 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
43716
43717 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
43718 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
43719 ix86_builtin_vectorized_function
43720
43721 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
43722 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
43723
43724 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
43725 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
43726
43727 #undef TARGET_VECTORIZE_BUILTIN_GATHER
43728 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
43729
43730 #undef TARGET_BUILTIN_RECIPROCAL
43731 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
43732
43733 #undef TARGET_ASM_FUNCTION_EPILOGUE
43734 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
43735
43736 #undef TARGET_ENCODE_SECTION_INFO
43737 #ifndef SUBTARGET_ENCODE_SECTION_INFO
43738 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
43739 #else
43740 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
43741 #endif
43742
43743 #undef TARGET_ASM_OPEN_PAREN
43744 #define TARGET_ASM_OPEN_PAREN ""
43745 #undef TARGET_ASM_CLOSE_PAREN
43746 #define TARGET_ASM_CLOSE_PAREN ""
43747
43748 #undef TARGET_ASM_BYTE_OP
43749 #define TARGET_ASM_BYTE_OP ASM_BYTE
43750
43751 #undef TARGET_ASM_ALIGNED_HI_OP
43752 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
43753 #undef TARGET_ASM_ALIGNED_SI_OP
43754 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
43755 #ifdef ASM_QUAD
43756 #undef TARGET_ASM_ALIGNED_DI_OP
43757 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
43758 #endif
43759
43760 #undef TARGET_PROFILE_BEFORE_PROLOGUE
43761 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
43762
43763 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
43764 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
43765
43766 #undef TARGET_ASM_UNALIGNED_HI_OP
43767 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
43768 #undef TARGET_ASM_UNALIGNED_SI_OP
43769 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
43770 #undef TARGET_ASM_UNALIGNED_DI_OP
43771 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
43772
43773 #undef TARGET_PRINT_OPERAND
43774 #define TARGET_PRINT_OPERAND ix86_print_operand
43775 #undef TARGET_PRINT_OPERAND_ADDRESS
43776 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
43777 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
43778 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
43779 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
43780 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
43781
#undef TARGET_SCHED_INIT_GLOBAL
#define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
#undef TARGET_SCHED_ADJUST_COST
#define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  ia32_multipass_dfa_lookahead
#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall

#undef TARGET_MEMMODEL_CHECK
#define TARGET_MEMMODEL_CHECK ix86_memmodel_check

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv

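/* Support thread-local storage when the assembler can handle TLS relocations.  */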
#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS true
#endif
#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true

#undef TARGET_DELEGITIMIZE_ADDRESS
#define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address

#undef TARGET_MS_BITFIELD_LAYOUT_P
#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p

#if TARGET_MACHO
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P darwin_binds_local_p
#endif
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
#endif

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START x86_file_start

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE ix86_option_override

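/* Register, memory and RTX cost estimation hooks.  */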
#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS ix86_rtx_costs
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
#undef TARGET_CC_MODES_COMPATIBLE
#define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg

#undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
#define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN ix86_fold_builtin

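/* Function multi-versioning dispatch support.  */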
#undef TARGET_COMPARE_VERSION_PRIORITY
#define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority

#undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
#define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
  ix86_generate_version_dispatcher_body

#undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
#define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
  ix86_get_function_versions_dispatcher

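/* va_list type handling and va_start expansion.  */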
#undef TARGET_ENUM_VA_LIST_P
#define TARGET_ENUM_VA_LIST_P ix86_enum_va_list

#undef TARGET_FN_ABI_VA_LIST
#define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list

#undef TARGET_CANONICAL_VA_LIST_TYPE
#define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start

#undef TARGET_MD_ASM_CLOBBERS
#define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers

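/* Argument passing and calling-convention hooks.  */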
#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG ix86_function_arg
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
#undef TARGET_INTERNAL_ARG_POINTER
#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
#undef TARGET_UPDATE_STACK_BOUNDARY
#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN ix86_static_chain
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
#undef TARGET_RETURN_POPS_ARGS
#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args

#undef TARGET_LEGITIMATE_COMBINED_INSN
#define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix

#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
#endif

#ifdef SUBTARGET_INSERT_ATTRIBUTES
#undef TARGET_INSERT_ATTRIBUTES
#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
#endif

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE ix86_mangle_type

#if !TARGET_MACHO
#undef TARGET_STACK_PROTECT_FAIL
#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
#endif

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk

#undef TARGET_INSTANTIATE_DECLS
#define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p

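/* Vectorizer cost model and SIMD preference hooks.  */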
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  ix86_vectorize_vec_perm_const_ok
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes
#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data

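/* Function-specific target option handling.  */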
#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_OPTION_FUNCTION_VERSIONS
#define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_EXPAND_TO_RTL_HOOK
#define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

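/* Use the local register allocator (LRA) instead of reload.  */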
#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true

#undef TARGET_REGISTER_PRIORITY
#define TARGET_REGISTER_PRIORITY ix86_register_priority

#undef TARGET_REGISTER_USAGE_LEVELING_P
#define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#if TARGET_MACHO
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS darwin_rename_builtins
#endif

#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class

#undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
#define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
  ix86_float_exceptions_rounding_supported_p

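/* Build the target hook vector from the TARGET_* macros defined above.  */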
struct gcc_target targetm = TARGET_INITIALIZER;
\f
#include "gt-i386.h"