1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "stringpool.h"
27 #include "attribs.h"
28 #include "calls.h"
29 #include "stor-layout.h"
30 #include "varasm.h"
31 #include "tm_p.h"
32 #include "regs.h"
33 #include "hard-reg-set.h"
34 #include "insn-config.h"
35 #include "conditions.h"
36 #include "output.h"
37 #include "insn-codes.h"
38 #include "insn-attr.h"
39 #include "flags.h"
40 #include "except.h"
41 #include "function.h"
42 #include "recog.h"
43 #include "expr.h"
44 #include "optabs.h"
45 #include "diagnostic-core.h"
46 #include "toplev.h"
47 #include "basic-block.h"
48 #include "ggc.h"
49 #include "target.h"
50 #include "target-def.h"
51 #include "common/common-target.h"
52 #include "langhooks.h"
53 #include "reload.h"
54 #include "cgraph.h"
55 #include "pointer-set.h"
56 #include "hash-table.h"
57 #include "vec.h"
58 #include "basic-block.h"
59 #include "tree-ssa-alias.h"
60 #include "internal-fn.h"
61 #include "gimple-fold.h"
62 #include "tree-eh.h"
63 #include "gimple-expr.h"
64 #include "is-a.h"
65 #include "gimple.h"
66 #include "gimplify.h"
67 #include "dwarf2.h"
68 #include "df.h"
69 #include "tm-constrs.h"
70 #include "params.h"
71 #include "cselib.h"
72 #include "debug.h"
73 #include "sched-int.h"
74 #include "sbitmap.h"
75 #include "fibheap.h"
76 #include "opts.h"
77 #include "diagnostic.h"
78 #include "dumpfile.h"
79 #include "tree-pass.h"
80 #include "context.h"
81 #include "pass_manager.h"
82
83 static rtx legitimize_dllimport_symbol (rtx, bool);
84 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
85 static rtx legitimize_pe_coff_symbol (rtx, bool);
86
87 #ifndef CHECK_STACK_LIMIT
88 #define CHECK_STACK_LIMIT (-1)
89 #endif
90
91 /* Return index of given mode in mult and division cost tables. */
92 #define MODE_INDEX(mode) \
93 ((mode) == QImode ? 0 \
94 : (mode) == HImode ? 1 \
95 : (mode) == SImode ? 2 \
96 : (mode) == DImode ? 3 \
97 : 4)
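/* Illustrative sketch, not part of the build: the multiply and divide cost
   arrays below are indexed via MODE_INDEX, so a hypothetical helper that
   looks up the multiply start cost of MODE for the active tuning (through
   the ix86_cost pointer used elsewhere in this file) would read roughly as
   follows.  The field name is an assumption based on the initializers
   below.  */
#if 0
static int
example_mult_init_cost (enum machine_mode mode)
{
  /* MODE_INDEX (SImode) == 2, so for SImode this selects the SI entry.  */
  return ix86_cost->mult_init[MODE_INDEX (mode)];
}
#endif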
98
99 /* Processor costs (relative to an add) */
100 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
101 #define COSTS_N_BYTES(N) ((N) * 2)
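/* Worked example of the scaling assumed above: with COSTS_N_INSNS (N)
   expanding to (N) * 4 and an add occupying 2 bytes, COSTS_N_BYTES (2)
   == 4 == COSTS_N_INSNS (1), i.e. a 2-byte add under size tuning carries
   the same weight as a one-insn add under speed tuning.  */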
102
103 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
104
105 static stringop_algs ix86_size_memcpy[2] = {
106 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
107 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
108 static stringop_algs ix86_size_memset[2] = {
109 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
110 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
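/* Hedged sketch (not compiled) of how the stringop_algs tables in this file
   are read: the first member names the algorithm used when the block size
   is unknown at compile time, followed by {max_size, algorithm, noalign}
   entries for known sizes, where max_size == -1 means "any larger size".
   The two-element arrays are assumed to hold a 32-bit variant first and a
   64-bit variant second; the field names here are assumptions, not the
   exact declarations from i386.h.  */
#if 0
static enum stringop_alg
example_pick_alg (const struct stringop_algs *algs, HOST_WIDE_INT size)
{
  int i;
  if (size < 0)
    return algs->unknown_size;	/* Size not known at compile time.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    if (algs->size[i].max == -1 || size <= algs->size[i].max)
      return algs->size[i].alg;
  return libcall;		/* Fall back to the library call.  */
}
#endif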
111
112 const
113 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
114 COSTS_N_BYTES (2), /* cost of an add instruction */
115 COSTS_N_BYTES (3), /* cost of a lea instruction */
116 COSTS_N_BYTES (2), /* variable shift costs */
117 COSTS_N_BYTES (3), /* constant shift costs */
118 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
119 COSTS_N_BYTES (3), /* HI */
120 COSTS_N_BYTES (3), /* SI */
121 COSTS_N_BYTES (3), /* DI */
122 COSTS_N_BYTES (5)}, /* other */
123 0, /* cost of multiply per each bit set */
124 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
125 COSTS_N_BYTES (3), /* HI */
126 COSTS_N_BYTES (3), /* SI */
127 COSTS_N_BYTES (3), /* DI */
128 COSTS_N_BYTES (5)}, /* other */
129 COSTS_N_BYTES (3), /* cost of movsx */
130 COSTS_N_BYTES (3), /* cost of movzx */
131 0, /* "large" insn */
132 2, /* MOVE_RATIO */
133 2, /* cost for loading QImode using movzbl */
134 {2, 2, 2}, /* cost of loading integer registers
135 in QImode, HImode and SImode.
136 Relative to reg-reg move (2). */
137 {2, 2, 2}, /* cost of storing integer registers */
138 2, /* cost of reg,reg fld/fst */
139 {2, 2, 2}, /* cost of loading fp registers
140 in SFmode, DFmode and XFmode */
141 {2, 2, 2}, /* cost of storing fp registers
142 in SFmode, DFmode and XFmode */
143 3, /* cost of moving MMX register */
144 {3, 3}, /* cost of loading MMX registers
145 in SImode and DImode */
146 {3, 3}, /* cost of storing MMX registers
147 in SImode and DImode */
148 3, /* cost of moving SSE register */
149 {3, 3, 3}, /* cost of loading SSE registers
150 in SImode, DImode and TImode */
151 {3, 3, 3}, /* cost of storing SSE registers
152 in SImode, DImode and TImode */
153 3, /* MMX or SSE register to integer */
154 0, /* size of l1 cache */
155 0, /* size of l2 cache */
156 0, /* size of prefetch block */
157 0, /* number of parallel prefetches */
158 2, /* Branch cost */
159 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
160 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
161 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
162 COSTS_N_BYTES (2), /* cost of FABS instruction. */
163 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
164 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
165 ix86_size_memcpy,
166 ix86_size_memset,
167 1, /* scalar_stmt_cost. */
168 1, /* scalar load_cost. */
169 1, /* scalar_store_cost. */
170 1, /* vec_stmt_cost. */
171 1, /* vec_to_scalar_cost. */
172 1, /* scalar_to_vec_cost. */
173 1, /* vec_align_load_cost. */
174 1, /* vec_unalign_load_cost. */
175 1, /* vec_store_cost. */
176 1, /* cond_taken_branch_cost. */
177 1, /* cond_not_taken_branch_cost. */
178 };
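/* Minimal sketch, assuming the usual option-override flow in this file:
   when compiling for size, the active cost-table pointer is expected to be
   switched to the size table above, roughly as below.  The helper and its
   parameters are hypothetical; only ix86_cost and ix86_size_cost come from
   this file.  */
#if 0
static void
example_select_cost_table (bool opt_for_size,
			   const struct processor_costs *cpu_cost)
{
  /* ix86_cost is the pointer the rest of the backend consults.  */
  ix86_cost = opt_for_size ? &ix86_size_cost : cpu_cost;
}
#endif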
179
180 /* Processor costs (relative to an add) */
181 static stringop_algs i386_memcpy[2] = {
182 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
183 DUMMY_STRINGOP_ALGS};
184 static stringop_algs i386_memset[2] = {
185 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
186 DUMMY_STRINGOP_ALGS};
187
188 static const
189 struct processor_costs i386_cost = { /* 386 specific costs */
190 COSTS_N_INSNS (1), /* cost of an add instruction */
191 COSTS_N_INSNS (1), /* cost of a lea instruction */
192 COSTS_N_INSNS (3), /* variable shift costs */
193 COSTS_N_INSNS (2), /* constant shift costs */
194 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
195 COSTS_N_INSNS (6), /* HI */
196 COSTS_N_INSNS (6), /* SI */
197 COSTS_N_INSNS (6), /* DI */
198 COSTS_N_INSNS (6)}, /* other */
199 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
200 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
201 COSTS_N_INSNS (23), /* HI */
202 COSTS_N_INSNS (23), /* SI */
203 COSTS_N_INSNS (23), /* DI */
204 COSTS_N_INSNS (23)}, /* other */
205 COSTS_N_INSNS (3), /* cost of movsx */
206 COSTS_N_INSNS (2), /* cost of movzx */
207 15, /* "large" insn */
208 3, /* MOVE_RATIO */
209 4, /* cost for loading QImode using movzbl */
210 {2, 4, 2}, /* cost of loading integer registers
211 in QImode, HImode and SImode.
212 Relative to reg-reg move (2). */
213 {2, 4, 2}, /* cost of storing integer registers */
214 2, /* cost of reg,reg fld/fst */
215 {8, 8, 8}, /* cost of loading fp registers
216 in SFmode, DFmode and XFmode */
217 {8, 8, 8}, /* cost of storing fp registers
218 in SFmode, DFmode and XFmode */
219 2, /* cost of moving MMX register */
220 {4, 8}, /* cost of loading MMX registers
221 in SImode and DImode */
222 {4, 8}, /* cost of storing MMX registers
223 in SImode and DImode */
224 2, /* cost of moving SSE register */
225 {4, 8, 16}, /* cost of loading SSE registers
226 in SImode, DImode and TImode */
227 {4, 8, 16}, /* cost of storing SSE registers
228 in SImode, DImode and TImode */
229 3, /* MMX or SSE register to integer */
230 0, /* size of l1 cache */
231 0, /* size of l2 cache */
232 0, /* size of prefetch block */
233 0, /* number of parallel prefetches */
234 1, /* Branch cost */
235 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
236 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
237 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
238 COSTS_N_INSNS (22), /* cost of FABS instruction. */
239 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
240 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
241 i386_memcpy,
242 i386_memset,
243 1, /* scalar_stmt_cost. */
244 1, /* scalar load_cost. */
245 1, /* scalar_store_cost. */
246 1, /* vec_stmt_cost. */
247 1, /* vec_to_scalar_cost. */
248 1, /* scalar_to_vec_cost. */
249 1, /* vec_align_load_cost. */
250 2, /* vec_unalign_load_cost. */
251 1, /* vec_store_cost. */
252 3, /* cond_taken_branch_cost. */
253 1, /* cond_not_taken_branch_cost. */
254 };
255
256 static stringop_algs i486_memcpy[2] = {
257 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
258 DUMMY_STRINGOP_ALGS};
259 static stringop_algs i486_memset[2] = {
260 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
261 DUMMY_STRINGOP_ALGS};
262
263 static const
264 struct processor_costs i486_cost = { /* 486 specific costs */
265 COSTS_N_INSNS (1), /* cost of an add instruction */
266 COSTS_N_INSNS (1), /* cost of a lea instruction */
267 COSTS_N_INSNS (3), /* variable shift costs */
268 COSTS_N_INSNS (2), /* constant shift costs */
269 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
270 COSTS_N_INSNS (12), /* HI */
271 COSTS_N_INSNS (12), /* SI */
272 COSTS_N_INSNS (12), /* DI */
273 COSTS_N_INSNS (12)}, /* other */
274 1, /* cost of multiply per each bit set */
275 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
276 COSTS_N_INSNS (40), /* HI */
277 COSTS_N_INSNS (40), /* SI */
278 COSTS_N_INSNS (40), /* DI */
279 COSTS_N_INSNS (40)}, /* other */
280 COSTS_N_INSNS (3), /* cost of movsx */
281 COSTS_N_INSNS (2), /* cost of movzx */
282 15, /* "large" insn */
283 3, /* MOVE_RATIO */
284 4, /* cost for loading QImode using movzbl */
285 {2, 4, 2}, /* cost of loading integer registers
286 in QImode, HImode and SImode.
287 Relative to reg-reg move (2). */
288 {2, 4, 2}, /* cost of storing integer registers */
289 2, /* cost of reg,reg fld/fst */
290 {8, 8, 8}, /* cost of loading fp registers
291 in SFmode, DFmode and XFmode */
292 {8, 8, 8}, /* cost of storing fp registers
293 in SFmode, DFmode and XFmode */
294 2, /* cost of moving MMX register */
295 {4, 8}, /* cost of loading MMX registers
296 in SImode and DImode */
297 {4, 8}, /* cost of storing MMX registers
298 in SImode and DImode */
299 2, /* cost of moving SSE register */
300 {4, 8, 16}, /* cost of loading SSE registers
301 in SImode, DImode and TImode */
302 {4, 8, 16}, /* cost of storing SSE registers
303 in SImode, DImode and TImode */
304 3, /* MMX or SSE register to integer */
305 4, /* size of l1 cache. 486 has 8kB cache
306 shared for code and data, so 4kB is
307 not really precise. */
308 4, /* size of l2 cache */
309 0, /* size of prefetch block */
310 0, /* number of parallel prefetches */
311 1, /* Branch cost */
312 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
313 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
314 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
315 COSTS_N_INSNS (3), /* cost of FABS instruction. */
316 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
317 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
318 i486_memcpy,
319 i486_memset,
320 1, /* scalar_stmt_cost. */
321 1, /* scalar load_cost. */
322 1, /* scalar_store_cost. */
323 1, /* vec_stmt_cost. */
324 1, /* vec_to_scalar_cost. */
325 1, /* scalar_to_vec_cost. */
326 1, /* vec_align_load_cost. */
327 2, /* vec_unalign_load_cost. */
328 1, /* vec_store_cost. */
329 3, /* cond_taken_branch_cost. */
330 1, /* cond_not_taken_branch_cost. */
331 };
332
333 static stringop_algs pentium_memcpy[2] = {
334 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
335 DUMMY_STRINGOP_ALGS};
336 static stringop_algs pentium_memset[2] = {
337 {libcall, {{-1, rep_prefix_4_byte, false}}},
338 DUMMY_STRINGOP_ALGS};
339
340 static const
341 struct processor_costs pentium_cost = {
342 COSTS_N_INSNS (1), /* cost of an add instruction */
343 COSTS_N_INSNS (1), /* cost of a lea instruction */
344 COSTS_N_INSNS (4), /* variable shift costs */
345 COSTS_N_INSNS (1), /* constant shift costs */
346 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
347 COSTS_N_INSNS (11), /* HI */
348 COSTS_N_INSNS (11), /* SI */
349 COSTS_N_INSNS (11), /* DI */
350 COSTS_N_INSNS (11)}, /* other */
351 0, /* cost of multiply per each bit set */
352 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
353 COSTS_N_INSNS (25), /* HI */
354 COSTS_N_INSNS (25), /* SI */
355 COSTS_N_INSNS (25), /* DI */
356 COSTS_N_INSNS (25)}, /* other */
357 COSTS_N_INSNS (3), /* cost of movsx */
358 COSTS_N_INSNS (2), /* cost of movzx */
359 8, /* "large" insn */
360 6, /* MOVE_RATIO */
361 6, /* cost for loading QImode using movzbl */
362 {2, 4, 2}, /* cost of loading integer registers
363 in QImode, HImode and SImode.
364 Relative to reg-reg move (2). */
365 {2, 4, 2}, /* cost of storing integer registers */
366 2, /* cost of reg,reg fld/fst */
367 {2, 2, 6}, /* cost of loading fp registers
368 in SFmode, DFmode and XFmode */
369 {4, 4, 6}, /* cost of storing fp registers
370 in SFmode, DFmode and XFmode */
371 8, /* cost of moving MMX register */
372 {8, 8}, /* cost of loading MMX registers
373 in SImode and DImode */
374 {8, 8}, /* cost of storing MMX registers
375 in SImode and DImode */
376 2, /* cost of moving SSE register */
377 {4, 8, 16}, /* cost of loading SSE registers
378 in SImode, DImode and TImode */
379 {4, 8, 16}, /* cost of storing SSE registers
380 in SImode, DImode and TImode */
381 3, /* MMX or SSE register to integer */
382 8, /* size of l1 cache. */
383 8, /* size of l2 cache */
384 0, /* size of prefetch block */
385 0, /* number of parallel prefetches */
386 2, /* Branch cost */
387 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
388 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
389 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
390 COSTS_N_INSNS (1), /* cost of FABS instruction. */
391 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
392 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
393 pentium_memcpy,
394 pentium_memset,
395 1, /* scalar_stmt_cost. */
396 1, /* scalar load_cost. */
397 1, /* scalar_store_cost. */
398 1, /* vec_stmt_cost. */
399 1, /* vec_to_scalar_cost. */
400 1, /* scalar_to_vec_cost. */
401 1, /* vec_align_load_cost. */
402 2, /* vec_unalign_load_cost. */
403 1, /* vec_store_cost. */
404 3, /* cond_taken_branch_cost. */
405 1, /* cond_not_taken_branch_cost. */
406 };
407
408 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
409    (we ensure the alignment).  For small blocks an inline loop is still a
410    noticeable win; for bigger blocks either rep movsl or rep movsb is the
411    way to go.  Rep movsb apparently has a more expensive startup time in the
412    CPU, but after 4K the difference is down in the noise.  */
413 static stringop_algs pentiumpro_memcpy[2] = {
414 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
415 {8192, rep_prefix_4_byte, false},
416 {-1, rep_prefix_1_byte, false}}},
417 DUMMY_STRINGOP_ALGS};
418 static stringop_algs pentiumpro_memset[2] = {
419 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
420 {8192, rep_prefix_4_byte, false},
421 {-1, libcall, false}}},
422 DUMMY_STRINGOP_ALGS};
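/* Reading the memcpy table above concretely (under the interpretation of
   stringop_algs sketched earlier): with a known size, blocks up to 128
   bytes use an inline loop, up to 1024 bytes an unrolled loop, up to 8192
   bytes rep movsl, and anything larger rep movsb; when the size is unknown
   at compile time the first member is assumed to apply, i.e. rep movsl.  */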
423 static const
424 struct processor_costs pentiumpro_cost = {
425 COSTS_N_INSNS (1), /* cost of an add instruction */
426 COSTS_N_INSNS (1), /* cost of a lea instruction */
427 COSTS_N_INSNS (1), /* variable shift costs */
428 COSTS_N_INSNS (1), /* constant shift costs */
429 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
430 COSTS_N_INSNS (4), /* HI */
431 COSTS_N_INSNS (4), /* SI */
432 COSTS_N_INSNS (4), /* DI */
433 COSTS_N_INSNS (4)}, /* other */
434 0, /* cost of multiply per each bit set */
435 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
436 COSTS_N_INSNS (17), /* HI */
437 COSTS_N_INSNS (17), /* SI */
438 COSTS_N_INSNS (17), /* DI */
439 COSTS_N_INSNS (17)}, /* other */
440 COSTS_N_INSNS (1), /* cost of movsx */
441 COSTS_N_INSNS (1), /* cost of movzx */
442 8, /* "large" insn */
443 6, /* MOVE_RATIO */
444 2, /* cost for loading QImode using movzbl */
445 {4, 4, 4}, /* cost of loading integer registers
446 in QImode, HImode and SImode.
447 Relative to reg-reg move (2). */
448 {2, 2, 2}, /* cost of storing integer registers */
449 2, /* cost of reg,reg fld/fst */
450 {2, 2, 6}, /* cost of loading fp registers
451 in SFmode, DFmode and XFmode */
452 {4, 4, 6}, /* cost of storing fp registers
453 in SFmode, DFmode and XFmode */
454 2, /* cost of moving MMX register */
455 {2, 2}, /* cost of loading MMX registers
456 in SImode and DImode */
457 {2, 2}, /* cost of storing MMX registers
458 in SImode and DImode */
459 2, /* cost of moving SSE register */
460 {2, 2, 8}, /* cost of loading SSE registers
461 in SImode, DImode and TImode */
462 {2, 2, 8}, /* cost of storing SSE registers
463 in SImode, DImode and TImode */
464 3, /* MMX or SSE register to integer */
465 8, /* size of l1 cache. */
466 256, /* size of l2 cache */
467 32, /* size of prefetch block */
468 6, /* number of parallel prefetches */
469 2, /* Branch cost */
470 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
471 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
472 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
473 COSTS_N_INSNS (2), /* cost of FABS instruction. */
474 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
475 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
476 pentiumpro_memcpy,
477 pentiumpro_memset,
478 1, /* scalar_stmt_cost. */
479 1, /* scalar load_cost. */
480 1, /* scalar_store_cost. */
481 1, /* vec_stmt_cost. */
482 1, /* vec_to_scalar_cost. */
483 1, /* scalar_to_vec_cost. */
484 1, /* vec_align_load_cost. */
485 2, /* vec_unalign_load_cost. */
486 1, /* vec_store_cost. */
487 3, /* cond_taken_branch_cost. */
488 1, /* cond_not_taken_branch_cost. */
489 };
490
491 static stringop_algs geode_memcpy[2] = {
492 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
493 DUMMY_STRINGOP_ALGS};
494 static stringop_algs geode_memset[2] = {
495 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
496 DUMMY_STRINGOP_ALGS};
497 static const
498 struct processor_costs geode_cost = {
499 COSTS_N_INSNS (1), /* cost of an add instruction */
500 COSTS_N_INSNS (1), /* cost of a lea instruction */
501 COSTS_N_INSNS (2), /* variable shift costs */
502 COSTS_N_INSNS (1), /* constant shift costs */
503 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
504 COSTS_N_INSNS (4), /* HI */
505 COSTS_N_INSNS (7), /* SI */
506 COSTS_N_INSNS (7), /* DI */
507 COSTS_N_INSNS (7)}, /* other */
508 0, /* cost of multiply per each bit set */
509 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
510 COSTS_N_INSNS (23), /* HI */
511 COSTS_N_INSNS (39), /* SI */
512 COSTS_N_INSNS (39), /* DI */
513 COSTS_N_INSNS (39)}, /* other */
514 COSTS_N_INSNS (1), /* cost of movsx */
515 COSTS_N_INSNS (1), /* cost of movzx */
516 8, /* "large" insn */
517 4, /* MOVE_RATIO */
518 1, /* cost for loading QImode using movzbl */
519 {1, 1, 1}, /* cost of loading integer registers
520 in QImode, HImode and SImode.
521 Relative to reg-reg move (2). */
522 {1, 1, 1}, /* cost of storing integer registers */
523 1, /* cost of reg,reg fld/fst */
524 {1, 1, 1}, /* cost of loading fp registers
525 in SFmode, DFmode and XFmode */
526 {4, 6, 6}, /* cost of storing fp registers
527 in SFmode, DFmode and XFmode */
528
529 1, /* cost of moving MMX register */
530 {1, 1}, /* cost of loading MMX registers
531 in SImode and DImode */
532 {1, 1}, /* cost of storing MMX registers
533 in SImode and DImode */
534 1, /* cost of moving SSE register */
535 {1, 1, 1}, /* cost of loading SSE registers
536 in SImode, DImode and TImode */
537 {1, 1, 1}, /* cost of storing SSE registers
538 in SImode, DImode and TImode */
539 1, /* MMX or SSE register to integer */
540 64, /* size of l1 cache. */
541 128, /* size of l2 cache. */
542 32, /* size of prefetch block */
543 1, /* number of parallel prefetches */
544 1, /* Branch cost */
545 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
546 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
547 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
548 COSTS_N_INSNS (1), /* cost of FABS instruction. */
549 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
550 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
551 geode_memcpy,
552 geode_memset,
553 1, /* scalar_stmt_cost. */
554 1, /* scalar load_cost. */
555 1, /* scalar_store_cost. */
556 1, /* vec_stmt_cost. */
557 1, /* vec_to_scalar_cost. */
558 1, /* scalar_to_vec_cost. */
559 1, /* vec_align_load_cost. */
560 2, /* vec_unalign_load_cost. */
561 1, /* vec_store_cost. */
562 3, /* cond_taken_branch_cost. */
563 1, /* cond_not_taken_branch_cost. */
564 };
565
566 static stringop_algs k6_memcpy[2] = {
567 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
568 DUMMY_STRINGOP_ALGS};
569 static stringop_algs k6_memset[2] = {
570 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
571 DUMMY_STRINGOP_ALGS};
572 static const
573 struct processor_costs k6_cost = {
574 COSTS_N_INSNS (1), /* cost of an add instruction */
575 COSTS_N_INSNS (2), /* cost of a lea instruction */
576 COSTS_N_INSNS (1), /* variable shift costs */
577 COSTS_N_INSNS (1), /* constant shift costs */
578 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
579 COSTS_N_INSNS (3), /* HI */
580 COSTS_N_INSNS (3), /* SI */
581 COSTS_N_INSNS (3), /* DI */
582 COSTS_N_INSNS (3)}, /* other */
583 0, /* cost of multiply per each bit set */
584 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
585 COSTS_N_INSNS (18), /* HI */
586 COSTS_N_INSNS (18), /* SI */
587 COSTS_N_INSNS (18), /* DI */
588 COSTS_N_INSNS (18)}, /* other */
589 COSTS_N_INSNS (2), /* cost of movsx */
590 COSTS_N_INSNS (2), /* cost of movzx */
591 8, /* "large" insn */
592 4, /* MOVE_RATIO */
593 3, /* cost for loading QImode using movzbl */
594 {4, 5, 4}, /* cost of loading integer registers
595 in QImode, HImode and SImode.
596 Relative to reg-reg move (2). */
597 {2, 3, 2}, /* cost of storing integer registers */
598 4, /* cost of reg,reg fld/fst */
599 {6, 6, 6}, /* cost of loading fp registers
600 in SFmode, DFmode and XFmode */
601 {4, 4, 4}, /* cost of storing fp registers
602 in SFmode, DFmode and XFmode */
603 2, /* cost of moving MMX register */
604 {2, 2}, /* cost of loading MMX registers
605 in SImode and DImode */
606 {2, 2}, /* cost of storing MMX registers
607 in SImode and DImode */
608 2, /* cost of moving SSE register */
609 {2, 2, 8}, /* cost of loading SSE registers
610 in SImode, DImode and TImode */
611 {2, 2, 8}, /* cost of storing SSE registers
612 in SImode, DImode and TImode */
613 6, /* MMX or SSE register to integer */
614 32, /* size of l1 cache. */
615 32, /* size of l2 cache. Some models
616 have integrated l2 cache, but
617 optimizing for k6 is not important
618 enough to worry about that. */
619 32, /* size of prefetch block */
620 1, /* number of parallel prefetches */
621 1, /* Branch cost */
622 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
623 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
624 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
625 COSTS_N_INSNS (2), /* cost of FABS instruction. */
626 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
627 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
628 k6_memcpy,
629 k6_memset,
630 1, /* scalar_stmt_cost. */
631 1, /* scalar load_cost. */
632 1, /* scalar_store_cost. */
633 1, /* vec_stmt_cost. */
634 1, /* vec_to_scalar_cost. */
635 1, /* scalar_to_vec_cost. */
636 1, /* vec_align_load_cost. */
637 2, /* vec_unalign_load_cost. */
638 1, /* vec_store_cost. */
639 3, /* cond_taken_branch_cost. */
640 1, /* cond_not_taken_branch_cost. */
641 };
642
643 /* For some reason, Athlon deals better with the REP prefix (relative to
644    loops) than K8 does.  Alignment becomes important after 8 bytes for
645    memcpy and 128 bytes for memset.  */
646 static stringop_algs athlon_memcpy[2] = {
647 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
648 DUMMY_STRINGOP_ALGS};
649 static stringop_algs athlon_memset[2] = {
650 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS};
652 static const
653 struct processor_costs athlon_cost = {
654 COSTS_N_INSNS (1), /* cost of an add instruction */
655 COSTS_N_INSNS (2), /* cost of a lea instruction */
656 COSTS_N_INSNS (1), /* variable shift costs */
657 COSTS_N_INSNS (1), /* constant shift costs */
658 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
659 COSTS_N_INSNS (5), /* HI */
660 COSTS_N_INSNS (5), /* SI */
661 COSTS_N_INSNS (5), /* DI */
662 COSTS_N_INSNS (5)}, /* other */
663 0, /* cost of multiply per each bit set */
664 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
665 COSTS_N_INSNS (26), /* HI */
666 COSTS_N_INSNS (42), /* SI */
667 COSTS_N_INSNS (74), /* DI */
668 COSTS_N_INSNS (74)}, /* other */
669 COSTS_N_INSNS (1), /* cost of movsx */
670 COSTS_N_INSNS (1), /* cost of movzx */
671 8, /* "large" insn */
672 9, /* MOVE_RATIO */
673 4, /* cost for loading QImode using movzbl */
674 {3, 4, 3}, /* cost of loading integer registers
675 in QImode, HImode and SImode.
676 Relative to reg-reg move (2). */
677 {3, 4, 3}, /* cost of storing integer registers */
678 4, /* cost of reg,reg fld/fst */
679 {4, 4, 12}, /* cost of loading fp registers
680 in SFmode, DFmode and XFmode */
681 {6, 6, 8}, /* cost of storing fp registers
682 in SFmode, DFmode and XFmode */
683 2, /* cost of moving MMX register */
684 {4, 4}, /* cost of loading MMX registers
685 in SImode and DImode */
686 {4, 4}, /* cost of storing MMX registers
687 in SImode and DImode */
688 2, /* cost of moving SSE register */
689 {4, 4, 6}, /* cost of loading SSE registers
690 in SImode, DImode and TImode */
691 {4, 4, 5}, /* cost of storing SSE registers
692 in SImode, DImode and TImode */
693 5, /* MMX or SSE register to integer */
694 64, /* size of l1 cache. */
695 256, /* size of l2 cache. */
696 64, /* size of prefetch block */
697 6, /* number of parallel prefetches */
698 5, /* Branch cost */
699 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
700 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
701 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
702 COSTS_N_INSNS (2), /* cost of FABS instruction. */
703 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
704 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
705 athlon_memcpy,
706 athlon_memset,
707 1, /* scalar_stmt_cost. */
708 1, /* scalar load_cost. */
709 1, /* scalar_store_cost. */
710 1, /* vec_stmt_cost. */
711 1, /* vec_to_scalar_cost. */
712 1, /* scalar_to_vec_cost. */
713 1, /* vec_align_load_cost. */
714 2, /* vec_unalign_load_cost. */
715 1, /* vec_store_cost. */
716 3, /* cond_taken_branch_cost. */
717 1, /* cond_not_taken_branch_cost. */
718 };
719
720 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
721    small blocks it is better to use a loop.  For large blocks, a libcall can
722    do non-temporal accesses and beat inline code considerably.  */
723 static stringop_algs k8_memcpy[2] = {
724 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
725 {-1, rep_prefix_4_byte, false}}},
726 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
727 {-1, libcall, false}}}};
728 static stringop_algs k8_memset[2] = {
729 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
730 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
731 {libcall, {{48, unrolled_loop, false},
732 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
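/* Hedged sketch (not compiled): the two entries of each table above are
   assumed to be selected by whether the target is 64-bit, roughly:  */
#if 0
static const struct stringop_algs *
example_k8_memcpy_algs (void)
{
  /* Index 0: 32-bit tuning; index 1: 64-bit tuning (assumption).  */
  return &k8_memcpy[TARGET_64BIT ? 1 : 0];
}
#endif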
733 static const
734 struct processor_costs k8_cost = {
735 COSTS_N_INSNS (1), /* cost of an add instruction */
736 COSTS_N_INSNS (2), /* cost of a lea instruction */
737 COSTS_N_INSNS (1), /* variable shift costs */
738 COSTS_N_INSNS (1), /* constant shift costs */
739 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
740 COSTS_N_INSNS (4), /* HI */
741 COSTS_N_INSNS (3), /* SI */
742 COSTS_N_INSNS (4), /* DI */
743 COSTS_N_INSNS (5)}, /* other */
744 0, /* cost of multiply per each bit set */
745 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
746 COSTS_N_INSNS (26), /* HI */
747 COSTS_N_INSNS (42), /* SI */
748 COSTS_N_INSNS (74), /* DI */
749 COSTS_N_INSNS (74)}, /* other */
750 COSTS_N_INSNS (1), /* cost of movsx */
751 COSTS_N_INSNS (1), /* cost of movzx */
752 8, /* "large" insn */
753 9, /* MOVE_RATIO */
754 4, /* cost for loading QImode using movzbl */
755 {3, 4, 3}, /* cost of loading integer registers
756 in QImode, HImode and SImode.
757 Relative to reg-reg move (2). */
758 {3, 4, 3}, /* cost of storing integer registers */
759 4, /* cost of reg,reg fld/fst */
760 {4, 4, 12}, /* cost of loading fp registers
761 in SFmode, DFmode and XFmode */
762 {6, 6, 8}, /* cost of storing fp registers
763 in SFmode, DFmode and XFmode */
764 2, /* cost of moving MMX register */
765 {3, 3}, /* cost of loading MMX registers
766 in SImode and DImode */
767 {4, 4}, /* cost of storing MMX registers
768 in SImode and DImode */
769 2, /* cost of moving SSE register */
770 {4, 3, 6}, /* cost of loading SSE registers
771 in SImode, DImode and TImode */
772 {4, 4, 5}, /* cost of storing SSE registers
773 in SImode, DImode and TImode */
774 5, /* MMX or SSE register to integer */
775 64, /* size of l1 cache. */
776 512, /* size of l2 cache. */
777 64, /* size of prefetch block */
778 /* New AMD processors never drop prefetches; if they cannot be performed
779 immediately, they are queued. We set number of simultaneous prefetches
780 to a large constant to reflect this (it probably is not a good idea not
781 to limit number of prefetches at all, as their execution also takes some
782 time). */
783 100, /* number of parallel prefetches */
784 3, /* Branch cost */
785 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
786 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
787 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
788 COSTS_N_INSNS (2), /* cost of FABS instruction. */
789 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
790 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
791
792 k8_memcpy,
793 k8_memset,
794 4, /* scalar_stmt_cost. */
795 2, /* scalar load_cost. */
796 2, /* scalar_store_cost. */
797 5, /* vec_stmt_cost. */
798 0, /* vec_to_scalar_cost. */
799 2, /* scalar_to_vec_cost. */
800 2, /* vec_align_load_cost. */
801 3, /* vec_unalign_load_cost. */
802 3, /* vec_store_cost. */
803 3, /* cond_taken_branch_cost. */
804 2, /* cond_not_taken_branch_cost. */
805 };
806
807 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
808    very small blocks it is better to use a loop.  For large blocks, a libcall
809    can do non-temporal accesses and beat inline code considerably.  */
810 static stringop_algs amdfam10_memcpy[2] = {
811 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
812 {-1, rep_prefix_4_byte, false}}},
813 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
814 {-1, libcall, false}}}};
815 static stringop_algs amdfam10_memset[2] = {
816 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
817 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
818 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
819 {-1, libcall, false}}}};
820 struct processor_costs amdfam10_cost = {
821 COSTS_N_INSNS (1), /* cost of an add instruction */
822 COSTS_N_INSNS (2), /* cost of a lea instruction */
823 COSTS_N_INSNS (1), /* variable shift costs */
824 COSTS_N_INSNS (1), /* constant shift costs */
825 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
826 COSTS_N_INSNS (4), /* HI */
827 COSTS_N_INSNS (3), /* SI */
828 COSTS_N_INSNS (4), /* DI */
829 COSTS_N_INSNS (5)}, /* other */
830 0, /* cost of multiply per each bit set */
831 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
832 COSTS_N_INSNS (35), /* HI */
833 COSTS_N_INSNS (51), /* SI */
834 COSTS_N_INSNS (83), /* DI */
835 COSTS_N_INSNS (83)}, /* other */
836 COSTS_N_INSNS (1), /* cost of movsx */
837 COSTS_N_INSNS (1), /* cost of movzx */
838 8, /* "large" insn */
839 9, /* MOVE_RATIO */
840 4, /* cost for loading QImode using movzbl */
841 {3, 4, 3}, /* cost of loading integer registers
842 in QImode, HImode and SImode.
843 Relative to reg-reg move (2). */
844 {3, 4, 3}, /* cost of storing integer registers */
845 4, /* cost of reg,reg fld/fst */
846 {4, 4, 12}, /* cost of loading fp registers
847 in SFmode, DFmode and XFmode */
848 {6, 6, 8}, /* cost of storing fp registers
849 in SFmode, DFmode and XFmode */
850 2, /* cost of moving MMX register */
851 {3, 3}, /* cost of loading MMX registers
852 in SImode and DImode */
853 {4, 4}, /* cost of storing MMX registers
854 in SImode and DImode */
855 2, /* cost of moving SSE register */
856 {4, 4, 3}, /* cost of loading SSE registers
857 in SImode, DImode and TImode */
858 {4, 4, 5}, /* cost of storing SSE registers
859 in SImode, DImode and TImode */
860 3, /* MMX or SSE register to integer */
861 /* On K8:
862 MOVD reg64, xmmreg Double FSTORE 4
863 MOVD reg32, xmmreg Double FSTORE 4
864 On AMDFAM10:
865 MOVD reg64, xmmreg Double FADD 3
866 1/1 1/1
867 MOVD reg32, xmmreg Double FADD 3
868 1/1 1/1 */
869 64, /* size of l1 cache. */
870 512, /* size of l2 cache. */
871 64, /* size of prefetch block */
872 /* New AMD processors never drop prefetches; if they cannot be performed
873 immediately, they are queued. We set number of simultaneous prefetches
874 to a large constant to reflect this (it probably is not a good idea not
875 to limit number of prefetches at all, as their execution also takes some
876 time). */
877 100, /* number of parallel prefetches */
878 2, /* Branch cost */
879 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
880 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
881 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
882 COSTS_N_INSNS (2), /* cost of FABS instruction. */
883 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
884 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
885
886 amdfam10_memcpy,
887 amdfam10_memset,
888 4, /* scalar_stmt_cost. */
889 2, /* scalar load_cost. */
890 2, /* scalar_store_cost. */
891 6, /* vec_stmt_cost. */
892 0, /* vec_to_scalar_cost. */
893 2, /* scalar_to_vec_cost. */
894 2, /* vec_align_load_cost. */
895 2, /* vec_unalign_load_cost. */
896 2, /* vec_store_cost. */
897 2, /* cond_taken_branch_cost. */
898 1, /* cond_not_taken_branch_cost. */
899 };
900
901 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
902    very small blocks it is better to use a loop.  For large blocks, a libcall
903    can do non-temporal accesses and beat inline code considerably.  */
904 static stringop_algs bdver1_memcpy[2] = {
905 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
906 {-1, rep_prefix_4_byte, false}}},
907 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
908 {-1, libcall, false}}}};
909 static stringop_algs bdver1_memset[2] = {
910 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
911 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
912 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
913 {-1, libcall, false}}}};
914
915 const struct processor_costs bdver1_cost = {
916 COSTS_N_INSNS (1), /* cost of an add instruction */
917 COSTS_N_INSNS (1), /* cost of a lea instruction */
918 COSTS_N_INSNS (1), /* variable shift costs */
919 COSTS_N_INSNS (1), /* constant shift costs */
920 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
921 COSTS_N_INSNS (4), /* HI */
922 COSTS_N_INSNS (4), /* SI */
923 COSTS_N_INSNS (6), /* DI */
924 COSTS_N_INSNS (6)}, /* other */
925 0, /* cost of multiply per each bit set */
926 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
927 COSTS_N_INSNS (35), /* HI */
928 COSTS_N_INSNS (51), /* SI */
929 COSTS_N_INSNS (83), /* DI */
930 COSTS_N_INSNS (83)}, /* other */
931 COSTS_N_INSNS (1), /* cost of movsx */
932 COSTS_N_INSNS (1), /* cost of movzx */
933 8, /* "large" insn */
934 9, /* MOVE_RATIO */
935 4, /* cost for loading QImode using movzbl */
936 {5, 5, 4}, /* cost of loading integer registers
937 in QImode, HImode and SImode.
938 Relative to reg-reg move (2). */
939 {4, 4, 4}, /* cost of storing integer registers */
940 2, /* cost of reg,reg fld/fst */
941 {5, 5, 12}, /* cost of loading fp registers
942 in SFmode, DFmode and XFmode */
943 {4, 4, 8}, /* cost of storing fp registers
944 in SFmode, DFmode and XFmode */
945 2, /* cost of moving MMX register */
946 {4, 4}, /* cost of loading MMX registers
947 in SImode and DImode */
948 {4, 4}, /* cost of storing MMX registers
949 in SImode and DImode */
950 2, /* cost of moving SSE register */
951 {4, 4, 4}, /* cost of loading SSE registers
952 in SImode, DImode and TImode */
953 {4, 4, 4}, /* cost of storing SSE registers
954 in SImode, DImode and TImode */
955 2, /* MMX or SSE register to integer */
956 /* On K8:
957 MOVD reg64, xmmreg Double FSTORE 4
958 MOVD reg32, xmmreg Double FSTORE 4
959 On AMDFAM10:
960 MOVD reg64, xmmreg Double FADD 3
961 1/1 1/1
962 MOVD reg32, xmmreg Double FADD 3
963 1/1 1/1 */
964 16, /* size of l1 cache. */
965 2048, /* size of l2 cache. */
966 64, /* size of prefetch block */
967 /* New AMD processors never drop prefetches; if they cannot be performed
968 immediately, they are queued. We set number of simultaneous prefetches
969 to a large constant to reflect this (it probably is not a good idea not
970 to limit number of prefetches at all, as their execution also takes some
971 time). */
972 100, /* number of parallel prefetches */
973 2, /* Branch cost */
974 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
975 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
976 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
977 COSTS_N_INSNS (2), /* cost of FABS instruction. */
978 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
979 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
980
981 bdver1_memcpy,
982 bdver1_memset,
983 6, /* scalar_stmt_cost. */
984 4, /* scalar load_cost. */
985 4, /* scalar_store_cost. */
986 6, /* vec_stmt_cost. */
987 0, /* vec_to_scalar_cost. */
988 2, /* scalar_to_vec_cost. */
989 4, /* vec_align_load_cost. */
990 4, /* vec_unalign_load_cost. */
991 4, /* vec_store_cost. */
992 2, /* cond_taken_branch_cost. */
993 1, /* cond_not_taken_branch_cost. */
994 };
995
996 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
997    very small blocks it is better to use a loop.  For large blocks, a libcall
998    can do non-temporal accesses and beat inline code considerably.  */
999
1000 static stringop_algs bdver2_memcpy[2] = {
1001 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1002 {-1, rep_prefix_4_byte, false}}},
1003 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1004 {-1, libcall, false}}}};
1005 static stringop_algs bdver2_memset[2] = {
1006 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1007 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1008 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1009 {-1, libcall, false}}}};
1010
1011 const struct processor_costs bdver2_cost = {
1012 COSTS_N_INSNS (1), /* cost of an add instruction */
1013 COSTS_N_INSNS (1), /* cost of a lea instruction */
1014 COSTS_N_INSNS (1), /* variable shift costs */
1015 COSTS_N_INSNS (1), /* constant shift costs */
1016 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1017 COSTS_N_INSNS (4), /* HI */
1018 COSTS_N_INSNS (4), /* SI */
1019 COSTS_N_INSNS (6), /* DI */
1020 COSTS_N_INSNS (6)}, /* other */
1021 0, /* cost of multiply per each bit set */
1022 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1023 COSTS_N_INSNS (35), /* HI */
1024 COSTS_N_INSNS (51), /* SI */
1025 COSTS_N_INSNS (83), /* DI */
1026 COSTS_N_INSNS (83)}, /* other */
1027 COSTS_N_INSNS (1), /* cost of movsx */
1028 COSTS_N_INSNS (1), /* cost of movzx */
1029 8, /* "large" insn */
1030 9, /* MOVE_RATIO */
1031 4, /* cost for loading QImode using movzbl */
1032 {5, 5, 4}, /* cost of loading integer registers
1033 in QImode, HImode and SImode.
1034 Relative to reg-reg move (2). */
1035 {4, 4, 4}, /* cost of storing integer registers */
1036 2, /* cost of reg,reg fld/fst */
1037 {5, 5, 12}, /* cost of loading fp registers
1038 in SFmode, DFmode and XFmode */
1039 {4, 4, 8}, /* cost of storing fp registers
1040 in SFmode, DFmode and XFmode */
1041 2, /* cost of moving MMX register */
1042 {4, 4}, /* cost of loading MMX registers
1043 in SImode and DImode */
1044 {4, 4}, /* cost of storing MMX registers
1045 in SImode and DImode */
1046 2, /* cost of moving SSE register */
1047 {4, 4, 4}, /* cost of loading SSE registers
1048 in SImode, DImode and TImode */
1049 {4, 4, 4}, /* cost of storing SSE registers
1050 in SImode, DImode and TImode */
1051 2, /* MMX or SSE register to integer */
1052 /* On K8:
1053 MOVD reg64, xmmreg Double FSTORE 4
1054 MOVD reg32, xmmreg Double FSTORE 4
1055 On AMDFAM10:
1056 MOVD reg64, xmmreg Double FADD 3
1057 1/1 1/1
1058 MOVD reg32, xmmreg Double FADD 3
1059 1/1 1/1 */
1060 16, /* size of l1 cache. */
1061 2048, /* size of l2 cache. */
1062 64, /* size of prefetch block */
1063 /* New AMD processors never drop prefetches; if they cannot be performed
1064 immediately, they are queued. We set number of simultaneous prefetches
1065 to a large constant to reflect this (it probably is not a good idea not
1066 to limit number of prefetches at all, as their execution also takes some
1067 time). */
1068 100, /* number of parallel prefetches */
1069 2, /* Branch cost */
1070 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1071 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1072 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1073 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1074 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1075 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1076
1077 bdver2_memcpy,
1078 bdver2_memset,
1079 6, /* scalar_stmt_cost. */
1080 4, /* scalar load_cost. */
1081 4, /* scalar_store_cost. */
1082 6, /* vec_stmt_cost. */
1083 0, /* vec_to_scalar_cost. */
1084 2, /* scalar_to_vec_cost. */
1085 4, /* vec_align_load_cost. */
1086 4, /* vec_unalign_load_cost. */
1087 4, /* vec_store_cost. */
1088 2, /* cond_taken_branch_cost. */
1089 1, /* cond_not_taken_branch_cost. */
1090 };
1091
1092
1093 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1094    very small blocks it is better to use a loop.  For large blocks, a libcall
1095    can do non-temporal accesses and beat inline code considerably.  */
1096 static stringop_algs bdver3_memcpy[2] = {
1097 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1098 {-1, rep_prefix_4_byte, false}}},
1099 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1100 {-1, libcall, false}}}};
1101 static stringop_algs bdver3_memset[2] = {
1102 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1103 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1104 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1105 {-1, libcall, false}}}};
1106 struct processor_costs bdver3_cost = {
1107 COSTS_N_INSNS (1), /* cost of an add instruction */
1108 COSTS_N_INSNS (1), /* cost of a lea instruction */
1109 COSTS_N_INSNS (1), /* variable shift costs */
1110 COSTS_N_INSNS (1), /* constant shift costs */
1111 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1112 COSTS_N_INSNS (4), /* HI */
1113 COSTS_N_INSNS (4), /* SI */
1114 COSTS_N_INSNS (6), /* DI */
1115 COSTS_N_INSNS (6)}, /* other */
1116 0, /* cost of multiply per each bit set */
1117 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1118 COSTS_N_INSNS (35), /* HI */
1119 COSTS_N_INSNS (51), /* SI */
1120 COSTS_N_INSNS (83), /* DI */
1121 COSTS_N_INSNS (83)}, /* other */
1122 COSTS_N_INSNS (1), /* cost of movsx */
1123 COSTS_N_INSNS (1), /* cost of movzx */
1124 8, /* "large" insn */
1125 9, /* MOVE_RATIO */
1126 4, /* cost for loading QImode using movzbl */
1127 {5, 5, 4}, /* cost of loading integer registers
1128 in QImode, HImode and SImode.
1129 Relative to reg-reg move (2). */
1130 {4, 4, 4}, /* cost of storing integer registers */
1131 2, /* cost of reg,reg fld/fst */
1132 {5, 5, 12}, /* cost of loading fp registers
1133 in SFmode, DFmode and XFmode */
1134 {4, 4, 8}, /* cost of storing fp registers
1135 in SFmode, DFmode and XFmode */
1136 2, /* cost of moving MMX register */
1137 {4, 4}, /* cost of loading MMX registers
1138 in SImode and DImode */
1139 {4, 4}, /* cost of storing MMX registers
1140 in SImode and DImode */
1141 2, /* cost of moving SSE register */
1142 {4, 4, 4}, /* cost of loading SSE registers
1143 in SImode, DImode and TImode */
1144 {4, 4, 4}, /* cost of storing SSE registers
1145 in SImode, DImode and TImode */
1146 2, /* MMX or SSE register to integer */
1147 16, /* size of l1 cache. */
1148 2048, /* size of l2 cache. */
1149 64, /* size of prefetch block */
1150 /* New AMD processors never drop prefetches; if they cannot be performed
1151 immediately, they are queued. We set number of simultaneous prefetches
1152 to a large constant to reflect this (it probably is not a good idea not
1153 to limit number of prefetches at all, as their execution also takes some
1154 time). */
1155 100, /* number of parallel prefetches */
1156 2, /* Branch cost */
1157 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1158 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1159 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1160 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1161 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1162 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1163
1164 bdver3_memcpy,
1165 bdver3_memset,
1166 6, /* scalar_stmt_cost. */
1167 4, /* scalar load_cost. */
1168 4, /* scalar_store_cost. */
1169 6, /* vec_stmt_cost. */
1170 0, /* vec_to_scalar_cost. */
1171 2, /* scalar_to_vec_cost. */
1172 4, /* vec_align_load_cost. */
1173 4, /* vec_unalign_load_cost. */
1174 4, /* vec_store_cost. */
1175 2, /* cond_taken_branch_cost. */
1176 1, /* cond_not_taken_branch_cost. */
1177 };
1178
1179 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1180    very small blocks it is better to use a loop.  For large blocks, a libcall
1181    can do non-temporal accesses and beat inline code considerably.  */
1182 static stringop_algs bdver4_memcpy[2] = {
1183 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1184 {-1, rep_prefix_4_byte, false}}},
1185 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1186 {-1, libcall, false}}}};
1187 static stringop_algs bdver4_memset[2] = {
1188 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1189 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1190 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1191 {-1, libcall, false}}}};
1192 struct processor_costs bdver4_cost = {
1193 COSTS_N_INSNS (1), /* cost of an add instruction */
1194 COSTS_N_INSNS (1), /* cost of a lea instruction */
1195 COSTS_N_INSNS (1), /* variable shift costs */
1196 COSTS_N_INSNS (1), /* constant shift costs */
1197 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1198 COSTS_N_INSNS (4), /* HI */
1199 COSTS_N_INSNS (4), /* SI */
1200 COSTS_N_INSNS (6), /* DI */
1201 COSTS_N_INSNS (6)}, /* other */
1202 0, /* cost of multiply per each bit set */
1203 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1204 COSTS_N_INSNS (35), /* HI */
1205 COSTS_N_INSNS (51), /* SI */
1206 COSTS_N_INSNS (83), /* DI */
1207 COSTS_N_INSNS (83)}, /* other */
1208 COSTS_N_INSNS (1), /* cost of movsx */
1209 COSTS_N_INSNS (1), /* cost of movzx */
1210 8, /* "large" insn */
1211 9, /* MOVE_RATIO */
1212 4, /* cost for loading QImode using movzbl */
1213 {5, 5, 4}, /* cost of loading integer registers
1214 in QImode, HImode and SImode.
1215 Relative to reg-reg move (2). */
1216 {4, 4, 4}, /* cost of storing integer registers */
1217 2, /* cost of reg,reg fld/fst */
1218 {5, 5, 12}, /* cost of loading fp registers
1219 in SFmode, DFmode and XFmode */
1220 {4, 4, 8}, /* cost of storing fp registers
1221 in SFmode, DFmode and XFmode */
1222 2, /* cost of moving MMX register */
1223 {4, 4}, /* cost of loading MMX registers
1224 in SImode and DImode */
1225 {4, 4}, /* cost of storing MMX registers
1226 in SImode and DImode */
1227 2, /* cost of moving SSE register */
1228 {4, 4, 4}, /* cost of loading SSE registers
1229 in SImode, DImode and TImode */
1230 {4, 4, 4}, /* cost of storing SSE registers
1231 in SImode, DImode and TImode */
1232 2, /* MMX or SSE register to integer */
1233 16, /* size of l1 cache. */
1234 2048, /* size of l2 cache. */
1235 64, /* size of prefetch block */
1236 /* New AMD processors never drop prefetches; if they cannot be performed
1237 immediately, they are queued. We set number of simultaneous prefetches
1238 to a large constant to reflect this (it probably is not a good idea not
1239 to limit number of prefetches at all, as their execution also takes some
1240 time). */
1241 100, /* number of parallel prefetches */
1242 2, /* Branch cost */
1243 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1244 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1245 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1246 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1247 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1248 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1249
1250 bdver4_memcpy,
1251 bdver4_memset,
1252 6, /* scalar_stmt_cost. */
1253 4, /* scalar load_cost. */
1254 4, /* scalar_store_cost. */
1255 6, /* vec_stmt_cost. */
1256 0, /* vec_to_scalar_cost. */
1257 2, /* scalar_to_vec_cost. */
1258 4, /* vec_align_load_cost. */
1259 4, /* vec_unalign_load_cost. */
1260 4, /* vec_store_cost. */
1261 2, /* cond_taken_branch_cost. */
1262 1, /* cond_not_taken_branch_cost. */
1263 };
1264
1265 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1266    very small blocks it is better to use a loop.  For large blocks, a libcall
1267    can do non-temporal accesses and beat inline code considerably.  */
1268 static stringop_algs btver1_memcpy[2] = {
1269 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1270 {-1, rep_prefix_4_byte, false}}},
1271 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1272 {-1, libcall, false}}}};
1273 static stringop_algs btver1_memset[2] = {
1274 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1275 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1276 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1277 {-1, libcall, false}}}};
1278 const struct processor_costs btver1_cost = {
1279 COSTS_N_INSNS (1), /* cost of an add instruction */
1280 COSTS_N_INSNS (2), /* cost of a lea instruction */
1281 COSTS_N_INSNS (1), /* variable shift costs */
1282 COSTS_N_INSNS (1), /* constant shift costs */
1283 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1284 COSTS_N_INSNS (4), /* HI */
1285 COSTS_N_INSNS (3), /* SI */
1286 COSTS_N_INSNS (4), /* DI */
1287 COSTS_N_INSNS (5)}, /* other */
1288 0, /* cost of multiply per each bit set */
1289 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1290 COSTS_N_INSNS (35), /* HI */
1291 COSTS_N_INSNS (51), /* SI */
1292 COSTS_N_INSNS (83), /* DI */
1293 COSTS_N_INSNS (83)}, /* other */
1294 COSTS_N_INSNS (1), /* cost of movsx */
1295 COSTS_N_INSNS (1), /* cost of movzx */
1296 8, /* "large" insn */
1297 9, /* MOVE_RATIO */
1298 4, /* cost for loading QImode using movzbl */
1299 {3, 4, 3}, /* cost of loading integer registers
1300 in QImode, HImode and SImode.
1301 Relative to reg-reg move (2). */
1302 {3, 4, 3}, /* cost of storing integer registers */
1303 4, /* cost of reg,reg fld/fst */
1304 {4, 4, 12}, /* cost of loading fp registers
1305 in SFmode, DFmode and XFmode */
1306 {6, 6, 8}, /* cost of storing fp registers
1307 in SFmode, DFmode and XFmode */
1308 2, /* cost of moving MMX register */
1309 {3, 3}, /* cost of loading MMX registers
1310 in SImode and DImode */
1311 {4, 4}, /* cost of storing MMX registers
1312 in SImode and DImode */
1313 2, /* cost of moving SSE register */
1314 {4, 4, 3}, /* cost of loading SSE registers
1315 in SImode, DImode and TImode */
1316 {4, 4, 5}, /* cost of storing SSE registers
1317 in SImode, DImode and TImode */
1318 3, /* MMX or SSE register to integer */
1319 /* On K8:
1320 MOVD reg64, xmmreg Double FSTORE 4
1321 MOVD reg32, xmmreg Double FSTORE 4
1322 On AMDFAM10:
1323 MOVD reg64, xmmreg Double FADD 3
1324 1/1 1/1
1325 MOVD reg32, xmmreg Double FADD 3
1326 1/1 1/1 */
1327 32, /* size of l1 cache. */
1328 512, /* size of l2 cache. */
1329 64, /* size of prefetch block */
1330 100, /* number of parallel prefetches */
1331 2, /* Branch cost */
1332 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1333 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1334 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1335 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1336 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1337 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1338
1339 btver1_memcpy,
1340 btver1_memset,
1341 4, /* scalar_stmt_cost. */
1342 2, /* scalar load_cost. */
1343 2, /* scalar_store_cost. */
1344 6, /* vec_stmt_cost. */
1345 0, /* vec_to_scalar_cost. */
1346 2, /* scalar_to_vec_cost. */
1347 2, /* vec_align_load_cost. */
1348 2, /* vec_unalign_load_cost. */
1349 2, /* vec_store_cost. */
1350 2, /* cond_taken_branch_cost. */
1351 1, /* cond_not_taken_branch_cost. */
1352 };
1353
1354 static stringop_algs btver2_memcpy[2] = {
1355 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1356 {-1, rep_prefix_4_byte, false}}},
1357 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1358 {-1, libcall, false}}}};
1359 static stringop_algs btver2_memset[2] = {
1360 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1361 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1362 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1363 {-1, libcall, false}}}};
1364 const struct processor_costs btver2_cost = {
1365 COSTS_N_INSNS (1), /* cost of an add instruction */
1366 COSTS_N_INSNS (2), /* cost of a lea instruction */
1367 COSTS_N_INSNS (1), /* variable shift costs */
1368 COSTS_N_INSNS (1), /* constant shift costs */
1369 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1370 COSTS_N_INSNS (4), /* HI */
1371 COSTS_N_INSNS (3), /* SI */
1372 COSTS_N_INSNS (4), /* DI */
1373 COSTS_N_INSNS (5)}, /* other */
1374 0, /* cost of multiply per each bit set */
1375 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1376 COSTS_N_INSNS (35), /* HI */
1377 COSTS_N_INSNS (51), /* SI */
1378 COSTS_N_INSNS (83), /* DI */
1379 COSTS_N_INSNS (83)}, /* other */
1380 COSTS_N_INSNS (1), /* cost of movsx */
1381 COSTS_N_INSNS (1), /* cost of movzx */
1382 8, /* "large" insn */
1383 9, /* MOVE_RATIO */
1384 4, /* cost for loading QImode using movzbl */
1385 {3, 4, 3}, /* cost of loading integer registers
1386 in QImode, HImode and SImode.
1387 Relative to reg-reg move (2). */
1388 {3, 4, 3}, /* cost of storing integer registers */
1389 4, /* cost of reg,reg fld/fst */
1390 {4, 4, 12}, /* cost of loading fp registers
1391 in SFmode, DFmode and XFmode */
1392 {6, 6, 8}, /* cost of storing fp registers
1393 in SFmode, DFmode and XFmode */
1394 2, /* cost of moving MMX register */
1395 {3, 3}, /* cost of loading MMX registers
1396 in SImode and DImode */
1397 {4, 4}, /* cost of storing MMX registers
1398 in SImode and DImode */
1399 2, /* cost of moving SSE register */
1400 {4, 4, 3}, /* cost of loading SSE registers
1401 in SImode, DImode and TImode */
1402 {4, 4, 5}, /* cost of storing SSE registers
1403 in SImode, DImode and TImode */
1404 3, /* MMX or SSE register to integer */
1405 /* On K8:
1406 MOVD reg64, xmmreg Double FSTORE 4
1407 MOVD reg32, xmmreg Double FSTORE 4
1408 On AMDFAM10:
1409 MOVD reg64, xmmreg Double FADD 3
1410 1/1 1/1
1411 MOVD reg32, xmmreg Double FADD 3
1412 1/1 1/1 */
1413 32, /* size of l1 cache. */
1414 2048, /* size of l2 cache. */
1415 64, /* size of prefetch block */
1416 100, /* number of parallel prefetches */
1417 2, /* Branch cost */
1418 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1419 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1420 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1421 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1422 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1423 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1424 btver2_memcpy,
1425 btver2_memset,
1426 4, /* scalar_stmt_cost. */
1427 2, /* scalar load_cost. */
1428 2, /* scalar_store_cost. */
1429 6, /* vec_stmt_cost. */
1430 0, /* vec_to_scalar_cost. */
1431 2, /* scalar_to_vec_cost. */
1432 2, /* vec_align_load_cost. */
1433 2, /* vec_unalign_load_cost. */
1434 2, /* vec_store_cost. */
1435 2, /* cond_taken_branch_cost. */
1436 1, /* cond_not_taken_branch_cost. */
1437 };
1438
1439 static stringop_algs pentium4_memcpy[2] = {
1440 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1441 DUMMY_STRINGOP_ALGS};
1442 static stringop_algs pentium4_memset[2] = {
1443 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1444 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1445 DUMMY_STRINGOP_ALGS};
1446
1447 static const
1448 struct processor_costs pentium4_cost = {
1449 COSTS_N_INSNS (1), /* cost of an add instruction */
1450 COSTS_N_INSNS (3), /* cost of a lea instruction */
1451 COSTS_N_INSNS (4), /* variable shift costs */
1452 COSTS_N_INSNS (4), /* constant shift costs */
1453 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1454 COSTS_N_INSNS (15), /* HI */
1455 COSTS_N_INSNS (15), /* SI */
1456 COSTS_N_INSNS (15), /* DI */
1457 COSTS_N_INSNS (15)}, /* other */
1458 0, /* cost of multiply per each bit set */
1459 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1460 COSTS_N_INSNS (56), /* HI */
1461 COSTS_N_INSNS (56), /* SI */
1462 COSTS_N_INSNS (56), /* DI */
1463 COSTS_N_INSNS (56)}, /* other */
1464 COSTS_N_INSNS (1), /* cost of movsx */
1465 COSTS_N_INSNS (1), /* cost of movzx */
1466 16, /* "large" insn */
1467 6, /* MOVE_RATIO */
1468 2, /* cost for loading QImode using movzbl */
1469 {4, 5, 4}, /* cost of loading integer registers
1470 in QImode, HImode and SImode.
1471 Relative to reg-reg move (2). */
1472 {2, 3, 2}, /* cost of storing integer registers */
1473 2, /* cost of reg,reg fld/fst */
1474 {2, 2, 6}, /* cost of loading fp registers
1475 in SFmode, DFmode and XFmode */
1476 {4, 4, 6}, /* cost of storing fp registers
1477 in SFmode, DFmode and XFmode */
1478 2, /* cost of moving MMX register */
1479 {2, 2}, /* cost of loading MMX registers
1480 in SImode and DImode */
1481 {2, 2}, /* cost of storing MMX registers
1482 in SImode and DImode */
1483 12, /* cost of moving SSE register */
1484 {12, 12, 12}, /* cost of loading SSE registers
1485 in SImode, DImode and TImode */
1486 {2, 2, 8}, /* cost of storing SSE registers
1487 in SImode, DImode and TImode */
1488 10, /* MMX or SSE register to integer */
1489 8, /* size of l1 cache. */
1490 256, /* size of l2 cache. */
1491 64, /* size of prefetch block */
1492 6, /* number of parallel prefetches */
1493 2, /* Branch cost */
1494 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1495 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1496 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1497 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1498 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1499 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1500 pentium4_memcpy,
1501 pentium4_memset,
1502 1, /* scalar_stmt_cost. */
1503 1, /* scalar load_cost. */
1504 1, /* scalar_store_cost. */
1505 1, /* vec_stmt_cost. */
1506 1, /* vec_to_scalar_cost. */
1507 1, /* scalar_to_vec_cost. */
1508 1, /* vec_align_load_cost. */
1509 2, /* vec_unalign_load_cost. */
1510 1, /* vec_store_cost. */
1511 3, /* cond_taken_branch_cost. */
1512 1, /* cond_not_taken_branch_cost. */
1513 };
1514
1515 static stringop_algs nocona_memcpy[2] = {
1516 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1517 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1518 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1519
1520 static stringop_algs nocona_memset[2] = {
1521 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1522 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1523 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1524 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1525
1526 static const
1527 struct processor_costs nocona_cost = {
1528 COSTS_N_INSNS (1), /* cost of an add instruction */
1529 COSTS_N_INSNS (1), /* cost of a lea instruction */
1530 COSTS_N_INSNS (1), /* variable shift costs */
1531 COSTS_N_INSNS (1), /* constant shift costs */
1532 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1533 COSTS_N_INSNS (10), /* HI */
1534 COSTS_N_INSNS (10), /* SI */
1535 COSTS_N_INSNS (10), /* DI */
1536 COSTS_N_INSNS (10)}, /* other */
1537 0, /* cost of multiply per each bit set */
1538 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1539 COSTS_N_INSNS (66), /* HI */
1540 COSTS_N_INSNS (66), /* SI */
1541 COSTS_N_INSNS (66), /* DI */
1542 COSTS_N_INSNS (66)}, /* other */
1543 COSTS_N_INSNS (1), /* cost of movsx */
1544 COSTS_N_INSNS (1), /* cost of movzx */
1545 16, /* "large" insn */
1546 17, /* MOVE_RATIO */
1547 4, /* cost for loading QImode using movzbl */
1548 {4, 4, 4}, /* cost of loading integer registers
1549 in QImode, HImode and SImode.
1550 Relative to reg-reg move (2). */
1551 {4, 4, 4}, /* cost of storing integer registers */
1552 3, /* cost of reg,reg fld/fst */
1553 {12, 12, 12}, /* cost of loading fp registers
1554 in SFmode, DFmode and XFmode */
1555 {4, 4, 4}, /* cost of storing fp registers
1556 in SFmode, DFmode and XFmode */
1557 6, /* cost of moving MMX register */
1558 {12, 12}, /* cost of loading MMX registers
1559 in SImode and DImode */
1560 {12, 12}, /* cost of storing MMX registers
1561 in SImode and DImode */
1562 6, /* cost of moving SSE register */
1563 {12, 12, 12}, /* cost of loading SSE registers
1564 in SImode, DImode and TImode */
1565 {12, 12, 12}, /* cost of storing SSE registers
1566 in SImode, DImode and TImode */
1567 8, /* MMX or SSE register to integer */
1568 8, /* size of l1 cache. */
1569 1024, /* size of l2 cache. */
1570 128, /* size of prefetch block */
1571 8, /* number of parallel prefetches */
1572 1, /* Branch cost */
1573 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1574 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1575 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1576 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1577 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1578 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1579 nocona_memcpy,
1580 nocona_memset,
1581 1, /* scalar_stmt_cost. */
1582 1, /* scalar load_cost. */
1583 1, /* scalar_store_cost. */
1584 1, /* vec_stmt_cost. */
1585 1, /* vec_to_scalar_cost. */
1586 1, /* scalar_to_vec_cost. */
1587 1, /* vec_align_load_cost. */
1588 2, /* vec_unalign_load_cost. */
1589 1, /* vec_store_cost. */
1590 3, /* cond_taken_branch_cost. */
1591 1, /* cond_not_taken_branch_cost. */
1592 };
1593
1594 static stringop_algs atom_memcpy[2] = {
1595 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1596 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1597 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1598 static stringop_algs atom_memset[2] = {
1599 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1600 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1601 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1602 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1603 static const
1604 struct processor_costs atom_cost = {
1605 COSTS_N_INSNS (1), /* cost of an add instruction */
1606 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1607 COSTS_N_INSNS (1), /* variable shift costs */
1608 COSTS_N_INSNS (1), /* constant shift costs */
1609 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1610 COSTS_N_INSNS (4), /* HI */
1611 COSTS_N_INSNS (3), /* SI */
1612 COSTS_N_INSNS (4), /* DI */
1613 COSTS_N_INSNS (2)}, /* other */
1614 0, /* cost of multiply per each bit set */
1615 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1616 COSTS_N_INSNS (26), /* HI */
1617 COSTS_N_INSNS (42), /* SI */
1618 COSTS_N_INSNS (74), /* DI */
1619 COSTS_N_INSNS (74)}, /* other */
1620 COSTS_N_INSNS (1), /* cost of movsx */
1621 COSTS_N_INSNS (1), /* cost of movzx */
1622 8, /* "large" insn */
1623 17, /* MOVE_RATIO */
1624 4, /* cost for loading QImode using movzbl */
1625 {4, 4, 4}, /* cost of loading integer registers
1626 in QImode, HImode and SImode.
1627 Relative to reg-reg move (2). */
1628 {4, 4, 4}, /* cost of storing integer registers */
1629 4, /* cost of reg,reg fld/fst */
1630 {12, 12, 12}, /* cost of loading fp registers
1631 in SFmode, DFmode and XFmode */
1632 {6, 6, 8}, /* cost of storing fp registers
1633 in SFmode, DFmode and XFmode */
1634 2, /* cost of moving MMX register */
1635 {8, 8}, /* cost of loading MMX registers
1636 in SImode and DImode */
1637 {8, 8}, /* cost of storing MMX registers
1638 in SImode and DImode */
1639 2, /* cost of moving SSE register */
1640 {8, 8, 8}, /* cost of loading SSE registers
1641 in SImode, DImode and TImode */
1642 {8, 8, 8}, /* cost of storing SSE registers
1643 in SImode, DImode and TImode */
1644 5, /* MMX or SSE register to integer */
1645 32, /* size of l1 cache. */
1646 256, /* size of l2 cache. */
1647 64, /* size of prefetch block */
1648 6, /* number of parallel prefetches */
1649 3, /* Branch cost */
1650 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1651 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1652 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1653 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1654 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1655 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1656 atom_memcpy,
1657 atom_memset,
1658 1, /* scalar_stmt_cost. */
1659 1, /* scalar load_cost. */
1660 1, /* scalar_store_cost. */
1661 1, /* vec_stmt_cost. */
1662 1, /* vec_to_scalar_cost. */
1663 1, /* scalar_to_vec_cost. */
1664 1, /* vec_align_load_cost. */
1665 2, /* vec_unalign_load_cost. */
1666 1, /* vec_store_cost. */
1667 3, /* cond_taken_branch_cost. */
1668 1, /* cond_not_taken_branch_cost. */
1669 };
1670
1671 static stringop_algs slm_memcpy[2] = {
1672 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1673 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1674 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1675 static stringop_algs slm_memset[2] = {
1676 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1677 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1678 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1679 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1680 static const
1681 struct processor_costs slm_cost = {
1682 COSTS_N_INSNS (1), /* cost of an add instruction */
1683 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1684 COSTS_N_INSNS (1), /* variable shift costs */
1685 COSTS_N_INSNS (1), /* constant shift costs */
1686 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1687 COSTS_N_INSNS (4), /* HI */
1688 COSTS_N_INSNS (3), /* SI */
1689 COSTS_N_INSNS (4), /* DI */
1690 COSTS_N_INSNS (2)}, /* other */
1691 0, /* cost of multiply per each bit set */
1692 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1693 COSTS_N_INSNS (26), /* HI */
1694 COSTS_N_INSNS (42), /* SI */
1695 COSTS_N_INSNS (74), /* DI */
1696 COSTS_N_INSNS (74)}, /* other */
1697 COSTS_N_INSNS (1), /* cost of movsx */
1698 COSTS_N_INSNS (1), /* cost of movzx */
1699 8, /* "large" insn */
1700 17, /* MOVE_RATIO */
1701 4, /* cost for loading QImode using movzbl */
1702 {4, 4, 4}, /* cost of loading integer registers
1703 in QImode, HImode and SImode.
1704 Relative to reg-reg move (2). */
1705 {4, 4, 4}, /* cost of storing integer registers */
1706 4, /* cost of reg,reg fld/fst */
1707 {12, 12, 12}, /* cost of loading fp registers
1708 in SFmode, DFmode and XFmode */
1709 {6, 6, 8}, /* cost of storing fp registers
1710 in SFmode, DFmode and XFmode */
1711 2, /* cost of moving MMX register */
1712 {8, 8}, /* cost of loading MMX registers
1713 in SImode and DImode */
1714 {8, 8}, /* cost of storing MMX registers
1715 in SImode and DImode */
1716 2, /* cost of moving SSE register */
1717 {8, 8, 8}, /* cost of loading SSE registers
1718 in SImode, DImode and TImode */
1719 {8, 8, 8}, /* cost of storing SSE registers
1720 in SImode, DImode and TImode */
1721 5, /* MMX or SSE register to integer */
1722 32, /* size of l1 cache. */
1723 256, /* size of l2 cache. */
1724 64, /* size of prefetch block */
1725 6, /* number of parallel prefetches */
1726 3, /* Branch cost */
1727 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1728 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1729 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1730 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1731 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1732 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1733 slm_memcpy,
1734 slm_memset,
1735 1, /* scalar_stmt_cost. */
1736 1, /* scalar load_cost. */
1737 1, /* scalar_store_cost. */
1738 1, /* vec_stmt_cost. */
1739 1, /* vec_to_scalar_cost. */
1740 1, /* scalar_to_vec_cost. */
1741 1, /* vec_align_load_cost. */
1742 2, /* vec_unalign_load_cost. */
1743 1, /* vec_store_cost. */
1744 3, /* cond_taken_branch_cost. */
1745 1, /* cond_not_taken_branch_cost. */
1746 };
1747
1748 /* Generic should produce code tuned for Core i7 and btver1
1749 (and newer chips in both lines). */
1750
1751 static stringop_algs generic_memcpy[2] = {
1752 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1753 {-1, libcall, false}}},
1754 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1755 {-1, libcall, false}}}};
1756 static stringop_algs generic_memset[2] = {
1757 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1758 {-1, libcall, false}}},
1759 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1760 {-1, libcall, false}}}};
1761 static const
1762 struct processor_costs generic_cost = {
1763 COSTS_N_INSNS (1), /* cost of an add instruction */
1764 /* On all chips taken into consideration lea is 2 cycles and more. With
1765 this cost however our current implementation of synth_mult results in
1766 use of unnecessary temporary registers causing regression on several
1767 SPECfp benchmarks. */
1768 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1769 COSTS_N_INSNS (1), /* variable shift costs */
1770 COSTS_N_INSNS (1), /* constant shift costs */
1771 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1772 COSTS_N_INSNS (4), /* HI */
1773 COSTS_N_INSNS (3), /* SI */
1774 COSTS_N_INSNS (4), /* DI */
1775 COSTS_N_INSNS (2)}, /* other */
1776 0, /* cost of multiply per each bit set */
1777 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1778 COSTS_N_INSNS (26), /* HI */
1779 COSTS_N_INSNS (42), /* SI */
1780 COSTS_N_INSNS (74), /* DI */
1781 COSTS_N_INSNS (74)}, /* other */
1782 COSTS_N_INSNS (1), /* cost of movsx */
1783 COSTS_N_INSNS (1), /* cost of movzx */
1784 8, /* "large" insn */
1785 17, /* MOVE_RATIO */
1786 4, /* cost for loading QImode using movzbl */
1787 {4, 4, 4}, /* cost of loading integer registers
1788 in QImode, HImode and SImode.
1789 Relative to reg-reg move (2). */
1790 {4, 4, 4}, /* cost of storing integer registers */
1791 4, /* cost of reg,reg fld/fst */
1792 {12, 12, 12}, /* cost of loading fp registers
1793 in SFmode, DFmode and XFmode */
1794 {6, 6, 8}, /* cost of storing fp registers
1795 in SFmode, DFmode and XFmode */
1796 2, /* cost of moving MMX register */
1797 {8, 8}, /* cost of loading MMX registers
1798 in SImode and DImode */
1799 {8, 8}, /* cost of storing MMX registers
1800 in SImode and DImode */
1801 2, /* cost of moving SSE register */
1802 {8, 8, 8}, /* cost of loading SSE registers
1803 in SImode, DImode and TImode */
1804 {8, 8, 8}, /* cost of storing SSE registers
1805 in SImode, DImode and TImode */
1806 5, /* MMX or SSE register to integer */
1807 32, /* size of l1 cache. */
1808 512, /* size of l2 cache. */
1809 64, /* size of prefetch block */
1810 6, /* number of parallel prefetches */
1811 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1812 value is increased to the perhaps more appropriate value of 5. */
1813 3, /* Branch cost */
1814 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1815 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1816 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1817 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1818 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1819 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1820 generic_memcpy,
1821 generic_memset,
1822 1, /* scalar_stmt_cost. */
1823 1, /* scalar load_cost. */
1824 1, /* scalar_store_cost. */
1825 1, /* vec_stmt_cost. */
1826 1, /* vec_to_scalar_cost. */
1827 1, /* scalar_to_vec_cost. */
1828 1, /* vec_align_load_cost. */
1829 2, /* vec_unalign_load_cost. */
1830 1, /* vec_store_cost. */
1831 3, /* cond_taken_branch_cost. */
1832 1, /* cond_not_taken_branch_cost. */
1833 };
1834
1835 /* core_cost should produce code tuned for the Core family of CPUs. */
1836 static stringop_algs core_memcpy[2] = {
1837 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1838 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1839 {-1, libcall, false}}}};
1840 static stringop_algs core_memset[2] = {
1841 {libcall, {{6, loop_1_byte, true},
1842 {24, loop, true},
1843 {8192, rep_prefix_4_byte, true},
1844 {-1, libcall, false}}},
1845 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1846 {-1, libcall, false}}}};
1847
1848 static const
1849 struct processor_costs core_cost = {
1850 COSTS_N_INSNS (1), /* cost of an add instruction */
1851 /* On all chips taken into consideration lea is 2 cycles and more. With
1852 this cost however our current implementation of synth_mult results in
1853 use of unnecessary temporary registers causing regression on several
1854 SPECfp benchmarks. */
1855 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1856 COSTS_N_INSNS (1), /* variable shift costs */
1857 COSTS_N_INSNS (1), /* constant shift costs */
1858 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1859 COSTS_N_INSNS (4), /* HI */
1860 COSTS_N_INSNS (3), /* SI */
1861 COSTS_N_INSNS (4), /* DI */
1862 COSTS_N_INSNS (2)}, /* other */
1863 0, /* cost of multiply per each bit set */
1864 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1865 COSTS_N_INSNS (26), /* HI */
1866 COSTS_N_INSNS (42), /* SI */
1867 COSTS_N_INSNS (74), /* DI */
1868 COSTS_N_INSNS (74)}, /* other */
1869 COSTS_N_INSNS (1), /* cost of movsx */
1870 COSTS_N_INSNS (1), /* cost of movzx */
1871 8, /* "large" insn */
1872 17, /* MOVE_RATIO */
1873 4, /* cost for loading QImode using movzbl */
1874 {4, 4, 4}, /* cost of loading integer registers
1875 in QImode, HImode and SImode.
1876 Relative to reg-reg move (2). */
1877 {4, 4, 4}, /* cost of storing integer registers */
1878 4, /* cost of reg,reg fld/fst */
1879 {12, 12, 12}, /* cost of loading fp registers
1880 in SFmode, DFmode and XFmode */
1881 {6, 6, 8}, /* cost of storing fp registers
1882 in SFmode, DFmode and XFmode */
1883 2, /* cost of moving MMX register */
1884 {8, 8}, /* cost of loading MMX registers
1885 in SImode and DImode */
1886 {8, 8}, /* cost of storing MMX registers
1887 in SImode and DImode */
1888 2, /* cost of moving SSE register */
1889 {8, 8, 8}, /* cost of loading SSE registers
1890 in SImode, DImode and TImode */
1891 {8, 8, 8}, /* cost of storing SSE registers
1892 in SImode, DImode and TImode */
1893 5, /* MMX or SSE register to integer */
1894 64, /* size of l1 cache. */
1895 512, /* size of l2 cache. */
1896 64, /* size of prefetch block */
1897 6, /* number of parallel prefetches */
1898 /* FIXME: perhaps a more appropriate value is 5. */
1899 3, /* Branch cost */
1900 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1901 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1902 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1903 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1904 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1905 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1906 core_memcpy,
1907 core_memset,
1908 1, /* scalar_stmt_cost. */
1909 1, /* scalar load_cost. */
1910 1, /* scalar_store_cost. */
1911 1, /* vec_stmt_cost. */
1912 1, /* vec_to_scalar_cost. */
1913 1, /* scalar_to_vec_cost. */
1914 1, /* vec_align_load_cost. */
1915 2, /* vec_unalign_load_cost. */
1916 1, /* vec_store_cost. */
1917 3, /* cond_taken_branch_cost. */
1918 1, /* cond_not_taken_branch_cost. */
1919 };
1920
1921
1922 /* Set by -mtune. */
1923 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1924
1925 /* Set by -mtune or -Os. */
1926 const struct processor_costs *ix86_cost = &pentium_cost;
1927
1928 /* Processor feature/optimization bitmasks. */
1929 #define m_386 (1<<PROCESSOR_I386)
1930 #define m_486 (1<<PROCESSOR_I486)
1931 #define m_PENT (1<<PROCESSOR_PENTIUM)
1932 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1933 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1934 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1935 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1936 #define m_CORE2 (1<<PROCESSOR_CORE2)
1937 #define m_COREI7 (1<<PROCESSOR_COREI7)
1938 #define m_COREI7_AVX (1<<PROCESSOR_COREI7_AVX)
1939 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1940 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_COREI7_AVX | m_HASWELL)
1941 #define m_ATOM (1<<PROCESSOR_ATOM)
1942 #define m_SLM (1<<PROCESSOR_SLM)
1943
1944 #define m_GEODE (1<<PROCESSOR_GEODE)
1945 #define m_K6 (1<<PROCESSOR_K6)
1946 #define m_K6_GEODE (m_K6 | m_GEODE)
1947 #define m_K8 (1<<PROCESSOR_K8)
1948 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1949 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1950 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1951 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1952 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1953 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1954 #define m_BDVER4 (1<<PROCESSOR_BDVER4)
1955 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1956 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1957 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
1958 #define m_BTVER (m_BTVER1 | m_BTVER2)
1959 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1960
1961 #define m_GENERIC (1<<PROCESSOR_GENERIC)
1962
1963 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
1964 #undef DEF_TUNE
1965 #define DEF_TUNE(tune, name, selector) name,
1966 #include "x86-tune.def"
1967 #undef DEF_TUNE
1968 };
1969
1970 /* Feature tests against the various tunings. */
1971 unsigned char ix86_tune_features[X86_TUNE_LAST];
1972
1973 /* Feature tests against the various tunings used to create ix86_tune_features
1974 based on the processor mask. */
1975 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1976 #undef DEF_TUNE
1977 #define DEF_TUNE(tune, name, selector) selector,
1978 #include "x86-tune.def"
1979 #undef DEF_TUNE
1980 };
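/* An illustrative (hypothetical) entry in x86-tune.def of the form
     DEF_TUNE (X86_TUNE_EXAMPLE, "example", m_CORE_ALL | m_GENERIC)
   expands, via the two DEF_TUNE definitions above, into the string "example"
   in ix86_tune_feature_names[] and the selector mask m_CORE_ALL | m_GENERIC
   in initial_ix86_tune_features[]; the latter is tested against
   (1u << ix86_tune) in set_ix86_tune_features below.  */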
1981
1982 /* Feature tests against the various architecture variations. */
1983 unsigned char ix86_arch_features[X86_ARCH_LAST];
1984
1985 /* Feature tests against the various architecture variations, used to create
1986 ix86_arch_features based on the processor mask. */
1987 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
1988 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
1989 ~(m_386 | m_486 | m_PENT | m_K6),
1990
1991 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
1992 ~m_386,
1993
1994 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1995 ~(m_386 | m_486),
1996
1997 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
1998 ~m_386,
1999
2000 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2001 ~m_386,
2002 };
2003
2004 /* If the average insn count for a single function invocation is
2005 lower than this constant, emit fast (but longer) prologue and
2006 epilogue code. */
2007 #define FAST_PROLOGUE_INSN_COUNT 20
2008
2009 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2010 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2011 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2012 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2013
2014 /* Array of the smallest class containing reg number REGNO, indexed by
2015 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2016
2017 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2018 {
2019 /* ax, dx, cx, bx */
2020 AREG, DREG, CREG, BREG,
2021 /* si, di, bp, sp */
2022 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2023 /* FP registers */
2024 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2025 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2026 /* arg pointer */
2027 NON_Q_REGS,
2028 /* flags, fpsr, fpcr, frame */
2029 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2030 /* SSE registers */
2031 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2032 SSE_REGS, SSE_REGS,
2033 /* MMX registers */
2034 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2035 MMX_REGS, MMX_REGS,
2036 /* REX registers */
2037 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2038 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2039 /* SSE REX registers */
2040 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2041 SSE_REGS, SSE_REGS,
2042 /* AVX-512 SSE registers */
2043 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2044 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2045 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2046 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2047 /* Mask registers. */
2048 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2049 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2050 /* MPX bound registers */
2051 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2052 };
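/* For example (assuming the usual hard register numbering from i386.h,
   where AX_REG is 0 and SP_REG is 7), REGNO_REG_CLASS (AX_REG) yields AREG
   and REGNO_REG_CLASS (SP_REG) yields NON_Q_REGS, per the first two rows of
   the table above.  */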
2053
2054 /* The "default" register map used in 32bit mode. */
2055
2056 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2057 {
2058 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2059 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2060 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2061 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2062 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2063 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2064 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2065 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2066 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2067 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2068 101, 102, 103, 104, /* bound registers */
2069 };
2070
2071 /* The "default" register map used in 64bit mode. */
2072
2073 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2074 {
2075 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2076 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2077 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2078 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2079 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2080 8,9,10,11,12,13,14,15, /* extended integer registers */
2081 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2082 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2083 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2084 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2085 126, 127, 128, 129, /* bound registers */
2086 };
2087
2088 /* Define the register numbers to be used in Dwarf debugging information.
2089 The SVR4 reference port C compiler uses the following register numbers
2090 in its Dwarf output code:
2091 0 for %eax (gcc regno = 0)
2092 1 for %ecx (gcc regno = 2)
2093 2 for %edx (gcc regno = 1)
2094 3 for %ebx (gcc regno = 3)
2095 4 for %esp (gcc regno = 7)
2096 5 for %ebp (gcc regno = 6)
2097 6 for %esi (gcc regno = 4)
2098 7 for %edi (gcc regno = 5)
2099 The following three DWARF register numbers are never generated by
2100 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2101 believes these numbers have these meanings.
2102 8 for %eip (no gcc equivalent)
2103 9 for %eflags (gcc regno = 17)
2104 10 for %trapno (no gcc equivalent)
2105 It is not at all clear how we should number the FP stack registers
2106 for the x86 architecture. If the version of SDB on x86/svr4 were
2107 a bit less brain dead with respect to floating-point then we would
2108 have a precedent to follow with respect to DWARF register numbers
2109 for x86 FP registers, but the SDB on x86/svr4 is so completely
2110 broken with respect to FP registers that it is hardly worth thinking
2111 of it as something to strive for compatibility with.
2112 The version of x86/svr4 SDB I have at the moment does (partially)
2113 seem to believe that DWARF register number 11 is associated with
2114 the x86 register %st(0), but that's about all. Higher DWARF
2115 register numbers don't seem to be associated with anything in
2116 particular, and even for DWARF regno 11, SDB only seems to under-
2117 stand that it should say that a variable lives in %st(0) (when
2118 asked via an `=' command) if we said it was in DWARF regno 11,
2119 but SDB still prints garbage when asked for the value of the
2120 variable in question (via a `/' command).
2121 (Also note that the labels SDB prints for various FP stack regs
2122 when doing an `x' command are all wrong.)
2123 Note that these problems generally don't affect the native SVR4
2124 C compiler because it doesn't allow the use of -O with -g and
2125 because when it is *not* optimizing, it allocates a memory
2126 location for each floating-point variable, and the memory
2127 location is what gets described in the DWARF AT_location
2128 attribute for the variable in question.
2129 Regardless of the severe mental illness of the x86/svr4 SDB, we
2130 do something sensible here and we use the following DWARF
2131 register numbers. Note that these are all stack-top-relative
2132 numbers.
2133 11 for %st(0) (gcc regno = 8)
2134 12 for %st(1) (gcc regno = 9)
2135 13 for %st(2) (gcc regno = 10)
2136 14 for %st(3) (gcc regno = 11)
2137 15 for %st(4) (gcc regno = 12)
2138 16 for %st(5) (gcc regno = 13)
2139 17 for %st(6) (gcc regno = 14)
2140 18 for %st(7) (gcc regno = 15)
2141 */
2142 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2143 {
2144 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2145 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2146 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2147 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2148 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2149 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2150 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2151 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2152 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2153 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2154 -1, -1, -1, -1, /* bound registers */
2155 };
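/* As a concrete reading of the maps above: %ebp is gcc regno 6 and SVR4
   DWARF number 5, hence svr4_dbx_register_map[6] == 5; the "default" 32-bit
   map instead has dbx_register_map[6] == 4 for the same register.  */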
2156
2157 /* Define parameter passing and return registers. */
2158
2159 static int const x86_64_int_parameter_registers[6] =
2160 {
2161 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2162 };
2163
2164 static int const x86_64_ms_abi_int_parameter_registers[4] =
2165 {
2166 CX_REG, DX_REG, R8_REG, R9_REG
2167 };
2168
2169 static int const x86_64_int_return_registers[4] =
2170 {
2171 AX_REG, DX_REG, DI_REG, SI_REG
2172 };
2173
2174 /* Additional registers that are clobbered by SYSV calls. */
2175
2176 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2177 {
2178 SI_REG, DI_REG,
2179 XMM6_REG, XMM7_REG,
2180 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2181 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2182 };
2183
2184 /* Define the structure for the machine field in struct function. */
2185
2186 struct GTY(()) stack_local_entry {
2187 unsigned short mode;
2188 unsigned short n;
2189 rtx rtl;
2190 struct stack_local_entry *next;
2191 };
2192
2193 /* Structure describing stack frame layout.
2194 Stack grows downward:
2195
2196 [arguments]
2197 <- ARG_POINTER
2198 saved pc
2199
2200 saved static chain if ix86_static_chain_on_stack
2201
2202 saved frame pointer if frame_pointer_needed
2203 <- HARD_FRAME_POINTER
2204 [saved regs]
2205 <- regs_save_offset
2206 [padding0]
2207
2208 [saved SSE regs]
2209 <- sse_regs_save_offset
2210 [padding1] |
2211 | <- FRAME_POINTER
2212 [va_arg registers] |
2213 |
2214 [frame] |
2215 |
2216 [padding2] | = to_allocate
2217 <- STACK_POINTER
2218 */
2219 struct ix86_frame
2220 {
2221 int nsseregs;
2222 int nregs;
2223 int va_arg_size;
2224 int red_zone_size;
2225 int outgoing_arguments_size;
2226
2227 /* The offsets relative to ARG_POINTER. */
2228 HOST_WIDE_INT frame_pointer_offset;
2229 HOST_WIDE_INT hard_frame_pointer_offset;
2230 HOST_WIDE_INT stack_pointer_offset;
2231 HOST_WIDE_INT hfp_save_offset;
2232 HOST_WIDE_INT reg_save_offset;
2233 HOST_WIDE_INT sse_reg_save_offset;
2234
2235 /* When save_regs_using_mov is set, emit prologue using
2236 move instead of push instructions. */
2237 bool save_regs_using_mov;
2238 };
2239
2240 /* Which cpu are we scheduling for. */
2241 enum attr_cpu ix86_schedule;
2242
2243 /* Which cpu are we optimizing for. */
2244 enum processor_type ix86_tune;
2245
2246 /* Which instruction set architecture to use. */
2247 enum processor_type ix86_arch;
2248
2249 /* True if processor has SSE prefetch instruction. */
2250 unsigned char x86_prefetch_sse;
2251
2252 /* -mstackrealign option */
2253 static const char ix86_force_align_arg_pointer_string[]
2254 = "force_align_arg_pointer";
2255
2256 static rtx (*ix86_gen_leave) (void);
2257 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2258 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2259 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2260 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2261 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2262 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2263 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2264 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2265 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2266 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2267 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2268
2269 /* Preferred alignment for stack boundary in bits. */
2270 unsigned int ix86_preferred_stack_boundary;
2271
2272 /* Alignment for incoming stack boundary in bits specified at
2273 command line. */
2274 static unsigned int ix86_user_incoming_stack_boundary;
2275
2276 /* Default alignment for incoming stack boundary in bits. */
2277 static unsigned int ix86_default_incoming_stack_boundary;
2278
2279 /* Alignment for incoming stack boundary in bits. */
2280 unsigned int ix86_incoming_stack_boundary;
2281
2282 /* Calling abi specific va_list type nodes. */
2283 static GTY(()) tree sysv_va_list_type_node;
2284 static GTY(()) tree ms_va_list_type_node;
2285
2286 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2287 char internal_label_prefix[16];
2288 int internal_label_prefix_len;
2289
2290 /* Fence to use after loop using movnt. */
2291 tree x86_mfence;
2292
2293 /* Register class used for passing a given 64-bit part of the argument.
2294 These represent classes as documented by the psABI, with the exception
2295 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2296 uses an SFmode or DFmode move instead of DImode to avoid reformatting penalties.
2297 
2298 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2299 whenever possible (the upper half then contains only padding). */
2300 enum x86_64_reg_class
2301 {
2302 X86_64_NO_CLASS,
2303 X86_64_INTEGER_CLASS,
2304 X86_64_INTEGERSI_CLASS,
2305 X86_64_SSE_CLASS,
2306 X86_64_SSESF_CLASS,
2307 X86_64_SSEDF_CLASS,
2308 X86_64_SSEUP_CLASS,
2309 X86_64_X87_CLASS,
2310 X86_64_X87UP_CLASS,
2311 X86_64_COMPLEX_X87_CLASS,
2312 X86_64_MEMORY_CLASS
2313 };
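/* Illustrative sketch (assuming the argument-classification code later in
   this file follows the psABI): a double occupying an eightbyte of its own
   is classified X86_64_SSEDF_CLASS, a 32-bit int X86_64_INTEGERSI_CLASS,
   and an argument too large or too irregular to go in registers ends up as
   X86_64_MEMORY_CLASS and is passed on the stack.  */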
2314
2315 #define MAX_CLASSES 4
2316
2317 /* Table of constants used by fldpi, fldln2, etc.... */
2318 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2319 static bool ext_80387_constants_init = 0;
2320
2321 \f
2322 static struct machine_function * ix86_init_machine_status (void);
2323 static rtx ix86_function_value (const_tree, const_tree, bool);
2324 static bool ix86_function_value_regno_p (const unsigned int);
2325 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2326 const_tree);
2327 static rtx ix86_static_chain (const_tree, bool);
2328 static int ix86_function_regparm (const_tree, const_tree);
2329 static void ix86_compute_frame_layout (struct ix86_frame *);
2330 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2331 rtx, rtx, int);
2332 static void ix86_add_new_builtins (HOST_WIDE_INT);
2333 static tree ix86_canonical_va_list_type (tree);
2334 static void predict_jump (int);
2335 static unsigned int split_stack_prologue_scratch_regno (void);
2336 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2337
2338 enum ix86_function_specific_strings
2339 {
2340 IX86_FUNCTION_SPECIFIC_ARCH,
2341 IX86_FUNCTION_SPECIFIC_TUNE,
2342 IX86_FUNCTION_SPECIFIC_MAX
2343 };
2344
2345 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2346 const char *, enum fpmath_unit, bool);
2347 static void ix86_function_specific_save (struct cl_target_option *,
2348 struct gcc_options *opts);
2349 static void ix86_function_specific_restore (struct gcc_options *opts,
2350 struct cl_target_option *);
2351 static void ix86_function_specific_print (FILE *, int,
2352 struct cl_target_option *);
2353 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2354 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2355 struct gcc_options *,
2356 struct gcc_options *,
2357 struct gcc_options *);
2358 static bool ix86_can_inline_p (tree, tree);
2359 static void ix86_set_current_function (tree);
2360 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2361
2362 static enum calling_abi ix86_function_abi (const_tree);
2363
2364 \f
2365 #ifndef SUBTARGET32_DEFAULT_CPU
2366 #define SUBTARGET32_DEFAULT_CPU "i386"
2367 #endif
2368
2369 /* Whether -mtune= or -march= were specified */
2370 static int ix86_tune_defaulted;
2371 static int ix86_arch_specified;
2372
2373 /* Vectorization library interface and handlers. */
2374 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2375
2376 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2377 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2378
2379 /* Processor target table, indexed by processor number */
2380 struct ptt
2381 {
2382 const struct processor_costs *cost; /* Processor costs */
2383 const int align_loop; /* Default alignments. */
2384 const int align_loop_max_skip;
2385 const int align_jump;
2386 const int align_jump_max_skip;
2387 const int align_func;
2388 };
2389
2390 static const struct ptt processor_target_table[PROCESSOR_max] =
2391 {
2392 {&i386_cost, 4, 3, 4, 3, 4},
2393 {&i486_cost, 16, 15, 16, 15, 16},
2394 {&pentium_cost, 16, 7, 16, 7, 16},
2395 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2396 {&geode_cost, 0, 0, 0, 0, 0},
2397 {&k6_cost, 32, 7, 32, 7, 32},
2398 {&athlon_cost, 16, 7, 16, 7, 16},
2399 {&pentium4_cost, 0, 0, 0, 0, 0},
2400 {&k8_cost, 16, 7, 16, 7, 16},
2401 {&nocona_cost, 0, 0, 0, 0, 0},
2402 /* Core 2 */
2403 {&core_cost, 16, 10, 16, 10, 16},
2404 /* Core i7 */
2405 {&core_cost, 16, 10, 16, 10, 16},
2406 /* Core i7 avx */
2407 {&core_cost, 16, 10, 16, 10, 16},
2408 /* Core avx2 */
2409 {&core_cost, 16, 10, 16, 10, 16},
2410 {&generic_cost, 16, 10, 16, 10, 16},
2411 {&amdfam10_cost, 32, 24, 32, 7, 32},
2412 {&bdver1_cost, 16, 10, 16, 7, 11},
2413 {&bdver2_cost, 16, 10, 16, 7, 11},
2414 {&bdver3_cost, 16, 10, 16, 7, 11},
2415 {&bdver4_cost, 16, 10, 16, 7, 11},
2416 {&btver1_cost, 16, 10, 16, 7, 11},
2417 {&btver2_cost, 16, 10, 16, 7, 11},
2418 {&atom_cost, 16, 15, 16, 7, 16},
2419 {&slm_cost, 16, 15, 16, 7, 16}
2420 };
2421
2422 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2423 {
2424 "generic",
2425 "i386",
2426 "i486",
2427 "pentium",
2428 "pentium-mmx",
2429 "pentiumpro",
2430 "pentium2",
2431 "pentium3",
2432 "pentium4",
2433 "pentium-m",
2434 "prescott",
2435 "nocona",
2436 "core2",
2437 "corei7",
2438 "corei7-avx",
2439 "core-avx2",
2440 "atom",
2441 "slm",
2442 "geode",
2443 "k6",
2444 "k6-2",
2445 "k6-3",
2446 "athlon",
2447 "athlon-4",
2448 "k8",
2449 "amdfam10",
2450 "bdver1",
2451 "bdver2",
2452 "bdver3",
2453 "bdver4",
2454 "btver1",
2455 "btver2"
2456 };
2457 \f
2458 static bool
2459 gate_insert_vzeroupper (void)
2460 {
2461 return TARGET_AVX && !TARGET_AVX512F && TARGET_VZEROUPPER;
2462 }
2463
2464 static unsigned int
2465 rest_of_handle_insert_vzeroupper (void)
2466 {
2467 int i;
2468
2469 /* vzeroupper instructions are inserted immediately after reload to
2470 account for possible spills from 256-bit registers. The pass
2471 reuses the mode-switching infrastructure by re-running the mode
2472 insertion pass, so disable entities that have already been processed. */
2473 for (i = 0; i < MAX_386_ENTITIES; i++)
2474 ix86_optimize_mode_switching[i] = 0;
2475
2476 ix86_optimize_mode_switching[AVX_U128] = 1;
2477
2478 /* Call optimize_mode_switching. */
2479 g->get_passes ()->execute_pass_mode_switching ();
2480 return 0;
2481 }
2482
2483 namespace {
2484
2485 const pass_data pass_data_insert_vzeroupper =
2486 {
2487 RTL_PASS, /* type */
2488 "vzeroupper", /* name */
2489 OPTGROUP_NONE, /* optinfo_flags */
2490 true, /* has_gate */
2491 true, /* has_execute */
2492 TV_NONE, /* tv_id */
2493 0, /* properties_required */
2494 0, /* properties_provided */
2495 0, /* properties_destroyed */
2496 0, /* todo_flags_start */
2497 ( TODO_df_finish | TODO_verify_rtl_sharing | 0 ), /* todo_flags_finish */
2498 };
2499
2500 class pass_insert_vzeroupper : public rtl_opt_pass
2501 {
2502 public:
2503 pass_insert_vzeroupper(gcc::context *ctxt)
2504 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2505 {}
2506
2507 /* opt_pass methods: */
2508 bool gate () { return gate_insert_vzeroupper (); }
2509 unsigned int execute () { return rest_of_handle_insert_vzeroupper (); }
2510
2511 }; // class pass_insert_vzeroupper
2512
2513 } // anon namespace
2514
2515 rtl_opt_pass *
2516 make_pass_insert_vzeroupper (gcc::context *ctxt)
2517 {
2518 return new pass_insert_vzeroupper (ctxt);
2519 }
2520
2521 /* Return true if a red-zone is in use. */
2522
2523 static inline bool
2524 ix86_using_red_zone (void)
2525 {
2526 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2527 }
2528 \f
2529 /* Return a string that documents the current -m options. The caller is
2530 responsible for freeing the string. */
2531
2532 static char *
2533 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2534 const char *tune, enum fpmath_unit fpmath,
2535 bool add_nl_p)
2536 {
2537 struct ix86_target_opts
2538 {
2539 const char *option; /* option string */
2540 HOST_WIDE_INT mask; /* isa mask options */
2541 };
2542
2543 /* This table is ordered so that options like -msse4.2 that imply
2544 other options are listed and matched first. */
2545 static struct ix86_target_opts isa_opts[] =
2546 {
2547 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2548 { "-mfma", OPTION_MASK_ISA_FMA },
2549 { "-mxop", OPTION_MASK_ISA_XOP },
2550 { "-mlwp", OPTION_MASK_ISA_LWP },
2551 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2552 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2553 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2554 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2555 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2556 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2557 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2558 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2559 { "-msse3", OPTION_MASK_ISA_SSE3 },
2560 { "-msse2", OPTION_MASK_ISA_SSE2 },
2561 { "-msse", OPTION_MASK_ISA_SSE },
2562 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2563 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2564 { "-mmmx", OPTION_MASK_ISA_MMX },
2565 { "-mabm", OPTION_MASK_ISA_ABM },
2566 { "-mbmi", OPTION_MASK_ISA_BMI },
2567 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2568 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2569 { "-mhle", OPTION_MASK_ISA_HLE },
2570 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2571 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2572 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2573 { "-madx", OPTION_MASK_ISA_ADX },
2574 { "-mtbm", OPTION_MASK_ISA_TBM },
2575 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2576 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2577 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2578 { "-maes", OPTION_MASK_ISA_AES },
2579 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2580 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2581 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2582 { "-mf16c", OPTION_MASK_ISA_F16C },
2583 { "-mrtm", OPTION_MASK_ISA_RTM },
2584 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2585 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2586 { "-mmpx", OPTION_MASK_ISA_MPX },
2587 };
2588
2589 /* Flag options. */
2590 static struct ix86_target_opts flag_opts[] =
2591 {
2592 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2593 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2594 { "-m80387", MASK_80387 },
2595 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2596 { "-malign-double", MASK_ALIGN_DOUBLE },
2597 { "-mcld", MASK_CLD },
2598 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2599 { "-mieee-fp", MASK_IEEE_FP },
2600 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2601 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2602 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2603 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2604 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2605 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2606 { "-mno-red-zone", MASK_NO_RED_ZONE },
2607 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2608 { "-mrecip", MASK_RECIP },
2609 { "-mrtd", MASK_RTD },
2610 { "-msseregparm", MASK_SSEREGPARM },
2611 { "-mstack-arg-probe", MASK_STACK_PROBE },
2612 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2613 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2614 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2615 { "-mvzeroupper", MASK_VZEROUPPER },
2616 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2617 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2618 { "-mprefer-avx128", MASK_PREFER_AVX128},
2619 };
2620
2621 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2622
2623 char isa_other[40];
2624 char target_other[40];
2625 unsigned num = 0;
2626 unsigned i, j;
2627 char *ret;
2628 char *ptr;
2629 size_t len;
2630 size_t line_len;
2631 size_t sep_len;
2632 const char *abi;
2633
2634 memset (opts, '\0', sizeof (opts));
2635
2636 /* Add -march= option. */
2637 if (arch)
2638 {
2639 opts[num][0] = "-march=";
2640 opts[num++][1] = arch;
2641 }
2642
2643 /* Add -mtune= option. */
2644 if (tune)
2645 {
2646 opts[num][0] = "-mtune=";
2647 opts[num++][1] = tune;
2648 }
2649
2650 /* Add -m32/-m64/-mx32. */
2651 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2652 {
2653 if ((isa & OPTION_MASK_ABI_64) != 0)
2654 abi = "-m64";
2655 else
2656 abi = "-mx32";
2657 isa &= ~ (OPTION_MASK_ISA_64BIT
2658 | OPTION_MASK_ABI_64
2659 | OPTION_MASK_ABI_X32);
2660 }
2661 else
2662 abi = "-m32";
2663 opts[num++][0] = abi;
2664
2665 /* Pick out the options in isa options. */
2666 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2667 {
2668 if ((isa & isa_opts[i].mask) != 0)
2669 {
2670 opts[num++][0] = isa_opts[i].option;
2671 isa &= ~ isa_opts[i].mask;
2672 }
2673 }
2674
2675 if (isa && add_nl_p)
2676 {
2677 opts[num++][0] = isa_other;
2678 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2679 isa);
2680 }
2681
2682 /* Add flag options. */
2683 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2684 {
2685 if ((flags & flag_opts[i].mask) != 0)
2686 {
2687 opts[num++][0] = flag_opts[i].option;
2688 flags &= ~ flag_opts[i].mask;
2689 }
2690 }
2691
2692 if (flags && add_nl_p)
2693 {
2694 opts[num++][0] = target_other;
2695 sprintf (target_other, "(other flags: %#x)", flags);
2696 }
2697
2698 /* Add -fpmath= option. */
2699 if (fpmath)
2700 {
2701 opts[num][0] = "-mfpmath=";
2702 switch ((int) fpmath)
2703 {
2704 case FPMATH_387:
2705 opts[num++][1] = "387";
2706 break;
2707
2708 case FPMATH_SSE:
2709 opts[num++][1] = "sse";
2710 break;
2711
2712 case FPMATH_387 | FPMATH_SSE:
2713 opts[num++][1] = "sse+387";
2714 break;
2715
2716 default:
2717 gcc_unreachable ();
2718 }
2719 }
2720
2721 /* Any options? */
2722 if (num == 0)
2723 return NULL;
2724
2725 gcc_assert (num < ARRAY_SIZE (opts));
2726
2727 /* Size the string. */
2728 len = 0;
2729 sep_len = (add_nl_p) ? 3 : 1;
2730 for (i = 0; i < num; i++)
2731 {
2732 len += sep_len;
2733 for (j = 0; j < 2; j++)
2734 if (opts[i][j])
2735 len += strlen (opts[i][j]);
2736 }
2737
2738 /* Build the string. */
2739 ret = ptr = (char *) xmalloc (len);
2740 line_len = 0;
2741
2742 for (i = 0; i < num; i++)
2743 {
2744 size_t len2[2];
2745
2746 for (j = 0; j < 2; j++)
2747 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2748
2749 if (i != 0)
2750 {
2751 *ptr++ = ' ';
2752 line_len++;
2753
2754 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2755 {
2756 *ptr++ = '\\';
2757 *ptr++ = '\n';
2758 line_len = 0;
2759 }
2760 }
2761
2762 for (j = 0; j < 2; j++)
2763 if (opts[i][j])
2764 {
2765 memcpy (ptr, opts[i][j], len2[j]);
2766 ptr += len2[j];
2767 line_len += len2[j];
2768 }
2769 }
2770
2771 *ptr = '\0';
2772 gcc_assert (ret + len >= ptr);
2773
2774 return ret;
2775 }
2776
2777 /* Return true if profiling code should be emitted before the
2778 prologue, false otherwise.
2779 Note: on x86 this is the case when -mfentry ("hotfix"-style profiling) is in use. */
2780 static bool
2781 ix86_profile_before_prologue (void)
2782 {
2783 return flag_fentry != 0;
2784 }
2785
2786 /* Function that is callable from the debugger to print the current
2787 options. */
2788 void ATTRIBUTE_UNUSED
2789 ix86_debug_options (void)
2790 {
2791 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2792 ix86_arch_string, ix86_tune_string,
2793 ix86_fpmath, true);
2794
2795 if (opts)
2796 {
2797 fprintf (stderr, "%s\n\n", opts);
2798 free (opts);
2799 }
2800 else
2801 fputs ("<no options>\n\n", stderr);
2802
2803 return;
2804 }
2805
2806 static const char *stringop_alg_names[] = {
2807 #define DEF_ENUM
2808 #define DEF_ALG(alg, name) #name,
2809 #include "stringop.def"
2810 #undef DEF_ENUM
2811 #undef DEF_ALG
2812 };
2813
2814 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
2815 The string is of the following form (or a comma-separated list of such entries):
2816
2817 strategy_alg:max_size:[align|noalign]
2818
2819 where the full size range for the strategy is either [0, max_size] or
2820 [min_size, max_size], in which min_size is the max_size + 1 of the
2821 preceding range. The last size range must have max_size == -1.
2822
2823 Examples:
2824
2825 1.
2826 -mmemcpy-strategy=libcall:-1:noalign
2827
2828 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
2829
2830
2831 2.
2832 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
2833
2834 This is to tell the compiler to use the following strategy for memset
2835 1) when the expected size is between [1, 16], use rep_8byte strategy;
2836 2) when the size is between [17, 2048], use vector_loop;
2837 3) when the size is > 2048, use libcall. */
2838
2839 struct stringop_size_range
2840 {
2841 int max;
2842 stringop_alg alg;
2843 bool noalign;
2844 };
2845
2846 static void
2847 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
2848 {
2849 const struct stringop_algs *default_algs;
2850 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
2851 char *curr_range_str, *next_range_str;
2852 int i = 0, n = 0;
2853
2854 if (is_memset)
2855 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
2856 else
2857 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
2858
2859 curr_range_str = strategy_str;
2860
2861 do
2862 {
2863 int maxs;
2864 stringop_alg alg;
2865 char alg_name[128];
2866 char align[16];
2867 next_range_str = strchr (curr_range_str, ',');
2868 if (next_range_str)
2869 *next_range_str++ = '\0';
2870
2871 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
2872 alg_name, &maxs, align))
2873 {
2874 error ("wrong arg %s to option %s", curr_range_str,
2875 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2876 return;
2877 }
2878
2879 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
2880 {
2881 error ("size ranges of option %s should be increasing",
2882 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2883 return;
2884 }
2885
2886 for (i = 0; i < last_alg; i++)
2887 {
2888 if (!strcmp (alg_name, stringop_alg_names[i]))
2889 {
2890 alg = (stringop_alg) i;
2891 break;
2892 }
2893 }
2894
2895 if (i == last_alg)
2896 {
2897 error ("wrong stringop strategy name %s specified for option %s",
2898 alg_name,
2899 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2900 return;
2901 }
2902
2903 input_ranges[n].max = maxs;
2904 input_ranges[n].alg = alg;
2905 if (!strcmp (align, "align"))
2906 input_ranges[n].noalign = false;
2907 else if (!strcmp (align, "noalign"))
2908 input_ranges[n].noalign = true;
2909 else
2910 {
2911 error ("unknown alignment %s specified for option %s",
2912 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2913 return;
2914 }
2915 n++;
2916 curr_range_str = next_range_str;
2917 }
2918 while (curr_range_str);
2919
2920 if (input_ranges[n - 1].max != -1)
2921 {
2922 error ("the max value for the last size range should be -1"
2923 " for option %s",
2924 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2925 return;
2926 }
2927
2928 if (n > MAX_STRINGOP_ALGS)
2929 {
2930 error ("too many size ranges specified in option %s",
2931 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2932 return;
2933 }
2934
2935 /* Now override the default algs array. */
2936 for (i = 0; i < n; i++)
2937 {
2938 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
2939 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
2940 = input_ranges[i].alg;
2941 *const_cast<int *>(&default_algs->size[i].noalign)
2942 = input_ranges[i].noalign;
2943 }
2944 }
2945
2946 \f
2947 /* Parse the -mtune-ctrl= option. When DUMP is true,
2948 print the features that are explicitly set. */
2949
2950 static void
2951 parse_mtune_ctrl_str (bool dump)
2952 {
2953 if (!ix86_tune_ctrl_string)
2954 return;
2955
2956 char *next_feature_string = NULL;
2957 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
2958 char *orig = curr_feature_string;
2959 int i;
2960 do
2961 {
2962 bool clear = false;
2963
2964 next_feature_string = strchr (curr_feature_string, ',');
2965 if (next_feature_string)
2966 *next_feature_string++ = '\0';
2967 if (*curr_feature_string == '^')
2968 {
2969 curr_feature_string++;
2970 clear = true;
2971 }
2972 for (i = 0; i < X86_TUNE_LAST; i++)
2973 {
2974 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
2975 {
2976 ix86_tune_features[i] = !clear;
2977 if (dump)
2978 fprintf (stderr, "Explicitly %s feature %s\n",
2979 clear ? "clear" : "set", ix86_tune_feature_names[i]);
2980 break;
2981 }
2982 }
2983 if (i == X86_TUNE_LAST)
2984 error ("Unknown parameter to option -mtune-ctrl: %s",
2985 clear ? curr_feature_string - 1 : curr_feature_string);
2986 curr_feature_string = next_feature_string;
2987 }
2988 while (curr_feature_string);
2989 free (orig);
2990 }
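/* Illustrative usage (the feature names here are placeholders; the real
   names come from x86-tune.def via ix86_tune_feature_names):
     -mtune-ctrl=feature_a,^feature_b
   sets feature_a and clears feature_b, exactly as parsed above: entries are
   comma separated and a leading '^' requests clearing instead of setting.  */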
2991
2992 /* Helper function to set ix86_tune_features. IX86_TUNE is the
2993 processor type. */
2994
2995 static void
2996 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
2997 {
2998 unsigned int ix86_tune_mask = 1u << ix86_tune;
2999 int i;
3000
3001 for (i = 0; i < X86_TUNE_LAST; ++i)
3002 {
3003 if (ix86_tune_no_default)
3004 ix86_tune_features[i] = 0;
3005 else
3006 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3007 }
3008
3009 if (dump)
3010 {
3011 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3012 for (i = 0; i < X86_TUNE_LAST; i++)
3013 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3014 ix86_tune_features[i] ? "on" : "off");
3015 }
3016
3017 parse_mtune_ctrl_str (dump);
3018 }
3019
3020
3021 /* Override various settings based on options. If MAIN_ARGS_P, the
3022 options are from the command line, otherwise they are from
3023 attributes. */
3024
3025 static void
3026 ix86_option_override_internal (bool main_args_p,
3027 struct gcc_options *opts,
3028 struct gcc_options *opts_set)
3029 {
3030 int i;
3031 unsigned int ix86_arch_mask;
3032 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3033 const char *prefix;
3034 const char *suffix;
3035 const char *sw;
3036
3037 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3038 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3039 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3040 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3041 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3042 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3043 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3044 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3045 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3046 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3047 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3048 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3049 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3050 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3051 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3052 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3053 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3054 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3055 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3056 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3057 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3058 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3059 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3060 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3061 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3062 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3063 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3064 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3065 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3066 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3067 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3068 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3069 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3070 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3071 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3072 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3073 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3074 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3075 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3076 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
3077 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
3078 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
3079 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
3080 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
3081 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
3082
3083 /* If this reaches 64, we need to widen the struct pta flags field below. */
3084
3085 static struct pta
3086 {
3087 const char *const name; /* processor name or nickname. */
3088 const enum processor_type processor;
3089 const enum attr_cpu schedule;
3090 const unsigned HOST_WIDE_INT flags;
3091 }
3092 const processor_alias_table[] =
3093 {
3094 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3095 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3096 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3097 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3098 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3099 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3100 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3101 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3102 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3103 PTA_MMX | PTA_SSE | PTA_FXSR},
3104 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3105 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3106 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3107 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3108 PTA_MMX | PTA_SSE | PTA_FXSR},
3109 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3110 PTA_MMX | PTA_SSE | PTA_FXSR},
3111 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3112 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3113 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3114 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3115 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3116 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3117 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3118 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3119 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3120 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3121 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3122 {"core2", PROCESSOR_CORE2, CPU_CORE2,
3123 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3124 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
3125 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
3126 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
3127 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR},
3128 {"corei7-avx", PROCESSOR_COREI7_AVX, CPU_COREI7,
3129 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3130 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3131 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
3132 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3133 {"core-avx-i", PROCESSOR_COREI7_AVX, CPU_COREI7,
3134 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3135 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3136 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3137 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3138 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
3139 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3140 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3141 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3142 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3143 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
3144 | PTA_XSAVEOPT},
3145 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3146 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3147 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
3148 {"slm", PROCESSOR_SLM, CPU_SLM,
3149 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
3150 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_MOVBE
3151 | PTA_FXSR},
3152 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3153 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3154 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3155 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3156 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3157 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3158 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3159 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3160 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3161 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3162 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3163 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3164 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3165 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3166 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3167 {"x86-64", PROCESSOR_K8, CPU_K8,
3168 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3169 {"k8", PROCESSOR_K8, CPU_K8,
3170 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3171 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3172 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3173 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3174 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3175 {"opteron", PROCESSOR_K8, CPU_K8,
3176 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3177 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3178 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3179 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3180 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3181 {"athlon64", PROCESSOR_K8, CPU_K8,
3182 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3183 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3184 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3185 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3186 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3187 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3188 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3189 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3190 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3191 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3192 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3193 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3194 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3195 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3196 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3197 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3198 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3199 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3200 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3201 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3202 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3203 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3204 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3205 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3206 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3207 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3208 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3209 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3210 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3211 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3212 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3213 | PTA_XSAVEOPT | PTA_FSGSBASE},
3214 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3215 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3216 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3217 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3218 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3219 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3220 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE},
3221 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3222 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3223 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3224 | PTA_FXSR | PTA_XSAVE},
3225 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3226 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3227 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3228 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3229 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3230 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3231
3232 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3233 PTA_64BIT
3234 | PTA_HLE /* flags are only used for -march switch. */ },
3235 };
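/* Illustration of how this table is consumed by the lookup loops further
   below: -march=core2, for instance, selects PROCESSOR_CORE2 / CPU_CORE2 for
   scheduling and enables each listed PTA_* capability (64BIT, MMX, SSE
   through SSSE3, CX16, FXSR) unless the corresponding ISA flag was set
   explicitly on the command line.  */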
3236
3237 /* -mrecip options. */
3238 static struct
3239 {
3240 const char *string; /* option name */
3241 unsigned int mask; /* mask bits to set */
3242 }
3243 const recip_options[] =
3244 {
3245 { "all", RECIP_MASK_ALL },
3246 { "none", RECIP_MASK_NONE },
3247 { "div", RECIP_MASK_DIV },
3248 { "sqrt", RECIP_MASK_SQRT },
3249 { "vec-div", RECIP_MASK_VEC_DIV },
3250 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3251 };
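/* Illustrative example, matching the parsing loop later in this function:
   "-mrecip=all,!sqrt" first sets every RECIP_MASK_* bit and then clears
   RECIP_MASK_SQRT; entries are comma separated, "default" behaves like
   "all", and a leading '!' inverts an entry.  */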
3252
3253 int const pta_size = ARRAY_SIZE (processor_alias_table);
3254
3255 /* Set up prefix/suffix so the error messages refer to either the command
3256 line argument, or the attribute(target). */
3257 if (main_args_p)
3258 {
3259 prefix = "-m";
3260 suffix = "";
3261 sw = "switch";
3262 }
3263 else
3264 {
3265 prefix = "option(\"";
3266 suffix = "\")";
3267 sw = "attribute";
3268 }
3269
3270 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3271 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3272 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3273 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3274 #ifdef TARGET_BI_ARCH
3275 else
3276 {
3277 #if TARGET_BI_ARCH == 1
3278 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3279 is on and OPTION_MASK_ABI_X32 is off. We turn off
3280 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3281 -mx32. */
3282 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3283 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3284 #else
3285 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3286 on and OPTION_MASK_ABI_64 is off. We turn off
3287 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3288 -m64. */
3289 if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3290 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3291 #endif
3292 }
3293 #endif
3294
3295 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3296 {
3297 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3298 OPTION_MASK_ABI_64 for TARGET_X32. */
3299 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3300 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3301 }
3302 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3303 {
3304 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3305 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3306 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3307 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3308 }
3309
3310 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3311 SUBTARGET_OVERRIDE_OPTIONS;
3312 #endif
3313
3314 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3315 SUBSUBTARGET_OVERRIDE_OPTIONS;
3316 #endif
3317
3318 /* -fPIC is the default for x86_64. */
3319 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3320 opts->x_flag_pic = 2;
3321
3322 /* Need to check -mtune=generic first. */
3323 if (opts->x_ix86_tune_string)
3324 {
3325 if (!strcmp (opts->x_ix86_tune_string, "generic")
3326 || !strcmp (opts->x_ix86_tune_string, "i686")
3327 /* As special support for cross compilers we read -mtune=native
3328 as -mtune=generic. With native compilers we won't see the
3329 -mtune=native, as it was changed by the driver. */
3330 || !strcmp (opts->x_ix86_tune_string, "native"))
3331 {
3332 opts->x_ix86_tune_string = "generic";
3333 }
3334 /* If this call is for setting the option attribute, allow the
3335 generic that was previously set. */
3336 else if (!main_args_p
3337 && !strcmp (opts->x_ix86_tune_string, "generic"))
3338 ;
3339 else if (!strncmp (opts->x_ix86_tune_string, "generic", 7))
3340 error ("bad value (%s) for %stune=%s %s",
3341 opts->x_ix86_tune_string, prefix, suffix, sw);
3342 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3343 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3344 "%stune=k8%s or %stune=generic%s instead as appropriate",
3345 prefix, suffix, prefix, suffix, prefix, suffix);
3346 }
3347 else
3348 {
3349 if (opts->x_ix86_arch_string)
3350 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3351 if (!opts->x_ix86_tune_string)
3352 {
3353 opts->x_ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3354 ix86_tune_defaulted = 1;
3355 }
3356
3357 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3358 or defaulted. We need to use a sensible tune option. */
3359 if (!strcmp (opts->x_ix86_tune_string, "generic")
3360 || !strcmp (opts->x_ix86_tune_string, "x86-64")
3361 || !strcmp (opts->x_ix86_tune_string, "i686"))
3362 {
3363 opts->x_ix86_tune_string = "generic";
3364 }
3365 }
3366
3367 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3368 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3369 {
3370 /* rep; movq isn't available in 32-bit code. */
3371 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3372 opts->x_ix86_stringop_alg = no_stringop;
3373 }
3374
3375 if (!opts->x_ix86_arch_string)
3376 opts->x_ix86_arch_string
3377 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3378 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3379 else
3380 ix86_arch_specified = 1;
3381
3382 if (opts_set->x_ix86_pmode)
3383 {
3384 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3385 && opts->x_ix86_pmode == PMODE_SI)
3386 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3387 && opts->x_ix86_pmode == PMODE_DI))
3388 error ("address mode %qs not supported in the %s bit mode",
3389 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3390 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3391 }
3392 else
3393 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3394 ? PMODE_DI : PMODE_SI;
3395
3396 if (!opts_set->x_ix86_abi)
3397 opts->x_ix86_abi = DEFAULT_ABI;
3398
3399 /* For targets using the MS ABI, enable ms-extensions unless
3400 explicitly turned off. For non-MS ABIs we turn this
3401 option off. */
3402 if (!opts_set->x_flag_ms_extensions)
3403 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3404
3405 if (opts_set->x_ix86_cmodel)
3406 {
3407 switch (opts->x_ix86_cmodel)
3408 {
3409 case CM_SMALL:
3410 case CM_SMALL_PIC:
3411 if (opts->x_flag_pic)
3412 opts->x_ix86_cmodel = CM_SMALL_PIC;
3413 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3414 error ("code model %qs not supported in the %s bit mode",
3415 "small", "32");
3416 break;
3417
3418 case CM_MEDIUM:
3419 case CM_MEDIUM_PIC:
3420 if (opts->x_flag_pic)
3421 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3422 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3423 error ("code model %qs not supported in the %s bit mode",
3424 "medium", "32");
3425 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3426 error ("code model %qs not supported in x32 mode",
3427 "medium");
3428 break;
3429
3430 case CM_LARGE:
3431 case CM_LARGE_PIC:
3432 if (opts->x_flag_pic)
3433 opts->x_ix86_cmodel = CM_LARGE_PIC;
3434 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3435 error ("code model %qs not supported in the %s bit mode",
3436 "large", "32");
3437 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3438 error ("code model %qs not supported in x32 mode",
3439 "large");
3440 break;
3441
3442 case CM_32:
3443 if (opts->x_flag_pic)
3444 error ("code model %s does not support PIC mode", "32");
3445 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3446 error ("code model %qs not supported in the %s bit mode",
3447 "32", "64");
3448 break;
3449
3450 case CM_KERNEL:
3451 if (opts->x_flag_pic)
3452 {
3453 error ("code model %s does not support PIC mode", "kernel");
3454 opts->x_ix86_cmodel = CM_32;
3455 }
3456 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3457 error ("code model %qs not supported in the %s bit mode",
3458 "kernel", "32");
3459 break;
3460
3461 default:
3462 gcc_unreachable ();
3463 }
3464 }
3465 else
3466 {
3467 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3468 use of rip-relative addressing. This eliminates fixups that
3469 would otherwise be needed if this object is to be placed in a
3470 DLL, and is essentially just as efficient as direct addressing. */
3471 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3472 && (TARGET_RDOS || TARGET_PECOFF))
3473 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3474 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3475 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3476 else
3477 opts->x_ix86_cmodel = CM_32;
3478 }
3479 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3480 {
3481 error ("-masm=intel not supported in this configuration");
3482 opts->x_ix86_asm_dialect = ASM_ATT;
3483 }
3484 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3485 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3486 sorry ("%i-bit mode not compiled in",
3487 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3488
3489 for (i = 0; i < pta_size; i++)
3490 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3491 {
3492 ix86_schedule = processor_alias_table[i].schedule;
3493 ix86_arch = processor_alias_table[i].processor;
3494 /* Default cpu tuning to the architecture. */
3495 ix86_tune = ix86_arch;
3496
3497 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3498 && !(processor_alias_table[i].flags & PTA_64BIT))
3499 error ("CPU you selected does not support x86-64 "
3500 "instruction set");
3501
3502 if (processor_alias_table[i].flags & PTA_MMX
3503 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3504 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3505 if (processor_alias_table[i].flags & PTA_3DNOW
3506 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3507 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3508 if (processor_alias_table[i].flags & PTA_3DNOW_A
3509 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3510 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3511 if (processor_alias_table[i].flags & PTA_SSE
3512 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3513 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3514 if (processor_alias_table[i].flags & PTA_SSE2
3515 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3516 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3517 if (processor_alias_table[i].flags & PTA_SSE3
3518 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3519 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3520 if (processor_alias_table[i].flags & PTA_SSSE3
3521 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3522 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3523 if (processor_alias_table[i].flags & PTA_SSE4_1
3524 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3525 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3526 if (processor_alias_table[i].flags & PTA_SSE4_2
3527 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3528 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3529 if (processor_alias_table[i].flags & PTA_AVX
3530 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3531 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3532 if (processor_alias_table[i].flags & PTA_AVX2
3533 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3534 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3535 if (processor_alias_table[i].flags & PTA_FMA
3536 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3537 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3538 if (processor_alias_table[i].flags & PTA_SSE4A
3539 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3540 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3541 if (processor_alias_table[i].flags & PTA_FMA4
3542 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3543 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3544 if (processor_alias_table[i].flags & PTA_XOP
3545 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3546 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3547 if (processor_alias_table[i].flags & PTA_LWP
3548 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3549 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3550 if (processor_alias_table[i].flags & PTA_ABM
3551 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3552 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3553 if (processor_alias_table[i].flags & PTA_BMI
3554 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3555 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3556 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3557 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3558 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3559 if (processor_alias_table[i].flags & PTA_TBM
3560 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3561 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3562 if (processor_alias_table[i].flags & PTA_BMI2
3563 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3564 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3565 if (processor_alias_table[i].flags & PTA_CX16
3566 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3567 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3568 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3569 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3570 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3571 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
3572 && (processor_alias_table[i].flags & PTA_NO_SAHF))
3573 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3574 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3575 if (processor_alias_table[i].flags & PTA_MOVBE
3576 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3577 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3578 if (processor_alias_table[i].flags & PTA_AES
3579 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3580 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
3581 if (processor_alias_table[i].flags & PTA_PCLMUL
3582 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3583 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3584 if (processor_alias_table[i].flags & PTA_FSGSBASE
3585 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3586 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3587 if (processor_alias_table[i].flags & PTA_RDRND
3588 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3589 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3590 if (processor_alias_table[i].flags & PTA_F16C
3591 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3592 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3593 if (processor_alias_table[i].flags & PTA_RTM
3594 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3595 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3596 if (processor_alias_table[i].flags & PTA_HLE
3597 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3598 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3599 if (processor_alias_table[i].flags & PTA_PRFCHW
3600 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3601 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3602 if (processor_alias_table[i].flags & PTA_RDSEED
3603 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3604 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3605 if (processor_alias_table[i].flags & PTA_ADX
3606 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3607 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3608 if (processor_alias_table[i].flags & PTA_FXSR
3609 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3610 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3611 if (processor_alias_table[i].flags & PTA_XSAVE
3612 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3613 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3614 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3615 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3616 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3617 if (processor_alias_table[i].flags & PTA_AVX512F
3618 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
3619 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
3620 if (processor_alias_table[i].flags & PTA_AVX512ER
3621 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
3622 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
3623 if (processor_alias_table[i].flags & PTA_AVX512PF
3624 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
3625 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
3626 if (processor_alias_table[i].flags & PTA_AVX512CD
3627 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
3628 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
3629 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3630 x86_prefetch_sse = true;
3631
3632 break;
3633 }
3634
3635 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3636 error ("generic CPU can be used only for %stune=%s %s",
3637 prefix, suffix, sw);
3638 else if (!strncmp (opts->x_ix86_arch_string, "generic", 7) || i == pta_size)
3639 error ("bad value (%s) for %sarch=%s %s",
3640 opts->x_ix86_arch_string, prefix, suffix, sw);
3641
3642 ix86_arch_mask = 1u << ix86_arch;
3643 for (i = 0; i < X86_ARCH_LAST; ++i)
3644 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3645
3646 for (i = 0; i < pta_size; i++)
3647 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
3648 {
3649 ix86_schedule = processor_alias_table[i].schedule;
3650 ix86_tune = processor_alias_table[i].processor;
3651 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3652 {
3653 if (!(processor_alias_table[i].flags & PTA_64BIT))
3654 {
3655 if (ix86_tune_defaulted)
3656 {
3657 opts->x_ix86_tune_string = "x86-64";
3658 for (i = 0; i < pta_size; i++)
3659 if (! strcmp (opts->x_ix86_tune_string,
3660 processor_alias_table[i].name))
3661 break;
3662 ix86_schedule = processor_alias_table[i].schedule;
3663 ix86_tune = processor_alias_table[i].processor;
3664 }
3665 else
3666 error ("CPU you selected does not support x86-64 "
3667 "instruction set");
3668 }
3669 }
3670 /* Intel CPUs have always interpreted SSE prefetch instructions as
3671 NOPs; so, we can enable SSE prefetch instructions even when
3672 -mtune (rather than -march) points us to a processor that has them.
3673 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3674 higher processors. */
3675 if (TARGET_CMOV
3676 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3677 x86_prefetch_sse = true;
3678 break;
3679 }
3680
3681 if (ix86_tune_specified && i == pta_size)
3682 error ("bad value (%s) for %stune=%s %s",
3683 opts->x_ix86_tune_string, prefix, suffix, sw);
3684
3685 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
3686
3687 #ifndef USE_IX86_FRAME_POINTER
3688 #define USE_IX86_FRAME_POINTER 0
3689 #endif
3690
3691 #ifndef USE_X86_64_FRAME_POINTER
3692 #define USE_X86_64_FRAME_POINTER 0
3693 #endif
3694
3695 /* Set the default values for switches whose default depends on TARGET_64BIT
3696 in case they weren't overwritten by command line options. */
3697 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3698 {
3699 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
3700 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3701 if (opts->x_flag_asynchronous_unwind_tables == 2)
3702 opts->x_flag_unwind_tables
3703 = opts->x_flag_asynchronous_unwind_tables = 1;
3704 if (opts->x_flag_pcc_struct_return == 2)
3705 opts->x_flag_pcc_struct_return = 0;
3706 }
3707 else
3708 {
3709 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
3710 opts->x_flag_omit_frame_pointer
3711 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
3712 if (opts->x_flag_asynchronous_unwind_tables == 2)
3713 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3714 if (opts->x_flag_pcc_struct_return == 2)
3715 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3716 }
3717
3718 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3719 if (opts->x_optimize_size)
3720 ix86_cost = &ix86_size_cost;
3721 else
3722 ix86_cost = ix86_tune_cost;
3723
3724 /* Arrange to set up i386_stack_locals for all functions. */
3725 init_machine_status = ix86_init_machine_status;
3726
3727 /* Validate -mregparm= value. */
3728 if (opts_set->x_ix86_regparm)
3729 {
3730 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3731 warning (0, "-mregparm is ignored in 64-bit mode");
3732 if (opts->x_ix86_regparm > REGPARM_MAX)
3733 {
3734 error ("-mregparm=%d is not between 0 and %d",
3735 opts->x_ix86_regparm, REGPARM_MAX);
3736 opts->x_ix86_regparm = 0;
3737 }
3738 }
3739 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3740 opts->x_ix86_regparm = REGPARM_MAX;
3741
3742 /* Default align_* from the processor table. */
3743 if (opts->x_align_loops == 0)
3744 {
3745 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3746 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3747 }
3748 if (opts->x_align_jumps == 0)
3749 {
3750 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3751 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3752 }
3753 if (opts->x_align_functions == 0)
3754 {
3755 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3756 }
3757
3758 /* Provide default for -mbranch-cost= value. */
3759 if (!opts_set->x_ix86_branch_cost)
3760 opts->x_ix86_branch_cost = ix86_cost->branch_cost;
3761
3762 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3763 {
3764 opts->x_target_flags
3765 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
3766
3767 /* Enable by default the SSE and MMX builtins. Do allow the user to
3768 explicitly disable any of these. In particular, disabling SSE and
3769 MMX for kernel code is extremely useful. */
3770 if (!ix86_arch_specified)
3771 opts->x_ix86_isa_flags
3772 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3773 | TARGET_SUBTARGET64_ISA_DEFAULT)
3774 & ~opts->x_ix86_isa_flags_explicit);
3775
3776 if (TARGET_RTD_P (opts->x_target_flags))
3777 warning (0, "%srtd%s is ignored in 64-bit mode", prefix, suffix);
3778 }
3779 else
3780 {
3781 opts->x_target_flags
3782 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
3783
3784 if (!ix86_arch_specified)
3785 opts->x_ix86_isa_flags
3786 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
3787
3788 /* The i386 ABI does not specify a red zone. It still makes sense to use it
3789 when the programmer takes care to keep the stack from being destroyed. */
3790 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
3791 opts->x_target_flags |= MASK_NO_RED_ZONE;
3792 }
3793
3794 /* Keep nonleaf frame pointers. */
3795 if (opts->x_flag_omit_frame_pointer)
3796 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3797 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
3798 opts->x_flag_omit_frame_pointer = 1;
3799
3800 /* If we're doing fast math, we don't care about comparison order
3801 wrt NaNs. This lets us use a shorter comparison sequence. */
3802 if (opts->x_flag_finite_math_only)
3803 opts->x_target_flags &= ~MASK_IEEE_FP;
3804
3805 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3806 since the insns won't need emulation. */
3807 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
3808 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
3809
3810 /* Likewise, if the target doesn't have a 387, or we've specified
3811 software floating point, don't use 387 inline intrinsics. */
3812 if (!TARGET_80387_P (opts->x_target_flags))
3813 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
3814
3815 /* Turn on MMX builtins for -msse. */
3816 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
3817 opts->x_ix86_isa_flags
3818 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
3819
3820 /* Enable SSE prefetch. */
3821 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
3822 || (TARGET_PRFCHW && !TARGET_3DNOW_P (opts->x_ix86_isa_flags)))
3823 x86_prefetch_sse = true;
3824
3825 /* Enable prefetch{,w} instructions for -m3dnow. */
3826 if (TARGET_3DNOW_P (opts->x_ix86_isa_flags))
3827 opts->x_ix86_isa_flags
3828 |= OPTION_MASK_ISA_PRFCHW & ~opts->x_ix86_isa_flags_explicit;
3829
3830 /* Enable popcnt instruction for -msse4.2 or -mabm. */
3831 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
3832 || TARGET_ABM_P (opts->x_ix86_isa_flags))
3833 opts->x_ix86_isa_flags
3834 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
3835
3836 /* Enable lzcnt instruction for -mabm. */
3837 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
3838 opts->x_ix86_isa_flags
3839 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
3840
3841 /* Validate -mpreferred-stack-boundary= value or default it to
3842 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3843 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3844 if (opts_set->x_ix86_preferred_stack_boundary_arg)
3845 {
3846 int min = (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3847 ? (TARGET_SSE_P (opts->x_ix86_isa_flags) ? 4 : 3) : 2);
3848 int max = (TARGET_SEH ? 4 : 12);
3849
3850 if (opts->x_ix86_preferred_stack_boundary_arg < min
3851 || opts->x_ix86_preferred_stack_boundary_arg > max)
3852 {
3853 if (min == max)
3854 error ("-mpreferred-stack-boundary is not supported "
3855 "for this target");
3856 else
3857 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3858 opts->x_ix86_preferred_stack_boundary_arg, min, max);
3859 }
3860 else
3861 ix86_preferred_stack_boundary
3862 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3863 }
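/* Worked example (illustrative): -mpreferred-stack-boundary=4 passes the
   range check above and gives (1 << 4) * BITS_PER_UNIT = 128 bits,
   i.e. a 16-byte preferred stack boundary.  */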
3864
3865 /* Set the default value for -mstackrealign. */
3866 if (opts->x_ix86_force_align_arg_pointer == -1)
3867 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3868
3869 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3870
3871 /* Validate -mincoming-stack-boundary= value or default it to
3872 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3873 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3874 if (opts_set->x_ix86_incoming_stack_boundary_arg)
3875 {
3876 if (ix86_incoming_stack_boundary_arg
3877 < (TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2)
3878 || ix86_incoming_stack_boundary_arg > 12)
3879 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3880 ix86_incoming_stack_boundary_arg,
3881 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2);
3882 else
3883 {
3884 ix86_user_incoming_stack_boundary
3885 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3886 ix86_incoming_stack_boundary
3887 = ix86_user_incoming_stack_boundary;
3888 }
3889 }
3890
3891 /* Accept -msseregparm only if at least SSE support is enabled. */
3892 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
3893 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
3894 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3895
3896 if (opts_set->x_ix86_fpmath)
3897 {
3898 if (opts->x_ix86_fpmath & FPMATH_SSE)
3899 {
3900 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
3901 {
3902 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3903 opts->x_ix86_fpmath = FPMATH_387;
3904 }
3905 else if ((opts->x_ix86_fpmath & FPMATH_387)
3906 && !TARGET_80387_P (opts->x_target_flags))
3907 {
3908 warning (0, "387 instruction set disabled, using SSE arithmetics");
3909 opts->x_ix86_fpmath = FPMATH_SSE;
3910 }
3911 }
3912 }
3913 /* For all chips supporting SSE2, -mfpmath=sse performs better than
3914 -mfpmath=387. The latter is nevertheless the default on many targets,
3915 since the extra 80-bit precision of temporaries is considered part of
3916 the ABI. Override the default at least for -ffast-math.
3917 TODO: -mfpmath=both seems to produce similarly performing code with
3918 slightly smaller binaries. It is however not clear whether register
3919 allocation is ready for this setting.
3920 Also, -mfpmath=387 codegen is overall considerably more compact
3921 (about 4-5%) than SSE codegen. We may switch to 387 with -ffast-math
3922 for size-optimized functions. */
3923 else if (fast_math_flags_set_p (&global_options)
3924 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
3925 opts->x_ix86_fpmath = FPMATH_SSE;
3926 else
3927 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
3928
3929 /* If the i387 is disabled, then do not return values in it. */
3930 if (!TARGET_80387_P (opts->x_target_flags))
3931 opts->x_target_flags &= ~MASK_FLOAT_RETURNS;
3932
3933 /* Use an external vectorized library when vectorizing intrinsics. */
3934 if (opts_set->x_ix86_veclibabi_type)
3935 switch (opts->x_ix86_veclibabi_type)
3936 {
3937 case ix86_veclibabi_type_svml:
3938 ix86_veclib_handler = ix86_veclibabi_svml;
3939 break;
3940
3941 case ix86_veclibabi_type_acml:
3942 ix86_veclib_handler = ix86_veclibabi_acml;
3943 break;
3944
3945 default:
3946 gcc_unreachable ();
3947 }
3948
3949 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
3950 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
3951 && !opts->x_optimize_size)
3952 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3953
3954 /* If stack probes are required, the space used for large function
3955 arguments on the stack must also be probed, so enable
3956 -maccumulate-outgoing-args so this happens in the prologue. */
3957 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
3958 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3959 {
3960 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
3961 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3962 "for correctness", prefix, suffix);
3963 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3964 }
3965
3966 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3967 {
3968 char *p;
3969 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3970 p = strchr (internal_label_prefix, 'X');
3971 internal_label_prefix_len = p - internal_label_prefix;
3972 *p = '\0';
3973 }
3974
3975 /* When no scheduling description is available, disable the scheduler pass
3976 so it won't slow down compilation or make x87 code slower. */
3977 if (!TARGET_SCHEDULE)
3978 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
3979
3980 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3981 ix86_tune_cost->simultaneous_prefetches,
3982 opts->x_param_values,
3983 opts_set->x_param_values);
3984 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3985 ix86_tune_cost->prefetch_block,
3986 opts->x_param_values,
3987 opts_set->x_param_values);
3988 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3989 ix86_tune_cost->l1_cache_size,
3990 opts->x_param_values,
3991 opts_set->x_param_values);
3992 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3993 ix86_tune_cost->l2_cache_size,
3994 opts->x_param_values,
3995 opts_set->x_param_values);
3996
3997 /* Enable software prefetching at -O3 for CPUs where prefetching is beneficial. */
3998 if (opts->x_flag_prefetch_loop_arrays < 0
3999 && HAVE_prefetch
4000 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
4001 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4002 opts->x_flag_prefetch_loop_arrays = 1;
4003
4004 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4005 can be optimized to ap = __builtin_next_arg (0). */
4006 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
4007 targetm.expand_builtin_va_start = NULL;
4008
4009 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4010 {
4011 ix86_gen_leave = gen_leave_rex64;
4012 if (Pmode == DImode)
4013 {
4014 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4015 ix86_gen_tls_local_dynamic_base_64
4016 = gen_tls_local_dynamic_base_64_di;
4017 }
4018 else
4019 {
4020 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4021 ix86_gen_tls_local_dynamic_base_64
4022 = gen_tls_local_dynamic_base_64_si;
4023 }
4024 }
4025 else
4026 ix86_gen_leave = gen_leave;
4027
4028 if (Pmode == DImode)
4029 {
4030 ix86_gen_add3 = gen_adddi3;
4031 ix86_gen_sub3 = gen_subdi3;
4032 ix86_gen_sub3_carry = gen_subdi3_carry;
4033 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4034 ix86_gen_andsp = gen_anddi3;
4035 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4036 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4037 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4038 ix86_gen_monitor = gen_sse3_monitor_di;
4039 }
4040 else
4041 {
4042 ix86_gen_add3 = gen_addsi3;
4043 ix86_gen_sub3 = gen_subsi3;
4044 ix86_gen_sub3_carry = gen_subsi3_carry;
4045 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4046 ix86_gen_andsp = gen_andsi3;
4047 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4048 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4049 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4050 ix86_gen_monitor = gen_sse3_monitor_si;
4051 }
4052
4053 #ifdef USE_IX86_CLD
4054 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4055 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4056 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4057 #endif
4058
4059 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
4060 {
4061 if (opts->x_flag_fentry > 0)
4062 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4063 "with -fpic");
4064 opts->x_flag_fentry = 0;
4065 }
4066 else if (TARGET_SEH)
4067 {
4068 if (opts->x_flag_fentry == 0)
4069 sorry ("-mno-fentry isn%'t compatible with SEH");
4070 opts->x_flag_fentry = 1;
4071 }
4072 else if (opts->x_flag_fentry < 0)
4073 {
4074 #if defined(PROFILE_BEFORE_PROLOGUE)
4075 opts->x_flag_fentry = 1;
4076 #else
4077 opts->x_flag_fentry = 0;
4078 #endif
4079 }
4080
4081 /* When not optimizing for size, enable the vzeroupper optimization for
4082 TARGET_AVX with -fexpensive-optimizations and split 32-byte
4083 AVX unaligned loads/stores. */
4084 if (!opts->x_optimize_size)
4085 {
4086 if (flag_expensive_optimizations
4087 && !(opts_set->x_target_flags & MASK_VZEROUPPER))
4088 opts->x_target_flags |= MASK_VZEROUPPER;
4089 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4090 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4091 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4092 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4093 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4094 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4095 /* Enable 128-bit AVX instruction generation
4096 for the auto-vectorizer. */
4097 if (TARGET_AVX128_OPTIMAL
4098 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
4099 opts->x_target_flags |= MASK_PREFER_AVX128;
4100 }
4101
4102 if (opts->x_ix86_recip_name)
4103 {
4104 char *p = ASTRDUP (opts->x_ix86_recip_name);
4105 char *q;
4106 unsigned int mask, i;
4107 bool invert;
4108
4109 while ((q = strtok (p, ",")) != NULL)
4110 {
4111 p = NULL;
4112 if (*q == '!')
4113 {
4114 invert = true;
4115 q++;
4116 }
4117 else
4118 invert = false;
4119
4120 if (!strcmp (q, "default"))
4121 mask = RECIP_MASK_ALL;
4122 else
4123 {
4124 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4125 if (!strcmp (q, recip_options[i].string))
4126 {
4127 mask = recip_options[i].mask;
4128 break;
4129 }
4130
4131 if (i == ARRAY_SIZE (recip_options))
4132 {
4133 error ("unknown option for -mrecip=%s", q);
4134 invert = false;
4135 mask = RECIP_MASK_NONE;
4136 }
4137 }
4138
4139 opts->x_recip_mask_explicit |= mask;
4140 if (invert)
4141 opts->x_recip_mask &= ~mask;
4142 else
4143 opts->x_recip_mask |= mask;
4144 }
4145 }
4146
4147 if (TARGET_RECIP_P (opts->x_target_flags))
4148 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4149 else if (opts_set->x_target_flags & MASK_RECIP)
4150 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
4151
4152 /* Default long double to 64-bit for Bionic. */
4153 if (TARGET_HAS_BIONIC
4154 && !(opts_set->x_target_flags & MASK_LONG_DOUBLE_64))
4155 opts->x_target_flags |= MASK_LONG_DOUBLE_64;
4156
4157 /* Save the initial options in case the user uses function-specific
4158 options later. */
4159 if (main_args_p)
4160 target_option_default_node = target_option_current_node
4161 = build_target_option_node (opts);
4162
4163 /* Handle stack protector */
4164 if (!opts_set->x_ix86_stack_protector_guard)
4165 opts->x_ix86_stack_protector_guard
4166 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4167
4168 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4169 if (opts->x_ix86_tune_memcpy_strategy)
4170 {
4171 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4172 ix86_parse_stringop_strategy_string (str, false);
4173 free (str);
4174 }
4175
4176 if (opts->x_ix86_tune_memset_strategy)
4177 {
4178 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4179 ix86_parse_stringop_strategy_string (str, true);
4180 free (str);
4181 }
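/* Illustrative strategy strings (algorithm names are examples only): the
   parser ix86_parse_stringop_strategy_string above expects comma-separated
   max:alg:align triplets, e.g. "-mmemcpy-strategy=libcall:-1:noalign"; the
   max of the last range must be -1 and the third field is "align" or
   "noalign".  */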
4182 }
4183
4184 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4185
4186 static void
4187 ix86_option_override (void)
4188 {
4189 opt_pass *pass_insert_vzeroupper = make_pass_insert_vzeroupper (g);
4190 static struct register_pass_info insert_vzeroupper_info
4191 = { pass_insert_vzeroupper, "reload",
4192 1, PASS_POS_INSERT_AFTER
4193 };
4194
4195 ix86_option_override_internal (true, &global_options, &global_options_set);
4196
4197
4198 /* This needs to be done at start up. It's convenient to do it here. */
4199 register_pass (&insert_vzeroupper_info);
4200 }
4201
4202 /* Update register usage after having seen the compiler flags. */
4203
4204 static void
4205 ix86_conditional_register_usage (void)
4206 {
4207 int i, c_mask;
4208 unsigned int j;
4209
4210 /* The PIC register, if it exists, is fixed. */
4211 j = PIC_OFFSET_TABLE_REGNUM;
4212 if (j != INVALID_REGNUM)
4213 fixed_regs[j] = call_used_regs[j] = 1;
4214
4215 /* For 32-bit targets, squash the REX registers. */
4216 if (! TARGET_64BIT)
4217 {
4218 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4219 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4220 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4221 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4222 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4223 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4224 }
4225
4226 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4227 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
4228 : TARGET_64BIT ? (1 << 2)
4229 : (1 << 1));
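/* That is, entries of CALL_USED_REGISTERS with an initializer value greater
   than 1 encode per-ABI bits, and c_mask selects bit 3 for the 64-bit MS ABI,
   bit 2 for other 64-bit targets and bit 1 for 32-bit targets; see the loop
   below.  */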
4230
4231 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4232
4233 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4234 {
4235 /* Set/reset conditionally defined registers from
4236 CALL_USED_REGISTERS initializer. */
4237 if (call_used_regs[i] > 1)
4238 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4239
4240 /* Compute the CLOBBERED_REGS register set as the call-used
4241 registers from the GENERAL_REGS register set. */
4242 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4243 && call_used_regs[i])
4244 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4245 }
4246
4247 /* If MMX is disabled, squash the registers. */
4248 if (! TARGET_MMX)
4249 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4250 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4251 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4252
4253 /* If SSE is disabled, squash the registers. */
4254 if (! TARGET_SSE)
4255 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4256 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4257 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4258
4259 /* If the FPU is disabled, squash the registers. */
4260 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4261 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4262 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4263 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4264
4265 /* If AVX512F is disabled, squash the registers. */
4266 if (! TARGET_AVX512F)
4267 {
4268 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4269 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4270
4271 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4272 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4273 }
4274
4275 /* If MPX is disabled, squash the registers. */
4276 if (! TARGET_MPX)
4277 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
4278 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4279 }
4280
4281 \f
4282 /* Save the current options */
4283
4284 static void
4285 ix86_function_specific_save (struct cl_target_option *ptr,
4286 struct gcc_options *opts)
4287 {
4288 ptr->arch = ix86_arch;
4289 ptr->schedule = ix86_schedule;
4290 ptr->tune = ix86_tune;
4291 ptr->branch_cost = ix86_branch_cost;
4292 ptr->tune_defaulted = ix86_tune_defaulted;
4293 ptr->arch_specified = ix86_arch_specified;
4294 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
4295 ptr->x_ix86_target_flags_explicit = opts->x_ix86_target_flags_explicit;
4296 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
4297
4298 /* The fields are char but the variables are not; make sure the
4299 values fit in the fields. */
4300 gcc_assert (ptr->arch == ix86_arch);
4301 gcc_assert (ptr->schedule == ix86_schedule);
4302 gcc_assert (ptr->tune == ix86_tune);
4303 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4304 }
4305
4306 /* Restore the current options */
4307
4308 static void
4309 ix86_function_specific_restore (struct gcc_options *opts,
4310 struct cl_target_option *ptr)
4311 {
4312 enum processor_type old_tune = ix86_tune;
4313 enum processor_type old_arch = ix86_arch;
4314 unsigned int ix86_arch_mask;
4315 int i;
4316
4317 ix86_arch = (enum processor_type) ptr->arch;
4318 ix86_schedule = (enum attr_cpu) ptr->schedule;
4319 ix86_tune = (enum processor_type) ptr->tune;
4320 opts->x_ix86_branch_cost = ptr->branch_cost;
4321 ix86_tune_defaulted = ptr->tune_defaulted;
4322 ix86_arch_specified = ptr->arch_specified;
4323 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4324 opts->x_ix86_target_flags_explicit = ptr->x_ix86_target_flags_explicit;
4325 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
4326
4327 /* Recreate the arch feature tests if the arch changed */
4328 if (old_arch != ix86_arch)
4329 {
4330 ix86_arch_mask = 1u << ix86_arch;
4331 for (i = 0; i < X86_ARCH_LAST; ++i)
4332 ix86_arch_features[i]
4333 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4334 }
4335
4336 /* Recreate the tune optimization tests */
4337 if (old_tune != ix86_tune)
4338 set_ix86_tune_features (ix86_tune, false);
4339 }
4340
4341 /* Print the current options */
4342
4343 static void
4344 ix86_function_specific_print (FILE *file, int indent,
4345 struct cl_target_option *ptr)
4346 {
4347 char *target_string
4348 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4349 NULL, NULL, ptr->x_ix86_fpmath, false);
4350
4351 fprintf (file, "%*sarch = %d (%s)\n",
4352 indent, "",
4353 ptr->arch,
4354 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4355 ? cpu_names[ptr->arch]
4356 : "<unknown>"));
4357
4358 fprintf (file, "%*stune = %d (%s)\n",
4359 indent, "",
4360 ptr->tune,
4361 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4362 ? cpu_names[ptr->tune]
4363 : "<unknown>"));
4364
4365 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4366
4367 if (target_string)
4368 {
4369 fprintf (file, "%*s%s\n", indent, "", target_string);
4370 free (target_string);
4371 }
4372 }
4373
4374 \f
4375 /* Inner function to process the attribute((target(...))): take an argument
4376 and set the current options from it. If we have a list, recursively go
4377 over the list. */
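/* For example (illustrative), __attribute__((target("sse4.2,no-avx,arch=core2")))
   is handled here: "sse4.2" and "avx" are ISA entries in the attrs[] table
   below, the "no-" prefix turns an option off, and "arch=" is treated as a
   string option.  */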
4378
4379 static bool
4380 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4381 struct gcc_options *opts,
4382 struct gcc_options *opts_set,
4383 struct gcc_options *enum_opts_set)
4384 {
4385 char *next_optstr;
4386 bool ret = true;
4387
4388 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4389 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4390 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4391 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4392 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4393
4394 enum ix86_opt_type
4395 {
4396 ix86_opt_unknown,
4397 ix86_opt_yes,
4398 ix86_opt_no,
4399 ix86_opt_str,
4400 ix86_opt_enum,
4401 ix86_opt_isa
4402 };
4403
4404 static const struct
4405 {
4406 const char *string;
4407 size_t len;
4408 enum ix86_opt_type type;
4409 int opt;
4410 int mask;
4411 } attrs[] = {
4412 /* isa options */
4413 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4414 IX86_ATTR_ISA ("abm", OPT_mabm),
4415 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4416 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4417 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4418 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4419 IX86_ATTR_ISA ("aes", OPT_maes),
4420 IX86_ATTR_ISA ("avx", OPT_mavx),
4421 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4422 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
4423 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
4424 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
4425 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
4426 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4427 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4428 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4429 IX86_ATTR_ISA ("sse", OPT_msse),
4430 IX86_ATTR_ISA ("sse2", OPT_msse2),
4431 IX86_ATTR_ISA ("sse3", OPT_msse3),
4432 IX86_ATTR_ISA ("sse4", OPT_msse4),
4433 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4434 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4435 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4436 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4437 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4438 IX86_ATTR_ISA ("fma", OPT_mfma),
4439 IX86_ATTR_ISA ("xop", OPT_mxop),
4440 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4441 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4442 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4443 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4444 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4445 IX86_ATTR_ISA ("hle", OPT_mhle),
4446 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4447 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4448 IX86_ATTR_ISA ("adx", OPT_madx),
4449 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4450 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4451 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4452
4453 /* enum options */
4454 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4455
4456 /* string options */
4457 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4458 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4459
4460 /* flag options */
4461 IX86_ATTR_YES ("cld",
4462 OPT_mcld,
4463 MASK_CLD),
4464
4465 IX86_ATTR_NO ("fancy-math-387",
4466 OPT_mfancy_math_387,
4467 MASK_NO_FANCY_MATH_387),
4468
4469 IX86_ATTR_YES ("ieee-fp",
4470 OPT_mieee_fp,
4471 MASK_IEEE_FP),
4472
4473 IX86_ATTR_YES ("inline-all-stringops",
4474 OPT_minline_all_stringops,
4475 MASK_INLINE_ALL_STRINGOPS),
4476
4477 IX86_ATTR_YES ("inline-stringops-dynamically",
4478 OPT_minline_stringops_dynamically,
4479 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4480
4481 IX86_ATTR_NO ("align-stringops",
4482 OPT_mno_align_stringops,
4483 MASK_NO_ALIGN_STRINGOPS),
4484
4485 IX86_ATTR_YES ("recip",
4486 OPT_mrecip,
4487 MASK_RECIP),
4488
4489 };
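  /* For example (illustrative only): a declaration such as

       void f (void) __attribute__ ((target ("avx2,no-fma,arch=core2")));

     is handled below by splitting the string on commas; "avx2" and "no-fma"
     match ix86_opt_isa entries in the table above (the "no-" prefix clears
     the option), while "arch=core2" matches the IX86_FUNCTION_SPECIFIC_ARCH
     string entry.  */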
4490
4491 /* If this is a list, recurse to get the options. */
4492 if (TREE_CODE (args) == TREE_LIST)
4493 {
4494 bool ret = true;
4495
4496 for (; args; args = TREE_CHAIN (args))
4497 if (TREE_VALUE (args)
4498 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4499 p_strings, opts, opts_set,
4500 enum_opts_set))
4501 ret = false;
4502
4503 return ret;
4504 }
4505
4506 else if (TREE_CODE (args) != STRING_CST)
4507 {
4508 error ("attribute %<target%> argument not a string");
4509 return false;
4510 }
4511
4512 /* Handle multiple arguments separated by commas. */
4513 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4514
4515 while (next_optstr && *next_optstr != '\0')
4516 {
4517 char *p = next_optstr;
4518 char *orig_p = p;
4519 char *comma = strchr (next_optstr, ',');
4520 const char *opt_string;
4521 size_t len, opt_len;
4522 int opt;
4523 bool opt_set_p;
4524 char ch;
4525 unsigned i;
4526 enum ix86_opt_type type = ix86_opt_unknown;
4527 int mask = 0;
4528
4529 if (comma)
4530 {
4531 *comma = '\0';
4532 len = comma - next_optstr;
4533 next_optstr = comma + 1;
4534 }
4535 else
4536 {
4537 len = strlen (p);
4538 next_optstr = NULL;
4539 }
4540
4541 /* Recognize no-xxx. */
4542 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4543 {
4544 opt_set_p = false;
4545 p += 3;
4546 len -= 3;
4547 }
4548 else
4549 opt_set_p = true;
4550
4551 /* Find the option. */
4552 ch = *p;
4553 opt = N_OPTS;
4554 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4555 {
4556 type = attrs[i].type;
4557 opt_len = attrs[i].len;
4558 if (ch == attrs[i].string[0]
4559 && ((type != ix86_opt_str && type != ix86_opt_enum)
4560 ? len == opt_len
4561 : len > opt_len)
4562 && memcmp (p, attrs[i].string, opt_len) == 0)
4563 {
4564 opt = attrs[i].opt;
4565 mask = attrs[i].mask;
4566 opt_string = attrs[i].string;
4567 break;
4568 }
4569 }
4570
4571 /* Process the option. */
4572 if (opt == N_OPTS)
4573 {
4574 error ("attribute(target(\"%s\")) is unknown", orig_p);
4575 ret = false;
4576 }
4577
4578 else if (type == ix86_opt_isa)
4579 {
4580 struct cl_decoded_option decoded;
4581
4582 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4583 ix86_handle_option (opts, opts_set,
4584 &decoded, input_location);
4585 }
4586
4587 else if (type == ix86_opt_yes || type == ix86_opt_no)
4588 {
4589 if (type == ix86_opt_no)
4590 opt_set_p = !opt_set_p;
4591
4592 if (opt_set_p)
4593 opts->x_target_flags |= mask;
4594 else
4595 opts->x_target_flags &= ~mask;
4596 }
4597
4598 else if (type == ix86_opt_str)
4599 {
4600 if (p_strings[opt])
4601 {
4602 error ("option(\"%s\") was already specified", opt_string);
4603 ret = false;
4604 }
4605 else
4606 p_strings[opt] = xstrdup (p + opt_len);
4607 }
4608
4609 else if (type == ix86_opt_enum)
4610 {
4611 bool arg_ok;
4612 int value;
4613
4614 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4615 if (arg_ok)
4616 set_option (opts, enum_opts_set, opt, value,
4617 p + opt_len, DK_UNSPECIFIED, input_location,
4618 global_dc);
4619 else
4620 {
4621 error ("attribute(target(\"%s\")) is unknown", orig_p);
4622 ret = false;
4623 }
4624 }
4625
4626 else
4627 gcc_unreachable ();
4628 }
4629
4630 return ret;
4631 }
4632
4633 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4634
4635 tree
4636 ix86_valid_target_attribute_tree (tree args,
4637 struct gcc_options *opts,
4638 struct gcc_options *opts_set)
4639 {
4640 const char *orig_arch_string = ix86_arch_string;
4641 const char *orig_tune_string = ix86_tune_string;
4642 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
4643 int orig_tune_defaulted = ix86_tune_defaulted;
4644 int orig_arch_specified = ix86_arch_specified;
4645 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4646 tree t = NULL_TREE;
4647 int i;
4648 struct cl_target_option *def
4649 = TREE_TARGET_OPTION (target_option_default_node);
4650 struct gcc_options enum_opts_set;
4651
4652 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4653
4654 /* Process each of the options on the chain. */
4655 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
4656 opts_set, &enum_opts_set))
4657 return error_mark_node;
4658
4659 /* If the changed options are different from the default, rerun
4660 ix86_option_override_internal, and then save the options away.
4661 The string options are attribute options, and will be undone
4662 when we copy the save structure. */
4663 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
4664 || opts->x_target_flags != def->x_target_flags
4665 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4666 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4667 || enum_opts_set.x_ix86_fpmath)
4668 {
4669 /* If we are using the default tune= or arch=, undo the string assigned,
4670 and use the default. */
4671 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4672 opts->x_ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4673 else if (!orig_arch_specified)
4674 opts->x_ix86_arch_string = NULL;
4675
4676 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4677 opts->x_ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4678 else if (orig_tune_defaulted)
4679 opts->x_ix86_tune_string = NULL;
4680
4681 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4682 if (enum_opts_set.x_ix86_fpmath)
4683 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
4684 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4685 && TARGET_SSE_P (opts->x_ix86_isa_flags))
4686 {
4687 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4688 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
4689 }
4690
4691 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4692 ix86_option_override_internal (false, opts, opts_set);
4693
4694 /* Add any builtin functions for the new ISA, if any. */
4695 ix86_add_new_builtins (opts->x_ix86_isa_flags);
4696
4697 /* Save the current options unless we are validating options for
4698 #pragma. */
4699 t = build_target_option_node (opts);
4700
4701 opts->x_ix86_arch_string = orig_arch_string;
4702 opts->x_ix86_tune_string = orig_tune_string;
4703 opts_set->x_ix86_fpmath = orig_fpmath_set;
4704
4705 /* Free up memory allocated to hold the strings */
4706 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4707 free (option_strings[i]);
4708 }
4709
4710 return t;
4711 }
4712
4713 /* Hook to validate attribute((target("string"))). */
4714
4715 static bool
4716 ix86_valid_target_attribute_p (tree fndecl,
4717 tree ARG_UNUSED (name),
4718 tree args,
4719 int ARG_UNUSED (flags))
4720 {
4721 struct gcc_options func_options;
4722 tree new_target, new_optimize;
4723 bool ret = true;
4724
4725 /* attribute((target("default"))) does nothing, beyond
4726 affecting multi-versioning. */
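  /* For example (illustrative only), with C++ function multi-versioning:

       int foo (void) __attribute__ ((target ("default")));
       int foo (void) __attribute__ ((target ("avx2")));

     the "default" version takes this early-return path, while the "avx2"
     version falls through to the full processing below.  */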
4727 if (TREE_VALUE (args)
4728 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4729 && TREE_CHAIN (args) == NULL_TREE
4730 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4731 return true;
4732
4733 tree old_optimize = build_optimization_node (&global_options);
4734
4735 /* Get the optimization options of the current function. */
4736 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4737
4738 if (!func_optimize)
4739 func_optimize = old_optimize;
4740
4741 /* Init func_options. */
4742 memset (&func_options, 0, sizeof (func_options));
4743 init_options_struct (&func_options, NULL);
4744 lang_hooks.init_options_struct (&func_options);
4745
4746 cl_optimization_restore (&func_options,
4747 TREE_OPTIMIZATION (func_optimize));
4748
4749 /* Initialize func_options to the default before its target options can
4750 be set. */
4751 cl_target_option_restore (&func_options,
4752 TREE_TARGET_OPTION (target_option_default_node));
4753
4754 new_target = ix86_valid_target_attribute_tree (args, &func_options,
4755 &global_options_set);
4756
4757 new_optimize = build_optimization_node (&func_options);
4758
4759 if (new_target == error_mark_node)
4760 ret = false;
4761
4762 else if (fndecl && new_target)
4763 {
4764 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4765
4766 if (old_optimize != new_optimize)
4767 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4768 }
4769
4770 return ret;
4771 }
4772
4773 \f
4774 /* Hook to determine if one function can safely inline another. */
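/* For example (illustrative only), given

     static void __attribute__ ((target ("sse2"))) callee (void) { ... }
     void __attribute__ ((target ("sse4.2"))) caller (void) { callee (); }

   the callee's ISA flags are a subset of the caller's, so inlining is
   permitted; with the attributes swapped the checks below reject it.  */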
4775
4776 static bool
4777 ix86_can_inline_p (tree caller, tree callee)
4778 {
4779 bool ret = false;
4780 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4781 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4782
4783 /* If callee has no option attributes, then it is ok to inline. */
4784 if (!callee_tree)
4785 ret = true;
4786
4787 /* If caller has no option attributes, but callee does then it is not ok to
4788 inline. */
4789 else if (!caller_tree)
4790 ret = false;
4791
4792 else
4793 {
4794 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4795 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4796
4797 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
4798 function can inline an SSE2 function but an SSE2 function can't inline
4799 an SSE4 function. */
4800 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4801 != callee_opts->x_ix86_isa_flags)
4802 ret = false;
4803
4804 /* See if we have the same non-isa options. */
4805 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4806 ret = false;
4807
4808 /* See if arch, tune, etc. are the same. */
4809 else if (caller_opts->arch != callee_opts->arch)
4810 ret = false;
4811
4812 else if (caller_opts->tune != callee_opts->tune)
4813 ret = false;
4814
4815 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4816 ret = false;
4817
4818 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4819 ret = false;
4820
4821 else
4822 ret = true;
4823 }
4824
4825 return ret;
4826 }
4827
4828 \f
4829 /* Remember the last target of ix86_set_current_function. */
4830 static GTY(()) tree ix86_previous_fndecl;
4831
4832 /* Invalidate ix86_previous_fndecl cache. */
4833 void
4834 ix86_reset_previous_fndecl (void)
4835 {
4836 ix86_previous_fndecl = NULL_TREE;
4837 }
4838
4839 /* Establish appropriate back-end context for processing the function
4840 FNDECL. The argument might be NULL to indicate processing at top
4841 level, outside of any function scope. */
4842 static void
4843 ix86_set_current_function (tree fndecl)
4844 {
4845 /* Only change the context if the function changes. This hook is called
4846 several times in the course of compiling a function, and we don't want to
4847 slow things down too much or call target_reinit when it isn't safe. */
4848 if (fndecl && fndecl != ix86_previous_fndecl)
4849 {
4850 tree old_tree = (ix86_previous_fndecl
4851 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4852 : NULL_TREE);
4853
4854 tree new_tree = (fndecl
4855 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4856 : NULL_TREE);
4857
4858 ix86_previous_fndecl = fndecl;
4859 if (old_tree == new_tree)
4860 ;
4861
4862 else if (new_tree)
4863 {
4864 cl_target_option_restore (&global_options,
4865 TREE_TARGET_OPTION (new_tree));
4866 target_reinit ();
4867 }
4868
4869 else if (old_tree)
4870 {
4871 struct cl_target_option *def
4872 = TREE_TARGET_OPTION (target_option_current_node);
4873
4874 cl_target_option_restore (&global_options, def);
4875 target_reinit ();
4876 }
4877 }
4878 }
4879
4880 \f
4881 /* Return true if this goes in large data/bss. */
4882
4883 static bool
4884 ix86_in_large_data_p (tree exp)
4885 {
4886 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4887 return false;
4888
4889 /* Functions are never large data. */
4890 if (TREE_CODE (exp) == FUNCTION_DECL)
4891 return false;
4892
4893 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4894 {
4895 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4896 if (strcmp (section, ".ldata") == 0
4897 || strcmp (section, ".lbss") == 0)
4898 return true;
4899 return false;
4900 }
4901 else
4902 {
4903 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4904
4905 /* If this is an incomplete type with size 0, then we can't put it
4906 in data because it might be too big when completed. */
4907 if (!size || size > ix86_section_threshold)
4908 return true;
4909 }
4910
4911 return false;
4912 }
4913
4914 /* Switch to the appropriate section for output of DECL.
4915 DECL is either a `VAR_DECL' node or a constant of some sort.
4916 RELOC indicates whether forming the initial value of DECL requires
4917 link-time relocations. */
4918
4919 ATTRIBUTE_UNUSED static section *
4920 x86_64_elf_select_section (tree decl, int reloc,
4921 unsigned HOST_WIDE_INT align)
4922 {
4923 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4924 && ix86_in_large_data_p (decl))
4925 {
4926 const char *sname = NULL;
4927 unsigned int flags = SECTION_WRITE;
4928 switch (categorize_decl_for_section (decl, reloc))
4929 {
4930 case SECCAT_DATA:
4931 sname = ".ldata";
4932 break;
4933 case SECCAT_DATA_REL:
4934 sname = ".ldata.rel";
4935 break;
4936 case SECCAT_DATA_REL_LOCAL:
4937 sname = ".ldata.rel.local";
4938 break;
4939 case SECCAT_DATA_REL_RO:
4940 sname = ".ldata.rel.ro";
4941 break;
4942 case SECCAT_DATA_REL_RO_LOCAL:
4943 sname = ".ldata.rel.ro.local";
4944 break;
4945 case SECCAT_BSS:
4946 sname = ".lbss";
4947 flags |= SECTION_BSS;
4948 break;
4949 case SECCAT_RODATA:
4950 case SECCAT_RODATA_MERGE_STR:
4951 case SECCAT_RODATA_MERGE_STR_INIT:
4952 case SECCAT_RODATA_MERGE_CONST:
4953 sname = ".lrodata";
4954 flags = 0;
4955 break;
4956 case SECCAT_SRODATA:
4957 case SECCAT_SDATA:
4958 case SECCAT_SBSS:
4959 gcc_unreachable ();
4960 case SECCAT_TEXT:
4961 case SECCAT_TDATA:
4962 case SECCAT_TBSS:
4963 /* We don't split these for the medium model. Place them into
4964 default sections and hope for the best. */
4965 break;
4966 }
4967 if (sname)
4968 {
4969 /* We might get called with string constants, but get_named_section
4970 doesn't like them as they are not DECLs. Also, we need to set
4971 flags in that case. */
4972 if (!DECL_P (decl))
4973 return get_section (sname, flags, NULL);
4974 return get_named_section (decl, sname, reloc);
4975 }
4976 }
4977 return default_elf_select_section (decl, reloc, align);
4978 }
4979
4980 /* Select a set of attributes for section NAME based on the properties
4981 of DECL and whether or not RELOC indicates that DECL's initializer
4982 might contain runtime relocations. */
4983
4984 static unsigned int ATTRIBUTE_UNUSED
4985 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
4986 {
4987 unsigned int flags = default_section_type_flags (decl, name, reloc);
4988
4989 if (decl == NULL_TREE
4990 && (strcmp (name, ".ldata.rel.ro") == 0
4991 || strcmp (name, ".ldata.rel.ro.local") == 0))
4992 flags |= SECTION_RELRO;
4993
4994 if (strcmp (name, ".lbss") == 0
4995 || strncmp (name, ".lbss.", 6) == 0
4996 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
4997 flags |= SECTION_BSS;
4998
4999 return flags;
5000 }
5001
5002 /* Build up a unique section name, expressed as a
5003 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
5004 RELOC indicates whether the initial value of DECL requires
5005 link-time relocations. */
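/* For example (illustrative only), a medium-model variable "foo" that lives
   in the large data area and categorizes as SECCAT_DATA gets the section
   name ".ldata.foo" here, or ".gnu.linkonce.ld.foo" when the one-only
   (.gnu.linkonce) path is taken.  */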
5006
5007 static void ATTRIBUTE_UNUSED
5008 x86_64_elf_unique_section (tree decl, int reloc)
5009 {
5010 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
5011 && ix86_in_large_data_p (decl))
5012 {
5013 const char *prefix = NULL;
5014 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
5015 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
5016
5017 switch (categorize_decl_for_section (decl, reloc))
5018 {
5019 case SECCAT_DATA:
5020 case SECCAT_DATA_REL:
5021 case SECCAT_DATA_REL_LOCAL:
5022 case SECCAT_DATA_REL_RO:
5023 case SECCAT_DATA_REL_RO_LOCAL:
5024 prefix = one_only ? ".ld" : ".ldata";
5025 break;
5026 case SECCAT_BSS:
5027 prefix = one_only ? ".lb" : ".lbss";
5028 break;
5029 case SECCAT_RODATA:
5030 case SECCAT_RODATA_MERGE_STR:
5031 case SECCAT_RODATA_MERGE_STR_INIT:
5032 case SECCAT_RODATA_MERGE_CONST:
5033 prefix = one_only ? ".lr" : ".lrodata";
5034 break;
5035 case SECCAT_SRODATA:
5036 case SECCAT_SDATA:
5037 case SECCAT_SBSS:
5038 gcc_unreachable ();
5039 case SECCAT_TEXT:
5040 case SECCAT_TDATA:
5041 case SECCAT_TBSS:
5042 /* We don't split these for the medium model. Place them into
5043 default sections and hope for the best. */
5044 break;
5045 }
5046 if (prefix)
5047 {
5048 const char *name, *linkonce;
5049 char *string;
5050
5051 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
5052 name = targetm.strip_name_encoding (name);
5053
5054 /* If we're using one_only, then there needs to be a .gnu.linkonce
5055 prefix to the section name. */
5056 linkonce = one_only ? ".gnu.linkonce" : "";
5057
5058 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
5059
5060 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
5061 return;
5062 }
5063 }
5064 default_unique_section (decl, reloc);
5065 }
5066
5067 #ifdef COMMON_ASM_OP
5068 /* This says how to output assembler code to declare an
5069 uninitialized external linkage data object.
5070
5071 For medium-model x86-64 we need to use the .largecomm directive for
5072 large objects. */
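/* For example (illustrative only), a 2 MiB common object "big" aligned to
   256 bits under -mcmodel=medium is emitted as

     .largecomm	big,2097152,32

   while objects below the section threshold keep the ordinary
   COMMON_ASM_OP form.  */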
5073 void
5074 x86_elf_aligned_common (FILE *file,
5075 const char *name, unsigned HOST_WIDE_INT size,
5076 int align)
5077 {
5078 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
5079 && size > (unsigned int)ix86_section_threshold)
5080 fputs (".largecomm\t", file);
5081 else
5082 fputs (COMMON_ASM_OP, file);
5083 assemble_name (file, name);
5084 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
5085 size, align / BITS_PER_UNIT);
5086 }
5087 #endif
5088
5089 /* Utility function for targets to use in implementing
5090 ASM_OUTPUT_ALIGNED_BSS. */
5091
5092 void
5093 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
5094 const char *name, unsigned HOST_WIDE_INT size,
5095 int align)
5096 {
5097 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
5098 && size > (unsigned int)ix86_section_threshold)
5099 switch_to_section (get_named_section (decl, ".lbss", 0));
5100 else
5101 switch_to_section (bss_section);
5102 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
5103 #ifdef ASM_DECLARE_OBJECT_NAME
5104 last_assemble_variable_decl = decl;
5105 ASM_DECLARE_OBJECT_NAME (file, name, decl);
5106 #else
5107 /* The standard thing is to just output a label for the object. */
5108 ASM_OUTPUT_LABEL (file, name);
5109 #endif /* ASM_DECLARE_OBJECT_NAME */
5110 ASM_OUTPUT_SKIP (file, size ? size : 1);
5111 }
5112 \f
5113 /* Decide whether we must probe the stack before any space allocation
5114 on this target. It's essentially TARGET_STACK_PROBE except when
5115 -fstack-check causes the stack to be already probed differently. */
5116
5117 bool
5118 ix86_target_stack_probe (void)
5119 {
5120 /* Do not probe the stack twice if static stack checking is enabled. */
5121 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5122 return false;
5123
5124 return TARGET_STACK_PROBE;
5125 }
5126 \f
5127 /* Decide whether we can make a sibling call to a function. DECL is the
5128 declaration of the function being targeted by the call and EXP is the
5129 CALL_EXPR representing the call. */
5130
5131 static bool
5132 ix86_function_ok_for_sibcall (tree decl, tree exp)
5133 {
5134 tree type, decl_or_type;
5135 rtx a, b;
5136
5137 /* If we are generating position-independent code, we cannot sibcall
5138 optimize any indirect call, or a direct call to a global function,
5139 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
5140 if (!TARGET_MACHO
5141 && !TARGET_64BIT
5142 && flag_pic
5143 && (!decl || !targetm.binds_local_p (decl)))
5144 return false;
5145
5146 /* If we need to align the outgoing stack, then sibcalling would
5147 unalign the stack, which may break the called function. */
5148 if (ix86_minimum_incoming_stack_boundary (true)
5149 < PREFERRED_STACK_BOUNDARY)
5150 return false;
5151
5152 if (decl)
5153 {
5154 decl_or_type = decl;
5155 type = TREE_TYPE (decl);
5156 }
5157 else
5158 {
5159 /* We're looking at the CALL_EXPR, we need the type of the function. */
5160 type = CALL_EXPR_FN (exp); /* pointer expression */
5161 type = TREE_TYPE (type); /* pointer type */
5162 type = TREE_TYPE (type); /* function type */
5163 decl_or_type = type;
5164 }
5165
5166 /* Check that the return value locations are the same. For example,
5167 if we are returning floats on the 80387 register stack, we cannot
5168 make a sibcall from a function that doesn't return a float to a
5169 function that does or, conversely, from a function that does return
5170 a float to a function that doesn't; the necessary stack adjustment
5171 would not be executed. This is also the place we notice
5172 differences in the return value ABI. Note that it is ok for one
5173 of the functions to have void return type as long as the return
5174 value of the other is passed in a register. */
5175 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5176 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5177 cfun->decl, false);
5178 if (STACK_REG_P (a) || STACK_REG_P (b))
5179 {
5180 if (!rtx_equal_p (a, b))
5181 return false;
5182 }
5183 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5184 ;
5185 else if (!rtx_equal_p (a, b))
5186 return false;
5187
5188 if (TARGET_64BIT)
5189 {
5190 /* The SYSV ABI has more call-clobbered registers;
5191 disallow sibcalls from MS to SYSV. */
5192 if (cfun->machine->call_abi == MS_ABI
5193 && ix86_function_type_abi (type) == SYSV_ABI)
5194 return false;
5195 }
5196 else
5197 {
5198 /* If this call is indirect, we'll need to be able to use a
5199 call-clobbered register for the address of the target function.
5200 Make sure that all such registers are not used for passing
5201 parameters. Note that DLLIMPORT functions are indirect. */
5202 if (!decl
5203 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5204 {
5205 if (ix86_function_regparm (type, NULL) >= 3)
5206 {
5207 /* ??? Need to count the actual number of registers to be used,
5208 not the possible number of registers. Fix later. */
5209 return false;
5210 }
5211 }
5212 }
5213
5214 /* Otherwise okay. That also includes certain types of indirect calls. */
5215 return true;
5216 }
5217
5218 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5219 and "sseregparm" calling convention attributes;
5220 arguments as in struct attribute_spec.handler. */
5221
5222 static tree
5223 ix86_handle_cconv_attribute (tree *node, tree name,
5224 tree args,
5225 int flags ATTRIBUTE_UNUSED,
5226 bool *no_add_attrs)
5227 {
5228 if (TREE_CODE (*node) != FUNCTION_TYPE
5229 && TREE_CODE (*node) != METHOD_TYPE
5230 && TREE_CODE (*node) != FIELD_DECL
5231 && TREE_CODE (*node) != TYPE_DECL)
5232 {
5233 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5234 name);
5235 *no_add_attrs = true;
5236 return NULL_TREE;
5237 }
5238
5239 /* Can combine regparm with all attributes but fastcall and thiscall. */
5240 if (is_attribute_p ("regparm", name))
5241 {
5242 tree cst;
5243
5244 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5245 {
5246 error ("fastcall and regparm attributes are not compatible");
5247 }
5248
5249 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5250 {
5251 error ("regparam and thiscall attributes are not compatible");
5252 }
5253
5254 cst = TREE_VALUE (args);
5255 if (TREE_CODE (cst) != INTEGER_CST)
5256 {
5257 warning (OPT_Wattributes,
5258 "%qE attribute requires an integer constant argument",
5259 name);
5260 *no_add_attrs = true;
5261 }
5262 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5263 {
5264 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5265 name, REGPARM_MAX);
5266 *no_add_attrs = true;
5267 }
5268
5269 return NULL_TREE;
5270 }
5271
5272 if (TARGET_64BIT)
5273 {
5274 /* Do not warn when emulating the MS ABI. */
5275 if ((TREE_CODE (*node) != FUNCTION_TYPE
5276 && TREE_CODE (*node) != METHOD_TYPE)
5277 || ix86_function_type_abi (*node) != MS_ABI)
5278 warning (OPT_Wattributes, "%qE attribute ignored",
5279 name);
5280 *no_add_attrs = true;
5281 return NULL_TREE;
5282 }
5283
5284 /* Can combine fastcall with sseregparm. */
5285 if (is_attribute_p ("fastcall", name))
5286 {
5287 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5288 {
5289 error ("fastcall and cdecl attributes are not compatible");
5290 }
5291 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5292 {
5293 error ("fastcall and stdcall attributes are not compatible");
5294 }
5295 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5296 {
5297 error ("fastcall and regparm attributes are not compatible");
5298 }
5299 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5300 {
5301 error ("fastcall and thiscall attributes are not compatible");
5302 }
5303 }
5304
5305 /* Can combine stdcall with regparm and
5306 sseregparm. */
5307 else if (is_attribute_p ("stdcall", name))
5308 {
5309 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5310 {
5311 error ("stdcall and cdecl attributes are not compatible");
5312 }
5313 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5314 {
5315 error ("stdcall and fastcall attributes are not compatible");
5316 }
5317 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5318 {
5319 error ("stdcall and thiscall attributes are not compatible");
5320 }
5321 }
5322
5323 /* Can combine cdecl with regparm and sseregparm. */
5324 else if (is_attribute_p ("cdecl", name))
5325 {
5326 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5327 {
5328 error ("stdcall and cdecl attributes are not compatible");
5329 }
5330 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5331 {
5332 error ("fastcall and cdecl attributes are not compatible");
5333 }
5334 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5335 {
5336 error ("cdecl and thiscall attributes are not compatible");
5337 }
5338 }
5339 else if (is_attribute_p ("thiscall", name))
5340 {
5341 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5342 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5343 name);
5344 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5345 {
5346 error ("stdcall and thiscall attributes are not compatible");
5347 }
5348 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5349 {
5350 error ("fastcall and thiscall attributes are not compatible");
5351 }
5352 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5353 {
5354 error ("cdecl and thiscall attributes are not compatible");
5355 }
5356 }
5357
5358 /* Can combine sseregparm with all attributes. */
5359
5360 return NULL_TREE;
5361 }
5362
5363 /* The transactional memory builtins are implicitly regparm or fastcall
5364 depending on the ABI. Override the generic do-nothing attribute that
5365 these builtins were declared with, and replace it with one of the two
5366 attributes that we expect elsewhere. */
5367
5368 static tree
5369 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5370 tree args ATTRIBUTE_UNUSED,
5371 int flags, bool *no_add_attrs)
5372 {
5373 tree alt;
5374
5375 /* In no case do we want to add the placeholder attribute. */
5376 *no_add_attrs = true;
5377
5378 /* The 64-bit ABI is unchanged for transactional memory. */
5379 if (TARGET_64BIT)
5380 return NULL_TREE;
5381
5382 /* ??? Is there a better way to validate 32-bit Windows? We have
5383 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5384 if (CHECK_STACK_LIMIT > 0)
5385 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5386 else
5387 {
5388 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5389 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5390 }
5391 decl_attributes (node, alt, flags);
5392
5393 return NULL_TREE;
5394 }
5395
5396 /* This function determines the calling convention from TYPE. */
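/* For example (illustrative only), for a 32-bit function type carrying
   __attribute__ ((fastcall)) this returns IX86_CALLCVT_FASTCALL; for a
   plain non-variadic type under -mrtd it returns IX86_CALLCVT_STDCALL;
   otherwise the usual result is IX86_CALLCVT_CDECL.  */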
5397
5398 unsigned int
5399 ix86_get_callcvt (const_tree type)
5400 {
5401 unsigned int ret = 0;
5402 bool is_stdarg;
5403 tree attrs;
5404
5405 if (TARGET_64BIT)
5406 return IX86_CALLCVT_CDECL;
5407
5408 attrs = TYPE_ATTRIBUTES (type);
5409 if (attrs != NULL_TREE)
5410 {
5411 if (lookup_attribute ("cdecl", attrs))
5412 ret |= IX86_CALLCVT_CDECL;
5413 else if (lookup_attribute ("stdcall", attrs))
5414 ret |= IX86_CALLCVT_STDCALL;
5415 else if (lookup_attribute ("fastcall", attrs))
5416 ret |= IX86_CALLCVT_FASTCALL;
5417 else if (lookup_attribute ("thiscall", attrs))
5418 ret |= IX86_CALLCVT_THISCALL;
5419
5420 /* Regparm isn't allowed for thiscall and fastcall. */
5421 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5422 {
5423 if (lookup_attribute ("regparm", attrs))
5424 ret |= IX86_CALLCVT_REGPARM;
5425 if (lookup_attribute ("sseregparm", attrs))
5426 ret |= IX86_CALLCVT_SSEREGPARM;
5427 }
5428
5429 if (IX86_BASE_CALLCVT(ret) != 0)
5430 return ret;
5431 }
5432
5433 is_stdarg = stdarg_p (type);
5434 if (TARGET_RTD && !is_stdarg)
5435 return IX86_CALLCVT_STDCALL | ret;
5436
5437 if (ret != 0
5438 || is_stdarg
5439 || TREE_CODE (type) != METHOD_TYPE
5440 || ix86_function_type_abi (type) != MS_ABI)
5441 return IX86_CALLCVT_CDECL | ret;
5442
5443 return IX86_CALLCVT_THISCALL;
5444 }
5445
5446 /* Return 0 if the attributes for two types are incompatible, 1 if they
5447 are compatible, and 2 if they are nearly compatible (which causes a
5448 warning to be generated). */
5449
5450 static int
5451 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5452 {
5453 unsigned int ccvt1, ccvt2;
5454
5455 if (TREE_CODE (type1) != FUNCTION_TYPE
5456 && TREE_CODE (type1) != METHOD_TYPE)
5457 return 1;
5458
5459 ccvt1 = ix86_get_callcvt (type1);
5460 ccvt2 = ix86_get_callcvt (type2);
5461 if (ccvt1 != ccvt2)
5462 return 0;
5463 if (ix86_function_regparm (type1, NULL)
5464 != ix86_function_regparm (type2, NULL))
5465 return 0;
5466
5467 return 1;
5468 }
5469 \f
5470 /* Return the regparm value for a function with the indicated TYPE and DECL.
5471 DECL may be NULL when calling a function indirectly
5472 or considering a libcall. */
5473
5474 static int
5475 ix86_function_regparm (const_tree type, const_tree decl)
5476 {
5477 tree attr;
5478 int regparm;
5479 unsigned int ccvt;
5480
5481 if (TARGET_64BIT)
5482 return (ix86_function_type_abi (type) == SYSV_ABI
5483 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5484 ccvt = ix86_get_callcvt (type);
5485 regparm = ix86_regparm;
5486
5487 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5488 {
5489 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5490 if (attr)
5491 {
5492 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5493 return regparm;
5494 }
5495 }
5496 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5497 return 2;
5498 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5499 return 1;
5500
5501 /* Use register calling convention for local functions when possible. */
5502 if (decl
5503 && TREE_CODE (decl) == FUNCTION_DECL
5504 && optimize
5505 && !(profile_flag && !flag_fentry))
5506 {
5507 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5508 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5509 if (i && i->local && i->can_change_signature)
5510 {
5511 int local_regparm, globals = 0, regno;
5512
5513 /* Make sure no regparm register is taken by a
5514 fixed register variable. */
5515 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5516 if (fixed_regs[local_regparm])
5517 break;
5518
5519 /* We don't want to use regparm(3) for nested functions as
5520 these use a static chain pointer in the third argument. */
5521 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5522 local_regparm = 2;
5523
5524 /* In 32-bit mode save a register for the split stack. */
5525 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5526 local_regparm = 2;
5527
5528 /* Each fixed register usage increases register pressure,
5529 so fewer registers should be used for argument passing.
5530 This functionality can be overridden by an explicit
5531 regparm value. */
5532 for (regno = AX_REG; regno <= DI_REG; regno++)
5533 if (fixed_regs[regno])
5534 globals++;
5535
5536 local_regparm
5537 = globals < local_regparm ? local_regparm - globals : 0;
5538
5539 if (local_regparm > regparm)
5540 regparm = local_regparm;
5541 }
5542 }
5543
5544 return regparm;
5545 }
5546
5547 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5548 DFmode (2) arguments in SSE registers for a function with the
5549 indicated TYPE and DECL. DECL may be NULL when calling a function
5550 indirectly or considering a libcall. Otherwise return 0. */
5551
5552 static int
5553 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5554 {
5555 gcc_assert (!TARGET_64BIT);
5556
5557 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5558 by the sseregparm attribute. */
5559 if (TARGET_SSEREGPARM
5560 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5561 {
5562 if (!TARGET_SSE)
5563 {
5564 if (warn)
5565 {
5566 if (decl)
5567 error ("calling %qD with attribute sseregparm without "
5568 "SSE/SSE2 enabled", decl);
5569 else
5570 error ("calling %qT with attribute sseregparm without "
5571 "SSE/SSE2 enabled", type);
5572 }
5573 return 0;
5574 }
5575
5576 return 2;
5577 }
5578
5579 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5580 (and DFmode for SSE2) arguments in SSE registers. */
5581 if (decl && TARGET_SSE_MATH && optimize
5582 && !(profile_flag && !flag_fentry))
5583 {
5584 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5585 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5586 if (i && i->local && i->can_change_signature)
5587 return TARGET_SSE2 ? 2 : 1;
5588 }
5589
5590 return 0;
5591 }
5592
5593 /* Return true if EAX is live at the start of the function. Used by
5594 ix86_expand_prologue to determine if we need special help before
5595 calling allocate_stack_worker. */
5596
5597 static bool
5598 ix86_eax_live_at_start_p (void)
5599 {
5600 /* Cheat. Don't bother working forward from ix86_function_regparm
5601 to the function type to whether an actual argument is located in
5602 eax. Instead just look at cfg info, which is still close enough
5603 to correct at this point. This gives false positives for broken
5604 functions that might use uninitialized data that happens to be
5605 allocated in eax, but who cares? */
5606 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
5607 }
5608
5609 static bool
5610 ix86_keep_aggregate_return_pointer (tree fntype)
5611 {
5612 tree attr;
5613
5614 if (!TARGET_64BIT)
5615 {
5616 attr = lookup_attribute ("callee_pop_aggregate_return",
5617 TYPE_ATTRIBUTES (fntype));
5618 if (attr)
5619 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5620
5621 /* For 32-bit MS-ABI the default is to keep aggregate
5622 return pointer. */
5623 if (ix86_function_type_abi (fntype) == MS_ABI)
5624 return true;
5625 }
5626 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5627 }
5628
5629 /* Value is the number of bytes of arguments automatically
5630 popped when returning from a subroutine call.
5631 FUNDECL is the declaration node of the function (as a tree),
5632 FUNTYPE is the data type of the function (as a tree),
5633 or for a library call it is an identifier node for the subroutine name.
5634 SIZE is the number of bytes of arguments passed on the stack.
5635
5636 On the 80386, the RTD insn may be used to pop them if the number
5637 of args is fixed, but if the number is variable then the caller
5638 must pop them all. RTD can't be used for library calls now
5639 because the library is compiled with the Unix compiler.
5640 Use of RTD is a selectable option, since it is incompatible with
5641 standard Unix calling sequences. If the option is not selected,
5642 the caller must always pop the args.
5643
5644 The attribute stdcall is equivalent to RTD on a per module basis. */
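/* For example (illustrative only), a non-variadic function declared

     void __attribute__ ((stdcall)) f (int a, int b);

   pops its own 8 bytes of arguments when it returns, so this hook yields
   SIZE (8) for it, whereas a plain cdecl or variadic function yields 0.  */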
5645
5646 static int
5647 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5648 {
5649 unsigned int ccvt;
5650
5651 /* None of the 64-bit ABIs pop arguments. */
5652 if (TARGET_64BIT)
5653 return 0;
5654
5655 ccvt = ix86_get_callcvt (funtype);
5656
5657 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5658 | IX86_CALLCVT_THISCALL)) != 0
5659 && ! stdarg_p (funtype))
5660 return size;
5661
5662 /* Lose any fake structure return argument if it is passed on the stack. */
5663 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5664 && !ix86_keep_aggregate_return_pointer (funtype))
5665 {
5666 int nregs = ix86_function_regparm (funtype, fundecl);
5667 if (nregs == 0)
5668 return GET_MODE_SIZE (Pmode);
5669 }
5670
5671 return 0;
5672 }
5673
5674 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5675
5676 static bool
5677 ix86_legitimate_combined_insn (rtx insn)
5678 {
5679 /* Check operand constraints in case hard registers were propagated
5680 into insn pattern. This check prevents combine pass from
5681 generating insn patterns with invalid hard register operands.
5682 These invalid insns can eventually confuse reload into erroring out
5683 with a spill failure. See also PRs 46829 and 46843. */
5684 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5685 {
5686 int i;
5687
5688 extract_insn (insn);
5689 preprocess_constraints ();
5690
5691 for (i = 0; i < recog_data.n_operands; i++)
5692 {
5693 rtx op = recog_data.operand[i];
5694 enum machine_mode mode = GET_MODE (op);
5695 struct operand_alternative *op_alt;
5696 int offset = 0;
5697 bool win;
5698 int j;
5699
5700 /* A unary operator may be accepted by the predicate, but it
5701 is irrelevant for matching constraints. */
5702 if (UNARY_P (op))
5703 op = XEXP (op, 0);
5704
5705 if (GET_CODE (op) == SUBREG)
5706 {
5707 if (REG_P (SUBREG_REG (op))
5708 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5709 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5710 GET_MODE (SUBREG_REG (op)),
5711 SUBREG_BYTE (op),
5712 GET_MODE (op));
5713 op = SUBREG_REG (op);
5714 }
5715
5716 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5717 continue;
5718
5719 op_alt = recog_op_alt[i];
5720
5721 /* Operand has no constraints, anything is OK. */
5722 win = !recog_data.n_alternatives;
5723
5724 for (j = 0; j < recog_data.n_alternatives; j++)
5725 {
5726 if (op_alt[j].anything_ok
5727 || (op_alt[j].matches != -1
5728 && operands_match_p
5729 (recog_data.operand[i],
5730 recog_data.operand[op_alt[j].matches]))
5731 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5732 {
5733 win = true;
5734 break;
5735 }
5736 }
5737
5738 if (!win)
5739 return false;
5740 }
5741 }
5742
5743 return true;
5744 }
5745 \f
5746 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
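/* AddressSanitizer maps an address to its shadow byte roughly as
   shadow = (addr >> 3) + offset, so the constants below (for instance the
   customary 0x7fff8000 on 64-bit Linux) select where the shadow region
   lives for each target variant.  */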
5747
5748 static unsigned HOST_WIDE_INT
5749 ix86_asan_shadow_offset (void)
5750 {
5751 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
5752 : HOST_WIDE_INT_C (0x7fff8000))
5753 : (HOST_WIDE_INT_1 << 29);
5754 }
5755 \f
5756 /* Argument support functions. */
5757
5758 /* Return true when register may be used to pass function parameters. */
5759 bool
5760 ix86_function_arg_regno_p (int regno)
5761 {
5762 int i;
5763 const int *parm_regs;
5764
5765 if (!TARGET_64BIT)
5766 {
5767 if (TARGET_MACHO)
5768 return (regno < REGPARM_MAX
5769 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5770 else
5771 return (regno < REGPARM_MAX
5772 || (TARGET_MMX && MMX_REGNO_P (regno)
5773 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5774 || (TARGET_SSE && SSE_REGNO_P (regno)
5775 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5776 }
5777
5778 if (TARGET_SSE && SSE_REGNO_P (regno)
5779 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5780 return true;
5781
5782 /* TODO: The function should depend on current function ABI but
5783 builtins.c would need updating then. Therefore we use the
5784 default ABI. */
5785
5786 /* RAX is used as hidden argument to va_arg functions. */
5787 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5788 return true;
5789
5790 if (ix86_abi == MS_ABI)
5791 parm_regs = x86_64_ms_abi_int_parameter_registers;
5792 else
5793 parm_regs = x86_64_int_parameter_registers;
5794 for (i = 0; i < (ix86_abi == MS_ABI
5795 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5796 if (regno == parm_regs[i])
5797 return true;
5798 return false;
5799 }
5800
5801 /* Return if we do not know how to pass TYPE solely in registers. */
5802
5803 static bool
5804 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5805 {
5806 if (must_pass_in_stack_var_size_or_pad (mode, type))
5807 return true;
5808
5809 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5810 The layout_type routine is crafty and tries to trick us into passing
5811 currently unsupported vector types on the stack by using TImode. */
5812 return (!TARGET_64BIT && mode == TImode
5813 && type && TREE_CODE (type) != VECTOR_TYPE);
5814 }
5815
5816 /* Return the size, in bytes, of the area reserved for arguments passed
5817 in registers for the function represented by FNDECL, depending on the
5818 ABI format used. */
5819 int
5820 ix86_reg_parm_stack_space (const_tree fndecl)
5821 {
5822 enum calling_abi call_abi = SYSV_ABI;
5823 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5824 call_abi = ix86_function_abi (fndecl);
5825 else
5826 call_abi = ix86_function_type_abi (fndecl);
5827 if (TARGET_64BIT && call_abi == MS_ABI)
5828 return 32;
5829 return 0;
5830 }
5831
5832 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5833 call ABI used. */
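/* For example (illustrative only), on a 64-bit SYSV target a prototype

     int f (int) __attribute__ ((ms_abi));

   makes this function report MS_ABI for f's type, while an unattributed
   prototype keeps the default ix86_abi.  */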
5834 enum calling_abi
5835 ix86_function_type_abi (const_tree fntype)
5836 {
5837 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5838 {
5839 enum calling_abi abi = ix86_abi;
5840 if (abi == SYSV_ABI)
5841 {
5842 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5843 abi = MS_ABI;
5844 }
5845 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5846 abi = SYSV_ABI;
5847 return abi;
5848 }
5849 return ix86_abi;
5850 }
5851
5852 /* We add this as a workaround in order to use libc_has_function
5853 hook in i386.md. */
5854 bool
5855 ix86_libc_has_function (enum function_class fn_class)
5856 {
5857 return targetm.libc_has_function (fn_class);
5858 }
5859
5860 static bool
5861 ix86_function_ms_hook_prologue (const_tree fn)
5862 {
5863 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5864 {
5865 if (decl_function_context (fn) != NULL_TREE)
5866 error_at (DECL_SOURCE_LOCATION (fn),
5867 "ms_hook_prologue is not compatible with nested function");
5868 else
5869 return true;
5870 }
5871 return false;
5872 }
5873
5874 static enum calling_abi
5875 ix86_function_abi (const_tree fndecl)
5876 {
5877 if (! fndecl)
5878 return ix86_abi;
5879 return ix86_function_type_abi (TREE_TYPE (fndecl));
5880 }
5881
5882 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
5883 call ABI used. */
5884 enum calling_abi
5885 ix86_cfun_abi (void)
5886 {
5887 if (! cfun)
5888 return ix86_abi;
5889 return cfun->machine->call_abi;
5890 }
5891
5892 /* Write the extra assembler code needed to declare a function properly. */
5893
5894 void
5895 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5896 tree decl)
5897 {
5898 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5899
5900 if (is_ms_hook)
5901 {
5902 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5903 unsigned int filler_cc = 0xcccccccc;
5904
5905 for (i = 0; i < filler_count; i += 4)
5906 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5907 }
5908
5909 #ifdef SUBTARGET_ASM_UNWIND_INIT
5910 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5911 #endif
5912
5913 ASM_OUTPUT_LABEL (asm_out_file, fname);
5914
5915 /* Output magic byte marker, if hot-patch attribute is set. */
5916 if (is_ms_hook)
5917 {
5918 if (TARGET_64BIT)
5919 {
5920 /* leaq [%rsp + 0], %rsp */
5921 asm_fprintf (asm_out_file, ASM_BYTE
5922 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5923 }
5924 else
5925 {
5926 /* movl.s %edi, %edi
5927 push %ebp
5928 movl.s %esp, %ebp */
5929 asm_fprintf (asm_out_file, ASM_BYTE
5930 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5931 }
5932 }
5933 }
5934
5935 /* regclass.c */
5936 extern void init_regs (void);
5937
5938 /* Implementation of the call ABI switching target hook. Set the call
5939 register sets specific to FNDECL. See also
5940 ix86_conditional_register_usage for more details. */
5941 void
5942 ix86_call_abi_override (const_tree fndecl)
5943 {
5944 if (fndecl == NULL_TREE)
5945 cfun->machine->call_abi = ix86_abi;
5946 else
5947 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5948 }
5949
5950 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
5951 the expensive re-initialization done by init_regs each time we switch function
5952 context, since it is needed only during RTL expansion. */
5953 static void
5954 ix86_maybe_switch_abi (void)
5955 {
5956 if (TARGET_64BIT &&
5957 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5958 reinit_regs ();
5959 }
5960
5961 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5962 for a call to a function whose data type is FNTYPE.
5963 For a library call, FNTYPE is 0. */
5964
5965 void
5966 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5967 tree fntype, /* tree ptr for function decl */
5968 rtx libname, /* SYMBOL_REF of library name or 0 */
5969 tree fndecl,
5970 int caller)
5971 {
5972 struct cgraph_local_info *i;
5973
5974 memset (cum, 0, sizeof (*cum));
5975
5976 if (fndecl)
5977 {
5978 i = cgraph_local_info (fndecl);
5979 cum->call_abi = ix86_function_abi (fndecl);
5980 }
5981 else
5982 {
5983 i = NULL;
5984 cum->call_abi = ix86_function_type_abi (fntype);
5985 }
5986
5987 cum->caller = caller;
5988
5989 /* Set up the number of registers to use for passing arguments. */
5990
5991 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5992 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5993 "or subtarget optimization implying it");
5994 cum->nregs = ix86_regparm;
5995 if (TARGET_64BIT)
5996 {
5997 cum->nregs = (cum->call_abi == SYSV_ABI
5998 ? X86_64_REGPARM_MAX
5999 : X86_64_MS_REGPARM_MAX);
6000 }
6001 if (TARGET_SSE)
6002 {
6003 cum->sse_nregs = SSE_REGPARM_MAX;
6004 if (TARGET_64BIT)
6005 {
6006 cum->sse_nregs = (cum->call_abi == SYSV_ABI
6007 ? X86_64_SSE_REGPARM_MAX
6008 : X86_64_MS_SSE_REGPARM_MAX);
6009 }
6010 }
6011 if (TARGET_MMX)
6012 cum->mmx_nregs = MMX_REGPARM_MAX;
6013 cum->warn_avx = true;
6014 cum->warn_sse = true;
6015 cum->warn_mmx = true;
6016
6017 /* Because the type might mismatch between caller and callee, we need to
6018 use the actual type of the function for local calls.
6019 FIXME: cgraph_analyze can be told to actually record whether a function
6020 uses va_start, so for local functions maybe_vaarg can be made more
6021 aggressive, helping K&R code.
6022 FIXME: once the type system is fixed, we won't need this code anymore. */
6023 if (i && i->local && i->can_change_signature)
6024 fntype = TREE_TYPE (fndecl);
6025 cum->maybe_vaarg = (fntype
6026 ? (!prototype_p (fntype) || stdarg_p (fntype))
6027 : !libname);
6028
6029 if (!TARGET_64BIT)
6030 {
6031 /* If there are variable arguments, then we won't pass anything
6032 in registers in 32-bit mode. */
6033 if (stdarg_p (fntype))
6034 {
6035 cum->nregs = 0;
6036 cum->sse_nregs = 0;
6037 cum->mmx_nregs = 0;
6038 cum->warn_avx = 0;
6039 cum->warn_sse = 0;
6040 cum->warn_mmx = 0;
6041 return;
6042 }
6043
6044 /* Use ecx and edx registers if function has fastcall attribute,
6045 else look for regparm information. */
6046 if (fntype)
6047 {
6048 unsigned int ccvt = ix86_get_callcvt (fntype);
6049 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6050 {
6051 cum->nregs = 1;
6052 cum->fastcall = 1; /* Same first register as in fastcall. */
6053 }
6054 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6055 {
6056 cum->nregs = 2;
6057 cum->fastcall = 1;
6058 }
6059 else
6060 cum->nregs = ix86_function_regparm (fntype, fndecl);
6061 }
6062
6063 /* Set up the number of SSE registers used for passing SFmode
6064 and DFmode arguments. Warn for mismatching ABI. */
6065 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
6066 }
6067 }
6068
6069 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
6070 But in the case of vector types, it is some vector mode.
6071
6072 When we have only some of our vector isa extensions enabled, then there
6073 are some modes for which vector_mode_supported_p is false. For these
6074 modes, the generic vector support in gcc will choose some non-vector mode
6075 in order to implement the type. By computing the natural mode, we'll
6076 select the proper ABI location for the operand and not depend on whatever
6077 the middle-end decides to do with these vector types.
6078
6079 The middle-end can't deal with vector types larger than 16 bytes. In this
6080 case, we return the original mode and warn ABI change if CUM isn't
6081 NULL. */
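/* For example (illustrative only), with -mno-avx a 32-byte argument such as
   "int __attribute__ ((vector_size (32)))" keeps whatever non-vector mode
   the middle end chose for it and triggers the "AVX vector argument without
   AVX enabled changes the ABI" warning below, while a 16-byte float vector
   with SSE enabled is given its natural V4SFmode.  */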
6082
6083 static enum machine_mode
6084 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
6085 {
6086 enum machine_mode mode = TYPE_MODE (type);
6087
6088 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
6089 {
6090 HOST_WIDE_INT size = int_size_in_bytes (type);
6091 if ((size == 8 || size == 16 || size == 32)
6092 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
6093 && TYPE_VECTOR_SUBPARTS (type) > 1)
6094 {
6095 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
6096
6097 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
6098 mode = MIN_MODE_VECTOR_FLOAT;
6099 else
6100 mode = MIN_MODE_VECTOR_INT;
6101
6102 /* Get the mode which has this inner mode and number of units. */
6103 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
6104 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
6105 && GET_MODE_INNER (mode) == innermode)
6106 {
6107 if (size == 32 && !TARGET_AVX)
6108 {
6109 static bool warnedavx;
6110
6111 if (cum
6112 && !warnedavx
6113 && cum->warn_avx)
6114 {
6115 warnedavx = true;
6116 warning (0, "AVX vector argument without AVX "
6117 "enabled changes the ABI");
6118 }
6119 return TYPE_MODE (type);
6120 }
6121 else if ((size == 8 || size == 16) && !TARGET_SSE)
6122 {
6123 static bool warnedsse;
6124
6125 if (cum
6126 && !warnedsse
6127 && cum->warn_sse)
6128 {
6129 warnedsse = true;
6130 warning (0, "SSE vector argument without SSE "
6131 "enabled changes the ABI");
6132 }
6133 return mode;
6134 }
6135 else
6136 return mode;
6137 }
6138
6139 gcc_unreachable ();
6140 }
6141 }
6142
6143 return mode;
6144 }
6145
6146 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
6147 this may not agree with the mode that the type system has chosen for the
6148 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
6149 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
6150
6151 static rtx
6152 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
6153 unsigned int regno)
6154 {
6155 rtx tmp;
6156
6157 if (orig_mode != BLKmode)
6158 tmp = gen_rtx_REG (orig_mode, regno);
6159 else
6160 {
6161 tmp = gen_rtx_REG (mode, regno);
6162 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
6163 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
6164 }
6165
6166 return tmp;
6167 }
6168
6169 /* x86-64 register passing implementation. See the x86-64 ABI for details. The
6170 goal of this code is to classify each 8-byte chunk of an incoming argument by
6171 register class and assign registers accordingly. */
6172
6173 /* Return the union class of CLASS1 and CLASS2.
6174 See the x86-64 PS ABI for details. */
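/* For example (illustrative only), merging X86_64_SSESF_CLASS with
   X86_64_INTEGERSI_CLASS yields X86_64_INTEGERSI_CLASS (rule #4), which is
   why a struct { float f; int i; } argument ends up being passed in a
   single general-purpose register.  */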
6175
6176 static enum x86_64_reg_class
6177 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
6178 {
6179 /* Rule #1: If both classes are equal, this is the resulting class. */
6180 if (class1 == class2)
6181 return class1;
6182
6183 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
6184 the other class. */
6185 if (class1 == X86_64_NO_CLASS)
6186 return class2;
6187 if (class2 == X86_64_NO_CLASS)
6188 return class1;
6189
6190 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
6191 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
6192 return X86_64_MEMORY_CLASS;
6193
6194 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
6195 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
6196 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
6197 return X86_64_INTEGERSI_CLASS;
6198 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
6199 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
6200 return X86_64_INTEGER_CLASS;
6201
6202 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
6203 MEMORY is used. */
6204 if (class1 == X86_64_X87_CLASS
6205 || class1 == X86_64_X87UP_CLASS
6206 || class1 == X86_64_COMPLEX_X87_CLASS
6207 || class2 == X86_64_X87_CLASS
6208 || class2 == X86_64_X87UP_CLASS
6209 || class2 == X86_64_COMPLEX_X87_CLASS)
6210 return X86_64_MEMORY_CLASS;
6211
6212 /* Rule #6: Otherwise class SSE is used. */
6213 return X86_64_SSE_CLASS;
6214 }
6215
6216 /* Classify the argument of type TYPE and mode MODE.
6217 CLASSES will be filled by the register class used to pass each word
6218 of the operand. The number of words is returned. In case the parameter
6219 should be passed in memory, 0 is returned. As a special case for zero
6220 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6221
6222 BIT_OFFSET is used internally for handling records; it gives the offset
6223 of the argument within the enclosing record, in bits modulo 256, to avoid overflow cases.
6224
6225 See the x86-64 PS ABI for details.
6226 */
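/* For example (illustrative only), a struct { long l; double d; } argument
   spans two eightbytes and is classified roughly as { X86_64_INTEGER_CLASS,
   X86_64_SSEDF_CLASS }, so L travels in a general-purpose register and D in
   an SSE register; making either field a long double would force the whole
   struct into memory per the X87 merge rules above.  */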
6227
6228 static int
6229 classify_argument (enum machine_mode mode, const_tree type,
6230 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6231 {
6232 HOST_WIDE_INT bytes =
6233 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6234 int words
6235 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6236
6237 /* Variable sized entities are always passed/returned in memory. */
6238 if (bytes < 0)
6239 return 0;
6240
6241 if (mode != VOIDmode
6242 && targetm.calls.must_pass_in_stack (mode, type))
6243 return 0;
6244
6245 if (type && AGGREGATE_TYPE_P (type))
6246 {
6247 int i;
6248 tree field;
6249 enum x86_64_reg_class subclasses[MAX_CLASSES];
6250
6251 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6252 if (bytes > 32)
6253 return 0;
6254
6255 for (i = 0; i < words; i++)
6256 classes[i] = X86_64_NO_CLASS;
6257
6258 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
6259 signal the memory class, so handle them as a special case. */
6260 if (!words)
6261 {
6262 classes[0] = X86_64_NO_CLASS;
6263 return 1;
6264 }
6265
6266 /* Classify each field of record and merge classes. */
6267 switch (TREE_CODE (type))
6268 {
6269 case RECORD_TYPE:
6270 /* And now merge the fields of structure. */
6271 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6272 {
6273 if (TREE_CODE (field) == FIELD_DECL)
6274 {
6275 int num;
6276
6277 if (TREE_TYPE (field) == error_mark_node)
6278 continue;
6279
6280 /* Bitfields are always classified as integer. Handle them
6281 early, since later code would consider them to be
6282 misaligned integers. */
6283 if (DECL_BIT_FIELD (field))
6284 {
6285 for (i = (int_bit_position (field)
6286 + (bit_offset % 64)) / 8 / 8;
6287 i < ((int_bit_position (field) + (bit_offset % 64))
6288 + tree_to_shwi (DECL_SIZE (field))
6289 + 63) / 8 / 8; i++)
6290 classes[i] =
6291 merge_classes (X86_64_INTEGER_CLASS,
6292 classes[i]);
6293 }
6294 else
6295 {
6296 int pos;
6297
6298 type = TREE_TYPE (field);
6299
6300 /* Flexible array member is ignored. */
6301 if (TYPE_MODE (type) == BLKmode
6302 && TREE_CODE (type) == ARRAY_TYPE
6303 && TYPE_SIZE (type) == NULL_TREE
6304 && TYPE_DOMAIN (type) != NULL_TREE
6305 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6306 == NULL_TREE))
6307 {
6308 static bool warned;
6309
6310 if (!warned && warn_psabi)
6311 {
6312 warned = true;
6313 inform (input_location,
6314 "the ABI of passing struct with"
6315 " a flexible array member has"
6316 " changed in GCC 4.4");
6317 }
6318 continue;
6319 }
6320 num = classify_argument (TYPE_MODE (type), type,
6321 subclasses,
6322 (int_bit_position (field)
6323 + bit_offset) % 256);
6324 if (!num)
6325 return 0;
6326 pos = (int_bit_position (field)
6327 + (bit_offset % 64)) / 8 / 8;
6328 for (i = 0; i < num && (i + pos) < words; i++)
6329 classes[i + pos] =
6330 merge_classes (subclasses[i], classes[i + pos]);
6331 }
6332 }
6333 }
6334 break;
6335
6336 case ARRAY_TYPE:
6337 /* Arrays are handled as small records. */
6338 {
6339 int num;
6340 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6341 TREE_TYPE (type), subclasses, bit_offset);
6342 if (!num)
6343 return 0;
6344
6345 /* The partial classes are now full classes. */
6346 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6347 subclasses[0] = X86_64_SSE_CLASS;
6348 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6349 && !((bit_offset % 64) == 0 && bytes == 4))
6350 subclasses[0] = X86_64_INTEGER_CLASS;
6351
6352 for (i = 0; i < words; i++)
6353 classes[i] = subclasses[i % num];
6354
6355 break;
6356 }
6357 case UNION_TYPE:
6358 case QUAL_UNION_TYPE:
6359 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
6361 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6362 {
6363 if (TREE_CODE (field) == FIELD_DECL)
6364 {
6365 int num;
6366
6367 if (TREE_TYPE (field) == error_mark_node)
6368 continue;
6369
6370 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6371 TREE_TYPE (field), subclasses,
6372 bit_offset);
6373 if (!num)
6374 return 0;
6375 for (i = 0; i < num; i++)
6376 classes[i] = merge_classes (subclasses[i], classes[i]);
6377 }
6378 }
6379 break;
6380
6381 default:
6382 gcc_unreachable ();
6383 }
6384
6385 if (words > 2)
6386 {
6387 /* When the size is larger than 16 bytes, if the first class isn't
6388 X86_64_SSE_CLASS or any of the remaining classes isn't
6389 X86_64_SSEUP_CLASS, everything should be passed in
6390 memory. */
6391 if (classes[0] != X86_64_SSE_CLASS)
6392 return 0;
6393
6394 for (i = 1; i < words; i++)
6395 if (classes[i] != X86_64_SSEUP_CLASS)
6396 return 0;
6397 }
6398
6399 /* Final merger cleanup. */
6400 for (i = 0; i < words; i++)
6401 {
6402 /* If one class is MEMORY, everything should be passed in
6403 memory. */
6404 if (classes[i] == X86_64_MEMORY_CLASS)
6405 return 0;
6406
6407 /* X86_64_SSEUP_CLASS should always be preceded by
6408 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6409 if (classes[i] == X86_64_SSEUP_CLASS
6410 && classes[i - 1] != X86_64_SSE_CLASS
6411 && classes[i - 1] != X86_64_SSEUP_CLASS)
6412 {
6413 /* The first one should never be X86_64_SSEUP_CLASS. */
6414 gcc_assert (i != 0);
6415 classes[i] = X86_64_SSE_CLASS;
6416 }
6417
6418 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6419 everything should be passed in memory. */
6420 if (classes[i] == X86_64_X87UP_CLASS
6421 && (classes[i - 1] != X86_64_X87_CLASS))
6422 {
6423 static bool warned;
6424
6425 /* The first one should never be X86_64_X87UP_CLASS. */
6426 gcc_assert (i != 0);
6427 if (!warned && warn_psabi)
6428 {
6429 warned = true;
6430 inform (input_location,
6431 "the ABI of passing union with long double"
6432 " has changed in GCC 4.4");
6433 }
6434 return 0;
6435 }
6436 }
6437 return words;
6438 }
6439
6440 /* Compute the alignment needed. We align all types to their natural
6441 boundaries, with the exception of XFmode, which is aligned to 64 bits. */
6442 if (mode != VOIDmode && mode != BLKmode)
6443 {
6444 int mode_alignment = GET_MODE_BITSIZE (mode);
6445
6446 if (mode == XFmode)
6447 mode_alignment = 128;
6448 else if (mode == XCmode)
6449 mode_alignment = 256;
6450 if (COMPLEX_MODE_P (mode))
6451 mode_alignment /= 2;
6452 /* Misaligned fields are always returned in memory. */
6453 if (bit_offset % mode_alignment)
6454 return 0;
6455 }
6456
6457 /* For V1xx modes, just use the base mode. */
6458 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6459 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6460 mode = GET_MODE_INNER (mode);
6461
6462 /* Classification of atomic types. */
6463 switch (mode)
6464 {
6465 case SDmode:
6466 case DDmode:
6467 classes[0] = X86_64_SSE_CLASS;
6468 return 1;
6469 case TDmode:
6470 classes[0] = X86_64_SSE_CLASS;
6471 classes[1] = X86_64_SSEUP_CLASS;
6472 return 2;
6473 case DImode:
6474 case SImode:
6475 case HImode:
6476 case QImode:
6477 case CSImode:
6478 case CHImode:
6479 case CQImode:
6480 {
6481 int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6482
6483 if (size <= 32)
6484 {
6485 classes[0] = X86_64_INTEGERSI_CLASS;
6486 return 1;
6487 }
6488 else if (size <= 64)
6489 {
6490 classes[0] = X86_64_INTEGER_CLASS;
6491 return 1;
6492 }
6493 else if (size <= 64+32)
6494 {
6495 classes[0] = X86_64_INTEGER_CLASS;
6496 classes[1] = X86_64_INTEGERSI_CLASS;
6497 return 2;
6498 }
6499 else if (size <= 64+64)
6500 {
6501 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6502 return 2;
6503 }
6504 else
6505 gcc_unreachable ();
6506 }
6507 case CDImode:
6508 case TImode:
6509 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6510 return 2;
6511 case COImode:
6512 case OImode:
6513 /* OImode shouldn't be used directly. */
6514 gcc_unreachable ();
6515 case CTImode:
6516 return 0;
6517 case SFmode:
6518 if (!(bit_offset % 64))
6519 classes[0] = X86_64_SSESF_CLASS;
6520 else
6521 classes[0] = X86_64_SSE_CLASS;
6522 return 1;
6523 case DFmode:
6524 classes[0] = X86_64_SSEDF_CLASS;
6525 return 1;
6526 case XFmode:
6527 classes[0] = X86_64_X87_CLASS;
6528 classes[1] = X86_64_X87UP_CLASS;
6529 return 2;
6530 case TFmode:
6531 classes[0] = X86_64_SSE_CLASS;
6532 classes[1] = X86_64_SSEUP_CLASS;
6533 return 2;
6534 case SCmode:
6535 classes[0] = X86_64_SSE_CLASS;
6536 if (!(bit_offset % 64))
6537 return 1;
6538 else
6539 {
6540 static bool warned;
6541
6542 if (!warned && warn_psabi)
6543 {
6544 warned = true;
6545 inform (input_location,
6546 "the ABI of passing structure with complex float"
6547 " member has changed in GCC 4.4");
6548 }
6549 classes[1] = X86_64_SSESF_CLASS;
6550 return 2;
6551 }
6552 case DCmode:
6553 classes[0] = X86_64_SSEDF_CLASS;
6554 classes[1] = X86_64_SSEDF_CLASS;
6555 return 2;
6556 case XCmode:
6557 classes[0] = X86_64_COMPLEX_X87_CLASS;
6558 return 1;
6559 case TCmode:
6560 /* This mode is larger than 16 bytes. */
6561 return 0;
6562 case V8SFmode:
6563 case V8SImode:
6564 case V32QImode:
6565 case V16HImode:
6566 case V4DFmode:
6567 case V4DImode:
6568 classes[0] = X86_64_SSE_CLASS;
6569 classes[1] = X86_64_SSEUP_CLASS;
6570 classes[2] = X86_64_SSEUP_CLASS;
6571 classes[3] = X86_64_SSEUP_CLASS;
6572 return 4;
6573 case V4SFmode:
6574 case V4SImode:
6575 case V16QImode:
6576 case V8HImode:
6577 case V2DFmode:
6578 case V2DImode:
6579 classes[0] = X86_64_SSE_CLASS;
6580 classes[1] = X86_64_SSEUP_CLASS;
6581 return 2;
6582 case V1TImode:
6583 case V1DImode:
6584 case V2SFmode:
6585 case V2SImode:
6586 case V4HImode:
6587 case V8QImode:
6588 classes[0] = X86_64_SSE_CLASS;
6589 return 1;
6590 case BLKmode:
6591 case VOIDmode:
6592 return 0;
6593 default:
6594 gcc_assert (VECTOR_MODE_P (mode));
6595
6596 if (bytes > 16)
6597 return 0;
6598
6599 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6600
6601 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6602 classes[0] = X86_64_INTEGERSI_CLASS;
6603 else
6604 classes[0] = X86_64_INTEGER_CLASS;
6605 classes[1] = X86_64_INTEGER_CLASS;
6606 return 1 + (bytes > 8);
6607 }
6608 }
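
/* Editor's illustration (not part of GCC, type name hypothetical): a worked
   example of the classification above. Under the x86-64 psABI rules
   implemented in classify_argument, the struct below occupies two eightbytes;
   the first is classified X86_64_INTEGER_CLASS and the second
   X86_64_SSEDF_CLASS, so the argument is passed in one general-purpose
   register and one SSE register rather than in memory. */
#if 0
struct example_arg          /* hypothetical, for illustration only */
{
  long   l;                 /* bytes 0-7  -> X86_64_INTEGER_CLASS */
  double d;                 /* bytes 8-15 -> X86_64_SSEDF_CLASS   */
};
#endif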
6609
6610 /* Examine the argument and set the number of registers required in each
6611 class. Return 0 iff the parameter should be passed in memory. */
6612 static int
6613 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6614 int *int_nregs, int *sse_nregs)
6615 {
6616 enum x86_64_reg_class regclass[MAX_CLASSES];
6617 int n = classify_argument (mode, type, regclass, 0);
6618
6619 *int_nregs = 0;
6620 *sse_nregs = 0;
6621 if (!n)
6622 return 0;
6623 for (n--; n >= 0; n--)
6624 switch (regclass[n])
6625 {
6626 case X86_64_INTEGER_CLASS:
6627 case X86_64_INTEGERSI_CLASS:
6628 (*int_nregs)++;
6629 break;
6630 case X86_64_SSE_CLASS:
6631 case X86_64_SSESF_CLASS:
6632 case X86_64_SSEDF_CLASS:
6633 (*sse_nregs)++;
6634 break;
6635 case X86_64_NO_CLASS:
6636 case X86_64_SSEUP_CLASS:
6637 break;
6638 case X86_64_X87_CLASS:
6639 case X86_64_X87UP_CLASS:
6640 if (!in_return)
6641 return 0;
6642 break;
6643 case X86_64_COMPLEX_X87_CLASS:
6644 return in_return ? 2 : 0;
6645 case X86_64_MEMORY_CLASS:
6646 gcc_unreachable ();
6647 }
6648 return 1;
6649 }
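
/* Editor's sketch (not part of GCC): how examine_argument is meant to be
   consumed. The variable names and the hypothetical TYPE are illustrative
   only. */
#if 0
  int int_nregs, sse_nregs;

  /* TYPE is a hypothetical 16-byte struct of one long followed by one
     double, classified as { INTEGER, SSEDF } by classify_argument.  */
  if (examine_argument (TYPE_MODE (type), type, /*in_return=*/0,
			&int_nregs, &sse_nregs))
    {
      /* int_nregs == 1 and sse_nregs == 1: the argument fits in one
	 general-purpose and one SSE register.  */
    }
  else
    {
      /* A return value of 0 would have meant "pass in memory".  */
    }
#endif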
6650
6651 /* Construct container for the argument used by GCC interface. See
6652 FUNCTION_ARG for the detailed description. */
6653
6654 static rtx
6655 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6656 const_tree type, int in_return, int nintregs, int nsseregs,
6657 const int *intreg, int sse_regno)
6658 {
6659 /* The following variables hold the static issued_error state. */
6660 static bool issued_sse_arg_error;
6661 static bool issued_sse_ret_error;
6662 static bool issued_x87_ret_error;
6663
6664 enum machine_mode tmpmode;
6665 int bytes =
6666 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6667 enum x86_64_reg_class regclass[MAX_CLASSES];
6668 int n;
6669 int i;
6670 int nexps = 0;
6671 int needed_sseregs, needed_intregs;
6672 rtx exp[MAX_CLASSES];
6673 rtx ret;
6674
6675 n = classify_argument (mode, type, regclass, 0);
6676 if (!n)
6677 return NULL;
6678 if (!examine_argument (mode, type, in_return, &needed_intregs,
6679 &needed_sseregs))
6680 return NULL;
6681 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6682 return NULL;
6683
6684 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6685 some less clueful developer tries to use floating-point anyway. */
6686 if (needed_sseregs && !TARGET_SSE)
6687 {
6688 if (in_return)
6689 {
6690 if (!issued_sse_ret_error)
6691 {
6692 error ("SSE register return with SSE disabled");
6693 issued_sse_ret_error = true;
6694 }
6695 }
6696 else if (!issued_sse_arg_error)
6697 {
6698 error ("SSE register argument with SSE disabled");
6699 issued_sse_arg_error = true;
6700 }
6701 return NULL;
6702 }
6703
6704 /* Likewise, error if the ABI requires us to return values in the
6705 x87 registers and the user specified -mno-80387. */
6706 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
6707 for (i = 0; i < n; i++)
6708 if (regclass[i] == X86_64_X87_CLASS
6709 || regclass[i] == X86_64_X87UP_CLASS
6710 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6711 {
6712 if (!issued_x87_ret_error)
6713 {
6714 error ("x87 register return with x87 disabled");
6715 issued_x87_ret_error = true;
6716 }
6717 return NULL;
6718 }
6719
6720 /* First construct simple cases. Avoid SCmode, since we want to use
6721 a single register to pass this type. */
6722 if (n == 1 && mode != SCmode)
6723 switch (regclass[0])
6724 {
6725 case X86_64_INTEGER_CLASS:
6726 case X86_64_INTEGERSI_CLASS:
6727 return gen_rtx_REG (mode, intreg[0]);
6728 case X86_64_SSE_CLASS:
6729 case X86_64_SSESF_CLASS:
6730 case X86_64_SSEDF_CLASS:
6731 if (mode != BLKmode)
6732 return gen_reg_or_parallel (mode, orig_mode,
6733 SSE_REGNO (sse_regno));
6734 break;
6735 case X86_64_X87_CLASS:
6736 case X86_64_COMPLEX_X87_CLASS:
6737 return gen_rtx_REG (mode, FIRST_STACK_REG);
6738 case X86_64_NO_CLASS:
6739 /* Zero sized array, struct or class. */
6740 return NULL;
6741 default:
6742 gcc_unreachable ();
6743 }
6744 if (n == 2
6745 && regclass[0] == X86_64_SSE_CLASS
6746 && regclass[1] == X86_64_SSEUP_CLASS
6747 && mode != BLKmode)
6748 return gen_reg_or_parallel (mode, orig_mode,
6749 SSE_REGNO (sse_regno));
6750 if (n == 4
6751 && regclass[0] == X86_64_SSE_CLASS
6752 && regclass[1] == X86_64_SSEUP_CLASS
6753 && regclass[2] == X86_64_SSEUP_CLASS
6754 && regclass[3] == X86_64_SSEUP_CLASS
6755 && mode != BLKmode)
6756 return gen_reg_or_parallel (mode, orig_mode,
6757 SSE_REGNO (sse_regno));
6758 if (n == 2
6759 && regclass[0] == X86_64_X87_CLASS
6760 && regclass[1] == X86_64_X87UP_CLASS)
6761 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6762
6763 if (n == 2
6764 && regclass[0] == X86_64_INTEGER_CLASS
6765 && regclass[1] == X86_64_INTEGER_CLASS
6766 && (mode == CDImode || mode == TImode || mode == TFmode)
6767 && intreg[0] + 1 == intreg[1])
6768 return gen_rtx_REG (mode, intreg[0]);
6769
6770 /* Otherwise figure out the entries of the PARALLEL. */
6771 for (i = 0; i < n; i++)
6772 {
6773 int pos;
6774
6775 switch (regclass[i])
6776 {
6777 case X86_64_NO_CLASS:
6778 break;
6779 case X86_64_INTEGER_CLASS:
6780 case X86_64_INTEGERSI_CLASS:
6781 /* Merge TImodes on aligned occasions here too. */
6782 if (i * 8 + 8 > bytes)
6783 tmpmode
6784 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6785 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6786 tmpmode = SImode;
6787 else
6788 tmpmode = DImode;
6789 /* We've requested 24 bytes that we
6790 don't have a mode for. Use DImode. */
6791 if (tmpmode == BLKmode)
6792 tmpmode = DImode;
6793 exp [nexps++]
6794 = gen_rtx_EXPR_LIST (VOIDmode,
6795 gen_rtx_REG (tmpmode, *intreg),
6796 GEN_INT (i*8));
6797 intreg++;
6798 break;
6799 case X86_64_SSESF_CLASS:
6800 exp [nexps++]
6801 = gen_rtx_EXPR_LIST (VOIDmode,
6802 gen_rtx_REG (SFmode,
6803 SSE_REGNO (sse_regno)),
6804 GEN_INT (i*8));
6805 sse_regno++;
6806 break;
6807 case X86_64_SSEDF_CLASS:
6808 exp [nexps++]
6809 = gen_rtx_EXPR_LIST (VOIDmode,
6810 gen_rtx_REG (DFmode,
6811 SSE_REGNO (sse_regno)),
6812 GEN_INT (i*8));
6813 sse_regno++;
6814 break;
6815 case X86_64_SSE_CLASS:
6816 pos = i;
6817 switch (n)
6818 {
6819 case 1:
6820 tmpmode = DImode;
6821 break;
6822 case 2:
6823 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6824 {
6825 tmpmode = TImode;
6826 i++;
6827 }
6828 else
6829 tmpmode = DImode;
6830 break;
6831 case 4:
6832 gcc_assert (i == 0
6833 && regclass[1] == X86_64_SSEUP_CLASS
6834 && regclass[2] == X86_64_SSEUP_CLASS
6835 && regclass[3] == X86_64_SSEUP_CLASS);
6836 tmpmode = OImode;
6837 i += 3;
6838 break;
6839 default:
6840 gcc_unreachable ();
6841 }
6842 exp [nexps++]
6843 = gen_rtx_EXPR_LIST (VOIDmode,
6844 gen_rtx_REG (tmpmode,
6845 SSE_REGNO (sse_regno)),
6846 GEN_INT (pos*8));
6847 sse_regno++;
6848 break;
6849 default:
6850 gcc_unreachable ();
6851 }
6852 }
6853
6854 /* Empty aligned struct, union or class. */
6855 if (nexps == 0)
6856 return NULL;
6857
6858 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6859 for (i = 0; i < nexps; i++)
6860 XVECEXP (ret, 0, i) = exp [i];
6861 return ret;
6862 }
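
/* Editor's illustration (not part of GCC): for an argument classified as
   { INTEGER, SSEDF } -- for instance a struct of one long followed by one
   double -- construct_container builds a PARALLEL roughly of the form

     (parallel [(expr_list (reg:DI di) (const_int 0))
		(expr_list (reg:DF xmm0) (const_int 8))])

   i.e. the first eightbyte lives in a general-purpose register and the
   second in an SSE register, each expr_list recording its byte offset
   within the argument.  The register names above are only an example; the
   actual registers come from the INTREG array and SSE_REGNO.  */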
6863
6864 /* Update the data in CUM to advance over an argument of mode MODE
6865 and data type TYPE. (TYPE is null for libcalls where that information
6866 may not be available.) */
6867
6868 static void
6869 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6870 const_tree type, HOST_WIDE_INT bytes,
6871 HOST_WIDE_INT words)
6872 {
6873 switch (mode)
6874 {
6875 default:
6876 break;
6877
6878 case BLKmode:
6879 if (bytes < 0)
6880 break;
6881 /* FALLTHRU */
6882
6883 case DImode:
6884 case SImode:
6885 case HImode:
6886 case QImode:
6887 cum->words += words;
6888 cum->nregs -= words;
6889 cum->regno += words;
6890
6891 if (cum->nregs <= 0)
6892 {
6893 cum->nregs = 0;
6894 cum->regno = 0;
6895 }
6896 break;
6897
6898 case OImode:
6899 /* OImode shouldn't be used directly. */
6900 gcc_unreachable ();
6901
6902 case DFmode:
6903 if (cum->float_in_sse < 2)
6904 break;
6905 case SFmode:
6906 if (cum->float_in_sse < 1)
6907 break;
6908 /* FALLTHRU */
6909
6910 case V8SFmode:
6911 case V8SImode:
6912 case V32QImode:
6913 case V16HImode:
6914 case V4DFmode:
6915 case V4DImode:
6916 case TImode:
6917 case V16QImode:
6918 case V8HImode:
6919 case V4SImode:
6920 case V2DImode:
6921 case V4SFmode:
6922 case V2DFmode:
6923 if (!type || !AGGREGATE_TYPE_P (type))
6924 {
6925 cum->sse_words += words;
6926 cum->sse_nregs -= 1;
6927 cum->sse_regno += 1;
6928 if (cum->sse_nregs <= 0)
6929 {
6930 cum->sse_nregs = 0;
6931 cum->sse_regno = 0;
6932 }
6933 }
6934 break;
6935
6936 case V8QImode:
6937 case V4HImode:
6938 case V2SImode:
6939 case V2SFmode:
6940 case V1TImode:
6941 case V1DImode:
6942 if (!type || !AGGREGATE_TYPE_P (type))
6943 {
6944 cum->mmx_words += words;
6945 cum->mmx_nregs -= 1;
6946 cum->mmx_regno += 1;
6947 if (cum->mmx_nregs <= 0)
6948 {
6949 cum->mmx_nregs = 0;
6950 cum->mmx_regno = 0;
6951 }
6952 }
6953 break;
6954 }
6955 }
6956
6957 static void
6958 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6959 const_tree type, HOST_WIDE_INT words, bool named)
6960 {
6961 int int_nregs, sse_nregs;
6962
6963 /* Unnamed 256bit vector mode parameters are passed on stack. */
6964 if (!named && VALID_AVX256_REG_MODE (mode))
6965 return;
6966
6967 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6968 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6969 {
6970 cum->nregs -= int_nregs;
6971 cum->sse_nregs -= sse_nregs;
6972 cum->regno += int_nregs;
6973 cum->sse_regno += sse_nregs;
6974 }
6975 else
6976 {
6977 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6978 cum->words = (cum->words + align - 1) & ~(align - 1);
6979 cum->words += words;
6980 }
6981 }
6982
6983 static void
6984 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6985 HOST_WIDE_INT words)
6986 {
6987 /* Otherwise, this should be passed indirectly. */
6988 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6989
6990 cum->words += words;
6991 if (cum->nregs > 0)
6992 {
6993 cum->nregs -= 1;
6994 cum->regno += 1;
6995 }
6996 }
6997
6998 /* Update the data in CUM to advance over an argument of mode MODE and
6999 data type TYPE. (TYPE is null for libcalls where that information
7000 may not be available.) */
7001
7002 static void
7003 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
7004 const_tree type, bool named)
7005 {
7006 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7007 HOST_WIDE_INT bytes, words;
7008
7009 if (mode == BLKmode)
7010 bytes = int_size_in_bytes (type);
7011 else
7012 bytes = GET_MODE_SIZE (mode);
7013 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7014
7015 if (type)
7016 mode = type_natural_mode (type, NULL);
7017
7018 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7019 function_arg_advance_ms_64 (cum, bytes, words);
7020 else if (TARGET_64BIT)
7021 function_arg_advance_64 (cum, mode, type, words, named);
7022 else
7023 function_arg_advance_32 (cum, mode, type, bytes, words);
7024 }
7025
7026 /* Define where to put the arguments to a function.
7027 Value is zero to push the argument on the stack,
7028 or a hard register in which to store the argument.
7029
7030 MODE is the argument's machine mode.
7031 TYPE is the data type of the argument (as a tree).
7032 This is null for libcalls where that information may
7033 not be available.
7034 CUM is a variable of type CUMULATIVE_ARGS which gives info about
7035 the preceding args and about the function being called.
7036 NAMED is nonzero if this argument is a named parameter
7037 (otherwise it is an extra parameter matching an ellipsis). */
7038
7039 static rtx
7040 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7041 enum machine_mode orig_mode, const_tree type,
7042 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
7043 {
7044 static bool warnedsse, warnedmmx;
7045
7046 /* Avoid the AL settings for the Unix64 ABI. */
7047 if (mode == VOIDmode)
7048 return constm1_rtx;
7049
7050 switch (mode)
7051 {
7052 default:
7053 break;
7054
7055 case BLKmode:
7056 if (bytes < 0)
7057 break;
7058 /* FALLTHRU */
7059 case DImode:
7060 case SImode:
7061 case HImode:
7062 case QImode:
7063 if (words <= cum->nregs)
7064 {
7065 int regno = cum->regno;
7066
7067 /* Fastcall allocates the first two DWORD (SImode) or
7068 smaller arguments to ECX and EDX if the argument isn't an
7069 aggregate type. */
7070 if (cum->fastcall)
7071 {
7072 if (mode == BLKmode
7073 || mode == DImode
7074 || (type && AGGREGATE_TYPE_P (type)))
7075 break;
7076
7077 /* ECX, not EAX, is the first allocated register. */
7078 if (regno == AX_REG)
7079 regno = CX_REG;
7080 }
7081 return gen_rtx_REG (mode, regno);
7082 }
7083 break;
7084
7085 case DFmode:
7086 if (cum->float_in_sse < 2)
7087 break;
7088 case SFmode:
7089 if (cum->float_in_sse < 1)
7090 break;
7091 /* FALLTHRU */
7092 case TImode:
7093 /* In 32bit, we pass TImode in xmm registers. */
7094 case V16QImode:
7095 case V8HImode:
7096 case V4SImode:
7097 case V2DImode:
7098 case V4SFmode:
7099 case V2DFmode:
7100 if (!type || !AGGREGATE_TYPE_P (type))
7101 {
7102 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
7103 {
7104 warnedsse = true;
7105 warning (0, "SSE vector argument without SSE enabled "
7106 "changes the ABI");
7107 }
7108 if (cum->sse_nregs)
7109 return gen_reg_or_parallel (mode, orig_mode,
7110 cum->sse_regno + FIRST_SSE_REG);
7111 }
7112 break;
7113
7114 case OImode:
7115 /* OImode shouldn't be used directly. */
7116 gcc_unreachable ();
7117
7118 case V8SFmode:
7119 case V8SImode:
7120 case V32QImode:
7121 case V16HImode:
7122 case V4DFmode:
7123 case V4DImode:
7124 if (!type || !AGGREGATE_TYPE_P (type))
7125 {
7126 if (cum->sse_nregs)
7127 return gen_reg_or_parallel (mode, orig_mode,
7128 cum->sse_regno + FIRST_SSE_REG);
7129 }
7130 break;
7131
7132 case V8QImode:
7133 case V4HImode:
7134 case V2SImode:
7135 case V2SFmode:
7136 case V1TImode:
7137 case V1DImode:
7138 if (!type || !AGGREGATE_TYPE_P (type))
7139 {
7140 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
7141 {
7142 warnedmmx = true;
7143 warning (0, "MMX vector argument without MMX enabled "
7144 "changes the ABI");
7145 }
7146 if (cum->mmx_nregs)
7147 return gen_reg_or_parallel (mode, orig_mode,
7148 cum->mmx_regno + FIRST_MMX_REG);
7149 }
7150 break;
7151 }
7152
7153 return NULL_RTX;
7154 }
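
/* Editor's illustration (not part of GCC, declaration hypothetical): the
   fastcall handling above in caller-visible form. For a prototype such as
   the one below, the first two DWORD-sized or smaller non-aggregate
   arguments go in ECX and EDX, and the rest are pushed on the stack. */
#if 0
extern int __attribute__ ((fastcall)) f (int a, int b, int c);
/* a -> %ecx, b -> %edx, c -> stack */
#endif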
7155
7156 static rtx
7157 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7158 enum machine_mode orig_mode, const_tree type, bool named)
7159 {
7160 /* Handle a hidden AL argument containing the number of registers
7161 for varargs x86-64 functions. */
7162 if (mode == VOIDmode)
7163 return GEN_INT (cum->maybe_vaarg
7164 ? (cum->sse_nregs < 0
7165 ? X86_64_SSE_REGPARM_MAX
7166 : cum->sse_regno)
7167 : -1);
7168
7169 switch (mode)
7170 {
7171 default:
7172 break;
7173
7174 case V8SFmode:
7175 case V8SImode:
7176 case V32QImode:
7177 case V16HImode:
7178 case V4DFmode:
7179 case V4DImode:
7180 /* Unnamed 256bit vector mode parameters are passed on stack. */
7181 if (!named)
7182 return NULL;
7183 break;
7184 }
7185
7186 return construct_container (mode, orig_mode, type, 0, cum->nregs,
7187 cum->sse_nregs,
7188 &x86_64_int_parameter_registers [cum->regno],
7189 cum->sse_regno);
7190 }
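
/* Editor's illustration (not part of GCC): the hidden AL value computed
   above.  For a varargs call such as printf ("%f\n", 3.14) under the SysV
   x86-64 ABI, the double travels in %xmm0 and %al is set to 1, telling the
   callee's prologue (see setup_incoming_varargs_64 below) how many SSE
   registers may need to be spilled into the register save area.  */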
7191
7192 static rtx
7193 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7194 enum machine_mode orig_mode, bool named,
7195 HOST_WIDE_INT bytes)
7196 {
7197 unsigned int regno;
7198
7199 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
7200 We use a value of -2 to specify that the current function call is MS_ABI. */
7201 if (mode == VOIDmode)
7202 return GEN_INT (-2);
7203
7204 /* If we've run out of registers, it goes on the stack. */
7205 if (cum->nregs == 0)
7206 return NULL_RTX;
7207
7208 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
7209
7210 /* Only floating point modes are passed in anything but integer regs. */
7211 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7212 {
7213 if (named)
7214 regno = cum->regno + FIRST_SSE_REG;
7215 else
7216 {
7217 rtx t1, t2;
7218
7219 /* Unnamed floating parameters are passed in both the
7220 SSE and integer registers. */
7221 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7222 t2 = gen_rtx_REG (mode, regno);
7223 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7224 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7225 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7226 }
7227 }
7228 /* Handle aggregate types passed in registers. */
7229 if (orig_mode == BLKmode)
7230 {
7231 if (bytes > 0 && bytes <= 8)
7232 mode = (bytes > 4 ? DImode : SImode);
7233 if (mode == BLKmode)
7234 mode = DImode;
7235 }
7236
7237 return gen_reg_or_parallel (mode, orig_mode, regno);
7238 }
7239
7240 /* Return where to put the arguments to a function.
7241 Return zero to push the argument on the stack, or a hard register in which to store the argument.
7242
7243 MODE is the argument's machine mode. TYPE is the data type of the
7244 argument. It is null for libcalls where that information may not be
7245 available. CUM gives information about the preceding args and about
7246 the function being called. NAMED is nonzero if this argument is a
7247 named parameter (otherwise it is an extra parameter matching an
7248 ellipsis). */
7249
7250 static rtx
7251 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7252 const_tree type, bool named)
7253 {
7254 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7255 enum machine_mode mode = omode;
7256 HOST_WIDE_INT bytes, words;
7257 rtx arg;
7258
7259 if (mode == BLKmode)
7260 bytes = int_size_in_bytes (type);
7261 else
7262 bytes = GET_MODE_SIZE (mode);
7263 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7264
7265 /* To simplify the code below, represent vector types with a vector mode
7266 even if MMX/SSE are not active. */
7267 if (type && TREE_CODE (type) == VECTOR_TYPE)
7268 mode = type_natural_mode (type, cum);
7269
7270 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7271 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7272 else if (TARGET_64BIT)
7273 arg = function_arg_64 (cum, mode, omode, type, named);
7274 else
7275 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7276
7277 return arg;
7278 }
7279
7280 /* A C expression that indicates when an argument must be passed by
7281 reference. If nonzero for an argument, a copy of that argument is
7282 made in memory and a pointer to the argument is passed instead of
7283 the argument itself. The pointer is passed in whatever way is
7284 appropriate for passing a pointer to that type. */
7285
7286 static bool
7287 ix86_pass_by_reference (cumulative_args_t cum_v, enum machine_mode mode,
7288 const_tree type, bool named ATTRIBUTE_UNUSED)
7289 {
7290 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7291
7292 /* See Windows x64 Software Convention. */
7293 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7294 {
7295 int msize = (int) GET_MODE_SIZE (mode);
7296 if (type)
7297 {
7298 /* Arrays are passed by reference. */
7299 if (TREE_CODE (type) == ARRAY_TYPE)
7300 return true;
7301
7302 if (AGGREGATE_TYPE_P (type))
7303 {
7304 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7305 are passed by reference. */
7306 msize = int_size_in_bytes (type);
7307 }
7308 }
7309
7310 /* __m128 is passed by reference. */
7311 switch (msize) {
7312 case 1: case 2: case 4: case 8:
7313 break;
7314 default:
7315 return true;
7316 }
7317 }
7318 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7319 return 1;
7320
7321 return 0;
7322 }
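
/* Editor's illustration (not part of GCC, type names hypothetical): the
   Windows x64 size rule above in source form.  Only aggregates of exactly
   1, 2, 4 or 8 bytes are passed by value; anything else, including __m128
   and the 12-byte struct below, is passed by reference.  */
#if 0
struct ok8   { int a, b; };         /* 8 bytes  -> passed by value     */
struct big12 { int a, b, c; };      /* 12 bytes -> passed by reference */
#endif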
7323
7324 /* Return true when TYPE should be 128bit aligned for 32bit argument
7325 passing ABI. XXX: This function is obsolete and is only used for
7326 checking psABI compatibility with previous versions of GCC. */
7327
7328 static bool
7329 ix86_compat_aligned_value_p (const_tree type)
7330 {
7331 enum machine_mode mode = TYPE_MODE (type);
7332 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7333 || mode == TDmode
7334 || mode == TFmode
7335 || mode == TCmode)
7336 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7337 return true;
7338 if (TYPE_ALIGN (type) < 128)
7339 return false;
7340
7341 if (AGGREGATE_TYPE_P (type))
7342 {
7343 /* Walk the aggregates recursively. */
7344 switch (TREE_CODE (type))
7345 {
7346 case RECORD_TYPE:
7347 case UNION_TYPE:
7348 case QUAL_UNION_TYPE:
7349 {
7350 tree field;
7351
7352 /* Walk all the structure fields. */
7353 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7354 {
7355 if (TREE_CODE (field) == FIELD_DECL
7356 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7357 return true;
7358 }
7359 break;
7360 }
7361
7362 case ARRAY_TYPE:
7363 /* Just in case some language passes arrays by value. */
7364 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7365 return true;
7366 break;
7367
7368 default:
7369 gcc_unreachable ();
7370 }
7371 }
7372 return false;
7373 }
7374
7375 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7376 XXX: This function is obsolete and is only used for checking psABI
7377 compatibility with previous versions of GCC. */
7378
7379 static unsigned int
7380 ix86_compat_function_arg_boundary (enum machine_mode mode,
7381 const_tree type, unsigned int align)
7382 {
7383 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7384 natural boundaries. */
7385 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7386 {
7387 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7388 make an exception for SSE modes since these require 128bit
7389 alignment.
7390
7391 The handling here differs from field_alignment. ICC aligns MMX
7392 arguments to 4 byte boundaries, while structure fields are aligned
7393 to 8 byte boundaries. */
7394 if (!type)
7395 {
7396 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7397 align = PARM_BOUNDARY;
7398 }
7399 else
7400 {
7401 if (!ix86_compat_aligned_value_p (type))
7402 align = PARM_BOUNDARY;
7403 }
7404 }
7405 if (align > BIGGEST_ALIGNMENT)
7406 align = BIGGEST_ALIGNMENT;
7407 return align;
7408 }
7409
7410 /* Return true when TYPE should be 128bit aligned for 32bit argument
7411 passing ABI. */
7412
7413 static bool
7414 ix86_contains_aligned_value_p (const_tree type)
7415 {
7416 enum machine_mode mode = TYPE_MODE (type);
7417
7418 if (mode == XFmode || mode == XCmode)
7419 return false;
7420
7421 if (TYPE_ALIGN (type) < 128)
7422 return false;
7423
7424 if (AGGREGATE_TYPE_P (type))
7425 {
7426 /* Walk the aggregates recursively. */
7427 switch (TREE_CODE (type))
7428 {
7429 case RECORD_TYPE:
7430 case UNION_TYPE:
7431 case QUAL_UNION_TYPE:
7432 {
7433 tree field;
7434
7435 /* Walk all the structure fields. */
7436 for (field = TYPE_FIELDS (type);
7437 field;
7438 field = DECL_CHAIN (field))
7439 {
7440 if (TREE_CODE (field) == FIELD_DECL
7441 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7442 return true;
7443 }
7444 break;
7445 }
7446
7447 case ARRAY_TYPE:
7448 /* Just in case some language passes arrays by value. */
7449 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7450 return true;
7451 break;
7452
7453 default:
7454 gcc_unreachable ();
7455 }
7456 }
7457 else
7458 return TYPE_ALIGN (type) >= 128;
7459
7460 return false;
7461 }
7462
7463 /* Gives the alignment boundary, in bits, of an argument with the
7464 specified mode and type. */
7465
7466 static unsigned int
7467 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7468 {
7469 unsigned int align;
7470 if (type)
7471 {
7472 /* Since the main variant type is used for the call, convert the
7473 type to its main variant. */
7474 type = TYPE_MAIN_VARIANT (type);
7475 align = TYPE_ALIGN (type);
7476 }
7477 else
7478 align = GET_MODE_ALIGNMENT (mode);
7479 if (align < PARM_BOUNDARY)
7480 align = PARM_BOUNDARY;
7481 else
7482 {
7483 static bool warned;
7484 unsigned int saved_align = align;
7485
7486 if (!TARGET_64BIT)
7487 {
7488 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7489 if (!type)
7490 {
7491 if (mode == XFmode || mode == XCmode)
7492 align = PARM_BOUNDARY;
7493 }
7494 else if (!ix86_contains_aligned_value_p (type))
7495 align = PARM_BOUNDARY;
7496
7497 if (align < 128)
7498 align = PARM_BOUNDARY;
7499 }
7500
7501 if (warn_psabi
7502 && !warned
7503 && align != ix86_compat_function_arg_boundary (mode, type,
7504 saved_align))
7505 {
7506 warned = true;
7507 inform (input_location,
7508 "The ABI for passing parameters with %d-byte"
7509 " alignment has changed in GCC 4.6",
7510 align / BITS_PER_UNIT);
7511 }
7512 }
7513
7514 return align;
7515 }
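
/* Editor's illustration (not part of GCC, type names hypothetical): the
   32-bit alignment rule as the hook above computes it.  A struct containing
   an __m128 member keeps its 128-bit boundary as an argument, while a plain
   int struct falls back to PARM_BOUNDARY even if the user over-aligns it,
   because ix86_contains_aligned_value_p looks at the members.  */
#if 0
#include <xmmintrin.h>
struct vec_holder { __m128 v; };                  /* 128-bit argument boundary */
struct plain { int i; } __attribute__ ((aligned (16)));
						  /* still PARM_BOUNDARY */
#endif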
7516
7517 /* Return true if N is a possible register number of function value. */
7518
7519 static bool
7520 ix86_function_value_regno_p (const unsigned int regno)
7521 {
7522 switch (regno)
7523 {
7524 case AX_REG:
7525 case DX_REG:
7526 return true;
7527 case DI_REG:
7528 case SI_REG:
7529 return TARGET_64BIT && ix86_abi != MS_ABI;
7530
7531 /* Complex values are returned in %st(0)/%st(1) pair. */
7532 case ST0_REG:
7533 case ST1_REG:
7534 /* TODO: The function should depend on current function ABI but
7535 builtins.c would need updating then. Therefore we use the
7536 default ABI. */
7537 if (TARGET_64BIT && ix86_abi == MS_ABI)
7538 return false;
7539 return TARGET_FLOAT_RETURNS_IN_80387;
7540
7541 /* Complex values are returned in %xmm0/%xmm1 pair. */
7542 case XMM0_REG:
7543 case XMM1_REG:
7544 return TARGET_SSE;
7545
7546 case MM0_REG:
7547 if (TARGET_MACHO || TARGET_64BIT)
7548 return false;
7549 return TARGET_MMX;
7550 }
7551
7552 return false;
7553 }
7554
7555 /* Define how to find the value returned by a function.
7556 VALTYPE is the data type of the value (as a tree).
7557 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7558 otherwise, FUNC is 0. */
7559
7560 static rtx
7561 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7562 const_tree fntype, const_tree fn)
7563 {
7564 unsigned int regno;
7565
7566 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7567 we normally prevent this case when mmx is not available. However
7568 some ABIs may require the result to be returned like DImode. */
7569 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7570 regno = FIRST_MMX_REG;
7571
7572 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7573 we prevent this case when sse is not available. However some ABIs
7574 may require the result to be returned like integer TImode. */
7575 else if (mode == TImode
7576 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7577 regno = FIRST_SSE_REG;
7578
7579 /* 32-byte vector modes in %ymm0. */
7580 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7581 regno = FIRST_SSE_REG;
7582
7583 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7584 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7585 regno = FIRST_FLOAT_REG;
7586 else
7587 /* Most things go in %eax. */
7588 regno = AX_REG;
7589
7590 /* Override FP return register with %xmm0 for local functions when
7591 SSE math is enabled or for functions with sseregparm attribute. */
7592 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7593 {
7594 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7595 if ((sse_level >= 1 && mode == SFmode)
7596 || (sse_level == 2 && mode == DFmode))
7597 regno = FIRST_SSE_REG;
7598 }
7599
7600 /* OImode shouldn't be used directly. */
7601 gcc_assert (mode != OImode);
7602
7603 return gen_rtx_REG (orig_mode, regno);
7604 }
7605
7606 static rtx
7607 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7608 const_tree valtype)
7609 {
7610 rtx ret;
7611
7612 /* Handle libcalls, which don't provide a type node. */
7613 if (valtype == NULL)
7614 {
7615 unsigned int regno;
7616
7617 switch (mode)
7618 {
7619 case SFmode:
7620 case SCmode:
7621 case DFmode:
7622 case DCmode:
7623 case TFmode:
7624 case SDmode:
7625 case DDmode:
7626 case TDmode:
7627 regno = FIRST_SSE_REG;
7628 break;
7629 case XFmode:
7630 case XCmode:
7631 regno = FIRST_FLOAT_REG;
7632 break;
7633 case TCmode:
7634 return NULL;
7635 default:
7636 regno = AX_REG;
7637 }
7638
7639 return gen_rtx_REG (mode, regno);
7640 }
7641 else if (POINTER_TYPE_P (valtype))
7642 {
7643 /* Pointers are always returned in word_mode. */
7644 mode = word_mode;
7645 }
7646
7647 ret = construct_container (mode, orig_mode, valtype, 1,
7648 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7649 x86_64_int_return_registers, 0);
7650
7651 /* For zero sized structures, construct_container returns NULL, but we
7652 need to keep the rest of the compiler happy by returning a meaningful value. */
7653 if (!ret)
7654 ret = gen_rtx_REG (orig_mode, AX_REG);
7655
7656 return ret;
7657 }
7658
7659 static rtx
7660 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7661 const_tree valtype)
7662 {
7663 unsigned int regno = AX_REG;
7664
7665 if (TARGET_SSE)
7666 {
7667 switch (GET_MODE_SIZE (mode))
7668 {
7669 case 16:
7670 if (valtype != NULL_TREE
7671 && !VECTOR_INTEGER_TYPE_P (valtype)
7673 && !INTEGRAL_TYPE_P (valtype)
7674 && !VECTOR_FLOAT_TYPE_P (valtype))
7675 break;
7676 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7677 && !COMPLEX_MODE_P (mode))
7678 regno = FIRST_SSE_REG;
7679 break;
7680 case 8:
7681 case 4:
7682 if (mode == SFmode || mode == DFmode)
7683 regno = FIRST_SSE_REG;
7684 break;
7685 default:
7686 break;
7687 }
7688 }
7689 return gen_rtx_REG (orig_mode, regno);
7690 }
7691
7692 static rtx
7693 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7694 enum machine_mode orig_mode, enum machine_mode mode)
7695 {
7696 const_tree fn, fntype;
7697
7698 fn = NULL_TREE;
7699 if (fntype_or_decl && DECL_P (fntype_or_decl))
7700 fn = fntype_or_decl;
7701 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7702
7703 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7704 return function_value_ms_64 (orig_mode, mode, valtype);
7705 else if (TARGET_64BIT)
7706 return function_value_64 (orig_mode, mode, valtype);
7707 else
7708 return function_value_32 (orig_mode, mode, fntype, fn);
7709 }
7710
7711 static rtx
7712 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7713 bool outgoing ATTRIBUTE_UNUSED)
7714 {
7715 enum machine_mode mode, orig_mode;
7716
7717 orig_mode = TYPE_MODE (valtype);
7718 mode = type_natural_mode (valtype, NULL);
7719 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7720 }
7721
7722 /* Pointer function arguments and return values are promoted to
7723 word_mode. */
7724
7725 static enum machine_mode
7726 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7727 int *punsignedp, const_tree fntype,
7728 int for_return)
7729 {
7730 if (type != NULL_TREE && POINTER_TYPE_P (type))
7731 {
7732 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7733 return word_mode;
7734 }
7735 return default_promote_function_mode (type, mode, punsignedp, fntype,
7736 for_return);
7737 }
7738
7739 /* Return true if a structure, union or array with MODE containing FIELD
7740 should be accessed using BLKmode. */
7741
7742 static bool
7743 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7744 {
7745 /* Union with XFmode must be in BLKmode. */
7746 return (mode == XFmode
7747 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7748 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7749 }
7750
7751 rtx
7752 ix86_libcall_value (enum machine_mode mode)
7753 {
7754 return ix86_function_value_1 (NULL, NULL, mode, mode);
7755 }
7756
7757 /* Return true iff type is returned in memory. */
7758
7759 static bool ATTRIBUTE_UNUSED
7760 return_in_memory_32 (const_tree type, enum machine_mode mode)
7761 {
7762 HOST_WIDE_INT size;
7763
7764 if (mode == BLKmode)
7765 return true;
7766
7767 size = int_size_in_bytes (type);
7768
7769 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7770 return false;
7771
7772 if (VECTOR_MODE_P (mode) || mode == TImode)
7773 {
7774 /* User-created vectors small enough to fit in EAX. */
7775 if (size < 8)
7776 return false;
7777
7778 /* MMX/3dNow values are returned in MM0,
7779 except when it doesn't exist or the ABI prescribes otherwise. */
7780 if (size == 8)
7781 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7782
7783 /* SSE values are returned in XMM0, except when it doesn't exist. */
7784 if (size == 16)
7785 return !TARGET_SSE;
7786
7787 /* AVX values are returned in YMM0, except when it doesn't exist. */
7788 if (size == 32)
7789 return !TARGET_AVX;
7790 }
7791
7792 if (mode == XFmode)
7793 return false;
7794
7795 if (size > 12)
7796 return true;
7797
7798 /* OImode shouldn't be used directly. */
7799 gcc_assert (mode != OImode);
7800
7801 return false;
7802 }
7803
7804 static bool ATTRIBUTE_UNUSED
7805 return_in_memory_64 (const_tree type, enum machine_mode mode)
7806 {
7807 int needed_intregs, needed_sseregs;
7808 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7809 }
7810
7811 static bool ATTRIBUTE_UNUSED
7812 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7813 {
7814 HOST_WIDE_INT size = int_size_in_bytes (type);
7815
7816 /* __m128 is returned in xmm0. */
7817 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7818 || VECTOR_FLOAT_TYPE_P (type))
7819 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7820 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7821 return false;
7822
7823 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7824 return size != 1 && size != 2 && size != 4 && size != 8;
7825 }
7826
7827 static bool
7828 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7829 {
7830 #ifdef SUBTARGET_RETURN_IN_MEMORY
7831 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7832 #else
7833 const enum machine_mode mode = type_natural_mode (type, NULL);
7834
7835 if (TARGET_64BIT)
7836 {
7837 if (ix86_function_type_abi (fntype) == MS_ABI)
7838 return return_in_memory_ms_64 (type, mode);
7839 else
7840 return return_in_memory_64 (type, mode);
7841 }
7842 else
7843 return return_in_memory_32 (type, mode);
7844 #endif
7845 }
7846
7847 /* When returning SSE vector types, we have a choice of either
7848 (1) being abi incompatible with a -march switch, or
7849 (2) generating an error.
7850 Given no good solution, I think the safest thing is one warning.
7851 The user won't be able to use -Werror, but....
7852
7853 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7854 called in response to actually generating a caller or callee that
7855 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7856 via aggregate_value_p for general type probing from tree-ssa. */
7857
7858 static rtx
7859 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7860 {
7861 static bool warnedsse, warnedmmx;
7862
7863 if (!TARGET_64BIT && type)
7864 {
7865 /* Look at the return type of the function, not the function type. */
7866 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7867
7868 if (!TARGET_SSE && !warnedsse)
7869 {
7870 if (mode == TImode
7871 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7872 {
7873 warnedsse = true;
7874 warning (0, "SSE vector return without SSE enabled "
7875 "changes the ABI");
7876 }
7877 }
7878
7879 if (!TARGET_MMX && !warnedmmx)
7880 {
7881 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7882 {
7883 warnedmmx = true;
7884 warning (0, "MMX vector return without MMX enabled "
7885 "changes the ABI");
7886 }
7887 }
7888 }
7889
7890 return NULL;
7891 }
7892
7893 \f
7894 /* Create the va_list data type. */
7895
7896 /* Returns the calling-convention-specific va_list data type.
7897 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7898
7899 static tree
7900 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7901 {
7902 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7903
7904 /* For i386 we use a plain pointer to the argument area. */
7905 if (!TARGET_64BIT || abi == MS_ABI)
7906 return build_pointer_type (char_type_node);
7907
7908 record = lang_hooks.types.make_type (RECORD_TYPE);
7909 type_decl = build_decl (BUILTINS_LOCATION,
7910 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7911
7912 f_gpr = build_decl (BUILTINS_LOCATION,
7913 FIELD_DECL, get_identifier ("gp_offset"),
7914 unsigned_type_node);
7915 f_fpr = build_decl (BUILTINS_LOCATION,
7916 FIELD_DECL, get_identifier ("fp_offset"),
7917 unsigned_type_node);
7918 f_ovf = build_decl (BUILTINS_LOCATION,
7919 FIELD_DECL, get_identifier ("overflow_arg_area"),
7920 ptr_type_node);
7921 f_sav = build_decl (BUILTINS_LOCATION,
7922 FIELD_DECL, get_identifier ("reg_save_area"),
7923 ptr_type_node);
7924
7925 va_list_gpr_counter_field = f_gpr;
7926 va_list_fpr_counter_field = f_fpr;
7927
7928 DECL_FIELD_CONTEXT (f_gpr) = record;
7929 DECL_FIELD_CONTEXT (f_fpr) = record;
7930 DECL_FIELD_CONTEXT (f_ovf) = record;
7931 DECL_FIELD_CONTEXT (f_sav) = record;
7932
7933 TYPE_STUB_DECL (record) = type_decl;
7934 TYPE_NAME (record) = type_decl;
7935 TYPE_FIELDS (record) = f_gpr;
7936 DECL_CHAIN (f_gpr) = f_fpr;
7937 DECL_CHAIN (f_fpr) = f_ovf;
7938 DECL_CHAIN (f_ovf) = f_sav;
7939
7940 layout_type (record);
7941
7942 /* The correct type is an array type of one element. */
7943 return build_array_type (record, build_index_type (size_zero_node));
7944 }
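
/* Editor's sketch (not part of GCC): the C-level shape of the record built
   above for the SysV x86-64 ABI.  The struct tag and field names match the
   decls created in this function; the typedef name is only illustrative.  */
#if 0
typedef struct __va_list_tag
{
  unsigned int gp_offset;        /* offset of the next GPR in reg_save_area */
  unsigned int fp_offset;        /* offset of the next XMM in reg_save_area */
  void *overflow_arg_area;       /* next stack-passed argument */
  void *reg_save_area;           /* start of the register save area */
} __gnuc_va_list_sketch[1];      /* array of one element, as returned above */
#endif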
7945
7946 /* Set up the builtin va_list data type and, for 64-bit, the additional
7947 calling-convention-specific va_list data types. */
7948
7949 static tree
7950 ix86_build_builtin_va_list (void)
7951 {
7952 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7953
7954 /* Initialize abi specific va_list builtin types. */
7955 if (TARGET_64BIT)
7956 {
7957 tree t;
7958 if (ix86_abi == MS_ABI)
7959 {
7960 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7961 if (TREE_CODE (t) != RECORD_TYPE)
7962 t = build_variant_type_copy (t);
7963 sysv_va_list_type_node = t;
7964 }
7965 else
7966 {
7967 t = ret;
7968 if (TREE_CODE (t) != RECORD_TYPE)
7969 t = build_variant_type_copy (t);
7970 sysv_va_list_type_node = t;
7971 }
7972 if (ix86_abi != MS_ABI)
7973 {
7974 t = ix86_build_builtin_va_list_abi (MS_ABI);
7975 if (TREE_CODE (t) != RECORD_TYPE)
7976 t = build_variant_type_copy (t);
7977 ms_va_list_type_node = t;
7978 }
7979 else
7980 {
7981 t = ret;
7982 if (TREE_CODE (t) != RECORD_TYPE)
7983 t = build_variant_type_copy (t);
7984 ms_va_list_type_node = t;
7985 }
7986 }
7987
7988 return ret;
7989 }
7990
7991 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7992
7993 static void
7994 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7995 {
7996 rtx save_area, mem;
7997 alias_set_type set;
7998 int i, max;
7999
8000 /* GPR size of varargs save area. */
8001 if (cfun->va_list_gpr_size)
8002 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
8003 else
8004 ix86_varargs_gpr_size = 0;
8005
8006 /* FPR size of varargs save area. We don't need it if we don't pass
8007 anything in SSE registers. */
8008 if (TARGET_SSE && cfun->va_list_fpr_size)
8009 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
8010 else
8011 ix86_varargs_fpr_size = 0;
8012
8013 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
8014 return;
8015
8016 save_area = frame_pointer_rtx;
8017 set = get_varargs_alias_set ();
8018
8019 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
8020 if (max > X86_64_REGPARM_MAX)
8021 max = X86_64_REGPARM_MAX;
8022
8023 for (i = cum->regno; i < max; i++)
8024 {
8025 mem = gen_rtx_MEM (word_mode,
8026 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
8027 MEM_NOTRAP_P (mem) = 1;
8028 set_mem_alias_set (mem, set);
8029 emit_move_insn (mem,
8030 gen_rtx_REG (word_mode,
8031 x86_64_int_parameter_registers[i]));
8032 }
8033
8034 if (ix86_varargs_fpr_size)
8035 {
8036 enum machine_mode smode;
8037 rtx label, test;
8038
8039 /* Now emit code to save the SSE registers. The AX parameter contains the
8040 number of SSE parameter registers used to call this function, though all
8041 we actually check here is the zero/non-zero status. */
8042
8043 label = gen_label_rtx ();
8044 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
8045 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
8046 label));
8047
8048 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
8049 we used movdqa (i.e. TImode) instead? Perhaps even better would
8050 be if we could determine the real mode of the data, via a hook
8051 into pass_stdarg. Ignore all that for now. */
8052 smode = V4SFmode;
8053 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
8054 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
8055
8056 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
8057 if (max > X86_64_SSE_REGPARM_MAX)
8058 max = X86_64_SSE_REGPARM_MAX;
8059
8060 for (i = cum->sse_regno; i < max; ++i)
8061 {
8062 mem = plus_constant (Pmode, save_area,
8063 i * 16 + ix86_varargs_gpr_size);
8064 mem = gen_rtx_MEM (smode, mem);
8065 MEM_NOTRAP_P (mem) = 1;
8066 set_mem_alias_set (mem, set);
8067 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
8068
8069 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
8070 }
8071
8072 emit_label (label);
8073 }
8074 }
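
/* Editor's illustration (not part of GCC): the layout produced above for the
   SysV register save area, relative to the frame pointer used as the save
   area base.  With X86_64_REGPARM_MAX == 6 and X86_64_SSE_REGPARM_MAX == 8
   it looks roughly like

     bytes   0.. 47   %rdi, %rsi, %rdx, %rcx, %r8, %r9   (8 bytes each)
     bytes  48..175   %xmm0 .. %xmm7                     (16 bytes each)

   Only the registers not already consumed by named parameters are actually
   stored, each at its fixed slot, and the SSE part is emitted only when %al
   was nonzero at entry.  */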
8075
8076 static void
8077 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
8078 {
8079 alias_set_type set = get_varargs_alias_set ();
8080 int i;
8081
8082 /* Reset to zero, as there might have been a SYSV va_arg used
8083 before. */
8084 ix86_varargs_gpr_size = 0;
8085 ix86_varargs_fpr_size = 0;
8086
8087 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
8088 {
8089 rtx reg, mem;
8090
8091 mem = gen_rtx_MEM (Pmode,
8092 plus_constant (Pmode, virtual_incoming_args_rtx,
8093 i * UNITS_PER_WORD));
8094 MEM_NOTRAP_P (mem) = 1;
8095 set_mem_alias_set (mem, set);
8096
8097 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
8098 emit_move_insn (mem, reg);
8099 }
8100 }
8101
8102 static void
8103 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
8104 tree type, int *pretend_size ATTRIBUTE_UNUSED,
8105 int no_rtl)
8106 {
8107 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8108 CUMULATIVE_ARGS next_cum;
8109 tree fntype;
8110
8111 /* This argument doesn't appear to be used anymore. Which is good,
8112 because the old code here didn't suppress rtl generation. */
8113 gcc_assert (!no_rtl);
8114
8115 if (!TARGET_64BIT)
8116 return;
8117
8118 fntype = TREE_TYPE (current_function_decl);
8119
8120 /* For varargs, we do not want to skip the dummy va_dcl argument.
8121 For stdargs, we do want to skip the last named argument. */
8122 next_cum = *cum;
8123 if (stdarg_p (fntype))
8124 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
8125 true);
8126
8127 if (cum->call_abi == MS_ABI)
8128 setup_incoming_varargs_ms_64 (&next_cum);
8129 else
8130 setup_incoming_varargs_64 (&next_cum);
8131 }
8132
8133 /* Check whether TYPE is a va_list of the plain char * kind. */
8134
8135 static bool
8136 is_va_list_char_pointer (tree type)
8137 {
8138 tree canonic;
8139
8140 /* For 32-bit it is always true. */
8141 if (!TARGET_64BIT)
8142 return true;
8143 canonic = ix86_canonical_va_list_type (type);
8144 return (canonic == ms_va_list_type_node
8145 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
8146 }
8147
8148 /* Implement va_start. */
8149
8150 static void
8151 ix86_va_start (tree valist, rtx nextarg)
8152 {
8153 HOST_WIDE_INT words, n_gpr, n_fpr;
8154 tree f_gpr, f_fpr, f_ovf, f_sav;
8155 tree gpr, fpr, ovf, sav, t;
8156 tree type;
8157 rtx ovf_rtx;
8158
8159 if (flag_split_stack
8160 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8161 {
8162 unsigned int scratch_regno;
8163
8164 /* When we are splitting the stack, we can't refer to the stack
8165 arguments using internal_arg_pointer, because they may be on
8166 the old stack. The split stack prologue will arrange to
8167 leave a pointer to the old stack arguments in a scratch
8168 register, which we here copy to a pseudo-register. The split
8169 stack prologue can't set the pseudo-register directly because
8170 it (the prologue) runs before any registers have been saved. */
8171
8172 scratch_regno = split_stack_prologue_scratch_regno ();
8173 if (scratch_regno != INVALID_REGNUM)
8174 {
8175 rtx reg, seq;
8176
8177 reg = gen_reg_rtx (Pmode);
8178 cfun->machine->split_stack_varargs_pointer = reg;
8179
8180 start_sequence ();
8181 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
8182 seq = get_insns ();
8183 end_sequence ();
8184
8185 push_topmost_sequence ();
8186 emit_insn_after (seq, entry_of_function ());
8187 pop_topmost_sequence ();
8188 }
8189 }
8190
8191 /* Only 64bit target needs something special. */
8192 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8193 {
8194 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8195 std_expand_builtin_va_start (valist, nextarg);
8196 else
8197 {
8198 rtx va_r, next;
8199
8200 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
8201 next = expand_binop (ptr_mode, add_optab,
8202 cfun->machine->split_stack_varargs_pointer,
8203 crtl->args.arg_offset_rtx,
8204 NULL_RTX, 0, OPTAB_LIB_WIDEN);
8205 convert_move (va_r, next, 0);
8206 }
8207 return;
8208 }
8209
8210 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8211 f_fpr = DECL_CHAIN (f_gpr);
8212 f_ovf = DECL_CHAIN (f_fpr);
8213 f_sav = DECL_CHAIN (f_ovf);
8214
8215 valist = build_simple_mem_ref (valist);
8216 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8217 /* The following should be folded into the MEM_REF offset. */
8218 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8219 f_gpr, NULL_TREE);
8220 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8221 f_fpr, NULL_TREE);
8222 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8223 f_ovf, NULL_TREE);
8224 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8225 f_sav, NULL_TREE);
8226
8227 /* Count number of gp and fp argument registers used. */
8228 words = crtl->args.info.words;
8229 n_gpr = crtl->args.info.regno;
8230 n_fpr = crtl->args.info.sse_regno;
8231
8232 if (cfun->va_list_gpr_size)
8233 {
8234 type = TREE_TYPE (gpr);
8235 t = build2 (MODIFY_EXPR, type,
8236 gpr, build_int_cst (type, n_gpr * 8));
8237 TREE_SIDE_EFFECTS (t) = 1;
8238 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8239 }
8240
8241 if (TARGET_SSE && cfun->va_list_fpr_size)
8242 {
8243 type = TREE_TYPE (fpr);
8244 t = build2 (MODIFY_EXPR, type, fpr,
8245 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8246 TREE_SIDE_EFFECTS (t) = 1;
8247 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8248 }
8249
8250 /* Find the overflow area. */
8251 type = TREE_TYPE (ovf);
8252 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8253 ovf_rtx = crtl->args.internal_arg_pointer;
8254 else
8255 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8256 t = make_tree (type, ovf_rtx);
8257 if (words != 0)
8258 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8259 t = build2 (MODIFY_EXPR, type, ovf, t);
8260 TREE_SIDE_EFFECTS (t) = 1;
8261 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8262
8263 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8264 {
8265 /* Find the register save area.
8266 The function prologue saves it right above the stack frame. */
8267 type = TREE_TYPE (sav);
8268 t = make_tree (type, frame_pointer_rtx);
8269 if (!ix86_varargs_gpr_size)
8270 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8271 t = build2 (MODIFY_EXPR, type, sav, t);
8272 TREE_SIDE_EFFECTS (t) = 1;
8273 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8274 }
8275 }
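
/* Editor's sketch (not part of GCC): the net effect of the expansion above,
   written as ordinary C.  AP, WORDS, N_GPR and N_FPR are placeholders for
   the va_list object and the stack words and GP/SSE registers already
   consumed by the named parameters (crtl->args.info above).  */
#if 0
  ap->gp_offset = N_GPR * 8;
  ap->fp_offset = X86_64_REGPARM_MAX * 8 + N_FPR * 16;
  ap->overflow_arg_area = incoming_arg_pointer + WORDS * UNITS_PER_WORD;
  ap->reg_save_area = register_save_area;   /* frame pointer based */
#endif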
8276
8277 /* Implement va_arg. */
8278
8279 static tree
8280 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8281 gimple_seq *post_p)
8282 {
8283 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8284 tree f_gpr, f_fpr, f_ovf, f_sav;
8285 tree gpr, fpr, ovf, sav, t;
8286 int size, rsize;
8287 tree lab_false, lab_over = NULL_TREE;
8288 tree addr, t2;
8289 rtx container;
8290 int indirect_p = 0;
8291 tree ptrtype;
8292 enum machine_mode nat_mode;
8293 unsigned int arg_boundary;
8294
8295 /* Only 64bit target needs something special. */
8296 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8297 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8298
8299 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8300 f_fpr = DECL_CHAIN (f_gpr);
8301 f_ovf = DECL_CHAIN (f_fpr);
8302 f_sav = DECL_CHAIN (f_ovf);
8303
8304 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8305 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8306 valist = build_va_arg_indirect_ref (valist);
8307 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8308 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8309 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8310
8311 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8312 if (indirect_p)
8313 type = build_pointer_type (type);
8314 size = int_size_in_bytes (type);
8315 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8316
8317 nat_mode = type_natural_mode (type, NULL);
8318 switch (nat_mode)
8319 {
8320 case V8SFmode:
8321 case V8SImode:
8322 case V32QImode:
8323 case V16HImode:
8324 case V4DFmode:
8325 case V4DImode:
8326 /* Unnamed 256bit vector mode parameters are passed on stack. */
8327 if (!TARGET_64BIT_MS_ABI)
8328 {
8329 container = NULL;
8330 break;
8331 }
8332
8333 default:
8334 container = construct_container (nat_mode, TYPE_MODE (type),
8335 type, 0, X86_64_REGPARM_MAX,
8336 X86_64_SSE_REGPARM_MAX, intreg,
8337 0);
8338 break;
8339 }
8340
8341 /* Pull the value out of the saved registers. */
8342
8343 addr = create_tmp_var (ptr_type_node, "addr");
8344
8345 if (container)
8346 {
8347 int needed_intregs, needed_sseregs;
8348 bool need_temp;
8349 tree int_addr, sse_addr;
8350
8351 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8352 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8353
8354 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8355
8356 need_temp = (!REG_P (container)
8357 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8358 || TYPE_ALIGN (type) > 128));
8359
 8360 /* In case we are passing a structure, verify that it is a consecutive
 8361 block in the register save area. If not, we need to do moves. */
8362 if (!need_temp && !REG_P (container))
8363 {
 8364 /* Verify that all registers are strictly consecutive. */
8365 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8366 {
8367 int i;
8368
8369 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8370 {
8371 rtx slot = XVECEXP (container, 0, i);
8372 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8373 || INTVAL (XEXP (slot, 1)) != i * 16)
8374 need_temp = 1;
8375 }
8376 }
8377 else
8378 {
8379 int i;
8380
8381 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8382 {
8383 rtx slot = XVECEXP (container, 0, i);
8384 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8385 || INTVAL (XEXP (slot, 1)) != i * 8)
8386 need_temp = 1;
8387 }
8388 }
8389 }
8390 if (!need_temp)
8391 {
8392 int_addr = addr;
8393 sse_addr = addr;
8394 }
8395 else
8396 {
8397 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8398 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8399 }
8400
8401 /* First ensure that we fit completely in registers. */
8402 if (needed_intregs)
8403 {
8404 t = build_int_cst (TREE_TYPE (gpr),
8405 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8406 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8407 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8408 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8409 gimplify_and_add (t, pre_p);
8410 }
8411 if (needed_sseregs)
8412 {
8413 t = build_int_cst (TREE_TYPE (fpr),
8414 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8415 + X86_64_REGPARM_MAX * 8);
8416 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8417 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8418 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8419 gimplify_and_add (t, pre_p);
8420 }
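 /* Illustrative note (not part of the original source): gpr and fpr hold
 the number of save-area bytes already consumed by named arguments
 (8 per integer register, 16 per SSE register). For an argument that
 needs two integer registers, the GE_EXPR above branches to lab_false
 once gpr >= (6 - 2 + 1) * 8 == 40, i.e. when fewer than two integer
 registers remain, assuming X86_64_REGPARM_MAX == 6. */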
8421
8422 /* Compute index to start of area used for integer regs. */
8423 if (needed_intregs)
8424 {
8425 /* int_addr = gpr + sav; */
8426 t = fold_build_pointer_plus (sav, gpr);
8427 gimplify_assign (int_addr, t, pre_p);
8428 }
8429 if (needed_sseregs)
8430 {
8431 /* sse_addr = fpr + sav; */
8432 t = fold_build_pointer_plus (sav, fpr);
8433 gimplify_assign (sse_addr, t, pre_p);
8434 }
8435 if (need_temp)
8436 {
8437 int i, prev_size = 0;
8438 tree temp = create_tmp_var (type, "va_arg_tmp");
8439
8440 /* addr = &temp; */
8441 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8442 gimplify_assign (addr, t, pre_p);
8443
8444 for (i = 0; i < XVECLEN (container, 0); i++)
8445 {
8446 rtx slot = XVECEXP (container, 0, i);
8447 rtx reg = XEXP (slot, 0);
8448 enum machine_mode mode = GET_MODE (reg);
8449 tree piece_type;
8450 tree addr_type;
8451 tree daddr_type;
8452 tree src_addr, src;
8453 int src_offset;
8454 tree dest_addr, dest;
8455 int cur_size = GET_MODE_SIZE (mode);
8456
8457 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8458 prev_size = INTVAL (XEXP (slot, 1));
8459 if (prev_size + cur_size > size)
8460 {
8461 cur_size = size - prev_size;
8462 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8463 if (mode == BLKmode)
8464 mode = QImode;
8465 }
8466 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8467 if (mode == GET_MODE (reg))
8468 addr_type = build_pointer_type (piece_type);
8469 else
8470 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8471 true);
8472 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8473 true);
8474
8475 if (SSE_REGNO_P (REGNO (reg)))
8476 {
8477 src_addr = sse_addr;
8478 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8479 }
8480 else
8481 {
8482 src_addr = int_addr;
8483 src_offset = REGNO (reg) * 8;
8484 }
8485 src_addr = fold_convert (addr_type, src_addr);
8486 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8487
8488 dest_addr = fold_convert (daddr_type, addr);
8489 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8490 if (cur_size == GET_MODE_SIZE (mode))
8491 {
8492 src = build_va_arg_indirect_ref (src_addr);
8493 dest = build_va_arg_indirect_ref (dest_addr);
8494
8495 gimplify_assign (dest, src, pre_p);
8496 }
8497 else
8498 {
8499 tree copy
8500 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8501 3, dest_addr, src_addr,
8502 size_int (cur_size));
8503 gimplify_and_add (copy, pre_p);
8504 }
8505 prev_size += cur_size;
8506 }
8507 }
8508
8509 if (needed_intregs)
8510 {
8511 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8512 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8513 gimplify_assign (gpr, t, pre_p);
8514 }
8515
8516 if (needed_sseregs)
8517 {
8518 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8519 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8520 gimplify_assign (fpr, t, pre_p);
8521 }
8522
8523 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8524
8525 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8526 }
8527
8528 /* ... otherwise out of the overflow area. */
8529
 8530 /* When the caller aligns a parameter on the stack, an alignment
 8531 beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
 8532 MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee with the
 8533 caller here. */
8534 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8535 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8536 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8537
8538 /* Care for on-stack alignment if needed. */
8539 if (arg_boundary <= 64 || size == 0)
8540 t = ovf;
8541 else
8542 {
8543 HOST_WIDE_INT align = arg_boundary / 8;
8544 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8545 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8546 build_int_cst (TREE_TYPE (t), -align));
8547 }
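 /* Illustrative note (not part of the original source): the idiom above
 rounds the overflow pointer up to the required alignment. For a
 32-byte aligned argument, arg_boundary is 256 and align is 32, so
 t = (ovf + 31) & -32; an overflow pointer of 0x...48 becomes 0x...60. */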
8548
8549 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8550 gimplify_assign (addr, t, pre_p);
8551
8552 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8553 gimplify_assign (unshare_expr (ovf), t, pre_p);
8554
8555 if (container)
8556 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8557
8558 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8559 addr = fold_convert (ptrtype, addr);
8560
8561 if (indirect_p)
8562 addr = build_va_arg_indirect_ref (addr);
8563 return build_va_arg_indirect_ref (addr);
8564 }
8565 \f
8566 /* Return true if OPNUM's MEM should be matched
8567 in movabs* patterns. */
8568
8569 bool
8570 ix86_check_movabs (rtx insn, int opnum)
8571 {
8572 rtx set, mem;
8573
8574 set = PATTERN (insn);
8575 if (GET_CODE (set) == PARALLEL)
8576 set = XVECEXP (set, 0, 0);
8577 gcc_assert (GET_CODE (set) == SET);
8578 mem = XEXP (set, opnum);
8579 while (GET_CODE (mem) == SUBREG)
8580 mem = SUBREG_REG (mem);
8581 gcc_assert (MEM_P (mem));
8582 return volatile_ok || !MEM_VOLATILE_P (mem);
8583 }
8584 \f
8585 /* Initialize the table of extra 80387 mathematical constants. */
8586
8587 static void
8588 init_ext_80387_constants (void)
8589 {
8590 static const char * cst[5] =
8591 {
8592 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8593 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8594 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8595 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8596 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8597 };
8598 int i;
8599
8600 for (i = 0; i < 5; i++)
8601 {
8602 real_from_string (&ext_80387_constants_table[i], cst[i]);
8603 /* Ensure each constant is rounded to XFmode precision. */
8604 real_convert (&ext_80387_constants_table[i],
8605 XFmode, &ext_80387_constants_table[i]);
8606 }
8607
8608 ext_80387_constants_init = 1;
8609 }
8610
8611 /* Return non-zero if the constant is something that
8612 can be loaded with a special instruction. */
8613
8614 int
8615 standard_80387_constant_p (rtx x)
8616 {
8617 enum machine_mode mode = GET_MODE (x);
8618
8619 REAL_VALUE_TYPE r;
8620
8621 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8622 return -1;
8623
8624 if (x == CONST0_RTX (mode))
8625 return 1;
8626 if (x == CONST1_RTX (mode))
8627 return 2;
8628
8629 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8630
8631 /* For XFmode constants, try to find a special 80387 instruction when
8632 optimizing for size or on those CPUs that benefit from them. */
8633 if (mode == XFmode
8634 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8635 {
8636 int i;
8637
8638 if (! ext_80387_constants_init)
8639 init_ext_80387_constants ();
8640
8641 for (i = 0; i < 5; i++)
8642 if (real_identical (&r, &ext_80387_constants_table[i]))
8643 return i + 3;
8644 }
8645
 8646 /* A load of the constant -0.0 or -1.0 will be split into an
 8647 fldz;fchs or fld1;fchs sequence. */
8648 if (real_isnegzero (&r))
8649 return 8;
8650 if (real_identical (&r, &dconstm1))
8651 return 9;
8652
8653 return 0;
8654 }
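 /* Illustrative note (not part of the original source): the return value
 selects the load sequence used by standard_80387_constant_opcode below.
 For example, CONST1_RTX (XFmode) returns 2, which maps to "fld1", while
 a constant such as 2.0 returns 0 and has to be loaded from memory. */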
8655
8656 /* Return the opcode of the special instruction to be used to load
8657 the constant X. */
8658
8659 const char *
8660 standard_80387_constant_opcode (rtx x)
8661 {
8662 switch (standard_80387_constant_p (x))
8663 {
8664 case 1:
8665 return "fldz";
8666 case 2:
8667 return "fld1";
8668 case 3:
8669 return "fldlg2";
8670 case 4:
8671 return "fldln2";
8672 case 5:
8673 return "fldl2e";
8674 case 6:
8675 return "fldl2t";
8676 case 7:
8677 return "fldpi";
8678 case 8:
8679 case 9:
8680 return "#";
8681 default:
8682 gcc_unreachable ();
8683 }
8684 }
8685
8686 /* Return the CONST_DOUBLE representing the 80387 constant that is
8687 loaded by the specified special instruction. The argument IDX
8688 matches the return value from standard_80387_constant_p. */
8689
8690 rtx
8691 standard_80387_constant_rtx (int idx)
8692 {
8693 int i;
8694
8695 if (! ext_80387_constants_init)
8696 init_ext_80387_constants ();
8697
8698 switch (idx)
8699 {
8700 case 3:
8701 case 4:
8702 case 5:
8703 case 6:
8704 case 7:
8705 i = idx - 3;
8706 break;
8707
8708 default:
8709 gcc_unreachable ();
8710 }
8711
8712 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8713 XFmode);
8714 }
8715
 8716 /* Return 1 if X is all 0s and 2 if X is all 1s
 8717 in a supported SSE/AVX vector mode. */
8718
8719 int
8720 standard_sse_constant_p (rtx x)
8721 {
8722 enum machine_mode mode = GET_MODE (x);
8723
8724 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8725 return 1;
8726 if (vector_all_ones_operand (x, mode))
8727 switch (mode)
8728 {
8729 case V16QImode:
8730 case V8HImode:
8731 case V4SImode:
8732 case V2DImode:
8733 if (TARGET_SSE2)
8734 return 2;
8735 case V32QImode:
8736 case V16HImode:
8737 case V8SImode:
8738 case V4DImode:
8739 if (TARGET_AVX2)
8740 return 2;
8741 default:
8742 break;
8743 }
8744
8745 return 0;
8746 }
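 /* Illustrative note (not part of the original source): an all-ones
 V4SImode constant returns 2 when TARGET_SSE2 is set, and
 standard_sse_constant_opcode below then emits pcmpeqd (or vpcmpeqd
 under AVX) to materialize it without a constant-pool load. */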
8747
8748 /* Return the opcode of the special instruction to be used to load
8749 the constant X. */
8750
8751 const char *
8752 standard_sse_constant_opcode (rtx insn, rtx x)
8753 {
8754 switch (standard_sse_constant_p (x))
8755 {
8756 case 1:
8757 switch (get_attr_mode (insn))
8758 {
8759 case MODE_TI:
8760 return "%vpxor\t%0, %d0";
8761 case MODE_V2DF:
8762 return "%vxorpd\t%0, %d0";
8763 case MODE_V4SF:
8764 return "%vxorps\t%0, %d0";
8765
8766 case MODE_OI:
8767 return "vpxor\t%x0, %x0, %x0";
8768 case MODE_V4DF:
8769 return "vxorpd\t%x0, %x0, %x0";
8770 case MODE_V8SF:
8771 return "vxorps\t%x0, %x0, %x0";
8772
8773 default:
8774 break;
8775 }
8776
8777 case 2:
8778 if (get_attr_mode (insn) == MODE_XI
8779 || get_attr_mode (insn) == MODE_V8DF
8780 || get_attr_mode (insn) == MODE_V16SF)
8781 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
8782 if (TARGET_AVX)
8783 return "vpcmpeqd\t%0, %0, %0";
8784 else
8785 return "pcmpeqd\t%0, %0";
8786
8787 default:
8788 break;
8789 }
8790 gcc_unreachable ();
8791 }
8792
 8793 /* Return true if OP contains a symbol reference. */
8794
8795 bool
8796 symbolic_reference_mentioned_p (rtx op)
8797 {
8798 const char *fmt;
8799 int i;
8800
8801 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8802 return true;
8803
8804 fmt = GET_RTX_FORMAT (GET_CODE (op));
8805 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8806 {
8807 if (fmt[i] == 'E')
8808 {
8809 int j;
8810
8811 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8812 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8813 return true;
8814 }
8815
8816 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8817 return true;
8818 }
8819
8820 return false;
8821 }
8822
8823 /* Return true if it is appropriate to emit `ret' instructions in the
8824 body of a function. Do this only if the epilogue is simple, needing a
8825 couple of insns. Prior to reloading, we can't tell how many registers
8826 must be saved, so return false then. Return false if there is no frame
8827 marker to de-allocate. */
8828
8829 bool
8830 ix86_can_use_return_insn_p (void)
8831 {
8832 struct ix86_frame frame;
8833
8834 if (! reload_completed || frame_pointer_needed)
8835 return 0;
8836
8837 /* Don't allow more than 32k pop, since that's all we can do
8838 with one instruction. */
8839 if (crtl->args.pops_args && crtl->args.size >= 32768)
8840 return 0;
8841
8842 ix86_compute_frame_layout (&frame);
8843 return (frame.stack_pointer_offset == UNITS_PER_WORD
8844 && (frame.nregs + frame.nsseregs) == 0);
8845 }
8846 \f
8847 /* Value should be nonzero if functions must have frame pointers.
8848 Zero means the frame pointer need not be set up (and parms may
8849 be accessed via the stack pointer) in functions that seem suitable. */
8850
8851 static bool
8852 ix86_frame_pointer_required (void)
8853 {
8854 /* If we accessed previous frames, then the generated code expects
8855 to be able to access the saved ebp value in our frame. */
8856 if (cfun->machine->accesses_prev_frame)
8857 return true;
8858
 8859 /* Several x86 OSes need a frame pointer for other reasons,
 8860 usually pertaining to setjmp. */
8861 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8862 return true;
8863
 8864 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
8865 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8866 return true;
8867
 8868 /* Under Win64 SEH, very large frames need a frame pointer since the
 8869 maximum stack allocation is 4GB. */
8870 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8871 return true;
8872
8873 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8874 turns off the frame pointer by default. Turn it back on now if
8875 we've not got a leaf function. */
8876 if (TARGET_OMIT_LEAF_FRAME_POINTER
8877 && (!crtl->is_leaf
8878 || ix86_current_function_calls_tls_descriptor))
8879 return true;
8880
8881 if (crtl->profile && !flag_fentry)
8882 return true;
8883
8884 return false;
8885 }
8886
8887 /* Record that the current function accesses previous call frames. */
8888
8889 void
8890 ix86_setup_frame_addresses (void)
8891 {
8892 cfun->machine->accesses_prev_frame = 1;
8893 }
8894 \f
8895 #ifndef USE_HIDDEN_LINKONCE
8896 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8897 # define USE_HIDDEN_LINKONCE 1
8898 # else
8899 # define USE_HIDDEN_LINKONCE 0
8900 # endif
8901 #endif
8902
8903 static int pic_labels_used;
8904
8905 /* Fills in the label name that should be used for a pc thunk for
8906 the given register. */
8907
8908 static void
8909 get_pc_thunk_name (char name[32], unsigned int regno)
8910 {
8911 gcc_assert (!TARGET_64BIT);
8912
8913 if (USE_HIDDEN_LINKONCE)
8914 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8915 else
8916 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8917 }
8918
8919
8920 /* This function generates code for -fpic that loads %ebx with
8921 the return address of the caller and then returns. */
8922
8923 static void
8924 ix86_code_end (void)
8925 {
8926 rtx xops[2];
8927 int regno;
8928
8929 for (regno = AX_REG; regno <= SP_REG; regno++)
8930 {
8931 char name[32];
8932 tree decl;
8933
8934 if (!(pic_labels_used & (1 << regno)))
8935 continue;
8936
8937 get_pc_thunk_name (name, regno);
8938
8939 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8940 get_identifier (name),
8941 build_function_type_list (void_type_node, NULL_TREE));
8942 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8943 NULL_TREE, void_type_node);
8944 TREE_PUBLIC (decl) = 1;
8945 TREE_STATIC (decl) = 1;
8946 DECL_IGNORED_P (decl) = 1;
8947
8948 #if TARGET_MACHO
8949 if (TARGET_MACHO)
8950 {
8951 switch_to_section (darwin_sections[text_coal_section]);
8952 fputs ("\t.weak_definition\t", asm_out_file);
8953 assemble_name (asm_out_file, name);
8954 fputs ("\n\t.private_extern\t", asm_out_file);
8955 assemble_name (asm_out_file, name);
8956 putc ('\n', asm_out_file);
8957 ASM_OUTPUT_LABEL (asm_out_file, name);
8958 DECL_WEAK (decl) = 1;
8959 }
8960 else
8961 #endif
8962 if (USE_HIDDEN_LINKONCE)
8963 {
8964 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8965
8966 targetm.asm_out.unique_section (decl, 0);
8967 switch_to_section (get_named_section (decl, NULL, 0));
8968
8969 targetm.asm_out.globalize_label (asm_out_file, name);
8970 fputs ("\t.hidden\t", asm_out_file);
8971 assemble_name (asm_out_file, name);
8972 putc ('\n', asm_out_file);
8973 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8974 }
8975 else
8976 {
8977 switch_to_section (text_section);
8978 ASM_OUTPUT_LABEL (asm_out_file, name);
8979 }
8980
8981 DECL_INITIAL (decl) = make_node (BLOCK);
8982 current_function_decl = decl;
8983 init_function_start (decl);
8984 first_function_block_is_cold = false;
8985 /* Make sure unwind info is emitted for the thunk if needed. */
8986 final_start_function (emit_barrier (), asm_out_file, 1);
8987
8988 /* Pad stack IP move with 4 instructions (two NOPs count
8989 as one instruction). */
8990 if (TARGET_PAD_SHORT_FUNCTION)
8991 {
8992 int i = 8;
8993
8994 while (i--)
8995 fputs ("\tnop\n", asm_out_file);
8996 }
8997
8998 xops[0] = gen_rtx_REG (Pmode, regno);
8999 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
9000 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
9001 output_asm_insn ("%!ret", NULL);
9002 final_end_function ();
9003 init_insn_lengths ();
9004 free_after_compilation (cfun);
9005 set_cfun (NULL);
9006 current_function_decl = NULL;
9007 }
9008
9009 if (flag_split_stack)
9010 file_end_indicate_split_stack ();
9011 }
9012
9013 /* Emit code for the SET_GOT patterns. */
9014
9015 const char *
9016 output_set_got (rtx dest, rtx label)
9017 {
9018 rtx xops[3];
9019
9020 xops[0] = dest;
9021
9022 if (TARGET_VXWORKS_RTP && flag_pic)
9023 {
9024 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
9025 xops[2] = gen_rtx_MEM (Pmode,
9026 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
9027 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
9028
9029 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
9030 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
9031 an unadorned address. */
9032 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
9033 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
9034 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
9035 return "";
9036 }
9037
9038 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
9039
9040 if (!flag_pic)
9041 {
9042 if (TARGET_MACHO)
9043 /* We don't need a pic base, we're not producing pic. */
9044 gcc_unreachable ();
9045
9046 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
9047 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
9048 targetm.asm_out.internal_label (asm_out_file, "L",
9049 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
9050 }
9051 else
9052 {
9053 char name[32];
9054 get_pc_thunk_name (name, REGNO (dest));
9055 pic_labels_used |= 1 << REGNO (dest);
9056
9057 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
9058 xops[2] = gen_rtx_MEM (QImode, xops[2]);
9059 output_asm_insn ("%!call\t%X2", xops);
9060
9061 #if TARGET_MACHO
9062 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
9063 This is what will be referenced by the Mach-O PIC subsystem. */
9064 if (machopic_should_output_picbase_label () || !label)
9065 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
9066
9067 /* When we are restoring the pic base at the site of a nonlocal label,
9068 and we decided to emit the pic base above, we will still output a
9069 local label used for calculating the correction offset (even though
9070 the offset will be 0 in that case). */
9071 if (label)
9072 targetm.asm_out.internal_label (asm_out_file, "L",
9073 CODE_LABEL_NUMBER (label));
9074 #endif
9075 }
9076
9077 if (!TARGET_MACHO)
9078 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
9079
9080 return "";
9081 }
9082
 9083 /* Generate a "push" pattern for input ARG. */
9084
9085 static rtx
9086 gen_push (rtx arg)
9087 {
9088 struct machine_function *m = cfun->machine;
9089
9090 if (m->fs.cfa_reg == stack_pointer_rtx)
9091 m->fs.cfa_offset += UNITS_PER_WORD;
9092 m->fs.sp_offset += UNITS_PER_WORD;
9093
9094 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9095 arg = gen_rtx_REG (word_mode, REGNO (arg));
9096
9097 return gen_rtx_SET (VOIDmode,
9098 gen_rtx_MEM (word_mode,
9099 gen_rtx_PRE_DEC (Pmode,
9100 stack_pointer_rtx)),
9101 arg);
9102 }
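 /* Illustrative note (not part of the original source): for a 64-bit
 register argument the pattern built above is roughly
 (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI arg))
 which is the RTL form of a single "push" instruction. */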
9103
 9104 /* Generate a "pop" pattern for input ARG. */
9105
9106 static rtx
9107 gen_pop (rtx arg)
9108 {
9109 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9110 arg = gen_rtx_REG (word_mode, REGNO (arg));
9111
9112 return gen_rtx_SET (VOIDmode,
9113 arg,
9114 gen_rtx_MEM (word_mode,
9115 gen_rtx_POST_INC (Pmode,
9116 stack_pointer_rtx)));
9117 }
9118
9119 /* Return >= 0 if there is an unused call-clobbered register available
9120 for the entire function. */
9121
9122 static unsigned int
9123 ix86_select_alt_pic_regnum (void)
9124 {
9125 if (crtl->is_leaf
9126 && !crtl->profile
9127 && !ix86_current_function_calls_tls_descriptor)
9128 {
9129 int i, drap;
9130 /* Can't use the same register for both PIC and DRAP. */
9131 if (crtl->drap_reg)
9132 drap = REGNO (crtl->drap_reg);
9133 else
9134 drap = -1;
9135 for (i = 2; i >= 0; --i)
9136 if (i != drap && !df_regs_ever_live_p (i))
9137 return i;
9138 }
9139
9140 return INVALID_REGNUM;
9141 }
9142
9143 /* Return TRUE if we need to save REGNO. */
9144
9145 static bool
9146 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
9147 {
9148 if (pic_offset_table_rtx
9149 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
9150 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
9151 || crtl->profile
9152 || crtl->calls_eh_return
9153 || crtl->uses_const_pool
9154 || cfun->has_nonlocal_label))
9155 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
9156
9157 if (crtl->calls_eh_return && maybe_eh_return)
9158 {
9159 unsigned i;
9160 for (i = 0; ; i++)
9161 {
9162 unsigned test = EH_RETURN_DATA_REGNO (i);
9163 if (test == INVALID_REGNUM)
9164 break;
9165 if (test == regno)
9166 return true;
9167 }
9168 }
9169
9170 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
9171 return true;
9172
9173 return (df_regs_ever_live_p (regno)
9174 && !call_used_regs[regno]
9175 && !fixed_regs[regno]
9176 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
9177 }
9178
 9179 /* Return the number of saved general purpose registers. */
9180
9181 static int
9182 ix86_nsaved_regs (void)
9183 {
9184 int nregs = 0;
9185 int regno;
9186
9187 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9188 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9189 nregs ++;
9190 return nregs;
9191 }
9192
 9193 /* Return the number of saved SSE registers. */
9194
9195 static int
9196 ix86_nsaved_sseregs (void)
9197 {
9198 int nregs = 0;
9199 int regno;
9200
9201 if (!TARGET_64BIT_MS_ABI)
9202 return 0;
9203 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9204 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9205 nregs ++;
9206 return nregs;
9207 }
9208
9209 /* Given FROM and TO register numbers, say whether this elimination is
9210 allowed. If stack alignment is needed, we can only replace argument
9211 pointer with hard frame pointer, or replace frame pointer with stack
9212 pointer. Otherwise, frame pointer elimination is automatically
9213 handled and all other eliminations are valid. */
9214
9215 static bool
9216 ix86_can_eliminate (const int from, const int to)
9217 {
9218 if (stack_realign_fp)
9219 return ((from == ARG_POINTER_REGNUM
9220 && to == HARD_FRAME_POINTER_REGNUM)
9221 || (from == FRAME_POINTER_REGNUM
9222 && to == STACK_POINTER_REGNUM));
9223 else
9224 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
9225 }
9226
9227 /* Return the offset between two registers, one to be eliminated, and the other
9228 its replacement, at the start of a routine. */
9229
9230 HOST_WIDE_INT
9231 ix86_initial_elimination_offset (int from, int to)
9232 {
9233 struct ix86_frame frame;
9234 ix86_compute_frame_layout (&frame);
9235
9236 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9237 return frame.hard_frame_pointer_offset;
9238 else if (from == FRAME_POINTER_REGNUM
9239 && to == HARD_FRAME_POINTER_REGNUM)
9240 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9241 else
9242 {
9243 gcc_assert (to == STACK_POINTER_REGNUM);
9244
9245 if (from == ARG_POINTER_REGNUM)
9246 return frame.stack_pointer_offset;
9247
9248 gcc_assert (from == FRAME_POINTER_REGNUM);
9249 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9250 }
9251 }
9252
9253 /* In a dynamically-aligned function, we can't know the offset from
9254 stack pointer to frame pointer, so we must ensure that setjmp
9255 eliminates fp against the hard fp (%ebp) rather than trying to
9256 index from %esp up to the top of the frame across a gap that is
9257 of unknown (at compile-time) size. */
9258 static rtx
9259 ix86_builtin_setjmp_frame_value (void)
9260 {
9261 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9262 }
9263
9264 /* When using -fsplit-stack, the allocation routines set a field in
9265 the TCB to the bottom of the stack plus this much space, measured
9266 in bytes. */
9267
9268 #define SPLIT_STACK_AVAILABLE 256
9269
 9270 /* Fill the ix86_frame structure describing the frame of the current function. */
9271
9272 static void
9273 ix86_compute_frame_layout (struct ix86_frame *frame)
9274 {
9275 unsigned HOST_WIDE_INT stack_alignment_needed;
9276 HOST_WIDE_INT offset;
9277 unsigned HOST_WIDE_INT preferred_alignment;
9278 HOST_WIDE_INT size = get_frame_size ();
9279 HOST_WIDE_INT to_allocate;
9280
9281 frame->nregs = ix86_nsaved_regs ();
9282 frame->nsseregs = ix86_nsaved_sseregs ();
9283
9284 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9285 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9286
 9287 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
 9288 except for function prologues and leaf functions. */
9289 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9290 && (!crtl->is_leaf || cfun->calls_alloca != 0
9291 || ix86_current_function_calls_tls_descriptor))
9292 {
9293 preferred_alignment = 16;
9294 stack_alignment_needed = 16;
9295 crtl->preferred_stack_boundary = 128;
9296 crtl->stack_alignment_needed = 128;
9297 }
9298
9299 gcc_assert (!size || stack_alignment_needed);
9300 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9301 gcc_assert (preferred_alignment <= stack_alignment_needed);
9302
9303 /* For SEH we have to limit the amount of code movement into the prologue.
9304 At present we do this via a BLOCKAGE, at which point there's very little
9305 scheduling that can be done, which means that there's very little point
9306 in doing anything except PUSHs. */
9307 if (TARGET_SEH)
9308 cfun->machine->use_fast_prologue_epilogue = false;
9309
 9310 /* During reload iteration the number of registers saved can change.
 9311 Recompute the value as needed. Do not recompute when the number of
 9312 registers didn't change, as reload makes multiple calls to this function
 9313 and does not expect the decision to change within a single iteration. */
9314 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))
9315 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9316 {
9317 int count = frame->nregs;
9318 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9319
9320 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9321
9322 /* The fast prologue uses move instead of push to save registers. This
9323 is significantly longer, but also executes faster as modern hardware
9324 can execute the moves in parallel, but can't do that for push/pop.
9325
 9326 Be careful about choosing which prologue to emit: when the function takes
 9327 many instructions to execute, we may as well use the slow version, as we
 9328 may when the function is known to be outside a hot spot (this is known
 9329 with feedback only). Weight the size of the function by the number of
 9330 registers to save, as it is cheap to use one or two push instructions
 9331 but very slow to use many of them. */
9332 if (count)
9333 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9334 if (node->frequency < NODE_FREQUENCY_NORMAL
9335 || (flag_branch_probabilities
9336 && node->frequency < NODE_FREQUENCY_HOT))
9337 cfun->machine->use_fast_prologue_epilogue = false;
9338 else
9339 cfun->machine->use_fast_prologue_epilogue
9340 = !expensive_function_p (count);
9341 }
9342
9343 frame->save_regs_using_mov
9344 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9345 /* If static stack checking is enabled and done with probes,
9346 the registers need to be saved before allocating the frame. */
9347 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9348
9349 /* Skip return address. */
9350 offset = UNITS_PER_WORD;
9351
9352 /* Skip pushed static chain. */
9353 if (ix86_static_chain_on_stack)
9354 offset += UNITS_PER_WORD;
9355
9356 /* Skip saved base pointer. */
9357 if (frame_pointer_needed)
9358 offset += UNITS_PER_WORD;
9359 frame->hfp_save_offset = offset;
9360
9361 /* The traditional frame pointer location is at the top of the frame. */
9362 frame->hard_frame_pointer_offset = offset;
9363
9364 /* Register save area */
9365 offset += frame->nregs * UNITS_PER_WORD;
9366 frame->reg_save_offset = offset;
9367
9368 /* On SEH target, registers are pushed just before the frame pointer
9369 location. */
9370 if (TARGET_SEH)
9371 frame->hard_frame_pointer_offset = offset;
9372
9373 /* Align and set SSE register save area. */
9374 if (frame->nsseregs)
9375 {
9376 /* The only ABI that has saved SSE registers (Win64) also has a
9377 16-byte aligned default stack, and thus we don't need to be
9378 within the re-aligned local stack frame to save them. */
9379 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9380 offset = (offset + 16 - 1) & -16;
9381 offset += frame->nsseregs * 16;
9382 }
9383 frame->sse_reg_save_offset = offset;
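 /* Illustrative note (not part of the original source): if the GP register
 saves end at offset 40 and two SSE registers must be saved, the offset
 is first aligned up, (40 + 15) & -16 == 48, and then grows by 2 * 16,
 so sse_reg_save_offset becomes 80. */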
9384
9385 /* The re-aligned stack starts here. Values before this point are not
9386 directly comparable with values below this point. In order to make
9387 sure that no value happens to be the same before and after, force
9388 the alignment computation below to add a non-zero value. */
9389 if (stack_realign_fp)
9390 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9391
9392 /* Va-arg area */
9393 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9394 offset += frame->va_arg_size;
9395
9396 /* Align start of frame for local function. */
9397 if (stack_realign_fp
9398 || offset != frame->sse_reg_save_offset
9399 || size != 0
9400 || !crtl->is_leaf
9401 || cfun->calls_alloca
9402 || ix86_current_function_calls_tls_descriptor)
9403 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9404
9405 /* Frame pointer points here. */
9406 frame->frame_pointer_offset = offset;
9407
9408 offset += size;
9409
 9410 /* Add the outgoing arguments area. It can be skipped if we eliminated
 9411 all the function calls as dead code.
 9412 Skipping is however impossible when the function calls alloca, as the
 9413 alloca expander assumes that the last crtl->outgoing_args_size bytes
 9414 of the stack frame are unused. */
9415 if (ACCUMULATE_OUTGOING_ARGS
9416 && (!crtl->is_leaf || cfun->calls_alloca
9417 || ix86_current_function_calls_tls_descriptor))
9418 {
9419 offset += crtl->outgoing_args_size;
9420 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9421 }
9422 else
9423 frame->outgoing_arguments_size = 0;
9424
9425 /* Align stack boundary. Only needed if we're calling another function
9426 or using alloca. */
9427 if (!crtl->is_leaf || cfun->calls_alloca
9428 || ix86_current_function_calls_tls_descriptor)
9429 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9430
9431 /* We've reached end of stack frame. */
9432 frame->stack_pointer_offset = offset;
9433
9434 /* Size prologue needs to allocate. */
9435 to_allocate = offset - frame->sse_reg_save_offset;
9436
9437 if ((!to_allocate && frame->nregs <= 1)
9438 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9439 frame->save_regs_using_mov = false;
9440
9441 if (ix86_using_red_zone ()
9442 && crtl->sp_is_unchanging
9443 && crtl->is_leaf
9444 && !ix86_current_function_calls_tls_descriptor)
9445 {
9446 frame->red_zone_size = to_allocate;
9447 if (frame->save_regs_using_mov)
9448 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9449 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9450 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9451 }
9452 else
9453 frame->red_zone_size = 0;
9454 frame->stack_pointer_offset -= frame->red_zone_size;
9455
9456 /* The SEH frame pointer location is near the bottom of the frame.
9457 This is enforced by the fact that the difference between the
9458 stack pointer and the frame pointer is limited to 240 bytes in
9459 the unwind data structure. */
9460 if (TARGET_SEH)
9461 {
9462 HOST_WIDE_INT diff;
9463
9464 /* If we can leave the frame pointer where it is, do so. Also, returns
9465 the establisher frame for __builtin_frame_address (0). */
9466 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9467 if (diff <= SEH_MAX_FRAME_SIZE
9468 && (diff > 240 || (diff & 15) != 0)
9469 && !crtl->accesses_prior_frames)
9470 {
9471 /* Ideally we'd determine what portion of the local stack frame
9472 (within the constraint of the lowest 240) is most heavily used.
9473 But without that complication, simply bias the frame pointer
9474 by 128 bytes so as to maximize the amount of the local stack
9475 frame that is addressable with 8-bit offsets. */
9476 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9477 }
9478 }
9479 }
9480
9481 /* This is semi-inlined memory_address_length, but simplified
9482 since we know that we're always dealing with reg+offset, and
9483 to avoid having to create and discard all that rtl. */
9484
9485 static inline int
9486 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9487 {
9488 int len = 4;
9489
9490 if (offset == 0)
9491 {
9492 /* EBP and R13 cannot be encoded without an offset. */
9493 len = (regno == BP_REG || regno == R13_REG);
9494 }
9495 else if (IN_RANGE (offset, -128, 127))
9496 len = 1;
9497
9498 /* ESP and R12 must be encoded with a SIB byte. */
9499 if (regno == SP_REG || regno == R12_REG)
9500 len++;
9501
9502 return len;
9503 }
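 /* Illustrative note (not part of the original source): sample lengths
 produced by the rules above:
 (%rax)       offset 0, not BP/R13, no SIB needed   -> 0
 (%rbp)       offset 0 but EBP requires a disp8     -> 1
 -8(%rsp)     disp8 plus the mandatory SIB byte     -> 2
 1024(%rbx)   disp32                                -> 4  */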
9504
9505 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9506 The valid base registers are taken from CFUN->MACHINE->FS. */
9507
9508 static rtx
9509 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9510 {
9511 const struct machine_function *m = cfun->machine;
9512 rtx base_reg = NULL;
9513 HOST_WIDE_INT base_offset = 0;
9514
9515 if (m->use_fast_prologue_epilogue)
9516 {
9517 /* Choose the base register most likely to allow the most scheduling
9518 opportunities. Generally FP is valid throughout the function,
9519 while DRAP must be reloaded within the epilogue. But choose either
9520 over the SP due to increased encoding size. */
9521
9522 if (m->fs.fp_valid)
9523 {
9524 base_reg = hard_frame_pointer_rtx;
9525 base_offset = m->fs.fp_offset - cfa_offset;
9526 }
9527 else if (m->fs.drap_valid)
9528 {
9529 base_reg = crtl->drap_reg;
9530 base_offset = 0 - cfa_offset;
9531 }
9532 else if (m->fs.sp_valid)
9533 {
9534 base_reg = stack_pointer_rtx;
9535 base_offset = m->fs.sp_offset - cfa_offset;
9536 }
9537 }
9538 else
9539 {
9540 HOST_WIDE_INT toffset;
9541 int len = 16, tlen;
9542
9543 /* Choose the base register with the smallest address encoding.
9544 With a tie, choose FP > DRAP > SP. */
9545 if (m->fs.sp_valid)
9546 {
9547 base_reg = stack_pointer_rtx;
9548 base_offset = m->fs.sp_offset - cfa_offset;
9549 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9550 }
9551 if (m->fs.drap_valid)
9552 {
9553 toffset = 0 - cfa_offset;
9554 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9555 if (tlen <= len)
9556 {
9557 base_reg = crtl->drap_reg;
9558 base_offset = toffset;
9559 len = tlen;
9560 }
9561 }
9562 if (m->fs.fp_valid)
9563 {
9564 toffset = m->fs.fp_offset - cfa_offset;
9565 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9566 if (tlen <= len)
9567 {
9568 base_reg = hard_frame_pointer_rtx;
9569 base_offset = toffset;
9570 len = tlen;
9571 }
9572 }
9573 }
9574 gcc_assert (base_reg != NULL);
9575
9576 return plus_constant (Pmode, base_reg, base_offset);
9577 }
9578
9579 /* Emit code to save registers in the prologue. */
9580
9581 static void
9582 ix86_emit_save_regs (void)
9583 {
9584 unsigned int regno;
9585 rtx insn;
9586
9587 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9588 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9589 {
9590 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9591 RTX_FRAME_RELATED_P (insn) = 1;
9592 }
9593 }
9594
9595 /* Emit a single register save at CFA - CFA_OFFSET. */
9596
9597 static void
9598 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9599 HOST_WIDE_INT cfa_offset)
9600 {
9601 struct machine_function *m = cfun->machine;
9602 rtx reg = gen_rtx_REG (mode, regno);
9603 rtx mem, addr, base, insn;
9604
9605 addr = choose_baseaddr (cfa_offset);
9606 mem = gen_frame_mem (mode, addr);
9607
9608 /* For SSE saves, we need to indicate the 128-bit alignment. */
9609 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9610
9611 insn = emit_move_insn (mem, reg);
9612 RTX_FRAME_RELATED_P (insn) = 1;
9613
9614 base = addr;
9615 if (GET_CODE (base) == PLUS)
9616 base = XEXP (base, 0);
9617 gcc_checking_assert (REG_P (base));
9618
9619 /* When saving registers into a re-aligned local stack frame, avoid
9620 any tricky guessing by dwarf2out. */
9621 if (m->fs.realigned)
9622 {
9623 gcc_checking_assert (stack_realign_drap);
9624
9625 if (regno == REGNO (crtl->drap_reg))
9626 {
9627 /* A bit of a hack. We force the DRAP register to be saved in
9628 the re-aligned stack frame, which provides us with a copy
9629 of the CFA that will last past the prologue. Install it. */
9630 gcc_checking_assert (cfun->machine->fs.fp_valid);
9631 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9632 cfun->machine->fs.fp_offset - cfa_offset);
9633 mem = gen_rtx_MEM (mode, addr);
9634 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9635 }
9636 else
9637 {
9638 /* The frame pointer is a stable reference within the
9639 aligned frame. Use it. */
9640 gcc_checking_assert (cfun->machine->fs.fp_valid);
9641 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9642 cfun->machine->fs.fp_offset - cfa_offset);
9643 mem = gen_rtx_MEM (mode, addr);
9644 add_reg_note (insn, REG_CFA_EXPRESSION,
9645 gen_rtx_SET (VOIDmode, mem, reg));
9646 }
9647 }
9648
9649 /* The memory may not be relative to the current CFA register,
9650 which means that we may need to generate a new pattern for
9651 use by the unwind info. */
9652 else if (base != m->fs.cfa_reg)
9653 {
9654 addr = plus_constant (Pmode, m->fs.cfa_reg,
9655 m->fs.cfa_offset - cfa_offset);
9656 mem = gen_rtx_MEM (mode, addr);
9657 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9658 }
9659 }
9660
9661 /* Emit code to save registers using MOV insns.
9662 First register is stored at CFA - CFA_OFFSET. */
9663 static void
9664 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9665 {
9666 unsigned int regno;
9667
9668 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9669 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9670 {
9671 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9672 cfa_offset -= UNITS_PER_WORD;
9673 }
9674 }
9675
9676 /* Emit code to save SSE registers using MOV insns.
9677 First register is stored at CFA - CFA_OFFSET. */
9678 static void
9679 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9680 {
9681 unsigned int regno;
9682
9683 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9684 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9685 {
9686 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9687 cfa_offset -= 16;
9688 }
9689 }
9690
9691 static GTY(()) rtx queued_cfa_restores;
9692
 9693 /* Add a REG_CFA_RESTORE REG note to INSN or queue it until the next stack
 9694 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
 9695 Don't add the note if the previously saved value will be left untouched
 9696 within the stack red zone until return, as unwinders can find the same value
 9697 in the register and on the stack. */
9698
9699 static void
9700 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9701 {
9702 if (!crtl->shrink_wrapped
9703 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9704 return;
9705
9706 if (insn)
9707 {
9708 add_reg_note (insn, REG_CFA_RESTORE, reg);
9709 RTX_FRAME_RELATED_P (insn) = 1;
9710 }
9711 else
9712 queued_cfa_restores
9713 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9714 }
9715
9716 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9717
9718 static void
9719 ix86_add_queued_cfa_restore_notes (rtx insn)
9720 {
9721 rtx last;
9722 if (!queued_cfa_restores)
9723 return;
9724 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9725 ;
9726 XEXP (last, 1) = REG_NOTES (insn);
9727 REG_NOTES (insn) = queued_cfa_restores;
9728 queued_cfa_restores = NULL_RTX;
9729 RTX_FRAME_RELATED_P (insn) = 1;
9730 }
9731
 9732 /* Expand a prologue or epilogue stack adjustment.
 9733 The pattern exists to put a dependency on all ebp-based memory accesses.
 9734 STYLE should be negative if instructions should be marked as frame related,
 9735 zero if the %r11 register is live and cannot be freely used, and positive
 9736 otherwise. */
9737
9738 static void
9739 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9740 int style, bool set_cfa)
9741 {
9742 struct machine_function *m = cfun->machine;
9743 rtx insn;
9744 bool add_frame_related_expr = false;
9745
9746 if (Pmode == SImode)
9747 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9748 else if (x86_64_immediate_operand (offset, DImode))
9749 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9750 else
9751 {
9752 rtx tmp;
9753 /* r11 is used by indirect sibcall return as well, set before the
9754 epilogue and used after the epilogue. */
9755 if (style)
9756 tmp = gen_rtx_REG (DImode, R11_REG);
9757 else
9758 {
9759 gcc_assert (src != hard_frame_pointer_rtx
9760 && dest != hard_frame_pointer_rtx);
9761 tmp = hard_frame_pointer_rtx;
9762 }
9763 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9764 if (style < 0)
9765 add_frame_related_expr = true;
9766
9767 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9768 }
9769
9770 insn = emit_insn (insn);
9771 if (style >= 0)
9772 ix86_add_queued_cfa_restore_notes (insn);
9773
9774 if (set_cfa)
9775 {
9776 rtx r;
9777
9778 gcc_assert (m->fs.cfa_reg == src);
9779 m->fs.cfa_offset += INTVAL (offset);
9780 m->fs.cfa_reg = dest;
9781
9782 r = gen_rtx_PLUS (Pmode, src, offset);
9783 r = gen_rtx_SET (VOIDmode, dest, r);
9784 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9785 RTX_FRAME_RELATED_P (insn) = 1;
9786 }
9787 else if (style < 0)
9788 {
9789 RTX_FRAME_RELATED_P (insn) = 1;
9790 if (add_frame_related_expr)
9791 {
9792 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9793 r = gen_rtx_SET (VOIDmode, dest, r);
9794 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9795 }
9796 }
9797
9798 if (dest == stack_pointer_rtx)
9799 {
9800 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9801 bool valid = m->fs.sp_valid;
9802
9803 if (src == hard_frame_pointer_rtx)
9804 {
9805 valid = m->fs.fp_valid;
9806 ooffset = m->fs.fp_offset;
9807 }
9808 else if (src == crtl->drap_reg)
9809 {
9810 valid = m->fs.drap_valid;
9811 ooffset = 0;
9812 }
9813 else
9814 {
 9815 /* Else there are two possibilities: SP itself, which we set
 9816 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
 9817 taken care of by hand along the eh_return path. */
9818 gcc_checking_assert (src == stack_pointer_rtx
9819 || offset == const0_rtx);
9820 }
9821
9822 m->fs.sp_offset = ooffset - INTVAL (offset);
9823 m->fs.sp_valid = valid;
9824 }
9825 }
9826
 9827 /* Find an available register to be used as the dynamic realign argument
 9828 pointer register. Such a register will be written in the prologue and
 9829 used at the beginning of the body, so it must not be
 9830 1. a parameter passing register.
 9831 2. the GOT pointer.
 9832 We reuse the static-chain register if it is available. Otherwise, we
9833 use DI for i386 and R13 for x86-64. We chose R13 since it has
9834 shorter encoding.
9835
9836 Return: the regno of chosen register. */
9837
9838 static unsigned int
9839 find_drap_reg (void)
9840 {
9841 tree decl = cfun->decl;
9842
9843 if (TARGET_64BIT)
9844 {
 9845 /* Use R13 for a nested function or a function that needs a static chain.
 9846 Since a function with a tail call may use any caller-saved
 9847 registers in the epilogue, DRAP must not use a caller-saved
 9848 register in that case. */
9849 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9850 return R13_REG;
9851
9852 return R10_REG;
9853 }
9854 else
9855 {
 9856 /* Use DI for a nested function or a function that needs a static chain.
 9857 Since a function with a tail call may use any caller-saved
 9858 registers in the epilogue, DRAP must not use a caller-saved
 9859 register in that case. */
9860 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9861 return DI_REG;
9862
9863 /* Reuse static chain register if it isn't used for parameter
9864 passing. */
9865 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9866 {
9867 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9868 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9869 return CX_REG;
9870 }
9871 return DI_REG;
9872 }
9873 }
9874
9875 /* Return minimum incoming stack alignment. */
9876
9877 static unsigned int
9878 ix86_minimum_incoming_stack_boundary (bool sibcall)
9879 {
9880 unsigned int incoming_stack_boundary;
9881
9882 /* Prefer the one specified at command line. */
9883 if (ix86_user_incoming_stack_boundary)
9884 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
 9885 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
 9886 if -mstackrealign is used, this isn't a sibcall check, and the
 9887 estimated stack alignment is 128 bits. */
9888 else if (!sibcall
9889 && !TARGET_64BIT
9890 && ix86_force_align_arg_pointer
9891 && crtl->stack_alignment_estimated == 128)
9892 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9893 else
9894 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9895
9896 /* Incoming stack alignment can be changed on individual functions
9897 via force_align_arg_pointer attribute. We use the smallest
9898 incoming stack boundary. */
9899 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9900 && lookup_attribute (ix86_force_align_arg_pointer_string,
9901 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9902 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9903
9904 /* The incoming stack frame has to be aligned at least at
9905 parm_stack_boundary. */
9906 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9907 incoming_stack_boundary = crtl->parm_stack_boundary;
9908
 9909 /* The stack at the entrance of main is aligned by the runtime. We use
 9910 the smallest incoming stack boundary. */
9911 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9912 && DECL_NAME (current_function_decl)
9913 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9914 && DECL_FILE_SCOPE_P (current_function_decl))
9915 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9916
9917 return incoming_stack_boundary;
9918 }
9919
9920 /* Update incoming stack boundary and estimated stack alignment. */
9921
9922 static void
9923 ix86_update_stack_boundary (void)
9924 {
9925 ix86_incoming_stack_boundary
9926 = ix86_minimum_incoming_stack_boundary (false);
9927
 9928 /* x86_64 varargs need 16-byte stack alignment for the register save
 9929 area. */
9930 if (TARGET_64BIT
9931 && cfun->stdarg
9932 && crtl->stack_alignment_estimated < 128)
9933 crtl->stack_alignment_estimated = 128;
9934 }
9935
9936 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9937 needed or an rtx for DRAP otherwise. */
9938
9939 static rtx
9940 ix86_get_drap_rtx (void)
9941 {
9942 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9943 crtl->need_drap = true;
9944
9945 if (stack_realign_drap)
9946 {
 9947 /* Assign DRAP to vDRAP and return vDRAP. */
9948 unsigned int regno = find_drap_reg ();
9949 rtx drap_vreg;
9950 rtx arg_ptr;
9951 rtx seq, insn;
9952
9953 arg_ptr = gen_rtx_REG (Pmode, regno);
9954 crtl->drap_reg = arg_ptr;
9955
9956 start_sequence ();
9957 drap_vreg = copy_to_reg (arg_ptr);
9958 seq = get_insns ();
9959 end_sequence ();
9960
9961 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9962 if (!optimize)
9963 {
9964 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9965 RTX_FRAME_RELATED_P (insn) = 1;
9966 }
9967 return drap_vreg;
9968 }
9969 else
9970 return NULL;
9971 }
9972
9973 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9974
9975 static rtx
9976 ix86_internal_arg_pointer (void)
9977 {
9978 return virtual_incoming_args_rtx;
9979 }
9980
9981 struct scratch_reg {
9982 rtx reg;
9983 bool saved;
9984 };
9985
9986 /* Return a short-lived scratch register for use on function entry.
9987 In 32-bit mode, it is valid only after the registers are saved
9988 in the prologue. This register must be released by means of
9989 release_scratch_register_on_entry once it is dead. */
9990
9991 static void
9992 get_scratch_register_on_entry (struct scratch_reg *sr)
9993 {
9994 int regno;
9995
9996 sr->saved = false;
9997
9998 if (TARGET_64BIT)
9999 {
10000 /* We always use R11 in 64-bit mode. */
10001 regno = R11_REG;
10002 }
10003 else
10004 {
10005 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
10006 bool fastcall_p
10007 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
10008 bool thiscall_p
10009 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
10010 bool static_chain_p = DECL_STATIC_CHAIN (decl);
10011 int regparm = ix86_function_regparm (fntype, decl);
10012 int drap_regno
10013 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
10014
10015 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
10016 for the static chain register. */
10017 if ((regparm < 1 || (fastcall_p && !static_chain_p))
10018 && drap_regno != AX_REG)
10019 regno = AX_REG;
10020 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
10021 for the static chain register. */
10022 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
10023 regno = AX_REG;
10024 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
10025 regno = DX_REG;
10026 /* ecx is the static chain register. */
10027 else if (regparm < 3 && !fastcall_p && !thiscall_p
10028 && !static_chain_p
10029 && drap_regno != CX_REG)
10030 regno = CX_REG;
10031 else if (ix86_save_reg (BX_REG, true))
10032 regno = BX_REG;
10033 /* esi is the static chain register. */
10034 else if (!(regparm == 3 && static_chain_p)
10035 && ix86_save_reg (SI_REG, true))
10036 regno = SI_REG;
10037 else if (ix86_save_reg (DI_REG, true))
10038 regno = DI_REG;
10039 else
10040 {
10041 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
10042 sr->saved = true;
10043 }
10044 }
10045
10046 sr->reg = gen_rtx_REG (Pmode, regno);
10047 if (sr->saved)
10048 {
10049 rtx insn = emit_insn (gen_push (sr->reg));
10050 RTX_FRAME_RELATED_P (insn) = 1;
10051 }
10052 }
10053
10054 /* Release a scratch register obtained from the preceding function. */
10055
10056 static void
10057 release_scratch_register_on_entry (struct scratch_reg *sr)
10058 {
10059 if (sr->saved)
10060 {
10061 struct machine_function *m = cfun->machine;
10062 rtx x, insn = emit_insn (gen_pop (sr->reg));
10063
10064 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
10065 RTX_FRAME_RELATED_P (insn) = 1;
10066 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
10067 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10068 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
10069 m->fs.sp_offset -= UNITS_PER_WORD;
10070 }
10071 }
10072
10073 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
10074
10075 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
10076
10077 static void
10078 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
10079 {
 10080 /* We skip the probe for the first interval + a small dope of 4 words and
 10081 probe that many bytes past the specified size to maintain a protection
 10082 area at the bottom of the stack. */
10083 const int dope = 4 * UNITS_PER_WORD;
10084 rtx size_rtx = GEN_INT (size), last;
10085
10086 /* See if we have a constant small number of probes to generate. If so,
10087 that's the easy case. The run-time loop is made up of 11 insns in the
10088 generic case while the compile-time loop is made up of 3+2*(n-1) insns
10089 for n # of intervals. */
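 /* Illustrative note (not part of the original source): assuming the
 default 4 KB probe interval, a 12 KB allocation uses the compile-time
 sequence (3 intervals, roughly 3 + 2*(3-1) == 7 insns), while anything
 larger than 5 intervals (20 KB) falls through to the run-time loop. */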
10090 if (size <= 5 * PROBE_INTERVAL)
10091 {
10092 HOST_WIDE_INT i, adjust;
10093 bool first_probe = true;
10094
10095 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
10096 values of N from 1 until it exceeds SIZE. If only one probe is
10097 needed, this will not generate any code. Then adjust and probe
10098 to PROBE_INTERVAL + SIZE. */
10099 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10100 {
10101 if (first_probe)
10102 {
10103 adjust = 2 * PROBE_INTERVAL + dope;
10104 first_probe = false;
10105 }
10106 else
10107 adjust = PROBE_INTERVAL;
10108
10109 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10110 plus_constant (Pmode, stack_pointer_rtx,
10111 -adjust)));
10112 emit_stack_probe (stack_pointer_rtx);
10113 }
10114
10115 if (first_probe)
10116 adjust = size + PROBE_INTERVAL + dope;
10117 else
10118 adjust = size + PROBE_INTERVAL - i;
10119
10120 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10121 plus_constant (Pmode, stack_pointer_rtx,
10122 -adjust)));
10123 emit_stack_probe (stack_pointer_rtx);
10124
10125 /* Adjust back to account for the additional first interval. */
10126 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10127 plus_constant (Pmode, stack_pointer_rtx,
10128 PROBE_INTERVAL + dope)));
10129 }
10130
10131 /* Otherwise, do the same as above, but in a loop. Note that we must be
10132 extra careful with variables wrapping around because we might be at
10133 the very top (or the very bottom) of the address space and we have
10134 to be able to handle this case properly; in particular, we use an
10135 equality test for the loop condition. */
10136 else
10137 {
10138 HOST_WIDE_INT rounded_size;
10139 struct scratch_reg sr;
10140
10141 get_scratch_register_on_entry (&sr);
10142
10143
10144 /* Step 1: round SIZE to the previous multiple of the interval. */
10145
10146 rounded_size = size & -PROBE_INTERVAL;
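 /* Illustrative note (not part of the original source): assuming a 4 KB
 probe interval, a size of 0x4800 gives rounded_size == 0x4000; the
 remaining 0x800 bytes are handled by the extra probe in Step 4. */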
10147
10148
10149 /* Step 2: compute initial and final value of the loop counter. */
10150
10151 /* SP = SP_0 + PROBE_INTERVAL. */
10152 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10153 plus_constant (Pmode, stack_pointer_rtx,
10154 - (PROBE_INTERVAL + dope))));
10155
10156 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
10157 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
10158 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
10159 gen_rtx_PLUS (Pmode, sr.reg,
10160 stack_pointer_rtx)));
10161
10162
10163 /* Step 3: the loop
10164
10165 while (SP != LAST_ADDR)
10166 {
10167 SP = SP + PROBE_INTERVAL
10168 probe at SP
10169 }
10170
10171 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
10172 values of N from 1 until it is equal to ROUNDED_SIZE. */
10173
10174 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
10175
10176
10177 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
10178 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
10179
10180 if (size != rounded_size)
10181 {
10182 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10183 plus_constant (Pmode, stack_pointer_rtx,
10184 rounded_size - size)));
10185 emit_stack_probe (stack_pointer_rtx);
10186 }
10187
10188 /* Adjust back to account for the additional first interval. */
10189 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10190 plus_constant (Pmode, stack_pointer_rtx,
10191 PROBE_INTERVAL + dope)));
10192
10193 release_scratch_register_on_entry (&sr);
10194 }
10195
10196 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
10197
10198 /* Even if the stack pointer isn't the CFA register, we need to correctly
10199 describe the adjustments made to it, in particular differentiate the
10200 frame-related ones from the frame-unrelated ones. */
10201 if (size > 0)
10202 {
10203 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
10204 XVECEXP (expr, 0, 0)
10205 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10206 plus_constant (Pmode, stack_pointer_rtx, -size));
10207 XVECEXP (expr, 0, 1)
10208 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10209 plus_constant (Pmode, stack_pointer_rtx,
10210 PROBE_INTERVAL + dope + size));
10211 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
10212 RTX_FRAME_RELATED_P (last) = 1;
10213
10214 cfun->machine->fs.sp_offset += size;
10215 }
10216
10217 /* Make sure nothing is scheduled before we are done. */
10218 emit_insn (gen_blockage ());
10219 }
10220
10221 /* Adjust the stack pointer up to REG while probing it. */
10222
10223 const char *
10224 output_adjust_stack_and_probe (rtx reg)
10225 {
10226 static int labelno = 0;
10227 char loop_lab[32], end_lab[32];
10228 rtx xops[2];
10229
10230 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10231 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10232
10233 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10234
10235 /* Jump to END_LAB if SP == LAST_ADDR. */
10236 xops[0] = stack_pointer_rtx;
10237 xops[1] = reg;
10238 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10239 fputs ("\tje\t", asm_out_file);
10240 assemble_name_raw (asm_out_file, end_lab);
10241 fputc ('\n', asm_out_file);
10242
10243 /* SP = SP + PROBE_INTERVAL. */
10244 xops[1] = GEN_INT (PROBE_INTERVAL);
10245 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10246
10247 /* Probe at SP. */
10248 xops[1] = const0_rtx;
10249 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10250
10251 fprintf (asm_out_file, "\tjmp\t");
10252 assemble_name_raw (asm_out_file, loop_lab);
10253 fputc ('\n', asm_out_file);
10254
10255 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10256
10257 return "";
10258 }
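/* A sketch of the AT&T-syntax loop the routine above emits (assuming a 32-bit
   target and an ELF-style local label prefix):

	.LPSRL0:	cmpl	%reg, %esp
			je	.LPSRE0
			subl	$PROBE_INTERVAL, %esp
			orl	$0, (%esp)
			jmp	.LPSRL0
	.LPSRE0:

   The "or $0" is the actual probe: it touches the word at the new stack
   pointer without changing its value, faulting if the page is unmapped.  */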
10259
10260 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10261 inclusive. These are offsets from the current stack pointer. */
10262
10263 static void
10264 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10265 {
10266 /* See if we have a constant small number of probes to generate. If so,
10267 that's the easy case. The run-time loop is made up of 7 insns in the
10268 generic case while the compile-time loop is made up of n insns for n #
10269 of intervals. */
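  /* For example, a SIZE spanning 4 intervals unrolls to 4 probe insns,
     versus the 7-insn runtime loop used for larger sizes.  */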
10270 if (size <= 7 * PROBE_INTERVAL)
10271 {
10272 HOST_WIDE_INT i;
10273
10274 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10275 it exceeds SIZE. If only one probe is needed, this will not
10276 generate any code. Then probe at FIRST + SIZE. */
10277 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10278 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10279 -(first + i)));
10280
10281 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10282 -(first + size)));
10283 }
10284
10285 /* Otherwise, do the same as above, but in a loop. Note that we must be
10286 extra careful with variables wrapping around because we might be at
10287 the very top (or the very bottom) of the address space and we have
10288 to be able to handle this case properly; in particular, we use an
10289 equality test for the loop condition. */
10290 else
10291 {
10292 HOST_WIDE_INT rounded_size, last;
10293 struct scratch_reg sr;
10294
10295 get_scratch_register_on_entry (&sr);
10296
10297
10298 /* Step 1: round SIZE to the previous multiple of the interval. */
10299
10300 rounded_size = size & -PROBE_INTERVAL;
10301
10302
10303 /* Step 2: compute initial and final value of the loop counter. */
10304
10305 /* TEST_OFFSET = FIRST. */
10306 emit_move_insn (sr.reg, GEN_INT (-first));
10307
10308 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10309 last = first + rounded_size;
10310
10311
10312 /* Step 3: the loop
10313
10314 while (TEST_ADDR != LAST_ADDR)
10315 {
10316 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10317 probe at TEST_ADDR
10318 }
10319
10320 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10321 until it is equal to ROUNDED_SIZE. */
10322
10323 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10324
10325
10326 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10327 that SIZE is equal to ROUNDED_SIZE. */
10328
10329 if (size != rounded_size)
10330 emit_stack_probe (plus_constant (Pmode,
10331 gen_rtx_PLUS (Pmode,
10332 stack_pointer_rtx,
10333 sr.reg),
10334 rounded_size - size));
10335
10336 release_scratch_register_on_entry (&sr);
10337 }
10338
10339 /* Make sure nothing is scheduled before we are done. */
10340 emit_insn (gen_blockage ());
10341 }
10342
10343 /* Probe a range of stack addresses from REG to END, inclusive. These are
10344 offsets from the current stack pointer. */
10345
10346 const char *
10347 output_probe_stack_range (rtx reg, rtx end)
10348 {
10349 static int labelno = 0;
10350 char loop_lab[32], end_lab[32];
10351 rtx xops[3];
10352
10353 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10354 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10355
10356 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10357
10358 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10359 xops[0] = reg;
10360 xops[1] = end;
10361 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10362 fputs ("\tje\t", asm_out_file);
10363 assemble_name_raw (asm_out_file, end_lab);
10364 fputc ('\n', asm_out_file);
10365
10366 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10367 xops[1] = GEN_INT (PROBE_INTERVAL);
10368 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10369
10370 /* Probe at TEST_ADDR. */
10371 xops[0] = stack_pointer_rtx;
10372 xops[1] = reg;
10373 xops[2] = const0_rtx;
10374 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10375
10376 fprintf (asm_out_file, "\tjmp\t");
10377 assemble_name_raw (asm_out_file, loop_lab);
10378 fputc ('\n', asm_out_file);
10379
10380 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10381
10382 return "";
10383 }
10384
10385 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10386 to be generated in correct form. */
10387 static void
10388 ix86_finalize_stack_realign_flags (void)
10389 {
10390 /* Check if stack realign is really needed after reload, and
10391 store the result in cfun. */
10392 unsigned int incoming_stack_boundary
10393 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10394 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10395 unsigned int stack_realign = (incoming_stack_boundary
10396 < (crtl->is_leaf
10397 ? crtl->max_used_stack_slot_alignment
10398 : crtl->stack_alignment_needed));
10399
10400 if (crtl->stack_realign_finalized)
10401 {
10402 /* After stack_realign_needed is finalized, we can no longer
10403 change it. */
10404 gcc_assert (crtl->stack_realign_needed == stack_realign);
10405 return;
10406 }
10407
10408 /* If the only reason for frame_pointer_needed is that we conservatively
10409 assumed stack realignment might be needed, but in the end nothing that
10410 needed the stack alignment had been spilled, clear frame_pointer_needed
10411 and say we don't need stack realignment. */
10412 if (stack_realign
10413 && !crtl->need_drap
10414 && frame_pointer_needed
10415 && crtl->is_leaf
10416 && flag_omit_frame_pointer
10417 && crtl->sp_is_unchanging
10418 && !ix86_current_function_calls_tls_descriptor
10419 && !crtl->accesses_prior_frames
10420 && !cfun->calls_alloca
10421 && !crtl->calls_eh_return
10422 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10423 && !ix86_frame_pointer_required ()
10424 && get_frame_size () == 0
10425 && ix86_nsaved_sseregs () == 0
10426 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10427 {
10428 HARD_REG_SET set_up_by_prologue, prologue_used;
10429 basic_block bb;
10430
10431 CLEAR_HARD_REG_SET (prologue_used);
10432 CLEAR_HARD_REG_SET (set_up_by_prologue);
10433 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10434 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10435 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10436 HARD_FRAME_POINTER_REGNUM);
10437 FOR_EACH_BB (bb)
10438 {
10439 rtx insn;
10440 FOR_BB_INSNS (bb, insn)
10441 if (NONDEBUG_INSN_P (insn)
10442 && requires_stack_frame_p (insn, prologue_used,
10443 set_up_by_prologue))
10444 {
10445 crtl->stack_realign_needed = stack_realign;
10446 crtl->stack_realign_finalized = true;
10447 return;
10448 }
10449 }
10450
10451 frame_pointer_needed = false;
10452 stack_realign = false;
10453 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10454 crtl->stack_alignment_needed = incoming_stack_boundary;
10455 crtl->stack_alignment_estimated = incoming_stack_boundary;
10456 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10457 crtl->preferred_stack_boundary = incoming_stack_boundary;
10458 df_finish_pass (true);
10459 df_scan_alloc (NULL);
10460 df_scan_blocks ();
10461 df_compute_regs_ever_live (true);
10462 df_analyze ();
10463 }
10464
10465 crtl->stack_realign_needed = stack_realign;
10466 crtl->stack_realign_finalized = true;
10467 }
10468
10469 /* Expand the prologue into a bunch of separate insns. */
10470
10471 void
10472 ix86_expand_prologue (void)
10473 {
10474 struct machine_function *m = cfun->machine;
10475 rtx insn, t;
10476 bool pic_reg_used;
10477 struct ix86_frame frame;
10478 HOST_WIDE_INT allocate;
10479 bool int_registers_saved;
10480 bool sse_registers_saved;
10481
10482 ix86_finalize_stack_realign_flags ();
10483
10484 /* DRAP should not coexist with stack_realign_fp */
10485 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10486
10487 memset (&m->fs, 0, sizeof (m->fs));
10488
10489 /* Initialize CFA state for before the prologue. */
10490 m->fs.cfa_reg = stack_pointer_rtx;
10491 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10492
10493 /* Track SP offset to the CFA. We continue tracking this after we've
10494 swapped the CFA register away from SP. In the case of re-alignment
10495 this is fudged; we're interested in offsets within the local frame. */
10496 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10497 m->fs.sp_valid = true;
10498
10499 ix86_compute_frame_layout (&frame);
10500
10501 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10502 {
10503 /* We should have already generated an error for any use of
10504 ms_hook on a nested function. */
10505 gcc_checking_assert (!ix86_static_chain_on_stack);
10506
10507 /* Check whether profiling is active and whether we shall use the
10508 profiling-before-prologue variant. If so, issue a sorry. */
10509 if (crtl->profile && flag_fentry != 0)
10510 sorry ("ms_hook_prologue attribute isn%'t compatible "
10511 "with -mfentry for 32-bit");
10512
10513 /* In ix86_asm_output_function_label we emitted:
10514 8b ff movl.s %edi,%edi
10515 55 push %ebp
10516 8b ec movl.s %esp,%ebp
10517
10518 This matches the hookable function prologue in Win32 API
10519 functions in Microsoft Windows XP Service Pack 2 and newer.
10520 Wine uses this to enable Windows apps to hook the Win32 API
10521 functions provided by Wine.
10522
10523 What that means is that we've already set up the frame pointer. */
10524
10525 if (frame_pointer_needed
10526 && !(crtl->drap_reg && crtl->stack_realign_needed))
10527 {
10528 rtx push, mov;
10529
10530 /* We've decided to use the frame pointer already set up.
10531 Describe this to the unwinder by pretending that both
10532 push and mov insns happen right here.
10533
10534 Putting the unwind info here at the end of the ms_hook
10535 is done so that we can make absolutely certain we get
10536 the required byte sequence at the start of the function,
10537 rather than relying on an assembler that can produce
10538 the exact encoding required.
10539
10540 However it does mean (in the unpatched case) that we have
10541 a 1 insn window where the asynchronous unwind info is
10542 incorrect. However, if we placed the unwind info at
10543 its correct location we would have incorrect unwind info
10544 in the patched case. Which is probably all moot since
10545 I don't expect Wine generates dwarf2 unwind info for the
10546 system libraries that use this feature. */
10547
10548 insn = emit_insn (gen_blockage ());
10549
10550 push = gen_push (hard_frame_pointer_rtx);
10551 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10552 stack_pointer_rtx);
10553 RTX_FRAME_RELATED_P (push) = 1;
10554 RTX_FRAME_RELATED_P (mov) = 1;
10555
10556 RTX_FRAME_RELATED_P (insn) = 1;
10557 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10558 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10559
10560 /* Note that gen_push incremented m->fs.cfa_offset, even
10561 though we didn't emit the push insn here. */
10562 m->fs.cfa_reg = hard_frame_pointer_rtx;
10563 m->fs.fp_offset = m->fs.cfa_offset;
10564 m->fs.fp_valid = true;
10565 }
10566 else
10567 {
10568 /* The frame pointer is not needed so pop %ebp again.
10569 This leaves us with a pristine state. */
10570 emit_insn (gen_pop (hard_frame_pointer_rtx));
10571 }
10572 }
10573
10574 /* The first insn of a function that accepts its static chain on the
10575 stack is to push the register that would be filled in by a direct
10576 call. This insn will be skipped by the trampoline. */
10577 else if (ix86_static_chain_on_stack)
10578 {
10579 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10580 emit_insn (gen_blockage ());
10581
10582 /* We don't want to interpret this push insn as a register save,
10583 only as a stack adjustment. The real copy of the register as
10584 a save will be done later, if needed. */
10585 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10586 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10587 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10588 RTX_FRAME_RELATED_P (insn) = 1;
10589 }
10590
10591 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10592 DRAP is needed and stack realignment is really needed after reload. */
10593 if (stack_realign_drap)
10594 {
10595 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10596
10597 /* Only need to push parameter pointer reg if it is caller saved. */
10598 if (!call_used_regs[REGNO (crtl->drap_reg)])
10599 {
10600 /* Push arg pointer reg */
10601 insn = emit_insn (gen_push (crtl->drap_reg));
10602 RTX_FRAME_RELATED_P (insn) = 1;
10603 }
10604
10605 /* Grab the argument pointer. */
10606 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10607 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10608 RTX_FRAME_RELATED_P (insn) = 1;
10609 m->fs.cfa_reg = crtl->drap_reg;
10610 m->fs.cfa_offset = 0;
10611
10612 /* Align the stack. */
10613 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10614 stack_pointer_rtx,
10615 GEN_INT (-align_bytes)));
10616 RTX_FRAME_RELATED_P (insn) = 1;
10617
10618 /* Replicate the return address on the stack so that return
10619 address can be reached via (argp - 1) slot. This is needed
10620 to implement macro RETURN_ADDR_RTX and intrinsic function
10621 expand_builtin_return_addr etc. */
10622 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10623 t = gen_frame_mem (word_mode, t);
10624 insn = emit_insn (gen_push (t));
10625 RTX_FRAME_RELATED_P (insn) = 1;
10626
10627 /* For the purposes of frame and register save area addressing,
10628 we've started over with a new frame. */
10629 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10630 m->fs.realigned = true;
10631 }
10632
10633 int_registers_saved = (frame.nregs == 0);
10634 sse_registers_saved = (frame.nsseregs == 0);
10635
10636 if (frame_pointer_needed && !m->fs.fp_valid)
10637 {
10638 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10639 slower on all targets. Also sdb doesn't like it. */
10640 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10641 RTX_FRAME_RELATED_P (insn) = 1;
10642
10643 /* Push registers now, before setting the frame pointer
10644 on SEH target. */
10645 if (!int_registers_saved
10646 && TARGET_SEH
10647 && !frame.save_regs_using_mov)
10648 {
10649 ix86_emit_save_regs ();
10650 int_registers_saved = true;
10651 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10652 }
10653
10654 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10655 {
10656 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10657 RTX_FRAME_RELATED_P (insn) = 1;
10658
10659 if (m->fs.cfa_reg == stack_pointer_rtx)
10660 m->fs.cfa_reg = hard_frame_pointer_rtx;
10661 m->fs.fp_offset = m->fs.sp_offset;
10662 m->fs.fp_valid = true;
10663 }
10664 }
10665
10666 if (!int_registers_saved)
10667 {
10668 /* If saving registers via PUSH, do so now. */
10669 if (!frame.save_regs_using_mov)
10670 {
10671 ix86_emit_save_regs ();
10672 int_registers_saved = true;
10673 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10674 }
10675
10676 /* When using the red zone we may start register saving before allocating
10677 the stack frame, saving one cycle of the prologue. However, avoid
10678 doing this if we have to probe the stack; at least on x86_64 the
10679 stack probe can turn into a call that clobbers a red zone location. */
10680 else if (ix86_using_red_zone ()
10681 && (! TARGET_STACK_PROBE
10682 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10683 {
10684 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10685 int_registers_saved = true;
10686 }
10687 }
10688
10689 if (stack_realign_fp)
10690 {
10691 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10692 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10693
10694 /* The computation of the size of the re-aligned stack frame means
10695 that we must allocate the size of the register save area before
10696 performing the actual alignment. Otherwise we cannot guarantee
10697 that there's enough storage above the realignment point. */
10698 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10699 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10700 GEN_INT (m->fs.sp_offset
10701 - frame.sse_reg_save_offset),
10702 -1, false);
10703
10704 /* Align the stack. */
10705 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10706 stack_pointer_rtx,
10707 GEN_INT (-align_bytes)));
10708
10709 /* For the purposes of register save area addressing, the stack
10710 pointer is no longer valid. As for the value of sp_offset,
10711 see ix86_compute_frame_layout, which we need to match in order
10712 to pass verification of stack_pointer_offset at the end. */
10713 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10714 m->fs.sp_valid = false;
10715 }
10716
10717 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10718
10719 if (flag_stack_usage_info)
10720 {
10721 /* We start to count from ARG_POINTER. */
10722 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10723
10724 /* If it was realigned, take into account the fake frame. */
10725 if (stack_realign_drap)
10726 {
10727 if (ix86_static_chain_on_stack)
10728 stack_size += UNITS_PER_WORD;
10729
10730 if (!call_used_regs[REGNO (crtl->drap_reg)])
10731 stack_size += UNITS_PER_WORD;
10732
10733 /* This over-estimates by 1 minimal-stack-alignment-unit but
10734 mitigates that by counting in the new return address slot. */
10735 current_function_dynamic_stack_size
10736 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10737 }
10738
10739 current_function_static_stack_size = stack_size;
10740 }
10741
10742 /* On SEH target with very large frame size, allocate an area to save
10743 SSE registers (as the very large allocation won't be described). */
10744 if (TARGET_SEH
10745 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10746 && !sse_registers_saved)
10747 {
10748 HOST_WIDE_INT sse_size =
10749 frame.sse_reg_save_offset - frame.reg_save_offset;
10750
10751 gcc_assert (int_registers_saved);
10752
10753 /* No need to do stack checking as the area will be immediately
10754 written. */
10755 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10756 GEN_INT (-sse_size), -1,
10757 m->fs.cfa_reg == stack_pointer_rtx);
10758 allocate -= sse_size;
10759 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10760 sse_registers_saved = true;
10761 }
10762
10763 /* The stack has already been decremented by the instruction calling us
10764 so probe if the size is non-negative to preserve the protection area. */
10765 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10766 {
10767 /* We expect the registers to be saved when probes are used. */
10768 gcc_assert (int_registers_saved);
10769
10770 if (STACK_CHECK_MOVING_SP)
10771 {
10772 if (!(crtl->is_leaf && !cfun->calls_alloca
10773 && allocate <= PROBE_INTERVAL))
10774 {
10775 ix86_adjust_stack_and_probe (allocate);
10776 allocate = 0;
10777 }
10778 }
10779 else
10780 {
10781 HOST_WIDE_INT size = allocate;
10782
10783 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10784 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10785
10786 if (TARGET_STACK_PROBE)
10787 {
10788 if (crtl->is_leaf && !cfun->calls_alloca)
10789 {
10790 if (size > PROBE_INTERVAL)
10791 ix86_emit_probe_stack_range (0, size);
10792 }
10793 else
10794 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10795 }
10796 else
10797 {
10798 if (crtl->is_leaf && !cfun->calls_alloca)
10799 {
10800 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
10801 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
10802 size - STACK_CHECK_PROTECT);
10803 }
10804 else
10805 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10806 }
10807 }
10808 }
10809
10810 if (allocate == 0)
10811 ;
10812 else if (!ix86_target_stack_probe ()
10813 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10814 {
10815 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10816 GEN_INT (-allocate), -1,
10817 m->fs.cfa_reg == stack_pointer_rtx);
10818 }
10819 else
10820 {
10821 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10822 rtx r10 = NULL;
10823 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10824 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10825 bool eax_live = false;
10826 bool r10_live = false;
10827
10828 if (TARGET_64BIT)
10829 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10830 if (!TARGET_64BIT_MS_ABI)
10831 eax_live = ix86_eax_live_at_start_p ();
10832
10833 /* Note that SEH directives need to continue tracking the stack
10834 pointer even after the frame pointer has been set up. */
10835 if (eax_live)
10836 {
10837 insn = emit_insn (gen_push (eax));
10838 allocate -= UNITS_PER_WORD;
10839 if (sp_is_cfa_reg || TARGET_SEH)
10840 {
10841 if (sp_is_cfa_reg)
10842 m->fs.cfa_offset += UNITS_PER_WORD;
10843 RTX_FRAME_RELATED_P (insn) = 1;
10844 }
10845 }
10846
10847 if (r10_live)
10848 {
10849 r10 = gen_rtx_REG (Pmode, R10_REG);
10850 insn = emit_insn (gen_push (r10));
10851 allocate -= UNITS_PER_WORD;
10852 if (sp_is_cfa_reg || TARGET_SEH)
10853 {
10854 if (sp_is_cfa_reg)
10855 m->fs.cfa_offset += UNITS_PER_WORD;
10856 RTX_FRAME_RELATED_P (insn) = 1;
10857 }
10858 }
10859
10860 emit_move_insn (eax, GEN_INT (allocate));
10861 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10862
10863 /* Use the fact that AX still contains ALLOCATE. */
10864 adjust_stack_insn = (Pmode == DImode
10865 ? gen_pro_epilogue_adjust_stack_di_sub
10866 : gen_pro_epilogue_adjust_stack_si_sub);
10867
10868 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10869 stack_pointer_rtx, eax));
10870
10871 if (sp_is_cfa_reg || TARGET_SEH)
10872 {
10873 if (sp_is_cfa_reg)
10874 m->fs.cfa_offset += allocate;
10875 RTX_FRAME_RELATED_P (insn) = 1;
10876 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10877 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10878 plus_constant (Pmode, stack_pointer_rtx,
10879 -allocate)));
10880 }
10881 m->fs.sp_offset += allocate;
10882
10883 if (r10_live && eax_live)
10884 {
10885 t = choose_baseaddr (m->fs.sp_offset - allocate);
10886 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10887 gen_frame_mem (word_mode, t));
10888 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10889 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10890 gen_frame_mem (word_mode, t));
10891 }
10892 else if (eax_live || r10_live)
10893 {
10894 t = choose_baseaddr (m->fs.sp_offset - allocate);
10895 emit_move_insn (gen_rtx_REG (word_mode,
10896 (eax_live ? AX_REG : R10_REG)),
10897 gen_frame_mem (word_mode, t));
10898 }
10899 }
10900 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10901
10902 /* If we haven't already set up the frame pointer, do so now. */
10903 if (frame_pointer_needed && !m->fs.fp_valid)
10904 {
10905 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10906 GEN_INT (frame.stack_pointer_offset
10907 - frame.hard_frame_pointer_offset));
10908 insn = emit_insn (insn);
10909 RTX_FRAME_RELATED_P (insn) = 1;
10910 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10911
10912 if (m->fs.cfa_reg == stack_pointer_rtx)
10913 m->fs.cfa_reg = hard_frame_pointer_rtx;
10914 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10915 m->fs.fp_valid = true;
10916 }
10917
10918 if (!int_registers_saved)
10919 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10920 if (!sse_registers_saved)
10921 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10922
10923 pic_reg_used = false;
10924 /* We don't use pic-register for pe-coff target. */
10925 if (pic_offset_table_rtx
10926 && !TARGET_PECOFF
10927 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10928 || crtl->profile))
10929 {
10930 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10931
10932 if (alt_pic_reg_used != INVALID_REGNUM)
10933 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10934
10935 pic_reg_used = true;
10936 }
10937
10938 if (pic_reg_used)
10939 {
10940 if (TARGET_64BIT)
10941 {
10942 if (ix86_cmodel == CM_LARGE_PIC)
10943 {
10944 rtx label, tmp_reg;
10945
10946 gcc_assert (Pmode == DImode);
10947 label = gen_label_rtx ();
10948 emit_label (label);
10949 LABEL_PRESERVE_P (label) = 1;
10950 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10951 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10952 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10953 label));
10954 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10955 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10956 pic_offset_table_rtx, tmp_reg));
10957 }
10958 else
10959 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10960 }
10961 else
10962 {
10963 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10964 RTX_FRAME_RELATED_P (insn) = 1;
10965 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10966 }
10967 }
10968
10969 /* In the pic_reg_used case, make sure that the got load isn't deleted
10970 when mcount needs it. Blockage to avoid call movement across mcount
10971 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10972 note. */
10973 if (crtl->profile && !flag_fentry && pic_reg_used)
10974 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10975
10976 if (crtl->drap_reg && !crtl->stack_realign_needed)
10977 {
10978 /* vDRAP is set up, but after reload it turns out stack realignment
10979 isn't necessary; here we emit the prologue to set up DRAP
10980 without the stack realignment adjustment. */
10981 t = choose_baseaddr (0);
10982 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10983 }
10984
10985 /* Prevent instructions from being scheduled into the register save push
10986 sequence when access to the red zone area is done through the frame pointer.
10987 The offset between the frame pointer and the stack pointer is calculated
10988 relative to the value of the stack pointer at the end of the function
10989 prologue, and moving instructions that access the red zone area via the frame
10990 pointer into the push sequence violates this assumption. */
10991 if (frame_pointer_needed && frame.red_zone_size)
10992 emit_insn (gen_memory_blockage ());
10993
10994 /* Emit cld instruction if stringops are used in the function. */
10995 if (TARGET_CLD && ix86_current_function_needs_cld)
10996 emit_insn (gen_cld ());
10997
10998 /* SEH requires that the prologue end within 256 bytes of the start of
10999 the function. Prevent instruction schedules that would extend that.
11000 Further, prevent alloca modifications to the stack pointer from being
11001 combined with prologue modifications. */
11002 if (TARGET_SEH)
11003 emit_insn (gen_prologue_use (stack_pointer_rtx));
11004 }
11005
11006 /* Emit code to restore REG using a POP insn. */
11007
11008 static void
11009 ix86_emit_restore_reg_using_pop (rtx reg)
11010 {
11011 struct machine_function *m = cfun->machine;
11012 rtx insn = emit_insn (gen_pop (reg));
11013
11014 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
11015 m->fs.sp_offset -= UNITS_PER_WORD;
11016
11017 if (m->fs.cfa_reg == crtl->drap_reg
11018 && REGNO (reg) == REGNO (crtl->drap_reg))
11019 {
11020 /* Previously we'd represented the CFA as an expression
11021 like *(%ebp - 8). We've just popped that value from
11022 the stack, which means we need to reset the CFA to
11023 the drap register. This will remain until we restore
11024 the stack pointer. */
11025 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
11026 RTX_FRAME_RELATED_P (insn) = 1;
11027
11028 /* This means that the DRAP register is valid for addressing too. */
11029 m->fs.drap_valid = true;
11030 return;
11031 }
11032
11033 if (m->fs.cfa_reg == stack_pointer_rtx)
11034 {
11035 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
11036 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
11037 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
11038 RTX_FRAME_RELATED_P (insn) = 1;
11039
11040 m->fs.cfa_offset -= UNITS_PER_WORD;
11041 }
11042
11043 /* When the frame pointer is the CFA, and we pop it, we are
11044 swapping back to the stack pointer as the CFA. This happens
11045 for stack frames that don't allocate other data, so we assume
11046 the stack pointer is now pointing at the return address, i.e.
11047 the function entry state, which makes the offset 1 word. */
11048 if (reg == hard_frame_pointer_rtx)
11049 {
11050 m->fs.fp_valid = false;
11051 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
11052 {
11053 m->fs.cfa_reg = stack_pointer_rtx;
11054 m->fs.cfa_offset -= UNITS_PER_WORD;
11055
11056 add_reg_note (insn, REG_CFA_DEF_CFA,
11057 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11058 GEN_INT (m->fs.cfa_offset)));
11059 RTX_FRAME_RELATED_P (insn) = 1;
11060 }
11061 }
11062 }
11063
11064 /* Emit code to restore saved registers using POP insns. */
11065
11066 static void
11067 ix86_emit_restore_regs_using_pop (void)
11068 {
11069 unsigned int regno;
11070
11071 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11072 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
11073 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
11074 }
11075
11076 /* Emit code and notes for the LEAVE instruction. */
11077
11078 static void
11079 ix86_emit_leave (void)
11080 {
11081 struct machine_function *m = cfun->machine;
11082 rtx insn = emit_insn (ix86_gen_leave ());
11083
11084 ix86_add_queued_cfa_restore_notes (insn);
11085
11086 gcc_assert (m->fs.fp_valid);
11087 m->fs.sp_valid = true;
11088 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
11089 m->fs.fp_valid = false;
11090
11091 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
11092 {
11093 m->fs.cfa_reg = stack_pointer_rtx;
11094 m->fs.cfa_offset = m->fs.sp_offset;
11095
11096 add_reg_note (insn, REG_CFA_DEF_CFA,
11097 plus_constant (Pmode, stack_pointer_rtx,
11098 m->fs.sp_offset));
11099 RTX_FRAME_RELATED_P (insn) = 1;
11100 }
11101 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
11102 m->fs.fp_offset);
11103 }
11104
11105 /* Emit code to restore saved registers using MOV insns.
11106 First register is restored from CFA - CFA_OFFSET. */
11107 static void
11108 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
11109 bool maybe_eh_return)
11110 {
11111 struct machine_function *m = cfun->machine;
11112 unsigned int regno;
11113
11114 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11115 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11116 {
11117 rtx reg = gen_rtx_REG (word_mode, regno);
11118 rtx insn, mem;
11119
11120 mem = choose_baseaddr (cfa_offset);
11121 mem = gen_frame_mem (word_mode, mem);
11122 insn = emit_move_insn (reg, mem);
11123
11124 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
11125 {
11126 /* Previously we'd represented the CFA as an expression
11127 like *(%ebp - 8). We've just reloaded that value from
11128 the stack, which means we need to reset the CFA to
11129 the drap register. This will remain until we restore
11130 the stack pointer. */
11131 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
11132 RTX_FRAME_RELATED_P (insn) = 1;
11133
11134 /* This means that the DRAP register is valid for addressing. */
11135 m->fs.drap_valid = true;
11136 }
11137 else
11138 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11139
11140 cfa_offset -= UNITS_PER_WORD;
11141 }
11142 }
11143
11144 /* Emit code to restore saved SSE registers using MOV insns.
11145 First register is restored from CFA - CFA_OFFSET. */
11146 static void
11147 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
11148 bool maybe_eh_return)
11149 {
11150 unsigned int regno;
11151
11152 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11153 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11154 {
11155 rtx reg = gen_rtx_REG (V4SFmode, regno);
11156 rtx mem;
11157
11158 mem = choose_baseaddr (cfa_offset);
11159 mem = gen_rtx_MEM (V4SFmode, mem);
11160 set_mem_align (mem, 128);
11161 emit_move_insn (reg, mem);
11162
11163 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11164
11165 cfa_offset -= 16;
11166 }
11167 }
11168
11169 /* Restore function stack, frame, and registers. */
11170
11171 void
11172 ix86_expand_epilogue (int style)
11173 {
11174 struct machine_function *m = cfun->machine;
11175 struct machine_frame_state frame_state_save = m->fs;
11176 struct ix86_frame frame;
11177 bool restore_regs_via_mov;
11178 bool using_drap;
11179
11180 ix86_finalize_stack_realign_flags ();
11181 ix86_compute_frame_layout (&frame);
11182
11183 m->fs.sp_valid = (!frame_pointer_needed
11184 || (crtl->sp_is_unchanging
11185 && !stack_realign_fp));
11186 gcc_assert (!m->fs.sp_valid
11187 || m->fs.sp_offset == frame.stack_pointer_offset);
11188
11189 /* The FP must be valid if the frame pointer is present. */
11190 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
11191 gcc_assert (!m->fs.fp_valid
11192 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
11193
11194 /* We must have *some* valid pointer to the stack frame. */
11195 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
11196
11197 /* The DRAP is never valid at this point. */
11198 gcc_assert (!m->fs.drap_valid);
11199
11200 /* See the comment about red zone and frame
11201 pointer usage in ix86_expand_prologue. */
11202 if (frame_pointer_needed && frame.red_zone_size)
11203 emit_insn (gen_memory_blockage ());
11204
11205 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
11206 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
11207
11208 /* Determine the CFA offset of the end of the red-zone. */
11209 m->fs.red_zone_offset = 0;
11210 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
11211 {
11212 /* The red-zone begins below the return address. */
11213 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
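      /* As an illustration, assuming the usual x86-64 values of 128 for
	 RED_ZONE_SIZE and 8 for UNITS_PER_WORD, the red zone extends to
	 136 bytes below the CFA.  */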
11214
11215 /* When the register save area is in the aligned portion of
11216 the stack, determine the maximum runtime displacement that
11217 matches up with the aligned frame. */
11218 if (stack_realign_drap)
11219 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
11220 + UNITS_PER_WORD);
11221 }
11222
11223 /* Special care must be taken for the normal return case of a function
11224 using eh_return: the eax and edx registers are marked as saved, but
11225 not restored along this path. Adjust the save location to match. */
11226 if (crtl->calls_eh_return && style != 2)
11227 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
11228
11229 /* EH_RETURN requires the use of moves to function properly. */
11230 if (crtl->calls_eh_return)
11231 restore_regs_via_mov = true;
11232 /* SEH requires the use of pops to identify the epilogue. */
11233 else if (TARGET_SEH)
11234 restore_regs_via_mov = false;
11235 /* If we're only restoring one register and sp is not valid, then
11236 use a move instruction to restore the register, since it's
11237 less work than reloading sp and popping the register. */
11238 else if (!m->fs.sp_valid && frame.nregs <= 1)
11239 restore_regs_via_mov = true;
11240 else if (TARGET_EPILOGUE_USING_MOVE
11241 && cfun->machine->use_fast_prologue_epilogue
11242 && (frame.nregs > 1
11243 || m->fs.sp_offset != frame.reg_save_offset))
11244 restore_regs_via_mov = true;
11245 else if (frame_pointer_needed
11246 && !frame.nregs
11247 && m->fs.sp_offset != frame.reg_save_offset)
11248 restore_regs_via_mov = true;
11249 else if (frame_pointer_needed
11250 && TARGET_USE_LEAVE
11251 && cfun->machine->use_fast_prologue_epilogue
11252 && frame.nregs == 1)
11253 restore_regs_via_mov = true;
11254 else
11255 restore_regs_via_mov = false;
11256
11257 if (restore_regs_via_mov || frame.nsseregs)
11258 {
11259 /* Ensure that the entire register save area is addressable via
11260 the stack pointer, if we will restore via sp. */
11261 if (TARGET_64BIT
11262 && m->fs.sp_offset > 0x7fffffff
11263 && !(m->fs.fp_valid || m->fs.drap_valid)
11264 && (frame.nsseregs + frame.nregs) != 0)
11265 {
11266 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11267 GEN_INT (m->fs.sp_offset
11268 - frame.sse_reg_save_offset),
11269 style,
11270 m->fs.cfa_reg == stack_pointer_rtx);
11271 }
11272 }
11273
11274 /* If there are any SSE registers to restore, then we have to do it
11275 via moves, since there's obviously no pop for SSE regs. */
11276 if (frame.nsseregs)
11277 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11278 style == 2);
11279
11280 if (restore_regs_via_mov)
11281 {
11282 rtx t;
11283
11284 if (frame.nregs)
11285 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11286
11287 /* eh_return epilogues need %ecx added to the stack pointer. */
11288 if (style == 2)
11289 {
11290 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11291
11292 /* Stack align doesn't work with eh_return. */
11293 gcc_assert (!stack_realign_drap);
11294 /* Neither do regparm nested functions. */
11295 gcc_assert (!ix86_static_chain_on_stack);
11296
11297 if (frame_pointer_needed)
11298 {
11299 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11300 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11301 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11302
11303 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11304 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11305
11306 /* Note that we use SA as a temporary CFA, as the return
11307 address is at the proper place relative to it. We
11308 pretend this happens at the FP restore insn because
11309 prior to this insn the FP would be stored at the wrong
11310 offset relative to SA, and after this insn we have no
11311 other reasonable register to use for the CFA. We don't
11312 bother resetting the CFA to the SP for the duration of
11313 the return insn. */
11314 add_reg_note (insn, REG_CFA_DEF_CFA,
11315 plus_constant (Pmode, sa, UNITS_PER_WORD));
11316 ix86_add_queued_cfa_restore_notes (insn);
11317 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11318 RTX_FRAME_RELATED_P (insn) = 1;
11319
11320 m->fs.cfa_reg = sa;
11321 m->fs.cfa_offset = UNITS_PER_WORD;
11322 m->fs.fp_valid = false;
11323
11324 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11325 const0_rtx, style, false);
11326 }
11327 else
11328 {
11329 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11330 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11331 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11332 ix86_add_queued_cfa_restore_notes (insn);
11333
11334 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11335 if (m->fs.cfa_offset != UNITS_PER_WORD)
11336 {
11337 m->fs.cfa_offset = UNITS_PER_WORD;
11338 add_reg_note (insn, REG_CFA_DEF_CFA,
11339 plus_constant (Pmode, stack_pointer_rtx,
11340 UNITS_PER_WORD));
11341 RTX_FRAME_RELATED_P (insn) = 1;
11342 }
11343 }
11344 m->fs.sp_offset = UNITS_PER_WORD;
11345 m->fs.sp_valid = true;
11346 }
11347 }
11348 else
11349 {
11350 /* SEH requires that the function end with (1) a stack adjustment
11351 if necessary, (2) a sequence of pops, and (3) a return or
11352 jump instruction. Prevent insns from the function body from
11353 being scheduled into this sequence. */
11354 if (TARGET_SEH)
11355 {
11356 /* Prevent a catch region from being adjacent to the standard
11357 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11358 several other flags that would be interesting to test are
11359 set up yet. */
11360 if (flag_non_call_exceptions)
11361 emit_insn (gen_nops (const1_rtx));
11362 else
11363 emit_insn (gen_blockage ());
11364 }
11365
11366 /* First step is to deallocate the stack frame so that we can
11367 pop the registers. Also do it on SEH target for very large
11368 frame as the emitted instructions aren't allowed by the ABI in
11369 epilogues. */
11370 if (!m->fs.sp_valid
11371 || (TARGET_SEH
11372 && (m->fs.sp_offset - frame.reg_save_offset
11373 >= SEH_MAX_FRAME_SIZE)))
11374 {
11375 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11376 GEN_INT (m->fs.fp_offset
11377 - frame.reg_save_offset),
11378 style, false);
11379 }
11380 else if (m->fs.sp_offset != frame.reg_save_offset)
11381 {
11382 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11383 GEN_INT (m->fs.sp_offset
11384 - frame.reg_save_offset),
11385 style,
11386 m->fs.cfa_reg == stack_pointer_rtx);
11387 }
11388
11389 ix86_emit_restore_regs_using_pop ();
11390 }
11391
11392 /* If we used a frame pointer and haven't already got rid of it,
11393 then do so now. */
11394 if (m->fs.fp_valid)
11395 {
11396 /* If the stack pointer is valid and pointing at the frame
11397 pointer store address, then we only need a pop. */
11398 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11399 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11400 /* Leave results in shorter dependency chains on CPUs that are
11401 able to grok it fast. */
11402 else if (TARGET_USE_LEAVE
11403 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
11404 || !cfun->machine->use_fast_prologue_epilogue)
11405 ix86_emit_leave ();
11406 else
11407 {
11408 pro_epilogue_adjust_stack (stack_pointer_rtx,
11409 hard_frame_pointer_rtx,
11410 const0_rtx, style, !using_drap);
11411 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11412 }
11413 }
11414
11415 if (using_drap)
11416 {
11417 int param_ptr_offset = UNITS_PER_WORD;
11418 rtx insn;
11419
11420 gcc_assert (stack_realign_drap);
11421
11422 if (ix86_static_chain_on_stack)
11423 param_ptr_offset += UNITS_PER_WORD;
11424 if (!call_used_regs[REGNO (crtl->drap_reg)])
11425 param_ptr_offset += UNITS_PER_WORD;
11426
11427 insn = emit_insn (gen_rtx_SET
11428 (VOIDmode, stack_pointer_rtx,
11429 gen_rtx_PLUS (Pmode,
11430 crtl->drap_reg,
11431 GEN_INT (-param_ptr_offset))));
11432 m->fs.cfa_reg = stack_pointer_rtx;
11433 m->fs.cfa_offset = param_ptr_offset;
11434 m->fs.sp_offset = param_ptr_offset;
11435 m->fs.realigned = false;
11436
11437 add_reg_note (insn, REG_CFA_DEF_CFA,
11438 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11439 GEN_INT (param_ptr_offset)));
11440 RTX_FRAME_RELATED_P (insn) = 1;
11441
11442 if (!call_used_regs[REGNO (crtl->drap_reg)])
11443 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11444 }
11445
11446 /* At this point the stack pointer must be valid, and we must have
11447 restored all of the registers. We may not have deallocated the
11448 entire stack frame. We've delayed this until now because it may
11449 be possible to merge the local stack deallocation with the
11450 deallocation forced by ix86_static_chain_on_stack. */
11451 gcc_assert (m->fs.sp_valid);
11452 gcc_assert (!m->fs.fp_valid);
11453 gcc_assert (!m->fs.realigned);
11454 if (m->fs.sp_offset != UNITS_PER_WORD)
11455 {
11456 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11457 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11458 style, true);
11459 }
11460 else
11461 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11462
11463 /* Sibcall epilogues don't want a return instruction. */
11464 if (style == 0)
11465 {
11466 m->fs = frame_state_save;
11467 return;
11468 }
11469
11470 if (crtl->args.pops_args && crtl->args.size)
11471 {
11472 rtx popc = GEN_INT (crtl->args.pops_args);
11473
11474 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11475 address, do an explicit add, and jump indirectly to the caller. */
11476
11477 if (crtl->args.pops_args >= 65536)
11478 {
11479 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11480 rtx insn;
11481
11482 /* There is no "pascal" calling convention in any 64bit ABI. */
11483 gcc_assert (!TARGET_64BIT);
11484
11485 insn = emit_insn (gen_pop (ecx));
11486 m->fs.cfa_offset -= UNITS_PER_WORD;
11487 m->fs.sp_offset -= UNITS_PER_WORD;
11488
11489 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11490 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11491 add_reg_note (insn, REG_CFA_REGISTER,
11492 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11493 RTX_FRAME_RELATED_P (insn) = 1;
11494
11495 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11496 popc, -1, true);
11497 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11498 }
11499 else
11500 emit_jump_insn (gen_simple_return_pop_internal (popc));
11501 }
11502 else
11503 emit_jump_insn (gen_simple_return_internal ());
11504
11505 /* Restore the state back to the state from the prologue,
11506 so that it's correct for the next epilogue. */
11507 m->fs = frame_state_save;
11508 }
11509
11510 /* Reset from the function's potential modifications. */
11511
11512 static void
11513 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11514 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11515 {
11516 if (pic_offset_table_rtx)
11517 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11518 #if TARGET_MACHO
11519 /* Mach-O doesn't support labels at the end of objects, so if
11520 it looks like we might want one, insert a NOP. */
11521 {
11522 rtx insn = get_last_insn ();
11523 rtx deleted_debug_label = NULL_RTX;
11524 while (insn
11525 && NOTE_P (insn)
11526 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11527 {
11528 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11529 notes only, instead set their CODE_LABEL_NUMBER to -1,
11530 otherwise there would be code generation differences
11531 in between -g and -g0. */
11532 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11533 deleted_debug_label = insn;
11534 insn = PREV_INSN (insn);
11535 }
11536 if (insn
11537 && (LABEL_P (insn)
11538 || (NOTE_P (insn)
11539 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11540 fputs ("\tnop\n", file);
11541 else if (deleted_debug_label)
11542 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11543 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11544 CODE_LABEL_NUMBER (insn) = -1;
11545 }
11546 #endif
11547
11548 }
11549
11550 /* Return a scratch register to use in the split stack prologue. The
11551 split stack prologue is used for -fsplit-stack. It consists of the first
11552 instructions in the function, even before the regular prologue.
11553 The scratch register can be any caller-saved register which is not
11554 used for parameters or for the static chain. */
11555
11556 static unsigned int
11557 split_stack_prologue_scratch_regno (void)
11558 {
11559 if (TARGET_64BIT)
11560 return R11_REG;
11561 else
11562 {
11563 bool is_fastcall, is_thiscall;
11564 int regparm;
11565
11566 is_fastcall = (lookup_attribute ("fastcall",
11567 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11568 != NULL);
11569 is_thiscall = (lookup_attribute ("thiscall",
11570 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11571 != NULL);
11572 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11573
11574 if (is_fastcall)
11575 {
11576 if (DECL_STATIC_CHAIN (cfun->decl))
11577 {
11578 sorry ("-fsplit-stack does not support fastcall with "
11579 "nested function");
11580 return INVALID_REGNUM;
11581 }
11582 return AX_REG;
11583 }
11584 else if (is_thiscall)
11585 {
11586 if (!DECL_STATIC_CHAIN (cfun->decl))
11587 return DX_REG;
11588 return AX_REG;
11589 }
11590 else if (regparm < 3)
11591 {
11592 if (!DECL_STATIC_CHAIN (cfun->decl))
11593 return CX_REG;
11594 else
11595 {
11596 if (regparm >= 2)
11597 {
11598 sorry ("-fsplit-stack does not support 2 register "
11599 " parameters for a nested function");
11600 return INVALID_REGNUM;
11601 }
11602 return DX_REG;
11603 }
11604 }
11605 else
11606 {
11607 /* FIXME: We could make this work by pushing a register
11608 around the addition and comparison. */
11609 sorry ("-fsplit-stack does not support 3 register parameters");
11610 return INVALID_REGNUM;
11611 }
11612 }
11613 }
11614
11615 /* A SYMBOL_REF for the function which allocates new stack space for
11616 -fsplit-stack. */
11617
11618 static GTY(()) rtx split_stack_fn;
11619
11620 /* A SYMBOL_REF for the more stack function when using the large
11621 model. */
11622
11623 static GTY(()) rtx split_stack_fn_large;
11624
11625 /* Handle -fsplit-stack. These are the first instructions in the
11626 function, even before the regular prologue. */
11627
11628 void
11629 ix86_expand_split_stack_prologue (void)
11630 {
11631 struct ix86_frame frame;
11632 HOST_WIDE_INT allocate;
11633 unsigned HOST_WIDE_INT args_size;
11634 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11635 rtx scratch_reg = NULL_RTX;
11636 rtx varargs_label = NULL_RTX;
11637 rtx fn;
11638
11639 gcc_assert (flag_split_stack && reload_completed);
11640
11641 ix86_finalize_stack_realign_flags ();
11642 ix86_compute_frame_layout (&frame);
11643 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11644
11645 /* This is the label we will branch to if we have enough stack
11646 space. We expect the basic block reordering pass to reverse this
11647 branch if optimizing, so that we branch in the unlikely case. */
11648 label = gen_label_rtx ();
11649
11650 /* We need to compare the stack pointer minus the frame size with
11651 the stack boundary in the TCB. The stack boundary always gives
11652 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11653 can compare directly. Otherwise we need to do an addition. */
11654
11655 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11656 UNSPEC_STACK_CHECK);
11657 limit = gen_rtx_CONST (Pmode, limit);
11658 limit = gen_rtx_MEM (Pmode, limit);
11659 if (allocate < SPLIT_STACK_AVAILABLE)
11660 current = stack_pointer_rtx;
11661 else
11662 {
11663 unsigned int scratch_regno;
11664 rtx offset;
11665
11666 /* We need a scratch register to hold the stack pointer minus
11667 the required frame size. Since this is the very start of the
11668 function, the scratch register can be any caller-saved
11669 register which is not used for parameters. */
11670 offset = GEN_INT (- allocate);
11671 scratch_regno = split_stack_prologue_scratch_regno ();
11672 if (scratch_regno == INVALID_REGNUM)
11673 return;
11674 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11675 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11676 {
11677 /* We don't use ix86_gen_add3 in this case because it will
11678 want to split to lea, but when not optimizing the insn
11679 will not be split after this point. */
11680 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11681 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11682 offset)));
11683 }
11684 else
11685 {
11686 emit_move_insn (scratch_reg, offset);
11687 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11688 stack_pointer_rtx));
11689 }
11690 current = scratch_reg;
11691 }
11692
11693 ix86_expand_branch (GEU, current, limit, label);
11694 jump_insn = get_last_insn ();
11695 JUMP_LABEL (jump_insn) = label;
11696
11697 /* Mark the jump as very likely to be taken. */
11698 add_int_reg_note (jump_insn, REG_BR_PROB,
11699 REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
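  /* REG_BR_PROB_BASE is 10000, so this records a probability of 9900/10000,
     i.e. the "enough stack" branch is expected to be taken ~99% of the time.  */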
11700
11701 if (split_stack_fn == NULL_RTX)
11702 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11703 fn = split_stack_fn;
11704
11705 /* Get more stack space. We pass in the desired stack space and the
11706 size of the arguments to copy to the new stack. In 32-bit mode
11707 we push the parameters; __morestack will return on a new stack
11708 anyhow. In 64-bit mode we pass the parameters in r10 and
11709 r11. */
11710 allocate_rtx = GEN_INT (allocate);
11711 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11712 call_fusage = NULL_RTX;
11713 if (TARGET_64BIT)
11714 {
11715 rtx reg10, reg11;
11716
11717 reg10 = gen_rtx_REG (Pmode, R10_REG);
11718 reg11 = gen_rtx_REG (Pmode, R11_REG);
11719
11720 /* If this function uses a static chain, it will be in %r10.
11721 Preserve it across the call to __morestack. */
11722 if (DECL_STATIC_CHAIN (cfun->decl))
11723 {
11724 rtx rax;
11725
11726 rax = gen_rtx_REG (word_mode, AX_REG);
11727 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11728 use_reg (&call_fusage, rax);
11729 }
11730
11731 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11732 && !TARGET_PECOFF)
11733 {
11734 HOST_WIDE_INT argval;
11735
11736 gcc_assert (Pmode == DImode);
11737 /* When using the large model we need to load the address
11738 into a register, and we've run out of registers. So we
11739 switch to a different calling convention, and we call a
11740 different function: __morestack_large. We pass the
11741 argument size in the upper 32 bits of r10 and pass the
11742 frame size in the lower 32 bits. */
11743 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11744 gcc_assert ((args_size & 0xffffffff) == args_size);
11745
11746 if (split_stack_fn_large == NULL_RTX)
11747 split_stack_fn_large =
11748 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11749
11750 if (ix86_cmodel == CM_LARGE_PIC)
11751 {
11752 rtx label, x;
11753
11754 label = gen_label_rtx ();
11755 emit_label (label);
11756 LABEL_PRESERVE_P (label) = 1;
11757 emit_insn (gen_set_rip_rex64 (reg10, label));
11758 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11759 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11760 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11761 UNSPEC_GOT);
11762 x = gen_rtx_CONST (Pmode, x);
11763 emit_move_insn (reg11, x);
11764 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11765 x = gen_const_mem (Pmode, x);
11766 emit_move_insn (reg11, x);
11767 }
11768 else
11769 emit_move_insn (reg11, split_stack_fn_large);
11770
11771 fn = reg11;
11772
11773 argval = ((args_size << 16) << 16) + allocate;
11774 emit_move_insn (reg10, GEN_INT (argval));
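	  /* For example, args_size = 0x18 and allocate = 0x2000 pack into
	     argval = 0x0000001800002000: the argument size lands in the upper
	     32 bits of r10 and the frame size in the lower 32 bits.  */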
11775 }
11776 else
11777 {
11778 emit_move_insn (reg10, allocate_rtx);
11779 emit_move_insn (reg11, GEN_INT (args_size));
11780 use_reg (&call_fusage, reg11);
11781 }
11782
11783 use_reg (&call_fusage, reg10);
11784 }
11785 else
11786 {
11787 emit_insn (gen_push (GEN_INT (args_size)));
11788 emit_insn (gen_push (allocate_rtx));
11789 }
11790 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11791 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11792 NULL_RTX, false);
11793 add_function_usage_to (call_insn, call_fusage);
11794
11795 /* In order to make call/return prediction work right, we now need
11796 to execute a return instruction. See
11797 libgcc/config/i386/morestack.S for the details on how this works.
11798
11799 For flow purposes gcc must not see this as a return
11800 instruction--we need control flow to continue at the subsequent
11801 label. Therefore, we use an unspec. */
11802 gcc_assert (crtl->args.pops_args < 65536);
11803 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11804
11805 /* If we are in 64-bit mode and this function uses a static chain,
11806 we saved %r10 in %rax before calling __morestack. */
11807 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11808 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11809 gen_rtx_REG (word_mode, AX_REG));
11810
11811 /* If this function calls va_start, we need to store a pointer to
11812 the arguments on the old stack, because they may not have been
11813 all copied to the new stack. At this point the old stack can be
11814 found at the frame pointer value used by __morestack, because
11815 __morestack has set that up before calling back to us. Here we
11816 store that pointer in a scratch register, and in
11817 ix86_expand_prologue we store the scratch register in a stack
11818 slot. */
11819 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11820 {
11821 unsigned int scratch_regno;
11822 rtx frame_reg;
11823 int words;
11824
11825 scratch_regno = split_stack_prologue_scratch_regno ();
11826 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11827 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11828
11829 /* 64-bit:
11830 fp -> old fp value
11831 return address within this function
11832 return address of caller of this function
11833 stack arguments
11834 So we add three words to get to the stack arguments.
11835
11836 32-bit:
11837 fp -> old fp value
11838 return address within this function
11839 first argument to __morestack
11840 second argument to __morestack
11841 return address of caller of this function
11842 stack arguments
11843 So we add five words to get to the stack arguments.
11844 */
11845 words = TARGET_64BIT ? 3 : 5;
11846 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11847 gen_rtx_PLUS (Pmode, frame_reg,
11848 GEN_INT (words * UNITS_PER_WORD))));
11849
11850 varargs_label = gen_label_rtx ();
11851 emit_jump_insn (gen_jump (varargs_label));
11852 JUMP_LABEL (get_last_insn ()) = varargs_label;
11853
11854 emit_barrier ();
11855 }
11856
11857 emit_label (label);
11858 LABEL_NUSES (label) = 1;
11859
11860 /* If this function calls va_start, we now have to set the scratch
11861 register for the case where we do not call __morestack. In this
11862 case we need to set it based on the stack pointer. */
11863 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11864 {
11865 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11866 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11867 GEN_INT (UNITS_PER_WORD))));
11868
11869 emit_label (varargs_label);
11870 LABEL_NUSES (varargs_label) = 1;
11871 }
11872 }
11873
11874 /* We may have to tell the dataflow pass that the split stack prologue
11875 is initializing a scratch register. */
11876
11877 static void
11878 ix86_live_on_entry (bitmap regs)
11879 {
11880 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11881 {
11882 gcc_assert (flag_split_stack);
11883 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11884 }
11885 }
11886 \f
11887 /* Extract the parts of an RTL expression that is a valid memory address
11888 for an instruction. Return 0 if the structure of the address is
11889 grossly off. Return -1 if the address contains ASHIFT, so it is not
11890 strictly valid, but is still used for computing the length of the lea instruction. */
11891
11892 int
11893 ix86_decompose_address (rtx addr, struct ix86_address *out)
11894 {
11895 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11896 rtx base_reg, index_reg;
11897 HOST_WIDE_INT scale = 1;
11898 rtx scale_rtx = NULL_RTX;
11899 rtx tmp;
11900 int retval = 1;
11901 enum ix86_address_seg seg = SEG_DEFAULT;
11902
11903 /* Allow zero-extended SImode addresses,
11904 they will be emitted with addr32 prefix. */
11905 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11906 {
11907 if (GET_CODE (addr) == ZERO_EXTEND
11908 && GET_MODE (XEXP (addr, 0)) == SImode)
11909 {
11910 addr = XEXP (addr, 0);
11911 if (CONST_INT_P (addr))
11912 return 0;
11913 }
11914 else if (GET_CODE (addr) == AND
11915 && const_32bit_mask (XEXP (addr, 1), DImode))
11916 {
11917 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11918 if (addr == NULL_RTX)
11919 return 0;
11920
11921 if (CONST_INT_P (addr))
11922 return 0;
11923 }
11924 }
11925
11926 /* Allow SImode subregs of DImode addresses,
11927 they will be emitted with addr32 prefix. */
11928 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11929 {
11930 if (GET_CODE (addr) == SUBREG
11931 && GET_MODE (SUBREG_REG (addr)) == DImode)
11932 {
11933 addr = SUBREG_REG (addr);
11934 if (CONST_INT_P (addr))
11935 return 0;
11936 }
11937 }
11938
11939 if (REG_P (addr))
11940 base = addr;
11941 else if (GET_CODE (addr) == SUBREG)
11942 {
11943 if (REG_P (SUBREG_REG (addr)))
11944 base = addr;
11945 else
11946 return 0;
11947 }
11948 else if (GET_CODE (addr) == PLUS)
11949 {
11950 rtx addends[4], op;
11951 int n = 0, i;
11952
11953 op = addr;
11954 do
11955 {
11956 if (n >= 4)
11957 return 0;
11958 addends[n++] = XEXP (op, 1);
11959 op = XEXP (op, 0);
11960 }
11961 while (GET_CODE (op) == PLUS);
11962 if (n >= 4)
11963 return 0;
11964 addends[n] = op;
11965
11966 for (i = n; i >= 0; --i)
11967 {
11968 op = addends[i];
11969 switch (GET_CODE (op))
11970 {
11971 case MULT:
11972 if (index)
11973 return 0;
11974 index = XEXP (op, 0);
11975 scale_rtx = XEXP (op, 1);
11976 break;
11977
11978 case ASHIFT:
11979 if (index)
11980 return 0;
11981 index = XEXP (op, 0);
11982 tmp = XEXP (op, 1);
11983 if (!CONST_INT_P (tmp))
11984 return 0;
11985 scale = INTVAL (tmp);
11986 if ((unsigned HOST_WIDE_INT) scale > 3)
11987 return 0;
11988 scale = 1 << scale;
11989 break;
11990
11991 case ZERO_EXTEND:
11992 op = XEXP (op, 0);
11993 if (GET_CODE (op) != UNSPEC)
11994 return 0;
11995 /* FALLTHRU */
11996
11997 case UNSPEC:
11998 if (XINT (op, 1) == UNSPEC_TP
11999 && TARGET_TLS_DIRECT_SEG_REFS
12000 && seg == SEG_DEFAULT)
12001 seg = DEFAULT_TLS_SEG_REG;
12002 else
12003 return 0;
12004 break;
12005
12006 case SUBREG:
12007 if (!REG_P (SUBREG_REG (op)))
12008 return 0;
12009 /* FALLTHRU */
12010
12011 case REG:
12012 if (!base)
12013 base = op;
12014 else if (!index)
12015 index = op;
12016 else
12017 return 0;
12018 break;
12019
12020 case CONST:
12021 case CONST_INT:
12022 case SYMBOL_REF:
12023 case LABEL_REF:
12024 if (disp)
12025 return 0;
12026 disp = op;
12027 break;
12028
12029 default:
12030 return 0;
12031 }
12032 }
12033 }
12034 else if (GET_CODE (addr) == MULT)
12035 {
12036 index = XEXP (addr, 0); /* index*scale */
12037 scale_rtx = XEXP (addr, 1);
12038 }
12039 else if (GET_CODE (addr) == ASHIFT)
12040 {
12041 /* We're called for lea too, which implements ashift on occasion. */
12042 index = XEXP (addr, 0);
12043 tmp = XEXP (addr, 1);
12044 if (!CONST_INT_P (tmp))
12045 return 0;
12046 scale = INTVAL (tmp);
12047 if ((unsigned HOST_WIDE_INT) scale > 3)
12048 return 0;
12049 scale = 1 << scale;
12050 retval = -1;
12051 }
12052 else
12053 disp = addr; /* displacement */
12054
12055 if (index)
12056 {
12057 if (REG_P (index))
12058 ;
12059 else if (GET_CODE (index) == SUBREG
12060 && REG_P (SUBREG_REG (index)))
12061 ;
12062 else
12063 return 0;
12064 }
12065
12066 /* Extract the integral value of scale. */
12067 if (scale_rtx)
12068 {
12069 if (!CONST_INT_P (scale_rtx))
12070 return 0;
12071 scale = INTVAL (scale_rtx);
12072 }
12073
12074 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
12075 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
12076
12077 /* Avoid useless 0 displacement. */
12078 if (disp == const0_rtx && (base || index))
12079 disp = NULL_RTX;
12080
12081 /* Allow arg pointer and stack pointer as index if there is no scaling. */
12082 if (base_reg && index_reg && scale == 1
12083 && (index_reg == arg_pointer_rtx
12084 || index_reg == frame_pointer_rtx
12085 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
12086 {
12087 rtx tmp;
12088 tmp = base, base = index, index = tmp;
12089 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
12090 }
12091
12092 /* Special case: %ebp cannot be encoded as a base without a displacement.
12093 Similarly %r13. */
12094 if (!disp
12095 && base_reg
12096 && (base_reg == hard_frame_pointer_rtx
12097 || base_reg == frame_pointer_rtx
12098 || base_reg == arg_pointer_rtx
12099 || (REG_P (base_reg)
12100 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
12101 || REGNO (base_reg) == R13_REG))))
12102 disp = const0_rtx;
12103
12104 /* Special case: on K6, [%esi] forces the instruction to be vector decoded.
12105 Avoid this by transforming to [%esi+0].
12106 Reload calls address legitimization without cfun defined, so we need
12107 to test cfun for being non-NULL. */
12108 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
12109 && base_reg && !index_reg && !disp
12110 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
12111 disp = const0_rtx;
12112
12113 /* Special case: encode reg+reg instead of reg*2. */
12114 if (!base && index && scale == 2)
12115 base = index, base_reg = index_reg, scale = 1;
12116
12117 /* Special case: scaling cannot be encoded without base or displacement. */
12118 if (!base && !disp && index && scale != 1)
12119 disp = const0_rtx;
12120
12121 out->base = base;
12122 out->index = index;
12123 out->disp = disp;
12124 out->scale = scale;
12125 out->seg = seg;
12126
12127 return retval;
12128 }
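/* Illustrative example (a sketch, not part of the original sources):
   given the address

     (plus:SI (plus:SI (mult:SI (reg:SI A) (const_int 4))
		       (reg:SI B))
	      (const_int 12))

   the loop above records the MULT as index and scale, the inner REG as
   base and the CONST_INT as displacement, so ix86_decompose_address
   fills OUT with base = B, index = A, scale = 4, disp = 12 and returns
   1 -- i.e. the operand of 12(%ebx,%eax,4) if B is %ebx and A is %eax.
   A top-level ASHIFT such as (ashift (reg) (const_int 2)) is also
   accepted, but the function then returns -1 because such an address
   is only used to compute the length of an lea insn.  */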
12129 \f
12130 /* Return the cost of the memory address X.
12131 For i386, it is better to use a complex address than let gcc copy
12132 the address into a reg and make a new pseudo. But not if the address
12133 requires two regs - that would mean more pseudos with longer
12134 lifetimes. */
12135 static int
12136 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
12137 addr_space_t as ATTRIBUTE_UNUSED,
12138 bool speed ATTRIBUTE_UNUSED)
12139 {
12140 struct ix86_address parts;
12141 int cost = 1;
12142 int ok = ix86_decompose_address (x, &parts);
12143
12144 gcc_assert (ok);
12145
12146 if (parts.base && GET_CODE (parts.base) == SUBREG)
12147 parts.base = SUBREG_REG (parts.base);
12148 if (parts.index && GET_CODE (parts.index) == SUBREG)
12149 parts.index = SUBREG_REG (parts.index);
12150
12151 /* Attempt to minimize number of registers in the address. */
12152 if ((parts.base
12153 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
12154 || (parts.index
12155 && (!REG_P (parts.index)
12156 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
12157 cost++;
12158
12159 if (parts.base
12160 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
12161 && parts.index
12162 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
12163 && parts.base != parts.index)
12164 cost++;
12165
12166 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
12167 since its predecode logic can't detect the length of instructions
12168 and decoding degenerates to vector decoding. Increase the cost of such
12169 addresses here. The penalty is at least 2 cycles. It may be worthwhile
12170 to split such addresses or even refuse such addresses at all.
12171
12172 The following addressing modes are affected:
12173 [base+scale*index]
12174 [scale*index+disp]
12175 [base+index]
12176
12177 The first and last case may be avoided by explicitly coding a zero
12178 displacement in the memory address, but I don't have an AMD-K6 machine
12179 handy to check this theory. */
12180
12181 if (TARGET_K6
12182 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
12183 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
12184 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
12185 cost += 10;
12186
12187 return cost;
12188 }
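/* Illustrative example (hypothetical pseudos, not from the original
   sources): for (%ebx) ix86_address_cost returns 1; for an address
   whose base is still a pseudo register it returns 2; and for an
   address with two distinct pseudo registers as base and index it
   returns 3, discouraging addresses that keep two pseudos live.  On
   K6, a base+index address without a displacement gets an extra
   penalty of 10.  */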
12189 \f
12190 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
12191 this is used to form addresses to local data when -fPIC is in
12192 use. */
12193
12194 static bool
12195 darwin_local_data_pic (rtx disp)
12196 {
12197 return (GET_CODE (disp) == UNSPEC
12198 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
12199 }
12200
12201 /* Determine if a given RTX is a valid constant. We already know this
12202 satisfies CONSTANT_P. */
12203
12204 static bool
12205 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
12206 {
12207 switch (GET_CODE (x))
12208 {
12209 case CONST:
12210 x = XEXP (x, 0);
12211
12212 if (GET_CODE (x) == PLUS)
12213 {
12214 if (!CONST_INT_P (XEXP (x, 1)))
12215 return false;
12216 x = XEXP (x, 0);
12217 }
12218
12219 if (TARGET_MACHO && darwin_local_data_pic (x))
12220 return true;
12221
12222 /* Only some unspecs are valid as "constants". */
12223 if (GET_CODE (x) == UNSPEC)
12224 switch (XINT (x, 1))
12225 {
12226 case UNSPEC_GOT:
12227 case UNSPEC_GOTOFF:
12228 case UNSPEC_PLTOFF:
12229 return TARGET_64BIT;
12230 case UNSPEC_TPOFF:
12231 case UNSPEC_NTPOFF:
12232 x = XVECEXP (x, 0, 0);
12233 return (GET_CODE (x) == SYMBOL_REF
12234 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12235 case UNSPEC_DTPOFF:
12236 x = XVECEXP (x, 0, 0);
12237 return (GET_CODE (x) == SYMBOL_REF
12238 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12239 default:
12240 return false;
12241 }
12242
12243 /* We must have drilled down to a symbol. */
12244 if (GET_CODE (x) == LABEL_REF)
12245 return true;
12246 if (GET_CODE (x) != SYMBOL_REF)
12247 return false;
12248 /* FALLTHRU */
12249
12250 case SYMBOL_REF:
12251 /* TLS symbols are never valid. */
12252 if (SYMBOL_REF_TLS_MODEL (x))
12253 return false;
12254
12255 /* DLLIMPORT symbols are never valid. */
12256 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12257 && SYMBOL_REF_DLLIMPORT_P (x))
12258 return false;
12259
12260 #if TARGET_MACHO
12261 /* mdynamic-no-pic */
12262 if (MACHO_DYNAMIC_NO_PIC_P)
12263 return machopic_symbol_defined_p (x);
12264 #endif
12265 break;
12266
12267 case CONST_DOUBLE:
12268 if (GET_MODE (x) == TImode
12269 && x != CONST0_RTX (TImode)
12270 && !TARGET_64BIT)
12271 return false;
12272 break;
12273
12274 case CONST_VECTOR:
12275 if (!standard_sse_constant_p (x))
12276 return false;
12277
12278 default:
12279 break;
12280 }
12281
12282 /* Otherwise we handle everything else in the move patterns. */
12283 return true;
12284 }
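/* Illustrative example (a sketch, not from the original sources): a
   plain (symbol_ref "foo") is a legitimate constant here unless it has
   a TLS model or is a dllimport symbol, while a non-zero TImode
   CONST_DOUBLE is rejected in 32-bit mode and a CONST wrapping
   (unspec [(symbol_ref "x")] UNSPEC_NTPOFF) is accepted only when "x"
   uses the local-exec TLS model.  */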
12285
12286 /* Determine if it's legal to put X into the constant pool. This
12287 is not possible for the address of thread-local symbols, which
12288 is checked above. */
12289
12290 static bool
12291 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12292 {
12293 /* We can always put integral constants and vectors in memory. */
12294 switch (GET_CODE (x))
12295 {
12296 case CONST_INT:
12297 case CONST_DOUBLE:
12298 case CONST_VECTOR:
12299 return false;
12300
12301 default:
12302 break;
12303 }
12304 return !ix86_legitimate_constant_p (mode, x);
12305 }
12306
12307 /* Return true if the symbol is marked as dllimport or as a stub
12308 variable, otherwise false. */
12309
12310 static bool
12311 is_imported_p (rtx x)
12312 {
12313 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
12314 || GET_CODE (x) != SYMBOL_REF)
12315 return false;
12316
12317 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
12318 }
12319
12320
12321 /* Nonzero if the constant value X is a legitimate general operand
12322 when generating PIC code. It is given that flag_pic is on and
12323 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12324
12325 bool
12326 legitimate_pic_operand_p (rtx x)
12327 {
12328 rtx inner;
12329
12330 switch (GET_CODE (x))
12331 {
12332 case CONST:
12333 inner = XEXP (x, 0);
12334 if (GET_CODE (inner) == PLUS
12335 && CONST_INT_P (XEXP (inner, 1)))
12336 inner = XEXP (inner, 0);
12337
12338 /* Only some unspecs are valid as "constants". */
12339 if (GET_CODE (inner) == UNSPEC)
12340 switch (XINT (inner, 1))
12341 {
12342 case UNSPEC_GOT:
12343 case UNSPEC_GOTOFF:
12344 case UNSPEC_PLTOFF:
12345 return TARGET_64BIT;
12346 case UNSPEC_TPOFF:
12347 x = XVECEXP (inner, 0, 0);
12348 return (GET_CODE (x) == SYMBOL_REF
12349 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12350 case UNSPEC_MACHOPIC_OFFSET:
12351 return legitimate_pic_address_disp_p (x);
12352 default:
12353 return false;
12354 }
12355 /* FALLTHRU */
12356
12357 case SYMBOL_REF:
12358 case LABEL_REF:
12359 return legitimate_pic_address_disp_p (x);
12360
12361 default:
12362 return true;
12363 }
12364 }
12365
12366 /* Determine if a given CONST RTX is a valid memory displacement
12367 in PIC mode. */
12368
12369 bool
12370 legitimate_pic_address_disp_p (rtx disp)
12371 {
12372 bool saw_plus;
12373
12374 /* In 64bit mode we can allow direct addresses of symbols and labels
12375 when they are not dynamic symbols. */
12376 if (TARGET_64BIT)
12377 {
12378 rtx op0 = disp, op1;
12379
12380 switch (GET_CODE (disp))
12381 {
12382 case LABEL_REF:
12383 return true;
12384
12385 case CONST:
12386 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12387 break;
12388 op0 = XEXP (XEXP (disp, 0), 0);
12389 op1 = XEXP (XEXP (disp, 0), 1);
12390 if (!CONST_INT_P (op1)
12391 || INTVAL (op1) >= 16*1024*1024
12392 || INTVAL (op1) < -16*1024*1024)
12393 break;
12394 if (GET_CODE (op0) == LABEL_REF)
12395 return true;
12396 if (GET_CODE (op0) == CONST
12397 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12398 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12399 return true;
12400 if (GET_CODE (op0) == UNSPEC
12401 && XINT (op0, 1) == UNSPEC_PCREL)
12402 return true;
12403 if (GET_CODE (op0) != SYMBOL_REF)
12404 break;
12405 /* FALLTHRU */
12406
12407 case SYMBOL_REF:
12408 /* TLS references should always be enclosed in UNSPEC.
12409 A dllimported symbol always needs to be resolved. */
12410 if (SYMBOL_REF_TLS_MODEL (op0)
12411 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
12412 return false;
12413
12414 if (TARGET_PECOFF)
12415 {
12416 if (is_imported_p (op0))
12417 return true;
12418
12419 if (SYMBOL_REF_FAR_ADDR_P (op0)
12420 || !SYMBOL_REF_LOCAL_P (op0))
12421 break;
12422
12423 /* Function symbols need to be resolved only for
12424 the large model.
12425 For the small model we don't need to resolve anything
12426 here. */
12427 if ((ix86_cmodel != CM_LARGE_PIC
12428 && SYMBOL_REF_FUNCTION_P (op0))
12429 || ix86_cmodel == CM_SMALL_PIC)
12430 return true;
12431 /* Non-external symbols don't need to be resolved for
12432 the large and medium models. */
12433 if ((ix86_cmodel == CM_LARGE_PIC
12434 || ix86_cmodel == CM_MEDIUM_PIC)
12435 && !SYMBOL_REF_EXTERNAL_P (op0))
12436 return true;
12437 }
12438 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
12439 && SYMBOL_REF_LOCAL_P (op0)
12440 && ix86_cmodel != CM_LARGE_PIC)
12441 return true;
12442 break;
12443
12444 default:
12445 break;
12446 }
12447 }
12448 if (GET_CODE (disp) != CONST)
12449 return false;
12450 disp = XEXP (disp, 0);
12451
12452 if (TARGET_64BIT)
12453 {
12454 /* Allowing PLUS expressions here would be unsafe, since it would defeat
12455 the limit on the allowed distance of GOT tables. We should not need these anyway. */
12456 if (GET_CODE (disp) != UNSPEC
12457 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12458 && XINT (disp, 1) != UNSPEC_GOTOFF
12459 && XINT (disp, 1) != UNSPEC_PCREL
12460 && XINT (disp, 1) != UNSPEC_PLTOFF))
12461 return false;
12462
12463 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12464 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12465 return false;
12466 return true;
12467 }
12468
12469 saw_plus = false;
12470 if (GET_CODE (disp) == PLUS)
12471 {
12472 if (!CONST_INT_P (XEXP (disp, 1)))
12473 return false;
12474 disp = XEXP (disp, 0);
12475 saw_plus = true;
12476 }
12477
12478 if (TARGET_MACHO && darwin_local_data_pic (disp))
12479 return true;
12480
12481 if (GET_CODE (disp) != UNSPEC)
12482 return false;
12483
12484 switch (XINT (disp, 1))
12485 {
12486 case UNSPEC_GOT:
12487 if (saw_plus)
12488 return false;
12489 /* We need to check for both symbols and labels because VxWorks loads
12490 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12491 details. */
12492 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12493 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12494 case UNSPEC_GOTOFF:
12495 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12496 While the ABI also specifies a 32bit relocation, we don't produce it in
12497 the small PIC model at all. */
12498 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12499 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12500 && !TARGET_64BIT)
12501 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12502 return false;
12503 case UNSPEC_GOTTPOFF:
12504 case UNSPEC_GOTNTPOFF:
12505 case UNSPEC_INDNTPOFF:
12506 if (saw_plus)
12507 return false;
12508 disp = XVECEXP (disp, 0, 0);
12509 return (GET_CODE (disp) == SYMBOL_REF
12510 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12511 case UNSPEC_NTPOFF:
12512 disp = XVECEXP (disp, 0, 0);
12513 return (GET_CODE (disp) == SYMBOL_REF
12514 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12515 case UNSPEC_DTPOFF:
12516 disp = XVECEXP (disp, 0, 0);
12517 return (GET_CODE (disp) == SYMBOL_REF
12518 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12519 }
12520
12521 return false;
12522 }
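/* Illustrative example (a sketch, not from the original sources): in
   32-bit PIC code a displacement of the form
   (const (unspec [(symbol_ref "x")] UNSPEC_GOTOFF)) is accepted above
   provided gotoff_operand agrees, whereas a @GOT reference combined
   with an additional constant offset is refused by the saw_plus
   check.  */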
12523
12524 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12525 replace the input X, or the original X if no replacement is called for.
12526 The output parameter *WIN is 1 if the calling macro should goto WIN,
12527 0 if it should not. */
12528
12529 bool
12530 ix86_legitimize_reload_address (rtx x,
12531 enum machine_mode mode ATTRIBUTE_UNUSED,
12532 int opnum, int type,
12533 int ind_levels ATTRIBUTE_UNUSED)
12534 {
12535 /* Reload can generate:
12536
12537 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12538 (reg:DI 97))
12539 (reg:DI 2 cx))
12540
12541 This RTX is rejected by ix86_legitimate_address_p due to
12542 non-strictness of base register 97. Following this rejection,
12543 reload pushes all three components into separate registers,
12544 creating an invalid memory address RTX.
12545
12546 The following code reloads only the invalid part of the
12547 memory address RTX. */
12548
12549 if (GET_CODE (x) == PLUS
12550 && REG_P (XEXP (x, 1))
12551 && GET_CODE (XEXP (x, 0)) == PLUS
12552 && REG_P (XEXP (XEXP (x, 0), 1)))
12553 {
12554 rtx base, index;
12555 bool something_reloaded = false;
12556
12557 base = XEXP (XEXP (x, 0), 1);
12558 if (!REG_OK_FOR_BASE_STRICT_P (base))
12559 {
12560 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12561 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12562 opnum, (enum reload_type) type);
12563 something_reloaded = true;
12564 }
12565
12566 index = XEXP (x, 1);
12567 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12568 {
12569 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12570 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12571 opnum, (enum reload_type) type);
12572 something_reloaded = true;
12573 }
12574
12575 gcc_assert (something_reloaded);
12576 return true;
12577 }
12578
12579 return false;
12580 }
12581
12582 /* Determine if OP is a suitable RTX for an address register.
12583 Return the naked register if a register or a register subreg is
12584 found, otherwise return NULL_RTX. */
12585
12586 static rtx
12587 ix86_validate_address_register (rtx op)
12588 {
12589 enum machine_mode mode = GET_MODE (op);
12590
12591 /* Only SImode or DImode registers can form the address. */
12592 if (mode != SImode && mode != DImode)
12593 return NULL_RTX;
12594
12595 if (REG_P (op))
12596 return op;
12597 else if (GET_CODE (op) == SUBREG)
12598 {
12599 rtx reg = SUBREG_REG (op);
12600
12601 if (!REG_P (reg))
12602 return NULL_RTX;
12603
12604 mode = GET_MODE (reg);
12605
12606 /* Don't allow SUBREGs that span more than a word. It can
12607 lead to spill failures when the register is one word out
12608 of a two word structure. */
12609 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
12610 return NULL_RTX;
12611
12612 /* Allow only SUBREGs of non-eliminable hard registers. */
12613 if (register_no_elim_operand (reg, mode))
12614 return reg;
12615 }
12616
12617 /* Op is not a register. */
12618 return NULL_RTX;
12619 }
12620
12621 /* Recognizes RTL expressions that are valid memory addresses for an
12622 instruction. The MODE argument is the machine mode for the MEM
12623 expression that wants to use this address.
12624
12625 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12626 convert common non-canonical forms to canonical form so that they will
12627 be recognized. */
12628
12629 static bool
12630 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12631 rtx addr, bool strict)
12632 {
12633 struct ix86_address parts;
12634 rtx base, index, disp;
12635 HOST_WIDE_INT scale;
12636 enum ix86_address_seg seg;
12637
12638 if (ix86_decompose_address (addr, &parts) <= 0)
12639 /* Decomposition failed. */
12640 return false;
12641
12642 base = parts.base;
12643 index = parts.index;
12644 disp = parts.disp;
12645 scale = parts.scale;
12646 seg = parts.seg;
12647
12648 /* Validate base register. */
12649 if (base)
12650 {
12651 rtx reg = ix86_validate_address_register (base);
12652
12653 if (reg == NULL_RTX)
12654 return false;
12655
12656 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12657 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12658 /* Base is not valid. */
12659 return false;
12660 }
12661
12662 /* Validate index register. */
12663 if (index)
12664 {
12665 rtx reg = ix86_validate_address_register (index);
12666
12667 if (reg == NULL_RTX)
12668 return false;
12669
12670 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12671 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12672 /* Index is not valid. */
12673 return false;
12674 }
12675
12676 /* Index and base should have the same mode. */
12677 if (base && index
12678 && GET_MODE (base) != GET_MODE (index))
12679 return false;
12680
12681 /* Address override works only on the (%reg) part of %fs:(%reg). */
12682 if (seg != SEG_DEFAULT
12683 && ((base && GET_MODE (base) != word_mode)
12684 || (index && GET_MODE (index) != word_mode)))
12685 return false;
12686
12687 /* Validate scale factor. */
12688 if (scale != 1)
12689 {
12690 if (!index)
12691 /* Scale without index. */
12692 return false;
12693
12694 if (scale != 2 && scale != 4 && scale != 8)
12695 /* Scale is not a valid multiplier. */
12696 return false;
12697 }
12698
12699 /* Validate displacement. */
12700 if (disp)
12701 {
12702 if (GET_CODE (disp) == CONST
12703 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12704 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12705 switch (XINT (XEXP (disp, 0), 1))
12706 {
12707 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12708 used. While the ABI also specifies 32bit relocations, we don't produce
12709 them at all and use IP-relative addressing instead. */
12710 case UNSPEC_GOT:
12711 case UNSPEC_GOTOFF:
12712 gcc_assert (flag_pic);
12713 if (!TARGET_64BIT)
12714 goto is_legitimate_pic;
12715
12716 /* 64bit address unspec. */
12717 return false;
12718
12719 case UNSPEC_GOTPCREL:
12720 case UNSPEC_PCREL:
12721 gcc_assert (flag_pic);
12722 goto is_legitimate_pic;
12723
12724 case UNSPEC_GOTTPOFF:
12725 case UNSPEC_GOTNTPOFF:
12726 case UNSPEC_INDNTPOFF:
12727 case UNSPEC_NTPOFF:
12728 case UNSPEC_DTPOFF:
12729 break;
12730
12731 case UNSPEC_STACK_CHECK:
12732 gcc_assert (flag_split_stack);
12733 break;
12734
12735 default:
12736 /* Invalid address unspec. */
12737 return false;
12738 }
12739
12740 else if (SYMBOLIC_CONST (disp)
12741 && (flag_pic
12742 || (TARGET_MACHO
12743 #if TARGET_MACHO
12744 && MACHOPIC_INDIRECT
12745 && !machopic_operand_p (disp)
12746 #endif
12747 )))
12748 {
12749
12750 is_legitimate_pic:
12751 if (TARGET_64BIT && (index || base))
12752 {
12753 /* foo@dtpoff(%rX) is ok. */
12754 if (GET_CODE (disp) != CONST
12755 || GET_CODE (XEXP (disp, 0)) != PLUS
12756 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12757 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12758 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12759 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12760 /* Non-constant pic memory reference. */
12761 return false;
12762 }
12763 else if ((!TARGET_MACHO || flag_pic)
12764 && ! legitimate_pic_address_disp_p (disp))
12765 /* Displacement is an invalid pic construct. */
12766 return false;
12767 #if TARGET_MACHO
12768 else if (MACHO_DYNAMIC_NO_PIC_P
12769 && !ix86_legitimate_constant_p (Pmode, disp))
12770 /* Displacement must be referenced via non_lazy_pointer. */
12771 return false;
12772 #endif
12773
12774 /* This code used to verify that a symbolic pic displacement
12775 includes the pic_offset_table_rtx register.
12776
12777 While this is a good idea, unfortunately these constructs may
12778 be created by the "adds using lea" optimization for incorrect
12779 code like:
12780
12781 int a;
12782 int foo(int i)
12783 {
12784 return *(&a+i);
12785 }
12786
12787 This code is nonsensical, but results in addressing the
12788 GOT table with a pic_offset_table_rtx base. We can't
12789 just refuse it easily, since it gets matched by the
12790 "addsi3" pattern, which later gets split to lea when the
12791 output register differs from the input. While this
12792 could be handled by a separate addsi pattern for this case
12793 that never results in lea, disabling this test seems to be
12794 the easier and correct fix for the crash. */
12795 }
12796 else if (GET_CODE (disp) != LABEL_REF
12797 && !CONST_INT_P (disp)
12798 && (GET_CODE (disp) != CONST
12799 || !ix86_legitimate_constant_p (Pmode, disp))
12800 && (GET_CODE (disp) != SYMBOL_REF
12801 || !ix86_legitimate_constant_p (Pmode, disp)))
12802 /* Displacement is not constant. */
12803 return false;
12804 else if (TARGET_64BIT
12805 && !x86_64_immediate_operand (disp, VOIDmode))
12806 /* Displacement is out of range. */
12807 return false;
12808 /* In x32 mode, constant addresses are sign extended to 64bit, so
12809 we have to prevent addresses from 0x80000000 to 0xffffffff. */
12810 else if (TARGET_X32 && !(index || base)
12811 && CONST_INT_P (disp)
12812 && val_signbit_known_set_p (SImode, INTVAL (disp)))
12813 return false;
12814 }
12815
12816 /* Everything looks valid. */
12817 return true;
12818 }
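/* Illustrative example (a sketch, not from the original sources): the
   checks above reject an index scaled by 3, a scale without an index
   register, and a mix of SImode and DImode base and index registers,
   while a plain (plus (reg) (const_int 8)) or a scaled-index address
   with scale 1, 2, 4 or 8 passes through to the displacement
   checks.  */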
12819
12820 /* Determine if a given RTX is a valid constant address. */
12821
12822 bool
12823 constant_address_p (rtx x)
12824 {
12825 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12826 }
12827 \f
12828 /* Return a unique alias set for the GOT. */
12829
12830 static alias_set_type
12831 ix86_GOT_alias_set (void)
12832 {
12833 static alias_set_type set = -1;
12834 if (set == -1)
12835 set = new_alias_set ();
12836 return set;
12837 }
12838
12839 /* Return a legitimate reference for ORIG (an address) using the
12840 register REG. If REG is 0, a new pseudo is generated.
12841
12842 There are two types of references that must be handled:
12843
12844 1. Global data references must load the address from the GOT, via
12845 the PIC reg. An insn is emitted to do this load, and the reg is
12846 returned.
12847
12848 2. Static data references, constant pool addresses, and code labels
12849 compute the address as an offset from the GOT, whose base is in
12850 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12851 differentiate them from global data objects. The returned
12852 address is the PIC reg + an unspec constant.
12853
12854 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12855 reg also appears in the address. */
12856
12857 static rtx
12858 legitimize_pic_address (rtx orig, rtx reg)
12859 {
12860 rtx addr = orig;
12861 rtx new_rtx = orig;
12862
12863 #if TARGET_MACHO
12864 if (TARGET_MACHO && !TARGET_64BIT)
12865 {
12866 if (reg == 0)
12867 reg = gen_reg_rtx (Pmode);
12868 /* Use the generic Mach-O PIC machinery. */
12869 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12870 }
12871 #endif
12872
12873 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12874 {
12875 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12876 if (tmp)
12877 return tmp;
12878 }
12879
12880 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12881 new_rtx = addr;
12882 else if (TARGET_64BIT && !TARGET_PECOFF
12883 && ix86_cmodel != CM_SMALL_PIC && gotoff_operand (addr, Pmode))
12884 {
12885 rtx tmpreg;
12886 /* This symbol may be referenced via a displacement from the PIC
12887 base address (@GOTOFF). */
12888
12889 if (reload_in_progress)
12890 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12891 if (GET_CODE (addr) == CONST)
12892 addr = XEXP (addr, 0);
12893 if (GET_CODE (addr) == PLUS)
12894 {
12895 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12896 UNSPEC_GOTOFF);
12897 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12898 }
12899 else
12900 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12901 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12902 if (!reg)
12903 tmpreg = gen_reg_rtx (Pmode);
12904 else
12905 tmpreg = reg;
12906 emit_move_insn (tmpreg, new_rtx);
12907
12908 if (reg != 0)
12909 {
12910 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12911 tmpreg, 1, OPTAB_DIRECT);
12912 new_rtx = reg;
12913 }
12914 else
12915 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12916 }
12917 else if (!TARGET_64BIT && !TARGET_PECOFF && gotoff_operand (addr, Pmode))
12918 {
12919 /* This symbol may be referenced via a displacement from the PIC
12920 base address (@GOTOFF). */
12921
12922 if (reload_in_progress)
12923 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12924 if (GET_CODE (addr) == CONST)
12925 addr = XEXP (addr, 0);
12926 if (GET_CODE (addr) == PLUS)
12927 {
12928 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12929 UNSPEC_GOTOFF);
12930 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12931 }
12932 else
12933 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12934 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12935 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12936
12937 if (reg != 0)
12938 {
12939 emit_move_insn (reg, new_rtx);
12940 new_rtx = reg;
12941 }
12942 }
12943 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12944 /* We can't use @GOTOFF for text labels on VxWorks;
12945 see gotoff_operand. */
12946 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12947 {
12948 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12949 if (tmp)
12950 return tmp;
12951
12952 /* For x64 PE-COFF there is no GOT table, so we use the address
12953 directly. */
12954 if (TARGET_64BIT && TARGET_PECOFF)
12955 {
12956 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12957 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12958
12959 if (reg == 0)
12960 reg = gen_reg_rtx (Pmode);
12961 emit_move_insn (reg, new_rtx);
12962 new_rtx = reg;
12963 }
12964 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12965 {
12966 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12967 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12968 new_rtx = gen_const_mem (Pmode, new_rtx);
12969 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12970
12971 if (reg == 0)
12972 reg = gen_reg_rtx (Pmode);
12973 /* Use gen_movsi directly, otherwise the address is loaded
12974 into a register for CSE. We don't want to CSE this address;
12975 instead we CSE addresses from the GOT table, so skip this. */
12976 emit_insn (gen_movsi (reg, new_rtx));
12977 new_rtx = reg;
12978 }
12979 else
12980 {
12981 /* This symbol must be referenced via a load from the
12982 Global Offset Table (@GOT). */
12983
12984 if (reload_in_progress)
12985 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12986 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12987 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12988 if (TARGET_64BIT)
12989 new_rtx = force_reg (Pmode, new_rtx);
12990 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12991 new_rtx = gen_const_mem (Pmode, new_rtx);
12992 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12993
12994 if (reg == 0)
12995 reg = gen_reg_rtx (Pmode);
12996 emit_move_insn (reg, new_rtx);
12997 new_rtx = reg;
12998 }
12999 }
13000 else
13001 {
13002 if (CONST_INT_P (addr)
13003 && !x86_64_immediate_operand (addr, VOIDmode))
13004 {
13005 if (reg)
13006 {
13007 emit_move_insn (reg, addr);
13008 new_rtx = reg;
13009 }
13010 else
13011 new_rtx = force_reg (Pmode, addr);
13012 }
13013 else if (GET_CODE (addr) == CONST)
13014 {
13015 addr = XEXP (addr, 0);
13016
13017 /* We must match stuff we generate before. Assume the only
13018 unspecs that can get here are ours. Not that we could do
13019 anything with them anyway.... */
13020 if (GET_CODE (addr) == UNSPEC
13021 || (GET_CODE (addr) == PLUS
13022 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
13023 return orig;
13024 gcc_assert (GET_CODE (addr) == PLUS);
13025 }
13026 if (GET_CODE (addr) == PLUS)
13027 {
13028 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
13029
13030 /* Check first to see if this is a constant offset from a @GOTOFF
13031 symbol reference. */
13032 if (!TARGET_PECOFF && gotoff_operand (op0, Pmode)
13033 && CONST_INT_P (op1))
13034 {
13035 if (!TARGET_64BIT)
13036 {
13037 if (reload_in_progress)
13038 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13039 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
13040 UNSPEC_GOTOFF);
13041 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
13042 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
13043 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
13044
13045 if (reg != 0)
13046 {
13047 emit_move_insn (reg, new_rtx);
13048 new_rtx = reg;
13049 }
13050 }
13051 else
13052 {
13053 if (INTVAL (op1) < -16*1024*1024
13054 || INTVAL (op1) >= 16*1024*1024)
13055 {
13056 if (!x86_64_immediate_operand (op1, Pmode))
13057 op1 = force_reg (Pmode, op1);
13058 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
13059 }
13060 }
13061 }
13062 else
13063 {
13064 rtx base = legitimize_pic_address (op0, reg);
13065 enum machine_mode mode = GET_MODE (base);
13066 new_rtx
13067 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
13068
13069 if (CONST_INT_P (new_rtx))
13070 {
13071 if (INTVAL (new_rtx) < -16*1024*1024
13072 || INTVAL (new_rtx) >= 16*1024*1024)
13073 {
13074 if (!x86_64_immediate_operand (new_rtx, mode))
13075 new_rtx = force_reg (mode, new_rtx);
13076 new_rtx
13077 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
13078 }
13079 else
13080 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
13081 }
13082 else
13083 {
13084 if (GET_CODE (new_rtx) == PLUS
13085 && CONSTANT_P (XEXP (new_rtx, 1)))
13086 {
13087 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
13088 new_rtx = XEXP (new_rtx, 1);
13089 }
13090 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
13091 }
13092 }
13093 }
13094 }
13095 return new_rtx;
13096 }
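/* Illustrative example (a sketch, not from the original sources): in
   32-bit PIC code a global (non-local) symbol_ref "foo" takes the @GOT
   branch above and is rewritten to

     (mem (plus pic_offset_table_rtx
		(const (unspec [(symbol_ref "foo")] UNSPEC_GOT))))

   loaded into a register, i.e. a load of foo@GOT(%ebx), while a local
   symbol takes the @GOTOFF branch and becomes a simple addition to the
   PIC register.  */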
13097 \f
13098 /* Load the thread pointer. If TO_REG is true, force it into a register. */
13099
13100 static rtx
13101 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
13102 {
13103 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
13104
13105 if (GET_MODE (tp) != tp_mode)
13106 {
13107 gcc_assert (GET_MODE (tp) == SImode);
13108 gcc_assert (tp_mode == DImode);
13109
13110 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
13111 }
13112
13113 if (to_reg)
13114 tp = copy_to_mode_reg (tp_mode, tp);
13115
13116 return tp;
13117 }
13118
13119 /* Construct the SYMBOL_REF for the tls_get_addr function. */
13120
13121 static GTY(()) rtx ix86_tls_symbol;
13122
13123 static rtx
13124 ix86_tls_get_addr (void)
13125 {
13126 if (!ix86_tls_symbol)
13127 {
13128 const char *sym
13129 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
13130 ? "___tls_get_addr" : "__tls_get_addr");
13131
13132 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
13133 }
13134
13135 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
13136 {
13137 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
13138 UNSPEC_PLTOFF);
13139 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
13140 gen_rtx_CONST (Pmode, unspec));
13141 }
13142
13143 return ix86_tls_symbol;
13144 }
13145
13146 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
13147
13148 static GTY(()) rtx ix86_tls_module_base_symbol;
13149
13150 rtx
13151 ix86_tls_module_base (void)
13152 {
13153 if (!ix86_tls_module_base_symbol)
13154 {
13155 ix86_tls_module_base_symbol
13156 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
13157
13158 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
13159 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
13160 }
13161
13162 return ix86_tls_module_base_symbol;
13163 }
13164
13165 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
13166 false if we expect this to be used for a memory address and true if
13167 we expect to load the address into a register. */
13168
13169 static rtx
13170 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
13171 {
13172 rtx dest, base, off;
13173 rtx pic = NULL_RTX, tp = NULL_RTX;
13174 enum machine_mode tp_mode = Pmode;
13175 int type;
13176
13177 switch (model)
13178 {
13179 case TLS_MODEL_GLOBAL_DYNAMIC:
13180 dest = gen_reg_rtx (Pmode);
13181
13182 if (!TARGET_64BIT)
13183 {
13184 if (flag_pic && !TARGET_PECOFF)
13185 pic = pic_offset_table_rtx;
13186 else
13187 {
13188 pic = gen_reg_rtx (Pmode);
13189 emit_insn (gen_set_got (pic));
13190 }
13191 }
13192
13193 if (TARGET_GNU2_TLS)
13194 {
13195 if (TARGET_64BIT)
13196 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
13197 else
13198 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
13199
13200 tp = get_thread_pointer (Pmode, true);
13201 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
13202
13203 if (GET_MODE (x) != Pmode)
13204 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13205
13206 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13207 }
13208 else
13209 {
13210 rtx caddr = ix86_tls_get_addr ();
13211
13212 if (TARGET_64BIT)
13213 {
13214 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13215 rtx insns;
13216
13217 start_sequence ();
13218 emit_call_insn
13219 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
13220 insns = get_insns ();
13221 end_sequence ();
13222
13223 if (GET_MODE (x) != Pmode)
13224 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13225
13226 RTL_CONST_CALL_P (insns) = 1;
13227 emit_libcall_block (insns, dest, rax, x);
13228 }
13229 else
13230 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
13231 }
13232 break;
13233
13234 case TLS_MODEL_LOCAL_DYNAMIC:
13235 base = gen_reg_rtx (Pmode);
13236
13237 if (!TARGET_64BIT)
13238 {
13239 if (flag_pic)
13240 pic = pic_offset_table_rtx;
13241 else
13242 {
13243 pic = gen_reg_rtx (Pmode);
13244 emit_insn (gen_set_got (pic));
13245 }
13246 }
13247
13248 if (TARGET_GNU2_TLS)
13249 {
13250 rtx tmp = ix86_tls_module_base ();
13251
13252 if (TARGET_64BIT)
13253 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
13254 else
13255 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
13256
13257 tp = get_thread_pointer (Pmode, true);
13258 set_unique_reg_note (get_last_insn (), REG_EQUAL,
13259 gen_rtx_MINUS (Pmode, tmp, tp));
13260 }
13261 else
13262 {
13263 rtx caddr = ix86_tls_get_addr ();
13264
13265 if (TARGET_64BIT)
13266 {
13267 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13268 rtx insns, eqv;
13269
13270 start_sequence ();
13271 emit_call_insn
13272 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
13273 insns = get_insns ();
13274 end_sequence ();
13275
13276 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
13277 share the LD_BASE result with other LD model accesses. */
13278 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
13279 UNSPEC_TLS_LD_BASE);
13280
13281 RTL_CONST_CALL_P (insns) = 1;
13282 emit_libcall_block (insns, base, rax, eqv);
13283 }
13284 else
13285 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
13286 }
13287
13288 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
13289 off = gen_rtx_CONST (Pmode, off);
13290
13291 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
13292
13293 if (TARGET_GNU2_TLS)
13294 {
13295 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
13296
13297 if (GET_MODE (x) != Pmode)
13298 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13299
13300 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13301 }
13302 break;
13303
13304 case TLS_MODEL_INITIAL_EXEC:
13305 if (TARGET_64BIT)
13306 {
13307 if (TARGET_SUN_TLS && !TARGET_X32)
13308 {
13309 /* The Sun linker took the AMD64 TLS spec literally
13310 and can only handle %rax as the destination of the
13311 initial-exec code sequence. */
13312
13313 dest = gen_reg_rtx (DImode);
13314 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
13315 return dest;
13316 }
13317
13318 /* Generate DImode references to avoid %fs:(%reg32)
13319 problems and the linker IE->LE relaxation bug. */
13320 tp_mode = DImode;
13321 pic = NULL;
13322 type = UNSPEC_GOTNTPOFF;
13323 }
13324 else if (flag_pic)
13325 {
13326 if (reload_in_progress)
13327 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13328 pic = pic_offset_table_rtx;
13329 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13330 }
13331 else if (!TARGET_ANY_GNU_TLS)
13332 {
13333 pic = gen_reg_rtx (Pmode);
13334 emit_insn (gen_set_got (pic));
13335 type = UNSPEC_GOTTPOFF;
13336 }
13337 else
13338 {
13339 pic = NULL;
13340 type = UNSPEC_INDNTPOFF;
13341 }
13342
13343 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13344 off = gen_rtx_CONST (tp_mode, off);
13345 if (pic)
13346 off = gen_rtx_PLUS (tp_mode, pic, off);
13347 off = gen_const_mem (tp_mode, off);
13348 set_mem_alias_set (off, ix86_GOT_alias_set ());
13349
13350 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13351 {
13352 base = get_thread_pointer (tp_mode,
13353 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13354 off = force_reg (tp_mode, off);
13355 return gen_rtx_PLUS (tp_mode, base, off);
13356 }
13357 else
13358 {
13359 base = get_thread_pointer (Pmode, true);
13360 dest = gen_reg_rtx (Pmode);
13361 emit_insn (ix86_gen_sub3 (dest, base, off));
13362 }
13363 break;
13364
13365 case TLS_MODEL_LOCAL_EXEC:
13366 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13367 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13368 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13369 off = gen_rtx_CONST (Pmode, off);
13370
13371 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13372 {
13373 base = get_thread_pointer (Pmode,
13374 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13375 return gen_rtx_PLUS (Pmode, base, off);
13376 }
13377 else
13378 {
13379 base = get_thread_pointer (Pmode, true);
13380 dest = gen_reg_rtx (Pmode);
13381 emit_insn (ix86_gen_sub3 (dest, base, off));
13382 }
13383 break;
13384
13385 default:
13386 gcc_unreachable ();
13387 }
13388
13389 return dest;
13390 }
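/* Illustrative example (a sketch, not from the original sources): for
   a local-exec TLS variable "x" under the GNU TLS dialect the code
   above returns

     (plus (unspec [(const_int 0)] UNSPEC_TP)
	   (const (unspec [(symbol_ref "x")] UNSPEC_NTPOFF)))

   which, with TLS_DIRECT_SEG_REFS in effect, is printed as the
   segment-relative address %fs:x@tpoff (64-bit) or %gs:x@ntpoff
   (32-bit), avoiding any call to __tls_get_addr.  */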
13391
13392 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13393 to symbol DECL if BEIMPORT is true. Otherwise create or return the
13394 unique refptr-DECL symbol corresponding to symbol DECL. */
13395
13396 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13397 htab_t dllimport_map;
13398
13399 static tree
13400 get_dllimport_decl (tree decl, bool beimport)
13401 {
13402 struct tree_map *h, in;
13403 void **loc;
13404 const char *name;
13405 const char *prefix;
13406 size_t namelen, prefixlen;
13407 char *imp_name;
13408 tree to;
13409 rtx rtl;
13410
13411 if (!dllimport_map)
13412 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13413
13414 in.hash = htab_hash_pointer (decl);
13415 in.base.from = decl;
13416 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13417 h = (struct tree_map *) *loc;
13418 if (h)
13419 return h->to;
13420
13421 *loc = h = ggc_alloc_tree_map ();
13422 h->hash = in.hash;
13423 h->base.from = decl;
13424 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13425 VAR_DECL, NULL, ptr_type_node);
13426 DECL_ARTIFICIAL (to) = 1;
13427 DECL_IGNORED_P (to) = 1;
13428 DECL_EXTERNAL (to) = 1;
13429 TREE_READONLY (to) = 1;
13430
13431 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13432 name = targetm.strip_name_encoding (name);
13433 if (beimport)
13434 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13435 ? "*__imp_" : "*__imp__";
13436 else
13437 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
13438 namelen = strlen (name);
13439 prefixlen = strlen (prefix);
13440 imp_name = (char *) alloca (namelen + prefixlen + 1);
13441 memcpy (imp_name, prefix, prefixlen);
13442 memcpy (imp_name + prefixlen, name, namelen + 1);
13443
13444 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13445 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13446 SET_SYMBOL_REF_DECL (rtl, to);
13447 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
13448 if (!beimport)
13449 {
13450 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
13451 #ifdef SUB_TARGET_RECORD_STUB
13452 SUB_TARGET_RECORD_STUB (name);
13453 #endif
13454 }
13455
13456 rtl = gen_const_mem (Pmode, rtl);
13457 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13458
13459 SET_DECL_RTL (to, rtl);
13460 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13461
13462 return to;
13463 }
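/* Illustrative example (hypothetical decl name, not from the original
   sources): for a dllimported declaration "foo" and an empty
   user_label_prefix the stub created above is named "*__imp_foo"
   (BEIMPORT true) or "*.refptr.foo" (BEIMPORT false), and its DECL_RTL
   is a constant MEM through which the real address of "foo" is
   loaded.  */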
13464
13465 /* Expand SYMBOL into its corresponding far-address symbol.
13466 WANT_REG is true if we require the result to be a register. */
13467
13468 static rtx
13469 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
13470 {
13471 tree imp_decl;
13472 rtx x;
13473
13474 gcc_assert (SYMBOL_REF_DECL (symbol));
13475 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
13476
13477 x = DECL_RTL (imp_decl);
13478 if (want_reg)
13479 x = force_reg (Pmode, x);
13480 return x;
13481 }
13482
13483 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13484 true if we require the result to be a register. */
13485
13486 static rtx
13487 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13488 {
13489 tree imp_decl;
13490 rtx x;
13491
13492 gcc_assert (SYMBOL_REF_DECL (symbol));
13493 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
13494
13495 x = DECL_RTL (imp_decl);
13496 if (want_reg)
13497 x = force_reg (Pmode, x);
13498 return x;
13499 }
13500
13501 /* Expand ADDR into its corresponding dllimport or refptr symbol. INREG
13502 is true if we require the result to be a register. */
13503
13504 static rtx
13505 legitimize_pe_coff_symbol (rtx addr, bool inreg)
13506 {
13507 if (!TARGET_PECOFF)
13508 return NULL_RTX;
13509
13510 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13511 {
13512 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
13513 return legitimize_dllimport_symbol (addr, inreg);
13514 if (GET_CODE (addr) == CONST
13515 && GET_CODE (XEXP (addr, 0)) == PLUS
13516 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13517 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
13518 {
13519 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
13520 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13521 }
13522 }
13523
13524 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
13525 return NULL_RTX;
13526 if (GET_CODE (addr) == SYMBOL_REF
13527 && !is_imported_p (addr)
13528 && SYMBOL_REF_EXTERNAL_P (addr)
13529 && SYMBOL_REF_DECL (addr))
13530 return legitimize_pe_coff_extern_decl (addr, inreg);
13531
13532 if (GET_CODE (addr) == CONST
13533 && GET_CODE (XEXP (addr, 0)) == PLUS
13534 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13535 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
13536 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
13537 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
13538 {
13539 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
13540 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13541 }
13542 return NULL_RTX;
13543 }
13544
13545 /* Try machine-dependent ways of modifying an illegitimate address
13546 to be legitimate. If we find one, return the new, valid address.
13547 This macro is used in only one place: `memory_address' in explow.c.
13548
13549 OLDX is the address as it was before break_out_memory_refs was called.
13550 In some cases it is useful to look at this to decide what needs to be done.
13551
13552 It is always safe for this macro to do nothing. It exists to recognize
13553 opportunities to optimize the output.
13554
13555 For the 80386, we handle X+REG by loading X into a register R and
13556 using R+REG. R will go in a general reg and indexing will be used.
13557 However, if REG is a broken-out memory address or multiplication,
13558 nothing needs to be done because REG can certainly go in a general reg.
13559
13560 When -fpic is used, special handling is needed for symbolic references.
13561 See comments by legitimize_pic_address in i386.c for details. */
13562
13563 static rtx
13564 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13565 enum machine_mode mode)
13566 {
13567 int changed = 0;
13568 unsigned log;
13569
13570 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13571 if (log)
13572 return legitimize_tls_address (x, (enum tls_model) log, false);
13573 if (GET_CODE (x) == CONST
13574 && GET_CODE (XEXP (x, 0)) == PLUS
13575 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13576 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13577 {
13578 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13579 (enum tls_model) log, false);
13580 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13581 }
13582
13583 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13584 {
13585 rtx tmp = legitimize_pe_coff_symbol (x, true);
13586 if (tmp)
13587 return tmp;
13588 }
13589
13590 if (flag_pic && SYMBOLIC_CONST (x))
13591 return legitimize_pic_address (x, 0);
13592
13593 #if TARGET_MACHO
13594 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13595 return machopic_indirect_data_reference (x, 0);
13596 #endif
13597
13598 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13599 if (GET_CODE (x) == ASHIFT
13600 && CONST_INT_P (XEXP (x, 1))
13601 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13602 {
13603 changed = 1;
13604 log = INTVAL (XEXP (x, 1));
13605 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13606 GEN_INT (1 << log));
13607 }
13608
13609 if (GET_CODE (x) == PLUS)
13610 {
13611 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13612
13613 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13614 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13615 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13616 {
13617 changed = 1;
13618 log = INTVAL (XEXP (XEXP (x, 0), 1));
13619 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13620 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13621 GEN_INT (1 << log));
13622 }
13623
13624 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13625 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13626 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13627 {
13628 changed = 1;
13629 log = INTVAL (XEXP (XEXP (x, 1), 1));
13630 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13631 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13632 GEN_INT (1 << log));
13633 }
13634
13635 /* Put multiply first if it isn't already. */
13636 if (GET_CODE (XEXP (x, 1)) == MULT)
13637 {
13638 rtx tmp = XEXP (x, 0);
13639 XEXP (x, 0) = XEXP (x, 1);
13640 XEXP (x, 1) = tmp;
13641 changed = 1;
13642 }
13643
13644 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13645 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13646 created by virtual register instantiation, register elimination, and
13647 similar optimizations. */
13648 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13649 {
13650 changed = 1;
13651 x = gen_rtx_PLUS (Pmode,
13652 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13653 XEXP (XEXP (x, 1), 0)),
13654 XEXP (XEXP (x, 1), 1));
13655 }
13656
13657 /* Canonicalize
13658 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13659 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13660 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13661 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13662 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13663 && CONSTANT_P (XEXP (x, 1)))
13664 {
13665 rtx constant;
13666 rtx other = NULL_RTX;
13667
13668 if (CONST_INT_P (XEXP (x, 1)))
13669 {
13670 constant = XEXP (x, 1);
13671 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13672 }
13673 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13674 {
13675 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13676 other = XEXP (x, 1);
13677 }
13678 else
13679 constant = 0;
13680
13681 if (constant)
13682 {
13683 changed = 1;
13684 x = gen_rtx_PLUS (Pmode,
13685 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13686 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13687 plus_constant (Pmode, other,
13688 INTVAL (constant)));
13689 }
13690 }
13691
13692 if (changed && ix86_legitimate_address_p (mode, x, false))
13693 return x;
13694
13695 if (GET_CODE (XEXP (x, 0)) == MULT)
13696 {
13697 changed = 1;
13698 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13699 }
13700
13701 if (GET_CODE (XEXP (x, 1)) == MULT)
13702 {
13703 changed = 1;
13704 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13705 }
13706
13707 if (changed
13708 && REG_P (XEXP (x, 1))
13709 && REG_P (XEXP (x, 0)))
13710 return x;
13711
13712 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13713 {
13714 changed = 1;
13715 x = legitimize_pic_address (x, 0);
13716 }
13717
13718 if (changed && ix86_legitimate_address_p (mode, x, false))
13719 return x;
13720
13721 if (REG_P (XEXP (x, 0)))
13722 {
13723 rtx temp = gen_reg_rtx (Pmode);
13724 rtx val = force_operand (XEXP (x, 1), temp);
13725 if (val != temp)
13726 {
13727 val = convert_to_mode (Pmode, val, 1);
13728 emit_move_insn (temp, val);
13729 }
13730
13731 XEXP (x, 1) = temp;
13732 return x;
13733 }
13734
13735 else if (REG_P (XEXP (x, 1)))
13736 {
13737 rtx temp = gen_reg_rtx (Pmode);
13738 rtx val = force_operand (XEXP (x, 0), temp);
13739 if (val != temp)
13740 {
13741 val = convert_to_mode (Pmode, val, 1);
13742 emit_move_insn (temp, val);
13743 }
13744
13745 XEXP (x, 0) = temp;
13746 return x;
13747 }
13748 }
13749
13750 return x;
13751 }
13752 \f
13753 /* Print an integer constant expression in assembler syntax. Addition
13754 and subtraction are the only arithmetic that may appear in these
13755 expressions. FILE is the stdio stream to write to, X is the rtx, and
13756 CODE is the operand print code from the output string. */
13757
13758 static void
13759 output_pic_addr_const (FILE *file, rtx x, int code)
13760 {
13761 char buf[256];
13762
13763 switch (GET_CODE (x))
13764 {
13765 case PC:
13766 gcc_assert (flag_pic);
13767 putc ('.', file);
13768 break;
13769
13770 case SYMBOL_REF:
13771 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13772 output_addr_const (file, x);
13773 else
13774 {
13775 const char *name = XSTR (x, 0);
13776
13777 /* Mark the decl as referenced so that cgraph will
13778 output the function. */
13779 if (SYMBOL_REF_DECL (x))
13780 mark_decl_referenced (SYMBOL_REF_DECL (x));
13781
13782 #if TARGET_MACHO
13783 if (MACHOPIC_INDIRECT
13784 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13785 name = machopic_indirection_name (x, /*stub_p=*/true);
13786 #endif
13787 assemble_name (file, name);
13788 }
13789 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
13790 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13791 fputs ("@PLT", file);
13792 break;
13793
13794 case LABEL_REF:
13795 x = XEXP (x, 0);
13796 /* FALLTHRU */
13797 case CODE_LABEL:
13798 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13799 assemble_name (asm_out_file, buf);
13800 break;
13801
13802 case CONST_INT:
13803 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13804 break;
13805
13806 case CONST:
13807 /* This used to output parentheses around the expression,
13808 but that does not work on the 386 (either ATT or BSD assembler). */
13809 output_pic_addr_const (file, XEXP (x, 0), code);
13810 break;
13811
13812 case CONST_DOUBLE:
13813 if (GET_MODE (x) == VOIDmode)
13814 {
13815 /* We can use %d if the number is <32 bits and positive. */
13816 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13817 fprintf (file, "0x%lx%08lx",
13818 (unsigned long) CONST_DOUBLE_HIGH (x),
13819 (unsigned long) CONST_DOUBLE_LOW (x));
13820 else
13821 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13822 }
13823 else
13824 /* We can't handle floating point constants;
13825 TARGET_PRINT_OPERAND must handle them. */
13826 output_operand_lossage ("floating constant misused");
13827 break;
13828
13829 case PLUS:
13830 /* Some assemblers need integer constants to appear first. */
13831 if (CONST_INT_P (XEXP (x, 0)))
13832 {
13833 output_pic_addr_const (file, XEXP (x, 0), code);
13834 putc ('+', file);
13835 output_pic_addr_const (file, XEXP (x, 1), code);
13836 }
13837 else
13838 {
13839 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13840 output_pic_addr_const (file, XEXP (x, 1), code);
13841 putc ('+', file);
13842 output_pic_addr_const (file, XEXP (x, 0), code);
13843 }
13844 break;
13845
13846 case MINUS:
13847 if (!TARGET_MACHO)
13848 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13849 output_pic_addr_const (file, XEXP (x, 0), code);
13850 putc ('-', file);
13851 output_pic_addr_const (file, XEXP (x, 1), code);
13852 if (!TARGET_MACHO)
13853 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13854 break;
13855
13856 case UNSPEC:
13857 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13858 {
13859 bool f = i386_asm_output_addr_const_extra (file, x);
13860 gcc_assert (f);
13861 break;
13862 }
13863
13864 gcc_assert (XVECLEN (x, 0) == 1);
13865 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13866 switch (XINT (x, 1))
13867 {
13868 case UNSPEC_GOT:
13869 fputs ("@GOT", file);
13870 break;
13871 case UNSPEC_GOTOFF:
13872 fputs ("@GOTOFF", file);
13873 break;
13874 case UNSPEC_PLTOFF:
13875 fputs ("@PLTOFF", file);
13876 break;
13877 case UNSPEC_PCREL:
13878 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13879 "(%rip)" : "[rip]", file);
13880 break;
13881 case UNSPEC_GOTPCREL:
13882 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13883 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13884 break;
13885 case UNSPEC_GOTTPOFF:
13886 /* FIXME: This might be @TPOFF in Sun ld too. */
13887 fputs ("@gottpoff", file);
13888 break;
13889 case UNSPEC_TPOFF:
13890 fputs ("@tpoff", file);
13891 break;
13892 case UNSPEC_NTPOFF:
13893 if (TARGET_64BIT)
13894 fputs ("@tpoff", file);
13895 else
13896 fputs ("@ntpoff", file);
13897 break;
13898 case UNSPEC_DTPOFF:
13899 fputs ("@dtpoff", file);
13900 break;
13901 case UNSPEC_GOTNTPOFF:
13902 if (TARGET_64BIT)
13903 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13904 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13905 else
13906 fputs ("@gotntpoff", file);
13907 break;
13908 case UNSPEC_INDNTPOFF:
13909 fputs ("@indntpoff", file);
13910 break;
13911 #if TARGET_MACHO
13912 case UNSPEC_MACHOPIC_OFFSET:
13913 putc ('-', file);
13914 machopic_output_function_base_name (file);
13915 break;
13916 #endif
13917 default:
13918 output_operand_lossage ("invalid UNSPEC as operand");
13919 break;
13920 }
13921 break;
13922
13923 default:
13924 output_operand_lossage ("invalid expression as operand");
13925 }
13926 }
13927
13928 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13929 We need to emit DTP-relative relocations. */
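/* For example, assuming ASM_LONG expands to ".long", a 4-byte reference is
   emitted as ".long sym@dtpoff" and an 8-byte one as ".long sym@dtpoff, 0",
   i.e. with an explicit zero upper word appended.  */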
13930
13931 static void ATTRIBUTE_UNUSED
13932 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13933 {
13934 fputs (ASM_LONG, file);
13935 output_addr_const (file, x);
13936 fputs ("@dtpoff", file);
13937 switch (size)
13938 {
13939 case 4:
13940 break;
13941 case 8:
13942 fputs (", 0", file);
13943 break;
13944 default:
13945 gcc_unreachable ();
13946 }
13947 }
13948
13949 /* Return true if X is a representation of the PIC register. This copes
13950 with calls from ix86_find_base_term, where the register might have
13951 been replaced by a cselib value. */
13952
13953 static bool
13954 ix86_pic_register_p (rtx x)
13955 {
13956 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13957 return (pic_offset_table_rtx
13958 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13959 else
13960 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13961 }
13962
13963 /* Helper function for ix86_delegitimize_address.
13964 Attempt to delegitimize TLS local-exec accesses. */
13965
13966 static rtx
13967 ix86_delegitimize_tls_address (rtx orig_x)
13968 {
13969 rtx x = orig_x, unspec;
13970 struct ix86_address addr;
13971
13972 if (!TARGET_TLS_DIRECT_SEG_REFS)
13973 return orig_x;
13974 if (MEM_P (x))
13975 x = XEXP (x, 0);
13976 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13977 return orig_x;
13978 if (ix86_decompose_address (x, &addr) == 0
13979 || addr.seg != DEFAULT_TLS_SEG_REG
13980 || addr.disp == NULL_RTX
13981 || GET_CODE (addr.disp) != CONST)
13982 return orig_x;
13983 unspec = XEXP (addr.disp, 0);
13984 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13985 unspec = XEXP (unspec, 0);
13986 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13987 return orig_x;
13988 x = XVECEXP (unspec, 0, 0);
13989 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13990 if (unspec != XEXP (addr.disp, 0))
13991 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13992 if (addr.index)
13993 {
13994 rtx idx = addr.index;
13995 if (addr.scale != 1)
13996 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13997 x = gen_rtx_PLUS (Pmode, idx, x);
13998 }
13999 if (addr.base)
14000 x = gen_rtx_PLUS (Pmode, addr.base, x);
14001 if (MEM_P (orig_x))
14002 x = replace_equiv_address_nv (orig_x, x);
14003 return x;
14004 }
14005
14006 /* In the name of slightly smaller debug output, and to cater to
14007 general assembler lossage, recognize PIC+GOTOFF and turn it back
14008 into a direct symbol reference.
14009
14010 On Darwin, this is necessary to avoid a crash, because Darwin
14011 has a different PIC label for each routine but the DWARF debugging
14012 information is not associated with any particular routine, so it's
14013 necessary to remove references to the PIC label from RTL stored by
14014 the DWARF output code. */
14015
14016 static rtx
14017 ix86_delegitimize_address (rtx x)
14018 {
14019 rtx orig_x = delegitimize_mem_from_attrs (x);
14020 /* addend is NULL or some rtx if x is something+GOTOFF where
14021 something doesn't include the PIC register. */
14022 rtx addend = NULL_RTX;
14023 /* reg_addend is NULL or a multiple of some register. */
14024 rtx reg_addend = NULL_RTX;
14025 /* const_addend is NULL or a const_int. */
14026 rtx const_addend = NULL_RTX;
14027 /* This is the result, or NULL. */
14028 rtx result = NULL_RTX;
14029
14030 x = orig_x;
14031
14032 if (MEM_P (x))
14033 x = XEXP (x, 0);
14034
14035 if (TARGET_64BIT)
14036 {
14037 if (GET_CODE (x) == CONST
14038 && GET_CODE (XEXP (x, 0)) == PLUS
14039 && GET_MODE (XEXP (x, 0)) == Pmode
14040 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
14041 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
14042 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
14043 {
14044 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
14045 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
14046 if (MEM_P (orig_x))
14047 x = replace_equiv_address_nv (orig_x, x);
14048 return x;
14049 }
14050
14051 if (GET_CODE (x) == CONST
14052 && GET_CODE (XEXP (x, 0)) == UNSPEC
14053 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
14054 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
14055 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
14056 {
14057 x = XVECEXP (XEXP (x, 0), 0, 0);
14058 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
14059 {
14060 x = simplify_gen_subreg (GET_MODE (orig_x), x,
14061 GET_MODE (x), 0);
14062 if (x == NULL_RTX)
14063 return orig_x;
14064 }
14065 return x;
14066 }
14067
14068 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
14069 return ix86_delegitimize_tls_address (orig_x);
14070
14071 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
14072 and -mcmodel=medium -fpic. */
14073 }
14074
14075 if (GET_CODE (x) != PLUS
14076 || GET_CODE (XEXP (x, 1)) != CONST)
14077 return ix86_delegitimize_tls_address (orig_x);
14078
14079 if (ix86_pic_register_p (XEXP (x, 0)))
14080 /* %ebx + GOT/GOTOFF */
14081 ;
14082 else if (GET_CODE (XEXP (x, 0)) == PLUS)
14083 {
14084 /* %ebx + %reg * scale + GOT/GOTOFF */
14085 reg_addend = XEXP (x, 0);
14086 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
14087 reg_addend = XEXP (reg_addend, 1);
14088 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
14089 reg_addend = XEXP (reg_addend, 0);
14090 else
14091 {
14092 reg_addend = NULL_RTX;
14093 addend = XEXP (x, 0);
14094 }
14095 }
14096 else
14097 addend = XEXP (x, 0);
14098
14099 x = XEXP (XEXP (x, 1), 0);
14100 if (GET_CODE (x) == PLUS
14101 && CONST_INT_P (XEXP (x, 1)))
14102 {
14103 const_addend = XEXP (x, 1);
14104 x = XEXP (x, 0);
14105 }
14106
14107 if (GET_CODE (x) == UNSPEC
14108 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
14109 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
14110 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
14111 && !MEM_P (orig_x) && !addend)))
14112 result = XVECEXP (x, 0, 0);
14113
14114 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
14115 && !MEM_P (orig_x))
14116 result = XVECEXP (x, 0, 0);
14117
14118 if (! result)
14119 return ix86_delegitimize_tls_address (orig_x);
14120
14121 if (const_addend)
14122 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
14123 if (reg_addend)
14124 result = gen_rtx_PLUS (Pmode, reg_addend, result);
14125 if (addend)
14126 {
14127 /* If the rest of original X doesn't involve the PIC register, add
14128 addend and subtract pic_offset_table_rtx. This can happen e.g.
14129 for code like:
14130 leal (%ebx, %ecx, 4), %ecx
14131 ...
14132 movl foo@GOTOFF(%ecx), %edx
14133 in which case we return (%ecx - %ebx) + foo. */
14134 if (pic_offset_table_rtx)
14135 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
14136 pic_offset_table_rtx),
14137 result);
14138 else
14139 return orig_x;
14140 }
14141 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
14142 {
14143 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
14144 if (result == NULL_RTX)
14145 return orig_x;
14146 }
14147 return result;
14148 }
14149
14150 /* If X is a machine specific address (i.e. a symbol or label being
14151 referenced as a displacement from the GOT implemented using an
14152 UNSPEC), then return the base term. Otherwise return X. */
14153
14154 rtx
14155 ix86_find_base_term (rtx x)
14156 {
14157 rtx term;
14158
14159 if (TARGET_64BIT)
14160 {
14161 if (GET_CODE (x) != CONST)
14162 return x;
14163 term = XEXP (x, 0);
14164 if (GET_CODE (term) == PLUS
14165 && (CONST_INT_P (XEXP (term, 1))
14166 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
14167 term = XEXP (term, 0);
14168 if (GET_CODE (term) != UNSPEC
14169 || (XINT (term, 1) != UNSPEC_GOTPCREL
14170 && XINT (term, 1) != UNSPEC_PCREL))
14171 return x;
14172
14173 return XVECEXP (term, 0, 0);
14174 }
14175
14176 return ix86_delegitimize_address (x);
14177 }
14178 \f
14179 static void
14180 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
14181 bool fp, FILE *file)
14182 {
14183 const char *suffix;
14184
14185 if (mode == CCFPmode || mode == CCFPUmode)
14186 {
14187 code = ix86_fp_compare_code_to_integer (code);
14188 mode = CCmode;
14189 }
14190 if (reverse)
14191 code = reverse_condition (code);
14192
14193 switch (code)
14194 {
14195 case EQ:
14196 switch (mode)
14197 {
14198 case CCAmode:
14199 suffix = "a";
14200 break;
14201
14202 case CCCmode:
14203 suffix = "c";
14204 break;
14205
14206 case CCOmode:
14207 suffix = "o";
14208 break;
14209
14210 case CCSmode:
14211 suffix = "s";
14212 break;
14213
14214 default:
14215 suffix = "e";
14216 }
14217 break;
14218 case NE:
14219 switch (mode)
14220 {
14221 case CCAmode:
14222 suffix = "na";
14223 break;
14224
14225 case CCCmode:
14226 suffix = "nc";
14227 break;
14228
14229 case CCOmode:
14230 suffix = "no";
14231 break;
14232
14233 case CCSmode:
14234 suffix = "ns";
14235 break;
14236
14237 default:
14238 suffix = "ne";
14239 }
14240 break;
14241 case GT:
14242 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
14243 suffix = "g";
14244 break;
14245 case GTU:
14246 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
14247 Those same assemblers have the same but opposite lossage on cmov. */
14248 if (mode == CCmode)
14249 suffix = fp ? "nbe" : "a";
14250 else
14251 gcc_unreachable ();
14252 break;
14253 case LT:
14254 switch (mode)
14255 {
14256 case CCNOmode:
14257 case CCGOCmode:
14258 suffix = "s";
14259 break;
14260
14261 case CCmode:
14262 case CCGCmode:
14263 suffix = "l";
14264 break;
14265
14266 default:
14267 gcc_unreachable ();
14268 }
14269 break;
14270 case LTU:
14271 if (mode == CCmode)
14272 suffix = "b";
14273 else if (mode == CCCmode)
14274 suffix = "c";
14275 else
14276 gcc_unreachable ();
14277 break;
14278 case GE:
14279 switch (mode)
14280 {
14281 case CCNOmode:
14282 case CCGOCmode:
14283 suffix = "ns";
14284 break;
14285
14286 case CCmode:
14287 case CCGCmode:
14288 suffix = "ge";
14289 break;
14290
14291 default:
14292 gcc_unreachable ();
14293 }
14294 break;
14295 case GEU:
14296 if (mode == CCmode)
14297 suffix = fp ? "nb" : "ae";
14298 else if (mode == CCCmode)
14299 suffix = "nc";
14300 else
14301 gcc_unreachable ();
14302 break;
14303 case LE:
14304 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
14305 suffix = "le";
14306 break;
14307 case LEU:
14308 if (mode == CCmode)
14309 suffix = "be";
14310 else
14311 gcc_unreachable ();
14312 break;
14313 case UNORDERED:
14314 suffix = fp ? "u" : "p";
14315 break;
14316 case ORDERED:
14317 suffix = fp ? "nu" : "np";
14318 break;
14319 default:
14320 gcc_unreachable ();
14321 }
14322 fputs (suffix, file);
14323 }
14324
14325 /* Print the name of register X to FILE based on its machine mode and number.
14326 If CODE is 'w', pretend the mode is HImode.
14327 If CODE is 'b', pretend the mode is QImode.
14328 If CODE is 'k', pretend the mode is SImode.
14329 If CODE is 'q', pretend the mode is DImode.
14330 If CODE is 'x', pretend the mode is V4SFmode.
14331 If CODE is 't', pretend the mode is V8SFmode.
14332 If CODE is 'g', pretend the mode is V16SFmode.
14333 If CODE is 'h', pretend the reg is the 'high' byte register.
14334 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
14335 If CODE is 'd', duplicate the operand for AVX instruction.
14336 */
14337
14338 void
14339 print_reg (rtx x, int code, FILE *file)
14340 {
14341 const char *reg;
14342 unsigned int regno;
14343 bool duplicated = code == 'd' && TARGET_AVX;
14344
14345 if (ASSEMBLER_DIALECT == ASM_ATT)
14346 putc ('%', file);
14347
14348 if (x == pc_rtx)
14349 {
14350 gcc_assert (TARGET_64BIT);
14351 fputs ("rip", file);
14352 return;
14353 }
14354
14355 regno = true_regnum (x);
14356 gcc_assert (regno != ARG_POINTER_REGNUM
14357 && regno != FRAME_POINTER_REGNUM
14358 && regno != FLAGS_REG
14359 && regno != FPSR_REG
14360 && regno != FPCR_REG);
14361
14362 if (code == 'w' || MMX_REG_P (x))
14363 code = 2;
14364 else if (code == 'b')
14365 code = 1;
14366 else if (code == 'k')
14367 code = 4;
14368 else if (code == 'q')
14369 code = 8;
14370 else if (code == 'y')
14371 code = 3;
14372 else if (code == 'h')
14373 code = 0;
14374 else if (code == 'x')
14375 code = 16;
14376 else if (code == 't')
14377 code = 32;
14378 else if (code == 'g')
14379 code = 64;
14380 else
14381 code = GET_MODE_SIZE (GET_MODE (x));
14382
14383 /* Irritatingly, the AMD extended registers use a different naming convention
14384 from the normal registers: "r%d[bwd]". */
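/* For example, hard register r8 prints as "r8d" with code 'k', "r8w" with
   'w', "r8b" with 'b', and plain "r8" for a 64-bit (DImode) operand.  */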
14385 if (REX_INT_REGNO_P (regno))
14386 {
14387 gcc_assert (TARGET_64BIT);
14388 putc ('r', file);
14389 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
14390 switch (code)
14391 {
14392 case 0:
14393 error ("extended registers have no high halves");
14394 break;
14395 case 1:
14396 putc ('b', file);
14397 break;
14398 case 2:
14399 putc ('w', file);
14400 break;
14401 case 4:
14402 putc ('d', file);
14403 break;
14404 case 8:
14405 /* no suffix */
14406 break;
14407 default:
14408 error ("unsupported operand size for extended register");
14409 break;
14410 }
14411 return;
14412 }
14413
14414 reg = NULL;
14415 switch (code)
14416 {
14417 case 3:
14418 if (STACK_TOP_P (x))
14419 {
14420 reg = "st(0)";
14421 break;
14422 }
14423 /* FALLTHRU */
14424 case 8:
14425 case 4:
14426 case 12:
14427 if (! ANY_FP_REG_P (x) && ! ANY_BND_REG_P (x))
14428 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14429 /* FALLTHRU */
14430 case 16:
14431 case 2:
14432 normal:
14433 reg = hi_reg_name[regno];
14434 break;
14435 case 1:
14436 if (regno >= ARRAY_SIZE (qi_reg_name))
14437 goto normal;
14438 reg = qi_reg_name[regno];
14439 break;
14440 case 0:
14441 if (regno >= ARRAY_SIZE (qi_high_reg_name))
14442 goto normal;
14443 reg = qi_high_reg_name[regno];
14444 break;
14445 case 32:
14446 if (SSE_REG_P (x))
14447 {
14448 gcc_assert (!duplicated);
14449 putc ('y', file);
14450 fputs (hi_reg_name[regno] + 1, file);
14451 return;
14452 }
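	/* FALLTHRU */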
14453 case 64:
14454 if (SSE_REG_P (x))
14455 {
14456 gcc_assert (!duplicated);
14457 putc ('z', file);
14458 fputs (hi_reg_name[REGNO (x)] + 1, file);
14459 return;
14460 }
14461 break;
14462 default:
14463 gcc_unreachable ();
14464 }
14465
14466 fputs (reg, file);
14467 if (duplicated)
14468 {
14469 if (ASSEMBLER_DIALECT == ASM_ATT)
14470 fprintf (file, ", %%%s", reg);
14471 else
14472 fprintf (file, ", %s", reg);
14473 }
14474 }
14475
14476 /* Locate some local-dynamic symbol still in use by this function
14477 so that we can print its name in some tls_local_dynamic_base
14478 pattern. */
14479
14480 static int
14481 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14482 {
14483 rtx x = *px;
14484
14485 if (GET_CODE (x) == SYMBOL_REF
14486 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14487 {
14488 cfun->machine->some_ld_name = XSTR (x, 0);
14489 return 1;
14490 }
14491
14492 return 0;
14493 }
14494
14495 static const char *
14496 get_some_local_dynamic_name (void)
14497 {
14498 rtx insn;
14499
14500 if (cfun->machine->some_ld_name)
14501 return cfun->machine->some_ld_name;
14502
14503 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14504 if (NONDEBUG_INSN_P (insn)
14505 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14506 return cfun->machine->some_ld_name;
14507
14508 return NULL;
14509 }
14510
14511 /* Meaning of CODE:
14512 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14513 C -- print opcode suffix for set/cmov insn.
14514 c -- like C, but print reversed condition
14515 F,f -- likewise, but for floating-point.
14516 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14517 otherwise nothing
14518 R -- print the prefix for register names.
14519 z -- print the opcode suffix for the size of the current operand.
14520 Z -- likewise, with special suffixes for x87 instructions.
14521 * -- print a star (in certain assembler syntax)
14522 A -- print an absolute memory reference.
14523 E -- print address with DImode register names if TARGET_64BIT.
14524 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14525 s -- print a shift double count, followed by the assembler's argument
14526 delimiter.
14527 b -- print the QImode name of the register for the indicated operand.
14528 %b0 would print %al if operands[0] is reg 0.
14529 w -- likewise, print the HImode name of the register.
14530 k -- likewise, print the SImode name of the register.
14531 q -- likewise, print the DImode name of the register.
14532 x -- likewise, print the V4SFmode name of the register.
14533 t -- likewise, print the V8SFmode name of the register.
14534 g -- likewise, print the V16SFmode name of the register.
14535 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14536 y -- print "st(0)" instead of "st" as a register.
14537 d -- print duplicated register operand for AVX instruction.
14538 D -- print condition for SSE cmp instruction.
14539 P -- if PIC, print an @PLT suffix.
14540 p -- print raw symbol name.
14541 X -- don't print any sort of PIC '@' suffix for a symbol.
14542 & -- print some in-use local-dynamic symbol name.
14543 H -- print a memory address offset by 8; used for sse high-parts
14544 Y -- print condition for XOP pcom* instruction.
14545 + -- print a branch hint as 'cs' or 'ds' prefix
14546 ; -- print a semicolon (after prefixes due to bug in older gas).
14547 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14548 @ -- print a segment register of thread base pointer load
14549 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14550 ! -- print MPX prefix for jxx/call/ret instructions if required.
14551 */
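/* For example, if operands[0] is hard register 0, "%k0" selects the SImode
   name (eax), "%w0" the HImode name (ax), "%b0" the QImode name (al) and
   "%h0" the high-byte name (ah).  */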
14552
14553 void
14554 ix86_print_operand (FILE *file, rtx x, int code)
14555 {
14556 if (code)
14557 {
14558 switch (code)
14559 {
14560 case 'A':
14561 switch (ASSEMBLER_DIALECT)
14562 {
14563 case ASM_ATT:
14564 putc ('*', file);
14565 break;
14566
14567 case ASM_INTEL:
14568 /* Intel syntax. For absolute addresses, registers should not
14569 be surrounded by brackets. */
14570 if (!REG_P (x))
14571 {
14572 putc ('[', file);
14573 ix86_print_operand (file, x, 0);
14574 putc (']', file);
14575 return;
14576 }
14577 break;
14578
14579 default:
14580 gcc_unreachable ();
14581 }
14582
14583 ix86_print_operand (file, x, 0);
14584 return;
14585
14586 case 'E':
14587 /* Wrap address in an UNSPEC to declare special handling. */
14588 if (TARGET_64BIT)
14589 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14590
14591 output_address (x);
14592 return;
14593
14594 case 'L':
14595 if (ASSEMBLER_DIALECT == ASM_ATT)
14596 putc ('l', file);
14597 return;
14598
14599 case 'W':
14600 if (ASSEMBLER_DIALECT == ASM_ATT)
14601 putc ('w', file);
14602 return;
14603
14604 case 'B':
14605 if (ASSEMBLER_DIALECT == ASM_ATT)
14606 putc ('b', file);
14607 return;
14608
14609 case 'Q':
14610 if (ASSEMBLER_DIALECT == ASM_ATT)
14611 putc ('l', file);
14612 return;
14613
14614 case 'S':
14615 if (ASSEMBLER_DIALECT == ASM_ATT)
14616 putc ('s', file);
14617 return;
14618
14619 case 'T':
14620 if (ASSEMBLER_DIALECT == ASM_ATT)
14621 putc ('t', file);
14622 return;
14623
14624 case 'O':
14625 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14626 if (ASSEMBLER_DIALECT != ASM_ATT)
14627 return;
14628
14629 switch (GET_MODE_SIZE (GET_MODE (x)))
14630 {
14631 case 2:
14632 putc ('w', file);
14633 break;
14634
14635 case 4:
14636 putc ('l', file);
14637 break;
14638
14639 case 8:
14640 putc ('q', file);
14641 break;
14642
14643 default:
14644 output_operand_lossage
14645 ("invalid operand size for operand code 'O'");
14646 return;
14647 }
14648
14649 putc ('.', file);
14650 #endif
14651 return;
14652
14653 case 'z':
14654 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14655 {
14656 /* Opcodes don't get size suffixes if using Intel syntax. */
14657 if (ASSEMBLER_DIALECT == ASM_INTEL)
14658 return;
14659
14660 switch (GET_MODE_SIZE (GET_MODE (x)))
14661 {
14662 case 1:
14663 putc ('b', file);
14664 return;
14665
14666 case 2:
14667 putc ('w', file);
14668 return;
14669
14670 case 4:
14671 putc ('l', file);
14672 return;
14673
14674 case 8:
14675 putc ('q', file);
14676 return;
14677
14678 default:
14679 output_operand_lossage
14680 ("invalid operand size for operand code 'z'");
14681 return;
14682 }
14683 }
14684
14685 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14686 warning
14687 (0, "non-integer operand used with operand code 'z'");
14688 /* FALLTHRU */
14689
14690 case 'Z':
14691 /* 387 opcodes don't get size suffixes if using Intel syntax. */
14692 if (ASSEMBLER_DIALECT == ASM_INTEL)
14693 return;
14694
14695 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14696 {
14697 switch (GET_MODE_SIZE (GET_MODE (x)))
14698 {
14699 case 2:
14700 #ifdef HAVE_AS_IX86_FILDS
14701 putc ('s', file);
14702 #endif
14703 return;
14704
14705 case 4:
14706 putc ('l', file);
14707 return;
14708
14709 case 8:
14710 #ifdef HAVE_AS_IX86_FILDQ
14711 putc ('q', file);
14712 #else
14713 fputs ("ll", file);
14714 #endif
14715 return;
14716
14717 default:
14718 break;
14719 }
14720 }
14721 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14722 {
14723 /* 387 opcodes don't get size suffixes
14724 if the operands are registers. */
14725 if (STACK_REG_P (x))
14726 return;
14727
14728 switch (GET_MODE_SIZE (GET_MODE (x)))
14729 {
14730 case 4:
14731 putc ('s', file);
14732 return;
14733
14734 case 8:
14735 putc ('l', file);
14736 return;
14737
14738 case 12:
14739 case 16:
14740 putc ('t', file);
14741 return;
14742
14743 default:
14744 break;
14745 }
14746 }
14747 else
14748 {
14749 output_operand_lossage
14750 ("invalid operand type used with operand code 'Z'");
14751 return;
14752 }
14753
14754 output_operand_lossage
14755 ("invalid operand size for operand code 'Z'");
14756 return;
14757
14758 case 'd':
14759 case 'b':
14760 case 'w':
14761 case 'k':
14762 case 'q':
14763 case 'h':
14764 case 't':
14765 case 'g':
14766 case 'y':
14767 case 'x':
14768 case 'X':
14769 case 'P':
14770 case 'p':
14771 break;
14772
14773 case 's':
14774 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14775 {
14776 ix86_print_operand (file, x, 0);
14777 fputs (", ", file);
14778 }
14779 return;
14780
14781 case 'Y':
14782 switch (GET_CODE (x))
14783 {
14784 case NE:
14785 fputs ("neq", file);
14786 break;
14787 case EQ:
14788 fputs ("eq", file);
14789 break;
14790 case GE:
14791 case GEU:
14792 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14793 break;
14794 case GT:
14795 case GTU:
14796 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14797 break;
14798 case LE:
14799 case LEU:
14800 fputs ("le", file);
14801 break;
14802 case LT:
14803 case LTU:
14804 fputs ("lt", file);
14805 break;
14806 case UNORDERED:
14807 fputs ("unord", file);
14808 break;
14809 case ORDERED:
14810 fputs ("ord", file);
14811 break;
14812 case UNEQ:
14813 fputs ("ueq", file);
14814 break;
14815 case UNGE:
14816 fputs ("nlt", file);
14817 break;
14818 case UNGT:
14819 fputs ("nle", file);
14820 break;
14821 case UNLE:
14822 fputs ("ule", file);
14823 break;
14824 case UNLT:
14825 fputs ("ult", file);
14826 break;
14827 case LTGT:
14828 fputs ("une", file);
14829 break;
14830 default:
14831 output_operand_lossage ("operand is not a condition code, "
14832 "invalid operand code 'Y'");
14833 return;
14834 }
14835 return;
14836
14837 case 'D':
14838 /* A little bit of brain damage here. The SSE compare instructions
14839 use completely different names for the comparisons than the
14840 fp conditional moves do. */
14841 switch (GET_CODE (x))
14842 {
14843 case UNEQ:
14844 if (TARGET_AVX)
14845 {
14846 fputs ("eq_us", file);
14847 break;
14848 }
14849 case EQ:
14850 fputs ("eq", file);
14851 break;
14852 case UNLT:
14853 if (TARGET_AVX)
14854 {
14855 fputs ("nge", file);
14856 break;
14857 }
14858 case LT:
14859 fputs ("lt", file);
14860 break;
14861 case UNLE:
14862 if (TARGET_AVX)
14863 {
14864 fputs ("ngt", file);
14865 break;
14866 }
14867 case LE:
14868 fputs ("le", file);
14869 break;
14870 case UNORDERED:
14871 fputs ("unord", file);
14872 break;
14873 case LTGT:
14874 if (TARGET_AVX)
14875 {
14876 fputs ("neq_oq", file);
14877 break;
14878 }
14879 case NE:
14880 fputs ("neq", file);
14881 break;
14882 case GE:
14883 if (TARGET_AVX)
14884 {
14885 fputs ("ge", file);
14886 break;
14887 }
14888 case UNGE:
14889 fputs ("nlt", file);
14890 break;
14891 case GT:
14892 if (TARGET_AVX)
14893 {
14894 fputs ("gt", file);
14895 break;
14896 }
14897 case UNGT:
14898 fputs ("nle", file);
14899 break;
14900 case ORDERED:
14901 fputs ("ord", file);
14902 break;
14903 default:
14904 output_operand_lossage ("operand is not a condition code, "
14905 "invalid operand code 'D'");
14906 return;
14907 }
14908 return;
14909
14910 case 'F':
14911 case 'f':
14912 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14913 if (ASSEMBLER_DIALECT == ASM_ATT)
14914 putc ('.', file);
14915 #endif
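	  /* FALLTHRU */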
14916
14917 case 'C':
14918 case 'c':
14919 if (!COMPARISON_P (x))
14920 {
14921 output_operand_lossage ("operand is not a condition code, "
14922 "invalid operand code '%c'", code);
14923 return;
14924 }
14925 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14926 code == 'c' || code == 'f',
14927 code == 'F' || code == 'f',
14928 file);
14929 return;
14930
14931 case 'H':
14932 if (!offsettable_memref_p (x))
14933 {
14934 output_operand_lossage ("operand is not an offsettable memory "
14935 "reference, invalid operand code 'H'");
14936 return;
14937 }
14938 /* It doesn't actually matter what mode we use here, as we're
14939 only going to use this for printing. */
14940 x = adjust_address_nv (x, DImode, 8);
14941 /* Output 'qword ptr' for intel assembler dialect. */
14942 if (ASSEMBLER_DIALECT == ASM_INTEL)
14943 code = 'q';
14944 break;
14945
14946 case 'K':
14947 gcc_assert (CONST_INT_P (x));
14948
14949 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14950 #ifdef HAVE_AS_IX86_HLE
14951 fputs ("xacquire ", file);
14952 #else
14953 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14954 #endif
14955 else if (INTVAL (x) & IX86_HLE_RELEASE)
14956 #ifdef HAVE_AS_IX86_HLE
14957 fputs ("xrelease ", file);
14958 #else
14959 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14960 #endif
14961 /* We do not want to print the value of the operand. */
14962 return;
14963
14964 case 'N':
14965 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
14966 fputs ("{z}", file);
14967 return;
14968
14969 case '*':
14970 if (ASSEMBLER_DIALECT == ASM_ATT)
14971 putc ('*', file);
14972 return;
14973
14974 case '&':
14975 {
14976 const char *name = get_some_local_dynamic_name ();
14977 if (name == NULL)
14978 output_operand_lossage ("'%%&' used without any "
14979 "local dynamic TLS references");
14980 else
14981 assemble_name (file, name);
14982 return;
14983 }
14984
14985 case '+':
14986 {
14987 rtx x;
14988
14989 if (!optimize
14990 || optimize_function_for_size_p (cfun)
14991 || !TARGET_BRANCH_PREDICTION_HINTS)
14992 return;
14993
14994 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14995 if (x)
14996 {
14997 int pred_val = XINT (x, 0);
14998
14999 if (pred_val < REG_BR_PROB_BASE * 45 / 100
15000 || pred_val > REG_BR_PROB_BASE * 55 / 100)
15001 {
15002 bool taken = pred_val > REG_BR_PROB_BASE / 2;
15003 bool cputaken
15004 = final_forward_branch_p (current_output_insn) == 0;
15005
15006 /* Emit hints only when the default branch prediction
15007 heuristics would fail. */
15008 if (taken != cputaken)
15009 {
15010 /* We use 3e (DS) prefix for taken branches and
15011 2e (CS) prefix for not taken branches. */
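/* These are the CS (0x2e) and DS (0x3e) segment-override prefix bytes,
   which some processors interpret as static branch-prediction hints.  */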
15012 if (taken)
15013 fputs ("ds ; ", file);
15014 else
15015 fputs ("cs ; ", file);
15016 }
15017 }
15018 }
15019 return;
15020 }
15021
15022 case ';':
15023 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
15024 putc (';', file);
15025 #endif
15026 return;
15027
15028 case '@':
15029 if (ASSEMBLER_DIALECT == ASM_ATT)
15030 putc ('%', file);
15031
15032 /* The kernel uses a different segment register for performance
15033 reasons; a system call would not have to trash the userspace
15034 segment register, which would be expensive. */
15035 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
15036 fputs ("fs", file);
15037 else
15038 fputs ("gs", file);
15039 return;
15040
15041 case '~':
15042 putc (TARGET_AVX2 ? 'i' : 'f', file);
15043 return;
15044
15045 case '^':
15046 if (TARGET_64BIT && Pmode != word_mode)
15047 fputs ("addr32 ", file);
15048 return;
15049
15050 case '!':
15051 if (ix86_bnd_prefixed_insn_p (NULL_RTX))
15052 fputs ("bnd ", file);
15053 return;
15054
15055 default:
15056 output_operand_lossage ("invalid operand code '%c'", code);
15057 }
15058 }
15059
15060 if (REG_P (x))
15061 print_reg (x, code, file);
15062
15063 else if (MEM_P (x))
15064 {
15065 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
15066 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
15067 && GET_MODE (x) != BLKmode)
15068 {
15069 const char * size;
15070 switch (GET_MODE_SIZE (GET_MODE (x)))
15071 {
15072 case 1: size = "BYTE"; break;
15073 case 2: size = "WORD"; break;
15074 case 4: size = "DWORD"; break;
15075 case 8: size = "QWORD"; break;
15076 case 12: size = "TBYTE"; break;
15077 case 16:
15078 if (GET_MODE (x) == XFmode)
15079 size = "TBYTE";
15080 else
15081 size = "XMMWORD";
15082 break;
15083 case 32: size = "YMMWORD"; break;
15084 case 64: size = "ZMMWORD"; break;
15085 default:
15086 gcc_unreachable ();
15087 }
15088
15089 /* Check for explicit size override (codes 'b', 'w', 'k',
15090 'q' and 'x') */
15091 if (code == 'b')
15092 size = "BYTE";
15093 else if (code == 'w')
15094 size = "WORD";
15095 else if (code == 'k')
15096 size = "DWORD";
15097 else if (code == 'q')
15098 size = "QWORD";
15099 else if (code == 'x')
15100 size = "XMMWORD";
15101
15102 fputs (size, file);
15103 fputs (" PTR ", file);
15104 }
15105
15106 x = XEXP (x, 0);
15107 /* Avoid (%rip) for call operands. */
15108 if (CONSTANT_ADDRESS_P (x) && code == 'P'
15109 && !CONST_INT_P (x))
15110 output_addr_const (file, x);
15111 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
15112 output_operand_lossage ("invalid constraints for operand");
15113 else
15114 output_address (x);
15115 }
15116
15117 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
15118 {
15119 REAL_VALUE_TYPE r;
15120 long l;
15121
15122 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
15123 REAL_VALUE_TO_TARGET_SINGLE (r, l);
15124
15125 if (ASSEMBLER_DIALECT == ASM_ATT)
15126 putc ('$', file);
15127 /* Sign extend 32bit SFmode immediate to 8 bytes. */
15128 if (code == 'q')
15129 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
15130 (unsigned long long) (int) l);
15131 else
15132 fprintf (file, "0x%08x", (unsigned int) l);
15133 }
15134
15135 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
15136 {
15137 REAL_VALUE_TYPE r;
15138 long l[2];
15139
15140 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
15141 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
15142
15143 if (ASSEMBLER_DIALECT == ASM_ATT)
15144 putc ('$', file);
15145 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
15146 }
15147
15148 /* These float cases don't actually occur as immediate operands. */
15149 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
15150 {
15151 char dstr[30];
15152
15153 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
15154 fputs (dstr, file);
15155 }
15156
15157 else
15158 {
15159 /* We have patterns that allow zero sets of memory, for instance.
15160 In 64-bit mode, we should probably support all 8-byte vectors,
15161 since we can in fact encode that into an immediate. */
15162 if (GET_CODE (x) == CONST_VECTOR)
15163 {
15164 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
15165 x = const0_rtx;
15166 }
15167
15168 if (code != 'P' && code != 'p')
15169 {
15170 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
15171 {
15172 if (ASSEMBLER_DIALECT == ASM_ATT)
15173 putc ('$', file);
15174 }
15175 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
15176 || GET_CODE (x) == LABEL_REF)
15177 {
15178 if (ASSEMBLER_DIALECT == ASM_ATT)
15179 putc ('$', file);
15180 else
15181 fputs ("OFFSET FLAT:", file);
15182 }
15183 }
15184 if (CONST_INT_P (x))
15185 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
15186 else if (flag_pic || MACHOPIC_INDIRECT)
15187 output_pic_addr_const (file, x, code);
15188 else
15189 output_addr_const (file, x);
15190 }
15191 }
15192
15193 static bool
15194 ix86_print_operand_punct_valid_p (unsigned char code)
15195 {
15196 return (code == '@' || code == '*' || code == '+' || code == '&'
15197 || code == ';' || code == '~' || code == '^' || code == '!');
15198 }
15199 \f
15200 /* Print a memory operand whose address is ADDR. */
15201
15202 static void
15203 ix86_print_operand_address (FILE *file, rtx addr)
15204 {
15205 struct ix86_address parts;
15206 rtx base, index, disp;
15207 int scale;
15208 int ok;
15209 bool vsib = false;
15210 int code = 0;
15211
15212 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
15213 {
15214 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15215 gcc_assert (parts.index == NULL_RTX);
15216 parts.index = XVECEXP (addr, 0, 1);
15217 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
15218 addr = XVECEXP (addr, 0, 0);
15219 vsib = true;
15220 }
15221 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
15222 {
15223 gcc_assert (TARGET_64BIT);
15224 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15225 code = 'q';
15226 }
15227 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
15228 {
15229 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
15230 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
15231 if (parts.base != NULL_RTX)
15232 {
15233 parts.index = parts.base;
15234 parts.scale = 1;
15235 }
15236 parts.base = XVECEXP (addr, 0, 0);
15237 addr = XVECEXP (addr, 0, 0);
15238 }
15239 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
15240 {
15241 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15242 gcc_assert (parts.index == NULL_RTX);
15243 parts.index = XVECEXP (addr, 0, 1);
15244 addr = XVECEXP (addr, 0, 0);
15245 }
15246 else
15247 ok = ix86_decompose_address (addr, &parts);
15248
15249 gcc_assert (ok);
15250
15251 base = parts.base;
15252 index = parts.index;
15253 disp = parts.disp;
15254 scale = parts.scale;
15255
15256 switch (parts.seg)
15257 {
15258 case SEG_DEFAULT:
15259 break;
15260 case SEG_FS:
15261 case SEG_GS:
15262 if (ASSEMBLER_DIALECT == ASM_ATT)
15263 putc ('%', file);
15264 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
15265 break;
15266 default:
15267 gcc_unreachable ();
15268 }
15269
15270 /* Use the one-byte-shorter RIP-relative addressing in 64-bit mode. */
15271 if (TARGET_64BIT && !base && !index)
15272 {
15273 rtx symbol = disp;
15274
15275 if (GET_CODE (disp) == CONST
15276 && GET_CODE (XEXP (disp, 0)) == PLUS
15277 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15278 symbol = XEXP (XEXP (disp, 0), 0);
15279
15280 if (GET_CODE (symbol) == LABEL_REF
15281 || (GET_CODE (symbol) == SYMBOL_REF
15282 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
15283 base = pc_rtx;
15284 }
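/* With BASE set to pc_rtx, print_reg emits "rip", so in AT&T syntax the
   address is printed as e.g. "sym(%rip)" instead of an absolute "sym".  */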
15285 if (!base && !index)
15286 {
15287 /* A displacement-only address requires special attention. */
15288
15289 if (CONST_INT_P (disp))
15290 {
15291 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
15292 fputs ("ds:", file);
15293 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
15294 }
15295 else if (flag_pic)
15296 output_pic_addr_const (file, disp, 0);
15297 else
15298 output_addr_const (file, disp);
15299 }
15300 else
15301 {
15302 /* Print SImode register names to force addr32 prefix. */
15303 if (SImode_address_operand (addr, VOIDmode))
15304 {
15305 #ifdef ENABLE_CHECKING
15306 gcc_assert (TARGET_64BIT);
15307 switch (GET_CODE (addr))
15308 {
15309 case SUBREG:
15310 gcc_assert (GET_MODE (addr) == SImode);
15311 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
15312 break;
15313 case ZERO_EXTEND:
15314 case AND:
15315 gcc_assert (GET_MODE (addr) == DImode);
15316 break;
15317 default:
15318 gcc_unreachable ();
15319 }
15320 #endif
15321 gcc_assert (!code);
15322 code = 'k';
15323 }
15324 else if (code == 0
15325 && TARGET_X32
15326 && disp
15327 && CONST_INT_P (disp)
15328 && INTVAL (disp) < -16*1024*1024)
15329 {
15330 /* X32 runs in 64-bit mode, where displacement, DISP, in
15331 address DISP(%r64), is encoded as 32-bit immediate sign-
15332 extended from 32-bit to 64-bit. For -0x40000300(%r64),
15333 address is %r64 + 0xffffffffbffffd00. When %r64 <
15334 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
15335 which is invalid for x32. The correct address is %r64
15336 - 0x40000300 == 0xf7ffdd64. To properly encode
15337 -0x40000300(%r64) for x32, we zero-extend negative
15338 displacement by forcing addr32 prefix which truncates
15339 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
15340 zero-extend all negative displacements, including -1(%rsp).
15341 However, for small negative displacements, sign-extension
15342 won't cause overflow. We only zero-extend negative
15343 displacements if they are < -16*1024*1024, which is also used
15344 to check legitimate address displacements for PIC. */
15345 code = 'k';
15346 }
15347
15348 if (ASSEMBLER_DIALECT == ASM_ATT)
15349 {
15350 if (disp)
15351 {
15352 if (flag_pic)
15353 output_pic_addr_const (file, disp, 0);
15354 else if (GET_CODE (disp) == LABEL_REF)
15355 output_asm_label (disp);
15356 else
15357 output_addr_const (file, disp);
15358 }
15359
15360 putc ('(', file);
15361 if (base)
15362 print_reg (base, code, file);
15363 if (index)
15364 {
15365 putc (',', file);
15366 print_reg (index, vsib ? 0 : code, file);
15367 if (scale != 1 || vsib)
15368 fprintf (file, ",%d", scale);
15369 }
15370 putc (')', file);
15371 }
15372 else
15373 {
15374 rtx offset = NULL_RTX;
15375
15376 if (disp)
15377 {
15378 /* Pull out the offset of a symbol; print any symbol itself. */
15379 if (GET_CODE (disp) == CONST
15380 && GET_CODE (XEXP (disp, 0)) == PLUS
15381 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15382 {
15383 offset = XEXP (XEXP (disp, 0), 1);
15384 disp = gen_rtx_CONST (VOIDmode,
15385 XEXP (XEXP (disp, 0), 0));
15386 }
15387
15388 if (flag_pic)
15389 output_pic_addr_const (file, disp, 0);
15390 else if (GET_CODE (disp) == LABEL_REF)
15391 output_asm_label (disp);
15392 else if (CONST_INT_P (disp))
15393 offset = disp;
15394 else
15395 output_addr_const (file, disp);
15396 }
15397
15398 putc ('[', file);
15399 if (base)
15400 {
15401 print_reg (base, code, file);
15402 if (offset)
15403 {
15404 if (INTVAL (offset) >= 0)
15405 putc ('+', file);
15406 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15407 }
15408 }
15409 else if (offset)
15410 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15411 else
15412 putc ('0', file);
15413
15414 if (index)
15415 {
15416 putc ('+', file);
15417 print_reg (index, vsib ? 0 : code, file);
15418 if (scale != 1 || vsib)
15419 fprintf (file, "*%d", scale);
15420 }
15421 putc (']', file);
15422 }
15423 }
15424 }
15425
15426 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
15427
15428 static bool
15429 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15430 {
15431 rtx op;
15432
15433 if (GET_CODE (x) != UNSPEC)
15434 return false;
15435
15436 op = XVECEXP (x, 0, 0);
15437 switch (XINT (x, 1))
15438 {
15439 case UNSPEC_GOTTPOFF:
15440 output_addr_const (file, op);
15441 /* FIXME: This might be @TPOFF in Sun ld. */
15442 fputs ("@gottpoff", file);
15443 break;
15444 case UNSPEC_TPOFF:
15445 output_addr_const (file, op);
15446 fputs ("@tpoff", file);
15447 break;
15448 case UNSPEC_NTPOFF:
15449 output_addr_const (file, op);
15450 if (TARGET_64BIT)
15451 fputs ("@tpoff", file);
15452 else
15453 fputs ("@ntpoff", file);
15454 break;
15455 case UNSPEC_DTPOFF:
15456 output_addr_const (file, op);
15457 fputs ("@dtpoff", file);
15458 break;
15459 case UNSPEC_GOTNTPOFF:
15460 output_addr_const (file, op);
15461 if (TARGET_64BIT)
15462 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15463 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15464 else
15465 fputs ("@gotntpoff", file);
15466 break;
15467 case UNSPEC_INDNTPOFF:
15468 output_addr_const (file, op);
15469 fputs ("@indntpoff", file);
15470 break;
15471 #if TARGET_MACHO
15472 case UNSPEC_MACHOPIC_OFFSET:
15473 output_addr_const (file, op);
15474 putc ('-', file);
15475 machopic_output_function_base_name (file);
15476 break;
15477 #endif
15478
15479 case UNSPEC_STACK_CHECK:
15480 {
15481 int offset;
15482
15483 gcc_assert (flag_split_stack);
15484
15485 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15486 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15487 #else
15488 gcc_unreachable ();
15489 #endif
15490
15491 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15492 }
15493 break;
15494
15495 default:
15496 return false;
15497 }
15498
15499 return true;
15500 }
15501 \f
15502 /* Split one or more double-mode RTL references into pairs of half-mode
15503 references. The RTL can be REG, offsettable MEM, integer constant, or
15504 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15505 split and "num" is its length. lo_half and hi_half are output arrays
15506 that parallel "operands". */
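/* For example, a DImode MEM operand is split into two SImode MEMs at byte
   offsets 0 and 4, and a DImode REG into the corresponding SImode subregs
   (the little-endian layout used by the x86 targets).  */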
15507
15508 void
15509 split_double_mode (enum machine_mode mode, rtx operands[],
15510 int num, rtx lo_half[], rtx hi_half[])
15511 {
15512 enum machine_mode half_mode;
15513 unsigned int byte;
15514
15515 switch (mode)
15516 {
15517 case TImode:
15518 half_mode = DImode;
15519 break;
15520 case DImode:
15521 half_mode = SImode;
15522 break;
15523 default:
15524 gcc_unreachable ();
15525 }
15526
15527 byte = GET_MODE_SIZE (half_mode);
15528
15529 while (num--)
15530 {
15531 rtx op = operands[num];
15532
15533 /* simplify_subreg refuses to split volatile memory addresses,
15534 but we still have to handle them. */
15535 if (MEM_P (op))
15536 {
15537 lo_half[num] = adjust_address (op, half_mode, 0);
15538 hi_half[num] = adjust_address (op, half_mode, byte);
15539 }
15540 else
15541 {
15542 lo_half[num] = simplify_gen_subreg (half_mode, op,
15543 GET_MODE (op) == VOIDmode
15544 ? mode : GET_MODE (op), 0);
15545 hi_half[num] = simplify_gen_subreg (half_mode, op,
15546 GET_MODE (op) == VOIDmode
15547 ? mode : GET_MODE (op), byte);
15548 }
15549 }
15550 }
15551 \f
15552 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15553 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15554 is the expression of the binary operation. The output may either be
15555 emitted here, or returned to the caller, like all output_* functions.
15556
15557 There is no guarantee that the operands are the same mode, as they
15558 might be within FLOAT or FLOAT_EXTEND expressions. */
15559
15560 #ifndef SYSV386_COMPAT
15561 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15562 wants to fix the assemblers because that causes incompatibility
15563 with gcc. No-one wants to fix gcc because that causes
15564 incompatibility with assemblers... You can use the option of
15565 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15566 #define SYSV386_COMPAT 1
15567 #endif
15568
15569 const char *
15570 output_387_binary_op (rtx insn, rtx *operands)
15571 {
15572 static char buf[40];
15573 const char *p;
15574 const char *ssep;
15575 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15576
15577 #ifdef ENABLE_CHECKING
15578 /* Even if we do not want to check the inputs, this documents the input
15579 constraints, which helps in understanding the following code. */
15580 if (STACK_REG_P (operands[0])
15581 && ((REG_P (operands[1])
15582 && REGNO (operands[0]) == REGNO (operands[1])
15583 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15584 || (REG_P (operands[2])
15585 && REGNO (operands[0]) == REGNO (operands[2])
15586 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15587 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15588 ; /* ok */
15589 else
15590 gcc_assert (is_sse);
15591 #endif
15592
15593 switch (GET_CODE (operands[3]))
15594 {
15595 case PLUS:
15596 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15597 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15598 p = "fiadd";
15599 else
15600 p = "fadd";
15601 ssep = "vadd";
15602 break;
15603
15604 case MINUS:
15605 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15606 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15607 p = "fisub";
15608 else
15609 p = "fsub";
15610 ssep = "vsub";
15611 break;
15612
15613 case MULT:
15614 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15615 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15616 p = "fimul";
15617 else
15618 p = "fmul";
15619 ssep = "vmul";
15620 break;
15621
15622 case DIV:
15623 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15624 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15625 p = "fidiv";
15626 else
15627 p = "fdiv";
15628 ssep = "vdiv";
15629 break;
15630
15631 default:
15632 gcc_unreachable ();
15633 }
15634
15635 if (is_sse)
15636 {
15637 if (TARGET_AVX)
15638 {
15639 strcpy (buf, ssep);
15640 if (GET_MODE (operands[0]) == SFmode)
15641 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15642 else
15643 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15644 }
15645 else
15646 {
15647 strcpy (buf, ssep + 1);
15648 if (GET_MODE (operands[0]) == SFmode)
15649 strcat (buf, "ss\t{%2, %0|%0, %2}");
15650 else
15651 strcat (buf, "sd\t{%2, %0|%0, %2}");
15652 }
15653 return buf;
15654 }
15655 strcpy (buf, p);
15656
15657 switch (GET_CODE (operands[3]))
15658 {
15659 case MULT:
15660 case PLUS:
15661 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15662 {
15663 rtx temp = operands[2];
15664 operands[2] = operands[1];
15665 operands[1] = temp;
15666 }
15667
15668 /* We know operands[0] == operands[1]. */
15669
15670 if (MEM_P (operands[2]))
15671 {
15672 p = "%Z2\t%2";
15673 break;
15674 }
15675
15676 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15677 {
15678 if (STACK_TOP_P (operands[0]))
15679 /* How is it that we are storing to a dead operand[2]?
15680 Well, presumably operands[1] is dead too. We can't
15681 store the result to st(0) as st(0) gets popped on this
15682 instruction. Instead store to operands[2] (which I
15683 think has to be st(1)). st(1) will be popped later.
15684 gcc <= 2.8.1 didn't have this check and generated
15685 assembly code that the Unixware assembler rejected. */
15686 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15687 else
15688 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15689 break;
15690 }
15691
15692 if (STACK_TOP_P (operands[0]))
15693 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15694 else
15695 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15696 break;
15697
15698 case MINUS:
15699 case DIV:
15700 if (MEM_P (operands[1]))
15701 {
15702 p = "r%Z1\t%1";
15703 break;
15704 }
15705
15706 if (MEM_P (operands[2]))
15707 {
15708 p = "%Z2\t%2";
15709 break;
15710 }
15711
15712 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15713 {
15714 #if SYSV386_COMPAT
15715 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15716 derived assemblers, confusingly reverse the direction of
15717 the operation for fsub{r} and fdiv{r} when the
15718 destination register is not st(0). The Intel assembler
15719 doesn't have this brain damage. Read !SYSV386_COMPAT to
15720 figure out what the hardware really does. */
15721 if (STACK_TOP_P (operands[0]))
15722 p = "{p\t%0, %2|rp\t%2, %0}";
15723 else
15724 p = "{rp\t%2, %0|p\t%0, %2}";
15725 #else
15726 if (STACK_TOP_P (operands[0]))
15727 /* As above for fmul/fadd, we can't store to st(0). */
15728 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15729 else
15730 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15731 #endif
15732 break;
15733 }
15734
15735 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15736 {
15737 #if SYSV386_COMPAT
15738 if (STACK_TOP_P (operands[0]))
15739 p = "{rp\t%0, %1|p\t%1, %0}";
15740 else
15741 p = "{p\t%1, %0|rp\t%0, %1}";
15742 #else
15743 if (STACK_TOP_P (operands[0]))
15744 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15745 else
15746 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15747 #endif
15748 break;
15749 }
15750
15751 if (STACK_TOP_P (operands[0]))
15752 {
15753 if (STACK_TOP_P (operands[1]))
15754 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15755 else
15756 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15757 break;
15758 }
15759 else if (STACK_TOP_P (operands[1]))
15760 {
15761 #if SYSV386_COMPAT
15762 p = "{\t%1, %0|r\t%0, %1}";
15763 #else
15764 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15765 #endif
15766 }
15767 else
15768 {
15769 #if SYSV386_COMPAT
15770 p = "{r\t%2, %0|\t%0, %2}";
15771 #else
15772 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15773 #endif
15774 }
15775 break;
15776
15777 default:
15778 gcc_unreachable ();
15779 }
15780
15781 strcat (buf, p);
15782 return buf;
15783 }
15784
15785 /* Check if a 256bit AVX register is referenced inside EXP. */
15786
15787 static int
15788 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15789 {
15790 rtx exp = *pexp;
15791
15792 if (GET_CODE (exp) == SUBREG)
15793 exp = SUBREG_REG (exp);
15794
15795 if (REG_P (exp)
15796 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15797 return 1;
15798
15799 return 0;
15800 }
15801
15802 /* Return needed mode for entity in optimize_mode_switching pass. */
15803
15804 static int
15805 ix86_avx_u128_mode_needed (rtx insn)
15806 {
15807 if (CALL_P (insn))
15808 {
15809 rtx link;
15810
15811 /* Needed mode is set to AVX_U128_CLEAN if there are
15812 no 256bit modes used in function arguments. */
15813 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15814 link;
15815 link = XEXP (link, 1))
15816 {
15817 if (GET_CODE (XEXP (link, 0)) == USE)
15818 {
15819 rtx arg = XEXP (XEXP (link, 0), 0);
15820
15821 if (ix86_check_avx256_register (&arg, NULL))
15822 return AVX_U128_DIRTY;
15823 }
15824 }
15825
15826 return AVX_U128_CLEAN;
15827 }
15828
15829 /* Require DIRTY mode if a 256bit AVX register is referenced.  The hardware
15830 changes state only when a 256bit register is written to, but we need
15831 to prevent the compiler from moving the optimal insertion point above
15832 a possible read from a 256bit register. */
15833 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15834 return AVX_U128_DIRTY;
15835
15836 return AVX_U128_ANY;
15837 }
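/* Illustrative sketch (not part of the original code), assuming the
   SysV AMD64 ABI where __m256 arguments are passed in %ymm registers:

     extern void f (__m256 x);
     extern void g (int x);

   A call to f needs AVX_U128_DIRTY, since a 256bit register is live in
   the argument list and no vzeroupper may be placed before the call.
   A call to g needs AVX_U128_CLEAN, so a vzeroupper can be emitted
   before it to avoid the SSE/AVX transition penalty.  */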
15838
15839 /* Return mode that i387 must be switched into
15840 prior to the execution of insn. */
15841
15842 static int
15843 ix86_i387_mode_needed (int entity, rtx insn)
15844 {
15845 enum attr_i387_cw mode;
15846
15847 /* The mode UNINITIALIZED is used to store the control word after a
15848 function call or an ASM pattern.  The mode ANY specifies that the
15849 function has no requirements on the control word and makes no changes
15850 to the bits we are interested in. */
15851
15852 if (CALL_P (insn)
15853 || (NONJUMP_INSN_P (insn)
15854 && (asm_noperands (PATTERN (insn)) >= 0
15855 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15856 return I387_CW_UNINITIALIZED;
15857
15858 if (recog_memoized (insn) < 0)
15859 return I387_CW_ANY;
15860
15861 mode = get_attr_i387_cw (insn);
15862
15863 switch (entity)
15864 {
15865 case I387_TRUNC:
15866 if (mode == I387_CW_TRUNC)
15867 return mode;
15868 break;
15869
15870 case I387_FLOOR:
15871 if (mode == I387_CW_FLOOR)
15872 return mode;
15873 break;
15874
15875 case I387_CEIL:
15876 if (mode == I387_CW_CEIL)
15877 return mode;
15878 break;
15879
15880 case I387_MASK_PM:
15881 if (mode == I387_CW_MASK_PM)
15882 return mode;
15883 break;
15884
15885 default:
15886 gcc_unreachable ();
15887 }
15888
15889 return I387_CW_ANY;
15890 }
15891
15892 /* Return mode that entity must be switched into
15893 prior to the execution of insn. */
15894
15895 int
15896 ix86_mode_needed (int entity, rtx insn)
15897 {
15898 switch (entity)
15899 {
15900 case AVX_U128:
15901 return ix86_avx_u128_mode_needed (insn);
15902 case I387_TRUNC:
15903 case I387_FLOOR:
15904 case I387_CEIL:
15905 case I387_MASK_PM:
15906 return ix86_i387_mode_needed (entity, insn);
15907 default:
15908 gcc_unreachable ();
15909 }
15910 return 0;
15911 }
15912
15913 /* Check if a 256bit AVX register is referenced in stores. */
15914
15915 static void
15916 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15917 {
15918 if (ix86_check_avx256_register (&dest, NULL))
15919 {
15920 bool *used = (bool *) data;
15921 *used = true;
15922 }
15923 }
15924
15925 /* Calculate mode of upper 128bit AVX registers after the insn. */
15926
15927 static int
15928 ix86_avx_u128_mode_after (int mode, rtx insn)
15929 {
15930 rtx pat = PATTERN (insn);
15931
15932 if (vzeroupper_operation (pat, VOIDmode)
15933 || vzeroall_operation (pat, VOIDmode))
15934 return AVX_U128_CLEAN;
15935
15936 /* We know that state is clean after CALL insn if there are no
15937 256bit registers used in the function return register. */
15938 if (CALL_P (insn))
15939 {
15940 bool avx_reg256_found = false;
15941 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15942
15943 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
15944 }
15945
15946 /* Otherwise, return current mode. Remember that if insn
15947 references AVX 256bit registers, the mode was already changed
15948 to DIRTY from MODE_NEEDED. */
15949 return mode;
15950 }
15951
15952 /* Return the mode that an insn results in. */
15953
15954 int
15955 ix86_mode_after (int entity, int mode, rtx insn)
15956 {
15957 switch (entity)
15958 {
15959 case AVX_U128:
15960 return ix86_avx_u128_mode_after (mode, insn);
15961 case I387_TRUNC:
15962 case I387_FLOOR:
15963 case I387_CEIL:
15964 case I387_MASK_PM:
15965 return mode;
15966 default:
15967 gcc_unreachable ();
15968 }
15969 }
15970
15971 static int
15972 ix86_avx_u128_mode_entry (void)
15973 {
15974 tree arg;
15975
15976 /* Entry mode is set to AVX_U128_DIRTY if there are
15977 256bit modes used in function arguments. */
15978 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15979 arg = TREE_CHAIN (arg))
15980 {
15981 rtx incoming = DECL_INCOMING_RTL (arg);
15982
15983 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15984 return AVX_U128_DIRTY;
15985 }
15986
15987 return AVX_U128_CLEAN;
15988 }
15989
15990 /* Return a mode that ENTITY is assumed to be
15991 switched to at function entry. */
15992
15993 int
15994 ix86_mode_entry (int entity)
15995 {
15996 switch (entity)
15997 {
15998 case AVX_U128:
15999 return ix86_avx_u128_mode_entry ();
16000 case I387_TRUNC:
16001 case I387_FLOOR:
16002 case I387_CEIL:
16003 case I387_MASK_PM:
16004 return I387_CW_ANY;
16005 default:
16006 gcc_unreachable ();
16007 }
16008 }
16009
16010 static int
16011 ix86_avx_u128_mode_exit (void)
16012 {
16013 rtx reg = crtl->return_rtx;
16014
16015 /* Exit mode is set to AVX_U128_DIRTY if there are
16016 256bit modes used in the function return register. */
16017 if (reg && ix86_check_avx256_register (&reg, NULL))
16018 return AVX_U128_DIRTY;
16019
16020 return AVX_U128_CLEAN;
16021 }
16022
16023 /* Return a mode that ENTITY is assumed to be
16024 switched to at function exit. */
16025
16026 int
16027 ix86_mode_exit (int entity)
16028 {
16029 switch (entity)
16030 {
16031 case AVX_U128:
16032 return ix86_avx_u128_mode_exit ();
16033 case I387_TRUNC:
16034 case I387_FLOOR:
16035 case I387_CEIL:
16036 case I387_MASK_PM:
16037 return I387_CW_ANY;
16038 default:
16039 gcc_unreachable ();
16040 }
16041 }
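/* Hedged example of the entry/exit rules above (assuming an ABI where
   __m256 values are passed and returned in 256bit registers):

     __m256 f (__m256 x);   entry and exit mode are AVX_U128_DIRTY
     int    g (int x);      entry and exit mode are AVX_U128_CLEAN

   A DIRTY entry or exit mode prevents the mode-switching pass from
   inserting vzeroupper before the 256bit argument or return value has
   been consumed.  */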
16042
16043 /* Output code to initialize control word copies used by trunc?f?i and
16044 rounding patterns.  The current control word is read with fnstcw,
16045 adjusted according to MODE, and stored in the stack slot for MODE. */
16046
16047 static void
16048 emit_i387_cw_initialization (int mode)
16049 {
16050 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
16051 rtx new_mode;
16052
16053 enum ix86_stack_slot slot;
16054
16055 rtx reg = gen_reg_rtx (HImode);
16056
16057 emit_insn (gen_x86_fnstcw_1 (stored_mode));
16058 emit_move_insn (reg, copy_rtx (stored_mode));
16059
16060 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
16061 || optimize_insn_for_size_p ())
16062 {
16063 switch (mode)
16064 {
16065 case I387_CW_TRUNC:
16066 /* round toward zero (truncate) */
16067 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
16068 slot = SLOT_CW_TRUNC;
16069 break;
16070
16071 case I387_CW_FLOOR:
16072 /* round down toward -oo */
16073 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
16074 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
16075 slot = SLOT_CW_FLOOR;
16076 break;
16077
16078 case I387_CW_CEIL:
16079 /* round up toward +oo */
16080 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
16081 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
16082 slot = SLOT_CW_CEIL;
16083 break;
16084
16085 case I387_CW_MASK_PM:
16086 /* mask precision exception for nearbyint() */
16087 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
16088 slot = SLOT_CW_MASK_PM;
16089 break;
16090
16091 default:
16092 gcc_unreachable ();
16093 }
16094 }
16095 else
16096 {
16097 switch (mode)
16098 {
16099 case I387_CW_TRUNC:
16100 /* round toward zero (truncate) */
16101 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
16102 slot = SLOT_CW_TRUNC;
16103 break;
16104
16105 case I387_CW_FLOOR:
16106 /* round down toward -oo */
16107 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
16108 slot = SLOT_CW_FLOOR;
16109 break;
16110
16111 case I387_CW_CEIL:
16112 /* round up toward +oo */
16113 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
16114 slot = SLOT_CW_CEIL;
16115 break;
16116
16117 case I387_CW_MASK_PM:
16118 /* mask precision exception for nearbyint() */
16119 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
16120 slot = SLOT_CW_MASK_PM;
16121 break;
16122
16123 default:
16124 gcc_unreachable ();
16125 }
16126 }
16127
16128 gcc_assert (slot < MAX_386_STACK_LOCALS);
16129
16130 new_mode = assign_386_stack_local (HImode, slot);
16131 emit_move_insn (new_mode, reg);
16132 }
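/* For reference, a sketch of the control word bits manipulated above
   (rounding control is the two-bit field masked by 0x0c00, and 0x0020
   is the precision-exception mask bit):

     trunc:     cw | 0x0c00                RC = 11, round toward zero
     floor:     (cw & ~0x0c00) | 0x0400    RC = 01, round toward -oo
     ceil:      (cw & ~0x0c00) | 0x0800    RC = 10, round toward +oo
     nearbyint: cw | 0x0020                mask the precision exception

   The adjusted word is kept in a stack slot so that fldcw can switch to
   it around the dependent instruction.  */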
16133
16134 /* Emit vzeroupper. */
16135
16136 void
16137 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
16138 {
16139 int i;
16140
16141 /* Cancel automatic vzeroupper insertion if there are
16142 live call-saved SSE registers at the insertion point. */
16143
16144 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
16145 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
16146 return;
16147
16148 if (TARGET_64BIT)
16149 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
16150 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
16151 return;
16152
16153 emit_insn (gen_avx_vzeroupper ());
16154 }
16155
16156 /* Generate one or more insns to set ENTITY to MODE. */
16157
16158 void
16159 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
16160 {
16161 switch (entity)
16162 {
16163 case AVX_U128:
16164 if (mode == AVX_U128_CLEAN)
16165 ix86_avx_emit_vzeroupper (regs_live);
16166 break;
16167 case I387_TRUNC:
16168 case I387_FLOOR:
16169 case I387_CEIL:
16170 case I387_MASK_PM:
16171 if (mode != I387_CW_ANY
16172 && mode != I387_CW_UNINITIALIZED)
16173 emit_i387_cw_initialization (mode);
16174 break;
16175 default:
16176 gcc_unreachable ();
16177 }
16178 }
16179
16180 /* Output code for INSN to convert a float to a signed int. OPERANDS
16181 are the insn operands. The output may be [HSD]Imode and the input
16182 operand may be [SDX]Fmode. */
16183
16184 const char *
16185 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
16186 {
16187 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16188 int dimode_p = GET_MODE (operands[0]) == DImode;
16189 int round_mode = get_attr_i387_cw (insn);
16190
16191 /* Jump through a hoop or two for DImode, since the hardware has no
16192 non-popping instruction. We used to do this a different way, but
16193 that was somewhat fragile and broke with post-reload splitters. */
16194 if ((dimode_p || fisttp) && !stack_top_dies)
16195 output_asm_insn ("fld\t%y1", operands);
16196
16197 gcc_assert (STACK_TOP_P (operands[1]));
16198 gcc_assert (MEM_P (operands[0]));
16199 gcc_assert (GET_MODE (operands[1]) != TFmode);
16200
16201 if (fisttp)
16202 output_asm_insn ("fisttp%Z0\t%0", operands);
16203 else
16204 {
16205 if (round_mode != I387_CW_ANY)
16206 output_asm_insn ("fldcw\t%3", operands);
16207 if (stack_top_dies || dimode_p)
16208 output_asm_insn ("fistp%Z0\t%0", operands);
16209 else
16210 output_asm_insn ("fist%Z0\t%0", operands);
16211 if (round_mode != I387_CW_ANY)
16212 output_asm_insn ("fldcw\t%2", operands);
16213 }
16214
16215 return "";
16216 }
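/* Rough example of the output for a DImode truncation without fisttp,
   when the value at the top of the stack does not die (operand syntax
   is illustrative only):

     fld     %st(0)            keep a copy, fistp below pops
     fldcw   <truncating cw>   operand 3, RC set to round-toward-zero
     fistpll <mem>             store the 64-bit result and pop
     fldcw   <saved cw>        operand 2, restore the old control word  */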
16217
16218 /* Output code for x87 ffreep insn. The OPNO argument, which may only
16219 have the values zero or one, indicates the ffreep insn's operand
16220 from the OPERANDS array. */
16221
16222 static const char *
16223 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
16224 {
16225 if (TARGET_USE_FFREEP)
16226 #ifdef HAVE_AS_IX86_FFREEP
16227 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
16228 #else
16229 {
16230 static char retval[32];
16231 int regno = REGNO (operands[opno]);
16232
16233 gcc_assert (STACK_REGNO_P (regno));
16234
16235 regno -= FIRST_STACK_REG;
16236
16237 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
16238 return retval;
16239 }
16240 #endif
16241
16242 return opno ? "fstp\t%y1" : "fstp\t%y0";
16243 }
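/* For example, when the assembler does not understand ffreep, the insn
   for %st(0) is emitted as raw bytes (a sketch; ASM_SHORT expands to the
   target's 16-bit data directive, e.g. ".short" or ".value"):

     .short 0xc0df    little-endian bytes 0xdf 0xc0 == ffreep %st(0)

   and %st(1) would use 0xc1df, i.e. bytes 0xdf 0xc1.  */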
16244
16245
16246 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
16247 should be used. UNORDERED_P is true when fucom should be used. */
16248
16249 const char *
16250 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
16251 {
16252 int stack_top_dies;
16253 rtx cmp_op0, cmp_op1;
16254 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
16255
16256 if (eflags_p)
16257 {
16258 cmp_op0 = operands[0];
16259 cmp_op1 = operands[1];
16260 }
16261 else
16262 {
16263 cmp_op0 = operands[1];
16264 cmp_op1 = operands[2];
16265 }
16266
16267 if (is_sse)
16268 {
16269 if (GET_MODE (operands[0]) == SFmode)
16270 if (unordered_p)
16271 return "%vucomiss\t{%1, %0|%0, %1}";
16272 else
16273 return "%vcomiss\t{%1, %0|%0, %1}";
16274 else
16275 if (unordered_p)
16276 return "%vucomisd\t{%1, %0|%0, %1}";
16277 else
16278 return "%vcomisd\t{%1, %0|%0, %1}";
16279 }
16280
16281 gcc_assert (STACK_TOP_P (cmp_op0));
16282
16283 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16284
16285 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
16286 {
16287 if (stack_top_dies)
16288 {
16289 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
16290 return output_387_ffreep (operands, 1);
16291 }
16292 else
16293 return "ftst\n\tfnstsw\t%0";
16294 }
16295
16296 if (STACK_REG_P (cmp_op1)
16297 && stack_top_dies
16298 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
16299 && REGNO (cmp_op1) != FIRST_STACK_REG)
16300 {
16301 /* If both the top of the 387 stack and the other operand are
16302 stack registers that die, then this must be a
16303 `fcompp' float compare. */
16304
16305 if (eflags_p)
16306 {
16307 /* There is no double popping fcomi variant. Fortunately,
16308 eflags is immune from the fstp's cc clobbering. */
16309 if (unordered_p)
16310 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
16311 else
16312 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
16313 return output_387_ffreep (operands, 0);
16314 }
16315 else
16316 {
16317 if (unordered_p)
16318 return "fucompp\n\tfnstsw\t%0";
16319 else
16320 return "fcompp\n\tfnstsw\t%0";
16321 }
16322 }
16323 else
16324 {
16325 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
16326
16327 static const char * const alt[16] =
16328 {
16329 "fcom%Z2\t%y2\n\tfnstsw\t%0",
16330 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
16331 "fucom%Z2\t%y2\n\tfnstsw\t%0",
16332 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
16333
16334 "ficom%Z2\t%y2\n\tfnstsw\t%0",
16335 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
16336 NULL,
16337 NULL,
16338
16339 "fcomi\t{%y1, %0|%0, %y1}",
16340 "fcomip\t{%y1, %0|%0, %y1}",
16341 "fucomi\t{%y1, %0|%0, %y1}",
16342 "fucomip\t{%y1, %0|%0, %y1}",
16343
16344 NULL,
16345 NULL,
16346 NULL,
16347 NULL
16348 };
16349
16350 int mask;
16351 const char *ret;
16352
16353 mask = eflags_p << 3;
16354 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
16355 mask |= unordered_p << 1;
16356 mask |= stack_top_dies;
16357
16358 gcc_assert (mask < 16);
16359 ret = alt[mask];
16360 gcc_assert (ret);
16361
16362 return ret;
16363 }
16364 }
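/* Example of the mask encoding used for the alt[] table above:
   an fcomi-style compare (eflags_p = 1) of two stack registers with
   unordered_p = 1 where the stack top dies gives

     mask = (1 << 3) | (0 << 2) | (1 << 1) | 1 = 11   -->  "fucomip"

   while a plain memory compare (eflags_p = 0, unordered_p = 0) where
   the stack top dies gives mask = 1   -->  "fcomp%Z2".  */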
16365
16366 void
16367 ix86_output_addr_vec_elt (FILE *file, int value)
16368 {
16369 const char *directive = ASM_LONG;
16370
16371 #ifdef ASM_QUAD
16372 if (TARGET_LP64)
16373 directive = ASM_QUAD;
16374 #else
16375 gcc_assert (!TARGET_64BIT);
16376 #endif
16377
16378 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
16379 }
16380
16381 void
16382 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
16383 {
16384 const char *directive = ASM_LONG;
16385
16386 #ifdef ASM_QUAD
16387 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
16388 directive = ASM_QUAD;
16389 #else
16390 gcc_assert (!TARGET_64BIT);
16391 #endif
16392 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
16393 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
16394 fprintf (file, "%s%s%d-%s%d\n",
16395 directive, LPREFIX, value, LPREFIX, rel);
16396 else if (HAVE_AS_GOTOFF_IN_DATA)
16397 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
16398 #if TARGET_MACHO
16399 else if (TARGET_MACHO)
16400 {
16401 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
16402 machopic_output_function_base_name (file);
16403 putc ('\n', file);
16404 }
16405 #endif
16406 else
16407 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
16408 GOT_SYMBOL_NAME, LPREFIX, value);
16409 }
16410 \f
16411 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
16412 for the target. */
16413
16414 void
16415 ix86_expand_clear (rtx dest)
16416 {
16417 rtx tmp;
16418
16419 /* We play register width games, which are only valid after reload. */
16420 gcc_assert (reload_completed);
16421
16422 /* Avoid HImode and its attendant prefix byte. */
16423 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
16424 dest = gen_rtx_REG (SImode, REGNO (dest));
16425 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
16426
16427 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
16428 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
16429 {
16430 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16431 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
16432 }
16433
16434 emit_insn (tmp);
16435 }
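/* Illustrative result (the register name is just an example): the xor
   form assembles to something like

     xorl  %eax, %eax      needs the flags-register clobber added above

   while the TARGET_USE_MOV0 fallback assembles to

     movl  $0, %eax        does not touch the flags  */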
16436
16437 /* X is an unchanging MEM. If it is a constant pool reference, return
16438 the constant pool rtx, else NULL. */
16439
16440 rtx
16441 maybe_get_pool_constant (rtx x)
16442 {
16443 x = ix86_delegitimize_address (XEXP (x, 0));
16444
16445 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
16446 return get_pool_constant (x);
16447
16448 return NULL_RTX;
16449 }
16450
16451 void
16452 ix86_expand_move (enum machine_mode mode, rtx operands[])
16453 {
16454 rtx op0, op1;
16455 enum tls_model model;
16456
16457 op0 = operands[0];
16458 op1 = operands[1];
16459
16460 if (GET_CODE (op1) == SYMBOL_REF)
16461 {
16462 rtx tmp;
16463
16464 model = SYMBOL_REF_TLS_MODEL (op1);
16465 if (model)
16466 {
16467 op1 = legitimize_tls_address (op1, model, true);
16468 op1 = force_operand (op1, op0);
16469 if (op1 == op0)
16470 return;
16471 op1 = convert_to_mode (mode, op1, 1);
16472 }
16473 else if ((tmp = legitimize_pe_coff_symbol (op1, false)) != NULL_RTX)
16474 op1 = tmp;
16475 }
16476 else if (GET_CODE (op1) == CONST
16477 && GET_CODE (XEXP (op1, 0)) == PLUS
16478 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
16479 {
16480 rtx addend = XEXP (XEXP (op1, 0), 1);
16481 rtx symbol = XEXP (XEXP (op1, 0), 0);
16482 rtx tmp;
16483
16484 model = SYMBOL_REF_TLS_MODEL (symbol);
16485 if (model)
16486 tmp = legitimize_tls_address (symbol, model, true);
16487 else
16488 tmp = legitimize_pe_coff_symbol (symbol, true);
16489
16490 if (tmp)
16491 {
16492 tmp = force_operand (tmp, NULL);
16493 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
16494 op0, 1, OPTAB_DIRECT);
16495 if (tmp == op0)
16496 return;
16497 op1 = convert_to_mode (mode, tmp, 1);
16498 }
16499 }
16500
16501 if ((flag_pic || MACHOPIC_INDIRECT)
16502 && symbolic_operand (op1, mode))
16503 {
16504 if (TARGET_MACHO && !TARGET_64BIT)
16505 {
16506 #if TARGET_MACHO
16507 /* dynamic-no-pic */
16508 if (MACHOPIC_INDIRECT)
16509 {
16510 rtx temp = ((reload_in_progress
16511 || ((op0 && REG_P (op0))
16512 && mode == Pmode))
16513 ? op0 : gen_reg_rtx (Pmode));
16514 op1 = machopic_indirect_data_reference (op1, temp);
16515 if (MACHOPIC_PURE)
16516 op1 = machopic_legitimize_pic_address (op1, mode,
16517 temp == op1 ? 0 : temp);
16518 }
16519 if (op0 != op1 && GET_CODE (op0) != MEM)
16520 {
16521 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16522 emit_insn (insn);
16523 return;
16524 }
16525 if (GET_CODE (op0) == MEM)
16526 op1 = force_reg (Pmode, op1);
16527 else
16528 {
16529 rtx temp = op0;
16530 if (GET_CODE (temp) != REG)
16531 temp = gen_reg_rtx (Pmode);
16532 temp = legitimize_pic_address (op1, temp);
16533 if (temp == op0)
16534 return;
16535 op1 = temp;
16536 }
16537 /* dynamic-no-pic */
16538 #endif
16539 }
16540 else
16541 {
16542 if (MEM_P (op0))
16543 op1 = force_reg (mode, op1);
16544 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16545 {
16546 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16547 op1 = legitimize_pic_address (op1, reg);
16548 if (op0 == op1)
16549 return;
16550 op1 = convert_to_mode (mode, op1, 1);
16551 }
16552 }
16553 }
16554 else
16555 {
16556 if (MEM_P (op0)
16557 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16558 || !push_operand (op0, mode))
16559 && MEM_P (op1))
16560 op1 = force_reg (mode, op1);
16561
16562 if (push_operand (op0, mode)
16563 && ! general_no_elim_operand (op1, mode))
16564 op1 = copy_to_mode_reg (mode, op1);
16565
16566 /* Force large constants in 64bit compilation into a register
16567 to get them CSEed. */
16568 if (can_create_pseudo_p ()
16569 && (mode == DImode) && TARGET_64BIT
16570 && immediate_operand (op1, mode)
16571 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16572 && !register_operand (op0, mode)
16573 && optimize)
16574 op1 = copy_to_mode_reg (mode, op1);
16575
16576 if (can_create_pseudo_p ()
16577 && FLOAT_MODE_P (mode)
16578 && GET_CODE (op1) == CONST_DOUBLE)
16579 {
16580 /* If we are loading a floating point constant to a register,
16581 force the value to memory now, since we'll get better code
16582 out the back end. */
16583
16584 op1 = validize_mem (force_const_mem (mode, op1));
16585 if (!register_operand (op0, mode))
16586 {
16587 rtx temp = gen_reg_rtx (mode);
16588 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16589 emit_move_insn (op0, temp);
16590 return;
16591 }
16592 }
16593 }
16594
16595 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16596 }
16597
16598 void
16599 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16600 {
16601 rtx op0 = operands[0], op1 = operands[1];
16602 unsigned int align = GET_MODE_ALIGNMENT (mode);
16603
16604 /* Force constants other than zero into memory.  We do not know how
16605 the instructions used to build constants modify the upper 64 bits
16606 of the register; once we have that information we may be able
16607 to handle some of them more efficiently. */
16608 if (can_create_pseudo_p ()
16609 && register_operand (op0, mode)
16610 && (CONSTANT_P (op1)
16611 || (GET_CODE (op1) == SUBREG
16612 && CONSTANT_P (SUBREG_REG (op1))))
16613 && !standard_sse_constant_p (op1))
16614 op1 = validize_mem (force_const_mem (mode, op1));
16615
16616 /* We need to check memory alignment for SSE modes since attributes
16617 can make operands unaligned. */
16618 if (can_create_pseudo_p ()
16619 && SSE_REG_MODE_P (mode)
16620 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16621 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16622 {
16623 rtx tmp[2];
16624
16625 /* ix86_expand_vector_move_misalign() does not like constants ... */
16626 if (CONSTANT_P (op1)
16627 || (GET_CODE (op1) == SUBREG
16628 && CONSTANT_P (SUBREG_REG (op1))))
16629 op1 = validize_mem (force_const_mem (mode, op1));
16630
16631 /* ... nor both arguments in memory. */
16632 if (!register_operand (op0, mode)
16633 && !register_operand (op1, mode))
16634 op1 = force_reg (mode, op1);
16635
16636 tmp[0] = op0; tmp[1] = op1;
16637 ix86_expand_vector_move_misalign (mode, tmp);
16638 return;
16639 }
16640
16641 /* Make operand1 a register if it isn't already. */
16642 if (can_create_pseudo_p ()
16643 && !register_operand (op0, mode)
16644 && !register_operand (op1, mode))
16645 {
16646 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16647 return;
16648 }
16649
16650 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16651 }
16652
16653 /* Split 32-byte AVX unaligned load and store if needed. */
16654
16655 static void
16656 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16657 {
16658 rtx m;
16659 rtx (*extract) (rtx, rtx, rtx);
16660 rtx (*load_unaligned) (rtx, rtx);
16661 rtx (*store_unaligned) (rtx, rtx);
16662 enum machine_mode mode;
16663
16664 switch (GET_MODE (op0))
16665 {
16666 default:
16667 gcc_unreachable ();
16668 case V32QImode:
16669 extract = gen_avx_vextractf128v32qi;
16670 load_unaligned = gen_avx_loaddquv32qi;
16671 store_unaligned = gen_avx_storedquv32qi;
16672 mode = V16QImode;
16673 break;
16674 case V8SFmode:
16675 extract = gen_avx_vextractf128v8sf;
16676 load_unaligned = gen_avx_loadups256;
16677 store_unaligned = gen_avx_storeups256;
16678 mode = V4SFmode;
16679 break;
16680 case V4DFmode:
16681 extract = gen_avx_vextractf128v4df;
16682 load_unaligned = gen_avx_loadupd256;
16683 store_unaligned = gen_avx_storeupd256;
16684 mode = V2DFmode;
16685 break;
16686 }
16687
16688 if (MEM_P (op1))
16689 {
16690 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16691 {
16692 rtx r = gen_reg_rtx (mode);
16693 m = adjust_address (op1, mode, 0);
16694 emit_move_insn (r, m);
16695 m = adjust_address (op1, mode, 16);
16696 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16697 emit_move_insn (op0, r);
16698 }
16699 /* Normal *mov<mode>_internal pattern will handle
16700 unaligned loads just fine if misaligned_operand
16701 is true, and without the UNSPEC it can be combined
16702 with arithmetic instructions. */
16703 else if (misaligned_operand (op1, GET_MODE (op1)))
16704 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16705 else
16706 emit_insn (load_unaligned (op0, op1));
16707 }
16708 else if (MEM_P (op0))
16709 {
16710 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16711 {
16712 m = adjust_address (op0, mode, 0);
16713 emit_insn (extract (m, op1, const0_rtx));
16714 m = adjust_address (op0, mode, 16);
16715 emit_insn (extract (m, op1, const1_rtx));
16716 }
16717 else
16718 emit_insn (store_unaligned (op0, op1));
16719 }
16720 else
16721 gcc_unreachable ();
16722 }
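/* Hedged sketch of the split load for V8SFmode when
   TARGET_AVX256_SPLIT_UNALIGNED_LOAD is set (the exact insn selection
   is left to the move patterns):

     vmovups     (mem), %xmm0                  low 16 bytes
     vinsertf128 $1, 16(mem), %ymm0, %ymm0     high 16 bytes

   and the split store side writes the two 128-bit halves separately
   via vextractf128.  */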
16723
16724 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16725 straight to ix86_expand_vector_move. */
16726 /* Code generation for scalar reg-reg moves of single and double precision data:
16727 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16728 movaps reg, reg
16729 else
16730 movss reg, reg
16731 if (x86_sse_partial_reg_dependency == true)
16732 movapd reg, reg
16733 else
16734 movsd reg, reg
16735
16736 Code generation for scalar loads of double precision data:
16737 if (x86_sse_split_regs == true)
16738 movlpd mem, reg (gas syntax)
16739 else
16740 movsd mem, reg
16741
16742 Code generation for unaligned packed loads of single precision data
16743 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16744 if (x86_sse_unaligned_move_optimal)
16745 movups mem, reg
16746
16747 if (x86_sse_partial_reg_dependency == true)
16748 {
16749 xorps reg, reg
16750 movlps mem, reg
16751 movhps mem+8, reg
16752 }
16753 else
16754 {
16755 movlps mem, reg
16756 movhps mem+8, reg
16757 }
16758
16759 Code generation for unaligned packed loads of double precision data
16760 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16761 if (x86_sse_unaligned_move_optimal)
16762 movupd mem, reg
16763
16764 if (x86_sse_split_regs == true)
16765 {
16766 movlpd mem, reg
16767 movhpd mem+8, reg
16768 }
16769 else
16770 {
16771 movsd mem, reg
16772 movhpd mem+8, reg
16773 }
16774 */
16775
16776 void
16777 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16778 {
16779 rtx op0, op1, orig_op0 = NULL_RTX, m;
16780 rtx (*load_unaligned) (rtx, rtx);
16781 rtx (*store_unaligned) (rtx, rtx);
16782
16783 op0 = operands[0];
16784 op1 = operands[1];
16785
16786 if (GET_MODE_SIZE (mode) == 64)
16787 {
16788 switch (GET_MODE_CLASS (mode))
16789 {
16790 case MODE_VECTOR_INT:
16791 case MODE_INT:
16792 if (GET_MODE (op0) != V16SImode)
16793 {
16794 if (!MEM_P (op0))
16795 {
16796 orig_op0 = op0;
16797 op0 = gen_reg_rtx (V16SImode);
16798 }
16799 else
16800 op0 = gen_lowpart (V16SImode, op0);
16801 }
16802 op1 = gen_lowpart (V16SImode, op1);
16803 /* FALLTHRU */
16804
16805 case MODE_VECTOR_FLOAT:
16806 switch (GET_MODE (op0))
16807 {
16808 default:
16809 gcc_unreachable ();
16810 case V16SImode:
16811 load_unaligned = gen_avx512f_loaddquv16si;
16812 store_unaligned = gen_avx512f_storedquv16si;
16813 break;
16814 case V16SFmode:
16815 load_unaligned = gen_avx512f_loadups512;
16816 store_unaligned = gen_avx512f_storeups512;
16817 break;
16818 case V8DFmode:
16819 load_unaligned = gen_avx512f_loadupd512;
16820 store_unaligned = gen_avx512f_storeupd512;
16821 break;
16822 }
16823
16824 if (MEM_P (op1))
16825 emit_insn (load_unaligned (op0, op1));
16826 else if (MEM_P (op0))
16827 emit_insn (store_unaligned (op0, op1));
16828 else
16829 gcc_unreachable ();
16830 if (orig_op0)
16831 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16832 break;
16833
16834 default:
16835 gcc_unreachable ();
16836 }
16837
16838 return;
16839 }
16840
16841 if (TARGET_AVX
16842 && GET_MODE_SIZE (mode) == 32)
16843 {
16844 switch (GET_MODE_CLASS (mode))
16845 {
16846 case MODE_VECTOR_INT:
16847 case MODE_INT:
16848 if (GET_MODE (op0) != V32QImode)
16849 {
16850 if (!MEM_P (op0))
16851 {
16852 orig_op0 = op0;
16853 op0 = gen_reg_rtx (V32QImode);
16854 }
16855 else
16856 op0 = gen_lowpart (V32QImode, op0);
16857 }
16858 op1 = gen_lowpart (V32QImode, op1);
16859 /* FALLTHRU */
16860
16861 case MODE_VECTOR_FLOAT:
16862 ix86_avx256_split_vector_move_misalign (op0, op1);
16863 if (orig_op0)
16864 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16865 break;
16866
16867 default:
16868 gcc_unreachable ();
16869 }
16870
16871 return;
16872 }
16873
16874 if (MEM_P (op1))
16875 {
16876 /* Normal *mov<mode>_internal pattern will handle
16877 unaligned loads just fine if misaligned_operand
16878 is true, and without the UNSPEC it can be combined
16879 with arithmetic instructions. */
16880 if (TARGET_AVX
16881 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
16882 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
16883 && misaligned_operand (op1, GET_MODE (op1)))
16884 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16885 /* ??? If we have typed data, then it would appear that using
16886 movdqu is the only way to get unaligned data loaded with
16887 integer type. */
16888 else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16889 {
16890 if (GET_MODE (op0) != V16QImode)
16891 {
16892 orig_op0 = op0;
16893 op0 = gen_reg_rtx (V16QImode);
16894 }
16895 op1 = gen_lowpart (V16QImode, op1);
16896 /* We will eventually emit movups based on insn attributes. */
16897 emit_insn (gen_sse2_loaddquv16qi (op0, op1));
16898 if (orig_op0)
16899 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16900 }
16901 else if (TARGET_SSE2 && mode == V2DFmode)
16902 {
16903 rtx zero;
16904
16905 if (TARGET_AVX
16906 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16907 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16908 || optimize_insn_for_size_p ())
16909 {
16910 /* We will eventually emit movups based on insn attributes. */
16911 emit_insn (gen_sse2_loadupd (op0, op1));
16912 return;
16913 }
16914
16915 /* When SSE registers are split into halves, we can avoid
16916 writing to the top half twice. */
16917 if (TARGET_SSE_SPLIT_REGS)
16918 {
16919 emit_clobber (op0);
16920 zero = op0;
16921 }
16922 else
16923 {
16924 /* ??? Not sure about the best option for the Intel chips.
16925 The following would seem to satisfy; the register is
16926 entirely cleared, breaking the dependency chain. We
16927 then store to the upper half, with a dependency depth
16928 of one. A rumor has it that Intel recommends two movsd
16929 followed by an unpacklpd, but this is unconfirmed. And
16930 given that the dependency depth of the unpacklpd would
16931 still be one, I'm not sure why this would be better. */
16932 zero = CONST0_RTX (V2DFmode);
16933 }
16934
16935 m = adjust_address (op1, DFmode, 0);
16936 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16937 m = adjust_address (op1, DFmode, 8);
16938 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16939 }
16940 else
16941 {
16942 rtx t;
16943
16944 if (TARGET_AVX
16945 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16946 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16947 || optimize_insn_for_size_p ())
16948 {
16949 if (GET_MODE (op0) != V4SFmode)
16950 {
16951 orig_op0 = op0;
16952 op0 = gen_reg_rtx (V4SFmode);
16953 }
16954 op1 = gen_lowpart (V4SFmode, op1);
16955 emit_insn (gen_sse_loadups (op0, op1));
16956 if (orig_op0)
16957 emit_move_insn (orig_op0,
16958 gen_lowpart (GET_MODE (orig_op0), op0));
16959 return;
16960 }
16961
16962 if (mode != V4SFmode)
16963 t = gen_reg_rtx (V4SFmode);
16964 else
16965 t = op0;
16966
16967 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16968 emit_move_insn (t, CONST0_RTX (V4SFmode));
16969 else
16970 emit_clobber (t);
16971
16972 m = adjust_address (op1, V2SFmode, 0);
16973 emit_insn (gen_sse_loadlps (t, t, m));
16974 m = adjust_address (op1, V2SFmode, 8);
16975 emit_insn (gen_sse_loadhps (t, t, m));
16976 if (mode != V4SFmode)
16977 emit_move_insn (op0, gen_lowpart (mode, t));
16978 }
16979 }
16980 else if (MEM_P (op0))
16981 {
16982 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16983 {
16984 op0 = gen_lowpart (V16QImode, op0);
16985 op1 = gen_lowpart (V16QImode, op1);
16986 /* We will eventually emit movups based on insn attributes. */
16987 emit_insn (gen_sse2_storedquv16qi (op0, op1));
16988 }
16989 else if (TARGET_SSE2 && mode == V2DFmode)
16990 {
16991 if (TARGET_AVX
16992 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16993 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16994 || optimize_insn_for_size_p ())
16995 /* We will eventually emit movups based on insn attributes. */
16996 emit_insn (gen_sse2_storeupd (op0, op1));
16997 else
16998 {
16999 m = adjust_address (op0, DFmode, 0);
17000 emit_insn (gen_sse2_storelpd (m, op1));
17001 m = adjust_address (op0, DFmode, 8);
17002 emit_insn (gen_sse2_storehpd (m, op1));
17003 }
17004 }
17005 else
17006 {
17007 if (mode != V4SFmode)
17008 op1 = gen_lowpart (V4SFmode, op1);
17009
17010 if (TARGET_AVX
17011 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
17012 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
17013 || optimize_insn_for_size_p ())
17014 {
17015 op0 = gen_lowpart (V4SFmode, op0);
17016 emit_insn (gen_sse_storeups (op0, op1));
17017 }
17018 else
17019 {
17020 m = adjust_address (op0, V2SFmode, 0);
17021 emit_insn (gen_sse_storelps (m, op1));
17022 m = adjust_address (op0, V2SFmode, 8);
17023 emit_insn (gen_sse_storehps (m, op1));
17024 }
17025 }
17026 }
17027 else
17028 gcc_unreachable ();
17029 }
17030
17031 /* Expand a push in MODE. This is some mode for which we do not support
17032 proper push instructions, at least from the registers that we expect
17033 the value to live in. */
17034
17035 void
17036 ix86_expand_push (enum machine_mode mode, rtx x)
17037 {
17038 rtx tmp;
17039
17040 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
17041 GEN_INT (-GET_MODE_SIZE (mode)),
17042 stack_pointer_rtx, 1, OPTAB_DIRECT);
17043 if (tmp != stack_pointer_rtx)
17044 emit_move_insn (stack_pointer_rtx, tmp);
17045
17046 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
17047
17048 /* When we push an operand onto stack, it has to be aligned at least
17049 at the function argument boundary. However since we don't have
17050 the argument type, we can't determine the actual argument
17051 boundary. */
17052 emit_move_insn (tmp, x);
17053 }
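/* For example (a sketch; the exact move insns depend on MODE), pushing
   a TImode value on x86-64 expands to roughly

     subq  $16, %rsp
     <move of the 16-byte value to (%rsp)>

   i.e. an explicit stack-pointer adjustment followed by a plain move
   into the newly allocated slot.  */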
17054
17055 /* Helper function of ix86_fixup_binary_operands to canonicalize
17056 operand order. Returns true if the operands should be swapped. */
17057
17058 static bool
17059 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
17060 rtx operands[])
17061 {
17062 rtx dst = operands[0];
17063 rtx src1 = operands[1];
17064 rtx src2 = operands[2];
17065
17066 /* If the operation is not commutative, we can't do anything. */
17067 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
17068 return false;
17069
17070 /* Highest priority is that src1 should match dst. */
17071 if (rtx_equal_p (dst, src1))
17072 return false;
17073 if (rtx_equal_p (dst, src2))
17074 return true;
17075
17076 /* Next highest priority is that immediate constants come second. */
17077 if (immediate_operand (src2, mode))
17078 return false;
17079 if (immediate_operand (src1, mode))
17080 return true;
17081
17082 /* Lowest priority is that memory references should come second. */
17083 if (MEM_P (src2))
17084 return false;
17085 if (MEM_P (src1))
17086 return true;
17087
17088 return false;
17089 }
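/* Example of the canonicalization above: when expanding

     a = b + a;     dst == src2 for a commutative PLUS

   the operands are swapped so that src1 matches dst, which lets the
   two-address add instruction reuse the destination register directly.
   Immediates and memory operands are likewise pushed to the second
   position.  */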
17090
17091
17092 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
17093 destination to use for the operation. If different from the true
17094 destination in operands[0], a copy operation will be required. */
17095
17096 rtx
17097 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
17098 rtx operands[])
17099 {
17100 rtx dst = operands[0];
17101 rtx src1 = operands[1];
17102 rtx src2 = operands[2];
17103
17104 /* Canonicalize operand order. */
17105 if (ix86_swap_binary_operands_p (code, mode, operands))
17106 {
17107 rtx temp;
17108
17109 /* It is invalid to swap operands of different modes. */
17110 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
17111
17112 temp = src1;
17113 src1 = src2;
17114 src2 = temp;
17115 }
17116
17117 /* Both source operands cannot be in memory. */
17118 if (MEM_P (src1) && MEM_P (src2))
17119 {
17120 /* Optimization: Only read from memory once. */
17121 if (rtx_equal_p (src1, src2))
17122 {
17123 src2 = force_reg (mode, src2);
17124 src1 = src2;
17125 }
17126 else if (rtx_equal_p (dst, src1))
17127 src2 = force_reg (mode, src2);
17128 else
17129 src1 = force_reg (mode, src1);
17130 }
17131
17132 /* If the destination is memory, and we do not have matching source
17133 operands, do things in registers. */
17134 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17135 dst = gen_reg_rtx (mode);
17136
17137 /* Source 1 cannot be a constant. */
17138 if (CONSTANT_P (src1))
17139 src1 = force_reg (mode, src1);
17140
17141 /* Source 1 cannot be a non-matching memory. */
17142 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17143 src1 = force_reg (mode, src1);
17144
17145 /* Improve address combine. */
17146 if (code == PLUS
17147 && GET_MODE_CLASS (mode) == MODE_INT
17148 && MEM_P (src2))
17149 src2 = force_reg (mode, src2);
17150
17151 operands[1] = src1;
17152 operands[2] = src2;
17153 return dst;
17154 }
17155
17156 /* Similarly, but assume that the destination has already been
17157 set up properly. */
17158
17159 void
17160 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
17161 enum machine_mode mode, rtx operands[])
17162 {
17163 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
17164 gcc_assert (dst == operands[0]);
17165 }
17166
17167 /* Attempt to expand a binary operator.  Make the expansion closer to the
17168 actual machine than just general_operand, which would allow 3 separate
17169 memory references (one output, two input) in a single insn. */
17170
17171 void
17172 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
17173 rtx operands[])
17174 {
17175 rtx src1, src2, dst, op, clob;
17176
17177 dst = ix86_fixup_binary_operands (code, mode, operands);
17178 src1 = operands[1];
17179 src2 = operands[2];
17180
17181 /* Emit the instruction. */
17182
17183 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
17184 if (reload_in_progress)
17185 {
17186 /* Reload doesn't know about the flags register, and doesn't know that
17187 it doesn't want to clobber it. We can only do this with PLUS. */
17188 gcc_assert (code == PLUS);
17189 emit_insn (op);
17190 }
17191 else if (reload_completed
17192 && code == PLUS
17193 && !rtx_equal_p (dst, src1))
17194 {
17195 /* This is going to be an LEA; avoid splitting it later. */
17196 emit_insn (op);
17197 }
17198 else
17199 {
17200 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17201 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17202 }
17203
17204 /* Fix up the destination if needed. */
17205 if (dst != operands[0])
17206 emit_move_insn (operands[0], dst);
17207 }
17208
17209 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
17210 the given OPERANDS. */
17211
17212 void
17213 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
17214 rtx operands[])
17215 {
17216 rtx op1 = NULL_RTX, op2 = NULL_RTX;
17217 if (GET_CODE (operands[1]) == SUBREG)
17218 {
17219 op1 = operands[1];
17220 op2 = operands[2];
17221 }
17222 else if (GET_CODE (operands[2]) == SUBREG)
17223 {
17224 op1 = operands[2];
17225 op2 = operands[1];
17226 }
17227 /* Optimize (__m128i) d | (__m128i) e and similar code
17228 when d and e are float vectors into a float vector logical
17229 insn.  In C/C++ without using intrinsics there is no other way
17230 to express a vector logical operation on float vectors than
17231 to cast them temporarily to integer vectors. */
17232 if (op1
17233 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
17234 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
17235 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
17236 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
17237 && SUBREG_BYTE (op1) == 0
17238 && (GET_CODE (op2) == CONST_VECTOR
17239 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
17240 && SUBREG_BYTE (op2) == 0))
17241 && can_create_pseudo_p ())
17242 {
17243 rtx dst;
17244 switch (GET_MODE (SUBREG_REG (op1)))
17245 {
17246 case V4SFmode:
17247 case V8SFmode:
17248 case V2DFmode:
17249 case V4DFmode:
17250 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
17251 if (GET_CODE (op2) == CONST_VECTOR)
17252 {
17253 op2 = gen_lowpart (GET_MODE (dst), op2);
17254 op2 = force_reg (GET_MODE (dst), op2);
17255 }
17256 else
17257 {
17258 op1 = operands[1];
17259 op2 = SUBREG_REG (operands[2]);
17260 if (!nonimmediate_operand (op2, GET_MODE (dst)))
17261 op2 = force_reg (GET_MODE (dst), op2);
17262 }
17263 op1 = SUBREG_REG (op1);
17264 if (!nonimmediate_operand (op1, GET_MODE (dst)))
17265 op1 = force_reg (GET_MODE (dst), op1);
17266 emit_insn (gen_rtx_SET (VOIDmode, dst,
17267 gen_rtx_fmt_ee (code, GET_MODE (dst),
17268 op1, op2)));
17269 emit_move_insn (operands[0], gen_lowpart (mode, dst));
17270 return;
17271 default:
17272 break;
17273 }
17274 }
17275 if (!nonimmediate_operand (operands[1], mode))
17276 operands[1] = force_reg (mode, operands[1]);
17277 if (!nonimmediate_operand (operands[2], mode))
17278 operands[2] = force_reg (mode, operands[2]);
17279 ix86_fixup_binary_operands_no_copy (code, mode, operands);
17280 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
17281 gen_rtx_fmt_ee (code, mode, operands[1],
17282 operands[2])));
17283 }
17284
17285 /* Return TRUE or FALSE depending on whether the binary operator meets the
17286 appropriate constraints. */
17287
17288 bool
17289 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
17290 rtx operands[3])
17291 {
17292 rtx dst = operands[0];
17293 rtx src1 = operands[1];
17294 rtx src2 = operands[2];
17295
17296 /* Both source operands cannot be in memory. */
17297 if (MEM_P (src1) && MEM_P (src2))
17298 return false;
17299
17300 /* Canonicalize operand order for commutative operators. */
17301 if (ix86_swap_binary_operands_p (code, mode, operands))
17302 {
17303 rtx temp = src1;
17304 src1 = src2;
17305 src2 = temp;
17306 }
17307
17308 /* If the destination is memory, we must have a matching source operand. */
17309 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17310 return false;
17311
17312 /* Source 1 cannot be a constant. */
17313 if (CONSTANT_P (src1))
17314 return false;
17315
17316 /* Source 1 cannot be a non-matching memory. */
17317 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17318 /* Support "andhi/andsi/anddi" as a zero-extending move. */
17319 return (code == AND
17320 && (mode == HImode
17321 || mode == SImode
17322 || (TARGET_64BIT && mode == DImode))
17323 && satisfies_constraint_L (src2));
17324
17325 return true;
17326 }
17327
17328 /* Attempt to expand a unary operator.  Make the expansion closer to the
17329 actual machine than just general_operand, which would allow 2 separate
17330 memory references (one output, one input) in a single insn. */
17331
17332 void
17333 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
17334 rtx operands[])
17335 {
17336 int matching_memory;
17337 rtx src, dst, op, clob;
17338
17339 dst = operands[0];
17340 src = operands[1];
17341
17342 /* If the destination is memory, and we do not have matching source
17343 operands, do things in registers. */
17344 matching_memory = 0;
17345 if (MEM_P (dst))
17346 {
17347 if (rtx_equal_p (dst, src))
17348 matching_memory = 1;
17349 else
17350 dst = gen_reg_rtx (mode);
17351 }
17352
17353 /* When source operand is memory, destination must match. */
17354 if (MEM_P (src) && !matching_memory)
17355 src = force_reg (mode, src);
17356
17357 /* Emit the instruction. */
17358
17359 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
17360 if (reload_in_progress || code == NOT)
17361 {
17362 /* Reload doesn't know about the flags register, and doesn't know that
17363 it doesn't want to clobber it. */
17364 gcc_assert (code == NOT);
17365 emit_insn (op);
17366 }
17367 else
17368 {
17369 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17370 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17371 }
17372
17373 /* Fix up the destination if needed. */
17374 if (dst != operands[0])
17375 emit_move_insn (operands[0], dst);
17376 }
17377
17378 /* Split a 32bit/64bit divmod with an 8bit unsigned divmod if the dividend
17379 and divisor are within the range [0-255]. */
17380
17381 void
17382 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
17383 bool signed_p)
17384 {
17385 rtx end_label, qimode_label;
17386 rtx insn, div, mod;
17387 rtx scratch, tmp0, tmp1, tmp2;
17388 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
17389 rtx (*gen_zero_extend) (rtx, rtx);
17390 rtx (*gen_test_ccno_1) (rtx, rtx);
17391
17392 switch (mode)
17393 {
17394 case SImode:
17395 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
17396 gen_test_ccno_1 = gen_testsi_ccno_1;
17397 gen_zero_extend = gen_zero_extendqisi2;
17398 break;
17399 case DImode:
17400 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
17401 gen_test_ccno_1 = gen_testdi_ccno_1;
17402 gen_zero_extend = gen_zero_extendqidi2;
17403 break;
17404 default:
17405 gcc_unreachable ();
17406 }
17407
17408 end_label = gen_label_rtx ();
17409 qimode_label = gen_label_rtx ();
17410
17411 scratch = gen_reg_rtx (mode);
17412
17413 /* Use 8bit unsigned divmod if the dividend and divisor are within
17414 the range [0-255]. */
17415 emit_move_insn (scratch, operands[2]);
17416 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
17417 scratch, 1, OPTAB_DIRECT);
17418 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
17419 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
17420 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
17421 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
17422 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
17423 pc_rtx);
17424 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
17425 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17426 JUMP_LABEL (insn) = qimode_label;
17427
17428 /* Generate the original signed/unsigned divmod. */
17429 div = gen_divmod4_1 (operands[0], operands[1],
17430 operands[2], operands[3]);
17431 emit_insn (div);
17432
17433 /* Branch to the end. */
17434 emit_jump_insn (gen_jump (end_label));
17435 emit_barrier ();
17436
17437 /* Generate 8bit unsigned divide. */
17438 emit_label (qimode_label);
17439 /* Don't use operands[0] for result of 8bit divide since not all
17440 registers support QImode ZERO_EXTRACT. */
17441 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
17442 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
17443 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
17444 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
17445
17446 if (signed_p)
17447 {
17448 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
17449 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
17450 }
17451 else
17452 {
17453 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
17454 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
17455 }
17456
17457 /* Extract remainder from AH. */
17458 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
17459 if (REG_P (operands[1]))
17460 insn = emit_move_insn (operands[1], tmp1);
17461 else
17462 {
17463 /* Need a new scratch register since the old one has result
17464 of 8bit divide. */
17465 scratch = gen_reg_rtx (mode);
17466 emit_move_insn (scratch, tmp1);
17467 insn = emit_move_insn (operands[1], scratch);
17468 }
17469 set_unique_reg_note (insn, REG_EQUAL, mod);
17470
17471 /* Zero extend quotient from AL. */
17472 tmp1 = gen_lowpart (QImode, tmp0);
17473 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
17474 set_unique_reg_note (insn, REG_EQUAL, div);
17475
17476 emit_label (end_label);
17477 }
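/* Rough shape of the code emitted above for a 32-bit unsigned divide
   (labels and registers are illustrative only):

     movl   %esi, %ecx
     orl    %edi, %ecx          # scratch = dividend | divisor
     testl  $-0x100, %ecx       # any bits above the low 8 set?
     je     .Lqimode
     <full 32-bit divide>       # original divmod
     jmp    .Ldone
   .Lqimode:
     <8-bit divide>             # AL = quotient, AH = remainder
   .Ldone:                                                        */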
17478
17479 /* Whether it is OK to emit CFI directives when emitting asm code. */
17480
17481 bool
17482 ix86_emit_cfi ()
17483 {
17484 return dwarf2out_do_cfi_asm ();
17485 }
17486
17487 #define LEA_MAX_STALL (3)
17488 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
17489
17490 /* Increase the given DISTANCE in half-cycles according to
17491 dependencies between the PREV and NEXT instructions.
17492 Add 1 half-cycle if there is no dependency and
17493 go to the next cycle if there is a dependency. */
17494
17495 static unsigned int
17496 increase_distance (rtx prev, rtx next, unsigned int distance)
17497 {
17498 df_ref *use_rec;
17499 df_ref *def_rec;
17500
17501 if (!prev || !next)
17502 return distance + (distance & 1) + 2;
17503
17504 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
17505 return distance + 1;
17506
17507 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
17508 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
17509 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
17510 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
17511 return distance + (distance & 1) + 2;
17512
17513 return distance + 1;
17514 }
17515
17516 /* Check whether instruction INSN defines register number
17517 REGNO1 or REGNO2. */
17518
17519 static bool
17520 insn_defines_reg (unsigned int regno1, unsigned int regno2,
17521 rtx insn)
17522 {
17523 df_ref *def_rec;
17524
17525 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
17526 if (DF_REF_REG_DEF_P (*def_rec)
17527 && !DF_REF_IS_ARTIFICIAL (*def_rec)
17528 && (regno1 == DF_REF_REGNO (*def_rec)
17529 || regno2 == DF_REF_REGNO (*def_rec)))
17530 {
17531 return true;
17532 }
17533
17534 return false;
17535 }
17536
17537 /* Check whether instruction INSN uses register number
17538 REGNO as part of an address expression. */
17539
17540 static bool
17541 insn_uses_reg_mem (unsigned int regno, rtx insn)
17542 {
17543 df_ref *use_rec;
17544
17545 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
17546 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
17547 return true;
17548
17549 return false;
17550 }
17551
17552 /* Search backward for non-agu definition of register number REGNO1
17553 or register number REGNO2 in basic block starting from instruction
17554 START up to head of basic block or instruction INSN.
17555
17556 Set *FOUND to true if a definition was found
17557 and to false otherwise.
17558
17559 The distance in half-cycles between START and the found instruction
17560 (or the head of the BB) is added to DISTANCE and returned. */
17561
17562 static int
17563 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
17564 rtx insn, int distance,
17565 rtx start, bool *found)
17566 {
17567 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17568 rtx prev = start;
17569 rtx next = NULL;
17570
17571 *found = false;
17572
17573 while (prev
17574 && prev != insn
17575 && distance < LEA_SEARCH_THRESHOLD)
17576 {
17577 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
17578 {
17579 distance = increase_distance (prev, next, distance);
17580 if (insn_defines_reg (regno1, regno2, prev))
17581 {
17582 if (recog_memoized (prev) < 0
17583 || get_attr_type (prev) != TYPE_LEA)
17584 {
17585 *found = true;
17586 return distance;
17587 }
17588 }
17589
17590 next = prev;
17591 }
17592 if (prev == BB_HEAD (bb))
17593 break;
17594
17595 prev = PREV_INSN (prev);
17596 }
17597
17598 return distance;
17599 }
17600
17601 /* Search backward for non-agu definition of register number REGNO1
17602 or register number REGNO2 in INSN's basic block until
17603 1. Pass LEA_SEARCH_THRESHOLD instructions, or
17604 2. Reach neighbour BBs boundary, or
17605 3. Reach agu definition.
17606 Returns the distance between the non-agu definition point and INSN.
17607 If no definition point, returns -1. */
17608
17609 static int
17610 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
17611 rtx insn)
17612 {
17613 basic_block bb = BLOCK_FOR_INSN (insn);
17614 int distance = 0;
17615 bool found = false;
17616
17617 if (insn != BB_HEAD (bb))
17618 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
17619 distance, PREV_INSN (insn),
17620 &found);
17621
17622 if (!found && distance < LEA_SEARCH_THRESHOLD)
17623 {
17624 edge e;
17625 edge_iterator ei;
17626 bool simple_loop = false;
17627
17628 FOR_EACH_EDGE (e, ei, bb->preds)
17629 if (e->src == bb)
17630 {
17631 simple_loop = true;
17632 break;
17633 }
17634
17635 if (simple_loop)
17636 distance = distance_non_agu_define_in_bb (regno1, regno2,
17637 insn, distance,
17638 BB_END (bb), &found);
17639 else
17640 {
17641 int shortest_dist = -1;
17642 bool found_in_bb = false;
17643
17644 FOR_EACH_EDGE (e, ei, bb->preds)
17645 {
17646 int bb_dist
17647 = distance_non_agu_define_in_bb (regno1, regno2,
17648 insn, distance,
17649 BB_END (e->src),
17650 &found_in_bb);
17651 if (found_in_bb)
17652 {
17653 if (shortest_dist < 0)
17654 shortest_dist = bb_dist;
17655 else if (bb_dist > 0)
17656 shortest_dist = MIN (bb_dist, shortest_dist);
17657
17658 found = true;
17659 }
17660 }
17661
17662 distance = shortest_dist;
17663 }
17664 }
17665
17666 /* get_attr_type may modify recog data. We want to make sure
17667 that recog data is valid for instruction INSN, on which
17668 distance_non_agu_define is called. INSN is unchanged here. */
17669 extract_insn_cached (insn);
17670
17671 if (!found)
17672 return -1;
17673
17674 return distance >> 1;
17675 }
17676
17677 /* Return the distance in half-cycles between INSN and the next
17678 insn that uses register number REGNO in a memory address, added
17679 to DISTANCE.  Return -1 if REGNO is set.
17680
17681 Set *FOUND to true if a register use was found and
17682 to false otherwise.
17683 Set *REDEFINED to true if a register redefinition was
17684 found and to false otherwise. */
17685
17686 static int
17687 distance_agu_use_in_bb (unsigned int regno,
17688 rtx insn, int distance, rtx start,
17689 bool *found, bool *redefined)
17690 {
17691 basic_block bb = NULL;
17692 rtx next = start;
17693 rtx prev = NULL;
17694
17695 *found = false;
17696 *redefined = false;
17697
17698 if (start != NULL_RTX)
17699 {
17700 bb = BLOCK_FOR_INSN (start);
17701 if (start != BB_HEAD (bb))
17702 /* If insn and start belong to the same bb, set prev to insn,
17703 so the call to increase_distance will increase the distance
17704 between insns by 1. */
17705 prev = insn;
17706 }
17707
17708 while (next
17709 && next != insn
17710 && distance < LEA_SEARCH_THRESHOLD)
17711 {
17712 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17713 {
17714 distance = increase_distance(prev, next, distance);
17715 if (insn_uses_reg_mem (regno, next))
17716 {
17717 /* Return DISTANCE if OP0 is used in memory
17718 address in NEXT. */
17719 *found = true;
17720 return distance;
17721 }
17722
17723 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17724 {
17725 /* Return -1 if OP0 is set in NEXT. */
17726 *redefined = true;
17727 return -1;
17728 }
17729
17730 prev = next;
17731 }
17732
17733 if (next == BB_END (bb))
17734 break;
17735
17736 next = NEXT_INSN (next);
17737 }
17738
17739 return distance;
17740 }
17741
17742 /* Return the distance between INSN and the next insn that uses
17743 register number REGNO0 in a memory address.  Return -1 if no such
17744 use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set. */
17745
17746 static int
17747 distance_agu_use (unsigned int regno0, rtx insn)
17748 {
17749 basic_block bb = BLOCK_FOR_INSN (insn);
17750 int distance = 0;
17751 bool found = false;
17752 bool redefined = false;
17753
17754 if (insn != BB_END (bb))
17755 distance = distance_agu_use_in_bb (regno0, insn, distance,
17756 NEXT_INSN (insn),
17757 &found, &redefined);
17758
17759 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17760 {
17761 edge e;
17762 edge_iterator ei;
17763 bool simple_loop = false;
17764
17765 FOR_EACH_EDGE (e, ei, bb->succs)
17766 if (e->dest == bb)
17767 {
17768 simple_loop = true;
17769 break;
17770 }
17771
17772 if (simple_loop)
17773 distance = distance_agu_use_in_bb (regno0, insn,
17774 distance, BB_HEAD (bb),
17775 &found, &redefined);
17776 else
17777 {
17778 int shortest_dist = -1;
17779 bool found_in_bb = false;
17780 bool redefined_in_bb = false;
17781
17782 FOR_EACH_EDGE (e, ei, bb->succs)
17783 {
17784 int bb_dist
17785 = distance_agu_use_in_bb (regno0, insn,
17786 distance, BB_HEAD (e->dest),
17787 &found_in_bb, &redefined_in_bb);
17788 if (found_in_bb)
17789 {
17790 if (shortest_dist < 0)
17791 shortest_dist = bb_dist;
17792 else if (bb_dist > 0)
17793 shortest_dist = MIN (bb_dist, shortest_dist);
17794
17795 found = true;
17796 }
17797 }
17798
17799 distance = shortest_dist;
17800 }
17801 }
17802
17803 if (!found || redefined)
17804 return -1;
17805
17806 return distance >> 1;
17807 }
17808
17809 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17810 there is a dilemma of choosing LEA or ADD.
17811 Negative value: ADD is preferred over LEA
17812 Zero: Neutral
17813 Positive value: LEA is preferred over ADD */
17814 #define IX86_LEA_PRIORITY 0
17815
17816 /* Return true if using lea INSN has a performance advantage
17817 over a sequence of instructions. The instruction sequence has
17818 SPLIT_COST cycles higher latency than the lea latency. */
17819
17820 static bool
17821 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17822 unsigned int regno2, int split_cost, bool has_scale)
17823 {
17824 int dist_define, dist_use;
17825
17826 /* For Silvermont, if a 2-source or 3-source LEA is used for a
17827 non-destructive destination, or the ability to use SCALE is
17828 wanted, the use of LEA is justified. */
17829 if (ix86_tune == PROCESSOR_SLM)
17830 {
17831 if (has_scale)
17832 return true;
17833 if (split_cost < 1)
17834 return false;
17835 if (regno0 == regno1 || regno0 == regno2)
17836 return false;
17837 return true;
17838 }
17839
17840 dist_define = distance_non_agu_define (regno1, regno2, insn);
17841 dist_use = distance_agu_use (regno0, insn);
17842
17843 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17844 {
17845 /* If there is no non-AGU operand definition, no AGU
17846 operand usage and the split cost is 0, then both the lea
17847 and non-lea variants have the same priority. Currently
17848 we prefer lea for 64-bit code and non-lea for 32-bit
17849 code. */
17850 if (dist_use < 0 && split_cost == 0)
17851 return TARGET_64BIT || IX86_LEA_PRIORITY;
17852 else
17853 return true;
17854 }
17855
17856 /* With a longer definition distance, lea is preferable.
17857 Here we adjust the distance to take the splitting cost and
17858 lea priority into account. */
17859 dist_define += split_cost + IX86_LEA_PRIORITY;
17860
17861 /* If there is no use in a memory address, then we just check
17862 that the split cost exceeds the AGU stall. */
17863 if (dist_use < 0)
17864 return dist_define > LEA_MAX_STALL;
17865
17866 /* If this insn has both a backward non-AGU dependence and a forward
17867 AGU dependence, the one with the shorter distance takes effect. */
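/* A rough worked example (illustrative values only, assuming dist_define
   stays below LEA_MAX_STALL): with dist_define = 2, split_cost = 1 and
   IX86_LEA_PRIORITY = 0, the adjusted dist_define is 3.  If the address is
   consumed soon, say dist_use = 2, then 3 >= 2 and the lea is kept; if the
   use is farther away, say dist_use = 4, then 3 < 4 and the caller splits
   the lea into ALU instructions.  */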
17868 return dist_define >= dist_use;
17869 }
17870
17871 /* Return true if it is legal to clobber flags by INSN and
17872 false otherwise. */
17873
17874 static bool
17875 ix86_ok_to_clobber_flags (rtx insn)
17876 {
17877 basic_block bb = BLOCK_FOR_INSN (insn);
17878 df_ref *use;
17879 bitmap live;
17880
17881 while (insn)
17882 {
17883 if (NONDEBUG_INSN_P (insn))
17884 {
17885 for (use = DF_INSN_USES (insn); *use; use++)
17886 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17887 return false;
17888
17889 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17890 return true;
17891 }
17892
17893 if (insn == BB_END (bb))
17894 break;
17895
17896 insn = NEXT_INSN (insn);
17897 }
17898
17899 live = df_get_live_out (bb);
17900 return !REGNO_REG_SET_P (live, FLAGS_REG);
17901 }
17902
17903 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17904 move and add to avoid AGU stalls. */
17905
17906 bool
17907 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17908 {
17909 unsigned int regno0, regno1, regno2;
17910
17911 /* Check if we need to optimize. */
17912 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17913 return false;
17914
17915 /* Check it is correct to split here. */
17916 if (!ix86_ok_to_clobber_flags (insn))
17917 return false;
17918
17919 regno0 = true_regnum (operands[0]);
17920 regno1 = true_regnum (operands[1]);
17921 regno2 = true_regnum (operands[2]);
17922
17923 /* We need to split only adds with a non-destructive
17924 destination operand. */
17925 if (regno0 == regno1 || regno0 == regno2)
17926 return false;
17927 else
17928 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
17929 }
17930
17931 /* Return true if we should emit lea instruction instead of mov
17932 instruction. */
17933
17934 bool
17935 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17936 {
17937 unsigned int regno0, regno1;
17938
17939 /* Check if we need to optimize. */
17940 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17941 return false;
17942
17943 /* Use lea for reg to reg moves only. */
17944 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17945 return false;
17946
17947 regno0 = true_regnum (operands[0]);
17948 regno1 = true_regnum (operands[1]);
17949
17950 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
17951 }
17952
17953 /* Return true if we need to split lea into a sequence of
17954 instructions to avoid AGU stalls. */
17955
17956 bool
17957 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17958 {
17959 unsigned int regno0, regno1, regno2;
17960 int split_cost;
17961 struct ix86_address parts;
17962 int ok;
17963
17964 /* Check we need to optimize. */
17965 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17966 return false;
17967
17968 /* Check it is correct to split here. */
17969 if (!ix86_ok_to_clobber_flags (insn))
17970 return false;
17971
17972 ok = ix86_decompose_address (operands[1], &parts);
17973 gcc_assert (ok);
17974
17975 /* There should be at least two components in the address. */
17976 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17977 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17978 return false;
17979
17980 /* We should not split into add if a non-legitimate PIC
17981 operand is used as the displacement. */
17982 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17983 return false;
17984
17985 regno0 = true_regnum (operands[0]);
17986 regno1 = INVALID_REGNUM;
17987 regno2 = INVALID_REGNUM;
17988
17989 if (parts.base)
17990 regno1 = true_regnum (parts.base);
17991 if (parts.index)
17992 regno2 = true_regnum (parts.index);
17993
17994 split_cost = 0;
17995
17996 /* Compute how many cycles we will add to the execution time
17997 if we split the lea into a sequence of instructions. */
17998 if (parts.base || parts.index)
17999 {
18000 /* Have to use a mov instruction if the non-destructive
18001 destination form is used. */
18002 if (regno1 != regno0 && regno2 != regno0)
18003 split_cost += 1;
18004
18005 /* Have to add index to base if both exist. */
18006 if (parts.base && parts.index)
18007 split_cost += 1;
18008
18009 /* Have to use shift and adds if scale is 2 or greater. */
18010 if (parts.scale > 1)
18011 {
18012 if (regno0 != regno1)
18013 split_cost += 1;
18014 else if (regno2 == regno0)
18015 split_cost += 4;
18016 else
18017 split_cost += parts.scale;
18018 }
18019
18020 /* Have to use an add instruction with an immediate if
18021 disp is non-zero. */
18022 if (parts.disp && parts.disp != const0_rtx)
18023 split_cost += 1;
18024
18025 /* Subtract the price of lea. */
18026 split_cost -= 1;
18027 }
18028
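/* A worked example (illustrative only): for an address like
   base + index*2 + 4 with a destination distinct from base and index, the
   costs above add up to 1 (mov) + 1 (combining base and index)
   + 1 (shift for the scale) + 1 (add of the displacement)
   - 1 (the lea itself) = 3.  */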
18029 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
18030 parts.scale > 1);
18031 }
18032
18033 /* Emit x86 binary operator CODE in mode MODE, where the first operand
18034 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
18035
18036 static void
18037 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
18038 rtx dst, rtx src)
18039 {
18040 rtx op, clob;
18041
18042 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
18043 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18044
18045 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
18046 }
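/* For example, ix86_emit_binop (PLUS, SImode, dst, src) emits
   (parallel [(set dst (plus:SI dst src))
              (clobber (reg:CC FLAGS_REG))]).  */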
18047
18048 /* Return true if the definition of regno1 is nearest to the insn. */
18049
18050 static bool
18051 find_nearest_reg_def (rtx insn, int regno1, int regno2)
18052 {
18053 rtx prev = insn;
18054 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
18055
18056 if (insn == start)
18057 return false;
18058 while (prev && prev != start)
18059 {
18060 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
18061 {
18062 prev = PREV_INSN (prev);
18063 continue;
18064 }
18065 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
18066 return true;
18067 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
18068 return false;
18069 prev = PREV_INSN (prev);
18070 }
18071
18072 /* None of the regs is defined in the bb. */
18073 return false;
18074 }
18075
18076 /* Split an lea instruction into a sequence of instructions
18077 which are executed on the ALU to avoid AGU stalls.
18078 It is assumed that it is allowed to clobber the flags register
18079 at the lea position. */
18080
18081 void
18082 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
18083 {
18084 unsigned int regno0, regno1, regno2;
18085 struct ix86_address parts;
18086 rtx target, tmp;
18087 int ok, adds;
18088
18089 ok = ix86_decompose_address (operands[1], &parts);
18090 gcc_assert (ok);
18091
18092 target = gen_lowpart (mode, operands[0]);
18093
18094 regno0 = true_regnum (target);
18095 regno1 = INVALID_REGNUM;
18096 regno2 = INVALID_REGNUM;
18097
18098 if (parts.base)
18099 {
18100 parts.base = gen_lowpart (mode, parts.base);
18101 regno1 = true_regnum (parts.base);
18102 }
18103
18104 if (parts.index)
18105 {
18106 parts.index = gen_lowpart (mode, parts.index);
18107 regno2 = true_regnum (parts.index);
18108 }
18109
18110 if (parts.disp)
18111 parts.disp = gen_lowpart (mode, parts.disp);
18112
18113 if (parts.scale > 1)
18114 {
18115 /* Case r1 = r1 + ... */
18116 if (regno1 == regno0)
18117 {
18118 /* If we have the case r1 = r1 + C * r1, then we
18119 would have to use multiplication, which is very
18120 expensive. Assume the cost model is wrong if we
18121 reach such a case here. */
18122 gcc_assert (regno2 != regno0);
18123
18124 for (adds = parts.scale; adds > 0; adds--)
18125 ix86_emit_binop (PLUS, mode, target, parts.index);
18126 }
18127 else
18128 {
18129 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
18130 if (regno0 != regno2)
18131 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
18132
18133 /* Use shift for scaling. */
18134 ix86_emit_binop (ASHIFT, mode, target,
18135 GEN_INT (exact_log2 (parts.scale)));
18136
18137 if (parts.base)
18138 ix86_emit_binop (PLUS, mode, target, parts.base);
18139
18140 if (parts.disp && parts.disp != const0_rtx)
18141 ix86_emit_binop (PLUS, mode, target, parts.disp);
18142 }
18143 }
18144 else if (!parts.base && !parts.index)
18145 {
18146 gcc_assert (parts.disp);
18147 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
18148 }
18149 else
18150 {
18151 if (!parts.base)
18152 {
18153 if (regno0 != regno2)
18154 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
18155 }
18156 else if (!parts.index)
18157 {
18158 if (regno0 != regno1)
18159 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
18160 }
18161 else
18162 {
18163 if (regno0 == regno1)
18164 tmp = parts.index;
18165 else if (regno0 == regno2)
18166 tmp = parts.base;
18167 else
18168 {
18169 rtx tmp1;
18170
18171 /* Find better operand for SET instruction, depending
18172 on which definition is farther from the insn. */
18173 if (find_nearest_reg_def (insn, regno1, regno2))
18174 tmp = parts.index, tmp1 = parts.base;
18175 else
18176 tmp = parts.base, tmp1 = parts.index;
18177
18178 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
18179
18180 if (parts.disp && parts.disp != const0_rtx)
18181 ix86_emit_binop (PLUS, mode, target, parts.disp);
18182
18183 ix86_emit_binop (PLUS, mode, target, tmp1);
18184 return;
18185 }
18186
18187 ix86_emit_binop (PLUS, mode, target, tmp);
18188 }
18189
18190 if (parts.disp && parts.disp != const0_rtx)
18191 ix86_emit_binop (PLUS, mode, target, parts.disp);
18192 }
18193 }
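/* A sketch of the effect of ix86_split_lea_for_addr above, with
   illustrative registers: for
     lea 0x8(%rbx,%rcx,4), %rax
   where %rax is distinct from %rbx and %rcx, the emitted sequence is roughly
     mov %rcx, %rax
     shl $2, %rax
     add %rbx, %rax
     add $8, %rax  */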
18194
18195 /* Return true if it is ok to optimize an ADD operation to an LEA
18196 operation to avoid flag register consumption. For most processors,
18197 ADD is faster than LEA. For processors like ATOM, if the
18198 destination register of the LEA holds an actual address which will be
18199 used soon, LEA is better; otherwise ADD is better. */
18200
18201 bool
18202 ix86_lea_for_add_ok (rtx insn, rtx operands[])
18203 {
18204 unsigned int regno0 = true_regnum (operands[0]);
18205 unsigned int regno1 = true_regnum (operands[1]);
18206 unsigned int regno2 = true_regnum (operands[2]);
18207
18208 /* If a = b + c, (a!=b && a!=c), must use lea form. */
18209 if (regno0 != regno1 && regno0 != regno2)
18210 return true;
18211
18212 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
18213 return false;
18214
18215 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
18216 }
18217
18218 /* Return true if destination reg of SET_BODY is shift count of
18219 USE_BODY. */
18220
18221 static bool
18222 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
18223 {
18224 rtx set_dest;
18225 rtx shift_rtx;
18226 int i;
18227
18228 /* Retrieve destination of SET_BODY. */
18229 switch (GET_CODE (set_body))
18230 {
18231 case SET:
18232 set_dest = SET_DEST (set_body);
18233 if (!set_dest || !REG_P (set_dest))
18234 return false;
18235 break;
18236 case PARALLEL:
18237 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
18238 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
18239 use_body))
18240 return true;
18241 default:
18242 return false;
18243 break;
18244 }
18245
18246 /* Retrieve shift count of USE_BODY. */
18247 switch (GET_CODE (use_body))
18248 {
18249 case SET:
18250 shift_rtx = XEXP (use_body, 1);
18251 break;
18252 case PARALLEL:
18253 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
18254 if (ix86_dep_by_shift_count_body (set_body,
18255 XVECEXP (use_body, 0, i)))
18256 return true;
18257 default:
18258 return false;
18259 break;
18260 }
18261
18262 if (shift_rtx
18263 && (GET_CODE (shift_rtx) == ASHIFT
18264 || GET_CODE (shift_rtx) == LSHIFTRT
18265 || GET_CODE (shift_rtx) == ASHIFTRT
18266 || GET_CODE (shift_rtx) == ROTATE
18267 || GET_CODE (shift_rtx) == ROTATERT))
18268 {
18269 rtx shift_count = XEXP (shift_rtx, 1);
18270
18271 /* Return true if shift count is dest of SET_BODY. */
18272 if (REG_P (shift_count))
18273 {
18274 /* Add check since it can be invoked before register
18275 allocation in pre-reload schedule. */
18276 if (reload_completed
18277 && true_regnum (set_dest) == true_regnum (shift_count))
18278 return true;
18279 else if (REGNO (set_dest) == REGNO (shift_count))
18280 return true;
18281 }
18282 }
18283
18284 return false;
18285 }
18286
18287 /* Return true if destination reg of SET_INSN is shift count of
18288 USE_INSN. */
18289
18290 bool
18291 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
18292 {
18293 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
18294 PATTERN (use_insn));
18295 }
18296
18297 /* Return TRUE or FALSE depending on whether the unary operator meets the
18298 appropriate constraints. */
18299
18300 bool
18301 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
18302 enum machine_mode mode ATTRIBUTE_UNUSED,
18303 rtx operands[2])
18304 {
18305 /* If one of operands is memory, source and destination must match. */
18306 if ((MEM_P (operands[0])
18307 || MEM_P (operands[1]))
18308 && ! rtx_equal_p (operands[0], operands[1]))
18309 return false;
18310 return true;
18311 }
18312
18313 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
18314 are ok, keeping in mind the possible movddup alternative. */
18315
18316 bool
18317 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
18318 {
18319 if (MEM_P (operands[0]))
18320 return rtx_equal_p (operands[0], operands[1 + high]);
18321 if (MEM_P (operands[1]) && MEM_P (operands[2]))
18322 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
18323 return true;
18324 }
18325
18326 /* Post-reload splitter for converting an SF or DFmode value in an
18327 SSE register into an unsigned SImode. */
18328
18329 void
18330 ix86_split_convert_uns_si_sse (rtx operands[])
18331 {
18332 enum machine_mode vecmode;
18333 rtx value, large, zero_or_two31, input, two31, x;
18334
18335 large = operands[1];
18336 zero_or_two31 = operands[2];
18337 input = operands[3];
18338 two31 = operands[4];
18339 vecmode = GET_MODE (large);
18340 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
18341
18342 /* Load up the value into the low element. We must ensure that the other
18343 elements are valid floats -- zero is the easiest such value. */
18344 if (MEM_P (input))
18345 {
18346 if (vecmode == V4SFmode)
18347 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
18348 else
18349 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
18350 }
18351 else
18352 {
18353 input = gen_rtx_REG (vecmode, REGNO (input));
18354 emit_move_insn (value, CONST0_RTX (vecmode));
18355 if (vecmode == V4SFmode)
18356 emit_insn (gen_sse_movss (value, value, input));
18357 else
18358 emit_insn (gen_sse2_movsd (value, value, input));
18359 }
18360
18361 emit_move_insn (large, two31);
18362 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
18363
18364 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
18365 emit_insn (gen_rtx_SET (VOIDmode, large, x));
18366
18367 x = gen_rtx_AND (vecmode, zero_or_two31, large);
18368 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
18369
18370 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
18371 emit_insn (gen_rtx_SET (VOIDmode, value, x));
18372
18373 large = gen_rtx_REG (V4SImode, REGNO (large));
18374 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
18375
18376 x = gen_rtx_REG (V4SImode, REGNO (value));
18377 if (vecmode == V4SFmode)
18378 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
18379 else
18380 emit_insn (gen_sse2_cvttpd2dq (x, value));
18381 value = x;
18382
18383 emit_insn (gen_xorv4si3 (value, value, large));
18384 }
18385
18386 /* Convert an unsigned DImode value into a DFmode, using only SSE.
18387 Expects the 64-bit DImode to be supplied in a pair of integral
18388 registers. Requires SSE2; will use SSE3 if available. For x86_32,
18389 -mfpmath=sse, !optimize_size only. */
18390
18391 void
18392 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
18393 {
18394 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
18395 rtx int_xmm, fp_xmm;
18396 rtx biases, exponents;
18397 rtx x;
18398
18399 int_xmm = gen_reg_rtx (V4SImode);
18400 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
18401 emit_insn (gen_movdi_to_sse (int_xmm, input));
18402 else if (TARGET_SSE_SPLIT_REGS)
18403 {
18404 emit_clobber (int_xmm);
18405 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
18406 }
18407 else
18408 {
18409 x = gen_reg_rtx (V2DImode);
18410 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
18411 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
18412 }
18413
18414 x = gen_rtx_CONST_VECTOR (V4SImode,
18415 gen_rtvec (4, GEN_INT (0x43300000UL),
18416 GEN_INT (0x45300000UL),
18417 const0_rtx, const0_rtx));
18418 exponents = validize_mem (force_const_mem (V4SImode, x));
18419
18420 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
18421 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
18422
18423 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
18424 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
18425 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
18426 (0x1.0p84 + double(fp_value_hi_xmm)).
18427 Note these exponents differ by 32. */
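/* A short sketch of the arithmetic: if the input is u = hi * 2**32 + lo
   with 0 <= lo, hi < 2**32, the two doubles built here are
   (0x1.0p52 + lo) and (0x1.0p84 + hi * 2**32).  Subtracting the 0x1.0p52
   and 0x1.0p84 biases below leaves lo and hi * 2**32, whose sum is u
   (rounded to double precision once u >= 2**53).  */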
18428
18429 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
18430
18431 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
18432 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
18433 real_ldexp (&bias_lo_rvt, &dconst1, 52);
18434 real_ldexp (&bias_hi_rvt, &dconst1, 84);
18435 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
18436 x = const_double_from_real_value (bias_hi_rvt, DFmode);
18437 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
18438 biases = validize_mem (force_const_mem (V2DFmode, biases));
18439 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
18440
18441 /* Add the upper and lower DFmode values together. */
18442 if (TARGET_SSE3)
18443 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
18444 else
18445 {
18446 x = copy_to_mode_reg (V2DFmode, fp_xmm);
18447 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
18448 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
18449 }
18450
18451 ix86_expand_vector_extract (false, target, fp_xmm, 0);
18452 }
18453
18454 /* Not used, but eases macroization of patterns. */
18455 void
18456 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
18457 rtx input ATTRIBUTE_UNUSED)
18458 {
18459 gcc_unreachable ();
18460 }
18461
18462 /* Convert an unsigned SImode value into a DFmode. Only currently used
18463 for SSE, but applicable anywhere. */
18464
18465 void
18466 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
18467 {
18468 REAL_VALUE_TYPE TWO31r;
18469 rtx x, fp;
18470
18471 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
18472 NULL, 1, OPTAB_DIRECT);
18473
18474 fp = gen_reg_rtx (DFmode);
18475 emit_insn (gen_floatsidf2 (fp, x));
18476
18477 real_ldexp (&TWO31r, &dconst1, 31);
18478 x = const_double_from_real_value (TWO31r, DFmode);
18479
18480 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
18481 if (x != target)
18482 emit_move_insn (target, x);
18483 }
18484
18485 /* Convert a signed DImode value into a DFmode. Only used for SSE in
18486 32-bit mode; otherwise we have a direct convert instruction. */
18487
18488 void
18489 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
18490 {
18491 REAL_VALUE_TYPE TWO32r;
18492 rtx fp_lo, fp_hi, x;
18493
18494 fp_lo = gen_reg_rtx (DFmode);
18495 fp_hi = gen_reg_rtx (DFmode);
18496
18497 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
18498
18499 real_ldexp (&TWO32r, &dconst1, 32);
18500 x = const_double_from_real_value (TWO32r, DFmode);
18501 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
18502
18503 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
18504
18505 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
18506 0, OPTAB_DIRECT);
18507 if (x != target)
18508 emit_move_insn (target, x);
18509 }
18510
18511 /* Convert an unsigned SImode value into a SFmode, using only SSE.
18512 For x86_32, -mfpmath=sse, !optimize_size only. */
18513 void
18514 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
18515 {
18516 REAL_VALUE_TYPE ONE16r;
18517 rtx fp_hi, fp_lo, int_hi, int_lo, x;
18518
18519 real_ldexp (&ONE16r, &dconst1, 16);
18520 x = const_double_from_real_value (ONE16r, SFmode);
18521 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
18522 NULL, 0, OPTAB_DIRECT);
18523 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
18524 NULL, 0, OPTAB_DIRECT);
18525 fp_hi = gen_reg_rtx (SFmode);
18526 fp_lo = gen_reg_rtx (SFmode);
18527 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
18528 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
18529 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
18530 0, OPTAB_DIRECT);
18531 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
18532 0, OPTAB_DIRECT);
18533 if (!rtx_equal_p (target, fp_hi))
18534 emit_move_insn (target, fp_hi);
18535 }
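/* The arithmetic above, spelled out: u = hi * 2**16 + lo with
   0 <= lo, hi < 2**16.  Both halves convert to SFmode exactly, and
   fp_hi * 2**16 + fp_lo reconstructs u (rounded to float precision once
   u >= 2**24).  The vector variant below uses the same decomposition.  */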
18536
18537 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
18538 a vector of unsigned ints VAL to vector of floats TARGET. */
18539
18540 void
18541 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
18542 {
18543 rtx tmp[8];
18544 REAL_VALUE_TYPE TWO16r;
18545 enum machine_mode intmode = GET_MODE (val);
18546 enum machine_mode fltmode = GET_MODE (target);
18547 rtx (*cvt) (rtx, rtx);
18548
18549 if (intmode == V4SImode)
18550 cvt = gen_floatv4siv4sf2;
18551 else
18552 cvt = gen_floatv8siv8sf2;
18553 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
18554 tmp[0] = force_reg (intmode, tmp[0]);
18555 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
18556 OPTAB_DIRECT);
18557 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
18558 NULL_RTX, 1, OPTAB_DIRECT);
18559 tmp[3] = gen_reg_rtx (fltmode);
18560 emit_insn (cvt (tmp[3], tmp[1]));
18561 tmp[4] = gen_reg_rtx (fltmode);
18562 emit_insn (cvt (tmp[4], tmp[2]));
18563 real_ldexp (&TWO16r, &dconst1, 16);
18564 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
18565 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
18566 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
18567 OPTAB_DIRECT);
18568 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
18569 OPTAB_DIRECT);
18570 if (tmp[7] != target)
18571 emit_move_insn (target, tmp[7]);
18572 }
18573
18574 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
18575 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
18576 This is done by doing just signed conversion if < 0x1p31, and otherwise by
18577 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
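/* In other words: for val >= 0x1p31 the code computes val - 0x1p31, which
   the signed conversion turns into an integer in [0, 2**31); xoring in
   0x80000000 afterwards sets the (previously clear) top bit, i.e. adds
   2**31 back, yielding the correct unsigned result.  Values below 0x1p31
   are left alone and get a zero mask in *XORP.  */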
18578
18579 rtx
18580 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
18581 {
18582 REAL_VALUE_TYPE TWO31r;
18583 rtx two31r, tmp[4];
18584 enum machine_mode mode = GET_MODE (val);
18585 enum machine_mode scalarmode = GET_MODE_INNER (mode);
18586 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
18587 rtx (*cmp) (rtx, rtx, rtx, rtx);
18588 int i;
18589
18590 for (i = 0; i < 3; i++)
18591 tmp[i] = gen_reg_rtx (mode);
18592 real_ldexp (&TWO31r, &dconst1, 31);
18593 two31r = const_double_from_real_value (TWO31r, scalarmode);
18594 two31r = ix86_build_const_vector (mode, 1, two31r);
18595 two31r = force_reg (mode, two31r);
18596 switch (mode)
18597 {
18598 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
18599 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
18600 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
18601 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
18602 default: gcc_unreachable ();
18603 }
18604 tmp[3] = gen_rtx_LE (mode, two31r, val);
18605 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
18606 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
18607 0, OPTAB_DIRECT);
18608 if (intmode == V4SImode || TARGET_AVX2)
18609 *xorp = expand_simple_binop (intmode, ASHIFT,
18610 gen_lowpart (intmode, tmp[0]),
18611 GEN_INT (31), NULL_RTX, 0,
18612 OPTAB_DIRECT);
18613 else
18614 {
18615 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
18616 two31 = ix86_build_const_vector (intmode, 1, two31);
18617 *xorp = expand_simple_binop (intmode, AND,
18618 gen_lowpart (intmode, tmp[0]),
18619 two31, NULL_RTX, 0,
18620 OPTAB_DIRECT);
18621 }
18622 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
18623 0, OPTAB_DIRECT);
18624 }
18625
18626 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
18627 then replicate the value for all elements of the vector
18628 register. */
18629
18630 rtx
18631 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
18632 {
18633 int i, n_elt;
18634 rtvec v;
18635 enum machine_mode scalar_mode;
18636
18637 switch (mode)
18638 {
18639 case V32QImode:
18640 case V16QImode:
18641 case V16HImode:
18642 case V8HImode:
18643 case V8SImode:
18644 case V4SImode:
18645 case V4DImode:
18646 case V2DImode:
18647 gcc_assert (vect);
18648 case V8SFmode:
18649 case V4SFmode:
18650 case V4DFmode:
18651 case V2DFmode:
18652 n_elt = GET_MODE_NUNITS (mode);
18653 v = rtvec_alloc (n_elt);
18654 scalar_mode = GET_MODE_INNER (mode);
18655
18656 RTVEC_ELT (v, 0) = value;
18657
18658 for (i = 1; i < n_elt; ++i)
18659 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18660
18661 return gen_rtx_CONST_VECTOR (mode, v);
18662
18663 default:
18664 gcc_unreachable ();
18665 }
18666 }
18667
18668 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18669 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18670 for an SSE register. If VECT is true, then replicate the mask for
18671 all elements of the vector register. If INVERT is true, then create
18672 a mask excluding the sign bit. */
18673
18674 rtx
18675 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18676 {
18677 enum machine_mode vec_mode, imode;
18678 HOST_WIDE_INT hi, lo;
18679 int shift = 63;
18680 rtx v;
18681 rtx mask;
18682
18683 /* Find the sign bit, sign extended to 2*HWI. */
18684 switch (mode)
18685 {
18686 case V8SImode:
18687 case V4SImode:
18688 case V8SFmode:
18689 case V4SFmode:
18690 vec_mode = mode;
18691 mode = GET_MODE_INNER (mode);
18692 imode = SImode;
18693 lo = 0x80000000, hi = lo < 0;
18694 break;
18695
18696 case V4DImode:
18697 case V2DImode:
18698 case V4DFmode:
18699 case V2DFmode:
18700 vec_mode = mode;
18701 mode = GET_MODE_INNER (mode);
18702 imode = DImode;
18703 if (HOST_BITS_PER_WIDE_INT >= 64)
18704 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18705 else
18706 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18707 break;
18708
18709 case TImode:
18710 case TFmode:
18711 vec_mode = VOIDmode;
18712 if (HOST_BITS_PER_WIDE_INT >= 64)
18713 {
18714 imode = TImode;
18715 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18716 }
18717 else
18718 {
18719 rtvec vec;
18720
18721 imode = DImode;
18722 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18723
18724 if (invert)
18725 {
18726 lo = ~lo, hi = ~hi;
18727 v = constm1_rtx;
18728 }
18729 else
18730 v = const0_rtx;
18731
18732 mask = immed_double_const (lo, hi, imode);
18733
18734 vec = gen_rtvec (2, v, mask);
18735 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18736 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18737
18738 return v;
18739 }
18740 break;
18741
18742 default:
18743 gcc_unreachable ();
18744 }
18745
18746 if (invert)
18747 lo = ~lo, hi = ~hi;
18748
18749 /* Force this value into the low part of a fp vector constant. */
18750 mask = immed_double_const (lo, hi, imode);
18751 mask = gen_lowpart (mode, mask);
18752
18753 if (vec_mode == VOIDmode)
18754 return force_reg (mode, mask);
18755
18756 v = ix86_build_const_vector (vec_mode, vect, mask);
18757 return force_reg (vec_mode, v);
18758 }
18759
18760 /* Generate code for floating point ABS or NEG. */
18761
18762 void
18763 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18764 rtx operands[])
18765 {
18766 rtx mask, set, dst, src;
18767 bool use_sse = false;
18768 bool vector_mode = VECTOR_MODE_P (mode);
18769 enum machine_mode vmode = mode;
18770
18771 if (vector_mode)
18772 use_sse = true;
18773 else if (mode == TFmode)
18774 use_sse = true;
18775 else if (TARGET_SSE_MATH)
18776 {
18777 use_sse = SSE_FLOAT_MODE_P (mode);
18778 if (mode == SFmode)
18779 vmode = V4SFmode;
18780 else if (mode == DFmode)
18781 vmode = V2DFmode;
18782 }
18783
18784 /* NEG and ABS performed with SSE use bitwise mask operations.
18785 Create the appropriate mask now. */
18786 if (use_sse)
18787 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18788 else
18789 mask = NULL_RTX;
18790
18791 dst = operands[0];
18792 src = operands[1];
18793
18794 set = gen_rtx_fmt_e (code, mode, src);
18795 set = gen_rtx_SET (VOIDmode, dst, set);
18796
18797 if (mask)
18798 {
18799 rtx use, clob;
18800 rtvec par;
18801
18802 use = gen_rtx_USE (VOIDmode, mask);
18803 if (vector_mode)
18804 par = gen_rtvec (2, set, use);
18805 else
18806 {
18807 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18808 par = gen_rtvec (3, set, use, clob);
18809 }
18810 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18811 }
18812 else
18813 emit_insn (set);
18814 }
18815
18816 /* Expand a copysign operation. Special case operand 0 being a constant. */
18817
18818 void
18819 ix86_expand_copysign (rtx operands[])
18820 {
18821 enum machine_mode mode, vmode;
18822 rtx dest, op0, op1, mask, nmask;
18823
18824 dest = operands[0];
18825 op0 = operands[1];
18826 op1 = operands[2];
18827
18828 mode = GET_MODE (dest);
18829
18830 if (mode == SFmode)
18831 vmode = V4SFmode;
18832 else if (mode == DFmode)
18833 vmode = V2DFmode;
18834 else
18835 vmode = mode;
18836
18837 if (GET_CODE (op0) == CONST_DOUBLE)
18838 {
18839 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18840
18841 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18842 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18843
18844 if (mode == SFmode || mode == DFmode)
18845 {
18846 if (op0 == CONST0_RTX (mode))
18847 op0 = CONST0_RTX (vmode);
18848 else
18849 {
18850 rtx v = ix86_build_const_vector (vmode, false, op0);
18851
18852 op0 = force_reg (vmode, v);
18853 }
18854 }
18855 else if (op0 != CONST0_RTX (mode))
18856 op0 = force_reg (mode, op0);
18857
18858 mask = ix86_build_signbit_mask (vmode, 0, 0);
18859
18860 if (mode == SFmode)
18861 copysign_insn = gen_copysignsf3_const;
18862 else if (mode == DFmode)
18863 copysign_insn = gen_copysigndf3_const;
18864 else
18865 copysign_insn = gen_copysigntf3_const;
18866
18867 emit_insn (copysign_insn (dest, op0, op1, mask));
18868 }
18869 else
18870 {
18871 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18872
18873 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18874 mask = ix86_build_signbit_mask (vmode, 0, 0);
18875
18876 if (mode == SFmode)
18877 copysign_insn = gen_copysignsf3_var;
18878 else if (mode == DFmode)
18879 copysign_insn = gen_copysigndf3_var;
18880 else
18881 copysign_insn = gen_copysigntf3_var;
18882
18883 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18884 }
18885 }
18886
18887 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18888 be a constant, and so has already been expanded into a vector constant. */
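/* In effect (a sketch, assuming the matching insn pattern ties the
   destination register to the sign-source operand): the splitter computes
   dest = (sign_src & SIGN_MASK) | |op0|, i.e. the sign bit comes from the
   variable operand and the magnitude from the non-negative constant.  */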
18889
18890 void
18891 ix86_split_copysign_const (rtx operands[])
18892 {
18893 enum machine_mode mode, vmode;
18894 rtx dest, op0, mask, x;
18895
18896 dest = operands[0];
18897 op0 = operands[1];
18898 mask = operands[3];
18899
18900 mode = GET_MODE (dest);
18901 vmode = GET_MODE (mask);
18902
18903 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18904 x = gen_rtx_AND (vmode, dest, mask);
18905 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18906
18907 if (op0 != CONST0_RTX (vmode))
18908 {
18909 x = gen_rtx_IOR (vmode, dest, op0);
18910 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18911 }
18912 }
18913
18914 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18915 so we have to do two masks. */
18916
18917 void
18918 ix86_split_copysign_var (rtx operands[])
18919 {
18920 enum machine_mode mode, vmode;
18921 rtx dest, scratch, op0, op1, mask, nmask, x;
18922
18923 dest = operands[0];
18924 scratch = operands[1];
18925 op0 = operands[2];
18926 op1 = operands[3];
18927 nmask = operands[4];
18928 mask = operands[5];
18929
18930 mode = GET_MODE (dest);
18931 vmode = GET_MODE (mask);
18932
18933 if (rtx_equal_p (op0, op1))
18934 {
18935 /* Shouldn't happen often (it's useless, obviously), but when it does
18936 we'd generate incorrect code if we continue below. */
18937 emit_move_insn (dest, op0);
18938 return;
18939 }
18940
18941 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18942 {
18943 gcc_assert (REGNO (op1) == REGNO (scratch));
18944
18945 x = gen_rtx_AND (vmode, scratch, mask);
18946 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18947
18948 dest = mask;
18949 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18950 x = gen_rtx_NOT (vmode, dest);
18951 x = gen_rtx_AND (vmode, x, op0);
18952 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18953 }
18954 else
18955 {
18956 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18957 {
18958 x = gen_rtx_AND (vmode, scratch, mask);
18959 }
18960 else /* alternative 2,4 */
18961 {
18962 gcc_assert (REGNO (mask) == REGNO (scratch));
18963 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18964 x = gen_rtx_AND (vmode, scratch, op1);
18965 }
18966 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18967
18968 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18969 {
18970 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18971 x = gen_rtx_AND (vmode, dest, nmask);
18972 }
18973 else /* alternative 3,4 */
18974 {
18975 gcc_assert (REGNO (nmask) == REGNO (dest));
18976 dest = nmask;
18977 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18978 x = gen_rtx_AND (vmode, dest, op0);
18979 }
18980 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18981 }
18982
18983 x = gen_rtx_IOR (vmode, dest, scratch);
18984 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18985 }
18986
18987 /* Return TRUE or FALSE depending on whether the first SET in INSN
18988 has source and destination with matching CC modes, and that the
18989 CC mode is at least as constrained as REQ_MODE. */
18990
18991 bool
18992 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18993 {
18994 rtx set;
18995 enum machine_mode set_mode;
18996
18997 set = PATTERN (insn);
18998 if (GET_CODE (set) == PARALLEL)
18999 set = XVECEXP (set, 0, 0);
19000 gcc_assert (GET_CODE (set) == SET);
19001 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
19002
19003 set_mode = GET_MODE (SET_DEST (set));
19004 switch (set_mode)
19005 {
19006 case CCNOmode:
19007 if (req_mode != CCNOmode
19008 && (req_mode != CCmode
19009 || XEXP (SET_SRC (set), 1) != const0_rtx))
19010 return false;
19011 break;
19012 case CCmode:
19013 if (req_mode == CCGCmode)
19014 return false;
19015 /* FALLTHRU */
19016 case CCGCmode:
19017 if (req_mode == CCGOCmode || req_mode == CCNOmode)
19018 return false;
19019 /* FALLTHRU */
19020 case CCGOCmode:
19021 if (req_mode == CCZmode)
19022 return false;
19023 /* FALLTHRU */
19024 case CCZmode:
19025 break;
19026
19027 case CCAmode:
19028 case CCCmode:
19029 case CCOmode:
19030 case CCSmode:
19031 if (set_mode != req_mode)
19032 return false;
19033 break;
19034
19035 default:
19036 gcc_unreachable ();
19037 }
19038
19039 return GET_MODE (SET_SRC (set)) == set_mode;
19040 }
19041
19042 /* Generate insn patterns to do an integer compare of OPERANDS. */
19043
19044 static rtx
19045 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
19046 {
19047 enum machine_mode cmpmode;
19048 rtx tmp, flags;
19049
19050 cmpmode = SELECT_CC_MODE (code, op0, op1);
19051 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
19052
19053 /* This is very simple, but making the interface the same as in the
19054 FP case makes the rest of the code easier. */
19055 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
19056 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
19057
19058 /* Return the test that should be put into the flags user, i.e.
19059 the bcc, scc, or cmov instruction. */
19060 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
19061 }
19062
19063 /* Figure out whether to use ordered or unordered fp comparisons.
19064 Return the appropriate mode to use. */
19065
19066 enum machine_mode
19067 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
19068 {
19069 /* ??? In order to make all comparisons reversible, we do all comparisons
19070 non-trapping when compiling for IEEE. Once gcc is able to distinguish
19071 all forms of trapping and nontrapping comparisons, we can make inequality
19072 comparisons trapping again, since that results in better code when using
19073 FCOM based compares. */
19074 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
19075 }
19076
19077 enum machine_mode
19078 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
19079 {
19080 enum machine_mode mode = GET_MODE (op0);
19081
19082 if (SCALAR_FLOAT_MODE_P (mode))
19083 {
19084 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19085 return ix86_fp_compare_mode (code);
19086 }
19087
19088 switch (code)
19089 {
19090 /* Only zero flag is needed. */
19091 case EQ: /* ZF=0 */
19092 case NE: /* ZF!=0 */
19093 return CCZmode;
19094 /* Codes needing carry flag. */
19095 case GEU: /* CF=0 */
19096 case LTU: /* CF=1 */
19097 /* Detect overflow checks. They need just the carry flag. */
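/* For example, the unsigned overflow test "a + b < a" is expanded as a
   compare of (plus a b) against a with LTU, which only needs CF.  */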
19098 if (GET_CODE (op0) == PLUS
19099 && rtx_equal_p (op1, XEXP (op0, 0)))
19100 return CCCmode;
19101 else
19102 return CCmode;
19103 case GTU: /* CF=0 & ZF=0 */
19104 case LEU: /* CF=1 | ZF=1 */
19105 return CCmode;
19106 /* Codes possibly doable only with sign flag when
19107 comparing against zero. */
19108 case GE: /* SF=OF or SF=0 */
19109 case LT: /* SF<>OF or SF=1 */
19110 if (op1 == const0_rtx)
19111 return CCGOCmode;
19112 else
19113 /* For other cases Carry flag is not required. */
19114 return CCGCmode;
19115 /* Codes doable only with the sign flag when comparing
19116 against zero, but we miss the jump instruction for it,
19117 so we need to use relational tests against overflow,
19118 which thus needs to be zero. */
19119 case GT: /* ZF=0 & SF=OF */
19120 case LE: /* ZF=1 | SF<>OF */
19121 if (op1 == const0_rtx)
19122 return CCNOmode;
19123 else
19124 return CCGCmode;
19125 /* The strcmp pattern does (use flags) and combine may ask us for the
19126 proper mode. */
19127 case USE:
19128 return CCmode;
19129 default:
19130 gcc_unreachable ();
19131 }
19132 }
19133
19134 /* Return the fixed registers used for condition codes. */
19135
19136 static bool
19137 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
19138 {
19139 *p1 = FLAGS_REG;
19140 *p2 = FPSR_REG;
19141 return true;
19142 }
19143
19144 /* If two condition code modes are compatible, return a condition code
19145 mode which is compatible with both. Otherwise, return
19146 VOIDmode. */
19147
19148 static enum machine_mode
19149 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
19150 {
19151 if (m1 == m2)
19152 return m1;
19153
19154 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
19155 return VOIDmode;
19156
19157 if ((m1 == CCGCmode && m2 == CCGOCmode)
19158 || (m1 == CCGOCmode && m2 == CCGCmode))
19159 return CCGCmode;
19160
19161 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
19162 return m2;
19163 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
19164 return m1;
19165
19166 switch (m1)
19167 {
19168 default:
19169 gcc_unreachable ();
19170
19171 case CCmode:
19172 case CCGCmode:
19173 case CCGOCmode:
19174 case CCNOmode:
19175 case CCAmode:
19176 case CCCmode:
19177 case CCOmode:
19178 case CCSmode:
19179 case CCZmode:
19180 switch (m2)
19181 {
19182 default:
19183 return VOIDmode;
19184
19185 case CCmode:
19186 case CCGCmode:
19187 case CCGOCmode:
19188 case CCNOmode:
19189 case CCAmode:
19190 case CCCmode:
19191 case CCOmode:
19192 case CCSmode:
19193 case CCZmode:
19194 return CCmode;
19195 }
19196
19197 case CCFPmode:
19198 case CCFPUmode:
19199 /* These are only compatible with themselves, which we already
19200 checked above. */
19201 return VOIDmode;
19202 }
19203 }
19204
19205
19206 /* Return a comparison we can do that is equivalent to
19207 swap_condition (code), apart possibly from orderedness.
19208 But never change orderedness if TARGET_IEEE_FP, returning
19209 UNKNOWN in that case if necessary. */
19210
19211 static enum rtx_code
19212 ix86_fp_swap_condition (enum rtx_code code)
19213 {
19214 switch (code)
19215 {
19216 case GT: /* GTU - CF=0 & ZF=0 */
19217 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
19218 case GE: /* GEU - CF=0 */
19219 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
19220 case UNLT: /* LTU - CF=1 */
19221 return TARGET_IEEE_FP ? UNKNOWN : GT;
19222 case UNLE: /* LEU - CF=1 | ZF=1 */
19223 return TARGET_IEEE_FP ? UNKNOWN : GE;
19224 default:
19225 return swap_condition (code);
19226 }
19227 }
19228
19229 /* Return the cost of comparison CODE using the best strategy for performance.
19230 All the following functions use the number of instructions as the cost metric.
19231 In the future this should be tweaked to compute bytes for optimize_size and
19232 to take into account the performance of various instructions on various CPUs. */
19233
19234 static int
19235 ix86_fp_comparison_cost (enum rtx_code code)
19236 {
19237 int arith_cost;
19238
19239 /* The cost of code using bit-twiddling on %ah. */
19240 switch (code)
19241 {
19242 case UNLE:
19243 case UNLT:
19244 case LTGT:
19245 case GT:
19246 case GE:
19247 case UNORDERED:
19248 case ORDERED:
19249 case UNEQ:
19250 arith_cost = 4;
19251 break;
19252 case LT:
19253 case NE:
19254 case EQ:
19255 case UNGE:
19256 arith_cost = TARGET_IEEE_FP ? 5 : 4;
19257 break;
19258 case LE:
19259 case UNGT:
19260 arith_cost = TARGET_IEEE_FP ? 6 : 4;
19261 break;
19262 default:
19263 gcc_unreachable ();
19264 }
19265
19266 switch (ix86_fp_comparison_strategy (code))
19267 {
19268 case IX86_FPCMP_COMI:
19269 return arith_cost > 4 ? 3 : 2;
19270 case IX86_FPCMP_SAHF:
19271 return arith_cost > 4 ? 4 : 3;
19272 default:
19273 return arith_cost;
19274 }
19275 }
19276
19277 /* Return the strategy to use for floating-point comparisons. We assume that
19278 fcomi is always preferable where available, since that is also true when looking
19279 at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
19280
19281 enum ix86_fpcmp_strategy
19282 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
19283 {
19284 /* Do fcomi/sahf based test when profitable. */
19285
19286 if (TARGET_CMOVE)
19287 return IX86_FPCMP_COMI;
19288
19289 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
19290 return IX86_FPCMP_SAHF;
19291
19292 return IX86_FPCMP_ARITH;
19293 }
19294
19295 /* Swap, force into registers, or otherwise massage the two operands
19296 to a fp comparison. The operands are updated in place; the new
19297 comparison code is returned. */
19298
19299 static enum rtx_code
19300 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
19301 {
19302 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
19303 rtx op0 = *pop0, op1 = *pop1;
19304 enum machine_mode op_mode = GET_MODE (op0);
19305 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
19306
19307 /* All of the unordered compare instructions only work on registers.
19308 The same is true of the fcomi compare instructions. The XFmode
19309 compare instructions require registers except when comparing
19310 against zero or when converting operand 1 from fixed point to
19311 floating point. */
19312
19313 if (!is_sse
19314 && (fpcmp_mode == CCFPUmode
19315 || (op_mode == XFmode
19316 && ! (standard_80387_constant_p (op0) == 1
19317 || standard_80387_constant_p (op1) == 1)
19318 && GET_CODE (op1) != FLOAT)
19319 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
19320 {
19321 op0 = force_reg (op_mode, op0);
19322 op1 = force_reg (op_mode, op1);
19323 }
19324 else
19325 {
19326 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
19327 things around if they appear profitable, otherwise force op0
19328 into a register. */
19329
19330 if (standard_80387_constant_p (op0) == 0
19331 || (MEM_P (op0)
19332 && ! (standard_80387_constant_p (op1) == 0
19333 || MEM_P (op1))))
19334 {
19335 enum rtx_code new_code = ix86_fp_swap_condition (code);
19336 if (new_code != UNKNOWN)
19337 {
19338 rtx tmp;
19339 tmp = op0, op0 = op1, op1 = tmp;
19340 code = new_code;
19341 }
19342 }
19343
19344 if (!REG_P (op0))
19345 op0 = force_reg (op_mode, op0);
19346
19347 if (CONSTANT_P (op1))
19348 {
19349 int tmp = standard_80387_constant_p (op1);
19350 if (tmp == 0)
19351 op1 = validize_mem (force_const_mem (op_mode, op1));
19352 else if (tmp == 1)
19353 {
19354 if (TARGET_CMOVE)
19355 op1 = force_reg (op_mode, op1);
19356 }
19357 else
19358 op1 = force_reg (op_mode, op1);
19359 }
19360 }
19361
19362 /* Try to rearrange the comparison to make it cheaper. */
19363 if (ix86_fp_comparison_cost (code)
19364 > ix86_fp_comparison_cost (swap_condition (code))
19365 && (REG_P (op1) || can_create_pseudo_p ()))
19366 {
19367 rtx tmp;
19368 tmp = op0, op0 = op1, op1 = tmp;
19369 code = swap_condition (code);
19370 if (!REG_P (op0))
19371 op0 = force_reg (op_mode, op0);
19372 }
19373
19374 *pop0 = op0;
19375 *pop1 = op1;
19376 return code;
19377 }
19378
19379 /* Convert the comparison codes we use to represent FP comparisons to the
19380 integer code that will result in a proper branch. Return UNKNOWN if no such
19381 code is available. */
19382
19383 enum rtx_code
19384 ix86_fp_compare_code_to_integer (enum rtx_code code)
19385 {
19386 switch (code)
19387 {
19388 case GT:
19389 return GTU;
19390 case GE:
19391 return GEU;
19392 case ORDERED:
19393 case UNORDERED:
19394 return code;
19395 break;
19396 case UNEQ:
19397 return EQ;
19398 break;
19399 case UNLT:
19400 return LTU;
19401 break;
19402 case UNLE:
19403 return LEU;
19404 break;
19405 case LTGT:
19406 return NE;
19407 break;
19408 default:
19409 return UNKNOWN;
19410 }
19411 }
19412
19413 /* Generate insn patterns to do a floating point compare of OPERANDS. */
19414
19415 static rtx
19416 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
19417 {
19418 enum machine_mode fpcmp_mode, intcmp_mode;
19419 rtx tmp, tmp2;
19420
19421 fpcmp_mode = ix86_fp_compare_mode (code);
19422 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
19423
19424 /* Do fcomi/sahf based test when profitable. */
19425 switch (ix86_fp_comparison_strategy (code))
19426 {
19427 case IX86_FPCMP_COMI:
19428 intcmp_mode = fpcmp_mode;
19429 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19430 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19431 tmp);
19432 emit_insn (tmp);
19433 break;
19434
19435 case IX86_FPCMP_SAHF:
19436 intcmp_mode = fpcmp_mode;
19437 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19438 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19439 tmp);
19440
19441 if (!scratch)
19442 scratch = gen_reg_rtx (HImode);
19443 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
19444 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
19445 break;
19446
19447 case IX86_FPCMP_ARITH:
19448 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
19449 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19450 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
19451 if (!scratch)
19452 scratch = gen_reg_rtx (HImode);
19453 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
19454
19455 /* In the unordered case, we have to check C2 for NaN's, which
19456 doesn't happen to work out to anything nice combination-wise.
19457 So do some bit twiddling on the value we've got in AH to come
19458 up with an appropriate set of condition codes. */
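/* For reference: after fnstsw, AH holds C0 in bit 0 (0x01), C2 in bit 2
   (0x04) and C3 in bit 6 (0x40).  fcom sets C0 for "below", C3 for
   "equal", and C3, C2 and C0 together for "unordered", which is what the
   0x45/0x44/0x40/0x05/0x04/0x01 constants below are testing.  */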
19459
19460 intcmp_mode = CCNOmode;
19461 switch (code)
19462 {
19463 case GT:
19464 case UNGT:
19465 if (code == GT || !TARGET_IEEE_FP)
19466 {
19467 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19468 code = EQ;
19469 }
19470 else
19471 {
19472 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19473 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19474 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
19475 intcmp_mode = CCmode;
19476 code = GEU;
19477 }
19478 break;
19479 case LT:
19480 case UNLT:
19481 if (code == LT && TARGET_IEEE_FP)
19482 {
19483 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19484 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
19485 intcmp_mode = CCmode;
19486 code = EQ;
19487 }
19488 else
19489 {
19490 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
19491 code = NE;
19492 }
19493 break;
19494 case GE:
19495 case UNGE:
19496 if (code == GE || !TARGET_IEEE_FP)
19497 {
19498 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
19499 code = EQ;
19500 }
19501 else
19502 {
19503 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19504 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
19505 code = NE;
19506 }
19507 break;
19508 case LE:
19509 case UNLE:
19510 if (code == LE && TARGET_IEEE_FP)
19511 {
19512 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19513 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19514 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19515 intcmp_mode = CCmode;
19516 code = LTU;
19517 }
19518 else
19519 {
19520 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19521 code = NE;
19522 }
19523 break;
19524 case EQ:
19525 case UNEQ:
19526 if (code == EQ && TARGET_IEEE_FP)
19527 {
19528 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19529 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19530 intcmp_mode = CCmode;
19531 code = EQ;
19532 }
19533 else
19534 {
19535 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19536 code = NE;
19537 }
19538 break;
19539 case NE:
19540 case LTGT:
19541 if (code == NE && TARGET_IEEE_FP)
19542 {
19543 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19544 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
19545 GEN_INT (0x40)));
19546 code = NE;
19547 }
19548 else
19549 {
19550 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19551 code = EQ;
19552 }
19553 break;
19554
19555 case UNORDERED:
19556 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19557 code = NE;
19558 break;
19559 case ORDERED:
19560 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19561 code = EQ;
19562 break;
19563
19564 default:
19565 gcc_unreachable ();
19566 }
19567 break;
19568
19569 default:
19570 gcc_unreachable ();
19571 }
19572
19573 /* Return the test that should be put into the flags user, i.e.
19574 the bcc, scc, or cmov instruction. */
19575 return gen_rtx_fmt_ee (code, VOIDmode,
19576 gen_rtx_REG (intcmp_mode, FLAGS_REG),
19577 const0_rtx);
19578 }
19579
19580 static rtx
19581 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
19582 {
19583 rtx ret;
19584
19585 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
19586 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
19587
19588 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
19589 {
19590 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
19591 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19592 }
19593 else
19594 ret = ix86_expand_int_compare (code, op0, op1);
19595
19596 return ret;
19597 }
19598
19599 void
19600 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
19601 {
19602 enum machine_mode mode = GET_MODE (op0);
19603 rtx tmp;
19604
19605 switch (mode)
19606 {
19607 case SFmode:
19608 case DFmode:
19609 case XFmode:
19610 case QImode:
19611 case HImode:
19612 case SImode:
19613 simple:
19614 tmp = ix86_expand_compare (code, op0, op1);
19615 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
19616 gen_rtx_LABEL_REF (VOIDmode, label),
19617 pc_rtx);
19618 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
19619 return;
19620
19621 case DImode:
19622 if (TARGET_64BIT)
19623 goto simple;
19624 case TImode:
19625 /* Expand DImode branch into multiple compare+branch. */
19626 {
19627 rtx lo[2], hi[2], label2;
19628 enum rtx_code code1, code2, code3;
19629 enum machine_mode submode;
19630
19631 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
19632 {
19633 tmp = op0, op0 = op1, op1 = tmp;
19634 code = swap_condition (code);
19635 }
19636
19637 split_double_mode (mode, &op0, 1, lo+0, hi+0);
19638 split_double_mode (mode, &op1, 1, lo+1, hi+1);
19639
19640 submode = mode == DImode ? SImode : DImode;
19641
19642 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
19643 avoid two branches. This costs one extra insn, so disable when
19644 optimizing for size. */
19645
19646 if ((code == EQ || code == NE)
19647 && (!optimize_insn_for_size_p ()
19648 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19649 {
19650 rtx xor0, xor1;
19651
19652 xor1 = hi[0];
19653 if (hi[1] != const0_rtx)
19654 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19655 NULL_RTX, 0, OPTAB_WIDEN);
19656
19657 xor0 = lo[0];
19658 if (lo[1] != const0_rtx)
19659 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19660 NULL_RTX, 0, OPTAB_WIDEN);
19661
19662 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19663 NULL_RTX, 0, OPTAB_WIDEN);
19664
19665 ix86_expand_branch (code, tmp, const0_rtx, label);
19666 return;
19667 }
19668
19669 /* Otherwise, if we are doing a less-than or greater-or-equal comparison,
19670 op1 is a constant and the low word is zero, then we can just
19671 examine the high word. Similarly for a low word of -1 and
19672 less-or-equal or greater-than. */
19673
19674 if (CONST_INT_P (hi[1]))
19675 switch (code)
19676 {
19677 case LT: case LTU: case GE: case GEU:
19678 if (lo[1] == const0_rtx)
19679 {
19680 ix86_expand_branch (code, hi[0], hi[1], label);
19681 return;
19682 }
19683 break;
19684 case LE: case LEU: case GT: case GTU:
19685 if (lo[1] == constm1_rtx)
19686 {
19687 ix86_expand_branch (code, hi[0], hi[1], label);
19688 return;
19689 }
19690 break;
19691 default:
19692 break;
19693 }
19694
19695 /* Otherwise, we need two or three jumps. */
19696
19697 label2 = gen_label_rtx ();
19698
19699 code1 = code;
19700 code2 = swap_condition (code);
19701 code3 = unsigned_condition (code);
19702
19703 switch (code)
19704 {
19705 case LT: case GT: case LTU: case GTU:
19706 break;
19707
19708 case LE: code1 = LT; code2 = GT; break;
19709 case GE: code1 = GT; code2 = LT; break;
19710 case LEU: code1 = LTU; code2 = GTU; break;
19711 case GEU: code1 = GTU; code2 = LTU; break;
19712
19713 case EQ: code1 = UNKNOWN; code2 = NE; break;
19714 case NE: code2 = UNKNOWN; break;
19715
19716 default:
19717 gcc_unreachable ();
19718 }
19719
19720 /*
19721 * a < b =>
19722 * if (hi(a) < hi(b)) goto true;
19723 * if (hi(a) > hi(b)) goto false;
19724 * if (lo(a) < lo(b)) goto true;
19725 * false:
19726 */
19727
19728 if (code1 != UNKNOWN)
19729 ix86_expand_branch (code1, hi[0], hi[1], label);
19730 if (code2 != UNKNOWN)
19731 ix86_expand_branch (code2, hi[0], hi[1], label2);
19732
19733 ix86_expand_branch (code3, lo[0], lo[1], label);
19734
19735 if (code2 != UNKNOWN)
19736 emit_label (label2);
19737 return;
19738 }
19739
19740 default:
19741 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19742 goto simple;
19743 }
19744 }
19745
19746 /* Split branch based on floating point condition. */
19747 void
19748 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19749 rtx target1, rtx target2, rtx tmp, rtx pushed)
19750 {
19751 rtx condition;
19752 rtx i;
19753
19754 if (target2 != pc_rtx)
19755 {
19756 rtx tmp = target2;
19757 code = reverse_condition_maybe_unordered (code);
19758 target2 = target1;
19759 target1 = tmp;
19760 }
19761
19762 condition = ix86_expand_fp_compare (code, op1, op2,
19763 tmp);
19764
19765 /* Remove pushed operand from stack. */
19766 if (pushed)
19767 ix86_free_from_memory (GET_MODE (pushed));
19768
19769 i = emit_jump_insn (gen_rtx_SET
19770 (VOIDmode, pc_rtx,
19771 gen_rtx_IF_THEN_ELSE (VOIDmode,
19772 condition, target1, target2)));
19773 if (split_branch_probability >= 0)
19774 add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
19775 }
19776
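/* Store in the QImode register DEST the result of comparing OP0 and OP1
   with CODE, i.e. expand a setcc.  */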
19777 void
19778 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19779 {
19780 rtx ret;
19781
19782 gcc_assert (GET_MODE (dest) == QImode);
19783
19784 ret = ix86_expand_compare (code, op0, op1);
19785 PUT_MODE (ret, QImode);
19786 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19787 }
19788
19789 /* Expand a comparison setting or clearing the carry flag. Return true
19790 when successful and set *POP to the comparison operation. */
19791 static bool
19792 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19793 {
19794 enum machine_mode mode =
19795 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19796
19797 /* Do not handle double-mode compares that go through a special path. */
19798 if (mode == (TARGET_64BIT ? TImode : DImode))
19799 return false;
19800
19801 if (SCALAR_FLOAT_MODE_P (mode))
19802 {
19803 rtx compare_op, compare_seq;
19804
19805 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19806
19807 /* Shortcut: the following common codes never translate
19808 into carry flag compares. */
19809 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19810 || code == ORDERED || code == UNORDERED)
19811 return false;
19812
19813 /* These comparisons require the zero flag; swap the operands so they don't. */
19814 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19815 && !TARGET_IEEE_FP)
19816 {
19817 rtx tmp = op0;
19818 op0 = op1;
19819 op1 = tmp;
19820 code = swap_condition (code);
19821 }
19822
19823 /* Try to expand the comparison and verify that we end up with
19824 a carry flag based comparison. This fails only when we decide
19825 to expand the comparison using arithmetic, which is not a
19826 common scenario. */
19827 start_sequence ();
19828 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19829 compare_seq = get_insns ();
19830 end_sequence ();
19831
19832 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19833 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19834 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19835 else
19836 code = GET_CODE (compare_op);
19837
19838 if (code != LTU && code != GEU)
19839 return false;
19840
19841 emit_insn (compare_seq);
19842 *pop = compare_op;
19843 return true;
19844 }
19845
19846 if (!INTEGRAL_MODE_P (mode))
19847 return false;
19848
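/* The cases below rewrite the comparison so that its result lands in the
   carry flag: LTU/GEU already do, and EQ/NE, GTU/LEU and the sign tests
   are converted into LTU/GEU by adjusting the constant operand or
   swapping the operands.  */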
19849 switch (code)
19850 {
19851 case LTU:
19852 case GEU:
19853 break;
19854
19855 /* Convert a==0 into (unsigned)a<1. */
19856 case EQ:
19857 case NE:
19858 if (op1 != const0_rtx)
19859 return false;
19860 op1 = const1_rtx;
19861 code = (code == EQ ? LTU : GEU);
19862 break;
19863
19864 /* Convert a>b into b<a or a>=b+1. */
19865 case GTU:
19866 case LEU:
19867 if (CONST_INT_P (op1))
19868 {
19869 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19870 /* Bail out on overflow. We still can swap operands but that
19871 would force loading of the constant into register. */
19872 if (op1 == const0_rtx
19873 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19874 return false;
19875 code = (code == GTU ? GEU : LTU);
19876 }
19877 else
19878 {
19879 rtx tmp = op1;
19880 op1 = op0;
19881 op0 = tmp;
19882 code = (code == GTU ? LTU : GEU);
19883 }
19884 break;
19885
19886 /* Convert a>=0 into (unsigned)a<0x80000000. */
19887 case LT:
19888 case GE:
19889 if (mode == DImode || op1 != const0_rtx)
19890 return false;
19891 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19892 code = (code == LT ? GEU : LTU);
19893 break;
19894 case LE:
19895 case GT:
19896 if (mode == DImode || op1 != constm1_rtx)
19897 return false;
19898 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19899 code = (code == LE ? GEU : LTU);
19900 break;
19901
19902 default:
19903 return false;
19904 }
19905 /* Swapping operands may cause a constant to appear as the first operand. */
19906 if (!nonimmediate_operand (op0, VOIDmode))
19907 {
19908 if (!can_create_pseudo_p ())
19909 return false;
19910 op0 = force_reg (mode, op0);
19911 }
19912 *pop = ix86_expand_compare (code, op0, op1);
19913 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19914 return true;
19915 }
19916
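/* Expand an integer conditional move operands[0] = (compare) ? operands[2]
   : operands[3], where operands[1] is the comparison rtx.  When both arms
   are constants this tries branch-free sbb/setcc/lea sequences; otherwise
   it falls back to cmov, or returns false so the caller can emit branches.  */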
19917 bool
19918 ix86_expand_int_movcc (rtx operands[])
19919 {
19920 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19921 rtx compare_seq, compare_op;
19922 enum machine_mode mode = GET_MODE (operands[0]);
19923 bool sign_bit_compare_p = false;
19924 rtx op0 = XEXP (operands[1], 0);
19925 rtx op1 = XEXP (operands[1], 1);
19926
19927 if (GET_MODE (op0) == TImode
19928 || (GET_MODE (op0) == DImode
19929 && !TARGET_64BIT))
19930 return false;
19931
19932 start_sequence ();
19933 compare_op = ix86_expand_compare (code, op0, op1);
19934 compare_seq = get_insns ();
19935 end_sequence ();
19936
19937 compare_code = GET_CODE (compare_op);
19938
19939 if ((op1 == const0_rtx && (code == GE || code == LT))
19940 || (op1 == constm1_rtx && (code == GT || code == LE)))
19941 sign_bit_compare_p = true;
19942
19943 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19944 HImode insns, we'd be swallowed in word prefix ops. */
19945
19946 if ((mode != HImode || TARGET_FAST_PREFIX)
19947 && (mode != (TARGET_64BIT ? TImode : DImode))
19948 && CONST_INT_P (operands[2])
19949 && CONST_INT_P (operands[3]))
19950 {
19951 rtx out = operands[0];
19952 HOST_WIDE_INT ct = INTVAL (operands[2]);
19953 HOST_WIDE_INT cf = INTVAL (operands[3]);
19954 HOST_WIDE_INT diff;
19955
19956 diff = ct - cf;
19957 /* Sign bit compares are better done using shifts than using
19958 sbb. */
19959 if (sign_bit_compare_p
19960 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19961 {
19962 /* Detect overlap between destination and compare sources. */
19963 rtx tmp = out;
19964
19965 if (!sign_bit_compare_p)
19966 {
19967 rtx flags;
19968 bool fpcmp = false;
19969
19970 compare_code = GET_CODE (compare_op);
19971
19972 flags = XEXP (compare_op, 0);
19973
19974 if (GET_MODE (flags) == CCFPmode
19975 || GET_MODE (flags) == CCFPUmode)
19976 {
19977 fpcmp = true;
19978 compare_code
19979 = ix86_fp_compare_code_to_integer (compare_code);
19980 }
19981
19982 /* To simplify rest of code, restrict to the GEU case. */
19983 if (compare_code == LTU)
19984 {
19985 HOST_WIDE_INT tmp = ct;
19986 ct = cf;
19987 cf = tmp;
19988 compare_code = reverse_condition (compare_code);
19989 code = reverse_condition (code);
19990 }
19991 else
19992 {
19993 if (fpcmp)
19994 PUT_CODE (compare_op,
19995 reverse_condition_maybe_unordered
19996 (GET_CODE (compare_op)));
19997 else
19998 PUT_CODE (compare_op,
19999 reverse_condition (GET_CODE (compare_op)));
20000 }
20001 diff = ct - cf;
20002
20003 if (reg_overlap_mentioned_p (out, op0)
20004 || reg_overlap_mentioned_p (out, op1))
20005 tmp = gen_reg_rtx (mode);
20006
20007 if (mode == DImode)
20008 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
20009 else
20010 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
20011 flags, compare_op));
20012 }
20013 else
20014 {
20015 if (code == GT || code == GE)
20016 code = reverse_condition (code);
20017 else
20018 {
20019 HOST_WIDE_INT tmp = ct;
20020 ct = cf;
20021 cf = tmp;
20022 diff = ct - cf;
20023 }
20024 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
20025 }
20026
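/* At this point tmp holds an all-ones or all-zeros mask derived from the
   comparison (a cmp followed by sbb of a register with itself, or an
   arithmetic right shift for sign-bit compares); the operands and
   condition were adjusted above so that the cases below can turn this
   mask into the two requested constants using add/or/not/and.  */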
20027 if (diff == 1)
20028 {
20029 /*
20030 * cmpl op0,op1
20031 * sbbl dest,dest
20032 * [addl dest, ct]
20033 *
20034 * Size 5 - 8.
20035 */
20036 if (ct)
20037 tmp = expand_simple_binop (mode, PLUS,
20038 tmp, GEN_INT (ct),
20039 copy_rtx (tmp), 1, OPTAB_DIRECT);
20040 }
20041 else if (cf == -1)
20042 {
20043 /*
20044 * cmpl op0,op1
20045 * sbbl dest,dest
20046 * orl $ct, dest
20047 *
20048 * Size 8.
20049 */
20050 tmp = expand_simple_binop (mode, IOR,
20051 tmp, GEN_INT (ct),
20052 copy_rtx (tmp), 1, OPTAB_DIRECT);
20053 }
20054 else if (diff == -1 && ct)
20055 {
20056 /*
20057 * cmpl op0,op1
20058 * sbbl dest,dest
20059 * notl dest
20060 * [addl dest, cf]
20061 *
20062 * Size 8 - 11.
20063 */
20064 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
20065 if (cf)
20066 tmp = expand_simple_binop (mode, PLUS,
20067 copy_rtx (tmp), GEN_INT (cf),
20068 copy_rtx (tmp), 1, OPTAB_DIRECT);
20069 }
20070 else
20071 {
20072 /*
20073 * cmpl op0,op1
20074 * sbbl dest,dest
20075 * [notl dest]
20076 * andl cf - ct, dest
20077 * [addl dest, ct]
20078 *
20079 * Size 8 - 11.
20080 */
20081
20082 if (cf == 0)
20083 {
20084 cf = ct;
20085 ct = 0;
20086 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
20087 }
20088
20089 tmp = expand_simple_binop (mode, AND,
20090 copy_rtx (tmp),
20091 gen_int_mode (cf - ct, mode),
20092 copy_rtx (tmp), 1, OPTAB_DIRECT);
20093 if (ct)
20094 tmp = expand_simple_binop (mode, PLUS,
20095 copy_rtx (tmp), GEN_INT (ct),
20096 copy_rtx (tmp), 1, OPTAB_DIRECT);
20097 }
20098
20099 if (!rtx_equal_p (tmp, out))
20100 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
20101
20102 return true;
20103 }
20104
20105 if (diff < 0)
20106 {
20107 enum machine_mode cmp_mode = GET_MODE (op0);
20108
20109 HOST_WIDE_INT tmp;
20110 tmp = ct, ct = cf, cf = tmp;
20111 diff = -diff;
20112
20113 if (SCALAR_FLOAT_MODE_P (cmp_mode))
20114 {
20115 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
20116
20117 /* We may be reversing an unordered compare to a normal compare,
20118 which is not valid in general (we may convert a non-trapping
20119 condition to a trapping one); however, on i386 we currently
20120 emit all comparisons unordered. */
20121 compare_code = reverse_condition_maybe_unordered (compare_code);
20122 code = reverse_condition_maybe_unordered (code);
20123 }
20124 else
20125 {
20126 compare_code = reverse_condition (compare_code);
20127 code = reverse_condition (code);
20128 }
20129 }
20130
20131 compare_code = UNKNOWN;
20132 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
20133 && CONST_INT_P (op1))
20134 {
20135 if (op1 == const0_rtx
20136 && (code == LT || code == GE))
20137 compare_code = code;
20138 else if (op1 == constm1_rtx)
20139 {
20140 if (code == LE)
20141 compare_code = LT;
20142 else if (code == GT)
20143 compare_code = GE;
20144 }
20145 }
20146
20147 /* Optimize dest = (op0 < 0) ? -1 : cf. */
20148 if (compare_code != UNKNOWN
20149 && GET_MODE (op0) == GET_MODE (out)
20150 && (cf == -1 || ct == -1))
20151 {
20152 /* If lea code below could be used, only optimize
20153 if it results in a 2 insn sequence. */
20154
20155 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
20156 || diff == 3 || diff == 5 || diff == 9)
20157 || (compare_code == LT && ct == -1)
20158 || (compare_code == GE && cf == -1))
20159 {
20160 /*
20161 * notl op1 (if necessary)
20162 * sarl $31, op1
20163 * orl cf, op1
20164 */
20165 if (ct != -1)
20166 {
20167 cf = ct;
20168 ct = -1;
20169 code = reverse_condition (code);
20170 }
20171
20172 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20173
20174 out = expand_simple_binop (mode, IOR,
20175 out, GEN_INT (cf),
20176 out, 1, OPTAB_DIRECT);
20177 if (out != operands[0])
20178 emit_move_insn (operands[0], out);
20179
20180 return true;
20181 }
20182 }
20183
20184
20185 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
20186 || diff == 3 || diff == 5 || diff == 9)
20187 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
20188 && (mode != DImode
20189 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
20190 {
20191 /*
20192 * xorl dest,dest
20193 * cmpl op1,op2
20194 * setcc dest
20195 * lea cf(dest*(ct-cf)),dest
20196 *
20197 * Size 14.
20198 *
20199 * This also catches the degenerate setcc-only case.
20200 */
20201
20202 rtx tmp;
20203 int nops;
20204
20205 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20206
20207 nops = 0;
20208 /* On x86_64 the lea instruction operates on Pmode, so we need
20209 the arithmetic done in the proper mode to match. */
20210 if (diff == 1)
20211 tmp = copy_rtx (out);
20212 else
20213 {
20214 rtx out1;
20215 out1 = copy_rtx (out);
20216 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
20217 nops++;
20218 if (diff & 1)
20219 {
20220 tmp = gen_rtx_PLUS (mode, tmp, out1);
20221 nops++;
20222 }
20223 }
20224 if (cf != 0)
20225 {
20226 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
20227 nops++;
20228 }
20229 if (!rtx_equal_p (tmp, out))
20230 {
20231 if (nops == 1)
20232 out = force_operand (tmp, copy_rtx (out));
20233 else
20234 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
20235 }
20236 if (!rtx_equal_p (out, operands[0]))
20237 emit_move_insn (operands[0], copy_rtx (out));
20238
20239 return true;
20240 }
20241
20242 /*
20243 * General case: Jumpful:
20244 * xorl dest,dest cmpl op1, op2
20245 * cmpl op1, op2 movl ct, dest
20246 * setcc dest jcc 1f
20247 * decl dest movl cf, dest
20248 * andl (cf-ct),dest 1:
20249 * addl ct,dest
20250 *
20251 * Size 20. Size 14.
20252 *
20253 * This is reasonably steep, but branch mispredict costs are
20254 * high on modern cpus, so consider failing only if optimizing
20255 * for space.
20256 */
20257
20258 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20259 && BRANCH_COST (optimize_insn_for_speed_p (),
20260 false) >= 2)
20261 {
20262 if (cf == 0)
20263 {
20264 enum machine_mode cmp_mode = GET_MODE (op0);
20265
20266 cf = ct;
20267 ct = 0;
20268
20269 if (SCALAR_FLOAT_MODE_P (cmp_mode))
20270 {
20271 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
20272
20273 /* We may be reversing an unordered compare to a normal compare,
20274 which is not valid in general (we may convert a non-trapping
20275 condition to a trapping one); however, on i386 we currently
20276 emit all comparisons unordered. */
20277 code = reverse_condition_maybe_unordered (code);
20278 }
20279 else
20280 {
20281 code = reverse_condition (code);
20282 if (compare_code != UNKNOWN)
20283 compare_code = reverse_condition (compare_code);
20284 }
20285 }
20286
20287 if (compare_code != UNKNOWN)
20288 {
20289 /* notl op1 (if needed)
20290 sarl $31, op1
20291 andl (cf-ct), op1
20292 addl ct, op1
20293
20294 For x < 0 (resp. x <= -1) there will be no notl,
20295 so if possible swap the constants to get rid of the
20296 complement.
20297 True/false will be -1/0 while code below (store flag
20298 followed by decrement) is 0/-1, so the constants need
20299 to be exchanged once more. */
20300
20301 if (compare_code == GE || !cf)
20302 {
20303 code = reverse_condition (code);
20304 compare_code = LT;
20305 }
20306 else
20307 {
20308 HOST_WIDE_INT tmp = cf;
20309 cf = ct;
20310 ct = tmp;
20311 }
20312
20313 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20314 }
20315 else
20316 {
20317 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20318
20319 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
20320 constm1_rtx,
20321 copy_rtx (out), 1, OPTAB_DIRECT);
20322 }
20323
20324 out = expand_simple_binop (mode, AND, copy_rtx (out),
20325 gen_int_mode (cf - ct, mode),
20326 copy_rtx (out), 1, OPTAB_DIRECT);
20327 if (ct)
20328 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
20329 copy_rtx (out), 1, OPTAB_DIRECT);
20330 if (!rtx_equal_p (out, operands[0]))
20331 emit_move_insn (operands[0], copy_rtx (out));
20332
20333 return true;
20334 }
20335 }
20336
20337 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20338 {
20339 /* Try a few things more with specific constants and a variable. */
20340
20341 optab op;
20342 rtx var, orig_out, out, tmp;
20343
20344 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
20345 return false;
20346
20347 /* If one of the two operands is an interesting constant, load a
20348 constant with the above and mask it in with a logical operation. */
20349
20350 if (CONST_INT_P (operands[2]))
20351 {
20352 var = operands[3];
20353 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
20354 operands[3] = constm1_rtx, op = and_optab;
20355 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
20356 operands[3] = const0_rtx, op = ior_optab;
20357 else
20358 return false;
20359 }
20360 else if (CONST_INT_P (operands[3]))
20361 {
20362 var = operands[2];
20363 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
20364 operands[2] = constm1_rtx, op = and_optab;
20365 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
20366 operands[2] = const0_rtx, op = ior_optab;
20367 else
20368 return false;
20369 }
20370 else
20371 return false;
20372
20373 orig_out = operands[0];
20374 tmp = gen_reg_rtx (mode);
20375 operands[0] = tmp;
20376
20377 /* Recurse to get the constant loaded. */
20378 if (ix86_expand_int_movcc (operands) == 0)
20379 return false;
20380
20381 /* Mask in the interesting variable. */
20382 out = expand_binop (mode, op, var, tmp, orig_out, 0,
20383 OPTAB_WIDEN);
20384 if (!rtx_equal_p (out, orig_out))
20385 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
20386
20387 return true;
20388 }
20389
20390 /*
20391 * For comparison with above,
20392 *
20393 * movl cf,dest
20394 * movl ct,tmp
20395 * cmpl op1,op2
20396 * cmovcc tmp,dest
20397 *
20398 * Size 15.
20399 */
20400
20401 if (! nonimmediate_operand (operands[2], mode))
20402 operands[2] = force_reg (mode, operands[2]);
20403 if (! nonimmediate_operand (operands[3], mode))
20404 operands[3] = force_reg (mode, operands[3]);
20405
20406 if (! register_operand (operands[2], VOIDmode)
20407 && (mode == QImode
20408 || ! register_operand (operands[3], VOIDmode)))
20409 operands[2] = force_reg (mode, operands[2]);
20410
20411 if (mode == QImode
20412 && ! register_operand (operands[3], VOIDmode))
20413 operands[3] = force_reg (mode, operands[3]);
20414
20415 emit_insn (compare_seq);
20416 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20417 gen_rtx_IF_THEN_ELSE (mode,
20418 compare_op, operands[2],
20419 operands[3])));
20420 return true;
20421 }
20422
20423 /* Swap, force into registers, or otherwise massage the two operands
20424 to an sse comparison with a mask result. Thus we differ a bit from
20425 ix86_prepare_fp_compare_args which expects to produce a flags result.
20426
20427 The DEST operand exists to help determine whether to commute commutative
20428 operators. The POP0/POP1 operands are updated in place. The new
20429 comparison code is returned, or UNKNOWN if not implementable. */
20430
20431 static enum rtx_code
20432 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
20433 rtx *pop0, rtx *pop1)
20434 {
20435 rtx tmp;
20436
20437 switch (code)
20438 {
20439 case LTGT:
20440 case UNEQ:
20441 /* AVX supports all the needed comparisons. */
20442 if (TARGET_AVX)
20443 break;
20444 /* We have no LTGT as an operator. We could implement it with
20445 NE & ORDERED, but this requires an extra temporary. It's
20446 not clear that it's worth it. */
20447 return UNKNOWN;
20448
20449 case LT:
20450 case LE:
20451 case UNGT:
20452 case UNGE:
20453 /* These are supported directly. */
20454 break;
20455
20456 case EQ:
20457 case NE:
20458 case UNORDERED:
20459 case ORDERED:
20460 /* AVX has 3 operand comparisons, no need to swap anything. */
20461 if (TARGET_AVX)
20462 break;
20463 /* For commutative operators, try to canonicalize the destination
20464 operand to be first in the comparison - this helps reload to
20465 avoid extra moves. */
20466 if (!dest || !rtx_equal_p (dest, *pop1))
20467 break;
20468 /* FALLTHRU */
20469
20470 case GE:
20471 case GT:
20472 case UNLE:
20473 case UNLT:
20474 /* These are not supported directly before AVX, and furthermore
20475 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
20476 comparison operands to transform into something that is
20477 supported. */
20478 tmp = *pop0;
20479 *pop0 = *pop1;
20480 *pop1 = tmp;
20481 code = swap_condition (code);
20482 break;
20483
20484 default:
20485 gcc_unreachable ();
20486 }
20487
20488 return code;
20489 }
20490
20491 /* Detect conditional moves that exactly match min/max operational
20492 semantics. Note that this is IEEE safe, as long as we don't
20493 interchange the operands.
20494
20495 Returns FALSE if this conditional move doesn't match a MIN/MAX,
20496 and TRUE if the operation is successful and instructions are emitted. */
20497
20498 static bool
20499 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
20500 rtx cmp_op1, rtx if_true, rtx if_false)
20501 {
20502 enum machine_mode mode;
20503 bool is_min;
20504 rtx tmp;
20505
20506 if (code == LT)
20507 ;
20508 else if (code == UNGE)
20509 {
20510 tmp = if_true;
20511 if_true = if_false;
20512 if_false = tmp;
20513 }
20514 else
20515 return false;
20516
20517 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
20518 is_min = true;
20519 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
20520 is_min = false;
20521 else
20522 return false;
20523
20524 mode = GET_MODE (dest);
20525
20526 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
20527 but MODE may be a vector mode and thus not appropriate. */
20528 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
20529 {
20530 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
20531 rtvec v;
20532
20533 if_true = force_reg (mode, if_true);
20534 v = gen_rtvec (2, if_true, if_false);
20535 tmp = gen_rtx_UNSPEC (mode, v, u);
20536 }
20537 else
20538 {
20539 code = is_min ? SMIN : SMAX;
20540 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
20541 }
20542
20543 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
20544 return true;
20545 }
20546
20547 /* Expand an sse vector comparison. Return the register with the result. */
20548
20549 static rtx
20550 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
20551 rtx op_true, rtx op_false)
20552 {
20553 enum machine_mode mode = GET_MODE (dest);
20554 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
20555 rtx x;
20556
20557 cmp_op0 = force_reg (cmp_mode, cmp_op0);
20558 if (!nonimmediate_operand (cmp_op1, cmp_mode))
20559 cmp_op1 = force_reg (cmp_mode, cmp_op1);
20560
20561 if (optimize
20562 || reg_overlap_mentioned_p (dest, op_true)
20563 || reg_overlap_mentioned_p (dest, op_false))
20564 dest = gen_reg_rtx (mode);
20565
20566 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
20567 if (cmp_mode != mode)
20568 {
20569 x = force_reg (cmp_mode, x);
20570 convert_move (dest, x, false);
20571 }
20572 else
20573 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20574
20575 return dest;
20576 }
20577
20578 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
20579 operations. This is used for both scalar and vector conditional moves. */
20580
20581 static void
20582 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
20583 {
20584 enum machine_mode mode = GET_MODE (dest);
20585 rtx t2, t3, x;
20586
20587 if (vector_all_ones_operand (op_true, mode)
20588 && rtx_equal_p (op_false, CONST0_RTX (mode)))
20589 {
20590 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
20591 }
20592 else if (op_false == CONST0_RTX (mode))
20593 {
20594 op_true = force_reg (mode, op_true);
20595 x = gen_rtx_AND (mode, cmp, op_true);
20596 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20597 }
20598 else if (op_true == CONST0_RTX (mode))
20599 {
20600 op_false = force_reg (mode, op_false);
20601 x = gen_rtx_NOT (mode, cmp);
20602 x = gen_rtx_AND (mode, x, op_false);
20603 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20604 }
20605 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
20606 {
20607 op_false = force_reg (mode, op_false);
20608 x = gen_rtx_IOR (mode, cmp, op_false);
20609 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20610 }
20611 else if (TARGET_XOP)
20612 {
20613 op_true = force_reg (mode, op_true);
20614
20615 if (!nonimmediate_operand (op_false, mode))
20616 op_false = force_reg (mode, op_false);
20617
20618 emit_insn (gen_rtx_SET (mode, dest,
20619 gen_rtx_IF_THEN_ELSE (mode, cmp,
20620 op_true,
20621 op_false)));
20622 }
20623 else
20624 {
20625 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
20626 rtx d = dest;
20627
20628 if (!nonimmediate_operand (op_true, mode))
20629 op_true = force_reg (mode, op_true);
20630
20631 op_false = force_reg (mode, op_false);
20632
20633 switch (mode)
20634 {
20635 case V4SFmode:
20636 if (TARGET_SSE4_1)
20637 gen = gen_sse4_1_blendvps;
20638 break;
20639 case V2DFmode:
20640 if (TARGET_SSE4_1)
20641 gen = gen_sse4_1_blendvpd;
20642 break;
20643 case V16QImode:
20644 case V8HImode:
20645 case V4SImode:
20646 case V2DImode:
20647 if (TARGET_SSE4_1)
20648 {
20649 gen = gen_sse4_1_pblendvb;
20650 if (mode != V16QImode)
20651 d = gen_reg_rtx (V16QImode);
20652 op_false = gen_lowpart (V16QImode, op_false);
20653 op_true = gen_lowpart (V16QImode, op_true);
20654 cmp = gen_lowpart (V16QImode, cmp);
20655 }
20656 break;
20657 case V8SFmode:
20658 if (TARGET_AVX)
20659 gen = gen_avx_blendvps256;
20660 break;
20661 case V4DFmode:
20662 if (TARGET_AVX)
20663 gen = gen_avx_blendvpd256;
20664 break;
20665 case V32QImode:
20666 case V16HImode:
20667 case V8SImode:
20668 case V4DImode:
20669 if (TARGET_AVX2)
20670 {
20671 gen = gen_avx2_pblendvb;
20672 if (mode != V32QImode)
20673 d = gen_reg_rtx (V32QImode);
20674 op_false = gen_lowpart (V32QImode, op_false);
20675 op_true = gen_lowpart (V32QImode, op_true);
20676 cmp = gen_lowpart (V32QImode, cmp);
20677 }
20678 break;
20679 default:
20680 break;
20681 }
20682
20683 if (gen != NULL)
20684 {
20685 emit_insn (gen (d, op_false, op_true, cmp));
20686 if (d != dest)
20687 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
20688 }
20689 else
20690 {
20691 op_true = force_reg (mode, op_true);
20692
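/* No blend instruction is available here, so compute
   dest = (op_true & cmp) | (op_false & ~cmp) with three logic ops.  */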
20693 t2 = gen_reg_rtx (mode);
20694 if (optimize)
20695 t3 = gen_reg_rtx (mode);
20696 else
20697 t3 = dest;
20698
20699 x = gen_rtx_AND (mode, op_true, cmp);
20700 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20701
20702 x = gen_rtx_NOT (mode, cmp);
20703 x = gen_rtx_AND (mode, x, op_false);
20704 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20705
20706 x = gen_rtx_IOR (mode, t3, t2);
20707 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20708 }
20709 }
20710 }
20711
20712 /* Expand a floating-point conditional move. Return true if successful. */
20713
20714 bool
20715 ix86_expand_fp_movcc (rtx operands[])
20716 {
20717 enum machine_mode mode = GET_MODE (operands[0]);
20718 enum rtx_code code = GET_CODE (operands[1]);
20719 rtx tmp, compare_op;
20720 rtx op0 = XEXP (operands[1], 0);
20721 rtx op1 = XEXP (operands[1], 1);
20722
20723 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20724 {
20725 enum machine_mode cmode;
20726
20727 /* Since we've no cmove for sse registers, don't force bad register
20728 allocation just to gain access to it. Deny movcc when the
20729 comparison mode doesn't match the move mode. */
20730 cmode = GET_MODE (op0);
20731 if (cmode == VOIDmode)
20732 cmode = GET_MODE (op1);
20733 if (cmode != mode)
20734 return false;
20735
20736 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20737 if (code == UNKNOWN)
20738 return false;
20739
20740 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20741 operands[2], operands[3]))
20742 return true;
20743
20744 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20745 operands[2], operands[3]);
20746 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20747 return true;
20748 }
20749
20750 if (GET_MODE (op0) == TImode
20751 || (GET_MODE (op0) == DImode
20752 && !TARGET_64BIT))
20753 return false;
20754
20755 /* The floating point conditional move instructions don't directly
20756 support conditions resulting from a signed integer comparison. */
20757
20758 compare_op = ix86_expand_compare (code, op0, op1);
20759 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20760 {
20761 tmp = gen_reg_rtx (QImode);
20762 ix86_expand_setcc (tmp, code, op0, op1);
20763
20764 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20765 }
20766
20767 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20768 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20769 operands[2], operands[3])));
20770
20771 return true;
20772 }
20773
20774 /* Expand a floating-point vector conditional move; a vcond operation
20775 rather than a movcc operation. */
20776
20777 bool
20778 ix86_expand_fp_vcond (rtx operands[])
20779 {
20780 enum rtx_code code = GET_CODE (operands[3]);
20781 rtx cmp;
20782
20783 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20784 &operands[4], &operands[5]);
20785 if (code == UNKNOWN)
20786 {
20787 rtx temp;
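/* LTGT and UNEQ have no direct SSE comparison before AVX; emulate them
   as ORDERED & NE and UNORDERED | EQ respectively, combining two
   compare results with a vector AND or IOR.  */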
20788 switch (GET_CODE (operands[3]))
20789 {
20790 case LTGT:
20791 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20792 operands[5], operands[0], operands[0]);
20793 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20794 operands[5], operands[1], operands[2]);
20795 code = AND;
20796 break;
20797 case UNEQ:
20798 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20799 operands[5], operands[0], operands[0]);
20800 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20801 operands[5], operands[1], operands[2]);
20802 code = IOR;
20803 break;
20804 default:
20805 gcc_unreachable ();
20806 }
20807 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20808 OPTAB_DIRECT);
20809 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20810 return true;
20811 }
20812
20813 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20814 operands[5], operands[1], operands[2]))
20815 return true;
20816
20817 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20818 operands[1], operands[2]);
20819 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20820 return true;
20821 }
20822
20823 /* Expand a signed/unsigned integral vector conditional move. */
20824
20825 bool
20826 ix86_expand_int_vcond (rtx operands[])
20827 {
20828 enum machine_mode data_mode = GET_MODE (operands[0]);
20829 enum machine_mode mode = GET_MODE (operands[4]);
20830 enum rtx_code code = GET_CODE (operands[3]);
20831 bool negate = false;
20832 rtx x, cop0, cop1;
20833
20834 cop0 = operands[4];
20835 cop1 = operands[5];
20836
20837 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20838 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20839 if ((code == LT || code == GE)
20840 && data_mode == mode
20841 && cop1 == CONST0_RTX (mode)
20842 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20843 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20844 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20845 && (GET_MODE_SIZE (data_mode) == 16
20846 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20847 {
20848 rtx negop = operands[2 - (code == LT)];
20849 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20850 if (negop == CONST1_RTX (data_mode))
20851 {
20852 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20853 operands[0], 1, OPTAB_DIRECT);
20854 if (res != operands[0])
20855 emit_move_insn (operands[0], res);
20856 return true;
20857 }
20858 else if (GET_MODE_INNER (data_mode) != DImode
20859 && vector_all_ones_operand (negop, data_mode))
20860 {
20861 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20862 operands[0], 0, OPTAB_DIRECT);
20863 if (res != operands[0])
20864 emit_move_insn (operands[0], res);
20865 return true;
20866 }
20867 }
20868
20869 if (!nonimmediate_operand (cop1, mode))
20870 cop1 = force_reg (mode, cop1);
20871 if (!general_operand (operands[1], data_mode))
20872 operands[1] = force_reg (data_mode, operands[1]);
20873 if (!general_operand (operands[2], data_mode))
20874 operands[2] = force_reg (data_mode, operands[2]);
20875
20876 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20877 if (TARGET_XOP
20878 && (mode == V16QImode || mode == V8HImode
20879 || mode == V4SImode || mode == V2DImode))
20880 ;
20881 else
20882 {
20883 /* Canonicalize the comparison to EQ, GT, GTU. */
20884 switch (code)
20885 {
20886 case EQ:
20887 case GT:
20888 case GTU:
20889 break;
20890
20891 case NE:
20892 case LE:
20893 case LEU:
20894 code = reverse_condition (code);
20895 negate = true;
20896 break;
20897
20898 case GE:
20899 case GEU:
20900 code = reverse_condition (code);
20901 negate = true;
20902 /* FALLTHRU */
20903
20904 case LT:
20905 case LTU:
20906 code = swap_condition (code);
20907 x = cop0, cop0 = cop1, cop1 = x;
20908 break;
20909
20910 default:
20911 gcc_unreachable ();
20912 }
20913
20914 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20915 if (mode == V2DImode)
20916 {
20917 switch (code)
20918 {
20919 case EQ:
20920 /* SSE4.1 supports EQ. */
20921 if (!TARGET_SSE4_1)
20922 return false;
20923 break;
20924
20925 case GT:
20926 case GTU:
20927 /* SSE4.2 supports GT/GTU. */
20928 if (!TARGET_SSE4_2)
20929 return false;
20930 break;
20931
20932 default:
20933 gcc_unreachable ();
20934 }
20935 }
20936
20937 /* Unsigned parallel compare is not supported by the hardware.
20938 Play some tricks to turn this into a signed comparison
20939 against 0. */
20940 if (code == GTU)
20941 {
20942 cop0 = force_reg (mode, cop0);
20943
20944 switch (mode)
20945 {
20946 case V8SImode:
20947 case V4DImode:
20948 case V4SImode:
20949 case V2DImode:
20950 {
20951 rtx t1, t2, mask;
20952 rtx (*gen_sub3) (rtx, rtx, rtx);
20953
20954 switch (mode)
20955 {
20956 case V8SImode: gen_sub3 = gen_subv8si3; break;
20957 case V4DImode: gen_sub3 = gen_subv4di3; break;
20958 case V4SImode: gen_sub3 = gen_subv4si3; break;
20959 case V2DImode: gen_sub3 = gen_subv2di3; break;
20960 default:
20961 gcc_unreachable ();
20962 }
20963 /* Subtract (-(INT MAX) - 1) from both operands to make
20964 them signed. */
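/* This works because flipping the sign bit of both operands (equivalent
   to subtracting the bias) preserves their ordering while converting an
   unsigned comparison into a signed one: x >u y iff
   (x ^ 0x80..0) >s (y ^ 0x80..0).  */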
20965 mask = ix86_build_signbit_mask (mode, true, false);
20966 t1 = gen_reg_rtx (mode);
20967 emit_insn (gen_sub3 (t1, cop0, mask));
20968
20969 t2 = gen_reg_rtx (mode);
20970 emit_insn (gen_sub3 (t2, cop1, mask));
20971
20972 cop0 = t1;
20973 cop1 = t2;
20974 code = GT;
20975 }
20976 break;
20977
20978 case V32QImode:
20979 case V16HImode:
20980 case V16QImode:
20981 case V8HImode:
20982 /* Perform a parallel unsigned saturating subtraction. */
20983 x = gen_reg_rtx (mode);
20984 emit_insn (gen_rtx_SET (VOIDmode, x,
20985 gen_rtx_US_MINUS (mode, cop0, cop1)));
20986
20987 cop0 = x;
20988 cop1 = CONST0_RTX (mode);
20989 code = EQ;
20990 negate = !negate;
20991 break;
20992
20993 default:
20994 gcc_unreachable ();
20995 }
20996 }
20997 }
20998
20999 /* Allow the comparison to be done in one mode, but the movcc to
21000 happen in another mode. */
21001 if (data_mode == mode)
21002 {
21003 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
21004 operands[1+negate], operands[2-negate]);
21005 }
21006 else
21007 {
21008 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
21009 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
21010 operands[1+negate], operands[2-negate]);
21011 x = gen_lowpart (data_mode, x);
21012 }
21013
21014 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
21015 operands[2-negate]);
21016 return true;
21017 }
21018
21019 /* Expand a variable vector permutation. */
21020
21021 void
21022 ix86_expand_vec_perm (rtx operands[])
21023 {
21024 rtx target = operands[0];
21025 rtx op0 = operands[1];
21026 rtx op1 = operands[2];
21027 rtx mask = operands[3];
21028 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
21029 enum machine_mode mode = GET_MODE (op0);
21030 enum machine_mode maskmode = GET_MODE (mask);
21031 int w, e, i;
21032 bool one_operand_shuffle = rtx_equal_p (op0, op1);
21033
21034 /* Number of elements in the vector. */
21035 w = GET_MODE_NUNITS (mode);
21036 e = GET_MODE_UNIT_SIZE (mode);
21037 gcc_assert (w <= 32);
21038
21039 if (TARGET_AVX2)
21040 {
21041 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
21042 {
21043 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
21044 a constant shuffle operand. With a tiny bit of effort we can
21045 use VPERMD instead. A re-interpretation stall for V4DFmode is
21046 unfortunate but there's no avoiding it.
21047 Similarly, for V16HImode we don't have instructions for variable
21048 shuffling, while for V32QImode we can, after preparing suitable
21049 masks, use vpshufb; vpshufb; vpermq; vpor. */
21050
21051 if (mode == V16HImode)
21052 {
21053 maskmode = mode = V32QImode;
21054 w = 32;
21055 e = 1;
21056 }
21057 else
21058 {
21059 maskmode = mode = V8SImode;
21060 w = 8;
21061 e = 4;
21062 }
21063 t1 = gen_reg_rtx (maskmode);
21064
21065 /* Replicate the low bits of the V4DImode mask into V8SImode:
21066 mask = { A B C D }
21067 t1 = { A A B B C C D D }. */
21068 for (i = 0; i < w / 2; ++i)
21069 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
21070 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21071 vt = force_reg (maskmode, vt);
21072 mask = gen_lowpart (maskmode, mask);
21073 if (maskmode == V8SImode)
21074 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
21075 else
21076 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
21077
21078 /* Multiply the shuffle indices by two. */
21079 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
21080 OPTAB_DIRECT);
21081
21082 /* Add one to the odd shuffle indices:
21083 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
21084 for (i = 0; i < w / 2; ++i)
21085 {
21086 vec[i * 2] = const0_rtx;
21087 vec[i * 2 + 1] = const1_rtx;
21088 }
21089 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21090 vt = validize_mem (force_const_mem (maskmode, vt));
21091 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
21092 OPTAB_DIRECT);
21093
21094 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
21095 operands[3] = mask = t1;
21096 target = gen_reg_rtx (mode);
21097 op0 = gen_lowpart (mode, op0);
21098 op1 = gen_lowpart (mode, op1);
21099 }
21100
21101 switch (mode)
21102 {
21103 case V8SImode:
21104 /* The VPERMD and VPERMPS instructions already properly ignore
21105 the high bits of the shuffle elements. No need for us to
21106 perform an AND ourselves. */
21107 if (one_operand_shuffle)
21108 {
21109 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
21110 if (target != operands[0])
21111 emit_move_insn (operands[0],
21112 gen_lowpart (GET_MODE (operands[0]), target));
21113 }
21114 else
21115 {
21116 t1 = gen_reg_rtx (V8SImode);
21117 t2 = gen_reg_rtx (V8SImode);
21118 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
21119 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
21120 goto merge_two;
21121 }
21122 return;
21123
21124 case V8SFmode:
21125 mask = gen_lowpart (V8SFmode, mask);
21126 if (one_operand_shuffle)
21127 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
21128 else
21129 {
21130 t1 = gen_reg_rtx (V8SFmode);
21131 t2 = gen_reg_rtx (V8SFmode);
21132 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
21133 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
21134 goto merge_two;
21135 }
21136 return;
21137
21138 case V4SImode:
21139 /* By combining the two 128-bit input vectors into one 256-bit
21140 input vector, we can use VPERMD and VPERMPS for the full
21141 two-operand shuffle. */
21142 t1 = gen_reg_rtx (V8SImode);
21143 t2 = gen_reg_rtx (V8SImode);
21144 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
21145 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
21146 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
21147 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
21148 return;
21149
21150 case V4SFmode:
21151 t1 = gen_reg_rtx (V8SFmode);
21152 t2 = gen_reg_rtx (V8SImode);
21153 mask = gen_lowpart (V4SImode, mask);
21154 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
21155 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
21156 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
21157 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
21158 return;
21159
21160 case V32QImode:
21161 t1 = gen_reg_rtx (V32QImode);
21162 t2 = gen_reg_rtx (V32QImode);
21163 t3 = gen_reg_rtx (V32QImode);
21164 vt2 = GEN_INT (128);
21165 for (i = 0; i < 32; i++)
21166 vec[i] = vt2;
21167 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
21168 vt = force_reg (V32QImode, vt);
21169 for (i = 0; i < 32; i++)
21170 vec[i] = i < 16 ? vt2 : const0_rtx;
21171 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
21172 vt2 = force_reg (V32QImode, vt2);
21173 /* From mask create two adjusted masks, which contain the same
21174 bits as mask in the low 7 bits of each vector element.
21175 The first mask will have the most significant bit clear
21176 if it requests element from the same 128-bit lane
21177 and MSB set if it requests element from the other 128-bit lane.
21178 The second mask will have the opposite values of the MSB,
21179 and additionally will have its 128-bit lanes swapped.
21180 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
21181 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
21182 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
21183 stands for the other 12 bytes. */
21184 /* The bit telling whether an element comes from the same lane or
21185 the other lane is bit 4, so shift it up by 3 to the MSB position. */
21186 t5 = gen_reg_rtx (V4DImode);
21187 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
21188 GEN_INT (3)));
21189 /* Clear MSB bits from the mask just in case it had them set. */
21190 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
21191 /* After this t1 will have MSB set for elements from other lane. */
21192 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
21193 /* Clear bits other than MSB. */
21194 emit_insn (gen_andv32qi3 (t1, t1, vt));
21195 /* Or in the lower bits from mask into t3. */
21196 emit_insn (gen_iorv32qi3 (t3, t1, t2));
21197 /* And invert MSB bits in t1, so MSB is set for elements from the same
21198 lane. */
21199 emit_insn (gen_xorv32qi3 (t1, t1, vt));
21200 /* Swap 128-bit lanes in t3. */
21201 t6 = gen_reg_rtx (V4DImode);
21202 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
21203 const2_rtx, GEN_INT (3),
21204 const0_rtx, const1_rtx));
21205 /* And or in the lower bits from mask into t1. */
21206 emit_insn (gen_iorv32qi3 (t1, t1, t2));
21207 if (one_operand_shuffle)
21208 {
21209 /* Each of these shuffles will put 0s in places where
21210 element from the other 128-bit lane is needed, otherwise
21211 will shuffle in the requested value. */
21212 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
21213 gen_lowpart (V32QImode, t6)));
21214 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
21215 /* For t3 the 128-bit lanes are swapped again. */
21216 t7 = gen_reg_rtx (V4DImode);
21217 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
21218 const2_rtx, GEN_INT (3),
21219 const0_rtx, const1_rtx));
21220 /* And oring both together leads to the result. */
21221 emit_insn (gen_iorv32qi3 (target, t1,
21222 gen_lowpart (V32QImode, t7)));
21223 if (target != operands[0])
21224 emit_move_insn (operands[0],
21225 gen_lowpart (GET_MODE (operands[0]), target));
21226 return;
21227 }
21228
21229 t4 = gen_reg_rtx (V32QImode);
21230 /* Similar to the one_operand_shuffle code above, just repeated
21231 twice, once for each operand. The merge_two: code will merge
21232 the two results together. */
21233 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
21234 gen_lowpart (V32QImode, t6)));
21235 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
21236 gen_lowpart (V32QImode, t6)));
21237 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
21238 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
21239 t7 = gen_reg_rtx (V4DImode);
21240 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
21241 const2_rtx, GEN_INT (3),
21242 const0_rtx, const1_rtx));
21243 t8 = gen_reg_rtx (V4DImode);
21244 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
21245 const2_rtx, GEN_INT (3),
21246 const0_rtx, const1_rtx));
21247 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
21248 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
21249 t1 = t4;
21250 t2 = t3;
21251 goto merge_two;
21252
21253 default:
21254 gcc_assert (GET_MODE_SIZE (mode) <= 16);
21255 break;
21256 }
21257 }
21258
21259 if (TARGET_XOP)
21260 {
21261 /* The XOP VPPERM insn supports three inputs. By ignoring the
21262 one_operand_shuffle special case, we avoid creating another
21263 set of constant vectors in memory. */
21264 one_operand_shuffle = false;
21265
21266 /* mask = mask & {2*w-1, ...} */
21267 vt = GEN_INT (2*w - 1);
21268 }
21269 else
21270 {
21271 /* mask = mask & {w-1, ...} */
21272 vt = GEN_INT (w - 1);
21273 }
21274
21275 for (i = 0; i < w; i++)
21276 vec[i] = vt;
21277 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21278 mask = expand_simple_binop (maskmode, AND, mask, vt,
21279 NULL_RTX, 0, OPTAB_DIRECT);
21280
21281 /* For non-QImode operations, convert the word permutation control
21282 into a byte permutation control. */
21283 if (mode != V16QImode)
21284 {
21285 mask = expand_simple_binop (maskmode, ASHIFT, mask,
21286 GEN_INT (exact_log2 (e)),
21287 NULL_RTX, 0, OPTAB_DIRECT);
21288
21289 /* Convert mask to vector of chars. */
21290 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
21291
21292 /* Replicate each of the input bytes into byte positions:
21293 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
21294 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
21295 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
21296 for (i = 0; i < 16; ++i)
21297 vec[i] = GEN_INT (i/e * e);
21298 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21299 vt = validize_mem (force_const_mem (V16QImode, vt));
21300 if (TARGET_XOP)
21301 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
21302 else
21303 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
21304
21305 /* Convert it into the byte positions by doing
21306 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
21307 for (i = 0; i < 16; ++i)
21308 vec[i] = GEN_INT (i % e);
21309 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21310 vt = validize_mem (force_const_mem (V16QImode, vt));
21311 emit_insn (gen_addv16qi3 (mask, mask, vt));
21312 }
21313
21314 /* The actual shuffle operations all operate on V16QImode. */
21315 op0 = gen_lowpart (V16QImode, op0);
21316 op1 = gen_lowpart (V16QImode, op1);
21317
21318 if (TARGET_XOP)
21319 {
21320 if (GET_MODE (target) != V16QImode)
21321 target = gen_reg_rtx (V16QImode);
21322 emit_insn (gen_xop_pperm (target, op0, op1, mask));
21323 if (target != operands[0])
21324 emit_move_insn (operands[0],
21325 gen_lowpart (GET_MODE (operands[0]), target));
21326 }
21327 else if (one_operand_shuffle)
21328 {
21329 if (GET_MODE (target) != V16QImode)
21330 target = gen_reg_rtx (V16QImode);
21331 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
21332 if (target != operands[0])
21333 emit_move_insn (operands[0],
21334 gen_lowpart (GET_MODE (operands[0]), target));
21335 }
21336 else
21337 {
21338 rtx xops[6];
21339 bool ok;
21340
21341 /* Shuffle the two input vectors independently. */
21342 t1 = gen_reg_rtx (V16QImode);
21343 t2 = gen_reg_rtx (V16QImode);
21344 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
21345 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
21346
21347 merge_two:
21348 /* Then merge them together. The key is whether any given control
21349 element contained a bit set that indicates the second word. */
21350 mask = operands[3];
21351 vt = GEN_INT (w);
21352 if (maskmode == V2DImode && !TARGET_SSE4_1)
21353 {
21354 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
21355 more shuffle to convert the V2DI input mask into a V4SI
21356 input mask, at which point the masking that expand_int_vcond
21357 does will work as desired. */
21358 rtx t3 = gen_reg_rtx (V4SImode);
21359 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
21360 const0_rtx, const0_rtx,
21361 const2_rtx, const2_rtx));
21362 mask = t3;
21363 maskmode = V4SImode;
21364 e = w = 4;
21365 }
21366
21367 for (i = 0; i < w; i++)
21368 vec[i] = vt;
21369 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21370 vt = force_reg (maskmode, vt);
21371 mask = expand_simple_binop (maskmode, AND, mask, vt,
21372 NULL_RTX, 0, OPTAB_DIRECT);
21373
21374 if (GET_MODE (target) != mode)
21375 target = gen_reg_rtx (mode);
21376 xops[0] = target;
21377 xops[1] = gen_lowpart (mode, t2);
21378 xops[2] = gen_lowpart (mode, t1);
21379 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
21380 xops[4] = mask;
21381 xops[5] = vt;
21382 ok = ix86_expand_int_vcond (xops);
21383 gcc_assert (ok);
21384 if (target != operands[0])
21385 emit_move_insn (operands[0],
21386 gen_lowpart (GET_MODE (operands[0]), target));
21387 }
21388 }
21389
21390 /* Unpack SRC into DEST, the next wider integer vector type. UNSIGNED_P
21391 is true if we should do zero extension, else sign extension. HIGH_P is
21392 true if we want the N/2 high elements, else the low elements. */
21393
21394 void
21395 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
21396 {
21397 enum machine_mode imode = GET_MODE (src);
21398 rtx tmp;
21399
21400 if (TARGET_SSE4_1)
21401 {
21402 rtx (*unpack)(rtx, rtx);
21403 rtx (*extract)(rtx, rtx) = NULL;
21404 enum machine_mode halfmode = BLKmode;
21405
21406 switch (imode)
21407 {
21408 case V32QImode:
21409 if (unsigned_p)
21410 unpack = gen_avx2_zero_extendv16qiv16hi2;
21411 else
21412 unpack = gen_avx2_sign_extendv16qiv16hi2;
21413 halfmode = V16QImode;
21414 extract
21415 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
21416 break;
21417 case V16HImode:
21418 if (unsigned_p)
21419 unpack = gen_avx2_zero_extendv8hiv8si2;
21420 else
21421 unpack = gen_avx2_sign_extendv8hiv8si2;
21422 halfmode = V8HImode;
21423 extract
21424 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
21425 break;
21426 case V8SImode:
21427 if (unsigned_p)
21428 unpack = gen_avx2_zero_extendv4siv4di2;
21429 else
21430 unpack = gen_avx2_sign_extendv4siv4di2;
21431 halfmode = V4SImode;
21432 extract
21433 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
21434 break;
21435 case V16QImode:
21436 if (unsigned_p)
21437 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
21438 else
21439 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
21440 break;
21441 case V8HImode:
21442 if (unsigned_p)
21443 unpack = gen_sse4_1_zero_extendv4hiv4si2;
21444 else
21445 unpack = gen_sse4_1_sign_extendv4hiv4si2;
21446 break;
21447 case V4SImode:
21448 if (unsigned_p)
21449 unpack = gen_sse4_1_zero_extendv2siv2di2;
21450 else
21451 unpack = gen_sse4_1_sign_extendv2siv2di2;
21452 break;
21453 default:
21454 gcc_unreachable ();
21455 }
21456
21457 if (GET_MODE_SIZE (imode) == 32)
21458 {
21459 tmp = gen_reg_rtx (halfmode);
21460 emit_insn (extract (tmp, src));
21461 }
21462 else if (high_p)
21463 {
21464 /* Shift higher 8 bytes to lower 8 bytes. */
21465 tmp = gen_reg_rtx (V1TImode);
21466 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
21467 GEN_INT (64)));
21468 tmp = gen_lowpart (imode, tmp);
21469 }
21470 else
21471 tmp = src;
21472
21473 emit_insn (unpack (dest, tmp));
21474 }
21475 else
21476 {
21477 rtx (*unpack)(rtx, rtx, rtx);
21478
21479 switch (imode)
21480 {
21481 case V16QImode:
21482 if (high_p)
21483 unpack = gen_vec_interleave_highv16qi;
21484 else
21485 unpack = gen_vec_interleave_lowv16qi;
21486 break;
21487 case V8HImode:
21488 if (high_p)
21489 unpack = gen_vec_interleave_highv8hi;
21490 else
21491 unpack = gen_vec_interleave_lowv8hi;
21492 break;
21493 case V4SImode:
21494 if (high_p)
21495 unpack = gen_vec_interleave_highv4si;
21496 else
21497 unpack = gen_vec_interleave_lowv4si;
21498 break;
21499 default:
21500 gcc_unreachable ();
21501 }
21502
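/* Without SSE4.1 extensions, unpack by interleaving SRC with either a
   zero vector (zero extension) or a vector of its sign bits obtained
   from a 0 > SRC comparison (sign extension).  */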
21503 if (unsigned_p)
21504 tmp = force_reg (imode, CONST0_RTX (imode));
21505 else
21506 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
21507 src, pc_rtx, pc_rtx);
21508
21509 rtx tmp2 = gen_reg_rtx (imode);
21510 emit_insn (unpack (tmp2, src, tmp));
21511 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
21512 }
21513 }
21514
21515 /* Expand conditional increment or decrement using adc/sbb instructions.
21516 The default case using setcc followed by the conditional move can be
21517 done by generic code. */
21518 bool
21519 ix86_expand_int_addcc (rtx operands[])
21520 {
21521 enum rtx_code code = GET_CODE (operands[1]);
21522 rtx flags;
21523 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
21524 rtx compare_op;
21525 rtx val = const0_rtx;
21526 bool fpcmp = false;
21527 enum machine_mode mode;
21528 rtx op0 = XEXP (operands[1], 0);
21529 rtx op1 = XEXP (operands[1], 1);
21530
21531 if (operands[3] != const1_rtx
21532 && operands[3] != constm1_rtx)
21533 return false;
21534 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
21535 return false;
21536 code = GET_CODE (compare_op);
21537
21538 flags = XEXP (compare_op, 0);
21539
21540 if (GET_MODE (flags) == CCFPmode
21541 || GET_MODE (flags) == CCFPUmode)
21542 {
21543 fpcmp = true;
21544 code = ix86_fp_compare_code_to_integer (code);
21545 }
21546
21547 if (code != LTU)
21548 {
21549 val = constm1_rtx;
21550 if (fpcmp)
21551 PUT_CODE (compare_op,
21552 reverse_condition_maybe_unordered
21553 (GET_CODE (compare_op)));
21554 else
21555 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
21556 }
21557
21558 mode = GET_MODE (operands[0]);
21559
21560 /* Construct either adc or sbb insn. */
21561 if ((code == LTU) == (operands[3] == constm1_rtx))
21562 {
21563 switch (mode)
21564 {
21565 case QImode:
21566 insn = gen_subqi3_carry;
21567 break;
21568 case HImode:
21569 insn = gen_subhi3_carry;
21570 break;
21571 case SImode:
21572 insn = gen_subsi3_carry;
21573 break;
21574 case DImode:
21575 insn = gen_subdi3_carry;
21576 break;
21577 default:
21578 gcc_unreachable ();
21579 }
21580 }
21581 else
21582 {
21583 switch (mode)
21584 {
21585 case QImode:
21586 insn = gen_addqi3_carry;
21587 break;
21588 case HImode:
21589 insn = gen_addhi3_carry;
21590 break;
21591 case SImode:
21592 insn = gen_addsi3_carry;
21593 break;
21594 case DImode:
21595 insn = gen_adddi3_carry;
21596 break;
21597 default:
21598 gcc_unreachable ();
21599 }
21600 }
21601 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
21602
21603 return true;
21604 }
21605
21606
21607 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
21608 but works for floating point parameters and non-offsettable memories.
21609 For pushes, it returns just stack offsets; the values will be saved
21610 in the right order. At most four parts are generated. */
21611
21612 static int
21613 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
21614 {
21615 int size;
21616
21617 if (!TARGET_64BIT)
21618 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
21619 else
21620 size = (GET_MODE_SIZE (mode) + 4) / 8;
21621
21622 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
21623 gcc_assert (size >= 2 && size <= 4);
21624
21625 /* Optimize constant pool references to immediates. This is used by fp
21626 moves, which force all constants to memory to allow combining. */
21627 if (MEM_P (operand) && MEM_READONLY_P (operand))
21628 {
21629 rtx tmp = maybe_get_pool_constant (operand);
21630 if (tmp)
21631 operand = tmp;
21632 }
21633
21634 if (MEM_P (operand) && !offsettable_memref_p (operand))
21635 {
21636 /* The only non-offsettable memories we handle are pushes. */
21637 int ok = push_operand (operand, VOIDmode);
21638
21639 gcc_assert (ok);
21640
21641 operand = copy_rtx (operand);
21642 PUT_MODE (operand, word_mode);
21643 parts[0] = parts[1] = parts[2] = parts[3] = operand;
21644 return size;
21645 }
21646
21647 if (GET_CODE (operand) == CONST_VECTOR)
21648 {
21649 enum machine_mode imode = int_mode_for_mode (mode);
21650 /* Caution: if we looked through a constant pool memory above,
21651 the operand may actually have a different mode now. That's
21652 ok, since we want to pun this all the way back to an integer. */
21653 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
21654 gcc_assert (operand != NULL);
21655 mode = imode;
21656 }
21657
21658 if (!TARGET_64BIT)
21659 {
21660 if (mode == DImode)
21661 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21662 else
21663 {
21664 int i;
21665
21666 if (REG_P (operand))
21667 {
21668 gcc_assert (reload_completed);
21669 for (i = 0; i < size; i++)
21670 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
21671 }
21672 else if (offsettable_memref_p (operand))
21673 {
21674 operand = adjust_address (operand, SImode, 0);
21675 parts[0] = operand;
21676 for (i = 1; i < size; i++)
21677 parts[i] = adjust_address (operand, SImode, 4 * i);
21678 }
21679 else if (GET_CODE (operand) == CONST_DOUBLE)
21680 {
21681 REAL_VALUE_TYPE r;
21682 long l[4];
21683
21684 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21685 switch (mode)
21686 {
21687 case TFmode:
21688 real_to_target (l, &r, mode);
21689 parts[3] = gen_int_mode (l[3], SImode);
21690 parts[2] = gen_int_mode (l[2], SImode);
21691 break;
21692 case XFmode:
21693 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21694 long double may not be 80-bit. */
21695 real_to_target (l, &r, mode);
21696 parts[2] = gen_int_mode (l[2], SImode);
21697 break;
21698 case DFmode:
21699 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21700 break;
21701 default:
21702 gcc_unreachable ();
21703 }
21704 parts[1] = gen_int_mode (l[1], SImode);
21705 parts[0] = gen_int_mode (l[0], SImode);
21706 }
21707 else
21708 gcc_unreachable ();
21709 }
21710 }
21711 else
21712 {
21713 if (mode == TImode)
21714 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21715 if (mode == XFmode || mode == TFmode)
21716 {
21717 enum machine_mode upper_mode = mode == XFmode ? SImode : DImode;
21718 if (REG_P (operand))
21719 {
21720 gcc_assert (reload_completed);
21721 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21722 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21723 }
21724 else if (offsettable_memref_p (operand))
21725 {
21726 operand = adjust_address (operand, DImode, 0);
21727 parts[0] = operand;
21728 parts[1] = adjust_address (operand, upper_mode, 8);
21729 }
21730 else if (GET_CODE (operand) == CONST_DOUBLE)
21731 {
21732 REAL_VALUE_TYPE r;
21733 long l[4];
21734
21735 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21736 real_to_target (l, &r, mode);
21737
21738 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21739 if (HOST_BITS_PER_WIDE_INT >= 64)
21740 parts[0]
21741 = gen_int_mode
21742 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21743 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21744 DImode);
21745 else
21746 parts[0] = immed_double_const (l[0], l[1], DImode);
21747
21748 if (upper_mode == SImode)
21749 parts[1] = gen_int_mode (l[2], SImode);
21750 else if (HOST_BITS_PER_WIDE_INT >= 64)
21751 parts[1]
21752 = gen_int_mode
21753 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21754 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21755 DImode);
21756 else
21757 parts[1] = immed_double_const (l[2], l[3], DImode);
21758 }
21759 else
21760 gcc_unreachable ();
21761 }
21762 }
21763
21764 return size;
21765 }
21766
21767 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21768 All required insns are emitted here; nothing is left for the caller.
21769 Operands 2-5 are filled with the destination parts and operands 6-9
21770 with the source parts, in the order the moves are emitted. */
21771
21772 void
21773 ix86_split_long_move (rtx operands[])
21774 {
21775 rtx part[2][4];
21776 int nparts, i, j;
21777 int push = 0;
21778 int collisions = 0;
21779 enum machine_mode mode = GET_MODE (operands[0]);
21780 bool collisionparts[4];
21781
21782 /* The DFmode expanders may ask us to move a double.
21783 For a 64-bit target this is a single move. By hiding that fact
21784 here we simplify the i386.md splitters. */
21785 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21786 {
21787 /* Optimize constant pool references to immediates. This is used by
21788 fp moves, which force all constants to memory to allow combining. */
21789
21790 if (MEM_P (operands[1])
21791 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21792 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21793 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21794 if (push_operand (operands[0], VOIDmode))
21795 {
21796 operands[0] = copy_rtx (operands[0]);
21797 PUT_MODE (operands[0], word_mode);
21798 }
21799 else
21800 operands[0] = gen_lowpart (DImode, operands[0]);
21801 operands[1] = gen_lowpart (DImode, operands[1]);
21802 emit_move_insn (operands[0], operands[1]);
21803 return;
21804 }
21805
21806 /* The only non-offsettable memory we handle is push. */
21807 if (push_operand (operands[0], VOIDmode))
21808 push = 1;
21809 else
21810 gcc_assert (!MEM_P (operands[0])
21811 || offsettable_memref_p (operands[0]));
21812
21813 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21814 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21815
21816 /* When emitting a push, take care of source operands living on the stack. */
21817 if (push && MEM_P (operands[1])
21818 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21819 {
21820 rtx src_base = XEXP (part[1][nparts - 1], 0);
21821
21822 /* Compensate for the stack decrement by 4. */
21823 if (!TARGET_64BIT && nparts == 3
21824 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21825 src_base = plus_constant (Pmode, src_base, 4);
21826
21827 /* src_base refers to the stack pointer and is
21828 automatically decreased by emitted push. */
21829 for (i = 0; i < nparts; i++)
21830 part[1][i] = change_address (part[1][i],
21831 GET_MODE (part[1][i]), src_base);
21832 }
21833
21834 /* We need to do the copy in the right order in case an address register
21835 of the source overlaps the destination. */
21836 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21837 {
21838 rtx tmp;
21839
21840 for (i = 0; i < nparts; i++)
21841 {
21842 collisionparts[i]
21843 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21844 if (collisionparts[i])
21845 collisions++;
21846 }
21847
21848 /* Collision in the middle part can be handled by reordering. */
21849 if (collisions == 1 && nparts == 3 && collisionparts [1])
21850 {
21851 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21852 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21853 }
21854 else if (collisions == 1
21855 && nparts == 4
21856 && (collisionparts [1] || collisionparts [2]))
21857 {
21858 if (collisionparts [1])
21859 {
21860 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21861 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21862 }
21863 else
21864 {
21865 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21866 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21867 }
21868 }
21869
21870 /* If there are more collisions, we can't handle it by reordering.
21871 Do an lea to the last part and use only one colliding move. */
21872 else if (collisions > 1)
21873 {
21874 rtx base;
21875
21876 collisions = 1;
21877
21878 base = part[0][nparts - 1];
21879
21880 /* Handle the case when the last part isn't valid for lea.
21881 Happens in 64-bit mode storing the 12-byte XFmode. */
21882 if (GET_MODE (base) != Pmode)
21883 base = gen_rtx_REG (Pmode, REGNO (base));
21884
21885 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21886 part[1][0] = replace_equiv_address (part[1][0], base);
21887 for (i = 1; i < nparts; i++)
21888 {
21889 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21890 part[1][i] = replace_equiv_address (part[1][i], tmp);
21891 }
21892 }
21893 }
21894
21895 if (push)
21896 {
21897 if (!TARGET_64BIT)
21898 {
21899 if (nparts == 3)
21900 {
21901 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21902 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21903 stack_pointer_rtx, GEN_INT (-4)));
21904 emit_move_insn (part[0][2], part[1][2]);
21905 }
21906 else if (nparts == 4)
21907 {
21908 emit_move_insn (part[0][3], part[1][3]);
21909 emit_move_insn (part[0][2], part[1][2]);
21910 }
21911 }
21912 else
21913 {
21914 /* In 64-bit mode we don't have a 32-bit push available. If the operand
21915 is a register, that is OK - we just use the larger counterpart. We also
21916 retype memory - these come from an attempt to avoid a REX prefix on
21917 moving the second half of a TFmode value. */
21918 if (GET_MODE (part[1][1]) == SImode)
21919 {
21920 switch (GET_CODE (part[1][1]))
21921 {
21922 case MEM:
21923 part[1][1] = adjust_address (part[1][1], DImode, 0);
21924 break;
21925
21926 case REG:
21927 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21928 break;
21929
21930 default:
21931 gcc_unreachable ();
21932 }
21933
21934 if (GET_MODE (part[1][0]) == SImode)
21935 part[1][0] = part[1][1];
21936 }
21937 }
21938 emit_move_insn (part[0][1], part[1][1]);
21939 emit_move_insn (part[0][0], part[1][0]);
21940 return;
21941 }
21942
21943 /* Choose correct order to not overwrite the source before it is copied. */
21944 if ((REG_P (part[0][0])
21945 && REG_P (part[1][1])
21946 && (REGNO (part[0][0]) == REGNO (part[1][1])
21947 || (nparts == 3
21948 && REGNO (part[0][0]) == REGNO (part[1][2]))
21949 || (nparts == 4
21950 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21951 || (collisions > 0
21952 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21953 {
21954 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21955 {
21956 operands[2 + i] = part[0][j];
21957 operands[6 + i] = part[1][j];
21958 }
21959 }
21960 else
21961 {
21962 for (i = 0; i < nparts; i++)
21963 {
21964 operands[2 + i] = part[0][i];
21965 operands[6 + i] = part[1][i];
21966 }
21967 }
21968
21969 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21970 if (optimize_insn_for_size_p ())
21971 {
21972 for (j = 0; j < nparts - 1; j++)
21973 if (CONST_INT_P (operands[6 + j])
21974 && operands[6 + j] != const0_rtx
21975 && REG_P (operands[2 + j]))
21976 for (i = j; i < nparts - 1; i++)
21977 if (CONST_INT_P (operands[7 + i])
21978 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21979 operands[7 + i] = operands[2 + j];
21980 }
21981
21982 for (i = 0; i < nparts; i++)
21983 emit_move_insn (operands[2 + i], operands[6 + i]);
21984
21985 return;
21986 }
21987
21988 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21989 left shift by a constant, either using a single shift or
21990 a sequence of add instructions. */
21991
21992 static void
21993 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21994 {
21995 rtx (*insn)(rtx, rtx, rtx);
21996
21997 if (count == 1
21998 || (count * ix86_cost->add <= ix86_cost->shift_const
21999 && !optimize_insn_for_size_p ()))
22000 {
22001 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
22002 while (count-- > 0)
22003 emit_insn (insn (operand, operand, operand));
22004 }
22005 else
22006 {
22007 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
22008 emit_insn (insn (operand, operand, GEN_INT (count)));
22009 }
22010 }
22011
22012 void
22013 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
22014 {
22015 rtx (*gen_ashl3)(rtx, rtx, rtx);
22016 rtx (*gen_shld)(rtx, rtx, rtx);
22017 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22018
22019 rtx low[2], high[2];
22020 int count;
22021
22022 if (CONST_INT_P (operands[2]))
22023 {
22024 split_double_mode (mode, operands, 2, low, high);
22025 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22026
22027 if (count >= half_width)
22028 {
22029 emit_move_insn (high[0], low[1]);
22030 emit_move_insn (low[0], const0_rtx);
22031
22032 if (count > half_width)
22033 ix86_expand_ashl_const (high[0], count - half_width, mode);
22034 }
22035 else
22036 {
22037 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
22038
22039 if (!rtx_equal_p (operands[0], operands[1]))
22040 emit_move_insn (operands[0], operands[1]);
22041
22042 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
22043 ix86_expand_ashl_const (low[0], count, mode);
22044 }
22045 return;
22046 }
22047
22048 split_double_mode (mode, operands, 1, low, high);
22049
22050 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
22051
22052 if (operands[1] == const1_rtx)
22053 {
22054 /* Assuming we've chosen QImode-capable registers, 1 << N
22055 can be done with two 32/64-bit shifts, no branches, no cmoves. */
22056 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
22057 {
22058 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
22059
22060 ix86_expand_clear (low[0]);
22061 ix86_expand_clear (high[0]);
22062 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
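/* ZF now tells whether (COUNT & HALF_WIDTH) is zero: the EQ store below puts
   the 1 into the low half and the NE store into the high half (sete/setne).
   The final shifts by the full count then move the 1 into place, since x86
   shifts only use the low 5/6 bits of the count. */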
22063
22064 d = gen_lowpart (QImode, low[0]);
22065 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
22066 s = gen_rtx_EQ (QImode, flags, const0_rtx);
22067 emit_insn (gen_rtx_SET (VOIDmode, d, s));
22068
22069 d = gen_lowpart (QImode, high[0]);
22070 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
22071 s = gen_rtx_NE (QImode, flags, const0_rtx);
22072 emit_insn (gen_rtx_SET (VOIDmode, d, s));
22073 }
22074
22075 /* Otherwise, we can get the same results by manually performing
22076 a bit extract operation on bit 5/6, and then performing the two
22077 shifts. The two methods of getting 0/1 into low/high are exactly
22078 the same size. Avoiding the shift in the bit extract case helps
22079 pentium4 a bit; no one else seems to care much either way. */
22080 else
22081 {
22082 enum machine_mode half_mode;
22083 rtx (*gen_lshr3)(rtx, rtx, rtx);
22084 rtx (*gen_and3)(rtx, rtx, rtx);
22085 rtx (*gen_xor3)(rtx, rtx, rtx);
22086 HOST_WIDE_INT bits;
22087 rtx x;
22088
22089 if (mode == DImode)
22090 {
22091 half_mode = SImode;
22092 gen_lshr3 = gen_lshrsi3;
22093 gen_and3 = gen_andsi3;
22094 gen_xor3 = gen_xorsi3;
22095 bits = 5;
22096 }
22097 else
22098 {
22099 half_mode = DImode;
22100 gen_lshr3 = gen_lshrdi3;
22101 gen_and3 = gen_anddi3;
22102 gen_xor3 = gen_xordi3;
22103 bits = 6;
22104 }
22105
22106 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
22107 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
22108 else
22109 x = gen_lowpart (half_mode, operands[2]);
22110 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
22111
22112 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
22113 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
22114 emit_move_insn (low[0], high[0]);
22115 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
22116 }
22117
22118 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
22119 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
22120 return;
22121 }
22122
22123 if (operands[1] == constm1_rtx)
22124 {
22125 /* For -1 << N, we can avoid the shld instruction, because we
22126 know that we're shifting 0...31/63 ones into a -1. */
22127 emit_move_insn (low[0], constm1_rtx);
22128 if (optimize_insn_for_size_p ())
22129 emit_move_insn (high[0], low[0]);
22130 else
22131 emit_move_insn (high[0], constm1_rtx);
22132 }
22133 else
22134 {
22135 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
22136
22137 if (!rtx_equal_p (operands[0], operands[1]))
22138 emit_move_insn (operands[0], operands[1]);
22139
22140 split_double_mode (mode, operands, 1, low, high);
22141 emit_insn (gen_shld (high[0], low[0], operands[2]));
22142 }
22143
22144 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
22145
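/* The sequence above is only correct for shift counts below HALF_WIDTH.
   The *_adj patterns test the HALF_WIDTH bit of the count and, if it is set,
   move the low half into the high half and clear the low half - with CMOV
   and a zeroed scratch when available, otherwise with a branch (see
   i386.md). */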
22146 if (TARGET_CMOVE && scratch)
22147 {
22148 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22149 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22150
22151 ix86_expand_clear (scratch);
22152 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
22153 }
22154 else
22155 {
22156 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22157 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22158
22159 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
22160 }
22161 }
22162
22163 void
22164 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
22165 {
22166 rtx (*gen_ashr3)(rtx, rtx, rtx)
22167 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
22168 rtx (*gen_shrd)(rtx, rtx, rtx);
22169 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22170
22171 rtx low[2], high[2];
22172 int count;
22173
22174 if (CONST_INT_P (operands[2]))
22175 {
22176 split_double_mode (mode, operands, 2, low, high);
22177 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22178
22179 if (count == GET_MODE_BITSIZE (mode) - 1)
22180 {
22181 emit_move_insn (high[0], high[1]);
22182 emit_insn (gen_ashr3 (high[0], high[0],
22183 GEN_INT (half_width - 1)));
22184 emit_move_insn (low[0], high[0]);
22185
22186 }
22187 else if (count >= half_width)
22188 {
22189 emit_move_insn (low[0], high[1]);
22190 emit_move_insn (high[0], low[0]);
22191 emit_insn (gen_ashr3 (high[0], high[0],
22192 GEN_INT (half_width - 1)));
22193
22194 if (count > half_width)
22195 emit_insn (gen_ashr3 (low[0], low[0],
22196 GEN_INT (count - half_width)));
22197 }
22198 else
22199 {
22200 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22201
22202 if (!rtx_equal_p (operands[0], operands[1]))
22203 emit_move_insn (operands[0], operands[1]);
22204
22205 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
22206 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
22207 }
22208 }
22209 else
22210 {
22211 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22212
22213 if (!rtx_equal_p (operands[0], operands[1]))
22214 emit_move_insn (operands[0], operands[1]);
22215
22216 split_double_mode (mode, operands, 1, low, high);
22217
22218 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22219 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
22220
22221 if (TARGET_CMOVE && scratch)
22222 {
22223 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22224 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22225
22226 emit_move_insn (scratch, high[0]);
22227 emit_insn (gen_ashr3 (scratch, scratch,
22228 GEN_INT (half_width - 1)));
22229 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22230 scratch));
22231 }
22232 else
22233 {
22234 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
22235 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
22236
22237 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
22238 }
22239 }
22240 }
22241
22242 void
22243 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
22244 {
22245 rtx (*gen_lshr3)(rtx, rtx, rtx)
22246 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
22247 rtx (*gen_shrd)(rtx, rtx, rtx);
22248 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22249
22250 rtx low[2], high[2];
22251 int count;
22252
22253 if (CONST_INT_P (operands[2]))
22254 {
22255 split_double_mode (mode, operands, 2, low, high);
22256 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22257
22258 if (count >= half_width)
22259 {
22260 emit_move_insn (low[0], high[1]);
22261 ix86_expand_clear (high[0]);
22262
22263 if (count > half_width)
22264 emit_insn (gen_lshr3 (low[0], low[0],
22265 GEN_INT (count - half_width)));
22266 }
22267 else
22268 {
22269 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22270
22271 if (!rtx_equal_p (operands[0], operands[1]))
22272 emit_move_insn (operands[0], operands[1]);
22273
22274 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
22275 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
22276 }
22277 }
22278 else
22279 {
22280 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22281
22282 if (!rtx_equal_p (operands[0], operands[1]))
22283 emit_move_insn (operands[0], operands[1]);
22284
22285 split_double_mode (mode, operands, 1, low, high);
22286
22287 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22288 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
22289
22290 if (TARGET_CMOVE && scratch)
22291 {
22292 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22293 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22294
22295 ix86_expand_clear (scratch);
22296 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22297 scratch));
22298 }
22299 else
22300 {
22301 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22302 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22303
22304 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
22305 }
22306 }
22307 }
22308
22309 /* Predict just emitted jump instruction to be taken with probability PROB. */
22310 static void
22311 predict_jump (int prob)
22312 {
22313 rtx insn = get_last_insn ();
22314 gcc_assert (JUMP_P (insn));
22315 add_int_reg_note (insn, REG_BR_PROB, prob);
22316 }
22317
22318 /* Helper function for the string operations below. Test whether the VALUE
22319 bit of VARIABLE is clear, and if so jump to the returned label. */
22320 static rtx
22321 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
22322 {
22323 rtx label = gen_label_rtx ();
22324 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
22325 if (GET_MODE (variable) == DImode)
22326 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
22327 else
22328 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
22329 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
22330 1, label);
22331 if (epilogue)
22332 predict_jump (REG_BR_PROB_BASE * 50 / 100);
22333 else
22334 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22335 return label;
22336 }
22337
22338 /* Decrease COUNTREG by VALUE. */
22339 static void
22340 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
22341 {
22342 rtx (*gen_add)(rtx, rtx, rtx)
22343 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
22344
22345 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
22346 }
22347
22348 /* Zero extend possibly SImode EXP to Pmode register. */
22349 rtx
22350 ix86_zero_extend_to_Pmode (rtx exp)
22351 {
22352 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
22353 }
22354
22355 /* Divide COUNTREG by SCALE. */
22356 static rtx
22357 scale_counter (rtx countreg, int scale)
22358 {
22359 rtx sc;
22360
22361 if (scale == 1)
22362 return countreg;
22363 if (CONST_INT_P (countreg))
22364 return GEN_INT (INTVAL (countreg) / scale);
22365 gcc_assert (REG_P (countreg));
22366
22367 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
22368 GEN_INT (exact_log2 (scale)),
22369 NULL, 1, OPTAB_DIRECT);
22370 return sc;
22371 }
22372
22373 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
22374 DImode for constant loop counts. */
22375
22376 static enum machine_mode
22377 counter_mode (rtx count_exp)
22378 {
22379 if (GET_MODE (count_exp) != VOIDmode)
22380 return GET_MODE (count_exp);
22381 if (!CONST_INT_P (count_exp))
22382 return Pmode;
22383 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
22384 return DImode;
22385 return SImode;
22386 }
22387
22388 /* Copy the address to a Pmode register. This is used for x32 to
22389 truncate DImode TLS address to a SImode register. */
22390
22391 static rtx
22392 ix86_copy_addr_to_reg (rtx addr)
22393 {
22394 if (GET_MODE (addr) == Pmode)
22395 return copy_addr_to_reg (addr);
22396 else
22397 {
22398 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
22399 return gen_rtx_SUBREG (SImode, copy_to_mode_reg (DImode, addr), 0);
22400 }
22401 }
22402
22403 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
22404 SRCPTR to DESTPTR via chunks of MODE, unrolled UNROLL times; the overall size
22405 is COUNT, specified in bytes. When ISSETMEM is TRUE, output the equivalent
22406 loop to set memory to VALUE (supposed to be in MODE).
22407
22408 The size is rounded down to a whole number of the chunk size moved at once.
22409 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
22410
22411
22412 static void
22413 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
22414 rtx destptr, rtx srcptr, rtx value,
22415 rtx count, enum machine_mode mode, int unroll,
22416 int expected_size, bool issetmem)
22417 {
22418 rtx out_label, top_label, iter, tmp;
22419 enum machine_mode iter_mode = counter_mode (count);
22420 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
22421 rtx piece_size = GEN_INT (piece_size_n);
22422 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
22423 rtx size;
22424 int i;
22425
22426 top_label = gen_label_rtx ();
22427 out_label = gen_label_rtx ();
22428 iter = gen_reg_rtx (iter_mode);
22429
22430 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
22431 NULL, 1, OPTAB_DIRECT);
22432 /* Those two should combine. */
22433 if (piece_size == const1_rtx)
22434 {
22435 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
22436 true, out_label);
22437 predict_jump (REG_BR_PROB_BASE * 10 / 100);
22438 }
22439 emit_move_insn (iter, const0_rtx);
22440
22441 emit_label (top_label);
22442
22443 tmp = convert_modes (Pmode, iter_mode, iter, true);
22444
22445 /* This assert could be relaxed - in that case we would need to compute
22446 the smallest power of two containing PIECE_SIZE_N and pass it to
22447 offset_address. */
22448 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
22449 destmem = offset_address (destmem, tmp, piece_size_n);
22450 destmem = adjust_address (destmem, mode, 0);
22451
22452 if (!issetmem)
22453 {
22454 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
22455 srcmem = adjust_address (srcmem, mode, 0);
22456
22457 /* When unrolling for chips that reorder memory reads and writes,
22458 we can save registers by using a single temporary.
22459 Also, using 4 temporaries is overkill in 32-bit mode. */
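/* Note that the single-temporary variant below is currently disabled by the
   "&& 0" in its condition, so the multi-temporary code in the else branch is
   always used. */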
22460 if (!TARGET_64BIT && 0)
22461 {
22462 for (i = 0; i < unroll; i++)
22463 {
22464 if (i)
22465 {
22466 destmem =
22467 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22468 srcmem =
22469 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22470 }
22471 emit_move_insn (destmem, srcmem);
22472 }
22473 }
22474 else
22475 {
22476 rtx tmpreg[4];
22477 gcc_assert (unroll <= 4);
22478 for (i = 0; i < unroll; i++)
22479 {
22480 tmpreg[i] = gen_reg_rtx (mode);
22481 if (i)
22482 {
22483 srcmem =
22484 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22485 }
22486 emit_move_insn (tmpreg[i], srcmem);
22487 }
22488 for (i = 0; i < unroll; i++)
22489 {
22490 if (i)
22491 {
22492 destmem =
22493 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22494 }
22495 emit_move_insn (destmem, tmpreg[i]);
22496 }
22497 }
22498 }
22499 else
22500 for (i = 0; i < unroll; i++)
22501 {
22502 if (i)
22503 destmem =
22504 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22505 emit_move_insn (destmem, value);
22506 }
22507
22508 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
22509 true, OPTAB_LIB_WIDEN);
22510 if (tmp != iter)
22511 emit_move_insn (iter, tmp);
22512
22513 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
22514 true, top_label);
22515 if (expected_size != -1)
22516 {
22517 expected_size /= GET_MODE_SIZE (mode) * unroll;
22518 if (expected_size == 0)
22519 predict_jump (0);
22520 else if (expected_size > REG_BR_PROB_BASE)
22521 predict_jump (REG_BR_PROB_BASE - 1);
22522 else
22523 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
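/* The expression above is REG_BR_PROB_BASE * (1 - 1/EXPECTED_SIZE), rounded:
   with EXPECTED_SIZE iterations the back edge is taken EXPECTED_SIZE - 1
   times out of EXPECTED_SIZE. */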
22524 }
22525 else
22526 predict_jump (REG_BR_PROB_BASE * 80 / 100);
22527 iter = ix86_zero_extend_to_Pmode (iter);
22528 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
22529 true, OPTAB_LIB_WIDEN);
22530 if (tmp != destptr)
22531 emit_move_insn (destptr, tmp);
22532 if (!issetmem)
22533 {
22534 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
22535 true, OPTAB_LIB_WIDEN);
22536 if (tmp != srcptr)
22537 emit_move_insn (srcptr, tmp);
22538 }
22539 emit_label (out_label);
22540 }
22541
22542 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
22543 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
22544 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
22545 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
22546 ORIG_VALUE is the original value passed to memset to fill the memory with.
22547 Other arguments have the same meaning as for the previous function. */
22548
22549 static void
22550 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
22551 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
22552 rtx count,
22553 enum machine_mode mode, bool issetmem)
22554 {
22555 rtx destexp;
22556 rtx srcexp;
22557 rtx countreg;
22558 HOST_WIDE_INT rounded_count;
22559
22560 /* If possible, it is shorter to use rep movs.
22561 TODO: Maybe it is better to move this logic to decide_alg. */
22562 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
22563 && (!issetmem || orig_value == const0_rtx))
22564 mode = SImode;
22565
22566 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22567 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22568
22569 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
22570 GET_MODE_SIZE (mode)));
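/* DESTEXP (and SRCEXP below) describe the pointer value after the rep insn
   completes, i.e. pointer + count * chunk size; the rep_stos and rep_mov
   patterns use them to express the pointer updates. */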
22571 if (mode != QImode)
22572 {
22573 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22574 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22575 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22576 }
22577 else
22578 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22579 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
22580 {
22581 rounded_count = (INTVAL (count)
22582 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22583 destmem = shallow_copy_rtx (destmem);
22584 set_mem_size (destmem, rounded_count);
22585 }
22586 else if (MEM_SIZE_KNOWN_P (destmem))
22587 clear_mem_size (destmem);
22588
22589 if (issetmem)
22590 {
22591 value = force_reg (mode, gen_lowpart (mode, value));
22592 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
22593 }
22594 else
22595 {
22596 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
22597 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
22598 if (mode != QImode)
22599 {
22600 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
22601 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22602 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
22603 }
22604 else
22605 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
22606 if (CONST_INT_P (count))
22607 {
22608 rounded_count = (INTVAL (count)
22609 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22610 srcmem = shallow_copy_rtx (srcmem);
22611 set_mem_size (srcmem, rounded_count);
22612 }
22613 else
22614 {
22615 if (MEM_SIZE_KNOWN_P (srcmem))
22616 clear_mem_size (srcmem);
22617 }
22618 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
22619 destexp, srcexp));
22620 }
22621 }
22622
22623 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
22624 DESTMEM.
22625 SRCMEM is passed by pointer so that it can be updated on return.
22626 The return value is the updated DESTMEM. */
22627 static rtx
22628 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
22629 HOST_WIDE_INT size_to_move)
22630 {
22631 rtx dst = destmem, src = *srcmem, adjust, tempreg;
22632 enum insn_code code;
22633 enum machine_mode move_mode;
22634 int piece_size, i;
22635
22636 /* Find the widest mode in which we could perform moves.
22637 Start with the biggest power of 2 not greater than SIZE_TO_MOVE and halve
22638 it until a move of that size is supported. */
22639 piece_size = 1 << floor_log2 (size_to_move);
22640 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22641 code = optab_handler (mov_optab, move_mode);
22642 while (code == CODE_FOR_nothing && piece_size > 1)
22643 {
22644 piece_size >>= 1;
22645 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22646 code = optab_handler (mov_optab, move_mode);
22647 }
22648
22649 /* Find the corresponding vector mode with the same size as MOVE_MODE.
22650 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
22651 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
22652 {
22653 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
22654 move_mode = mode_for_vector (word_mode, nunits);
22655 code = optab_handler (mov_optab, move_mode);
22656 if (code == CODE_FOR_nothing)
22657 {
22658 move_mode = word_mode;
22659 piece_size = GET_MODE_SIZE (move_mode);
22660 code = optab_handler (mov_optab, move_mode);
22661 }
22662 }
22663 gcc_assert (code != CODE_FOR_nothing);
22664
22665 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22666 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
22667
22668 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
22669 gcc_assert (size_to_move % piece_size == 0);
22670 adjust = GEN_INT (piece_size);
22671 for (i = 0; i < size_to_move; i += piece_size)
22672 {
22673 /* We move from memory to memory, so we'll need to do it via
22674 a temporary register. */
22675 tempreg = gen_reg_rtx (move_mode);
22676 emit_insn (GEN_FCN (code) (tempreg, src));
22677 emit_insn (GEN_FCN (code) (dst, tempreg));
22678
22679 emit_move_insn (destptr,
22680 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22681 emit_move_insn (srcptr,
22682 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
22683
22684 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22685 piece_size);
22686 src = adjust_automodify_address_nv (src, move_mode, srcptr,
22687 piece_size);
22688 }
22689
22690 /* Update DST and SRC rtx. */
22691 *srcmem = src;
22692 return dst;
22693 }
22694
22695 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
22696 static void
22697 expand_movmem_epilogue (rtx destmem, rtx srcmem,
22698 rtx destptr, rtx srcptr, rtx count, int max_size)
22699 {
22700 rtx src, dest;
22701 if (CONST_INT_P (count))
22702 {
22703 HOST_WIDE_INT countval = INTVAL (count);
22704 HOST_WIDE_INT epilogue_size = countval % max_size;
22705 int i;
22706
22707 /* For now MAX_SIZE should be a power of 2. This assert could be
22708 relaxed, but it'll require a bit more complicated epilogue
22709 expanding. */
22710 gcc_assert ((max_size & (max_size - 1)) == 0);
22711 for (i = max_size; i >= 1; i >>= 1)
22712 {
22713 if (epilogue_size & i)
22714 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22715 }
22716 return;
22717 }
22718 if (max_size > 8)
22719 {
22720 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
22721 count, 1, OPTAB_DIRECT);
22722 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22723 count, QImode, 1, 4, false);
22724 return;
22725 }
22726
22727 /* When single-instruction stringops are available, we can cheaply advance
22728 the dest and src pointers. Otherwise we save code size by maintaining an
22729 offset (zero is readily available from the preceding rep operation) and
22730 using x86 addressing modes. */
22731 if (TARGET_SINGLE_STRINGOP)
22732 {
22733 if (max_size > 4)
22734 {
22735 rtx label = ix86_expand_aligntest (count, 4, true);
22736 src = change_address (srcmem, SImode, srcptr);
22737 dest = change_address (destmem, SImode, destptr);
22738 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22739 emit_label (label);
22740 LABEL_NUSES (label) = 1;
22741 }
22742 if (max_size > 2)
22743 {
22744 rtx label = ix86_expand_aligntest (count, 2, true);
22745 src = change_address (srcmem, HImode, srcptr);
22746 dest = change_address (destmem, HImode, destptr);
22747 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22748 emit_label (label);
22749 LABEL_NUSES (label) = 1;
22750 }
22751 if (max_size > 1)
22752 {
22753 rtx label = ix86_expand_aligntest (count, 1, true);
22754 src = change_address (srcmem, QImode, srcptr);
22755 dest = change_address (destmem, QImode, destptr);
22756 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22757 emit_label (label);
22758 LABEL_NUSES (label) = 1;
22759 }
22760 }
22761 else
22762 {
22763 rtx offset = force_reg (Pmode, const0_rtx);
22764 rtx tmp;
22765
22766 if (max_size > 4)
22767 {
22768 rtx label = ix86_expand_aligntest (count, 4, true);
22769 src = change_address (srcmem, SImode, srcptr);
22770 dest = change_address (destmem, SImode, destptr);
22771 emit_move_insn (dest, src);
22772 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22773 true, OPTAB_LIB_WIDEN);
22774 if (tmp != offset)
22775 emit_move_insn (offset, tmp);
22776 emit_label (label);
22777 LABEL_NUSES (label) = 1;
22778 }
22779 if (max_size > 2)
22780 {
22781 rtx label = ix86_expand_aligntest (count, 2, true);
22782 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22783 src = change_address (srcmem, HImode, tmp);
22784 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22785 dest = change_address (destmem, HImode, tmp);
22786 emit_move_insn (dest, src);
22787 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22788 true, OPTAB_LIB_WIDEN);
22789 if (tmp != offset)
22790 emit_move_insn (offset, tmp);
22791 emit_label (label);
22792 LABEL_NUSES (label) = 1;
22793 }
22794 if (max_size > 1)
22795 {
22796 rtx label = ix86_expand_aligntest (count, 1, true);
22797 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22798 src = change_address (srcmem, QImode, tmp);
22799 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22800 dest = change_address (destmem, QImode, tmp);
22801 emit_move_insn (dest, src);
22802 emit_label (label);
22803 LABEL_NUSES (label) = 1;
22804 }
22805 }
22806 }
22807
22808 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
22809 with value PROMOTED_VAL.
22810 The return value is the updated DESTMEM. */
22812 static rtx
22813 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
22814 HOST_WIDE_INT size_to_move)
22815 {
22816 rtx dst = destmem, adjust;
22817 enum insn_code code;
22818 enum machine_mode move_mode;
22819 int piece_size, i;
22820
22821 /* Find the widest mode in which we could perform moves.
22822 Start with the mode of PROMOTED_VAL and reduce it if SIZE_TO_MOVE
22823 is smaller. */
22824 move_mode = GET_MODE (promoted_val);
22825 if (move_mode == VOIDmode)
22826 move_mode = QImode;
22827 if (size_to_move < GET_MODE_SIZE (move_mode))
22828 {
22829 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
22830 promoted_val = gen_lowpart (move_mode, promoted_val);
22831 }
22832 piece_size = GET_MODE_SIZE (move_mode);
22833 code = optab_handler (mov_optab, move_mode);
22834 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
22835
22836 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22837
22838 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
22839 gcc_assert (size_to_move % piece_size == 0);
22840 adjust = GEN_INT (piece_size);
22841 for (i = 0; i < size_to_move; i += piece_size)
22842 {
22843 if (piece_size <= GET_MODE_SIZE (word_mode))
22844 {
22845 emit_insn (gen_strset (destptr, dst, promoted_val));
22846 continue;
22847 }
22848
22849 emit_insn (GEN_FCN (code) (dst, promoted_val));
22850
22851 emit_move_insn (destptr,
22852 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22853
22854 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22855 piece_size);
22856 }
22857
22858 /* Update DST rtx. */
22859 return dst;
22860 }
22861 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22862 static void
22863 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22864 rtx count, int max_size)
22865 {
22866 count =
22867 expand_simple_binop (counter_mode (count), AND, count,
22868 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22869 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22870 gen_lowpart (QImode, value), count, QImode,
22871 1, max_size / 2, true);
22872 }
22873
22874 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22875 static void
22876 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
22877 rtx count, int max_size)
22878 {
22879 rtx dest;
22880
22881 if (CONST_INT_P (count))
22882 {
22883 HOST_WIDE_INT countval = INTVAL (count);
22884 HOST_WIDE_INT epilogue_size = countval % max_size;
22885 int i;
22886
22887 /* For now MAX_SIZE should be a power of 2. This assert could be
22888 relaxed, but it'll require a bit more complicated epilogue
22889 expanding. */
22890 gcc_assert ((max_size & (max_size - 1)) == 0);
22891 for (i = max_size; i >= 1; i >>= 1)
22892 {
22893 if (epilogue_size & i)
22894 {
22895 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
22896 destmem = emit_memset (destmem, destptr, vec_value, i);
22897 else
22898 destmem = emit_memset (destmem, destptr, value, i);
22899 }
22900 }
22901 return;
22902 }
22903 if (max_size > 32)
22904 {
22905 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22906 return;
22907 }
22908 if (max_size > 16)
22909 {
22910 rtx label = ix86_expand_aligntest (count, 16, true);
22911 if (TARGET_64BIT)
22912 {
22913 dest = change_address (destmem, DImode, destptr);
22914 emit_insn (gen_strset (destptr, dest, value));
22915 emit_insn (gen_strset (destptr, dest, value));
22916 }
22917 else
22918 {
22919 dest = change_address (destmem, SImode, destptr);
22920 emit_insn (gen_strset (destptr, dest, value));
22921 emit_insn (gen_strset (destptr, dest, value));
22922 emit_insn (gen_strset (destptr, dest, value));
22923 emit_insn (gen_strset (destptr, dest, value));
22924 }
22925 emit_label (label);
22926 LABEL_NUSES (label) = 1;
22927 }
22928 if (max_size > 8)
22929 {
22930 rtx label = ix86_expand_aligntest (count, 8, true);
22931 if (TARGET_64BIT)
22932 {
22933 dest = change_address (destmem, DImode, destptr);
22934 emit_insn (gen_strset (destptr, dest, value));
22935 }
22936 else
22937 {
22938 dest = change_address (destmem, SImode, destptr);
22939 emit_insn (gen_strset (destptr, dest, value));
22940 emit_insn (gen_strset (destptr, dest, value));
22941 }
22942 emit_label (label);
22943 LABEL_NUSES (label) = 1;
22944 }
22945 if (max_size > 4)
22946 {
22947 rtx label = ix86_expand_aligntest (count, 4, true);
22948 dest = change_address (destmem, SImode, destptr);
22949 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22950 emit_label (label);
22951 LABEL_NUSES (label) = 1;
22952 }
22953 if (max_size > 2)
22954 {
22955 rtx label = ix86_expand_aligntest (count, 2, true);
22956 dest = change_address (destmem, HImode, destptr);
22957 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22958 emit_label (label);
22959 LABEL_NUSES (label) = 1;
22960 }
22961 if (max_size > 1)
22962 {
22963 rtx label = ix86_expand_aligntest (count, 1, true);
22964 dest = change_address (destmem, QImode, destptr);
22965 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22966 emit_label (label);
22967 LABEL_NUSES (label) = 1;
22968 }
22969 }
22970
22971 /* Depending on ISSETMEM, copy enough bytes from SRCMEM to DESTMEM, or set
22972 enough bytes of DESTMEM, to align it to DESIRED_ALIGNMENT. The original
22973 alignment is ALIGN. Depending on ISSETMEM, either arguments SRCMEM/SRCPTR
22974 or VALUE/VEC_VALUE are ignored.
22975 The return value is the updated DESTMEM. */
22976 static rtx
22977 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
22978 rtx destptr, rtx srcptr, rtx value,
22979 rtx vec_value, rtx count, int align,
22980 int desired_alignment, bool issetmem)
22981 {
22982 int i;
22983 for (i = 1; i < desired_alignment; i <<= 1)
22984 {
22985 if (align <= i)
22986 {
22987 rtx label = ix86_expand_aligntest (destptr, i, false);
22988 if (issetmem)
22989 {
22990 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
22991 destmem = emit_memset (destmem, destptr, vec_value, i);
22992 else
22993 destmem = emit_memset (destmem, destptr, value, i);
22994 }
22995 else
22996 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22997 ix86_adjust_counter (count, i);
22998 emit_label (label);
22999 LABEL_NUSES (label) = 1;
23000 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
23001 }
23002 }
23003 return destmem;
23004 }
23005
23006 /* Test if COUNT & SIZE is nonzero and if so, expand a movmem
23007 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
23008 and jump to DONE_LABEL. */
23009 static void
23010 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
23011 rtx destptr, rtx srcptr,
23012 rtx value, rtx vec_value,
23013 rtx count, int size,
23014 rtx done_label, bool issetmem)
23015 {
23016 rtx label = ix86_expand_aligntest (count, size, false);
23017 enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
23018 rtx modesize;
23019 int n;
23020
23021 /* If we do not have a vector value to copy, we must reduce the size. */
23022 if (issetmem)
23023 {
23024 if (!vec_value)
23025 {
23026 if (GET_MODE (value) == VOIDmode && size > 8)
23027 mode = Pmode;
23028 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
23029 mode = GET_MODE (value);
23030 }
23031 else
23032 mode = GET_MODE (vec_value), value = vec_value;
23033 }
23034 else
23035 {
23036 /* Choose appropriate vector mode. */
23037 if (size >= 32)
23038 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
23039 else if (size >= 16)
23040 mode = TARGET_SSE ? V16QImode : DImode;
23041 srcmem = change_address (srcmem, mode, srcptr);
23042 }
23043 destmem = change_address (destmem, mode, destptr);
23044 modesize = GEN_INT (GET_MODE_SIZE (mode));
23045 gcc_assert (GET_MODE_SIZE (mode) <= size);
23046 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
23047 {
23048 if (issetmem)
23049 emit_move_insn (destmem, gen_lowpart (mode, value));
23050 else
23051 {
23052 emit_move_insn (destmem, srcmem);
23053 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
23054 }
23055 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
23056 }
23057
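/* One more (possibly overlapping) move handles the tail; it is positioned so
   that it ends exactly at DESTPTR + COUNT. */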
23058 destmem = offset_address (destmem, count, 1);
23059 destmem = offset_address (destmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
23060 GET_MODE_SIZE (mode));
23061 if (issetmem)
23062 emit_move_insn (destmem, gen_lowpart (mode, value));
23063 else
23064 {
23065 srcmem = offset_address (srcmem, count, 1);
23066 srcmem = offset_address (srcmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
23067 GET_MODE_SIZE (mode));
23068 emit_move_insn (destmem, srcmem);
23069 }
23070 emit_jump_insn (gen_jump (done_label));
23071 emit_barrier ();
23072
23073 emit_label (label);
23074 LABEL_NUSES (label) = 1;
23075 }
23076
23077 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2),
23078 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
23079 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so we can
23080 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
23081 DONE_LABEL is a label after the whole copying sequence. The label is created
23082 on demand if *DONE_LABEL is NULL.
23083 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for
23084 the new bounds after the initial copies.
23085
23086 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
23087 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
23088 we will dispatch to a library call for large blocks.
23089
23090 In pseudocode we do:
23091
23092 if (COUNT < SIZE)
23093 {
23094 Assume that SIZE is 4. Bigger sizes are handled analogously
23095 if (COUNT & 4)
23096 {
23097 copy 4 bytes from SRCPTR to DESTPTR
23098 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
23099 goto done_label
23100 }
23101 if (!COUNT)
23102 goto done_label;
23103 copy 1 byte from SRCPTR to DESTPTR
23104 if (COUNT & 2)
23105 {
23106 copy 2 bytes from SRCPTR to DESTPTR
23107 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
23108 }
23109 }
23110 else
23111 {
23112 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
23113 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
23114
23115 OLD_DESPTR = DESTPTR;
23116 Align DESTPTR up to DESIRED_ALIGN
23117 SRCPTR += DESTPTR - OLD_DESTPTR
23118 COUNT -= DEST_PTR - OLD_DESTPTR
23119 if (DYNAMIC_CHECK)
23120 Round COUNT down to multiple of SIZE
23121 << optional caller supplied zero size guard is here >>
23122 << optional caller supplied dynamic check is here >>
23123 << caller supplied main copy loop is here >>
23124 }
23125 done_label:
23126 */
23127 static void
23128 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
23129 rtx *destptr, rtx *srcptr,
23130 enum machine_mode mode,
23131 rtx value, rtx vec_value,
23132 rtx *count,
23133 rtx *done_label,
23134 int size,
23135 int desired_align,
23136 int align,
23137 unsigned HOST_WIDE_INT *min_size,
23138 bool dynamic_check,
23139 bool issetmem)
23140 {
23141 rtx loop_label = NULL, label;
23142 int n;
23143 rtx modesize;
23144 int prolog_size = 0;
23145 rtx mode_value;
23146
23147 /* Choose the proper value to copy. */
23148 if (issetmem && VECTOR_MODE_P (mode))
23149 mode_value = vec_value;
23150 else
23151 mode_value = value;
23152 gcc_assert (GET_MODE_SIZE (mode) <= size);
23153
23154 /* See if block is big or small, handle small blocks. */
23155 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
23156 {
23157 int size2 = size;
23158 loop_label = gen_label_rtx ();
23159
23160 if (!*done_label)
23161 *done_label = gen_label_rtx ();
23162
23163 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
23164 1, loop_label);
23165 size2 >>= 1;
23166
23167 /* Handle sizes > 3. */
23168 for (;size2 > 2; size2 >>= 1)
23169 expand_small_movmem_or_setmem (destmem, srcmem,
23170 *destptr, *srcptr,
23171 value, vec_value,
23172 *count,
23173 size2, *done_label, issetmem);
23174 /* Nothing to copy? Jump to DONE_LABEL if so. */
23175 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
23176 1, *done_label);
23177
23178 /* Do a byte copy. */
23179 destmem = change_address (destmem, QImode, *destptr);
23180 if (issetmem)
23181 emit_move_insn (destmem, gen_lowpart (QImode, value));
23182 else
23183 {
23184 srcmem = change_address (srcmem, QImode, *srcptr);
23185 emit_move_insn (destmem, srcmem);
23186 }
23187
23188 /* Handle sizes 2 and 3. */
23189 label = ix86_expand_aligntest (*count, 2, false);
23190 destmem = change_address (destmem, HImode, *destptr);
23191 destmem = offset_address (destmem, *count, 1);
23192 destmem = offset_address (destmem, GEN_INT (-2), 2);
23193 if (issetmem)
23194 emit_move_insn (destmem, gen_lowpart (HImode, value));
23195 else
23196 {
23197 srcmem = change_address (srcmem, HImode, *srcptr);
23198 srcmem = offset_address (srcmem, *count, 1);
23199 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
23200 emit_move_insn (destmem, srcmem);
23201 }
23202
23203 emit_label (label);
23204 LABEL_NUSES (label) = 1;
23205 emit_jump_insn (gen_jump (*done_label));
23206 emit_barrier ();
23207 }
23208 else
23209 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
23210 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
23211
23212 /* Start memcpy for COUNT >= SIZE. */
23213 if (loop_label)
23214 {
23215 emit_label (loop_label);
23216 LABEL_NUSES (loop_label) = 1;
23217 }
23218
23219 /* Copy the first DESIRED_ALIGN - ALIGN bytes, rounded up to whole MODE-sized chunks. */
23220 if (!issetmem)
23221 srcmem = change_address (srcmem, mode, *srcptr);
23222 destmem = change_address (destmem, mode, *destptr);
23223 modesize = GEN_INT (GET_MODE_SIZE (mode));
23224 for (n = 0; prolog_size < desired_align - align; n++)
23225 {
23226 if (issetmem)
23227 emit_move_insn (destmem, mode_value);
23228 else
23229 {
23230 emit_move_insn (destmem, srcmem);
23231 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
23232 }
23233 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
23234 prolog_size += GET_MODE_SIZE (mode);
23235 }
23236
23237
23238 /* Copy last SIZE bytes. */
23239 destmem = offset_address (destmem, *count, 1);
23240 destmem = offset_address (destmem,
23241 GEN_INT (-size - prolog_size),
23242 1);
23243 if (issetmem)
23244 emit_move_insn (destmem, mode_value);
23245 else
23246 {
23247 srcmem = offset_address (srcmem, *count, 1);
23248 srcmem = offset_address (srcmem,
23249 GEN_INT (-size - prolog_size),
23250 1);
23251 emit_move_insn (destmem, srcmem);
23252 }
23253 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
23254 {
23255 destmem = offset_address (destmem, modesize, 1);
23256 if (issetmem)
23257 emit_move_insn (destmem, mode_value);
23258 else
23259 {
23260 srcmem = offset_address (srcmem, modesize, 1);
23261 emit_move_insn (destmem, srcmem);
23262 }
23263 }
23264
23265 /* Align destination. */
23266 if (desired_align > 1 && desired_align > align)
23267 {
23268 rtx saveddest = *destptr;
23269
23270 gcc_assert (desired_align <= size);
23271 /* Align destptr up, place it to new register. */
23272 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
23273 GEN_INT (prolog_size),
23274 NULL_RTX, 1, OPTAB_DIRECT);
23275 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
23276 GEN_INT (-desired_align),
23277 *destptr, 1, OPTAB_DIRECT);
23278 /* See how many bytes we skipped. */
23279 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
23280 *destptr,
23281 saveddest, 1, OPTAB_DIRECT);
23282 /* Adjust srcptr and count. */
23283 if (!issetmem)
23284 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, saveddest,
23285 *srcptr, 1, OPTAB_DIRECT);
23286 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
23287 saveddest, *count, 1, OPTAB_DIRECT);
23288 /* We copied at most size + prolog_size. */
23289 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
23290 *min_size = (*min_size - size) & ~(unsigned HOST_WIDE_INT)(size - 1);
23291 else
23292 *min_size = 0;
23293
23294 /* Our loops always round down the block size, but for dispatch to a library
23295 call we need the precise value. */
23296 if (dynamic_check)
23297 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
23298 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
23299 }
23300 else
23301 {
23302 gcc_assert (prolog_size == 0);
23303 /* Decrease count, so we won't end up copying last word twice. */
23304 if (!CONST_INT_P (*count))
23305 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
23306 constm1_rtx, *count, 1, OPTAB_DIRECT);
23307 else
23308 *count = GEN_INT ((UINTVAL (*count) - 1) & ~(unsigned HOST_WIDE_INT)(size - 1));
23309 if (*min_size)
23310 *min_size = (*min_size - 1) & ~(unsigned HOST_WIDE_INT)(size - 1);
23311 }
23312 }
23313
23314
23315 /* This function is like the previous one, except here we know how many bytes
23316 need to be copied. That allows us to update alignment not only of DST, which
23317 is returned, but also of SRC, which is passed as a pointer for that
23318 reason. */
23319 static rtx
23320 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
23321 rtx srcreg, rtx value, rtx vec_value,
23322 int desired_align, int align_bytes,
23323 bool issetmem)
23324 {
23325 rtx src = NULL;
23326 rtx orig_dst = dst;
23327 rtx orig_src = NULL;
23328 int piece_size = 1;
23329 int copied_bytes = 0;
23330
23331 if (!issetmem)
23332 {
23333 gcc_assert (srcp != NULL);
23334 src = *srcp;
23335 orig_src = src;
23336 }
23337
23338 for (piece_size = 1;
23339 piece_size <= desired_align && copied_bytes < align_bytes;
23340 piece_size <<= 1)
23341 {
23342 if (align_bytes & piece_size)
23343 {
23344 if (issetmem)
23345 {
23346 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
23347 dst = emit_memset (dst, destreg, vec_value, piece_size);
23348 else
23349 dst = emit_memset (dst, destreg, value, piece_size);
23350 }
23351 else
23352 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
23353 copied_bytes += piece_size;
23354 }
23355 }
23356 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
23357 set_mem_align (dst, desired_align * BITS_PER_UNIT);
23358 if (MEM_SIZE_KNOWN_P (orig_dst))
23359 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
23360
23361 if (!issetmem)
23362 {
23363 int src_align_bytes = get_mem_align_offset (src, desired_align
23364 * BITS_PER_UNIT);
23365 if (src_align_bytes >= 0)
23366 src_align_bytes = desired_align - src_align_bytes;
23367 if (src_align_bytes >= 0)
23368 {
23369 unsigned int src_align;
23370 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
23371 {
23372 if ((src_align_bytes & (src_align - 1))
23373 == (align_bytes & (src_align - 1)))
23374 break;
23375 }
23376 if (src_align > (unsigned int) desired_align)
23377 src_align = desired_align;
23378 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
23379 set_mem_align (src, src_align * BITS_PER_UNIT);
23380 }
23381 if (MEM_SIZE_KNOWN_P (orig_src))
23382 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
23383 *srcp = src;
23384 }
23385
23386 return dst;
23387 }
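
#if 0
/* Illustrative sketch added for exposition; not part of the original file
   and kept out of the build by the #if 0 guard.  It shows how the loop
   above peels the prologue into power-of-two pieces selected by the bits
   of ALIGN_BYTES: for align_bytes = 7 and desired_align = 8 it would emit
   a 1-byte, then a 2-byte, then a 4-byte move (or store).  */
static int
constant_prologue_pieces_example (int align_bytes, int desired_align)
{
  int piece_size, copied_bytes = 0;

  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
       piece_size <<= 1)
    if (align_bytes & piece_size)
      copied_bytes += piece_size;   /* one emit_memmov/emit_memset call */

  return copied_bytes;
}
#endif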
23388
23389 /* Return true if ALG can be used in current context.
23390 Assume we expand memset if MEMSET is true. */
23391 static bool
23392 alg_usable_p (enum stringop_alg alg, bool memset)
23393 {
23394 if (alg == no_stringop)
23395 return false;
23396 if (alg == vector_loop)
23397 return TARGET_SSE || TARGET_AVX;
23398 /* Algorithms using the rep prefix want at least edi and ecx;
23399 additionally, memset wants eax and memcpy wants esi. Don't
23400 consider such algorithms if the user has appropriated those
23401 registers for their own purposes. */
23402 if (alg == rep_prefix_1_byte
23403 || alg == rep_prefix_4_byte
23404 || alg == rep_prefix_8_byte)
23405 return !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
23406 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
23407 return true;
23408 }
23409
23410 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
23411 static enum stringop_alg
23412 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
23413 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
23414 bool memset, bool zero_memset, int *dynamic_check, bool *noalign)
23415 {
23416 const struct stringop_algs * algs;
23417 bool optimize_for_speed;
23418 int max = -1;
23419 const struct processor_costs *cost;
23420 int i;
23421 bool any_alg_usable_p = false;
23422
23423 *noalign = false;
23424 *dynamic_check = -1;
23425
23426 /* Even if the string operation call is cold, we still might spend a lot
23427 of time processing large blocks. */
23428 if (optimize_function_for_size_p (cfun)
23429 || (optimize_insn_for_size_p ()
23430 && (max_size < 256
23431 || (expected_size != -1 && expected_size < 256))))
23432 optimize_for_speed = false;
23433 else
23434 optimize_for_speed = true;
23435
23436 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
23437 if (memset)
23438 algs = &cost->memset[TARGET_64BIT != 0];
23439 else
23440 algs = &cost->memcpy[TARGET_64BIT != 0];
23441
23442 /* See maximal size for user defined algorithm. */
23443 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
23444 {
23445 enum stringop_alg candidate = algs->size[i].alg;
23446 bool usable = alg_usable_p (candidate, memset);
23447 any_alg_usable_p |= usable;
23448
23449 if (candidate != libcall && candidate && usable)
23450 max = algs->size[i].max;
23451 }
23452
23453 /* If the expected size is not known but the maximum size is small enough
23454 that the inline version is a win, set the expected size into
23455 the range. */
23456 if (max > 1 && (unsigned HOST_WIDE_INT)max >= max_size && expected_size == -1)
23457 expected_size = min_size / 2 + max_size / 2;
23458
23459 /* If the user specified the algorithm, honor it if possible. */
23460 if (ix86_stringop_alg != no_stringop
23461 && alg_usable_p (ix86_stringop_alg, memset))
23462 return ix86_stringop_alg;
23463 /* rep; movq or rep; movl is the smallest variant. */
23464 else if (!optimize_for_speed)
23465 {
23466 *noalign = true;
23467 if (!count || (count & 3) || (memset && !zero_memset))
23468 return alg_usable_p (rep_prefix_1_byte, memset)
23469 ? rep_prefix_1_byte : loop_1_byte;
23470 else
23471 return alg_usable_p (rep_prefix_4_byte, memset)
23472 ? rep_prefix_4_byte : loop;
23473 }
23474 /* Very tiny blocks are best handled via the loop; REP is expensive to
23475 set up. */
23476 else if (expected_size != -1 && expected_size < 4)
23477 return loop_1_byte;
23478 else if (expected_size != -1)
23479 {
23480 enum stringop_alg alg = libcall;
23481 bool alg_noalign = false;
23482 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
23483 {
23484 /* We get here if the algorithms that were not libcall-based
23485 were rep-prefix based and we are unable to use rep prefixes
23486 based on global register usage. Break out of the loop and
23487 use the heuristic below. */
23488 if (algs->size[i].max == 0)
23489 break;
23490 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
23491 {
23492 enum stringop_alg candidate = algs->size[i].alg;
23493
23494 if (candidate != libcall && alg_usable_p (candidate, memset))
23495 {
23496 alg = candidate;
23497 alg_noalign = algs->size[i].noalign;
23498 }
23499 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
23500 last non-libcall inline algorithm. */
23501 if (TARGET_INLINE_ALL_STRINGOPS)
23502 {
23503 /* When the current size is best to be copied by a libcall,
23504 but we are still forced to inline, run the heuristic below
23505 that will pick code for medium sized blocks. */
23506 if (alg != libcall)
23507 {
23508 *noalign = alg_noalign;
23509 return alg;
23510 }
23511 break;
23512 }
23513 else if (alg_usable_p (candidate, memset))
23514 {
23515 *noalign = algs->size[i].noalign;
23516 return candidate;
23517 }
23518 }
23519 }
23520 }
23521 /* When asked to inline the call anyway, try to pick a meaningful choice.
23522 We look for the maximal size of a block that is faster to copy by hand and
23523 take blocks of at most that size, guessing that the average size will
23524 be roughly half of that maximum.
23525
23526 If this turns out to be bad, we might simply specify the preferred
23527 choice in ix86_costs. */
23528 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23529 && (algs->unknown_size == libcall
23530 || !alg_usable_p (algs->unknown_size, memset)))
23531 {
23532 enum stringop_alg alg;
23533
23534 /* If there aren't any usable algorithms, then recursing on
23535 smaller sizes isn't going to find anything. Just return the
23536 simple byte-at-a-time copy loop. */
23537 if (!any_alg_usable_p)
23538 {
23539 /* Pick something reasonable. */
23540 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23541 *dynamic_check = 128;
23542 return loop_1_byte;
23543 }
23544 if (max == -1)
23545 max = 4096;
23546 alg = decide_alg (count, max / 2, min_size, max_size, memset,
23547 zero_memset, dynamic_check, noalign);
23548 gcc_assert (*dynamic_check == -1);
23549 gcc_assert (alg != libcall);
23550 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23551 *dynamic_check = max;
23552 return alg;
23553 }
23554 return (alg_usable_p (algs->unknown_size, memset)
23555 ? algs->unknown_size : libcall);
23556 }
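
#if 0
/* Illustrative sketch added for exposition; not part of the original file
   and kept out of the build by the #if 0 guard.  The size thresholds below
   are made up; the real tables are the memcpy/memset members of the
   processor cost structures.  It shows the basic scan decide_alg performs:
   the first entry whose MAX covers the expected size (or the -1 catch-all)
   supplies the algorithm.  */
static enum stringop_alg
decide_alg_table_example (HOST_WIDE_INT expected_size)
{
  static const struct { int max; enum stringop_alg alg; } table[] = {
    { 24, loop },                  /* tiny blocks: plain copy loop */
    { 128, rep_prefix_8_byte },    /* medium blocks: rep movsq */
    { -1, libcall },               /* everything else: call the library */
  };
  int i;

  for (i = 0; i < 3; i++)
    if (table[i].max == -1 || expected_size <= table[i].max)
      return table[i].alg;
  return libcall;
}
#endif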
23557
23558 /* Decide on alignment. We know that the operand is already aligned to ALIGN
23559 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
23560 static int
23561 decide_alignment (int align,
23562 enum stringop_alg alg,
23563 int expected_size,
23564 enum machine_mode move_mode)
23565 {
23566 int desired_align = 0;
23567
23568 gcc_assert (alg != no_stringop);
23569
23570 if (alg == libcall)
23571 return 0;
23572 if (move_mode == VOIDmode)
23573 return 0;
23574
23575 desired_align = GET_MODE_SIZE (move_mode);
23576 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
23577 copying a whole cache line at once. */
23578 if (TARGET_PENTIUMPRO
23579 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
23580 desired_align = 8;
23581
23582 if (optimize_size)
23583 desired_align = 1;
23584 if (desired_align < align)
23585 desired_align = align;
23586 if (expected_size != -1 && expected_size < 4)
23587 desired_align = align;
23588
23589 return desired_align;
23590 }
23591
23592
23593 /* Helper function for memset. For QImode value 0xXY produce
23594 0xXYXYXYXY of the width specified by MODE. This is essentially
23595 a * 0x10101010, but we can do slightly better than
23596 synth_mult by unwinding the sequence by hand on CPUs with
23597 slow multiply. */
23598 static rtx
23599 promote_duplicated_reg (enum machine_mode mode, rtx val)
23600 {
23601 enum machine_mode valmode = GET_MODE (val);
23602 rtx tmp;
23603 int nops = mode == DImode ? 3 : 2;
23604
23605 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
23606 if (val == const0_rtx)
23607 return copy_to_mode_reg (mode, CONST0_RTX (mode));
23608 if (CONST_INT_P (val))
23609 {
23610 HOST_WIDE_INT v = INTVAL (val) & 255;
23611
23612 v |= v << 8;
23613 v |= v << 16;
23614 if (mode == DImode)
23615 v |= (v << 16) << 16;
23616 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
23617 }
23618
23619 if (valmode == VOIDmode)
23620 valmode = QImode;
23621 if (valmode != QImode)
23622 val = gen_lowpart (QImode, val);
23623 if (mode == QImode)
23624 return val;
23625 if (!TARGET_PARTIAL_REG_STALL)
23626 nops--;
23627 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
23628 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
23629 <= (ix86_cost->shift_const + ix86_cost->add) * nops
23630 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
23631 {
23632 rtx reg = convert_modes (mode, QImode, val, true);
23633 tmp = promote_duplicated_reg (mode, const1_rtx);
23634 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
23635 OPTAB_DIRECT);
23636 }
23637 else
23638 {
23639 rtx reg = convert_modes (mode, QImode, val, true);
23640
23641 if (!TARGET_PARTIAL_REG_STALL)
23642 if (mode == SImode)
23643 emit_insn (gen_movsi_insv_1 (reg, reg));
23644 else
23645 emit_insn (gen_movdi_insv_1 (reg, reg));
23646 else
23647 {
23648 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23649 NULL, 1, OPTAB_DIRECT);
23650 reg =
23651 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23652 }
23653 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23654 NULL, 1, OPTAB_DIRECT);
23655 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23656 if (mode == SImode)
23657 return reg;
23658 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23659 NULL, 1, OPTAB_DIRECT);
23660 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23661 return reg;
23662 }
23663 }
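
#if 0
/* Illustrative sketch added for exposition; not part of the original file
   and kept out of the build by the #if 0 guard.  It shows the constant
   folding performed above for a known byte value: 0x5A becomes 0x5A5A5A5A
   for SImode and 0x5A5A5A5A5A5A5A5A for DImode.  */
static unsigned long long
duplicate_byte_example (unsigned char byte, int is_64bit)
{
  unsigned long long v = byte;

  v |= v << 8;             /* 0x5A5A */
  v |= v << 16;            /* 0x5A5A5A5A */
  if (is_64bit)
    v |= (v << 16) << 16;  /* 0x5A5A5A5A5A5A5A5A */
  return v;
}
#endif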
23664
23665 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
23666 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
23667 raising alignment from ALIGN to DESIRED_ALIGN. */
23668 static rtx
23669 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
23670 int align)
23671 {
23672 rtx promoted_val;
23673
23674 if (TARGET_64BIT
23675 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23676 promoted_val = promote_duplicated_reg (DImode, val);
23677 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23678 promoted_val = promote_duplicated_reg (SImode, val);
23679 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23680 promoted_val = promote_duplicated_reg (HImode, val);
23681 else
23682 promoted_val = val;
23683
23684 return promoted_val;
23685 }
23686
23687 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
23688 operations when profitable. The code depends upon architecture, block size
23689 and alignment, but always has one of the following overall structures:
23690
23691 Aligned move sequence:
23692
23693 1) Prologue guard: Conditional that jumps up to epilogues for small
23694 blocks that can be handled by the epilogue alone. This is faster
23695 but also needed for correctness, since the prologue assumes the block
23696 is larger than the desired alignment.
23697
23698 Optional dynamic check for size and libcall for large
23699 blocks is emitted here too, with -minline-stringops-dynamically.
23700
23701 2) Prologue: copy first few bytes in order to get destination
23702 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
23703 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
23704 copied. We emit either a jump tree on power of two sized
23705 blocks, or a byte loop.
23706
23707 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
23708 with specified algorithm.
23709
23710 4) Epilogue: code copying tail of the block that is too small to be
23711 handled by main body (or up to size guarded by prologue guard).
23712
23713 Misaligned move sequence
23714
23715 1) Misaligned move prologue/epilogue containing:
23716 a) Prologue handling small memory blocks and jumping to done_label
23717 (skipped if blocks are known to be large enough)
23718 b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
23719 needed by single possibly misaligned move
23720 (skipped if alignment is not needed)
23721 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
23722
23723 2) Zero size guard dispatching to done_label, if needed
23724
23725 3) Dispatch to library call, if needed
23726
23727 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
23728 with specified algorithm. */
23729 bool
23730 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
23731 rtx align_exp, rtx expected_align_exp,
23732 rtx expected_size_exp, rtx min_size_exp,
23733 rtx max_size_exp, rtx probable_max_size_exp,
23734 bool issetmem)
23735 {
23736 rtx destreg;
23737 rtx srcreg = NULL;
23738 rtx label = NULL;
23739 rtx tmp;
23740 rtx jump_around_label = NULL;
23741 HOST_WIDE_INT align = 1;
23742 unsigned HOST_WIDE_INT count = 0;
23743 HOST_WIDE_INT expected_size = -1;
23744 int size_needed = 0, epilogue_size_needed;
23745 int desired_align = 0, align_bytes = 0;
23746 enum stringop_alg alg;
23747 rtx promoted_val = NULL;
23748 rtx vec_promoted_val = NULL;
23749 bool force_loopy_epilogue = false;
23750 int dynamic_check;
23751 bool need_zero_guard = false;
23752 bool noalign;
23753 enum machine_mode move_mode = VOIDmode;
23754 int unroll_factor = 1;
23755 /* TODO: Once value ranges are available, fill in proper data. */
23756 unsigned HOST_WIDE_INT min_size = 0;
23757 unsigned HOST_WIDE_INT max_size = -1;
23758 unsigned HOST_WIDE_INT probable_max_size = -1;
23759 bool misaligned_prologue_used = false;
23760
23761 if (CONST_INT_P (align_exp))
23762 align = INTVAL (align_exp);
23763 /* i386 can do misaligned access at a reasonably increased cost. */
23764 if (CONST_INT_P (expected_align_exp)
23765 && INTVAL (expected_align_exp) > align)
23766 align = INTVAL (expected_align_exp);
23767 /* ALIGN is the minimum of destination and source alignment, but we care here
23768 just about destination alignment. */
23769 else if (!issetmem
23770 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
23771 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
23772
23773 if (CONST_INT_P (count_exp))
23774 min_size = max_size = probable_max_size = count = expected_size
23775 = INTVAL (count_exp);
23776 else
23777 {
23778 if (min_size_exp)
23779 min_size = INTVAL (min_size_exp);
23780 if (max_size_exp)
23781 max_size = INTVAL (max_size_exp);
23782 if (probable_max_size_exp)
23783 probable_max_size = INTVAL (probable_max_size_exp);
23784 if (CONST_INT_P (expected_size_exp) && count == 0)
23785 expected_size = INTVAL (expected_size_exp);
23786 }
23787
23788 /* Make sure we don't need to care about overflow later on. */
23789 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23790 return false;
23791
23792 /* Step 0: Decide on preferred algorithm, desired alignment and
23793 size of chunks to be copied by main loop. */
23794 alg = decide_alg (count, expected_size, min_size, probable_max_size,
23795 issetmem,
23796 issetmem && val_exp == const0_rtx,
23797 &dynamic_check, &noalign);
23798 if (alg == libcall)
23799 return false;
23800 gcc_assert (alg != no_stringop);
23801
23802 /* For now the vector version of memset is generated only for memory zeroing, as
23803 creating the promoted vector value is very cheap in this case. */
23804 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
23805 alg = unrolled_loop;
23806
23807 if (!count)
23808 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
23809 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
23810 if (!issetmem)
23811 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
23812
23813 unroll_factor = 1;
23814 move_mode = word_mode;
23815 switch (alg)
23816 {
23817 case libcall:
23818 case no_stringop:
23819 case last_alg:
23820 gcc_unreachable ();
23821 case loop_1_byte:
23822 need_zero_guard = true;
23823 move_mode = QImode;
23824 break;
23825 case loop:
23826 need_zero_guard = true;
23827 break;
23828 case unrolled_loop:
23829 need_zero_guard = true;
23830 unroll_factor = (TARGET_64BIT ? 4 : 2);
23831 break;
23832 case vector_loop:
23833 need_zero_guard = true;
23834 unroll_factor = 4;
23835 /* Find the widest supported mode. */
23836 move_mode = word_mode;
23837 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
23838 != CODE_FOR_nothing)
23839 move_mode = GET_MODE_WIDER_MODE (move_mode);
23840
23841 /* Find the corresponding vector mode with the same size as MOVE_MODE.
23842 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
23843 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
23844 {
23845 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
23846 move_mode = mode_for_vector (word_mode, nunits);
23847 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
23848 move_mode = word_mode;
23849 }
23850 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
23851 break;
23852 case rep_prefix_8_byte:
23853 move_mode = DImode;
23854 break;
23855 case rep_prefix_4_byte:
23856 move_mode = SImode;
23857 break;
23858 case rep_prefix_1_byte:
23859 move_mode = QImode;
23860 break;
23861 }
23862 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
23863 epilogue_size_needed = size_needed;
23864
23865 desired_align = decide_alignment (align, alg, expected_size, move_mode);
23866 if (!TARGET_ALIGN_STRINGOPS || noalign)
23867 align = desired_align;
23868
23869 /* Step 1: Prologue guard. */
23870
23871 /* Alignment code needs count to be in register. */
23872 if (CONST_INT_P (count_exp) && desired_align > align)
23873 {
23874 if (INTVAL (count_exp) > desired_align
23875 && INTVAL (count_exp) > size_needed)
23876 {
23877 align_bytes
23878 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23879 if (align_bytes <= 0)
23880 align_bytes = 0;
23881 else
23882 align_bytes = desired_align - align_bytes;
23883 }
23884 if (align_bytes == 0)
23885 count_exp = force_reg (counter_mode (count_exp), count_exp);
23886 }
23887 gcc_assert (desired_align >= 1 && align >= 1);
23888
23889 /* Misaligned move sequences handle both prologue and epilogue at once.
23890 Default code generation results in smaller code for large alignments
23891 and also avoids redundant work when sizes are known precisely. */
23892 misaligned_prologue_used
23893 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
23894 && MAX (desired_align, epilogue_size_needed) <= 32
23895 && desired_align <= epilogue_size_needed
23896 && ((desired_align > align && !align_bytes)
23897 || (!count && epilogue_size_needed > 1)));
23898
23899 /* Do the cheap promotion to allow better CSE across the
23900 main loop and epilogue (i.e. one load of the big constant in
23901 front of all the code).
23902 For now the misaligned move sequences do not have a fast path
23903 without broadcasting. */
23904 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
23905 {
23906 if (alg == vector_loop)
23907 {
23908 gcc_assert (val_exp == const0_rtx);
23909 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
23910 promoted_val = promote_duplicated_reg_to_size (val_exp,
23911 GET_MODE_SIZE (word_mode),
23912 desired_align, align);
23913 }
23914 else
23915 {
23916 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23917 desired_align, align);
23918 }
23919 }
23920 /* Misaligned move sequences handle both prologues and epilogues at once.
23921 Default code generation results in smaller code for large alignments and
23922 also avoids redundant work when sizes are known precisely. */
23923 if (misaligned_prologue_used)
23924 {
23925 /* The misaligned move prologue handles small blocks by itself. */
23926 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
23927 (dst, src, &destreg, &srcreg,
23928 move_mode, promoted_val, vec_promoted_val,
23929 &count_exp,
23930 &jump_around_label,
23931 desired_align < align
23932 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
23933 desired_align, align, &min_size, dynamic_check, issetmem);
23934 if (!issetmem)
23935 src = change_address (src, BLKmode, srcreg);
23936 dst = change_address (dst, BLKmode, destreg);
23937 set_mem_align (dst, desired_align * BITS_PER_UNIT);
23938 epilogue_size_needed = 0;
23939 if (need_zero_guard && !min_size)
23940 {
23941 /* It is possible that we copied enough so the main loop will not
23942 execute. */
23943 gcc_assert (size_needed > 1);
23944 if (jump_around_label == NULL_RTX)
23945 jump_around_label = gen_label_rtx ();
23946 emit_cmp_and_jump_insns (count_exp,
23947 GEN_INT (size_needed),
23948 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
23949 if (expected_size == -1
23950 || expected_size < (desired_align - align) / 2 + size_needed)
23951 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23952 else
23953 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23954 }
23955 }
23956 /* Ensure that alignment prologue won't copy past end of block. */
23957 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23958 {
23959 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23960 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23961 Make sure it is a power of 2. */
23962 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
23963
23964 /* To improve performance of small blocks, we jump around the VAL
23965 promoting code. This means that if the promoted VAL is not constant,
23966 we might not use it in the epilogue and have to use the byte
23967 loop variant. */
23968 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
23969 force_loopy_epilogue = true;
23970 if (count)
23971 {
23972 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23973 {
23974 /* If main algorithm works on QImode, no epilogue is needed.
23975 For small sizes just don't align anything. */
23976 if (size_needed == 1)
23977 desired_align = align;
23978 else
23979 goto epilogue;
23980 }
23981 }
23982 else if (min_size < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23983 {
23984 gcc_assert (max_size >= (unsigned HOST_WIDE_INT)epilogue_size_needed);
23985 label = gen_label_rtx ();
23986 emit_cmp_and_jump_insns (count_exp,
23987 GEN_INT (epilogue_size_needed),
23988 LTU, 0, counter_mode (count_exp), 1, label);
23989 if (expected_size == -1 || expected_size < epilogue_size_needed)
23990 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23991 else
23992 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23993 }
23994 }
23995
23996 /* Emit code to decide on runtime whether library call or inline should be
23997 used. */
23998 if (dynamic_check != -1)
23999 {
24000 if (!issetmem && CONST_INT_P (count_exp))
24001 {
24002 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
24003 {
24004 emit_block_move_via_libcall (dst, src, count_exp, false);
24005 count_exp = const0_rtx;
24006 goto epilogue;
24007 }
24008 }
24009 else
24010 {
24011 rtx hot_label = gen_label_rtx ();
24012 jump_around_label = gen_label_rtx ();
24013 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
24014 LEU, 0, GET_MODE (count_exp), 1, hot_label);
24015 predict_jump (REG_BR_PROB_BASE * 90 / 100);
24016 if (issetmem)
24017 set_storage_via_libcall (dst, count_exp, val_exp, false);
24018 else
24019 emit_block_move_via_libcall (dst, src, count_exp, false);
24020 emit_jump (jump_around_label);
24021 emit_label (hot_label);
24022 }
24023 }
24024
24025 /* Step 2: Alignment prologue. */
24026 /* Do the expensive promotion once we branched off the small blocks. */
24027 if (issetmem && !promoted_val)
24028 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
24029 desired_align, align);
24030
24031 if (desired_align > align && !misaligned_prologue_used)
24032 {
24033 if (align_bytes == 0)
24034 {
24035 /* Except for the first move in the prologue, we no longer know
24036 the constant offset in aliasing info. It doesn't seem worth
24037 the pain to maintain it for the first move, so throw away
24038 the info early. */
24039 dst = change_address (dst, BLKmode, destreg);
24040 if (!issetmem)
24041 src = change_address (src, BLKmode, srcreg);
24042 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
24043 promoted_val, vec_promoted_val,
24044 count_exp, align, desired_align,
24045 issetmem);
24046 /* At most desired_align - align bytes are copied. */
24047 if (min_size < (unsigned)(desired_align - align))
24048 min_size = 0;
24049 else
24050 min_size -= desired_align - align;
24051 }
24052 else
24053 {
24054 /* If we know how many bytes need to be stored before dst is
24055 sufficiently aligned, maintain aliasing info accurately. */
24056 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
24057 srcreg,
24058 promoted_val,
24059 vec_promoted_val,
24060 desired_align,
24061 align_bytes,
24062 issetmem);
24063
24064 count_exp = plus_constant (counter_mode (count_exp),
24065 count_exp, -align_bytes);
24066 count -= align_bytes;
24067 min_size -= align_bytes;
24068 max_size -= align_bytes;
24069 }
24070 if (need_zero_guard
24071 && !min_size
24072 && (count < (unsigned HOST_WIDE_INT) size_needed
24073 || (align_bytes == 0
24074 && count < ((unsigned HOST_WIDE_INT) size_needed
24075 + desired_align - align))))
24076 {
24077 /* It is possible that we copied enough so the main loop will not
24078 execute. */
24079 gcc_assert (size_needed > 1);
24080 if (label == NULL_RTX)
24081 label = gen_label_rtx ();
24082 emit_cmp_and_jump_insns (count_exp,
24083 GEN_INT (size_needed),
24084 LTU, 0, counter_mode (count_exp), 1, label);
24085 if (expected_size == -1
24086 || expected_size < (desired_align - align) / 2 + size_needed)
24087 predict_jump (REG_BR_PROB_BASE * 20 / 100);
24088 else
24089 predict_jump (REG_BR_PROB_BASE * 60 / 100);
24090 }
24091 }
24092 if (label && size_needed == 1)
24093 {
24094 emit_label (label);
24095 LABEL_NUSES (label) = 1;
24096 label = NULL;
24097 epilogue_size_needed = 1;
24098 if (issetmem)
24099 promoted_val = val_exp;
24100 }
24101 else if (label == NULL_RTX && !misaligned_prologue_used)
24102 epilogue_size_needed = size_needed;
24103
24104 /* Step 3: Main loop. */
24105
24106 switch (alg)
24107 {
24108 case libcall:
24109 case no_stringop:
24110 case last_alg:
24111 gcc_unreachable ();
24112 case loop_1_byte:
24113 case loop:
24114 case unrolled_loop:
24115 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
24116 count_exp, move_mode, unroll_factor,
24117 expected_size, issetmem);
24118 break;
24119 case vector_loop:
24120 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
24121 vec_promoted_val, count_exp, move_mode,
24122 unroll_factor, expected_size, issetmem);
24123 break;
24124 case rep_prefix_8_byte:
24125 case rep_prefix_4_byte:
24126 case rep_prefix_1_byte:
24127 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
24128 val_exp, count_exp, move_mode, issetmem);
24129 break;
24130 }
24131 /* Adjust properly the offset of src and dest memory for aliasing. */
24132 if (CONST_INT_P (count_exp))
24133 {
24134 if (!issetmem)
24135 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
24136 (count / size_needed) * size_needed);
24137 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
24138 (count / size_needed) * size_needed);
24139 }
24140 else
24141 {
24142 if (!issetmem)
24143 src = change_address (src, BLKmode, srcreg);
24144 dst = change_address (dst, BLKmode, destreg);
24145 }
24146
24147 /* Step 4: Epilogue to copy the remaining bytes. */
24148 epilogue:
24149 if (label)
24150 {
24151 /* When the main loop is done, COUNT_EXP might hold the original count,
24152 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
24153 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
24154 bytes. Compensate if needed. */
24155
24156 if (size_needed < epilogue_size_needed)
24157 {
24158 tmp =
24159 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
24160 GEN_INT (size_needed - 1), count_exp, 1,
24161 OPTAB_DIRECT);
24162 if (tmp != count_exp)
24163 emit_move_insn (count_exp, tmp);
24164 }
24165 emit_label (label);
24166 LABEL_NUSES (label) = 1;
24167 }
24168
24169 if (count_exp != const0_rtx && epilogue_size_needed > 1)
24170 {
24171 if (force_loopy_epilogue)
24172 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
24173 epilogue_size_needed);
24174 else
24175 {
24176 if (issetmem)
24177 expand_setmem_epilogue (dst, destreg, promoted_val,
24178 vec_promoted_val, count_exp,
24179 epilogue_size_needed);
24180 else
24181 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
24182 epilogue_size_needed);
24183 }
24184 }
24185 if (jump_around_label)
24186 emit_label (jump_around_label);
24187 return true;
24188 }
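
#if 0
/* Illustrative sketch added for exposition; not part of the original file
   and kept out of the build by the #if 0 guard.  It restates the aligned
   move sequence described in the comment before ix86_expand_set_or_movmem
   as plain C; the real function emits RTL, and SIZE_NEEDED and
   DESIRED_ALIGN stand for the values computed in step 0.  */
static void
expansion_shape_example (char *dst, const char *src, unsigned long count,
                         unsigned long size_needed, unsigned long desired_align)
{
  unsigned long done = 0;

  /* 1) Prologue guard: small blocks go straight to the epilogue.  */
  if (count < size_needed)
    goto epilogue;

  /* 2) Prologue: byte copies until DST reaches DESIRED_ALIGN.  */
  while (((unsigned long) dst) & (desired_align - 1))
    *dst++ = *src++, count--;

  /* 3) Main body: copy in SIZE_NEEDED chunks.  */
  for (; done + size_needed <= count; done += size_needed)
    __builtin_memcpy (dst + done, src + done, size_needed);
  dst += done, src += done, count -= done;

 epilogue:
  /* 4) Epilogue: tail too small for the main body.  */
  while (count--)
    *dst++ = *src++;
}
#endif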
24189
24190
24191 /* Expand the appropriate insns for doing strlen if not just doing
24192 repnz; scasb
24193
24194 out = result, initialized with the start address
24195 align_rtx = alignment of the address.
24196 scratch = scratch register, initialized with the start address when
24197 not aligned, otherwise undefined
24198
24199 This is just the body. It needs the initializations mentioned above and
24200 some address computation at the end. These things are done in i386.md. */
24201
24202 static void
24203 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
24204 {
24205 int align;
24206 rtx tmp;
24207 rtx align_2_label = NULL_RTX;
24208 rtx align_3_label = NULL_RTX;
24209 rtx align_4_label = gen_label_rtx ();
24210 rtx end_0_label = gen_label_rtx ();
24211 rtx mem;
24212 rtx tmpreg = gen_reg_rtx (SImode);
24213 rtx scratch = gen_reg_rtx (SImode);
24214 rtx cmp;
24215
24216 align = 0;
24217 if (CONST_INT_P (align_rtx))
24218 align = INTVAL (align_rtx);
24219
24220 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
24221
24222 /* Is there a known alignment and is it less than 4? */
24223 if (align < 4)
24224 {
24225 rtx scratch1 = gen_reg_rtx (Pmode);
24226 emit_move_insn (scratch1, out);
24227 /* Is there a known alignment and is it not 2? */
24228 if (align != 2)
24229 {
24230 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
24231 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
24232
24233 /* Leave just the 3 lower bits. */
24234 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
24235 NULL_RTX, 0, OPTAB_WIDEN);
24236
24237 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
24238 Pmode, 1, align_4_label);
24239 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
24240 Pmode, 1, align_2_label);
24241 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
24242 Pmode, 1, align_3_label);
24243 }
24244 else
24245 {
24246 /* Since the alignment is 2, we have to check 2 or 0 bytes;
24247 check whether it is aligned to a 4-byte boundary. */
24248
24249 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
24250 NULL_RTX, 0, OPTAB_WIDEN);
24251
24252 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
24253 Pmode, 1, align_4_label);
24254 }
24255
24256 mem = change_address (src, QImode, out);
24257
24258 /* Now compare the bytes. */
24259
24260 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
24261 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
24262 QImode, 1, end_0_label);
24263
24264 /* Increment the address. */
24265 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24266
24267 /* Not needed with an alignment of 2 */
24268 if (align != 2)
24269 {
24270 emit_label (align_2_label);
24271
24272 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
24273 end_0_label);
24274
24275 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24276
24277 emit_label (align_3_label);
24278 }
24279
24280 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
24281 end_0_label);
24282
24283 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24284 }
24285
24286 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
24287 align this loop: it only makes programs larger and does not help
24288 speed. */
24289 emit_label (align_4_label);
24290
24291 mem = change_address (src, SImode, out);
24292 emit_move_insn (scratch, mem);
24293 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
24294
24295 /* This formula yields a nonzero result iff one of the bytes is zero.
24296 This saves three branches inside the loop and many cycles. */
24297
24298 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
24299 emit_insn (gen_one_cmplsi2 (scratch, scratch));
24300 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
24301 emit_insn (gen_andsi3 (tmpreg, tmpreg,
24302 gen_int_mode (0x80808080, SImode)));
24303 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
24304 align_4_label);
24305
24306 if (TARGET_CMOVE)
24307 {
24308 rtx reg = gen_reg_rtx (SImode);
24309 rtx reg2 = gen_reg_rtx (Pmode);
24310 emit_move_insn (reg, tmpreg);
24311 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
24312
24313 /* If zero is not in the first two bytes, move two bytes forward. */
24314 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
24315 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24316 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
24317 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
24318 gen_rtx_IF_THEN_ELSE (SImode, tmp,
24319 reg,
24320 tmpreg)));
24321 /* Emit lea manually to avoid clobbering of flags. */
24322 emit_insn (gen_rtx_SET (SImode, reg2,
24323 gen_rtx_PLUS (Pmode, out, const2_rtx)));
24324
24325 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24326 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
24327 emit_insn (gen_rtx_SET (VOIDmode, out,
24328 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
24329 reg2,
24330 out)));
24331 }
24332 else
24333 {
24334 rtx end_2_label = gen_label_rtx ();
24335 /* Is zero in the first two bytes? */
24336
24337 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
24338 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24339 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
24340 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
24341 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
24342 pc_rtx);
24343 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
24344 JUMP_LABEL (tmp) = end_2_label;
24345
24346 /* Not in the first two. Move two bytes forward. */
24347 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
24348 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
24349
24350 emit_label (end_2_label);
24351
24352 }
24353
24354 /* Avoid branch in fixing the byte. */
24355 tmpreg = gen_lowpart (QImode, tmpreg);
24356 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
24357 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
24358 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
24359 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
24360
24361 emit_label (end_0_label);
24362 }
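
#if 0
/* Illustrative sketch added for exposition; not part of the original file
   and kept out of the build by the #if 0 guard.  It is the zero-byte test
   emitted above, written as plain C: the expression is nonzero iff one of
   the four bytes of X is zero.  For x = 0x11003344 it evaluates to
   0x00800000, flagging the zero byte.  */
static int
has_zero_byte_example (unsigned int x)
{
  return ((x - 0x01010101U) & ~x & 0x80808080U) != 0;
}
#endif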
24363
24364 /* Expand strlen. */
24365
24366 bool
24367 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
24368 {
24369 rtx addr, scratch1, scratch2, scratch3, scratch4;
24370
24371 /* The generic case of the strlen expander is long. Avoid
24372 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
24373
24374 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
24375 && !TARGET_INLINE_ALL_STRINGOPS
24376 && !optimize_insn_for_size_p ()
24377 && (!CONST_INT_P (align) || INTVAL (align) < 4))
24378 return false;
24379
24380 addr = force_reg (Pmode, XEXP (src, 0));
24381 scratch1 = gen_reg_rtx (Pmode);
24382
24383 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
24384 && !optimize_insn_for_size_p ())
24385 {
24386 /* Well it seems that some optimizer does not combine a call like
24387 foo(strlen(bar), strlen(bar));
24388 when the move and the subtraction are done here. It does calculate
24389 the length just once when these instructions are done inside
24390 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
24391 often used and I use one fewer register for the lifetime of
24392 output_strlen_unroll() this is better. */
24393
24394 emit_move_insn (out, addr);
24395
24396 ix86_expand_strlensi_unroll_1 (out, src, align);
24397
24398 /* strlensi_unroll_1 returns the address of the zero at the end of
24399 the string, like memchr(), so compute the length by subtracting
24400 the start address. */
24401 emit_insn (ix86_gen_sub3 (out, out, addr));
24402 }
24403 else
24404 {
24405 rtx unspec;
24406
24407 /* Can't use this if the user has appropriated eax, ecx, or edi. */
24408 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
24409 return false;
24410
24411 scratch2 = gen_reg_rtx (Pmode);
24412 scratch3 = gen_reg_rtx (Pmode);
24413 scratch4 = force_reg (Pmode, constm1_rtx);
24414
24415 emit_move_insn (scratch3, addr);
24416 eoschar = force_reg (QImode, eoschar);
24417
24418 src = replace_equiv_address_nv (src, scratch3);
24419
24420 /* If .md starts supporting :P, this can be done in .md. */
24421 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
24422 scratch4), UNSPEC_SCAS);
24423 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
24424 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
24425 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
24426 }
24427 return true;
24428 }
24429
24430 /* For a given symbol (function), construct code to compute the address of
24431 its PLT entry in the large x86-64 PIC model. */
24432 static rtx
24433 construct_plt_address (rtx symbol)
24434 {
24435 rtx tmp, unspec;
24436
24437 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
24438 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
24439 gcc_assert (Pmode == DImode);
24440
24441 tmp = gen_reg_rtx (Pmode);
24442 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
24443
24444 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
24445 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
24446 return tmp;
24447 }
24448
24449 rtx
24450 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
24451 rtx callarg2,
24452 rtx pop, bool sibcall)
24453 {
24454 unsigned int const cregs_size
24455 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
24456 rtx vec[3 + cregs_size];
24457 rtx use = NULL, call;
24458 unsigned int vec_len = 0;
24459
24460 if (pop == const0_rtx)
24461 pop = NULL;
24462 gcc_assert (!TARGET_64BIT || !pop);
24463
24464 if (TARGET_MACHO && !TARGET_64BIT)
24465 {
24466 #if TARGET_MACHO
24467 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
24468 fnaddr = machopic_indirect_call_target (fnaddr);
24469 #endif
24470 }
24471 else
24472 {
24473 /* Static functions and indirect calls don't need the pic register. */
24474 if (flag_pic
24475 && (!TARGET_64BIT
24476 || (ix86_cmodel == CM_LARGE_PIC
24477 && DEFAULT_ABI != MS_ABI))
24478 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24479 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
24480 use_reg (&use, pic_offset_table_rtx);
24481 }
24482
24483 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
24484 {
24485 rtx al = gen_rtx_REG (QImode, AX_REG);
24486 emit_move_insn (al, callarg2);
24487 use_reg (&use, al);
24488 }
24489
24490 if (ix86_cmodel == CM_LARGE_PIC
24491 && !TARGET_PECOFF
24492 && MEM_P (fnaddr)
24493 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24494 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
24495 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
24496 else if (sibcall
24497 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
24498 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
24499 {
24500 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
24501 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
24502 }
24503
24504 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
24505 if (retval)
24506 call = gen_rtx_SET (VOIDmode, retval, call);
24507 vec[vec_len++] = call;
24508
24509 if (pop)
24510 {
24511 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
24512 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
24513 vec[vec_len++] = pop;
24514 }
24515
24516 if (TARGET_64BIT_MS_ABI
24517 && (!callarg2 || INTVAL (callarg2) != -2))
24518 {
24519 unsigned i;
24520
24521 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
24522 UNSPEC_MS_TO_SYSV_CALL);
24523
24524 for (i = 0; i < cregs_size; i++)
24525 {
24526 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
24527 enum machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
24528
24529 vec[vec_len++]
24530 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (mode, regno));
24531 }
24532 }
24533
24534 if (vec_len > 1)
24535 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
24536 call = emit_call_insn (call);
24537 if (use)
24538 CALL_INSN_FUNCTION_USAGE (call) = use;
24539
24540 return call;
24541 }
24542
24543 /* Output the assembly for a call instruction. */
24544
24545 const char *
24546 ix86_output_call_insn (rtx insn, rtx call_op)
24547 {
24548 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
24549 bool seh_nop_p = false;
24550 const char *xasm;
24551
24552 if (SIBLING_CALL_P (insn))
24553 {
24554 if (direct_p)
24555 xasm = "%!jmp\t%P0";
24556 /* SEH epilogue detection requires the indirect branch case
24557 to include REX.W. */
24558 else if (TARGET_SEH)
24559 xasm = "%!rex.W jmp %A0";
24560 else
24561 xasm = "%!jmp\t%A0";
24562
24563 output_asm_insn (xasm, &call_op);
24564 return "";
24565 }
24566
24567 /* SEH unwinding can require an extra nop to be emitted in several
24568 circumstances. Determine if we have one of those. */
24569 if (TARGET_SEH)
24570 {
24571 rtx i;
24572
24573 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
24574 {
24575 /* If we get to another real insn, we don't need the nop. */
24576 if (INSN_P (i))
24577 break;
24578
24579 /* If we get to the epilogue note, prevent a catch region from
24580 being adjacent to the standard epilogue sequence. If non-
24581 call-exceptions, we'll have done this during epilogue emission. */
24582 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
24583 && !flag_non_call_exceptions
24584 && !can_throw_internal (insn))
24585 {
24586 seh_nop_p = true;
24587 break;
24588 }
24589 }
24590
24591 /* If we didn't find a real insn following the call, prevent the
24592 unwinder from looking into the next function. */
24593 if (i == NULL)
24594 seh_nop_p = true;
24595 }
24596
24597 if (direct_p)
24598 xasm = "%!call\t%P0";
24599 else
24600 xasm = "%!call\t%A0";
24601
24602 output_asm_insn (xasm, &call_op);
24603
24604 if (seh_nop_p)
24605 return "nop";
24606
24607 return "";
24608 }
24609 \f
24610 /* Clear stack slot assignments remembered from previous functions.
24611 This is called from INIT_EXPANDERS once before RTL is emitted for each
24612 function. */
24613
24614 static struct machine_function *
24615 ix86_init_machine_status (void)
24616 {
24617 struct machine_function *f;
24618
24619 f = ggc_alloc_cleared_machine_function ();
24620 f->use_fast_prologue_epilogue_nregs = -1;
24621 f->call_abi = ix86_abi;
24622
24623 return f;
24624 }
24625
24626 /* Return a MEM corresponding to a stack slot with mode MODE.
24627 Allocate a new slot if necessary.
24628
24629 The RTL for a function can have several slots available: N is
24630 which slot to use. */
24631
24632 rtx
24633 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
24634 {
24635 struct stack_local_entry *s;
24636
24637 gcc_assert (n < MAX_386_STACK_LOCALS);
24638
24639 for (s = ix86_stack_locals; s; s = s->next)
24640 if (s->mode == mode && s->n == n)
24641 return validize_mem (copy_rtx (s->rtl));
24642
24643 s = ggc_alloc_stack_local_entry ();
24644 s->n = n;
24645 s->mode = mode;
24646 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
24647
24648 s->next = ix86_stack_locals;
24649 ix86_stack_locals = s;
24650 return validize_mem (s->rtl);
24651 }
24652
24653 static void
24654 ix86_instantiate_decls (void)
24655 {
24656 struct stack_local_entry *s;
24657
24658 for (s = ix86_stack_locals; s; s = s->next)
24659 if (s->rtl != NULL_RTX)
24660 instantiate_decl_rtl (s->rtl);
24661 }
24662 \f
24663 /* Check whether x86 address PARTS is a pc-relative address. */
24664
24665 static bool
24666 rip_relative_addr_p (struct ix86_address *parts)
24667 {
24668 rtx base, index, disp;
24669
24670 base = parts->base;
24671 index = parts->index;
24672 disp = parts->disp;
24673
24674 if (disp && !base && !index)
24675 {
24676 if (TARGET_64BIT)
24677 {
24678 rtx symbol = disp;
24679
24680 if (GET_CODE (disp) == CONST)
24681 symbol = XEXP (disp, 0);
24682 if (GET_CODE (symbol) == PLUS
24683 && CONST_INT_P (XEXP (symbol, 1)))
24684 symbol = XEXP (symbol, 0);
24685
24686 if (GET_CODE (symbol) == LABEL_REF
24687 || (GET_CODE (symbol) == SYMBOL_REF
24688 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
24689 || (GET_CODE (symbol) == UNSPEC
24690 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
24691 || XINT (symbol, 1) == UNSPEC_PCREL
24692 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
24693 return true;
24694 }
24695 }
24696 return false;
24697 }
24698
24699 /* Calculate the length of the memory address in the instruction encoding.
24700 Includes addr32 prefix, does not include the one-byte modrm, opcode,
24701 or other prefixes. We never generate addr32 prefix for LEA insn. */
24702
24703 int
24704 memory_address_length (rtx addr, bool lea)
24705 {
24706 struct ix86_address parts;
24707 rtx base, index, disp;
24708 int len;
24709 int ok;
24710
24711 if (GET_CODE (addr) == PRE_DEC
24712 || GET_CODE (addr) == POST_INC
24713 || GET_CODE (addr) == PRE_MODIFY
24714 || GET_CODE (addr) == POST_MODIFY)
24715 return 0;
24716
24717 ok = ix86_decompose_address (addr, &parts);
24718 gcc_assert (ok);
24719
24720 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
24721
24722 /* If this is not LEA instruction, add the length of addr32 prefix. */
24723 if (TARGET_64BIT && !lea
24724 && (SImode_address_operand (addr, VOIDmode)
24725 || (parts.base && GET_MODE (parts.base) == SImode)
24726 || (parts.index && GET_MODE (parts.index) == SImode)))
24727 len++;
24728
24729 base = parts.base;
24730 index = parts.index;
24731 disp = parts.disp;
24732
24733 if (base && GET_CODE (base) == SUBREG)
24734 base = SUBREG_REG (base);
24735 if (index && GET_CODE (index) == SUBREG)
24736 index = SUBREG_REG (index);
24737
24738 gcc_assert (base == NULL_RTX || REG_P (base));
24739 gcc_assert (index == NULL_RTX || REG_P (index));
24740
24741 /* Rule of thumb:
24742 - esp as the base always wants an index,
24743 - ebp as the base always wants a displacement,
24744 - r12 as the base always wants an index,
24745 - r13 as the base always wants a displacement. */
24746
24747 /* Register Indirect. */
24748 if (base && !index && !disp)
24749 {
24750 /* esp (for its index) and ebp (for its displacement) need
24751 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
24752 code. */
24753 if (base == arg_pointer_rtx
24754 || base == frame_pointer_rtx
24755 || REGNO (base) == SP_REG
24756 || REGNO (base) == BP_REG
24757 || REGNO (base) == R12_REG
24758 || REGNO (base) == R13_REG)
24759 len++;
24760 }
24761
24762 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
24763 is not disp32, but disp32(%rip), so for disp32
24764 SIB byte is needed, unless print_operand_address
24765 optimizes it into disp32(%rip) or (%rip) is implied
24766 by UNSPEC. */
24767 else if (disp && !base && !index)
24768 {
24769 len += 4;
24770 if (rip_relative_addr_p (&parts))
24771 len++;
24772 }
24773 else
24774 {
24775 /* Find the length of the displacement constant. */
24776 if (disp)
24777 {
24778 if (base && satisfies_constraint_K (disp))
24779 len += 1;
24780 else
24781 len += 4;
24782 }
24783 /* ebp always wants a displacement. Similarly r13. */
24784 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
24785 len++;
24786
24787 /* An index requires the two-byte modrm form.... */
24788 if (index
24789 /* ...like esp (or r12), which always wants an index. */
24790 || base == arg_pointer_rtx
24791 || base == frame_pointer_rtx
24792 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
24793 len++;
24794 }
24795
24796 return len;
24797 }
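
/* Illustrative worked examples added for exposition (not part of the
   original file): lengths the function above computes for a few 64-bit
   addresses, excluding the modrm and opcode bytes and assuming no segment
   override and no addr32 prefix:
     (%rax)            -> 0  plain register indirect
     (%rsp)            -> 1  SIB byte forced by %rsp as base
     8(%rbp)           -> 1  disp8 forced by %rbp as base
     16(%rax,%rbx,4)   -> 2  SIB byte plus disp8  */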
24798
24799 /* Compute default value for "length_immediate" attribute. When SHORTFORM
24800 is set, expect that the insn has an 8-bit immediate alternative. */
24801 int
24802 ix86_attr_length_immediate_default (rtx insn, bool shortform)
24803 {
24804 int len = 0;
24805 int i;
24806 extract_insn_cached (insn);
24807 for (i = recog_data.n_operands - 1; i >= 0; --i)
24808 if (CONSTANT_P (recog_data.operand[i]))
24809 {
24810 enum attr_mode mode = get_attr_mode (insn);
24811
24812 gcc_assert (!len);
24813 if (shortform && CONST_INT_P (recog_data.operand[i]))
24814 {
24815 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
24816 switch (mode)
24817 {
24818 case MODE_QI:
24819 len = 1;
24820 continue;
24821 case MODE_HI:
24822 ival = trunc_int_for_mode (ival, HImode);
24823 break;
24824 case MODE_SI:
24825 ival = trunc_int_for_mode (ival, SImode);
24826 break;
24827 default:
24828 break;
24829 }
24830 if (IN_RANGE (ival, -128, 127))
24831 {
24832 len = 1;
24833 continue;
24834 }
24835 }
24836 switch (mode)
24837 {
24838 case MODE_QI:
24839 len = 1;
24840 break;
24841 case MODE_HI:
24842 len = 2;
24843 break;
24844 case MODE_SI:
24845 len = 4;
24846 break;
24847 /* Immediates for DImode instructions are encoded
24848 as 32bit sign extended values. */
24849 case MODE_DI:
24850 len = 4;
24851 break;
24852 default:
24853 fatal_insn ("unknown insn mode", insn);
24854 }
24855 }
24856 return len;
24857 }
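
/* Illustrative worked example added for exposition (not part of the
   original file): with SHORTFORM set, an SImode insn with immediate 100
   fits the sign-extended 8-bit form and gets length 1, while immediate
   1000 needs the full 32-bit encoding and gets length 4.  */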
24858
24859 /* Compute default value for "length_address" attribute. */
24860 int
24861 ix86_attr_length_address_default (rtx insn)
24862 {
24863 int i;
24864
24865 if (get_attr_type (insn) == TYPE_LEA)
24866 {
24867 rtx set = PATTERN (insn), addr;
24868
24869 if (GET_CODE (set) == PARALLEL)
24870 set = XVECEXP (set, 0, 0);
24871
24872 gcc_assert (GET_CODE (set) == SET);
24873
24874 addr = SET_SRC (set);
24875
24876 return memory_address_length (addr, true);
24877 }
24878
24879 extract_insn_cached (insn);
24880 for (i = recog_data.n_operands - 1; i >= 0; --i)
24881 if (MEM_P (recog_data.operand[i]))
24882 {
24883 constrain_operands_cached (reload_completed);
24884 if (which_alternative != -1)
24885 {
24886 const char *constraints = recog_data.constraints[i];
24887 int alt = which_alternative;
24888
24889 while (*constraints == '=' || *constraints == '+')
24890 constraints++;
24891 while (alt-- > 0)
24892 while (*constraints++ != ',')
24893 ;
24894 /* Skip ignored operands. */
24895 if (*constraints == 'X')
24896 continue;
24897 }
24898 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24899 }
24900 return 0;
24901 }
24902
24903 /* Compute default value for "length_vex" attribute. It includes
24904 2 or 3 byte VEX prefix and 1 opcode byte. */
24905
24906 int
24907 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24908 {
24909 int i;
24910
24911 /* Only the 0f opcode can use the 2 byte VEX prefix; the VEX W bit requires
24912 the 3 byte VEX prefix. */
24913 if (!has_0f_opcode || has_vex_w)
24914 return 3 + 1;
24915
24916 /* We can always use the 2 byte VEX prefix in 32-bit mode. */
24917 if (!TARGET_64BIT)
24918 return 2 + 1;
24919
24920 extract_insn_cached (insn);
24921
24922 for (i = recog_data.n_operands - 1; i >= 0; --i)
24923 if (REG_P (recog_data.operand[i]))
24924 {
24925 /* REX.W bit uses 3 byte VEX prefix. */
24926 if (GET_MODE (recog_data.operand[i]) == DImode
24927 && GENERAL_REG_P (recog_data.operand[i]))
24928 return 3 + 1;
24929 }
24930 else
24931 {
24932 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24933 if (MEM_P (recog_data.operand[i])
24934 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24935 return 3 + 1;
24936 }
24937
24938 return 2 + 1;
24939 }
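
/* Illustrative worked examples added for exposition (not part of the
   original file): per the function above, "vaddps %xmm1, %xmm2, %xmm3"
   can use the 2 byte VEX prefix (length 3 including the opcode byte),
   while a DImode general register operand (VEX.W) or a memory operand
   mentioning an extended register such as %r13 forces the 3 byte prefix
   (length 4).  */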
24940 \f
24941 /* Return the maximum number of instructions a cpu can issue. */
24942
24943 static int
24944 ix86_issue_rate (void)
24945 {
24946 switch (ix86_tune)
24947 {
24948 case PROCESSOR_PENTIUM:
24949 case PROCESSOR_ATOM:
24950 case PROCESSOR_SLM:
24951 case PROCESSOR_K6:
24952 case PROCESSOR_BTVER2:
24953 case PROCESSOR_PENTIUM4:
24954 case PROCESSOR_NOCONA:
24955 return 2;
24956
24957 case PROCESSOR_PENTIUMPRO:
24958 case PROCESSOR_ATHLON:
24959 case PROCESSOR_K8:
24960 case PROCESSOR_AMDFAM10:
24961 case PROCESSOR_GENERIC:
24962 case PROCESSOR_BTVER1:
24963 return 3;
24964
24965 case PROCESSOR_BDVER1:
24966 case PROCESSOR_BDVER2:
24967 case PROCESSOR_BDVER3:
24968 case PROCESSOR_BDVER4:
24969 case PROCESSOR_CORE2:
24970 case PROCESSOR_COREI7:
24971 case PROCESSOR_COREI7_AVX:
24972 case PROCESSOR_HASWELL:
24973 return 4;
24974
24975 default:
24976 return 1;
24977 }
24978 }
24979
24980 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
24981 by DEP_INSN and nothing else set by DEP_INSN. */
24982
24983 static bool
24984 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24985 {
24986 rtx set, set2;
24987
24988 /* Simplify the test for uninteresting insns. */
24989 if (insn_type != TYPE_SETCC
24990 && insn_type != TYPE_ICMOV
24991 && insn_type != TYPE_FCMOV
24992 && insn_type != TYPE_IBR)
24993 return false;
24994
24995 if ((set = single_set (dep_insn)) != 0)
24996 {
24997 set = SET_DEST (set);
24998 set2 = NULL_RTX;
24999 }
25000 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
25001 && XVECLEN (PATTERN (dep_insn), 0) == 2
25002 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
25003 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
25004 {
25005 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
25006 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
25007 }
25008 else
25009 return false;
25010
25011 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
25012 return false;
25013
25014 /* This test is true if the dependent insn reads the flags but
25015 not any other potentially set register. */
25016 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
25017 return false;
25018
25019 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
25020 return false;
25021
25022 return true;
25023 }
25024
25025 /* Return true iff USE_INSN has a memory address with operands set by
25026 SET_INSN. */
25027
25028 bool
25029 ix86_agi_dependent (rtx set_insn, rtx use_insn)
25030 {
25031 int i;
25032 extract_insn_cached (use_insn);
25033 for (i = recog_data.n_operands - 1; i >= 0; --i)
25034 if (MEM_P (recog_data.operand[i]))
25035 {
25036 rtx addr = XEXP (recog_data.operand[i], 0);
25037 return modified_in_p (addr, set_insn) != 0;
25038 }
25039 return false;
25040 }
25041
25042 /* Helper function for exact_store_load_dependency.
25043 Return true if addr is found in insn. */
25044 static bool
25045 exact_dependency_1 (rtx addr, rtx insn)
25046 {
25047 enum rtx_code code;
25048 const char *format_ptr;
25049 int i, j;
25050
25051 code = GET_CODE (insn);
25052 switch (code)
25053 {
25054 case MEM:
25055 if (rtx_equal_p (addr, insn))
25056 return true;
25057 break;
25058 case REG:
25059 CASE_CONST_ANY:
25060 case SYMBOL_REF:
25061 case CODE_LABEL:
25062 case PC:
25063 case CC0:
25064 case EXPR_LIST:
25065 return false;
25066 default:
25067 break;
25068 }
25069
25070 format_ptr = GET_RTX_FORMAT (code);
25071 for (i = 0; i < GET_RTX_LENGTH (code); i++)
25072 {
25073 switch (*format_ptr++)
25074 {
25075 case 'e':
25076 if (exact_dependency_1 (addr, XEXP (insn, i)))
25077 return true;
25078 break;
25079 case 'E':
25080 for (j = 0; j < XVECLEN (insn, i); j++)
25081 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
25082 return true;
25083 break;
25084 }
25085 }
25086 return false;
25087 }
25088
25089 /* Return true if there exists exact dependency for store & load, i.e.
25090 the same memory address is used in them. */
25091 static bool
25092 exact_store_load_dependency (rtx store, rtx load)
25093 {
25094 rtx set1, set2;
25095
25096 set1 = single_set (store);
25097 if (!set1)
25098 return false;
25099 if (!MEM_P (SET_DEST (set1)))
25100 return false;
25101 set2 = single_set (load);
25102 if (!set2)
25103 return false;
25104 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
25105 return true;
25106 return false;
25107 }
25108
25109 static int
25110 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
25111 {
25112 enum attr_type insn_type, dep_insn_type;
25113 enum attr_memory memory;
25114 rtx set, set2;
25115 int dep_insn_code_number;
25116
25117 /* Anti and output dependencies have zero cost on all CPUs. */
25118 if (REG_NOTE_KIND (link) != 0)
25119 return 0;
25120
25121 dep_insn_code_number = recog_memoized (dep_insn);
25122
25123 /* If we can't recognize the insns, we can't really do anything. */
25124 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
25125 return cost;
25126
25127 insn_type = get_attr_type (insn);
25128 dep_insn_type = get_attr_type (dep_insn);
25129
25130 switch (ix86_tune)
25131 {
25132 case PROCESSOR_PENTIUM:
25133 /* Address Generation Interlock adds a cycle of latency. */
25134 if (insn_type == TYPE_LEA)
25135 {
25136 rtx addr = PATTERN (insn);
25137
25138 if (GET_CODE (addr) == PARALLEL)
25139 addr = XVECEXP (addr, 0, 0);
25140
25141 gcc_assert (GET_CODE (addr) == SET);
25142
25143 addr = SET_SRC (addr);
25144 if (modified_in_p (addr, dep_insn))
25145 cost += 1;
25146 }
25147 else if (ix86_agi_dependent (dep_insn, insn))
25148 cost += 1;
25149
25150 /* ??? Compares pair with jump/setcc. */
25151 if (ix86_flags_dependent (insn, dep_insn, insn_type))
25152 cost = 0;
25153
25154 /* Floating point stores require value to be ready one cycle earlier. */
25155 if (insn_type == TYPE_FMOV
25156 && get_attr_memory (insn) == MEMORY_STORE
25157 && !ix86_agi_dependent (dep_insn, insn))
25158 cost += 1;
25159 break;
25160
25161 case PROCESSOR_PENTIUMPRO:
25162 memory = get_attr_memory (insn);
25163
25164 /* INT->FP conversion is expensive. */
25165 if (get_attr_fp_int_src (dep_insn))
25166 cost += 5;
25167
25168 /* There is one cycle extra latency between an FP op and a store. */
25169 if (insn_type == TYPE_FMOV
25170 && (set = single_set (dep_insn)) != NULL_RTX
25171 && (set2 = single_set (insn)) != NULL_RTX
25172 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
25173 && MEM_P (SET_DEST (set2)))
25174 cost += 1;
25175
25176 /* Model the reorder buffer's ability to hide the latency of a load by
25177 executing it in parallel with the previous instruction when the
25178 previous instruction is not needed to compute the address. */
25179 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25180 && !ix86_agi_dependent (dep_insn, insn))
25181 {
25182 /* Claim that moves take one cycle, as the core can issue one load
25183 at a time and the next load can start a cycle later. */
25184 if (dep_insn_type == TYPE_IMOV
25185 || dep_insn_type == TYPE_FMOV)
25186 cost = 1;
25187 else if (cost > 1)
25188 cost--;
25189 }
25190 break;
25191
25192 case PROCESSOR_K6:
25193 memory = get_attr_memory (insn);
25194
25195 /* The esp dependency is resolved before the instruction is really
25196 finished. */
25197 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25198 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25199 return 1;
25200
25201 /* INT->FP conversion is expensive. */
25202 if (get_attr_fp_int_src (dep_insn))
25203 cost += 5;
25204
25205 /* Model the reorder buffer's ability to hide the latency of a load by
25206 executing it in parallel with the previous instruction when the
25207 previous instruction is not needed to compute the address. */
25208 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25209 && !ix86_agi_dependent (dep_insn, insn))
25210 {
25211 /* Claim that moves take one cycle, as the core can issue one load
25212 at a time and the next load can start a cycle later. */
25213 if (dep_insn_type == TYPE_IMOV
25214 || dep_insn_type == TYPE_FMOV)
25215 cost = 1;
25216 else if (cost > 2)
25217 cost -= 2;
25218 else
25219 cost = 1;
25220 }
25221 break;
25222
25223 case PROCESSOR_ATHLON:
25224 case PROCESSOR_K8:
25225 case PROCESSOR_AMDFAM10:
25226 case PROCESSOR_BDVER1:
25227 case PROCESSOR_BDVER2:
25228 case PROCESSOR_BDVER3:
25229 case PROCESSOR_BDVER4:
25230 case PROCESSOR_BTVER1:
25231 case PROCESSOR_BTVER2:
25232 case PROCESSOR_GENERIC:
25233 memory = get_attr_memory (insn);
25234
25235 /* The stack engine allows push and pop instructions to execute in parallel. */
25236 if (((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25237 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25238 && (ix86_tune != PROCESSOR_ATHLON && ix86_tune != PROCESSOR_K8))
25239 return 0;
25240
25241 /* Model the reorder buffer's ability to hide the latency of a load by
25242 executing it in parallel with the previous instruction when the
25243 previous instruction is not needed to compute the address. */
25244 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25245 && !ix86_agi_dependent (dep_insn, insn))
25246 {
25247 enum attr_unit unit = get_attr_unit (insn);
25248 int loadcost = 3;
25249
25250 /* Because of the difference between the length of integer and
25251 floating unit pipeline preparation stages, the memory operands
25252 for floating point are cheaper.
25253
25254 ??? For Athlon the difference is most probably 2. */
25255 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
25256 loadcost = 3;
25257 else
25258 loadcost = TARGET_ATHLON ? 2 : 0;
25259
25260 if (cost >= loadcost)
25261 cost -= loadcost;
25262 else
25263 cost = 0;
25264 }
25265 break;
25266
25267 case PROCESSOR_CORE2:
25268 case PROCESSOR_COREI7:
25269 case PROCESSOR_COREI7_AVX:
25270 case PROCESSOR_HASWELL:
25271 memory = get_attr_memory (insn);
25272
25273 /* The stack engine allows push and pop instructions to execute in parallel. */
25274 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25275 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25276 return 0;
25277
25278 /* Model the reorder buffer's ability to hide the latency of a load by
25279 executing it in parallel with the previous instruction when the
25280 previous instruction is not needed to compute the address. */
25281 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25282 && !ix86_agi_dependent (dep_insn, insn))
25283 {
25284 if (cost >= 4)
25285 cost -= 4;
25286 else
25287 cost = 0;
25288 }
25289 break;
25290
25291 case PROCESSOR_SLM:
25292 if (!reload_completed)
25293 return cost;
25294
25295 /* Increase cost of integer loads. */
25296 memory = get_attr_memory (dep_insn);
25297 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25298 {
25299 enum attr_unit unit = get_attr_unit (dep_insn);
25300 if (unit == UNIT_INTEGER && cost == 1)
25301 {
25302 if (memory == MEMORY_LOAD)
25303 cost = 3;
25304 else
25305 {
25306 /* Increase cost of ld/st for short int types only
25307 because of store forwarding issue. */
25308 rtx set = single_set (dep_insn);
25309 if (set && (GET_MODE (SET_DEST (set)) == QImode
25310 || GET_MODE (SET_DEST (set)) == HImode))
25311 {
25312 /* Increase cost of store/load insn if exact
25313 dependence exists and it is load insn. */
25314 enum attr_memory insn_memory = get_attr_memory (insn);
25315 if (insn_memory == MEMORY_LOAD
25316 && exact_store_load_dependency (dep_insn, insn))
25317 cost = 3;
25318 }
25319 }
25320 }
25321 }
25322
25323 default:
25324 break;
25325 }
25326
25327 return cost;
25328 }
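/* A rough worked example of the adjustments above: on Core 2/i7-class CPUs,
   when INSN contains a load whose address does not depend on DEP_INSN
   (e.g. DEP_INSN produces only a register operand of an add-from-memory),
   the load can start while DEP_INSN is still executing, so the dependence
   cost is reduced by 4 cycles -- a reported cost of 5 becomes 1, and
   anything smaller is clamped to 0.  */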
25329
25330 /* How many alternative schedules to try. This should be as wide as the
25331 scheduling freedom in the DFA, but no wider. Making this value too
25332 large results in extra work for the scheduler. */
25333
25334 static int
25335 ia32_multipass_dfa_lookahead (void)
25336 {
25337 switch (ix86_tune)
25338 {
25339 case PROCESSOR_PENTIUM:
25340 return 2;
25341
25342 case PROCESSOR_PENTIUMPRO:
25343 case PROCESSOR_K6:
25344 return 1;
25345
25346 case PROCESSOR_BDVER1:
25347 case PROCESSOR_BDVER2:
25348 case PROCESSOR_BDVER3:
25349 case PROCESSOR_BDVER4:
25350 /* We use a lookahead value of 4 for BD both before and after reload
25351 scheduling. The plan is to use a value of 8 for -O3. */
25352 return 4;
25353
25354 case PROCESSOR_CORE2:
25355 case PROCESSOR_COREI7:
25356 case PROCESSOR_COREI7_AVX:
25357 case PROCESSOR_HASWELL:
25358 case PROCESSOR_ATOM:
25359 case PROCESSOR_SLM:
25360 /* Generally, we want haifa-sched:max_issue() to look ahead as far as
25361 the number of instructions that can be executed in a cycle, i.e.,
25362 issue_rate. I wonder why tuning for many CPUs does not do this. */
25363 if (reload_completed)
25364 return ix86_issue_rate ();
25365 /* Don't use lookahead for pre-reload schedule to save compile time. */
25366 return 0;
25367
25368 default:
25369 return 0;
25370 }
25371 }
25372
25373 /* Return true if target platform supports macro-fusion. */
25374
25375 static bool
25376 ix86_macro_fusion_p ()
25377 {
25378 return TARGET_FUSE_CMP_AND_BRANCH;
25379 }
25380
25381 /* Check whether the current microarchitecture supports macro fusion
25382 for insn pair "CONDGEN + CONDJMP". Refer to
25383 "Intel Architectures Optimization Reference Manual". */
25384
25385 static bool
25386 ix86_macro_fusion_pair_p (rtx condgen, rtx condjmp)
25387 {
25388 rtx src, dest;
25389 rtx single_set = single_set (condgen);
25390 enum rtx_code ccode;
25391 rtx compare_set = NULL_RTX, test_if, cond;
25392 rtx alu_set = NULL_RTX, addr = NULL_RTX;
25393
25394 if (get_attr_type (condgen) != TYPE_TEST
25395 && get_attr_type (condgen) != TYPE_ICMP
25396 && get_attr_type (condgen) != TYPE_INCDEC
25397 && get_attr_type (condgen) != TYPE_ALU)
25398 return false;
25399
25400 if (single_set == NULL_RTX
25401 && !TARGET_FUSE_ALU_AND_BRANCH)
25402 return false;
25403
25404 if (single_set != NULL_RTX)
25405 compare_set = single_set;
25406 else
25407 {
25408 int i;
25409 rtx pat = PATTERN (condgen);
25410 for (i = 0; i < XVECLEN (pat, 0); i++)
25411 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
25412 {
25413 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
25414 if (GET_CODE (set_src) == COMPARE)
25415 compare_set = XVECEXP (pat, 0, i);
25416 else
25417 alu_set = XVECEXP (pat, 0, i);
25418 }
25419 }
25420 if (compare_set == NULL_RTX)
25421 return false;
25422 src = SET_SRC (compare_set);
25423 if (GET_CODE (src) != COMPARE)
25424 return false;
25425
25426 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
25427 supported. */
25428 if ((MEM_P (XEXP (src, 0))
25429 && CONST_INT_P (XEXP (src, 1)))
25430 || (MEM_P (XEXP (src, 1))
25431 && CONST_INT_P (XEXP (src, 0))))
25432 return false;
25433
25434 /* No fusion for RIP-relative address. */
25435 if (MEM_P (XEXP (src, 0)))
25436 addr = XEXP (XEXP (src, 0), 0);
25437 else if (MEM_P (XEXP (src, 1)))
25438 addr = XEXP (XEXP (src, 1), 0);
25439
25440 if (addr) {
25441 ix86_address parts;
25442 int ok = ix86_decompose_address (addr, &parts);
25443 gcc_assert (ok);
25444
25445 if (rip_relative_addr_p (&parts))
25446 return false;
25447 }
25448
25449 test_if = SET_SRC (pc_set (condjmp));
25450 cond = XEXP (test_if, 0);
25451 ccode = GET_CODE (cond);
25452 /* Check whether the conditional jump uses the Sign or Overflow flags. */
25453 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
25454 && (ccode == GE
25455 || ccode == GT
25456 || ccode == LE
25457 || ccode == LT))
25458 return false;
25459
25460 /* Return true for TYPE_TEST and TYPE_ICMP. */
25461 if (get_attr_type (condgen) == TYPE_TEST
25462 || get_attr_type (condgen) == TYPE_ICMP)
25463 return true;
25464
25465 /* The following handles the case of macro-fusion for alu + jmp. */
25466 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
25467 return false;
25468
25469 /* No fusion for alu op with memory destination operand. */
25470 dest = SET_DEST (alu_set);
25471 if (MEM_P (dest))
25472 return false;
25473
25474 /* Macro-fusion for inc/dec + unsigned conditional jump is not
25475 supported. */
25476 if (get_attr_type (condgen) == TYPE_INCDEC
25477 && (ccode == GEU
25478 || ccode == GTU
25479 || ccode == LEU
25480 || ccode == LTU))
25481 return false;
25482
25483 return true;
25484 }
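/* Some illustrative pairs for the checks above (an informal summary in AT&T
   syntax, assuming the relevant TARGET_FUSE_* tuning flags are set; not
   exhaustive):

       cmpl %esi, %edi ; jne .L1      fusible: TYPE_ICMP + conditional jump
       testl %eax, %eax ; je .L2      fusible: TYPE_TEST + conditional jump
       cmpl $0, (%rdi) ; je .L3       rejected: cmp/test of MEM with IMM
       decl %ecx ; jae .L4            rejected: inc/dec + unsigned condition,
                                      since inc/dec do not update CF.  */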
25485
25486 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
25487 execution. It is applied if
25488 (1) an IMUL instruction is at the top of the list;
25489 (2) there is exactly one producer of an independent IMUL instruction in
25490 the ready list.
25491 Return the index of the IMUL producer if it was found and -1 otherwise. */
25492 static int
25493 do_reorder_for_imul (rtx *ready, int n_ready)
25494 {
25495 rtx insn, set, insn1, insn2;
25496 sd_iterator_def sd_it;
25497 dep_t dep;
25498 int index = -1;
25499 int i;
25500
25501 if (ix86_tune != PROCESSOR_ATOM)
25502 return index;
25503
25504 /* Check that IMUL instruction is on the top of ready list. */
25505 insn = ready[n_ready - 1];
25506 set = single_set (insn);
25507 if (!set)
25508 return index;
25509 if (!(GET_CODE (SET_SRC (set)) == MULT
25510 && GET_MODE (SET_SRC (set)) == SImode))
25511 return index;
25512
25513 /* Search for producer of independent IMUL instruction. */
25514 for (i = n_ready - 2; i >= 0; i--)
25515 {
25516 insn = ready[i];
25517 if (!NONDEBUG_INSN_P (insn))
25518 continue;
25519 /* Skip IMUL instruction. */
25520 insn2 = PATTERN (insn);
25521 if (GET_CODE (insn2) == PARALLEL)
25522 insn2 = XVECEXP (insn2, 0, 0);
25523 if (GET_CODE (insn2) == SET
25524 && GET_CODE (SET_SRC (insn2)) == MULT
25525 && GET_MODE (SET_SRC (insn2)) == SImode)
25526 continue;
25527
25528 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
25529 {
25530 rtx con;
25531 con = DEP_CON (dep);
25532 if (!NONDEBUG_INSN_P (con))
25533 continue;
25534 insn1 = PATTERN (con);
25535 if (GET_CODE (insn1) == PARALLEL)
25536 insn1 = XVECEXP (insn1, 0, 0);
25537
25538 if (GET_CODE (insn1) == SET
25539 && GET_CODE (SET_SRC (insn1)) == MULT
25540 && GET_MODE (SET_SRC (insn1)) == SImode)
25541 {
25542 sd_iterator_def sd_it1;
25543 dep_t dep1;
25544 /* Check that the IMUL has no producers other than this insn. */
25545 index = i;
25546 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
25547 {
25548 rtx pro;
25549 pro = DEP_PRO (dep1);
25550 if (!NONDEBUG_INSN_P (pro))
25551 continue;
25552 if (pro != insn)
25553 index = -1;
25554 }
25555 if (index >= 0)
25556 break;
25557 }
25558 }
25559 if (index >= 0)
25560 break;
25561 }
25562 return index;
25563 }
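/* A small illustrative scenario for the search above: given a post-reload
   ready list such as { ..., producer, ..., imul }, where "imul" is a SImode
   multiply at the top and "producer" is the sole instruction feeding another,
   independent SImode multiply, the function returns the index of "producer"
   so that ix86_sched_reorder below can move it to the top and keep Atom's
   pipelined IMUL unit busy.  */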
25564
25565 /* Try to find the best candidate at the top of the ready list when two insns
25566 have the same priority - the better candidate is the one whose producers
25567 were scheduled earlier. Applied for Silvermont only.
25568 Return true if the top 2 insns must be interchanged. */
25569 static bool
25570 swap_top_of_ready_list (rtx *ready, int n_ready)
25571 {
25572 rtx top = ready[n_ready - 1];
25573 rtx next = ready[n_ready - 2];
25574 rtx set;
25575 sd_iterator_def sd_it;
25576 dep_t dep;
25577 int clock1 = -1;
25578 int clock2 = -1;
25579 #define INSN_TICK(INSN) (HID (INSN)->tick)
25580
25581 if (ix86_tune != PROCESSOR_SLM)
25582 return false;
25583
25584 if (!NONDEBUG_INSN_P (top))
25585 return false;
25586 if (!NONJUMP_INSN_P (top))
25587 return false;
25588 if (!NONDEBUG_INSN_P (next))
25589 return false;
25590 if (!NONJUMP_INSN_P (next))
25591 return false;
25592 set = single_set (top);
25593 if (!set)
25594 return false;
25595 set = single_set (next);
25596 if (!set)
25597 return false;
25598
25599 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
25600 {
25601 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
25602 return false;
25603 /* Determine the winner more precisely. */
25604 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
25605 {
25606 rtx pro;
25607 pro = DEP_PRO (dep);
25608 if (!NONDEBUG_INSN_P (pro))
25609 continue;
25610 if (INSN_TICK (pro) > clock1)
25611 clock1 = INSN_TICK (pro);
25612 }
25613 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
25614 {
25615 rtx pro;
25616 pro = DEP_PRO (dep);
25617 if (!NONDEBUG_INSN_P (pro))
25618 continue;
25619 if (INSN_TICK (pro) > clock2)
25620 clock2 = INSN_TICK (pro);
25621 }
25622
25623 if (clock1 == clock2)
25624 {
25625 /* Determine winner - load must win. */
25626 enum attr_memory memory1, memory2;
25627 memory1 = get_attr_memory (top);
25628 memory2 = get_attr_memory (next);
25629 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
25630 return true;
25631 }
25632 return (bool) (clock2 < clock1);
25633 }
25634 return false;
25635 #undef INSN_TICK
25636 }
25637
25638 /* Perform possible reordering of the ready list for Atom/Silvermont only.
25639 Return the issue rate. */
25640 static int
25641 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
25642 int clock_var)
25643 {
25644 int issue_rate = -1;
25645 int n_ready = *pn_ready;
25646 int i;
25647 rtx insn;
25648 int index = -1;
25649
25650 /* Set up issue rate. */
25651 issue_rate = ix86_issue_rate ();
25652
25653 /* Do reordering for Atom/SLM only. */
25654 if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM)
25655 return issue_rate;
25656
25657 /* Nothing to do if ready list contains only 1 instruction. */
25658 if (n_ready <= 1)
25659 return issue_rate;
25660
25661 /* Do reordering for the post-reload scheduler only. */
25662 if (!reload_completed)
25663 return issue_rate;
25664
25665 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
25666 {
25667 if (sched_verbose > 1)
25668 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
25669 INSN_UID (ready[index]));
25670
25671 /* Put IMUL producer (ready[index]) at the top of ready list. */
25672 insn = ready[index];
25673 for (i = index; i < n_ready - 1; i++)
25674 ready[i] = ready[i + 1];
25675 ready[n_ready - 1] = insn;
25676 return issue_rate;
25677 }
25678 if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready))
25679 {
25680 if (sched_verbose > 1)
25681 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
25682 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
25683 /* Swap 2 top elements of ready list. */
25684 insn = ready[n_ready - 1];
25685 ready[n_ready - 1] = ready[n_ready - 2];
25686 ready[n_ready - 2] = insn;
25687 }
25688 return issue_rate;
25689 }
25690
25691 static bool
25692 ix86_class_likely_spilled_p (reg_class_t);
25693
25694 /* Return true if the lhs of INSN is a HW function argument register, and set
25695 *is_spilled to true if it is a likely-spilled HW register. */
25696 static bool
25697 insn_is_function_arg (rtx insn, bool* is_spilled)
25698 {
25699 rtx dst;
25700
25701 if (!NONDEBUG_INSN_P (insn))
25702 return false;
25703 /* Call instructions are not movable; ignore them. */
25704 if (CALL_P (insn))
25705 return false;
25706 insn = PATTERN (insn);
25707 if (GET_CODE (insn) == PARALLEL)
25708 insn = XVECEXP (insn, 0, 0);
25709 if (GET_CODE (insn) != SET)
25710 return false;
25711 dst = SET_DEST (insn);
25712 if (REG_P (dst) && HARD_REGISTER_P (dst)
25713 && ix86_function_arg_regno_p (REGNO (dst)))
25714 {
25715 /* Is it likely spilled HW register? */
25716 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
25717 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
25718 *is_spilled = true;
25719 return true;
25720 }
25721 return false;
25722 }
25723
25724 /* Add output dependencies for a chain of adjacent function arguments, but only
25725 if there is a move to a likely-spilled HW register. Return the first argument
25726 if at least one dependence was added, or NULL otherwise. */
25727 static rtx
25728 add_parameter_dependencies (rtx call, rtx head)
25729 {
25730 rtx insn;
25731 rtx last = call;
25732 rtx first_arg = NULL;
25733 bool is_spilled = false;
25734
25735 head = PREV_INSN (head);
25736
25737 /* Find the argument-passing instruction nearest to the call. */
25738 while (true)
25739 {
25740 last = PREV_INSN (last);
25741 if (last == head)
25742 return NULL;
25743 if (!NONDEBUG_INSN_P (last))
25744 continue;
25745 if (insn_is_function_arg (last, &is_spilled))
25746 break;
25747 return NULL;
25748 }
25749
25750 first_arg = last;
25751 while (true)
25752 {
25753 insn = PREV_INSN (last);
25754 if (!INSN_P (insn))
25755 break;
25756 if (insn == head)
25757 break;
25758 if (!NONDEBUG_INSN_P (insn))
25759 {
25760 last = insn;
25761 continue;
25762 }
25763 if (insn_is_function_arg (insn, &is_spilled))
25764 {
25765 /* Add an output dependence between two function arguments if the chain
25766 of output arguments contains likely-spilled HW registers. */
25767 if (is_spilled)
25768 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25769 first_arg = last = insn;
25770 }
25771 else
25772 break;
25773 }
25774 if (!is_spilled)
25775 return NULL;
25776 return first_arg;
25777 }
25778
25779 /* Add output or anti dependency from insn to first_arg to restrict its code
25780 motion. */
25781 static void
25782 avoid_func_arg_motion (rtx first_arg, rtx insn)
25783 {
25784 rtx set;
25785 rtx tmp;
25786
25787 set = single_set (insn);
25788 if (!set)
25789 return;
25790 tmp = SET_DEST (set);
25791 if (REG_P (tmp))
25792 {
25793 /* Add output dependency to the first function argument. */
25794 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25795 return;
25796 }
25797 /* Add anti dependency. */
25798 add_dependence (first_arg, insn, REG_DEP_ANTI);
25799 }
25800
25801 /* Avoid cross-block motion of a function argument by adding a dependency
25802 from the first non-jump instruction in BB. */
25803 static void
25804 add_dependee_for_func_arg (rtx arg, basic_block bb)
25805 {
25806 rtx insn = BB_END (bb);
25807
25808 while (insn)
25809 {
25810 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
25811 {
25812 rtx set = single_set (insn);
25813 if (set)
25814 {
25815 avoid_func_arg_motion (arg, insn);
25816 return;
25817 }
25818 }
25819 if (insn == BB_HEAD (bb))
25820 return;
25821 insn = PREV_INSN (insn);
25822 }
25823 }
25824
25825 /* Hook for pre-reload schedule - avoid motion of function arguments
25826 passed in likely spilled HW registers. */
25827 static void
25828 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
25829 {
25830 rtx insn;
25831 rtx first_arg = NULL;
25832 if (reload_completed)
25833 return;
25834 while (head != tail && DEBUG_INSN_P (head))
25835 head = NEXT_INSN (head);
25836 for (insn = tail; insn != head; insn = PREV_INSN (insn))
25837 if (INSN_P (insn) && CALL_P (insn))
25838 {
25839 first_arg = add_parameter_dependencies (insn, head);
25840 if (first_arg)
25841 {
25842 /* Add a dependee for the first argument in predecessor blocks, but only
25843 if the region contains more than one block. */
25844 basic_block bb = BLOCK_FOR_INSN (insn);
25845 int rgn = CONTAINING_RGN (bb->index);
25846 int nr_blks = RGN_NR_BLOCKS (rgn);
25847 /* Skip trivial regions and region head blocks that can have
25848 predecessors outside of region. */
25849 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
25850 {
25851 edge e;
25852 edge_iterator ei;
25853 /* Assume that region is SCC, i.e. all immediate predecessors
25854 of non-head block are in the same region. */
25855 FOR_EACH_EDGE (e, ei, bb->preds)
25856 {
25857 /* Avoid creating loop-carried dependencies by using the
25858 topological ordering of the region. */
25859 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
25860 add_dependee_for_func_arg (first_arg, e->src);
25861 }
25862 }
25863 insn = first_arg;
25864 if (insn == head)
25865 break;
25866 }
25867 }
25868 else if (first_arg)
25869 avoid_func_arg_motion (first_arg, insn);
25870 }
25871
25872 /* Hook for pre-reload schedule - set the priority of moves from likely-spilled
25873 HW registers to the maximum, to schedule them as soon as possible. These are
25874 moves from function argument registers at the top of the function entry
25875 and moves from function return value registers after a call. */
25876 static int
25877 ix86_adjust_priority (rtx insn, int priority)
25878 {
25879 rtx set;
25880
25881 if (reload_completed)
25882 return priority;
25883
25884 if (!NONDEBUG_INSN_P (insn))
25885 return priority;
25886
25887 set = single_set (insn);
25888 if (set)
25889 {
25890 rtx tmp = SET_SRC (set);
25891 if (REG_P (tmp)
25892 && HARD_REGISTER_P (tmp)
25893 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
25894 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
25895 return current_sched_info->sched_max_insns_priority;
25896 }
25897
25898 return priority;
25899 }
25900
25901 /* Model the decoder of Core 2/i7.
25902 The hooks below, used for multipass scheduling (see haifa-sched.c:max_issue),
25903 track the instruction fetch block boundaries and make sure that long
25904 (9+ byte) instructions are assigned to D0. */
25905
25906 /* Maximum length of an insn that can be handled by
25907 a secondary decoder unit. '8' for Core 2/i7. */
25908 static int core2i7_secondary_decoder_max_insn_size;
25909
25910 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
25911 '16' for Core 2/i7. */
25912 static int core2i7_ifetch_block_size;
25913
25914 /* Maximum number of instructions decoder can handle per cycle.
25915 '6' for Core 2/i7. */
25916 static int core2i7_ifetch_block_max_insns;
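/* A rough example of how these limits interact: with a 16-byte ifetch block
   and pending insns of 7, 8 and 3 bytes, the first two fit (15 bytes) but the
   third would overflow the block, so the filter below masks it out for this
   cycle. Likewise, only the first insn issued on a cycle (which goes to
   decoder D0) may be longer than 8 bytes; later insns of that length are
   masked out.  */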
25917
25918 typedef struct ix86_first_cycle_multipass_data_ *
25919 ix86_first_cycle_multipass_data_t;
25920 typedef const struct ix86_first_cycle_multipass_data_ *
25921 const_ix86_first_cycle_multipass_data_t;
25922
25923 /* A variable to store target state across calls to max_issue within
25924 one cycle. */
25925 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
25926 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
25927
25928 /* Initialize DATA. */
25929 static void
25930 core2i7_first_cycle_multipass_init (void *_data)
25931 {
25932 ix86_first_cycle_multipass_data_t data
25933 = (ix86_first_cycle_multipass_data_t) _data;
25934
25935 data->ifetch_block_len = 0;
25936 data->ifetch_block_n_insns = 0;
25937 data->ready_try_change = NULL;
25938 data->ready_try_change_size = 0;
25939 }
25940
25941 /* Advancing the cycle; reset ifetch block counts. */
25942 static void
25943 core2i7_dfa_post_advance_cycle (void)
25944 {
25945 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
25946
25947 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25948
25949 data->ifetch_block_len = 0;
25950 data->ifetch_block_n_insns = 0;
25951 }
25952
25953 static int min_insn_size (rtx);
25954
25955 /* Filter out insns from ready_try that the core will not be able to issue
25956 on the current cycle due to decoder restrictions. */
25957 static void
25958 core2i7_first_cycle_multipass_filter_ready_try
25959 (const_ix86_first_cycle_multipass_data_t data,
25960 char *ready_try, int n_ready, bool first_cycle_insn_p)
25961 {
25962 while (n_ready--)
25963 {
25964 rtx insn;
25965 int insn_size;
25966
25967 if (ready_try[n_ready])
25968 continue;
25969
25970 insn = get_ready_element (n_ready);
25971 insn_size = min_insn_size (insn);
25972
25973 if (/* If this is too long an insn for a secondary decoder ... */
25974 (!first_cycle_insn_p
25975 && insn_size > core2i7_secondary_decoder_max_insn_size)
25976 /* ... or it would not fit into the ifetch block ... */
25977 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
25978 /* ... or the decoder is full already ... */
25979 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
25980 /* ... mask the insn out. */
25981 {
25982 ready_try[n_ready] = 1;
25983
25984 if (data->ready_try_change)
25985 bitmap_set_bit (data->ready_try_change, n_ready);
25986 }
25987 }
25988 }
25989
25990 /* Prepare for a new round of multipass lookahead scheduling. */
25991 static void
25992 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
25993 bool first_cycle_insn_p)
25994 {
25995 ix86_first_cycle_multipass_data_t data
25996 = (ix86_first_cycle_multipass_data_t) _data;
25997 const_ix86_first_cycle_multipass_data_t prev_data
25998 = ix86_first_cycle_multipass_data;
25999
26000 /* Restore the state from the end of the previous round. */
26001 data->ifetch_block_len = prev_data->ifetch_block_len;
26002 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
26003
26004 /* Filter instructions that cannot be issued on current cycle due to
26005 decoder restrictions. */
26006 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
26007 first_cycle_insn_p);
26008 }
26009
26010 /* INSN is being issued in current solution. Account for its impact on
26011 the decoder model. */
26012 static void
26013 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
26014 rtx insn, const void *_prev_data)
26015 {
26016 ix86_first_cycle_multipass_data_t data
26017 = (ix86_first_cycle_multipass_data_t) _data;
26018 const_ix86_first_cycle_multipass_data_t prev_data
26019 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
26020
26021 int insn_size = min_insn_size (insn);
26022
26023 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
26024 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
26025 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
26026 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
26027
26028 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
26029 if (!data->ready_try_change)
26030 {
26031 data->ready_try_change = sbitmap_alloc (n_ready);
26032 data->ready_try_change_size = n_ready;
26033 }
26034 else if (data->ready_try_change_size < n_ready)
26035 {
26036 data->ready_try_change = sbitmap_resize (data->ready_try_change,
26037 n_ready, 0);
26038 data->ready_try_change_size = n_ready;
26039 }
26040 bitmap_clear (data->ready_try_change);
26041
26042 /* Filter out insns from ready_try that the core will not be able to issue
26043 on the current cycle due to decoder restrictions. */
26044 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
26045 false);
26046 }
26047
26048 /* Revert the effect on ready_try. */
26049 static void
26050 core2i7_first_cycle_multipass_backtrack (const void *_data,
26051 char *ready_try,
26052 int n_ready ATTRIBUTE_UNUSED)
26053 {
26054 const_ix86_first_cycle_multipass_data_t data
26055 = (const_ix86_first_cycle_multipass_data_t) _data;
26056 unsigned int i = 0;
26057 sbitmap_iterator sbi;
26058
26059 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
26060 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
26061 {
26062 ready_try[i] = 0;
26063 }
26064 }
26065
26066 /* Save the result of multipass lookahead scheduling for the next round. */
26067 static void
26068 core2i7_first_cycle_multipass_end (const void *_data)
26069 {
26070 const_ix86_first_cycle_multipass_data_t data
26071 = (const_ix86_first_cycle_multipass_data_t) _data;
26072 ix86_first_cycle_multipass_data_t next_data
26073 = ix86_first_cycle_multipass_data;
26074
26075 if (data != NULL)
26076 {
26077 next_data->ifetch_block_len = data->ifetch_block_len;
26078 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
26079 }
26080 }
26081
26082 /* Deallocate target data. */
26083 static void
26084 core2i7_first_cycle_multipass_fini (void *_data)
26085 {
26086 ix86_first_cycle_multipass_data_t data
26087 = (ix86_first_cycle_multipass_data_t) _data;
26088
26089 if (data->ready_try_change)
26090 {
26091 sbitmap_free (data->ready_try_change);
26092 data->ready_try_change = NULL;
26093 data->ready_try_change_size = 0;
26094 }
26095 }
26096
26097 /* Prepare for scheduling pass. */
26098 static void
26099 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
26100 int verbose ATTRIBUTE_UNUSED,
26101 int max_uid ATTRIBUTE_UNUSED)
26102 {
26103 /* Install scheduling hooks for current CPU. Some of these hooks are used
26104 in time-critical parts of the scheduler, so we only set them up when
26105 they are actually used. */
26106 switch (ix86_tune)
26107 {
26108 case PROCESSOR_CORE2:
26109 case PROCESSOR_COREI7:
26110 case PROCESSOR_COREI7_AVX:
26111 case PROCESSOR_HASWELL:
26112 /* Do not perform multipass scheduling for pre-reload schedule
26113 to save compile time. */
26114 if (reload_completed)
26115 {
26116 targetm.sched.dfa_post_advance_cycle
26117 = core2i7_dfa_post_advance_cycle;
26118 targetm.sched.first_cycle_multipass_init
26119 = core2i7_first_cycle_multipass_init;
26120 targetm.sched.first_cycle_multipass_begin
26121 = core2i7_first_cycle_multipass_begin;
26122 targetm.sched.first_cycle_multipass_issue
26123 = core2i7_first_cycle_multipass_issue;
26124 targetm.sched.first_cycle_multipass_backtrack
26125 = core2i7_first_cycle_multipass_backtrack;
26126 targetm.sched.first_cycle_multipass_end
26127 = core2i7_first_cycle_multipass_end;
26128 targetm.sched.first_cycle_multipass_fini
26129 = core2i7_first_cycle_multipass_fini;
26130
26131 /* Set decoder parameters. */
26132 core2i7_secondary_decoder_max_insn_size = 8;
26133 core2i7_ifetch_block_size = 16;
26134 core2i7_ifetch_block_max_insns = 6;
26135 break;
26136 }
26137 /* ... Fall through ... */
26138 default:
26139 targetm.sched.dfa_post_advance_cycle = NULL;
26140 targetm.sched.first_cycle_multipass_init = NULL;
26141 targetm.sched.first_cycle_multipass_begin = NULL;
26142 targetm.sched.first_cycle_multipass_issue = NULL;
26143 targetm.sched.first_cycle_multipass_backtrack = NULL;
26144 targetm.sched.first_cycle_multipass_end = NULL;
26145 targetm.sched.first_cycle_multipass_fini = NULL;
26146 break;
26147 }
26148 }
26149
26150 \f
26151 /* Compute the alignment given to a constant that is being placed in memory.
26152 EXP is the constant and ALIGN is the alignment that the object would
26153 ordinarily have.
26154 The value of this function is used instead of that alignment to align
26155 the object. */
26156
26157 int
26158 ix86_constant_alignment (tree exp, int align)
26159 {
26160 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
26161 || TREE_CODE (exp) == INTEGER_CST)
26162 {
26163 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
26164 return 64;
26165 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
26166 return 128;
26167 }
26168 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
26169 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
26170 return BITS_PER_WORD;
26171
26172 return align;
26173 }
26174
26175 /* Compute the alignment for a static variable.
26176 TYPE is the data type, and ALIGN is the alignment that
26177 the object would ordinarily have. The value of this function is used
26178 instead of that alignment to align the object. */
26179
26180 int
26181 ix86_data_alignment (tree type, int align, bool opt)
26182 {
26183 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
26184
26185 if (opt
26186 && AGGREGATE_TYPE_P (type)
26187 && TYPE_SIZE (type)
26188 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26189 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
26190 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
26191 && align < max_align)
26192 align = max_align;
26193
26194 /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
26195 to a 16-byte boundary. */
26196 if (TARGET_64BIT)
26197 {
26198 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
26199 && TYPE_SIZE (type)
26200 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26201 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
26202 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
26203 return 128;
26204 }
26205
26206 if (!opt)
26207 return align;
26208
26209 if (TREE_CODE (type) == ARRAY_TYPE)
26210 {
26211 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
26212 return 64;
26213 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
26214 return 128;
26215 }
26216 else if (TREE_CODE (type) == COMPLEX_TYPE)
26217 {
26218
26219 if (TYPE_MODE (type) == DCmode && align < 64)
26220 return 64;
26221 if ((TYPE_MODE (type) == XCmode
26222 || TYPE_MODE (type) == TCmode) && align < 128)
26223 return 128;
26224 }
26225 else if ((TREE_CODE (type) == RECORD_TYPE
26226 || TREE_CODE (type) == UNION_TYPE
26227 || TREE_CODE (type) == QUAL_UNION_TYPE)
26228 && TYPE_FIELDS (type))
26229 {
26230 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
26231 return 64;
26232 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
26233 return 128;
26234 }
26235 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
26236 || TREE_CODE (type) == INTEGER_TYPE)
26237 {
26238 if (TYPE_MODE (type) == DFmode && align < 64)
26239 return 64;
26240 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
26241 return 128;
26242 }
26243
26244 return align;
26245 }
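/* Informally, the effect of ix86_data_alignment on typical declarations
   (a sketch assuming 64-bit compilation with optimization enabled; the exact
   numbers follow from the code above):

       static double d;        -> at least 64-bit alignment
       static char buf[16];    -> at least 128-bit (16-byte) alignment,
                                  per the ABI rule above
       static char small[8];   -> keeps its natural alignment.  */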
26246
26247 /* Compute the alignment for a local variable or a stack slot. EXP is
26248 the data type or decl itself, MODE is the widest mode available and
26249 ALIGN is the alignment that the object would ordinarily have. The
26250 value of this macro is used instead of that alignment to align the
26251 object. */
26252
26253 unsigned int
26254 ix86_local_alignment (tree exp, enum machine_mode mode,
26255 unsigned int align)
26256 {
26257 tree type, decl;
26258
26259 if (exp && DECL_P (exp))
26260 {
26261 type = TREE_TYPE (exp);
26262 decl = exp;
26263 }
26264 else
26265 {
26266 type = exp;
26267 decl = NULL;
26268 }
26269
26270 /* Don't do dynamic stack realignment for long long objects with
26271 -mpreferred-stack-boundary=2. */
26272 if (!TARGET_64BIT
26273 && align == 64
26274 && ix86_preferred_stack_boundary < 64
26275 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
26276 && (!type || !TYPE_USER_ALIGN (type))
26277 && (!decl || !DECL_USER_ALIGN (decl)))
26278 align = 32;
26279
26280 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
26281 register in MODE. We will return the larger of the XFmode and
26282 DFmode alignments. */
26283 if (!type)
26284 {
26285 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
26286 align = GET_MODE_ALIGNMENT (DFmode);
26287 return align;
26288 }
26289
26290 /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
26291 to a 16-byte boundary. The exact wording is:
26292
26293 An array uses the same alignment as its elements, except that a local or
26294 global array variable of length at least 16 bytes or
26295 a C99 variable-length array variable always has alignment of at least 16 bytes.
26296
26297 This was added to allow use of aligned SSE instructions on arrays. The
26298 rule is meant for static storage (where the compiler cannot do the analysis
26299 by itself). We follow it for automatic variables only when convenient:
26300 we fully control everything in the function being compiled, and functions
26301 from other units cannot rely on the alignment.
26302
26303 Exclude the va_list type. It is the common case of a local array where
26304 we cannot benefit from the alignment.
26305
26306 TODO: Probably one should optimize for size only when the variable does not escape. */
26307 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
26308 && TARGET_SSE)
26309 {
26310 if (AGGREGATE_TYPE_P (type)
26311 && (va_list_type_node == NULL_TREE
26312 || (TYPE_MAIN_VARIANT (type)
26313 != TYPE_MAIN_VARIANT (va_list_type_node)))
26314 && TYPE_SIZE (type)
26315 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26316 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
26317 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
26318 return 128;
26319 }
26320 if (TREE_CODE (type) == ARRAY_TYPE)
26321 {
26322 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
26323 return 64;
26324 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
26325 return 128;
26326 }
26327 else if (TREE_CODE (type) == COMPLEX_TYPE)
26328 {
26329 if (TYPE_MODE (type) == DCmode && align < 64)
26330 return 64;
26331 if ((TYPE_MODE (type) == XCmode
26332 || TYPE_MODE (type) == TCmode) && align < 128)
26333 return 128;
26334 }
26335 else if ((TREE_CODE (type) == RECORD_TYPE
26336 || TREE_CODE (type) == UNION_TYPE
26337 || TREE_CODE (type) == QUAL_UNION_TYPE)
26338 && TYPE_FIELDS (type))
26339 {
26340 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
26341 return 64;
26342 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
26343 return 128;
26344 }
26345 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
26346 || TREE_CODE (type) == INTEGER_TYPE)
26347 {
26348
26349 if (TYPE_MODE (type) == DFmode && align < 64)
26350 return 64;
26351 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
26352 return 128;
26353 }
26354 return align;
26355 }
26356
26357 /* Compute the minimum required alignment for dynamic stack realignment
26358 purposes for a local variable, parameter or a stack slot. EXP is
26359 the data type or decl itself, MODE is its mode and ALIGN is the
26360 alignment that the object would ordinarily have. */
26361
26362 unsigned int
26363 ix86_minimum_alignment (tree exp, enum machine_mode mode,
26364 unsigned int align)
26365 {
26366 tree type, decl;
26367
26368 if (exp && DECL_P (exp))
26369 {
26370 type = TREE_TYPE (exp);
26371 decl = exp;
26372 }
26373 else
26374 {
26375 type = exp;
26376 decl = NULL;
26377 }
26378
26379 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
26380 return align;
26381
26382 /* Don't do dynamic stack realignment for long long objects with
26383 -mpreferred-stack-boundary=2. */
26384 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
26385 && (!type || !TYPE_USER_ALIGN (type))
26386 && (!decl || !DECL_USER_ALIGN (decl)))
26387 return 32;
26388
26389 return align;
26390 }
26391 \f
26392 /* Find a location for the static chain incoming to a nested function.
26393 This is a register, unless all free registers are used by arguments. */
26394
26395 static rtx
26396 ix86_static_chain (const_tree fndecl, bool incoming_p)
26397 {
26398 unsigned regno;
26399
26400 if (!DECL_STATIC_CHAIN (fndecl))
26401 return NULL;
26402
26403 if (TARGET_64BIT)
26404 {
26405 /* We always use R10 in 64-bit mode. */
26406 regno = R10_REG;
26407 }
26408 else
26409 {
26410 tree fntype;
26411 unsigned int ccvt;
26412
26413 /* By default in 32-bit mode we use ECX to pass the static chain. */
26414 regno = CX_REG;
26415
26416 fntype = TREE_TYPE (fndecl);
26417 ccvt = ix86_get_callcvt (fntype);
26418 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
26419 {
26420 /* Fastcall functions use ecx/edx for arguments, which leaves
26421 us with EAX for the static chain.
26422 Thiscall functions use ecx for arguments, which also
26423 leaves us with EAX for the static chain. */
26424 regno = AX_REG;
26425 }
26426 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
26427 {
26428 /* Thiscall functions use ecx for arguments, which leaves
26429 us with EAX and EDX for the static chain.
26430 We use EAX for ABI compatibility. */
26431 regno = AX_REG;
26432 }
26433 else if (ix86_function_regparm (fntype, fndecl) == 3)
26434 {
26435 /* For regparm 3, we have no free call-clobbered registers in
26436 which to store the static chain. In order to implement this,
26437 we have the trampoline push the static chain to the stack.
26438 However, we can't push a value below the return address when
26439 we call the nested function directly, so we have to use an
26440 alternate entry point. For this we use ESI, and have the
26441 alternate entry point push ESI, so that things appear the
26442 same once we're executing the nested function. */
26443 if (incoming_p)
26444 {
26445 if (fndecl == current_function_decl)
26446 ix86_static_chain_on_stack = true;
26447 return gen_frame_mem (SImode,
26448 plus_constant (Pmode,
26449 arg_pointer_rtx, -8));
26450 }
26451 regno = SI_REG;
26452 }
26453 }
26454
26455 return gen_rtx_REG (Pmode, regno);
26456 }
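/* In summary (a sketch of the selection above): a 64-bit nested function
   receives its static chain in %r10; a plain 32-bit nested function in %ecx;
   fastcall and thiscall functions in %eax; and with regparm(3) the chain
   ends up on the stack, pushed either by the trampoline or by an alternate
   entry point that pushes %esi, as set up by the trampoline code below.  */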
26457
26458 /* Emit RTL insns to initialize the variable parts of a trampoline.
26459 FNDECL is the decl of the target address; M_TRAMP is a MEM for
26460 the trampoline, and CHAIN_VALUE is an RTX for the static chain
26461 to be passed to the target function. */
26462
26463 static void
26464 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
26465 {
26466 rtx mem, fnaddr;
26467 int opcode;
26468 int offset = 0;
26469
26470 fnaddr = XEXP (DECL_RTL (fndecl), 0);
26471
26472 if (TARGET_64BIT)
26473 {
26474 int size;
26475
26476 /* Load the function address into r11. Try to load the address using
26477 the shorter movl instead of movabs. We may want to support
26478 movq for kernel mode, but the kernel does not use trampolines at
26479 the moment. FNADDR is a 32-bit address and may not be in
26480 DImode when ptr_mode == SImode. Always use movl in this
26481 case. */
26482 if (ptr_mode == SImode
26483 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
26484 {
26485 fnaddr = copy_addr_to_reg (fnaddr);
26486
26487 mem = adjust_address (m_tramp, HImode, offset);
26488 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
26489
26490 mem = adjust_address (m_tramp, SImode, offset + 2);
26491 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
26492 offset += 6;
26493 }
26494 else
26495 {
26496 mem = adjust_address (m_tramp, HImode, offset);
26497 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
26498
26499 mem = adjust_address (m_tramp, DImode, offset + 2);
26500 emit_move_insn (mem, fnaddr);
26501 offset += 10;
26502 }
26503
26504 /* Load static chain using movabs to r10. Use the shorter movl
26505 instead of movabs when ptr_mode == SImode. */
26506 if (ptr_mode == SImode)
26507 {
26508 opcode = 0xba41;
26509 size = 6;
26510 }
26511 else
26512 {
26513 opcode = 0xba49;
26514 size = 10;
26515 }
26516
26517 mem = adjust_address (m_tramp, HImode, offset);
26518 emit_move_insn (mem, gen_int_mode (opcode, HImode));
26519
26520 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
26521 emit_move_insn (mem, chain_value);
26522 offset += size;
26523
26524 /* Jump to r11; the last (unused) byte is a nop, only there to
26525 pad the write out to a single 32-bit store. */
26526 mem = adjust_address (m_tramp, SImode, offset);
26527 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
26528 offset += 4;
26529 }
26530 else
26531 {
26532 rtx disp, chain;
26533
26534 /* Depending on the static chain location, either load a register
26535 with a constant, or push the constant to the stack. All of the
26536 instructions are the same size. */
26537 chain = ix86_static_chain (fndecl, true);
26538 if (REG_P (chain))
26539 {
26540 switch (REGNO (chain))
26541 {
26542 case AX_REG:
26543 opcode = 0xb8; break;
26544 case CX_REG:
26545 opcode = 0xb9; break;
26546 default:
26547 gcc_unreachable ();
26548 }
26549 }
26550 else
26551 opcode = 0x68;
26552
26553 mem = adjust_address (m_tramp, QImode, offset);
26554 emit_move_insn (mem, gen_int_mode (opcode, QImode));
26555
26556 mem = adjust_address (m_tramp, SImode, offset + 1);
26557 emit_move_insn (mem, chain_value);
26558 offset += 5;
26559
26560 mem = adjust_address (m_tramp, QImode, offset);
26561 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
26562
26563 mem = adjust_address (m_tramp, SImode, offset + 1);
26564
26565 /* Compute offset from the end of the jmp to the target function.
26566 In the case in which the trampoline stores the static chain on
26567 the stack, we need to skip the first insn which pushes the
26568 (call-saved) register static chain; this push is 1 byte. */
26569 offset += 5;
26570 disp = expand_binop (SImode, sub_optab, fnaddr,
26571 plus_constant (Pmode, XEXP (m_tramp, 0),
26572 offset - (MEM_P (chain) ? 1 : 0)),
26573 NULL_RTX, 1, OPTAB_DIRECT);
26574 emit_move_insn (mem, disp);
26575 }
26576
26577 gcc_assert (offset <= TRAMPOLINE_SIZE);
26578
26579 #ifdef HAVE_ENABLE_EXECUTE_STACK
26580 #ifdef CHECK_EXECUTE_STACK_ENABLED
26581 if (CHECK_EXECUTE_STACK_ENABLED)
26582 #endif
26583 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
26584 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
26585 #endif
26586 }
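/* For reference, the 64-bit trampoline emitted above is, byte for byte,
   roughly the following (assuming ptr_mode == DImode; the SImode path uses
   the shorter 41 bb / 41 ba movl forms instead):

       49 bb <fnaddr, 8 bytes>    movabs $fnaddr, %r11
       49 ba <chain, 8 bytes>     movabs $chain, %r10
       49 ff e3                   jmp *%r11
       90                         nop (pads the write to a 4-byte store)

   The 32-bit form is a mov (or push) of the static chain constant followed
   by a 5-byte "e9" relative jmp to the target function.  */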
26587 \f
26588 /* The following file contains several enumerations and data structures
26589 built from the definitions in i386-builtin-types.def. */
26590
26591 #include "i386-builtin-types.inc"
26592
26593 /* Table for the ix86 builtin non-function types. */
26594 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
26595
26596 /* Retrieve an element from the above table, building some of
26597 the types lazily. */
26598
26599 static tree
26600 ix86_get_builtin_type (enum ix86_builtin_type tcode)
26601 {
26602 unsigned int index;
26603 tree type, itype;
26604
26605 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
26606
26607 type = ix86_builtin_type_tab[(int) tcode];
26608 if (type != NULL)
26609 return type;
26610
26611 gcc_assert (tcode > IX86_BT_LAST_PRIM);
26612 if (tcode <= IX86_BT_LAST_VECT)
26613 {
26614 enum machine_mode mode;
26615
26616 index = tcode - IX86_BT_LAST_PRIM - 1;
26617 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
26618 mode = ix86_builtin_type_vect_mode[index];
26619
26620 type = build_vector_type_for_mode (itype, mode);
26621 }
26622 else
26623 {
26624 int quals;
26625
26626 index = tcode - IX86_BT_LAST_VECT - 1;
26627 if (tcode <= IX86_BT_LAST_PTR)
26628 quals = TYPE_UNQUALIFIED;
26629 else
26630 quals = TYPE_QUAL_CONST;
26631
26632 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
26633 if (quals != TYPE_UNQUALIFIED)
26634 itype = build_qualified_type (itype, quals);
26635
26636 type = build_pointer_type (itype);
26637 }
26638
26639 ix86_builtin_type_tab[(int) tcode] = type;
26640 return type;
26641 }
26642
26643 /* Table for the ix86 builtin function types. */
26644 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
26645
26646 /* Retrieve an element from the above table, building some of
26647 the types lazily. */
26648
26649 static tree
26650 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
26651 {
26652 tree type;
26653
26654 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
26655
26656 type = ix86_builtin_func_type_tab[(int) tcode];
26657 if (type != NULL)
26658 return type;
26659
26660 if (tcode <= IX86_BT_LAST_FUNC)
26661 {
26662 unsigned start = ix86_builtin_func_start[(int) tcode];
26663 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
26664 tree rtype, atype, args = void_list_node;
26665 unsigned i;
26666
26667 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
26668 for (i = after - 1; i > start; --i)
26669 {
26670 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
26671 args = tree_cons (NULL, atype, args);
26672 }
26673
26674 type = build_function_type (rtype, args);
26675 }
26676 else
26677 {
26678 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
26679 enum ix86_builtin_func_type icode;
26680
26681 icode = ix86_builtin_func_alias_base[index];
26682 type = ix86_get_builtin_func_type (icode);
26683 }
26684
26685 ix86_builtin_func_type_tab[(int) tcode] = type;
26686 return type;
26687 }
26688
26689
26690 /* Codes for all the SSE/MMX builtins. */
26691 enum ix86_builtins
26692 {
26693 IX86_BUILTIN_ADDPS,
26694 IX86_BUILTIN_ADDSS,
26695 IX86_BUILTIN_DIVPS,
26696 IX86_BUILTIN_DIVSS,
26697 IX86_BUILTIN_MULPS,
26698 IX86_BUILTIN_MULSS,
26699 IX86_BUILTIN_SUBPS,
26700 IX86_BUILTIN_SUBSS,
26701
26702 IX86_BUILTIN_CMPEQPS,
26703 IX86_BUILTIN_CMPLTPS,
26704 IX86_BUILTIN_CMPLEPS,
26705 IX86_BUILTIN_CMPGTPS,
26706 IX86_BUILTIN_CMPGEPS,
26707 IX86_BUILTIN_CMPNEQPS,
26708 IX86_BUILTIN_CMPNLTPS,
26709 IX86_BUILTIN_CMPNLEPS,
26710 IX86_BUILTIN_CMPNGTPS,
26711 IX86_BUILTIN_CMPNGEPS,
26712 IX86_BUILTIN_CMPORDPS,
26713 IX86_BUILTIN_CMPUNORDPS,
26714 IX86_BUILTIN_CMPEQSS,
26715 IX86_BUILTIN_CMPLTSS,
26716 IX86_BUILTIN_CMPLESS,
26717 IX86_BUILTIN_CMPNEQSS,
26718 IX86_BUILTIN_CMPNLTSS,
26719 IX86_BUILTIN_CMPNLESS,
26720 IX86_BUILTIN_CMPORDSS,
26721 IX86_BUILTIN_CMPUNORDSS,
26722
26723 IX86_BUILTIN_COMIEQSS,
26724 IX86_BUILTIN_COMILTSS,
26725 IX86_BUILTIN_COMILESS,
26726 IX86_BUILTIN_COMIGTSS,
26727 IX86_BUILTIN_COMIGESS,
26728 IX86_BUILTIN_COMINEQSS,
26729 IX86_BUILTIN_UCOMIEQSS,
26730 IX86_BUILTIN_UCOMILTSS,
26731 IX86_BUILTIN_UCOMILESS,
26732 IX86_BUILTIN_UCOMIGTSS,
26733 IX86_BUILTIN_UCOMIGESS,
26734 IX86_BUILTIN_UCOMINEQSS,
26735
26736 IX86_BUILTIN_CVTPI2PS,
26737 IX86_BUILTIN_CVTPS2PI,
26738 IX86_BUILTIN_CVTSI2SS,
26739 IX86_BUILTIN_CVTSI642SS,
26740 IX86_BUILTIN_CVTSS2SI,
26741 IX86_BUILTIN_CVTSS2SI64,
26742 IX86_BUILTIN_CVTTPS2PI,
26743 IX86_BUILTIN_CVTTSS2SI,
26744 IX86_BUILTIN_CVTTSS2SI64,
26745
26746 IX86_BUILTIN_MAXPS,
26747 IX86_BUILTIN_MAXSS,
26748 IX86_BUILTIN_MINPS,
26749 IX86_BUILTIN_MINSS,
26750
26751 IX86_BUILTIN_LOADUPS,
26752 IX86_BUILTIN_STOREUPS,
26753 IX86_BUILTIN_MOVSS,
26754
26755 IX86_BUILTIN_MOVHLPS,
26756 IX86_BUILTIN_MOVLHPS,
26757 IX86_BUILTIN_LOADHPS,
26758 IX86_BUILTIN_LOADLPS,
26759 IX86_BUILTIN_STOREHPS,
26760 IX86_BUILTIN_STORELPS,
26761
26762 IX86_BUILTIN_MASKMOVQ,
26763 IX86_BUILTIN_MOVMSKPS,
26764 IX86_BUILTIN_PMOVMSKB,
26765
26766 IX86_BUILTIN_MOVNTPS,
26767 IX86_BUILTIN_MOVNTQ,
26768
26769 IX86_BUILTIN_LOADDQU,
26770 IX86_BUILTIN_STOREDQU,
26771
26772 IX86_BUILTIN_PACKSSWB,
26773 IX86_BUILTIN_PACKSSDW,
26774 IX86_BUILTIN_PACKUSWB,
26775
26776 IX86_BUILTIN_PADDB,
26777 IX86_BUILTIN_PADDW,
26778 IX86_BUILTIN_PADDD,
26779 IX86_BUILTIN_PADDQ,
26780 IX86_BUILTIN_PADDSB,
26781 IX86_BUILTIN_PADDSW,
26782 IX86_BUILTIN_PADDUSB,
26783 IX86_BUILTIN_PADDUSW,
26784 IX86_BUILTIN_PSUBB,
26785 IX86_BUILTIN_PSUBW,
26786 IX86_BUILTIN_PSUBD,
26787 IX86_BUILTIN_PSUBQ,
26788 IX86_BUILTIN_PSUBSB,
26789 IX86_BUILTIN_PSUBSW,
26790 IX86_BUILTIN_PSUBUSB,
26791 IX86_BUILTIN_PSUBUSW,
26792
26793 IX86_BUILTIN_PAND,
26794 IX86_BUILTIN_PANDN,
26795 IX86_BUILTIN_POR,
26796 IX86_BUILTIN_PXOR,
26797
26798 IX86_BUILTIN_PAVGB,
26799 IX86_BUILTIN_PAVGW,
26800
26801 IX86_BUILTIN_PCMPEQB,
26802 IX86_BUILTIN_PCMPEQW,
26803 IX86_BUILTIN_PCMPEQD,
26804 IX86_BUILTIN_PCMPGTB,
26805 IX86_BUILTIN_PCMPGTW,
26806 IX86_BUILTIN_PCMPGTD,
26807
26808 IX86_BUILTIN_PMADDWD,
26809
26810 IX86_BUILTIN_PMAXSW,
26811 IX86_BUILTIN_PMAXUB,
26812 IX86_BUILTIN_PMINSW,
26813 IX86_BUILTIN_PMINUB,
26814
26815 IX86_BUILTIN_PMULHUW,
26816 IX86_BUILTIN_PMULHW,
26817 IX86_BUILTIN_PMULLW,
26818
26819 IX86_BUILTIN_PSADBW,
26820 IX86_BUILTIN_PSHUFW,
26821
26822 IX86_BUILTIN_PSLLW,
26823 IX86_BUILTIN_PSLLD,
26824 IX86_BUILTIN_PSLLQ,
26825 IX86_BUILTIN_PSRAW,
26826 IX86_BUILTIN_PSRAD,
26827 IX86_BUILTIN_PSRLW,
26828 IX86_BUILTIN_PSRLD,
26829 IX86_BUILTIN_PSRLQ,
26830 IX86_BUILTIN_PSLLWI,
26831 IX86_BUILTIN_PSLLDI,
26832 IX86_BUILTIN_PSLLQI,
26833 IX86_BUILTIN_PSRAWI,
26834 IX86_BUILTIN_PSRADI,
26835 IX86_BUILTIN_PSRLWI,
26836 IX86_BUILTIN_PSRLDI,
26837 IX86_BUILTIN_PSRLQI,
26838
26839 IX86_BUILTIN_PUNPCKHBW,
26840 IX86_BUILTIN_PUNPCKHWD,
26841 IX86_BUILTIN_PUNPCKHDQ,
26842 IX86_BUILTIN_PUNPCKLBW,
26843 IX86_BUILTIN_PUNPCKLWD,
26844 IX86_BUILTIN_PUNPCKLDQ,
26845
26846 IX86_BUILTIN_SHUFPS,
26847
26848 IX86_BUILTIN_RCPPS,
26849 IX86_BUILTIN_RCPSS,
26850 IX86_BUILTIN_RSQRTPS,
26851 IX86_BUILTIN_RSQRTPS_NR,
26852 IX86_BUILTIN_RSQRTSS,
26853 IX86_BUILTIN_RSQRTF,
26854 IX86_BUILTIN_SQRTPS,
26855 IX86_BUILTIN_SQRTPS_NR,
26856 IX86_BUILTIN_SQRTSS,
26857
26858 IX86_BUILTIN_UNPCKHPS,
26859 IX86_BUILTIN_UNPCKLPS,
26860
26861 IX86_BUILTIN_ANDPS,
26862 IX86_BUILTIN_ANDNPS,
26863 IX86_BUILTIN_ORPS,
26864 IX86_BUILTIN_XORPS,
26865
26866 IX86_BUILTIN_EMMS,
26867 IX86_BUILTIN_LDMXCSR,
26868 IX86_BUILTIN_STMXCSR,
26869 IX86_BUILTIN_SFENCE,
26870
26871 IX86_BUILTIN_FXSAVE,
26872 IX86_BUILTIN_FXRSTOR,
26873 IX86_BUILTIN_FXSAVE64,
26874 IX86_BUILTIN_FXRSTOR64,
26875
26876 IX86_BUILTIN_XSAVE,
26877 IX86_BUILTIN_XRSTOR,
26878 IX86_BUILTIN_XSAVE64,
26879 IX86_BUILTIN_XRSTOR64,
26880
26881 IX86_BUILTIN_XSAVEOPT,
26882 IX86_BUILTIN_XSAVEOPT64,
26883
26884 /* 3DNow! Original */
26885 IX86_BUILTIN_FEMMS,
26886 IX86_BUILTIN_PAVGUSB,
26887 IX86_BUILTIN_PF2ID,
26888 IX86_BUILTIN_PFACC,
26889 IX86_BUILTIN_PFADD,
26890 IX86_BUILTIN_PFCMPEQ,
26891 IX86_BUILTIN_PFCMPGE,
26892 IX86_BUILTIN_PFCMPGT,
26893 IX86_BUILTIN_PFMAX,
26894 IX86_BUILTIN_PFMIN,
26895 IX86_BUILTIN_PFMUL,
26896 IX86_BUILTIN_PFRCP,
26897 IX86_BUILTIN_PFRCPIT1,
26898 IX86_BUILTIN_PFRCPIT2,
26899 IX86_BUILTIN_PFRSQIT1,
26900 IX86_BUILTIN_PFRSQRT,
26901 IX86_BUILTIN_PFSUB,
26902 IX86_BUILTIN_PFSUBR,
26903 IX86_BUILTIN_PI2FD,
26904 IX86_BUILTIN_PMULHRW,
26905
26906 /* 3DNow! Athlon Extensions */
26907 IX86_BUILTIN_PF2IW,
26908 IX86_BUILTIN_PFNACC,
26909 IX86_BUILTIN_PFPNACC,
26910 IX86_BUILTIN_PI2FW,
26911 IX86_BUILTIN_PSWAPDSI,
26912 IX86_BUILTIN_PSWAPDSF,
26913
26914 /* SSE2 */
26915 IX86_BUILTIN_ADDPD,
26916 IX86_BUILTIN_ADDSD,
26917 IX86_BUILTIN_DIVPD,
26918 IX86_BUILTIN_DIVSD,
26919 IX86_BUILTIN_MULPD,
26920 IX86_BUILTIN_MULSD,
26921 IX86_BUILTIN_SUBPD,
26922 IX86_BUILTIN_SUBSD,
26923
26924 IX86_BUILTIN_CMPEQPD,
26925 IX86_BUILTIN_CMPLTPD,
26926 IX86_BUILTIN_CMPLEPD,
26927 IX86_BUILTIN_CMPGTPD,
26928 IX86_BUILTIN_CMPGEPD,
26929 IX86_BUILTIN_CMPNEQPD,
26930 IX86_BUILTIN_CMPNLTPD,
26931 IX86_BUILTIN_CMPNLEPD,
26932 IX86_BUILTIN_CMPNGTPD,
26933 IX86_BUILTIN_CMPNGEPD,
26934 IX86_BUILTIN_CMPORDPD,
26935 IX86_BUILTIN_CMPUNORDPD,
26936 IX86_BUILTIN_CMPEQSD,
26937 IX86_BUILTIN_CMPLTSD,
26938 IX86_BUILTIN_CMPLESD,
26939 IX86_BUILTIN_CMPNEQSD,
26940 IX86_BUILTIN_CMPNLTSD,
26941 IX86_BUILTIN_CMPNLESD,
26942 IX86_BUILTIN_CMPORDSD,
26943 IX86_BUILTIN_CMPUNORDSD,
26944
26945 IX86_BUILTIN_COMIEQSD,
26946 IX86_BUILTIN_COMILTSD,
26947 IX86_BUILTIN_COMILESD,
26948 IX86_BUILTIN_COMIGTSD,
26949 IX86_BUILTIN_COMIGESD,
26950 IX86_BUILTIN_COMINEQSD,
26951 IX86_BUILTIN_UCOMIEQSD,
26952 IX86_BUILTIN_UCOMILTSD,
26953 IX86_BUILTIN_UCOMILESD,
26954 IX86_BUILTIN_UCOMIGTSD,
26955 IX86_BUILTIN_UCOMIGESD,
26956 IX86_BUILTIN_UCOMINEQSD,
26957
26958 IX86_BUILTIN_MAXPD,
26959 IX86_BUILTIN_MAXSD,
26960 IX86_BUILTIN_MINPD,
26961 IX86_BUILTIN_MINSD,
26962
26963 IX86_BUILTIN_ANDPD,
26964 IX86_BUILTIN_ANDNPD,
26965 IX86_BUILTIN_ORPD,
26966 IX86_BUILTIN_XORPD,
26967
26968 IX86_BUILTIN_SQRTPD,
26969 IX86_BUILTIN_SQRTSD,
26970
26971 IX86_BUILTIN_UNPCKHPD,
26972 IX86_BUILTIN_UNPCKLPD,
26973
26974 IX86_BUILTIN_SHUFPD,
26975
26976 IX86_BUILTIN_LOADUPD,
26977 IX86_BUILTIN_STOREUPD,
26978 IX86_BUILTIN_MOVSD,
26979
26980 IX86_BUILTIN_LOADHPD,
26981 IX86_BUILTIN_LOADLPD,
26982
26983 IX86_BUILTIN_CVTDQ2PD,
26984 IX86_BUILTIN_CVTDQ2PS,
26985
26986 IX86_BUILTIN_CVTPD2DQ,
26987 IX86_BUILTIN_CVTPD2PI,
26988 IX86_BUILTIN_CVTPD2PS,
26989 IX86_BUILTIN_CVTTPD2DQ,
26990 IX86_BUILTIN_CVTTPD2PI,
26991
26992 IX86_BUILTIN_CVTPI2PD,
26993 IX86_BUILTIN_CVTSI2SD,
26994 IX86_BUILTIN_CVTSI642SD,
26995
26996 IX86_BUILTIN_CVTSD2SI,
26997 IX86_BUILTIN_CVTSD2SI64,
26998 IX86_BUILTIN_CVTSD2SS,
26999 IX86_BUILTIN_CVTSS2SD,
27000 IX86_BUILTIN_CVTTSD2SI,
27001 IX86_BUILTIN_CVTTSD2SI64,
27002
27003 IX86_BUILTIN_CVTPS2DQ,
27004 IX86_BUILTIN_CVTPS2PD,
27005 IX86_BUILTIN_CVTTPS2DQ,
27006
27007 IX86_BUILTIN_MOVNTI,
27008 IX86_BUILTIN_MOVNTI64,
27009 IX86_BUILTIN_MOVNTPD,
27010 IX86_BUILTIN_MOVNTDQ,
27011
27012 IX86_BUILTIN_MOVQ128,
27013
27014 /* SSE2 MMX */
27015 IX86_BUILTIN_MASKMOVDQU,
27016 IX86_BUILTIN_MOVMSKPD,
27017 IX86_BUILTIN_PMOVMSKB128,
27018
27019 IX86_BUILTIN_PACKSSWB128,
27020 IX86_BUILTIN_PACKSSDW128,
27021 IX86_BUILTIN_PACKUSWB128,
27022
27023 IX86_BUILTIN_PADDB128,
27024 IX86_BUILTIN_PADDW128,
27025 IX86_BUILTIN_PADDD128,
27026 IX86_BUILTIN_PADDQ128,
27027 IX86_BUILTIN_PADDSB128,
27028 IX86_BUILTIN_PADDSW128,
27029 IX86_BUILTIN_PADDUSB128,
27030 IX86_BUILTIN_PADDUSW128,
27031 IX86_BUILTIN_PSUBB128,
27032 IX86_BUILTIN_PSUBW128,
27033 IX86_BUILTIN_PSUBD128,
27034 IX86_BUILTIN_PSUBQ128,
27035 IX86_BUILTIN_PSUBSB128,
27036 IX86_BUILTIN_PSUBSW128,
27037 IX86_BUILTIN_PSUBUSB128,
27038 IX86_BUILTIN_PSUBUSW128,
27039
27040 IX86_BUILTIN_PAND128,
27041 IX86_BUILTIN_PANDN128,
27042 IX86_BUILTIN_POR128,
27043 IX86_BUILTIN_PXOR128,
27044
27045 IX86_BUILTIN_PAVGB128,
27046 IX86_BUILTIN_PAVGW128,
27047
27048 IX86_BUILTIN_PCMPEQB128,
27049 IX86_BUILTIN_PCMPEQW128,
27050 IX86_BUILTIN_PCMPEQD128,
27051 IX86_BUILTIN_PCMPGTB128,
27052 IX86_BUILTIN_PCMPGTW128,
27053 IX86_BUILTIN_PCMPGTD128,
27054
27055 IX86_BUILTIN_PMADDWD128,
27056
27057 IX86_BUILTIN_PMAXSW128,
27058 IX86_BUILTIN_PMAXUB128,
27059 IX86_BUILTIN_PMINSW128,
27060 IX86_BUILTIN_PMINUB128,
27061
27062 IX86_BUILTIN_PMULUDQ,
27063 IX86_BUILTIN_PMULUDQ128,
27064 IX86_BUILTIN_PMULHUW128,
27065 IX86_BUILTIN_PMULHW128,
27066 IX86_BUILTIN_PMULLW128,
27067
27068 IX86_BUILTIN_PSADBW128,
27069 IX86_BUILTIN_PSHUFHW,
27070 IX86_BUILTIN_PSHUFLW,
27071 IX86_BUILTIN_PSHUFD,
27072
27073 IX86_BUILTIN_PSLLDQI128,
27074 IX86_BUILTIN_PSLLWI128,
27075 IX86_BUILTIN_PSLLDI128,
27076 IX86_BUILTIN_PSLLQI128,
27077 IX86_BUILTIN_PSRAWI128,
27078 IX86_BUILTIN_PSRADI128,
27079 IX86_BUILTIN_PSRLDQI128,
27080 IX86_BUILTIN_PSRLWI128,
27081 IX86_BUILTIN_PSRLDI128,
27082 IX86_BUILTIN_PSRLQI128,
27083
27084 IX86_BUILTIN_PSLLDQ128,
27085 IX86_BUILTIN_PSLLW128,
27086 IX86_BUILTIN_PSLLD128,
27087 IX86_BUILTIN_PSLLQ128,
27088 IX86_BUILTIN_PSRAW128,
27089 IX86_BUILTIN_PSRAD128,
27090 IX86_BUILTIN_PSRLW128,
27091 IX86_BUILTIN_PSRLD128,
27092 IX86_BUILTIN_PSRLQ128,
27093
27094 IX86_BUILTIN_PUNPCKHBW128,
27095 IX86_BUILTIN_PUNPCKHWD128,
27096 IX86_BUILTIN_PUNPCKHDQ128,
27097 IX86_BUILTIN_PUNPCKHQDQ128,
27098 IX86_BUILTIN_PUNPCKLBW128,
27099 IX86_BUILTIN_PUNPCKLWD128,
27100 IX86_BUILTIN_PUNPCKLDQ128,
27101 IX86_BUILTIN_PUNPCKLQDQ128,
27102
27103 IX86_BUILTIN_CLFLUSH,
27104 IX86_BUILTIN_MFENCE,
27105 IX86_BUILTIN_LFENCE,
27106 IX86_BUILTIN_PAUSE,
27107
27108 IX86_BUILTIN_FNSTENV,
27109 IX86_BUILTIN_FLDENV,
27110 IX86_BUILTIN_FNSTSW,
27111 IX86_BUILTIN_FNCLEX,
27112
27113 IX86_BUILTIN_BSRSI,
27114 IX86_BUILTIN_BSRDI,
27115 IX86_BUILTIN_RDPMC,
27116 IX86_BUILTIN_RDTSC,
27117 IX86_BUILTIN_RDTSCP,
27118 IX86_BUILTIN_ROLQI,
27119 IX86_BUILTIN_ROLHI,
27120 IX86_BUILTIN_RORQI,
27121 IX86_BUILTIN_RORHI,
27122
27123 /* SSE3. */
27124 IX86_BUILTIN_ADDSUBPS,
27125 IX86_BUILTIN_HADDPS,
27126 IX86_BUILTIN_HSUBPS,
27127 IX86_BUILTIN_MOVSHDUP,
27128 IX86_BUILTIN_MOVSLDUP,
27129 IX86_BUILTIN_ADDSUBPD,
27130 IX86_BUILTIN_HADDPD,
27131 IX86_BUILTIN_HSUBPD,
27132 IX86_BUILTIN_LDDQU,
27133
27134 IX86_BUILTIN_MONITOR,
27135 IX86_BUILTIN_MWAIT,
27136
27137 /* SSSE3. */
27138 IX86_BUILTIN_PHADDW,
27139 IX86_BUILTIN_PHADDD,
27140 IX86_BUILTIN_PHADDSW,
27141 IX86_BUILTIN_PHSUBW,
27142 IX86_BUILTIN_PHSUBD,
27143 IX86_BUILTIN_PHSUBSW,
27144 IX86_BUILTIN_PMADDUBSW,
27145 IX86_BUILTIN_PMULHRSW,
27146 IX86_BUILTIN_PSHUFB,
27147 IX86_BUILTIN_PSIGNB,
27148 IX86_BUILTIN_PSIGNW,
27149 IX86_BUILTIN_PSIGND,
27150 IX86_BUILTIN_PALIGNR,
27151 IX86_BUILTIN_PABSB,
27152 IX86_BUILTIN_PABSW,
27153 IX86_BUILTIN_PABSD,
27154
27155 IX86_BUILTIN_PHADDW128,
27156 IX86_BUILTIN_PHADDD128,
27157 IX86_BUILTIN_PHADDSW128,
27158 IX86_BUILTIN_PHSUBW128,
27159 IX86_BUILTIN_PHSUBD128,
27160 IX86_BUILTIN_PHSUBSW128,
27161 IX86_BUILTIN_PMADDUBSW128,
27162 IX86_BUILTIN_PMULHRSW128,
27163 IX86_BUILTIN_PSHUFB128,
27164 IX86_BUILTIN_PSIGNB128,
27165 IX86_BUILTIN_PSIGNW128,
27166 IX86_BUILTIN_PSIGND128,
27167 IX86_BUILTIN_PALIGNR128,
27168 IX86_BUILTIN_PABSB128,
27169 IX86_BUILTIN_PABSW128,
27170 IX86_BUILTIN_PABSD128,
27171
27172 /* AMDFAM10 - SSE4A New Instructions. */
27173 IX86_BUILTIN_MOVNTSD,
27174 IX86_BUILTIN_MOVNTSS,
27175 IX86_BUILTIN_EXTRQI,
27176 IX86_BUILTIN_EXTRQ,
27177 IX86_BUILTIN_INSERTQI,
27178 IX86_BUILTIN_INSERTQ,
27179
27180 /* SSE4.1. */
27181 IX86_BUILTIN_BLENDPD,
27182 IX86_BUILTIN_BLENDPS,
27183 IX86_BUILTIN_BLENDVPD,
27184 IX86_BUILTIN_BLENDVPS,
27185 IX86_BUILTIN_PBLENDVB128,
27186 IX86_BUILTIN_PBLENDW128,
27187
27188 IX86_BUILTIN_DPPD,
27189 IX86_BUILTIN_DPPS,
27190
27191 IX86_BUILTIN_INSERTPS128,
27192
27193 IX86_BUILTIN_MOVNTDQA,
27194 IX86_BUILTIN_MPSADBW128,
27195 IX86_BUILTIN_PACKUSDW128,
27196 IX86_BUILTIN_PCMPEQQ,
27197 IX86_BUILTIN_PHMINPOSUW128,
27198
27199 IX86_BUILTIN_PMAXSB128,
27200 IX86_BUILTIN_PMAXSD128,
27201 IX86_BUILTIN_PMAXUD128,
27202 IX86_BUILTIN_PMAXUW128,
27203
27204 IX86_BUILTIN_PMINSB128,
27205 IX86_BUILTIN_PMINSD128,
27206 IX86_BUILTIN_PMINUD128,
27207 IX86_BUILTIN_PMINUW128,
27208
27209 IX86_BUILTIN_PMOVSXBW128,
27210 IX86_BUILTIN_PMOVSXBD128,
27211 IX86_BUILTIN_PMOVSXBQ128,
27212 IX86_BUILTIN_PMOVSXWD128,
27213 IX86_BUILTIN_PMOVSXWQ128,
27214 IX86_BUILTIN_PMOVSXDQ128,
27215
27216 IX86_BUILTIN_PMOVZXBW128,
27217 IX86_BUILTIN_PMOVZXBD128,
27218 IX86_BUILTIN_PMOVZXBQ128,
27219 IX86_BUILTIN_PMOVZXWD128,
27220 IX86_BUILTIN_PMOVZXWQ128,
27221 IX86_BUILTIN_PMOVZXDQ128,
27222
27223 IX86_BUILTIN_PMULDQ128,
27224 IX86_BUILTIN_PMULLD128,
27225
27226 IX86_BUILTIN_ROUNDSD,
27227 IX86_BUILTIN_ROUNDSS,
27228
27229 IX86_BUILTIN_ROUNDPD,
27230 IX86_BUILTIN_ROUNDPS,
27231
27232 IX86_BUILTIN_FLOORPD,
27233 IX86_BUILTIN_CEILPD,
27234 IX86_BUILTIN_TRUNCPD,
27235 IX86_BUILTIN_RINTPD,
27236 IX86_BUILTIN_ROUNDPD_AZ,
27237
27238 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
27239 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
27240 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
27241
27242 IX86_BUILTIN_FLOORPS,
27243 IX86_BUILTIN_CEILPS,
27244 IX86_BUILTIN_TRUNCPS,
27245 IX86_BUILTIN_RINTPS,
27246 IX86_BUILTIN_ROUNDPS_AZ,
27247
27248 IX86_BUILTIN_FLOORPS_SFIX,
27249 IX86_BUILTIN_CEILPS_SFIX,
27250 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
27251
27252 IX86_BUILTIN_PTESTZ,
27253 IX86_BUILTIN_PTESTC,
27254 IX86_BUILTIN_PTESTNZC,
27255
27256 IX86_BUILTIN_VEC_INIT_V2SI,
27257 IX86_BUILTIN_VEC_INIT_V4HI,
27258 IX86_BUILTIN_VEC_INIT_V8QI,
27259 IX86_BUILTIN_VEC_EXT_V2DF,
27260 IX86_BUILTIN_VEC_EXT_V2DI,
27261 IX86_BUILTIN_VEC_EXT_V4SF,
27262 IX86_BUILTIN_VEC_EXT_V4SI,
27263 IX86_BUILTIN_VEC_EXT_V8HI,
27264 IX86_BUILTIN_VEC_EXT_V2SI,
27265 IX86_BUILTIN_VEC_EXT_V4HI,
27266 IX86_BUILTIN_VEC_EXT_V16QI,
27267 IX86_BUILTIN_VEC_SET_V2DI,
27268 IX86_BUILTIN_VEC_SET_V4SF,
27269 IX86_BUILTIN_VEC_SET_V4SI,
27270 IX86_BUILTIN_VEC_SET_V8HI,
27271 IX86_BUILTIN_VEC_SET_V4HI,
27272 IX86_BUILTIN_VEC_SET_V16QI,
27273
27274 IX86_BUILTIN_VEC_PACK_SFIX,
27275 IX86_BUILTIN_VEC_PACK_SFIX256,
27276
27277 /* SSE4.2. */
27278 IX86_BUILTIN_CRC32QI,
27279 IX86_BUILTIN_CRC32HI,
27280 IX86_BUILTIN_CRC32SI,
27281 IX86_BUILTIN_CRC32DI,
27282
27283 IX86_BUILTIN_PCMPESTRI128,
27284 IX86_BUILTIN_PCMPESTRM128,
27285 IX86_BUILTIN_PCMPESTRA128,
27286 IX86_BUILTIN_PCMPESTRC128,
27287 IX86_BUILTIN_PCMPESTRO128,
27288 IX86_BUILTIN_PCMPESTRS128,
27289 IX86_BUILTIN_PCMPESTRZ128,
27290 IX86_BUILTIN_PCMPISTRI128,
27291 IX86_BUILTIN_PCMPISTRM128,
27292 IX86_BUILTIN_PCMPISTRA128,
27293 IX86_BUILTIN_PCMPISTRC128,
27294 IX86_BUILTIN_PCMPISTRO128,
27295 IX86_BUILTIN_PCMPISTRS128,
27296 IX86_BUILTIN_PCMPISTRZ128,
27297
27298 IX86_BUILTIN_PCMPGTQ,
27299
27300 /* AES instructions */
27301 IX86_BUILTIN_AESENC128,
27302 IX86_BUILTIN_AESENCLAST128,
27303 IX86_BUILTIN_AESDEC128,
27304 IX86_BUILTIN_AESDECLAST128,
27305 IX86_BUILTIN_AESIMC128,
27306 IX86_BUILTIN_AESKEYGENASSIST128,
27307
27308 /* PCLMUL instruction */
27309 IX86_BUILTIN_PCLMULQDQ128,
27310
27311 /* AVX */
27312 IX86_BUILTIN_ADDPD256,
27313 IX86_BUILTIN_ADDPS256,
27314 IX86_BUILTIN_ADDSUBPD256,
27315 IX86_BUILTIN_ADDSUBPS256,
27316 IX86_BUILTIN_ANDPD256,
27317 IX86_BUILTIN_ANDPS256,
27318 IX86_BUILTIN_ANDNPD256,
27319 IX86_BUILTIN_ANDNPS256,
27320 IX86_BUILTIN_BLENDPD256,
27321 IX86_BUILTIN_BLENDPS256,
27322 IX86_BUILTIN_BLENDVPD256,
27323 IX86_BUILTIN_BLENDVPS256,
27324 IX86_BUILTIN_DIVPD256,
27325 IX86_BUILTIN_DIVPS256,
27326 IX86_BUILTIN_DPPS256,
27327 IX86_BUILTIN_HADDPD256,
27328 IX86_BUILTIN_HADDPS256,
27329 IX86_BUILTIN_HSUBPD256,
27330 IX86_BUILTIN_HSUBPS256,
27331 IX86_BUILTIN_MAXPD256,
27332 IX86_BUILTIN_MAXPS256,
27333 IX86_BUILTIN_MINPD256,
27334 IX86_BUILTIN_MINPS256,
27335 IX86_BUILTIN_MULPD256,
27336 IX86_BUILTIN_MULPS256,
27337 IX86_BUILTIN_ORPD256,
27338 IX86_BUILTIN_ORPS256,
27339 IX86_BUILTIN_SHUFPD256,
27340 IX86_BUILTIN_SHUFPS256,
27341 IX86_BUILTIN_SUBPD256,
27342 IX86_BUILTIN_SUBPS256,
27343 IX86_BUILTIN_XORPD256,
27344 IX86_BUILTIN_XORPS256,
27345 IX86_BUILTIN_CMPSD,
27346 IX86_BUILTIN_CMPSS,
27347 IX86_BUILTIN_CMPPD,
27348 IX86_BUILTIN_CMPPS,
27349 IX86_BUILTIN_CMPPD256,
27350 IX86_BUILTIN_CMPPS256,
27351 IX86_BUILTIN_CVTDQ2PD256,
27352 IX86_BUILTIN_CVTDQ2PS256,
27353 IX86_BUILTIN_CVTPD2PS256,
27354 IX86_BUILTIN_CVTPS2DQ256,
27355 IX86_BUILTIN_CVTPS2PD256,
27356 IX86_BUILTIN_CVTTPD2DQ256,
27357 IX86_BUILTIN_CVTPD2DQ256,
27358 IX86_BUILTIN_CVTTPS2DQ256,
27359 IX86_BUILTIN_EXTRACTF128PD256,
27360 IX86_BUILTIN_EXTRACTF128PS256,
27361 IX86_BUILTIN_EXTRACTF128SI256,
27362 IX86_BUILTIN_VZEROALL,
27363 IX86_BUILTIN_VZEROUPPER,
27364 IX86_BUILTIN_VPERMILVARPD,
27365 IX86_BUILTIN_VPERMILVARPS,
27366 IX86_BUILTIN_VPERMILVARPD256,
27367 IX86_BUILTIN_VPERMILVARPS256,
27368 IX86_BUILTIN_VPERMILPD,
27369 IX86_BUILTIN_VPERMILPS,
27370 IX86_BUILTIN_VPERMILPD256,
27371 IX86_BUILTIN_VPERMILPS256,
27372 IX86_BUILTIN_VPERMIL2PD,
27373 IX86_BUILTIN_VPERMIL2PS,
27374 IX86_BUILTIN_VPERMIL2PD256,
27375 IX86_BUILTIN_VPERMIL2PS256,
27376 IX86_BUILTIN_VPERM2F128PD256,
27377 IX86_BUILTIN_VPERM2F128PS256,
27378 IX86_BUILTIN_VPERM2F128SI256,
27379 IX86_BUILTIN_VBROADCASTSS,
27380 IX86_BUILTIN_VBROADCASTSD256,
27381 IX86_BUILTIN_VBROADCASTSS256,
27382 IX86_BUILTIN_VBROADCASTPD256,
27383 IX86_BUILTIN_VBROADCASTPS256,
27384 IX86_BUILTIN_VINSERTF128PD256,
27385 IX86_BUILTIN_VINSERTF128PS256,
27386 IX86_BUILTIN_VINSERTF128SI256,
27387 IX86_BUILTIN_LOADUPD256,
27388 IX86_BUILTIN_LOADUPS256,
27389 IX86_BUILTIN_STOREUPD256,
27390 IX86_BUILTIN_STOREUPS256,
27391 IX86_BUILTIN_LDDQU256,
27392 IX86_BUILTIN_MOVNTDQ256,
27393 IX86_BUILTIN_MOVNTPD256,
27394 IX86_BUILTIN_MOVNTPS256,
27395 IX86_BUILTIN_LOADDQU256,
27396 IX86_BUILTIN_STOREDQU256,
27397 IX86_BUILTIN_MASKLOADPD,
27398 IX86_BUILTIN_MASKLOADPS,
27399 IX86_BUILTIN_MASKSTOREPD,
27400 IX86_BUILTIN_MASKSTOREPS,
27401 IX86_BUILTIN_MASKLOADPD256,
27402 IX86_BUILTIN_MASKLOADPS256,
27403 IX86_BUILTIN_MASKSTOREPD256,
27404 IX86_BUILTIN_MASKSTOREPS256,
27405 IX86_BUILTIN_MOVSHDUP256,
27406 IX86_BUILTIN_MOVSLDUP256,
27407 IX86_BUILTIN_MOVDDUP256,
27408
27409 IX86_BUILTIN_SQRTPD256,
27410 IX86_BUILTIN_SQRTPS256,
27411 IX86_BUILTIN_SQRTPS_NR256,
27412 IX86_BUILTIN_RSQRTPS256,
27413 IX86_BUILTIN_RSQRTPS_NR256,
27414
27415 IX86_BUILTIN_RCPPS256,
27416
27417 IX86_BUILTIN_ROUNDPD256,
27418 IX86_BUILTIN_ROUNDPS256,
27419
27420 IX86_BUILTIN_FLOORPD256,
27421 IX86_BUILTIN_CEILPD256,
27422 IX86_BUILTIN_TRUNCPD256,
27423 IX86_BUILTIN_RINTPD256,
27424 IX86_BUILTIN_ROUNDPD_AZ256,
27425
27426 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
27427 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
27428 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
27429
27430 IX86_BUILTIN_FLOORPS256,
27431 IX86_BUILTIN_CEILPS256,
27432 IX86_BUILTIN_TRUNCPS256,
27433 IX86_BUILTIN_RINTPS256,
27434 IX86_BUILTIN_ROUNDPS_AZ256,
27435
27436 IX86_BUILTIN_FLOORPS_SFIX256,
27437 IX86_BUILTIN_CEILPS_SFIX256,
27438 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
27439
27440 IX86_BUILTIN_UNPCKHPD256,
27441 IX86_BUILTIN_UNPCKLPD256,
27442 IX86_BUILTIN_UNPCKHPS256,
27443 IX86_BUILTIN_UNPCKLPS256,
27444
27445 IX86_BUILTIN_SI256_SI,
27446 IX86_BUILTIN_PS256_PS,
27447 IX86_BUILTIN_PD256_PD,
27448 IX86_BUILTIN_SI_SI256,
27449 IX86_BUILTIN_PS_PS256,
27450 IX86_BUILTIN_PD_PD256,
27451
27452 IX86_BUILTIN_VTESTZPD,
27453 IX86_BUILTIN_VTESTCPD,
27454 IX86_BUILTIN_VTESTNZCPD,
27455 IX86_BUILTIN_VTESTZPS,
27456 IX86_BUILTIN_VTESTCPS,
27457 IX86_BUILTIN_VTESTNZCPS,
27458 IX86_BUILTIN_VTESTZPD256,
27459 IX86_BUILTIN_VTESTCPD256,
27460 IX86_BUILTIN_VTESTNZCPD256,
27461 IX86_BUILTIN_VTESTZPS256,
27462 IX86_BUILTIN_VTESTCPS256,
27463 IX86_BUILTIN_VTESTNZCPS256,
27464 IX86_BUILTIN_PTESTZ256,
27465 IX86_BUILTIN_PTESTC256,
27466 IX86_BUILTIN_PTESTNZC256,
27467
27468 IX86_BUILTIN_MOVMSKPD256,
27469 IX86_BUILTIN_MOVMSKPS256,
27470
27471 /* AVX2 */
27472 IX86_BUILTIN_MPSADBW256,
27473 IX86_BUILTIN_PABSB256,
27474 IX86_BUILTIN_PABSW256,
27475 IX86_BUILTIN_PABSD256,
27476 IX86_BUILTIN_PACKSSDW256,
27477 IX86_BUILTIN_PACKSSWB256,
27478 IX86_BUILTIN_PACKUSDW256,
27479 IX86_BUILTIN_PACKUSWB256,
27480 IX86_BUILTIN_PADDB256,
27481 IX86_BUILTIN_PADDW256,
27482 IX86_BUILTIN_PADDD256,
27483 IX86_BUILTIN_PADDQ256,
27484 IX86_BUILTIN_PADDSB256,
27485 IX86_BUILTIN_PADDSW256,
27486 IX86_BUILTIN_PADDUSB256,
27487 IX86_BUILTIN_PADDUSW256,
27488 IX86_BUILTIN_PALIGNR256,
27489 IX86_BUILTIN_AND256I,
27490 IX86_BUILTIN_ANDNOT256I,
27491 IX86_BUILTIN_PAVGB256,
27492 IX86_BUILTIN_PAVGW256,
27493 IX86_BUILTIN_PBLENDVB256,
27494 IX86_BUILTIN_PBLENDVW256,
27495 IX86_BUILTIN_PCMPEQB256,
27496 IX86_BUILTIN_PCMPEQW256,
27497 IX86_BUILTIN_PCMPEQD256,
27498 IX86_BUILTIN_PCMPEQQ256,
27499 IX86_BUILTIN_PCMPGTB256,
27500 IX86_BUILTIN_PCMPGTW256,
27501 IX86_BUILTIN_PCMPGTD256,
27502 IX86_BUILTIN_PCMPGTQ256,
27503 IX86_BUILTIN_PHADDW256,
27504 IX86_BUILTIN_PHADDD256,
27505 IX86_BUILTIN_PHADDSW256,
27506 IX86_BUILTIN_PHSUBW256,
27507 IX86_BUILTIN_PHSUBD256,
27508 IX86_BUILTIN_PHSUBSW256,
27509 IX86_BUILTIN_PMADDUBSW256,
27510 IX86_BUILTIN_PMADDWD256,
27511 IX86_BUILTIN_PMAXSB256,
27512 IX86_BUILTIN_PMAXSW256,
27513 IX86_BUILTIN_PMAXSD256,
27514 IX86_BUILTIN_PMAXUB256,
27515 IX86_BUILTIN_PMAXUW256,
27516 IX86_BUILTIN_PMAXUD256,
27517 IX86_BUILTIN_PMINSB256,
27518 IX86_BUILTIN_PMINSW256,
27519 IX86_BUILTIN_PMINSD256,
27520 IX86_BUILTIN_PMINUB256,
27521 IX86_BUILTIN_PMINUW256,
27522 IX86_BUILTIN_PMINUD256,
27523 IX86_BUILTIN_PMOVMSKB256,
27524 IX86_BUILTIN_PMOVSXBW256,
27525 IX86_BUILTIN_PMOVSXBD256,
27526 IX86_BUILTIN_PMOVSXBQ256,
27527 IX86_BUILTIN_PMOVSXWD256,
27528 IX86_BUILTIN_PMOVSXWQ256,
27529 IX86_BUILTIN_PMOVSXDQ256,
27530 IX86_BUILTIN_PMOVZXBW256,
27531 IX86_BUILTIN_PMOVZXBD256,
27532 IX86_BUILTIN_PMOVZXBQ256,
27533 IX86_BUILTIN_PMOVZXWD256,
27534 IX86_BUILTIN_PMOVZXWQ256,
27535 IX86_BUILTIN_PMOVZXDQ256,
27536 IX86_BUILTIN_PMULDQ256,
27537 IX86_BUILTIN_PMULHRSW256,
27538 IX86_BUILTIN_PMULHUW256,
27539 IX86_BUILTIN_PMULHW256,
27540 IX86_BUILTIN_PMULLW256,
27541 IX86_BUILTIN_PMULLD256,
27542 IX86_BUILTIN_PMULUDQ256,
27543 IX86_BUILTIN_POR256,
27544 IX86_BUILTIN_PSADBW256,
27545 IX86_BUILTIN_PSHUFB256,
27546 IX86_BUILTIN_PSHUFD256,
27547 IX86_BUILTIN_PSHUFHW256,
27548 IX86_BUILTIN_PSHUFLW256,
27549 IX86_BUILTIN_PSIGNB256,
27550 IX86_BUILTIN_PSIGNW256,
27551 IX86_BUILTIN_PSIGND256,
27552 IX86_BUILTIN_PSLLDQI256,
27553 IX86_BUILTIN_PSLLWI256,
27554 IX86_BUILTIN_PSLLW256,
27555 IX86_BUILTIN_PSLLDI256,
27556 IX86_BUILTIN_PSLLD256,
27557 IX86_BUILTIN_PSLLQI256,
27558 IX86_BUILTIN_PSLLQ256,
27559 IX86_BUILTIN_PSRAWI256,
27560 IX86_BUILTIN_PSRAW256,
27561 IX86_BUILTIN_PSRADI256,
27562 IX86_BUILTIN_PSRAD256,
27563 IX86_BUILTIN_PSRLDQI256,
27564 IX86_BUILTIN_PSRLWI256,
27565 IX86_BUILTIN_PSRLW256,
27566 IX86_BUILTIN_PSRLDI256,
27567 IX86_BUILTIN_PSRLD256,
27568 IX86_BUILTIN_PSRLQI256,
27569 IX86_BUILTIN_PSRLQ256,
27570 IX86_BUILTIN_PSUBB256,
27571 IX86_BUILTIN_PSUBW256,
27572 IX86_BUILTIN_PSUBD256,
27573 IX86_BUILTIN_PSUBQ256,
27574 IX86_BUILTIN_PSUBSB256,
27575 IX86_BUILTIN_PSUBSW256,
27576 IX86_BUILTIN_PSUBUSB256,
27577 IX86_BUILTIN_PSUBUSW256,
27578 IX86_BUILTIN_PUNPCKHBW256,
27579 IX86_BUILTIN_PUNPCKHWD256,
27580 IX86_BUILTIN_PUNPCKHDQ256,
27581 IX86_BUILTIN_PUNPCKHQDQ256,
27582 IX86_BUILTIN_PUNPCKLBW256,
27583 IX86_BUILTIN_PUNPCKLWD256,
27584 IX86_BUILTIN_PUNPCKLDQ256,
27585 IX86_BUILTIN_PUNPCKLQDQ256,
27586 IX86_BUILTIN_PXOR256,
27587 IX86_BUILTIN_MOVNTDQA256,
27588 IX86_BUILTIN_VBROADCASTSS_PS,
27589 IX86_BUILTIN_VBROADCASTSS_PS256,
27590 IX86_BUILTIN_VBROADCASTSD_PD256,
27591 IX86_BUILTIN_VBROADCASTSI256,
27592 IX86_BUILTIN_PBLENDD256,
27593 IX86_BUILTIN_PBLENDD128,
27594 IX86_BUILTIN_PBROADCASTB256,
27595 IX86_BUILTIN_PBROADCASTW256,
27596 IX86_BUILTIN_PBROADCASTD256,
27597 IX86_BUILTIN_PBROADCASTQ256,
27598 IX86_BUILTIN_PBROADCASTB128,
27599 IX86_BUILTIN_PBROADCASTW128,
27600 IX86_BUILTIN_PBROADCASTD128,
27601 IX86_BUILTIN_PBROADCASTQ128,
27602 IX86_BUILTIN_VPERMVARSI256,
27603 IX86_BUILTIN_VPERMDF256,
27604 IX86_BUILTIN_VPERMVARSF256,
27605 IX86_BUILTIN_VPERMDI256,
27606 IX86_BUILTIN_VPERMTI256,
27607 IX86_BUILTIN_VEXTRACT128I256,
27608 IX86_BUILTIN_VINSERT128I256,
27609 IX86_BUILTIN_MASKLOADD,
27610 IX86_BUILTIN_MASKLOADQ,
27611 IX86_BUILTIN_MASKLOADD256,
27612 IX86_BUILTIN_MASKLOADQ256,
27613 IX86_BUILTIN_MASKSTORED,
27614 IX86_BUILTIN_MASKSTOREQ,
27615 IX86_BUILTIN_MASKSTORED256,
27616 IX86_BUILTIN_MASKSTOREQ256,
27617 IX86_BUILTIN_PSLLVV4DI,
27618 IX86_BUILTIN_PSLLVV2DI,
27619 IX86_BUILTIN_PSLLVV8SI,
27620 IX86_BUILTIN_PSLLVV4SI,
27621 IX86_BUILTIN_PSRAVV8SI,
27622 IX86_BUILTIN_PSRAVV4SI,
27623 IX86_BUILTIN_PSRLVV4DI,
27624 IX86_BUILTIN_PSRLVV2DI,
27625 IX86_BUILTIN_PSRLVV8SI,
27626 IX86_BUILTIN_PSRLVV4SI,
27627
27628 IX86_BUILTIN_GATHERSIV2DF,
27629 IX86_BUILTIN_GATHERSIV4DF,
27630 IX86_BUILTIN_GATHERDIV2DF,
27631 IX86_BUILTIN_GATHERDIV4DF,
27632 IX86_BUILTIN_GATHERSIV4SF,
27633 IX86_BUILTIN_GATHERSIV8SF,
27634 IX86_BUILTIN_GATHERDIV4SF,
27635 IX86_BUILTIN_GATHERDIV8SF,
27636 IX86_BUILTIN_GATHERSIV2DI,
27637 IX86_BUILTIN_GATHERSIV4DI,
27638 IX86_BUILTIN_GATHERDIV2DI,
27639 IX86_BUILTIN_GATHERDIV4DI,
27640 IX86_BUILTIN_GATHERSIV4SI,
27641 IX86_BUILTIN_GATHERSIV8SI,
27642 IX86_BUILTIN_GATHERDIV4SI,
27643 IX86_BUILTIN_GATHERDIV8SI,
27644
27645 /* Alternate 4 element gather for the vectorizer where
27646 all operands are 32-byte wide. */
27647 IX86_BUILTIN_GATHERALTSIV4DF,
27648 IX86_BUILTIN_GATHERALTDIV8SF,
27649 IX86_BUILTIN_GATHERALTSIV4DI,
27650 IX86_BUILTIN_GATHERALTDIV8SI,
27651
27652 /* TFmode support builtins. */
27653 IX86_BUILTIN_INFQ,
27654 IX86_BUILTIN_HUGE_VALQ,
27655 IX86_BUILTIN_FABSQ,
27656 IX86_BUILTIN_COPYSIGNQ,
27657
27658 /* Vectorizer support builtins. */
27659 IX86_BUILTIN_CPYSGNPS,
27660 IX86_BUILTIN_CPYSGNPD,
27661 IX86_BUILTIN_CPYSGNPS256,
27662 IX86_BUILTIN_CPYSGNPD256,
27663
27664 /* FMA4 instructions. */
27665 IX86_BUILTIN_VFMADDSS,
27666 IX86_BUILTIN_VFMADDSD,
27667 IX86_BUILTIN_VFMADDPS,
27668 IX86_BUILTIN_VFMADDPD,
27669 IX86_BUILTIN_VFMADDPS256,
27670 IX86_BUILTIN_VFMADDPD256,
27671 IX86_BUILTIN_VFMADDSUBPS,
27672 IX86_BUILTIN_VFMADDSUBPD,
27673 IX86_BUILTIN_VFMADDSUBPS256,
27674 IX86_BUILTIN_VFMADDSUBPD256,
27675
27676 /* FMA3 instructions. */
27677 IX86_BUILTIN_VFMADDSS3,
27678 IX86_BUILTIN_VFMADDSD3,
27679
27680 /* XOP instructions. */
27681 IX86_BUILTIN_VPCMOV,
27682 IX86_BUILTIN_VPCMOV_V2DI,
27683 IX86_BUILTIN_VPCMOV_V4SI,
27684 IX86_BUILTIN_VPCMOV_V8HI,
27685 IX86_BUILTIN_VPCMOV_V16QI,
27686 IX86_BUILTIN_VPCMOV_V4SF,
27687 IX86_BUILTIN_VPCMOV_V2DF,
27688 IX86_BUILTIN_VPCMOV256,
27689 IX86_BUILTIN_VPCMOV_V4DI256,
27690 IX86_BUILTIN_VPCMOV_V8SI256,
27691 IX86_BUILTIN_VPCMOV_V16HI256,
27692 IX86_BUILTIN_VPCMOV_V32QI256,
27693 IX86_BUILTIN_VPCMOV_V8SF256,
27694 IX86_BUILTIN_VPCMOV_V4DF256,
27695
27696 IX86_BUILTIN_VPPERM,
27697
27698 IX86_BUILTIN_VPMACSSWW,
27699 IX86_BUILTIN_VPMACSWW,
27700 IX86_BUILTIN_VPMACSSWD,
27701 IX86_BUILTIN_VPMACSWD,
27702 IX86_BUILTIN_VPMACSSDD,
27703 IX86_BUILTIN_VPMACSDD,
27704 IX86_BUILTIN_VPMACSSDQL,
27705 IX86_BUILTIN_VPMACSSDQH,
27706 IX86_BUILTIN_VPMACSDQL,
27707 IX86_BUILTIN_VPMACSDQH,
27708 IX86_BUILTIN_VPMADCSSWD,
27709 IX86_BUILTIN_VPMADCSWD,
27710
27711 IX86_BUILTIN_VPHADDBW,
27712 IX86_BUILTIN_VPHADDBD,
27713 IX86_BUILTIN_VPHADDBQ,
27714 IX86_BUILTIN_VPHADDWD,
27715 IX86_BUILTIN_VPHADDWQ,
27716 IX86_BUILTIN_VPHADDDQ,
27717 IX86_BUILTIN_VPHADDUBW,
27718 IX86_BUILTIN_VPHADDUBD,
27719 IX86_BUILTIN_VPHADDUBQ,
27720 IX86_BUILTIN_VPHADDUWD,
27721 IX86_BUILTIN_VPHADDUWQ,
27722 IX86_BUILTIN_VPHADDUDQ,
27723 IX86_BUILTIN_VPHSUBBW,
27724 IX86_BUILTIN_VPHSUBWD,
27725 IX86_BUILTIN_VPHSUBDQ,
27726
27727 IX86_BUILTIN_VPROTB,
27728 IX86_BUILTIN_VPROTW,
27729 IX86_BUILTIN_VPROTD,
27730 IX86_BUILTIN_VPROTQ,
27731 IX86_BUILTIN_VPROTB_IMM,
27732 IX86_BUILTIN_VPROTW_IMM,
27733 IX86_BUILTIN_VPROTD_IMM,
27734 IX86_BUILTIN_VPROTQ_IMM,
27735
27736 IX86_BUILTIN_VPSHLB,
27737 IX86_BUILTIN_VPSHLW,
27738 IX86_BUILTIN_VPSHLD,
27739 IX86_BUILTIN_VPSHLQ,
27740 IX86_BUILTIN_VPSHAB,
27741 IX86_BUILTIN_VPSHAW,
27742 IX86_BUILTIN_VPSHAD,
27743 IX86_BUILTIN_VPSHAQ,
27744
27745 IX86_BUILTIN_VFRCZSS,
27746 IX86_BUILTIN_VFRCZSD,
27747 IX86_BUILTIN_VFRCZPS,
27748 IX86_BUILTIN_VFRCZPD,
27749 IX86_BUILTIN_VFRCZPS256,
27750 IX86_BUILTIN_VFRCZPD256,
27751
27752 IX86_BUILTIN_VPCOMEQUB,
27753 IX86_BUILTIN_VPCOMNEUB,
27754 IX86_BUILTIN_VPCOMLTUB,
27755 IX86_BUILTIN_VPCOMLEUB,
27756 IX86_BUILTIN_VPCOMGTUB,
27757 IX86_BUILTIN_VPCOMGEUB,
27758 IX86_BUILTIN_VPCOMFALSEUB,
27759 IX86_BUILTIN_VPCOMTRUEUB,
27760
27761 IX86_BUILTIN_VPCOMEQUW,
27762 IX86_BUILTIN_VPCOMNEUW,
27763 IX86_BUILTIN_VPCOMLTUW,
27764 IX86_BUILTIN_VPCOMLEUW,
27765 IX86_BUILTIN_VPCOMGTUW,
27766 IX86_BUILTIN_VPCOMGEUW,
27767 IX86_BUILTIN_VPCOMFALSEUW,
27768 IX86_BUILTIN_VPCOMTRUEUW,
27769
27770 IX86_BUILTIN_VPCOMEQUD,
27771 IX86_BUILTIN_VPCOMNEUD,
27772 IX86_BUILTIN_VPCOMLTUD,
27773 IX86_BUILTIN_VPCOMLEUD,
27774 IX86_BUILTIN_VPCOMGTUD,
27775 IX86_BUILTIN_VPCOMGEUD,
27776 IX86_BUILTIN_VPCOMFALSEUD,
27777 IX86_BUILTIN_VPCOMTRUEUD,
27778
27779 IX86_BUILTIN_VPCOMEQUQ,
27780 IX86_BUILTIN_VPCOMNEUQ,
27781 IX86_BUILTIN_VPCOMLTUQ,
27782 IX86_BUILTIN_VPCOMLEUQ,
27783 IX86_BUILTIN_VPCOMGTUQ,
27784 IX86_BUILTIN_VPCOMGEUQ,
27785 IX86_BUILTIN_VPCOMFALSEUQ,
27786 IX86_BUILTIN_VPCOMTRUEUQ,
27787
27788 IX86_BUILTIN_VPCOMEQB,
27789 IX86_BUILTIN_VPCOMNEB,
27790 IX86_BUILTIN_VPCOMLTB,
27791 IX86_BUILTIN_VPCOMLEB,
27792 IX86_BUILTIN_VPCOMGTB,
27793 IX86_BUILTIN_VPCOMGEB,
27794 IX86_BUILTIN_VPCOMFALSEB,
27795 IX86_BUILTIN_VPCOMTRUEB,
27796
27797 IX86_BUILTIN_VPCOMEQW,
27798 IX86_BUILTIN_VPCOMNEW,
27799 IX86_BUILTIN_VPCOMLTW,
27800 IX86_BUILTIN_VPCOMLEW,
27801 IX86_BUILTIN_VPCOMGTW,
27802 IX86_BUILTIN_VPCOMGEW,
27803 IX86_BUILTIN_VPCOMFALSEW,
27804 IX86_BUILTIN_VPCOMTRUEW,
27805
27806 IX86_BUILTIN_VPCOMEQD,
27807 IX86_BUILTIN_VPCOMNED,
27808 IX86_BUILTIN_VPCOMLTD,
27809 IX86_BUILTIN_VPCOMLED,
27810 IX86_BUILTIN_VPCOMGTD,
27811 IX86_BUILTIN_VPCOMGED,
27812 IX86_BUILTIN_VPCOMFALSED,
27813 IX86_BUILTIN_VPCOMTRUED,
27814
27815 IX86_BUILTIN_VPCOMEQQ,
27816 IX86_BUILTIN_VPCOMNEQ,
27817 IX86_BUILTIN_VPCOMLTQ,
27818 IX86_BUILTIN_VPCOMLEQ,
27819 IX86_BUILTIN_VPCOMGTQ,
27820 IX86_BUILTIN_VPCOMGEQ,
27821 IX86_BUILTIN_VPCOMFALSEQ,
27822 IX86_BUILTIN_VPCOMTRUEQ,
27823
27824 /* LWP instructions. */
27825 IX86_BUILTIN_LLWPCB,
27826 IX86_BUILTIN_SLWPCB,
27827 IX86_BUILTIN_LWPVAL32,
27828 IX86_BUILTIN_LWPVAL64,
27829 IX86_BUILTIN_LWPINS32,
27830 IX86_BUILTIN_LWPINS64,
27831
27832 IX86_BUILTIN_CLZS,
27833
27834 /* RTM */
27835 IX86_BUILTIN_XBEGIN,
27836 IX86_BUILTIN_XEND,
27837 IX86_BUILTIN_XABORT,
27838 IX86_BUILTIN_XTEST,
27839
27840 /* BMI instructions. */
27841 IX86_BUILTIN_BEXTR32,
27842 IX86_BUILTIN_BEXTR64,
27843 IX86_BUILTIN_CTZS,
27844
27845 /* TBM instructions. */
27846 IX86_BUILTIN_BEXTRI32,
27847 IX86_BUILTIN_BEXTRI64,
27848
27849 /* BMI2 instructions. */
27850 IX86_BUILTIN_BZHI32,
27851 IX86_BUILTIN_BZHI64,
27852 IX86_BUILTIN_PDEP32,
27853 IX86_BUILTIN_PDEP64,
27854 IX86_BUILTIN_PEXT32,
27855 IX86_BUILTIN_PEXT64,
27856
27857 /* ADX instructions. */
27858 IX86_BUILTIN_ADDCARRYX32,
27859 IX86_BUILTIN_ADDCARRYX64,
27860
27861 /* FSGSBASE instructions. */
27862 IX86_BUILTIN_RDFSBASE32,
27863 IX86_BUILTIN_RDFSBASE64,
27864 IX86_BUILTIN_RDGSBASE32,
27865 IX86_BUILTIN_RDGSBASE64,
27866 IX86_BUILTIN_WRFSBASE32,
27867 IX86_BUILTIN_WRFSBASE64,
27868 IX86_BUILTIN_WRGSBASE32,
27869 IX86_BUILTIN_WRGSBASE64,
27870
27871 /* RDRND instructions. */
27872 IX86_BUILTIN_RDRAND16_STEP,
27873 IX86_BUILTIN_RDRAND32_STEP,
27874 IX86_BUILTIN_RDRAND64_STEP,
27875
27876 /* RDSEED instructions. */
27877 IX86_BUILTIN_RDSEED16_STEP,
27878 IX86_BUILTIN_RDSEED32_STEP,
27879 IX86_BUILTIN_RDSEED64_STEP,
27880
27881 /* F16C instructions. */
27882 IX86_BUILTIN_CVTPH2PS,
27883 IX86_BUILTIN_CVTPH2PS256,
27884 IX86_BUILTIN_CVTPS2PH,
27885 IX86_BUILTIN_CVTPS2PH256,
27886
27887 /* CFString built-in for darwin */
27888 IX86_BUILTIN_CFSTRING,
27889
27890 /* Builtins to get CPU type and supported features. */
27891 IX86_BUILTIN_CPU_INIT,
27892 IX86_BUILTIN_CPU_IS,
27893 IX86_BUILTIN_CPU_SUPPORTS,
27894
27895 IX86_BUILTIN_MAX
27896 };
27897
27898 /* Table for the ix86 builtin decls. */
27899 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
27900
27901 /* Table of all of the builtin functions that are possible with different ISAs
27902 but are waiting to be built until a function is declared to use that
27903 ISA. */
27904 struct builtin_isa {
27905 const char *name; /* function name */
27906 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
27907 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
27908 bool const_p; /* true if the declaration is constant */
27909 bool set_and_not_built_p; /* true if the decl was deferred and has not been built yet */
27910 };
27911
27912 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
27913
27914
27915 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
27916 of isa_flags to use in the ix86_builtins_isa array. Store the
27917 function decl in the ix86_builtins array. Return the function decl, or
27918 NULL_TREE if the builtin was not added.
27919 
27920 If the front end has a special hook for builtin functions, delay adding
27921 builtin functions that aren't in the current ISA until the ISA is changed
27922 with function specific optimization. Doing so can save about 300K for the
27923 default compiler. When the builtin is expanded, check at that time whether
27924 it is valid.
27925 
27926 If the front end doesn't have a special hook, record all builtins, even
27927 those whose instruction set isn't in the current ISA, in case the user uses
27928 function specific options for a different ISA; that way we don't get scope
27929 errors if a builtin is added in the middle of a function scope. */
27930
27931 static inline tree
27932 def_builtin (HOST_WIDE_INT mask, const char *name,
27933 enum ix86_builtin_func_type tcode,
27934 enum ix86_builtins code)
27935 {
27936 tree decl = NULL_TREE;
27937
27938 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
27939 {
27940 ix86_builtins_isa[(int) code].isa = mask;
27941
27942 mask &= ~OPTION_MASK_ISA_64BIT;
27943 if (mask == 0
27944 || (mask & ix86_isa_flags) != 0
27945 || (lang_hooks.builtin_function
27946 == lang_hooks.builtin_function_ext_scope))
27948 {
27949 tree type = ix86_get_builtin_func_type (tcode);
27950 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
27951 NULL, NULL_TREE);
27952 ix86_builtins[(int) code] = decl;
27953 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
27954 }
27955 else
27956 {
27957 ix86_builtins[(int) code] = NULL_TREE;
27958 ix86_builtins_isa[(int) code].tcode = tcode;
27959 ix86_builtins_isa[(int) code].name = name;
27960 ix86_builtins_isa[(int) code].const_p = false;
27961 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
27962 }
27963 }
27964
27965 return decl;
27966 }
27967
27968 /* Like def_builtin, but also marks the function decl "const". */
27969
27970 static inline tree
27971 def_builtin_const (HOST_WIDE_INT mask, const char *name,
27972 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
27973 {
27974 tree decl = def_builtin (mask, name, tcode, code);
27975 if (decl)
27976 TREE_READONLY (decl) = 1;
27977 else
27978 ix86_builtins_isa[(int) code].const_p = true;
27979
27980 return decl;
27981 }
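
/* Illustrative sketch (not part of the original source): the bdesc_* tables
   below are walked later in this file and each entry is registered through
   the two helpers above.  A single registration is roughly equivalent to

     def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_addpd",
                        V2DF_FTYPE_V2DF_V2DF, IX86_BUILTIN_ADDPD);

   The particular mask, name and function type shown here are only an assumed
   example; the authoritative combinations are the table entries below.  */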
27982
27983 /* Add any new builtin functions for a given ISA that may not have been
27984 declared. This saves a bit of space compared to adding all of the
27985 declarations to the tree up front, whether or not they are used. */
27986
27987 static void
27988 ix86_add_new_builtins (HOST_WIDE_INT isa)
27989 {
27990 int i;
27991
27992 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
27993 {
27994 if ((ix86_builtins_isa[i].isa & isa) != 0
27995 && ix86_builtins_isa[i].set_and_not_built_p)
27996 {
27997 tree decl, type;
27998
27999 /* Don't define the builtin again. */
28000 ix86_builtins_isa[i].set_and_not_built_p = false;
28001
28002 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
28003 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
28004 type, i, BUILT_IN_MD, NULL,
28005 NULL_TREE);
28006
28007 ix86_builtins[i] = decl;
28008 if (ix86_builtins_isa[i].const_p)
28009 TREE_READONLY (decl) = 1;
28010 }
28011 }
28012 }
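
/* Illustrative example (not part of the original source): the deferred
   registration handled above is roughly what lets a translation unit that is
   compiled without -mavx2 still use AVX2 intrinsics inside a function that
   opts in via the target attribute, e.g.

     #include <immintrin.h>

     __attribute__((target ("avx2")))
     __m256i abs256 (__m256i x) { return _mm256_abs_epi8 (x); }

   When the target attribute switches ix86_isa_flags, the builtins recorded
   in ix86_builtins_isa for the newly enabled ISA are finally built.  */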
28013
28014 /* Bits for builtin_description.flag. */
28015
28016 /* Set when we don't support the comparison natively, and should
28017 swap_comparison in order to support it. */
28018 #define BUILTIN_DESC_SWAP_OPERANDS 1
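
/* Illustrative note (not part of the original source): SSE compares only
   encode the "less" style predicates, so a builtin such as
   __builtin_ia32_cmpgtps is typically described with the reversed comparison
   (LT) plus this flag, and the expander swaps the two operands to recover
   the greater-than form.  */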
28019
28020 struct builtin_description
28021 {
28022 const HOST_WIDE_INT mask;
28023 const enum insn_code icode;
28024 const char *const name;
28025 const enum ix86_builtins code;
28026 const enum rtx_code comparison;
28027 const int flag;
28028 };
28029
28030 static const struct builtin_description bdesc_comi[] =
28031 {
28032 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
28033 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
28034 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
28035 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
28036 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
28037 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
28038 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
28039 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
28040 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
28041 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
28042 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
28043 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
28044 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
28045 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
28046 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
28047 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
28048 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
28049 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
28050 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
28051 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
28052 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
28053 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
28054 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
28055 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
28056 };
28057
28058 static const struct builtin_description bdesc_pcmpestr[] =
28059 {
28060 /* SSE4.2 */
28061 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
28062 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
28063 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
28064 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
28065 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
28066 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
28067 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
28068 };
28069
28070 static const struct builtin_description bdesc_pcmpistr[] =
28071 {
28072 /* SSE4.2 */
28073 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
28074 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
28075 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
28076 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
28077 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
28078 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
28079 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
28080 };
28081
28082 /* Special builtins with variable number of arguments. */
28083 static const struct builtin_description bdesc_special_args[] =
28084 {
28085 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
28086 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
28087 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
28088
28089 /* 80387 (for use internally for atomic compound assignment). */
28090 { 0, CODE_FOR_fnstenv, "__builtin_ia32_fnstenv", IX86_BUILTIN_FNSTENV, UNKNOWN, (int) VOID_FTYPE_PVOID },
28091 { 0, CODE_FOR_fldenv, "__builtin_ia32_fldenv", IX86_BUILTIN_FLDENV, UNKNOWN, (int) VOID_FTYPE_PCVOID },
28092 { 0, CODE_FOR_fnstsw, "__builtin_ia32_fnstsw", IX86_BUILTIN_FNSTSW, UNKNOWN, (int) VOID_FTYPE_PUSHORT },
28093 { 0, CODE_FOR_fnclex, "__builtin_ia32_fnclex", IX86_BUILTIN_FNCLEX, UNKNOWN, (int) VOID_FTYPE_VOID },
28094
28095 /* MMX */
28096 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
28097
28098 /* 3DNow! */
28099 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
28100
28101 /* FXSR, XSAVE and XSAVEOPT */
28102 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
28103 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
28104 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28105 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28106 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28107
28108 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
28109 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
28110 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28111 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28112 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28113
28114 /* SSE */
28115 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
28116 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
28117 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
28118
28119 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
28120 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
28121 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
28122 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
28123
28124 /* SSE or 3DNow!A */
28125 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
28126 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
28127
28128 /* SSE2 */
28129 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
28130 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
28131 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
28132 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedquv16qi, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
28133 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
28134 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
28135 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
28136 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
28137 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
28138 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddquv16qi, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
28139
28140 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
28141 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
28142
28143 /* SSE3 */
28144 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
28145
28146 /* SSE4.1 */
28147 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
28148
28149 /* SSE4A */
28150 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
28151 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
28152
28153 /* AVX */
28154 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
28155 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
28156
28157 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
28158 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
28159 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
28160 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
28161 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
28162
28163 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
28164 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
28165 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
28166 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
28167 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddquv32qi, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
28168 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedquv32qi, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
28169 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
28170
28171 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
28172 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
28173 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
28174
28175 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
28176 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
28177 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
28178 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
28179 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
28180 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
28181 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
28182 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
28183
28184 /* AVX2 */
28185 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
28186 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
28187 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
28188 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
28189 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
28190 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
28191 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
28192 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
28193 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
28194
28195 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
28196 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
28197 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
28198 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
28199 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
28200 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
28201
28202 /* FSGSBASE */
28203 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28204 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
28205 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28206 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
28207 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
28208 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
28209 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
28210 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
28211
28212 /* RTM */
28213 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28214 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
28215 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
28216 };
28217
28218 /* Builtins with variable number of arguments. */
28219 static const struct builtin_description bdesc_args[] =
28220 {
28221 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
28222 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
28223 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
28224 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
28225 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
28226 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
28227 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
28228
28229 /* MMX */
28230 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28231 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28232 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28233 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28234 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28235 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28236
28237 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28238 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28239 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28240 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28241 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28242 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28243 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28244 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28245
28246 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28247 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28248
28249 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28250 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28251 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28252 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28253
28254 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28255 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28256 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28257 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28258 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28259 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28260
28261 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28262 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28263 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28264 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28265 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
28266 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
28267
28268 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
28269 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
28270 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
28271
28272 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
28273
28274 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28275 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28276 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
28277 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28278 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28279 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
28280
28281 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28282 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28283 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
28284 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28285 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28286 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
28287
28288 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28289 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28290 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28291 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28292
28293 /* 3DNow! */
28294 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
28295 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
28296 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28297 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28298
28299 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28300 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28301 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28302 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28303 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28304 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28305 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28306 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28307 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28308 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28309 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28310 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28311 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28312 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28313 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28314
28315 /* 3DNow!A */
28316 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
28317 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
28318 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
28319 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28320 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28321 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28322
28323 /* SSE */
28324 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
28325 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28326 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28327 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28328 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28329 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28330 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
28331 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
28332 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
28333 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
28334 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
28335 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
28336
28337 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28338
28339 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28340 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28341 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28342 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28343 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28344 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28345 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28346 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28347
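/* cmpps/cmpss only provide eq/lt/le predicates (and their negations), so
   the gt/ge builtins are expanded as lt/le with the operands swapped (the
   ..._SWAP signatures).  The negated forms use the unordered codes (UNGE,
   UNGT) so that NaN operands produce the expected all-ones mask.  */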
28348 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
28349 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
28350 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
28351 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28352 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28353 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28354 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
28355 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
28356 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
28357 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28358 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28359 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28360 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
28361 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
28362 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
28363 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28364 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
28365 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
28366 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
28367 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28368
28369 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28370 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28371 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28372 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28373
28374 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28375 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28376 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28377 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28378
28379 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28380
28381 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28382 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28383 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28384 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28385 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28386
28387 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
28388 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
28389 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
28390
28391 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
28392
28393 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28394 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28395 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28396
28397 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
28398 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
28399
28400 /* SSE MMX or 3DNow!A */
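/* These MMX-register operations were introduced both by SSE and by the
   3DNow!A extension, so either ISA bit in the mask enables them.  */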
28401 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28402 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28403 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28404
28405 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28406 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28407 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28408 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28409
28410 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
28411 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
28412
28413 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
28414
28415 /* SSE2 */
28416 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28417
28418 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
28419 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
28420 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
28421 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
28422 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
28423
28424 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
28425 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
28426 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
28427 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
28428 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
28429
28430 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
28431
28432 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
28433 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
28434 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
28435 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
28436
28437 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_fix_notruncv4sfv4si, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28438 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
28439 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28440
28441 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28442 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28443 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28444 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28445 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28446 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28447 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28448 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28449
28450 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
28451 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
28452 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
28453 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28454 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28455 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28456 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
28457 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
28458 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
28459 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28460 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28461 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28462 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
28463 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
28464 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
28465 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28466 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
28467 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
28468 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
28469 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28470
28471 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28472 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28473 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28474 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28475
28476 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28477 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28478 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28479 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28480
28481 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28482
28483 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28484 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28485 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28486
28487 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
28488
28489 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28490 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28491 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28492 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28493 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28494 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28495 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28496 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28497
28498 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28499 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28500 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28501 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28502 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28503 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28504 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28505 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28506
28507 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28508 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28509
28510 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28511 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28512 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28513 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28514
28515 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28516 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28517
28518 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28519 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28520 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28521 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28522 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28523 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28524
28525 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28526 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28527 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28528 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28529
28530 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28531 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28532 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28533 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28534 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28535 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28536 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28537 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28538
28539 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
28540 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28541 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
28542
28543 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28544 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
28545
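/* pmuludq multiplies the even unsigned 32-bit elements into full 64-bit
   products, hence the widening result types and the vec_widen_umult_even
   pattern for the 128-bit form.  */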
28546 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
28547 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28548
28549 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
28550
28551 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
28552 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
28553 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
28554 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
28555
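/* pslldqi128/psrldqi128 shift the whole 128-bit value and take their count
   in bits (the _mm_slli_si128/_mm_srli_si128 intrinsics multiply the byte
   count by 8); the V2DI operands are converted to the V1TI mode of the
   ashl/lshr patterns.  */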
28556 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
28557 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28558 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28559 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
28560 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28561 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28562 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
28563
28564 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
28565 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28566 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28567 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
28568 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28569 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28570 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
28571
28572 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28573 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28574 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28575 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28576
28577 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
28578 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
28579 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
28580
28581 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
28582
28583 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28584
28585 /* SSE2 MMX */
28586 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
28587 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
28588
28589 /* SSE3 */
28590 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28591 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28592
28593 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28594 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28595 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28596 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28597 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28598 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28599
28600 /* SSSE3 */
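/* Most SSSE3 operations come in pairs: a 128-bit XMM form (the ...128
   builtins) and a 64-bit MMX form.  */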
28601 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28602 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
28603 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28604 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
28605 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28606 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
28607
28608 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28609 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28610 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28611 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28612 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28613 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28614 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28615 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28616 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28617 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28618 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28619 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28620 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
28621 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
28622 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28623 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28624 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28625 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28626 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28627 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28628 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28629 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28630 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28631 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28632
28633 /* SSSE3. */
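/* palignr likewise takes its offset in bits: _mm_alignr_epi8 passes its
   byte offset multiplied by 8.  */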
28634 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
28635 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
28636
28637 /* SSE4.1 */
28638 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28639 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28640 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
28641 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
28642 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28643 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28644 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28645 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
28646 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
28647 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
28648
28649 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28650 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28651 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28652 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28653 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28654 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28655 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28656 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28657 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28658 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28659 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28660 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28661 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28662
28663 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28664 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28665 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28666 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28667 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28668 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28669 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28670 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28671 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28672 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28673 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28674 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28675
28676 /* SSE4.1 */
28677 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28678 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28679 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28680 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28681
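/* The floor/ceil/trunc/rint builtins reuse the roundpd/roundps patterns;
   the rounding-mode constant (ROUND_FLOOR, ROUND_CEIL, ...) is stashed in
   the comparison field rather than passed as a separate operand.  */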
28682 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
28683 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
28684 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
28685 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
28686
28687 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28688 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28689
28690 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
28691 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
28692
28693 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
28694 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
28695 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
28696 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
28697
28698 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
28699 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
28700
28701 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28702 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28703
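/* The three ptest builtins share one pattern; the comparison code selects
   which flag is tested: EQ for ZF (ptestz), LTU for CF (ptestc), and GTU
   for the "neither flag set" case (ptestnzc).  */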
28704 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28705 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28706 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28707
28708 /* SSE4.2 */
28709 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28710 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
28711 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
28712 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28713 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28714
28715 /* SSE4A */
28716 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
28717 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
28718 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
28719 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28720
28721 /* AES */
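/* The AES and PCLMUL entries carry a null name: the builtins themselves are
   registered elsewhere with the stricter AES/PCLMUL ISA requirement, and
   only the expansion data is taken from this table.  */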
28722 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
28723 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28724
28725 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28726 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28727 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28728 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28729
28730 /* PCLMUL */
28731 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
28732
28733 /* AVX */
28734 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28735 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28736 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28737 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28738 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28739 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28740 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28741 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28742 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28743 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28744 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28745 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28746 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28747 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28748 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28749 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28750 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28751 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28752 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28753 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28754 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28755 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28756 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28757 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28758 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28759 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28760
28761 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
28762 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
28763 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
28764 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28765
28766 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28767 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28768 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
28769 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
28770 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28771 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28772 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28773 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28774 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28775 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28776 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28777 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28778 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28779 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
28780 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
28781 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
28782 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
28783 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
28784 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
28785 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_fix_notruncv8sfv8si, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28786 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
28787 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28788 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28789 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28790 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28791 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28792 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28793 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28794 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28795 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28796 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28797 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
28798 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
28799 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
28800
28801 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28802 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28803 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28804
28805 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28806 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28807 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28808 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28809 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28810
28811 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28812
28813 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28814 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28815
28816 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
28817 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
28818 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
28819 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
28820
28821 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28822 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28823
28824 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28825 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28826
28827 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
28828 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
28829 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
28830 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
28831
28832 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
28833 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
28834
28835 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28836 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28837
28838 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28839 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28840 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28841 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28842
28843 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28844 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28845 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28846 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
28847 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
28848 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
28849
28850 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28851 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28852 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28853 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28854 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28855 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28856 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28857 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28858 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28859 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28860 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28861 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28862 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28863 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28864 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28865
28866 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
28867 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
28868
28869 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28870 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28871
28872 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28873
28874 /* AVX2 */
28875 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
28876 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
28877 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
28878 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
28879 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28880 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28881 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28882 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28883 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28884 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28885 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28886 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28887 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28888 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28889 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28890 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28891 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
28892 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28893 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28894 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28895 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28896 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
28897 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
28898 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28899 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28900 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28901 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28902 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28903 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28904 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28905 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28906 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28907 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28908 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28909 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28910 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28911 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28912 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28913 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
28914 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28915 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28916 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28917 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28918 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28919 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28920 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28921 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28922 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28923 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28924 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28925 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28926 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
28927 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28928 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28929 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28930 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28931 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28932 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28933 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28934 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28935 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28936 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28937 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28938 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28939 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28940 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28941 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28942 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28943 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28944 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28945 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28946 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28947 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28948 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28949 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
28950 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28951 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28952 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28953 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28954 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28955 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28956 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28957 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28958 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28959 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28960 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28961 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28962 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28963 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28964 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28965 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28966 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28967 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28968 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28969 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28970 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28971 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28972 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28973 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28974 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28975 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28976 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28977 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28978 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28979 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28980 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28981 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28982 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28983 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28984 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28985 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28986 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28987 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28988 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28989 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28990 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28991 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28992 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28993 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28994 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
28995 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28996 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
28997 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
28998 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28999 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
29000 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
29001 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
29002 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
29003 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
29004 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
29005 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
29006 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
29007 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
29008 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
29009 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
29010 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
29011 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
29012 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
29013 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
29014 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
29015 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
29016 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
29017 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
29018 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
29019 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
29020 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
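/* Illustrative mapping: most of the two-operand AVX2 rows above back a
   corresponding _mm256_* intrinsic, e.g. avx2intrin.h wraps
   __builtin_ia32_pshufb256 as _mm256_shuffle_epi8.  */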
29021
29022 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
29023
29024 /* BMI */
29025 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29026 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29027 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
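/* Illustrative use of the BEXTR32 row above: with -mbmi,
     unsigned f = __builtin_ia32_bextr_u32 (src, (len << 8) | start);
   extracts LEN bits starting at bit START; bmiintrin.h exposes this as
   _bextr_u32.  */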
29028
29029 /* TBM */
29030 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29031 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29032
29033 /* F16C */
29034 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
29035 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
29036 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
29037 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
29038
29039 /* BMI2 */
29040 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29041 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29042 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29043 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29044 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29045 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
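/* Illustrative use of the PDEP32 row above: with -mbmi2,
     unsigned r = __builtin_ia32_pdep_si (val, mask);
   scatters the low bits of VAL into the bit positions selected by MASK;
   bmi2intrin.h wraps it as _pdep_u32 (and PEXT32 as _pext_u32).  */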
29046 };
29047
29048 /* FMA4 and XOP. */
29049 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
29050 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
29051 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
29052 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
29053 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
29054 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
29055 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
29056 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
29057 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
29058 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
29059 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
29060 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
29061 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
29062 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
29063 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
29064 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
29065 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
29066 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
29067 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
29068 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
29069 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
29070 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
29071 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
29072 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
29073 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
29074 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
29075 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
29076 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
29077 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
29078 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
29079 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
29080 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
29081 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
29082 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
29083 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
29084 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
29085 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
29086 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
29087 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
29088 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
29089 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
29090 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
29091 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
29092 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
29093 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
29094 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
29095 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
29096 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
29097 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
29098 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
29099 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
29100 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
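/* Each MULTI_ARG_* macro above is just a shorter alias for one of the
   V*_FTYPE_* enumerators, e.g. MULTI_ARG_3_SF is a builtin taking three
   V4SF operands and returning V4SF; this keeps the bdesc_multi_arg rows
   below readable.  The table is expanded by ix86_expand_multi_arg_builtin
   elsewhere in this file.  */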
29101
29102 static const struct builtin_description bdesc_multi_arg[] =
29103 {
29104 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
29105 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
29106 UNKNOWN, (int)MULTI_ARG_3_SF },
29107 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
29108 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
29109 UNKNOWN, (int)MULTI_ARG_3_DF },
29110
29111 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
29112 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
29113 UNKNOWN, (int)MULTI_ARG_3_SF },
29114 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
29115 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
29116 UNKNOWN, (int)MULTI_ARG_3_DF },
29117
29118 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
29119 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
29120 UNKNOWN, (int)MULTI_ARG_3_SF },
29121 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
29122 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
29123 UNKNOWN, (int)MULTI_ARG_3_DF },
29124 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
29125 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
29126 UNKNOWN, (int)MULTI_ARG_3_SF2 },
29127 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
29128 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
29129 UNKNOWN, (int)MULTI_ARG_3_DF2 },
29130
29131 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
29132 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
29133 UNKNOWN, (int)MULTI_ARG_3_SF },
29134 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
29135 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
29136 UNKNOWN, (int)MULTI_ARG_3_DF },
29137 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
29138 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
29139 UNKNOWN, (int)MULTI_ARG_3_SF2 },
29140 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
29141 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
29142 UNKNOWN, (int)MULTI_ARG_3_DF2 },
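/* Rows whose mask is OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 are made
   available when either ISA is enabled, so e.g. __builtin_ia32_vfmaddps
   can be used under both -mfma and -mfma4.  */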
29143
29144 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
29145 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
29146 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
29147 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
29148 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
29149 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
29150 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
29151
29152 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
29153 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
29154 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
29155 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
29156 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
29157 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
29158 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
29159
29160 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
29161
29162 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
29163 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
29164 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29165 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29166 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
29167 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
29168 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29169 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29170 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29171 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29172 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29173 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29174
29175 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29176 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
29177 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
29178 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
29179 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
29180 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
29181 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
29182 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
29183 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29184 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
29185 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
29186 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
29187 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29188 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
29189 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
29190 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
29191
29192 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
29193 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
29194 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
29195 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
29196 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
29197 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
29198
29199 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29200 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
29201 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
29202 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29203 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
29204 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29205 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29206 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
29207 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
29208 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29209 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
29210 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29211 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29212 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29213 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29214
29215 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
29216 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
29217 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
29218 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
29219 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
29220 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
29221 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
29222
29223 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
29224 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
29225 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
29226 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
29227 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
29228 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
29229 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
29230
29231 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
29232 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
29233 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
29234 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
29235 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
29236 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
29237 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
29238
29239 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
29240 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
29241 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
29242 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
29243 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
29244 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
29245 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
29246
29247 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
29248 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
29249 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
29250 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
29251 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
29252 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
29253 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
29254
29255 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
29256 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
29257 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
29258 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
29259 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
29260 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
29261 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
29262
29263 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
29264 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
29265 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
29266 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
29267 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
29268 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
29269 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
29270
29271 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
29272 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
29273 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
29274 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
29275 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
29276 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
29277 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
29278
29279 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
29280 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
29281 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
29282 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
29283 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
29284 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
29285 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
29286 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
29287
29288 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
29289 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
29290 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
29291 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
29292 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
29293 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
29294 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
29295 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
29296
29297 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
29298 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
29299 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
29300 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
29301
29302 };
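
/* Illustrative note (not part of GCC): each comparison entry above expands
   to a builtin that returns a per-element all-ones/all-zeros mask.  A
   minimal sketch, assuming the XOP ISA is enabled with -mxop:

     typedef signed char v16qi __attribute__ ((vector_size (16)));

     v16qi lt_mask (v16qi a, v16qi b)
     {
       return __builtin_ia32_vpcomltb (a, b);   // -1 where a[i] < b[i], else 0
     }
*/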
29303 \f
29304 /* TM vector builtins. */
29305
29306 /* Reuse the existing x86-specific `struct builtin_description' because
29307 we're lazy. Add casts to make the entries fit. */
29308 static const struct builtin_description bdesc_tm[] =
29309 {
29310 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29311 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29312 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29313 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29314 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29315 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29316 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29317
29318 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29319 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29320 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29321 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29322 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29323 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29324 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29325
29326 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29327 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29328 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29329 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29330 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29331 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29332 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29333
29334 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
29335 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
29336 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
29337 };
29338
29339 /* TM callbacks. */
29340
29341 /* Return the builtin decl needed to load a vector of TYPE. */
29342
29343 static tree
29344 ix86_builtin_tm_load (tree type)
29345 {
29346 if (TREE_CODE (type) == VECTOR_TYPE)
29347 {
29348 switch (tree_to_uhwi (TYPE_SIZE (type)))
29349 {
29350 case 64:
29351 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
29352 case 128:
29353 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
29354 case 256:
29355 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
29356 }
29357 }
29358 return NULL_TREE;
29359 }
29360
29361 /* Return the builtin decl needed to store a vector of TYPE. */
29362
29363 static tree
29364 ix86_builtin_tm_store (tree type)
29365 {
29366 if (TREE_CODE (type) == VECTOR_TYPE)
29367 {
29368 switch (tree_to_uhwi (TYPE_SIZE (type)))
29369 {
29370 case 64:
29371 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
29372 case 128:
29373 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
29374 case 256:
29375 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
29376 }
29377 }
29378 return NULL_TREE;
29379 }
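
/* Illustrative example (a sketch based on the two hooks above, not extra
   GCC machinery): given a 128-bit vector type, e.g. one declared with
   vector_size (16), the hooks resolve to the SSE entries of bdesc_tm:

     tree t = ...;                            // VECTOR_TYPE, TYPE_SIZE == 128
     tree ld = ix86_builtin_tm_load (t);      // decl of __builtin__ITM_RM128
     tree st = ix86_builtin_tm_store (t);     // decl of __builtin__ITM_WM128

   Any other type (or an unexpected vector size) yields NULL_TREE.  */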
29380 \f
29381 /* Initialize the transactional memory vector load/store builtins. */
29382
29383 static void
29384 ix86_init_tm_builtins (void)
29385 {
29386 enum ix86_builtin_func_type ftype;
29387 const struct builtin_description *d;
29388 size_t i;
29389 tree decl;
29390 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
29391 tree attrs_log, attrs_type_log;
29392
29393 if (!flag_tm)
29394 return;
29395
29396 /* If there are no builtins defined, we must be compiling in a
29397 language without trans-mem support. */
29398 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
29399 return;
29400
29401 /* Use whatever attributes a normal TM load has. */
29402 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
29403 attrs_load = DECL_ATTRIBUTES (decl);
29404 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29405 /* Use whatever attributes a normal TM store has. */
29406 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
29407 attrs_store = DECL_ATTRIBUTES (decl);
29408 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29409 /* Use whatever attributes a normal TM log has. */
29410 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
29411 attrs_log = DECL_ATTRIBUTES (decl);
29412 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29413
29414 for (i = 0, d = bdesc_tm;
29415 i < ARRAY_SIZE (bdesc_tm);
29416 i++, d++)
29417 {
29418 if ((d->mask & ix86_isa_flags) != 0
29419 || (lang_hooks.builtin_function
29420 == lang_hooks.builtin_function_ext_scope))
29421 {
29422 tree type, attrs, attrs_type;
29423 enum built_in_function code = (enum built_in_function) d->code;
29424
29425 ftype = (enum ix86_builtin_func_type) d->flag;
29426 type = ix86_get_builtin_func_type (ftype);
29427
29428 if (BUILTIN_TM_LOAD_P (code))
29429 {
29430 attrs = attrs_load;
29431 attrs_type = attrs_type_load;
29432 }
29433 else if (BUILTIN_TM_STORE_P (code))
29434 {
29435 attrs = attrs_store;
29436 attrs_type = attrs_type_store;
29437 }
29438 else
29439 {
29440 attrs = attrs_log;
29441 attrs_type = attrs_type_log;
29442 }
29443 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
29444 /* The builtin without the prefix for
29445 calling it directly. */
29446 d->name + strlen ("__builtin_"),
29447 attrs);
29448 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
29449 set the TYPE_ATTRIBUTES. */
29450 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
29451
29452 set_builtin_decl (code, decl, false);
29453 }
29454 }
29455 }
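
/* Illustrative note (derived from the loop above, not additional code): for
   the entry "__builtin__ITM_RM128", add_builtin_function registers the
   builtin under that name and, via d->name + strlen ("__builtin_"), also
   under the library name "_ITM_RM128", so the libitm entry point can be
   called directly without the "__builtin_" prefix.  */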
29456
29457 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
29458 not in the current target ISA, so that the user can compile particular
29459 modules with target-specific options that differ from the command-line
29460 options. */
29461 static void
29462 ix86_init_mmx_sse_builtins (void)
29463 {
29464 const struct builtin_description * d;
29465 enum ix86_builtin_func_type ftype;
29466 size_t i;
29467
29468 /* Add all special builtins with variable number of operands. */
29469 for (i = 0, d = bdesc_special_args;
29470 i < ARRAY_SIZE (bdesc_special_args);
29471 i++, d++)
29472 {
29473 if (d->name == 0)
29474 continue;
29475
29476 ftype = (enum ix86_builtin_func_type) d->flag;
29477 def_builtin (d->mask, d->name, ftype, d->code);
29478 }
29479
29480 /* Add all builtins with variable number of operands. */
29481 for (i = 0, d = bdesc_args;
29482 i < ARRAY_SIZE (bdesc_args);
29483 i++, d++)
29484 {
29485 if (d->name == 0)
29486 continue;
29487
29488 ftype = (enum ix86_builtin_func_type) d->flag;
29489 def_builtin_const (d->mask, d->name, ftype, d->code);
29490 }
29491
29492 /* pcmpestr[im] insns. */
29493 for (i = 0, d = bdesc_pcmpestr;
29494 i < ARRAY_SIZE (bdesc_pcmpestr);
29495 i++, d++)
29496 {
29497 if (d->code == IX86_BUILTIN_PCMPESTRM128)
29498 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
29499 else
29500 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
29501 def_builtin_const (d->mask, d->name, ftype, d->code);
29502 }
29503
29504 /* pcmpistr[im] insns. */
29505 for (i = 0, d = bdesc_pcmpistr;
29506 i < ARRAY_SIZE (bdesc_pcmpistr);
29507 i++, d++)
29508 {
29509 if (d->code == IX86_BUILTIN_PCMPISTRM128)
29510 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
29511 else
29512 ftype = INT_FTYPE_V16QI_V16QI_INT;
29513 def_builtin_const (d->mask, d->name, ftype, d->code);
29514 }
29515
29516 /* comi/ucomi insns. */
29517 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29518 {
29519 if (d->mask == OPTION_MASK_ISA_SSE2)
29520 ftype = INT_FTYPE_V2DF_V2DF;
29521 else
29522 ftype = INT_FTYPE_V4SF_V4SF;
29523 def_builtin_const (d->mask, d->name, ftype, d->code);
29524 }
29525
29526 /* SSE */
29527 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
29528 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
29529 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
29530 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
29531
29532 /* SSE or 3DNow!A */
29533 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29534 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
29535 IX86_BUILTIN_MASKMOVQ);
29536
29537 /* SSE2 */
29538 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
29539 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
29540
29541 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
29542 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
29543 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
29544 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
29545
29546 /* SSE3. */
29547 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
29548 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
29549 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
29550 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
29551
29552 /* AES */
29553 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
29554 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
29555 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
29556 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
29557 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
29558 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
29559 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
29560 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
29561 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
29562 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
29563 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
29564 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
29565
29566 /* PCLMUL */
29567 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
29568 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
29569
29570 /* RDRND */
29571 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
29572 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
29573 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
29574 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
29575 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
29576 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
29577 IX86_BUILTIN_RDRAND64_STEP);
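
/* Usage sketch (assumption based on the signatures above; the *_step
   builtins return nonzero on success and store the random value through
   the pointer argument):

     unsigned int val;
     if (__builtin_ia32_rdrand32_step (&val))
       use (val);                             // 'use' is a placeholder
*/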
29578
29579 /* AVX2 */
29580 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
29581 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
29582 IX86_BUILTIN_GATHERSIV2DF);
29583
29584 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
29585 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
29586 IX86_BUILTIN_GATHERSIV4DF);
29587
29588 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
29589 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
29590 IX86_BUILTIN_GATHERDIV2DF);
29591
29592 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
29593 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
29594 IX86_BUILTIN_GATHERDIV4DF);
29595
29596 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
29597 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
29598 IX86_BUILTIN_GATHERSIV4SF);
29599
29600 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
29601 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
29602 IX86_BUILTIN_GATHERSIV8SF);
29603
29604 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
29605 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
29606 IX86_BUILTIN_GATHERDIV4SF);
29607
29608 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
29609 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
29610 IX86_BUILTIN_GATHERDIV8SF);
29611
29612 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
29613 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
29614 IX86_BUILTIN_GATHERSIV2DI);
29615
29616 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
29617 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
29618 IX86_BUILTIN_GATHERSIV4DI);
29619
29620 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
29621 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
29622 IX86_BUILTIN_GATHERDIV2DI);
29623
29624 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
29625 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
29626 IX86_BUILTIN_GATHERDIV4DI);
29627
29628 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
29629 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
29630 IX86_BUILTIN_GATHERSIV4SI);
29631
29632 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
29633 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
29634 IX86_BUILTIN_GATHERSIV8SI);
29635
29636 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
29637 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
29638 IX86_BUILTIN_GATHERDIV4SI);
29639
29640 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
29641 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
29642 IX86_BUILTIN_GATHERDIV8SI);
29643
29644 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
29645 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
29646 IX86_BUILTIN_GATHERALTSIV4DF);
29647
29648 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
29649 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
29650 IX86_BUILTIN_GATHERALTDIV8SF);
29651
29652 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
29653 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
29654 IX86_BUILTIN_GATHERALTSIV4DI);
29655
29656 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
29657 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
29658 IX86_BUILTIN_GATHERALTDIV8SI);
29659
29660 /* RTM. */
29661 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
29662 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
29663
29664 /* MMX access to the vec_init patterns. */
29665 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
29666 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
29667
29668 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
29669 V4HI_FTYPE_HI_HI_HI_HI,
29670 IX86_BUILTIN_VEC_INIT_V4HI);
29671
29672 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
29673 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
29674 IX86_BUILTIN_VEC_INIT_V8QI);
29675
29676 /* Access to the vec_extract patterns. */
29677 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
29678 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
29679 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
29680 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
29681 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
29682 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
29683 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
29684 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
29685 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
29686 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
29687
29688 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29689 "__builtin_ia32_vec_ext_v4hi",
29690 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
29691
29692 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
29693 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
29694
29695 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
29696 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
29697
29698 /* Access to the vec_set patterns. */
29699 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
29700 "__builtin_ia32_vec_set_v2di",
29701 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
29702
29703 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
29704 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
29705
29706 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
29707 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
29708
29709 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
29710 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
29711
29712 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29713 "__builtin_ia32_vec_set_v4hi",
29714 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
29715
29716 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
29717 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
29718
29719 /* RDSEED */
29720 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
29721 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
29722 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
29723 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
29724 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
29725 "__builtin_ia32_rdseed_di_step",
29726 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
29727
29728 /* ADCX */
29729 def_builtin (0, "__builtin_ia32_addcarryx_u32",
29730 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
29731 def_builtin (OPTION_MASK_ISA_64BIT,
29732 "__builtin_ia32_addcarryx_u64",
29733 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
29734 IX86_BUILTIN_ADDCARRYX64);
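
/* Usage sketch (assumption from UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED
   above: carry-in, two addends, pointer for the sum; returns carry-out;
   'a' and 'b' are placeholder operands):

     unsigned int sum;
     unsigned char c_out = __builtin_ia32_addcarryx_u32 (0, a, b, &sum);
*/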
29735
29736 /* Add FMA4 multi-argument instructions.  */
29737 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29738 {
29739 if (d->name == 0)
29740 continue;
29741
29742 ftype = (enum ix86_builtin_func_type) d->flag;
29743 def_builtin_const (d->mask, d->name, ftype, d->code);
29744 }
29745 }
29746
29747 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
29748 to return a pointer to VERSION_DECL if the outcome of the expression
29749 formed by PREDICATE_CHAIN is true. This function will be called during
29750 version dispatch to decide which function version to execute. It returns
29751 the basic block at the end, to which more conditions can be added. */
29752
29753 static basic_block
29754 add_condition_to_bb (tree function_decl, tree version_decl,
29755 tree predicate_chain, basic_block new_bb)
29756 {
29757 gimple return_stmt;
29758 tree convert_expr, result_var;
29759 gimple convert_stmt;
29760 gimple call_cond_stmt;
29761 gimple if_else_stmt;
29762
29763 basic_block bb1, bb2, bb3;
29764 edge e12, e23;
29765
29766 tree cond_var, and_expr_var = NULL_TREE;
29767 gimple_seq gseq;
29768
29769 tree predicate_decl, predicate_arg;
29770
29771 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
29772
29773 gcc_assert (new_bb != NULL);
29774 gseq = bb_seq (new_bb);
29775
29776
29777 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
29778 build_fold_addr_expr (version_decl));
29779 result_var = create_tmp_var (ptr_type_node, NULL);
29780 convert_stmt = gimple_build_assign (result_var, convert_expr);
29781 return_stmt = gimple_build_return (result_var);
29782
29783 if (predicate_chain == NULL_TREE)
29784 {
29785 gimple_seq_add_stmt (&gseq, convert_stmt);
29786 gimple_seq_add_stmt (&gseq, return_stmt);
29787 set_bb_seq (new_bb, gseq);
29788 gimple_set_bb (convert_stmt, new_bb);
29789 gimple_set_bb (return_stmt, new_bb);
29790 pop_cfun ();
29791 return new_bb;
29792 }
29793
29794 while (predicate_chain != NULL)
29795 {
29796 cond_var = create_tmp_var (integer_type_node, NULL);
29797 predicate_decl = TREE_PURPOSE (predicate_chain);
29798 predicate_arg = TREE_VALUE (predicate_chain);
29799 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
29800 gimple_call_set_lhs (call_cond_stmt, cond_var);
29801
29802 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
29803 gimple_set_bb (call_cond_stmt, new_bb);
29804 gimple_seq_add_stmt (&gseq, call_cond_stmt);
29805
29806 predicate_chain = TREE_CHAIN (predicate_chain);
29807
29808 if (and_expr_var == NULL)
29809 and_expr_var = cond_var;
29810 else
29811 {
29812 gimple assign_stmt;
29813 /* Use MIN_EXPR to check whether any predicate result is zero:
29814 and_expr_var = min_expr <cond_var, and_expr_var>.  */
29815 assign_stmt = gimple_build_assign (and_expr_var,
29816 build2 (MIN_EXPR, integer_type_node,
29817 cond_var, and_expr_var));
29818
29819 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
29820 gimple_set_bb (assign_stmt, new_bb);
29821 gimple_seq_add_stmt (&gseq, assign_stmt);
29822 }
29823 }
29824
29825 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
29826 integer_zero_node,
29827 NULL_TREE, NULL_TREE);
29828 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
29829 gimple_set_bb (if_else_stmt, new_bb);
29830 gimple_seq_add_stmt (&gseq, if_else_stmt);
29831
29832 gimple_seq_add_stmt (&gseq, convert_stmt);
29833 gimple_seq_add_stmt (&gseq, return_stmt);
29834 set_bb_seq (new_bb, gseq);
29835
29836 bb1 = new_bb;
29837 e12 = split_block (bb1, if_else_stmt);
29838 bb2 = e12->dest;
29839 e12->flags &= ~EDGE_FALLTHRU;
29840 e12->flags |= EDGE_TRUE_VALUE;
29841
29842 e23 = split_block (bb2, return_stmt);
29843
29844 gimple_set_bb (convert_stmt, bb2);
29845 gimple_set_bb (return_stmt, bb2);
29846
29847 bb3 = e23->dest;
29848 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
29849
29850 remove_edge (e23);
29851 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
29852
29853 pop_cfun ();
29854
29855 return bb3;
29856 }
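
/* Illustrative sketch (hypothetical names; not emitted verbatim): for a
   version guarded by one predicate, the code built above corresponds to
   GIMPLE of roughly this shape; several predicates are combined into
   and_expr_var with MIN_EXPR before the single comparison:

     cond_var = __builtin_cpu_is ("core2");
     if (cond_var > 0) goto bb2; else goto bb3;
   bb2:
     result_var = (void *) &the_core2_version;
     return result_var;
   bb3:
     ;; next condition, or the unconditional default return
*/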
29857
29858 /* This parses the attribute arguments to target in DECL and determines
29859 the right builtin to use to match the platform specification.
29860 It returns the priority value for this version decl. If PREDICATE_LIST
29861 is not NULL, it stores the list of cpu features that need to be checked
29862 before dispatching this function. */
29863
29864 static unsigned int
29865 get_builtin_code_for_version (tree decl, tree *predicate_list)
29866 {
29867 tree attrs;
29868 struct cl_target_option cur_target;
29869 tree target_node;
29870 struct cl_target_option *new_target;
29871 const char *arg_str = NULL;
29872 const char *attrs_str = NULL;
29873 char *tok_str = NULL;
29874 char *token;
29875
29876 /* Priority of i386 features, greater value is higher priority. This is
29877 used to decide the order in which function dispatch must happen. For
29878 instance, a version specialized for SSE4.2 should be checked for dispatch
29879 before a version for SSE3, as SSE4.2 implies SSE3. */
29880 enum feature_priority
29881 {
29882 P_ZERO = 0,
29883 P_MMX,
29884 P_SSE,
29885 P_SSE2,
29886 P_SSE3,
29887 P_SSSE3,
29888 P_PROC_SSSE3,
29889 P_SSE4_a,
29890 P_PROC_SSE4_a,
29891 P_SSE4_1,
29892 P_SSE4_2,
29893 P_PROC_SSE4_2,
29894 P_POPCNT,
29895 P_AVX,
29896 P_AVX2,
29897 P_FMA,
29898 P_PROC_FMA
29899 };
29900
29901 enum feature_priority priority = P_ZERO;
29902
29903 /* These are the target attribute strings for which a dispatcher is
29904 available, from fold_builtin_cpu. */
29905
29906 static struct _feature_list
29907 {
29908 const char *const name;
29909 const enum feature_priority priority;
29910 }
29911 const feature_list[] =
29912 {
29913 {"mmx", P_MMX},
29914 {"sse", P_SSE},
29915 {"sse2", P_SSE2},
29916 {"sse3", P_SSE3},
29917 {"ssse3", P_SSSE3},
29918 {"sse4.1", P_SSE4_1},
29919 {"sse4.2", P_SSE4_2},
29920 {"popcnt", P_POPCNT},
29921 {"avx", P_AVX},
29922 {"avx2", P_AVX2}
29923 };
29924
29925
29926 static unsigned int NUM_FEATURES
29927 = sizeof (feature_list) / sizeof (struct _feature_list);
29928
29929 unsigned int i;
29930
29931 tree predicate_chain = NULL_TREE;
29932 tree predicate_decl, predicate_arg;
29933
29934 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29935 gcc_assert (attrs != NULL);
29936
29937 attrs = TREE_VALUE (TREE_VALUE (attrs));
29938
29939 gcc_assert (TREE_CODE (attrs) == STRING_CST);
29940 attrs_str = TREE_STRING_POINTER (attrs);
29941
29942 /* Return priority zero for default function. */
29943 if (strcmp (attrs_str, "default") == 0)
29944 return 0;
29945
29946 /* Handle arch= if specified. For priority, set it to be 1 more than
29947 the best instruction set the processor can handle. For instance, if
29948 there is a version for atom and a version for ssse3 (the highest ISA
29949 priority for atom), the atom version must be checked for dispatch
29950 before the ssse3 version. */
29951 if (strstr (attrs_str, "arch=") != NULL)
29952 {
29953 cl_target_option_save (&cur_target, &global_options);
29954 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
29955 &global_options_set);
29956
29957 gcc_assert (target_node);
29958 new_target = TREE_TARGET_OPTION (target_node);
29959 gcc_assert (new_target);
29960
29961 if (new_target->arch_specified && new_target->arch > 0)
29962 {
29963 switch (new_target->arch)
29964 {
29965 case PROCESSOR_CORE2:
29966 arg_str = "core2";
29967 priority = P_PROC_SSSE3;
29968 break;
29969 case PROCESSOR_COREI7:
29970 arg_str = "corei7";
29971 priority = P_PROC_SSE4_2;
29972 break;
29973 case PROCESSOR_COREI7_AVX:
29974 arg_str = "corei7-avx";
29975 priority = P_PROC_SSE4_2;
29976 break;
29977 case PROCESSOR_ATOM:
29978 arg_str = "atom";
29979 priority = P_PROC_SSSE3;
29980 break;
29981 case PROCESSOR_AMDFAM10:
29982 arg_str = "amdfam10h";
29983 priority = P_PROC_SSE4_a;
29984 break;
29985 case PROCESSOR_BDVER1:
29986 arg_str = "bdver1";
29987 priority = P_PROC_FMA;
29988 break;
29989 case PROCESSOR_BDVER2:
29990 arg_str = "bdver2";
29991 priority = P_PROC_FMA;
29992 break;
29993 }
29994 }
29995
29996 cl_target_option_restore (&global_options, &cur_target);
29997
29998 if (predicate_list && arg_str == NULL)
29999 {
30000 error_at (DECL_SOURCE_LOCATION (decl),
30001 "No dispatcher found for the versioning attributes");
30002 return 0;
30003 }
30004
30005 if (predicate_list)
30006 {
30007 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
30008 /* For a C string literal the length includes the trailing NULL. */
30009 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
30010 predicate_chain = tree_cons (predicate_decl, predicate_arg,
30011 predicate_chain);
30012 }
30013 }
30014
30015 /* Process feature name. */
30016 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
30017 strcpy (tok_str, attrs_str);
30018 token = strtok (tok_str, ",");
30019 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
30020
30021 while (token != NULL)
30022 {
30023 /* Do not process "arch=" */
30024 if (strncmp (token, "arch=", 5) == 0)
30025 {
30026 token = strtok (NULL, ",");
30027 continue;
30028 }
30029 for (i = 0; i < NUM_FEATURES; ++i)
30030 {
30031 if (strcmp (token, feature_list[i].name) == 0)
30032 {
30033 if (predicate_list)
30034 {
30035 predicate_arg = build_string_literal (
30036 strlen (feature_list[i].name) + 1,
30037 feature_list[i].name);
30038 predicate_chain = tree_cons (predicate_decl, predicate_arg,
30039 predicate_chain);
30040 }
30041 /* Find the maximum priority feature. */
30042 if (feature_list[i].priority > priority)
30043 priority = feature_list[i].priority;
30044
30045 break;
30046 }
30047 }
30048 if (predicate_list && i == NUM_FEATURES)
30049 {
30050 error_at (DECL_SOURCE_LOCATION (decl),
30051 "No dispatcher found for %s", token);
30052 return 0;
30053 }
30054 token = strtok (NULL, ",");
30055 }
30056 free (tok_str);
30057
30058 if (predicate_list && predicate_chain == NULL_TREE)
30059 {
30060 error_at (DECL_SOURCE_LOCATION (decl),
30061 "No dispatcher found for the versioning attributes : %s",
30062 attrs_str);
30063 return 0;
30064 }
30065 else if (predicate_list)
30066 {
30067 predicate_chain = nreverse (predicate_chain);
30068 *predicate_list = predicate_chain;
30069 }
30070
30071 return priority;
30072 }
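
/* Worked example (illustrative; names taken from feature_list above): for a
   version declared with __attribute__ ((target ("arch=core2"))) this
   returns P_PROC_SSSE3 and, when PREDICATE_LIST is non-NULL, records the
   single predicate __builtin_cpu_is ("core2").  For target ("sse4.2,popcnt")
   it records two __builtin_cpu_supports predicates and returns P_POPCNT,
   the higher of the two priorities.  */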
30073
30074 /* This compares the priority of target features in function DECL1
30075 and DECL2. It returns positive value if DECL1 is higher priority,
30076 negative value if DECL2 is higher priority and 0 if they are the
30077 same. */
30078
30079 static int
30080 ix86_compare_version_priority (tree decl1, tree decl2)
30081 {
30082 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
30083 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
30084
30085 return (int)priority1 - (int)priority2;
30086 }
30087
30088 /* V1 and V2 point to function versions with different priorities
30089 based on the target ISA. This function compares their priorities. */
30090
30091 static int
30092 feature_compare (const void *v1, const void *v2)
30093 {
30094 typedef struct _function_version_info
30095 {
30096 tree version_decl;
30097 tree predicate_chain;
30098 unsigned int dispatch_priority;
30099 } function_version_info;
30100
30101 const function_version_info c1 = *(const function_version_info *)v1;
30102 const function_version_info c2 = *(const function_version_info *)v2;
30103 return (c2.dispatch_priority - c1.dispatch_priority);
30104 }
30105
30106 /* This function generates the dispatch function for
30107 multi-versioned functions. DISPATCH_DECL is the function which will
30108 contain the dispatch logic. FNDECLS are the function choices for
30109 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
30110 in DISPATCH_DECL in which the dispatch code is generated. */
30111
30112 static int
30113 dispatch_function_versions (tree dispatch_decl,
30114 void *fndecls_p,
30115 basic_block *empty_bb)
30116 {
30117 tree default_decl;
30118 gimple ifunc_cpu_init_stmt;
30119 gimple_seq gseq;
30120 int ix;
30121 tree ele;
30122 vec<tree> *fndecls;
30123 unsigned int num_versions = 0;
30124 unsigned int actual_versions = 0;
30125 unsigned int i;
30126
30127 struct _function_version_info
30128 {
30129 tree version_decl;
30130 tree predicate_chain;
30131 unsigned int dispatch_priority;
30132 }*function_version_info;
30133
30134 gcc_assert (dispatch_decl != NULL
30135 && fndecls_p != NULL
30136 && empty_bb != NULL);
30137
30138 /* fndecls_p is actually a vector. */
30139 fndecls = static_cast<vec<tree> *> (fndecls_p);
30140
30141 /* At least one more version other than the default. */
30142 num_versions = fndecls->length ();
30143 gcc_assert (num_versions >= 2);
30144
30145 function_version_info = (struct _function_version_info *)
30146 XNEWVEC (struct _function_version_info, (num_versions - 1));
30147
30148 /* The first version in the vector is the default decl. */
30149 default_decl = (*fndecls)[0];
30150
30151 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
30152
30153 gseq = bb_seq (*empty_bb);
30154 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
30155 constructors, so explicitly call __builtin_cpu_init here. */
30156 ifunc_cpu_init_stmt = gimple_build_call_vec (
30157 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
30158 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
30159 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
30160 set_bb_seq (*empty_bb, gseq);
30161
30162 pop_cfun ();
30163
30164
30165 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
30166 {
30167 tree version_decl = ele;
30168 tree predicate_chain = NULL_TREE;
30169 unsigned int priority;
30170 /* Get attribute string, parse it and find the right predicate decl.
30171 The predicate function could be a lengthy combination of many
30172 features, like arch-type and various isa-variants. */
30173 priority = get_builtin_code_for_version (version_decl,
30174 &predicate_chain);
30175
30176 if (predicate_chain == NULL_TREE)
30177 continue;
30178
30179 function_version_info [actual_versions].version_decl = version_decl;
30180 function_version_info [actual_versions].predicate_chain
30181 = predicate_chain;
30182 function_version_info [actual_versions].dispatch_priority = priority;
30183 actual_versions++;
30184 }
30185
30186 /* Sort the versions according to descending order of dispatch priority.
30187 The priority is based on the ISA. This is not a perfect solution, as
30188 there could still be ambiguity: if more than one function version is
30189 suitable to execute, which one should be dispatched? In the future,
30190 allow the user to specify a dispatch priority next to the version. */
30191 qsort (function_version_info, actual_versions,
30192 sizeof (struct _function_version_info), feature_compare);
30193
30194 for (i = 0; i < actual_versions; ++i)
30195 *empty_bb = add_condition_to_bb (dispatch_decl,
30196 function_version_info[i].version_decl,
30197 function_version_info[i].predicate_chain,
30198 *empty_bb);
30199
30200 /* Dispatch the default version at the end. */
30201 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
30202 NULL, *empty_bb);
30203
30204 free (function_version_info);
30205 return 0;
30206 }
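
/* Conceptual result (illustrative sketch with hypothetical names): after
   the loop above, the resolver body filled into EMPTY_BB behaves like

     void *foo_resolver (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("corei7"))       return foo_arch_corei7;
       if (__builtin_cpu_supports ("avx"))    return foo_avx;
       ...
       return foo_default;                     // added last, unconditionally
     }

   with the tests ordered by descending dispatch priority.  */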
30207
30208 /* Comparator function to be used in qsort routine to sort attribute
30209 specification strings to "target". */
30210
30211 static int
30212 attr_strcmp (const void *v1, const void *v2)
30213 {
30214 const char *c1 = *(char *const*)v1;
30215 const char *c2 = *(char *const*)v2;
30216 return strcmp (c1, c2);
30217 }
30218
30219 /* ARGLIST is the argument to target attribute. This function tokenizes
30220 the comma separated arguments, sorts them and returns a string which
30221 is a unique identifier for the comma separated arguments. It also
30222 replaces non-identifier characters "=,-" with "_". */
30223
30224 static char *
30225 sorted_attr_string (tree arglist)
30226 {
30227 tree arg;
30228 size_t str_len_sum = 0;
30229 char **args = NULL;
30230 char *attr_str, *ret_str;
30231 char *attr = NULL;
30232 unsigned int argnum = 1;
30233 unsigned int i;
30234
30235 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
30236 {
30237 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
30238 size_t len = strlen (str);
30239 str_len_sum += len + 1;
30240 if (arg != arglist)
30241 argnum++;
30242 for (i = 0; i < strlen (str); i++)
30243 if (str[i] == ',')
30244 argnum++;
30245 }
30246
30247 attr_str = XNEWVEC (char, str_len_sum);
30248 str_len_sum = 0;
30249 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
30250 {
30251 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
30252 size_t len = strlen (str);
30253 memcpy (attr_str + str_len_sum, str, len);
30254 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
30255 str_len_sum += len + 1;
30256 }
30257
30258 /* Replace "=,-" with "_". */
30259 for (i = 0; i < strlen (attr_str); i++)
30260 if (attr_str[i] == '=' || attr_str[i]== '-')
30261 attr_str[i] = '_';
30262
30263 if (argnum == 1)
30264 return attr_str;
30265
30266 args = XNEWVEC (char *, argnum);
30267
30268 i = 0;
30269 attr = strtok (attr_str, ",");
30270 while (attr != NULL)
30271 {
30272 args[i] = attr;
30273 i++;
30274 attr = strtok (NULL, ",");
30275 }
30276
30277 qsort (args, argnum, sizeof (char *), attr_strcmp);
30278
30279 ret_str = XNEWVEC (char, str_len_sum);
30280 str_len_sum = 0;
30281 for (i = 0; i < argnum; i++)
30282 {
30283 size_t len = strlen (args[i]);
30284 memcpy (ret_str + str_len_sum, args[i], len);
30285 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
30286 str_len_sum += len + 1;
30287 }
30288
30289 XDELETEVEC (args);
30290 XDELETEVEC (attr_str);
30291 return ret_str;
30292 }
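
/* Example (illustrative): the attribute arguments "avx2,arch=core2" are
   rewritten to "avx2,arch_core2", tokenized, sorted, and rejoined as
   "arch_core2_avx2"; a single argument such as "sse4.1" is returned as
   "sse4.1" with only '=' and '-' replaced.  */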
30293
30294 /* This function changes the assembler name for functions that are
30295 versions. If DECL is a function version and has a "target"
30296 attribute, it appends the attribute string to its assembler name. */
30297
30298 static tree
30299 ix86_mangle_function_version_assembler_name (tree decl, tree id)
30300 {
30301 tree version_attr;
30302 const char *orig_name, *version_string;
30303 char *attr_str, *assembler_name;
30304
30305 if (DECL_DECLARED_INLINE_P (decl)
30306 && lookup_attribute ("gnu_inline",
30307 DECL_ATTRIBUTES (decl)))
30308 error_at (DECL_SOURCE_LOCATION (decl),
30309 "Function versions cannot be marked as gnu_inline,"
30310 " bodies have to be generated");
30311
30312 if (DECL_VIRTUAL_P (decl)
30313 || DECL_VINDEX (decl))
30314 sorry ("Virtual function multiversioning not supported");
30315
30316 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
30317
30318 /* target attribute string cannot be NULL. */
30319 gcc_assert (version_attr != NULL_TREE);
30320
30321 orig_name = IDENTIFIER_POINTER (id);
30322 version_string
30323 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
30324
30325 if (strcmp (version_string, "default") == 0)
30326 return id;
30327
30328 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
30329 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
30330
30331 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
30332
30333 /* Allow assembler name to be modified if already set. */
30334 if (DECL_ASSEMBLER_NAME_SET_P (decl))
30335 SET_DECL_RTL (decl, NULL);
30336
30337 tree ret = get_identifier (assembler_name);
30338 XDELETEVEC (attr_str);
30339 XDELETEVEC (assembler_name);
30340 return ret;
30341 }
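
/* Example (illustrative, hypothetical function name): a version of foo
   declared with target ("avx") gets the assembler name "foo.avx", while
   target ("arch=core2") yields "foo.arch_core2"; the "default" version
   keeps its original name.  */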
30342
30343 /* This function returns true if FN1 and FN2 are versions of the same function,
30344 that is, the target strings of the function decls are different. This assumes
30345 that FN1 and FN2 have the same signature. */
30346
30347 static bool
30348 ix86_function_versions (tree fn1, tree fn2)
30349 {
30350 tree attr1, attr2;
30351 char *target1, *target2;
30352 bool result;
30353
30354 if (TREE_CODE (fn1) != FUNCTION_DECL
30355 || TREE_CODE (fn2) != FUNCTION_DECL)
30356 return false;
30357
30358 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
30359 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
30360
30361 /* At least one function decl should have the target attribute specified. */
30362 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
30363 return false;
30364
30365 /* Diagnose missing target attribute if one of the decls is already
30366 multi-versioned. */
30367 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
30368 {
30369 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
30370 {
30371 if (attr2 != NULL_TREE)
30372 {
30373 tree tem = fn1;
30374 fn1 = fn2;
30375 fn2 = tem;
30376 attr1 = attr2;
30377 }
30378 error_at (DECL_SOURCE_LOCATION (fn2),
30379 "missing %<target%> attribute for multi-versioned %D",
30380 fn2);
30381 inform (DECL_SOURCE_LOCATION (fn1),
30382 "previous declaration of %D", fn1);
30383 /* Prevent diagnosing of the same error multiple times. */
30384 DECL_ATTRIBUTES (fn2)
30385 = tree_cons (get_identifier ("target"),
30386 copy_node (TREE_VALUE (attr1)),
30387 DECL_ATTRIBUTES (fn2));
30388 }
30389 return false;
30390 }
30391
30392 target1 = sorted_attr_string (TREE_VALUE (attr1));
30393 target2 = sorted_attr_string (TREE_VALUE (attr2));
30394
30395 /* The sorted target strings must be different for fn1 and fn2
30396 to be versions. */
30397 if (strcmp (target1, target2) == 0)
30398 result = false;
30399 else
30400 result = true;
30401
30402 XDELETEVEC (target1);
30403 XDELETEVEC (target2);
30404
30405 return result;
30406 }
30407
30408 static tree
30409 ix86_mangle_decl_assembler_name (tree decl, tree id)
30410 {
30411 /* For function version, add the target suffix to the assembler name. */
30412 if (TREE_CODE (decl) == FUNCTION_DECL
30413 && DECL_FUNCTION_VERSIONED (decl))
30414 id = ix86_mangle_function_version_assembler_name (decl, id);
30415 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
30416 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
30417 #endif
30418
30419 return id;
30420 }
30421
30422 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
30423 is true, append the full path name of the source file. */
30424
30425 static char *
30426 make_name (tree decl, const char *suffix, bool make_unique)
30427 {
30428 char *global_var_name;
30429 int name_len;
30430 const char *name;
30431 const char *unique_name = NULL;
30432
30433 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
30434
30435 /* Get a unique name that can be used globally without any chances
30436 of collision at link time. */
30437 if (make_unique)
30438 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
30439
30440 name_len = strlen (name) + strlen (suffix) + 2;
30441
30442 if (make_unique)
30443 name_len += strlen (unique_name) + 1;
30444 global_var_name = XNEWVEC (char, name_len);
30445
30446 /* Use '.' to concatenate names as it is demangler friendly. */
30447 if (make_unique)
30448 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
30449 suffix);
30450 else
30451 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
30452
30453 return global_var_name;
30454 }
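
/* Example (illustrative): make_name (decl, "resolver", false) produces
   "foo.resolver" for a function foo; with MAKE_UNIQUE true, a file-unique
   component obtained from get_file_function_name is spliced in, giving
   a name of the form "foo.<unique>.resolver".  */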
30455
30456 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
30457
30458 /* Make a dispatcher declaration for the multi-versioned function DECL.
30459 Calls to the function DECL will be replaced with calls to the dispatcher
30460 by the front-end. Return the decl created. */
30461
30462 static tree
30463 make_dispatcher_decl (const tree decl)
30464 {
30465 tree func_decl;
30466 char *func_name;
30467 tree fn_type, func_type;
30468 bool is_uniq = false;
30469
30470 if (TREE_PUBLIC (decl) == 0)
30471 is_uniq = true;
30472
30473 func_name = make_name (decl, "ifunc", is_uniq);
30474
30475 fn_type = TREE_TYPE (decl);
30476 func_type = build_function_type (TREE_TYPE (fn_type),
30477 TYPE_ARG_TYPES (fn_type));
30478
30479 func_decl = build_fn_decl (func_name, func_type);
30480 XDELETEVEC (func_name);
30481 TREE_USED (func_decl) = 1;
30482 DECL_CONTEXT (func_decl) = NULL_TREE;
30483 DECL_INITIAL (func_decl) = error_mark_node;
30484 DECL_ARTIFICIAL (func_decl) = 1;
30485 /* Mark this function as external; the resolver will flip it again if
30486 it gets generated. */
30487 DECL_EXTERNAL (func_decl) = 1;
30488 /* IFUNCs have to be externally visible. */
30489 TREE_PUBLIC (func_decl) = 1;
30490
30491 return func_decl;
30492 }
30493
30494 #endif
30495
30496 /* Returns true if DECL is multi-versioned and is the default function,
30497 that is, it is not tagged with a target-specific optimization. */
30498
30499 static bool
30500 is_function_default_version (const tree decl)
30501 {
30502 if (TREE_CODE (decl) != FUNCTION_DECL
30503 || !DECL_FUNCTION_VERSIONED (decl))
30504 return false;
30505 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
30506 gcc_assert (attr);
30507 attr = TREE_VALUE (TREE_VALUE (attr));
30508 return (TREE_CODE (attr) == STRING_CST
30509 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
30510 }
30511
30512 /* Make a dispatcher declaration for the multi-versioned function DECL.
30513 Calls to the function DECL will be replaced with calls to the dispatcher
30514 by the front-end. Returns the decl of the dispatcher function. */
30515
30516 static tree
30517 ix86_get_function_versions_dispatcher (void *decl)
30518 {
30519 tree fn = (tree) decl;
30520 struct cgraph_node *node = NULL;
30521 struct cgraph_node *default_node = NULL;
30522 struct cgraph_function_version_info *node_v = NULL;
30523 struct cgraph_function_version_info *first_v = NULL;
30524
30525 tree dispatch_decl = NULL;
30526
30527 struct cgraph_function_version_info *default_version_info = NULL;
30528
30529 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
30530
30531 node = cgraph_get_node (fn);
30532 gcc_assert (node != NULL);
30533
30534 node_v = get_cgraph_node_version (node);
30535 gcc_assert (node_v != NULL);
30536
30537 if (node_v->dispatcher_resolver != NULL)
30538 return node_v->dispatcher_resolver;
30539
30540 /* Find the default version and make it the first node. */
30541 first_v = node_v;
30542 /* Go to the beginning of the chain. */
30543 while (first_v->prev != NULL)
30544 first_v = first_v->prev;
30545 default_version_info = first_v;
30546 while (default_version_info != NULL)
30547 {
30548 if (is_function_default_version
30549 (default_version_info->this_node->decl))
30550 break;
30551 default_version_info = default_version_info->next;
30552 }
30553
30554 /* If there is no default node, just return NULL. */
30555 if (default_version_info == NULL)
30556 return NULL;
30557
30558 /* Make default info the first node. */
30559 if (first_v != default_version_info)
30560 {
30561 default_version_info->prev->next = default_version_info->next;
30562 if (default_version_info->next)
30563 default_version_info->next->prev = default_version_info->prev;
30564 first_v->prev = default_version_info;
30565 default_version_info->next = first_v;
30566 default_version_info->prev = NULL;
30567 }
30568
30569 default_node = default_version_info->this_node;
30570
30571 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
30572 if (targetm.has_ifunc_p ())
30573 {
30574 struct cgraph_function_version_info *it_v = NULL;
30575 struct cgraph_node *dispatcher_node = NULL;
30576 struct cgraph_function_version_info *dispatcher_version_info = NULL;
30577
30578 /* Right now, the dispatching is done via ifunc. */
30579 dispatch_decl = make_dispatcher_decl (default_node->decl);
30580
30581 dispatcher_node = cgraph_get_create_node (dispatch_decl);
30582 gcc_assert (dispatcher_node != NULL);
30583 dispatcher_node->dispatcher_function = 1;
30584 dispatcher_version_info
30585 = insert_new_cgraph_node_version (dispatcher_node);
30586 dispatcher_version_info->next = default_version_info;
30587 dispatcher_node->definition = 1;
30588
30589 /* Set the dispatcher for all the versions. */
30590 it_v = default_version_info;
30591 while (it_v != NULL)
30592 {
30593 it_v->dispatcher_resolver = dispatch_decl;
30594 it_v = it_v->next;
30595 }
30596 }
30597 else
30598 #endif
30599 {
30600 error_at (DECL_SOURCE_LOCATION (default_node->decl),
30601 "multiversioning needs ifunc which is not supported "
30602 "on this target");
30603 }
30604
30605 return dispatch_decl;
30606 }
30607
30608 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
30609 it to CHAIN. */
30610
30611 static tree
30612 make_attribute (const char *name, const char *arg_name, tree chain)
30613 {
30614 tree attr_name;
30615 tree attr_arg_name;
30616 tree attr_args;
30617 tree attr;
30618
30619 attr_name = get_identifier (name);
30620 attr_arg_name = build_string (strlen (arg_name), arg_name);
30621 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
30622 attr = tree_cons (attr_name, attr_args, chain);
30623 return attr;
30624 }
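
/* Example (illustrative): make_attribute ("ifunc", "foo.resolver", NULL_TREE)
   builds the attribute list corresponding to
   __attribute__ ((ifunc ("foo.resolver"))), as used for the dispatcher
   decl in make_resolver_func below.  */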
30625
30626 /* Make the resolver function decl to dispatch the versions of
30627 a multi-versioned function, DEFAULT_DECL. Create an
30628 empty basic block in the resolver and store the pointer in
30629 EMPTY_BB. Return the decl of the resolver function. */
30630
30631 static tree
30632 make_resolver_func (const tree default_decl,
30633 const tree dispatch_decl,
30634 basic_block *empty_bb)
30635 {
30636 char *resolver_name;
30637 tree decl, type, decl_name, t;
30638 bool is_uniq = false;
30639
30640 /* IFUNCs have to be globally visible. So, if the default_decl is
30641 not, then the name of the IFUNC should be made unique. */
30642 if (TREE_PUBLIC (default_decl) == 0)
30643 is_uniq = true;
30644
30645 /* Append the filename to the resolver function if the versions are
30646 not externally visible. This is because the resolver function has
30647 to be externally visible for the loader to find it. So, appending
30648 the filename will prevent conflicts with a resolver function from
30649 another module which is based on the same version name. */
30650 resolver_name = make_name (default_decl, "resolver", is_uniq);
30651
30652 /* The resolver function should return a (void *). */
30653 type = build_function_type_list (ptr_type_node, NULL_TREE);
30654
30655 decl = build_fn_decl (resolver_name, type);
30656 decl_name = get_identifier (resolver_name);
30657 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
30658
30659 DECL_NAME (decl) = decl_name;
30660 TREE_USED (decl) = 1;
30661 DECL_ARTIFICIAL (decl) = 1;
30662 DECL_IGNORED_P (decl) = 0;
30663 /* IFUNC resolvers have to be externally visible. */
30664 TREE_PUBLIC (decl) = 1;
30665 DECL_UNINLINABLE (decl) = 1;
30666
30667 /* Resolver is not external, body is generated. */
30668 DECL_EXTERNAL (decl) = 0;
30669 DECL_EXTERNAL (dispatch_decl) = 0;
30670
30671 DECL_CONTEXT (decl) = NULL_TREE;
30672 DECL_INITIAL (decl) = make_node (BLOCK);
30673 DECL_STATIC_CONSTRUCTOR (decl) = 0;
30674
30675 if (DECL_COMDAT_GROUP (default_decl)
30676 || TREE_PUBLIC (default_decl))
30677 {
30678 /* In this case, each translation unit with a call to this
30679 versioned function will put out a resolver. Ensure it
30680 is comdat to keep just one copy. */
30681 DECL_COMDAT (decl) = 1;
30682 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
30683 }
30684 /* Build result decl and add to function_decl. */
30685 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
30686 DECL_ARTIFICIAL (t) = 1;
30687 DECL_IGNORED_P (t) = 1;
30688 DECL_RESULT (decl) = t;
30689
30690 gimplify_function_tree (decl);
30691 push_cfun (DECL_STRUCT_FUNCTION (decl));
30692 *empty_bb = init_lowered_empty_function (decl, false);
30693
30694 cgraph_add_new_function (decl, true);
30695 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
30696
30697 pop_cfun ();
30698
30699 gcc_assert (dispatch_decl != NULL);
30700 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
30701 DECL_ATTRIBUTES (dispatch_decl)
30702 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
30703
30704 /* Create the alias for dispatch to resolver here. */
30705 /*cgraph_create_function_alias (dispatch_decl, decl);*/
30706 cgraph_same_body_alias (NULL, dispatch_decl, decl);
30707 XDELETEVEC (resolver_name);
30708 return decl;
30709 }
30710
30711 /* Generate the dispatching code body to dispatch multi-versioned function
30712 DECL. The target hook is called to process the "target" attributes and
30713 provide the code to dispatch the right function at run-time. NODE points
30714 to the dispatcher decl whose body will be created. */
30715
30716 static tree
30717 ix86_generate_version_dispatcher_body (void *node_p)
30718 {
30719 tree resolver_decl;
30720 basic_block empty_bb;
30721 tree default_ver_decl;
30722 struct cgraph_node *versn;
30723 struct cgraph_node *node;
30724
30725 struct cgraph_function_version_info *node_version_info = NULL;
30726 struct cgraph_function_version_info *versn_info = NULL;
30727
30728 node = (cgraph_node *)node_p;
30729
30730 node_version_info = get_cgraph_node_version (node);
30731 gcc_assert (node->dispatcher_function
30732 && node_version_info != NULL);
30733
30734 if (node_version_info->dispatcher_resolver)
30735 return node_version_info->dispatcher_resolver;
30736
30737 /* The first version in the chain corresponds to the default version. */
30738 default_ver_decl = node_version_info->next->this_node->decl;
30739
30740 /* node is going to be an alias, so remove the finalized bit. */
30741 node->definition = false;
30742
30743 resolver_decl = make_resolver_func (default_ver_decl,
30744 node->decl, &empty_bb);
30745
30746 node_version_info->dispatcher_resolver = resolver_decl;
30747
30748 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
30749
30750 stack_vec<tree, 2> fn_ver_vec;
30751
30752 for (versn_info = node_version_info->next; versn_info;
30753 versn_info = versn_info->next)
30754 {
30755 versn = versn_info->this_node;
30756 /* Check for virtual functions here again, as by this time it should
30757 have been determined if this function needs a vtable index or
30758 not. This happens for methods in derived classes that override
30759 virtual methods in base classes but are not explicitly marked as
30760 virtual. */
30761 if (DECL_VINDEX (versn->decl))
30762 sorry ("Virtual function multiversioning not supported");
30763
30764 fn_ver_vec.safe_push (versn->decl);
30765 }
30766
30767 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
30768 rebuild_cgraph_edges ();
30769 pop_cfun ();
30770 return resolver_decl;
30771 }
30772 /* This builds the processor_model struct type defined in
30773 libgcc/config/i386/cpuinfo.c */
30774
30775 static tree
30776 build_processor_model_struct (void)
30777 {
30778 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
30779 "__cpu_features"};
30780 tree field = NULL_TREE, field_chain = NULL_TREE;
30781 int i;
30782 tree type = make_node (RECORD_TYPE);
30783
30784 /* The first 3 fields are unsigned int. */
30785 for (i = 0; i < 3; ++i)
30786 {
30787 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30788 get_identifier (field_name[i]), unsigned_type_node);
30789 if (field_chain != NULL_TREE)
30790 DECL_CHAIN (field) = field_chain;
30791 field_chain = field;
30792 }
30793
30794 /* The last field is an array of unsigned integers of size one. */
30795 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30796 get_identifier (field_name[3]),
30797 build_array_type (unsigned_type_node,
30798 build_index_type (size_one_node)));
30799 if (field_chain != NULL_TREE)
30800 DECL_CHAIN (field) = field_chain;
30801 field_chain = field;
30802
30803 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
30804 return type;
30805 }
30806
30807 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
30808
30809 static tree
30810 make_var_decl (tree type, const char *name)
30811 {
30812 tree new_decl;
30813
30814 new_decl = build_decl (UNKNOWN_LOCATION,
30815 VAR_DECL,
30816 get_identifier(name),
30817 type);
30818
30819 DECL_EXTERNAL (new_decl) = 1;
30820 TREE_STATIC (new_decl) = 1;
30821 TREE_PUBLIC (new_decl) = 1;
30822 DECL_INITIAL (new_decl) = 0;
30823 DECL_ARTIFICIAL (new_decl) = 0;
30824 DECL_PRESERVE_P (new_decl) = 1;
30825
30826 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
30827 assemble_variable (new_decl, 0, 0, 0);
30828
30829 return new_decl;
30830 }
30831
30832 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
30833 into a check against the CPU data defined in libgcc/config/i386/cpuinfo.c. */
30834
30835 static tree
30836 fold_builtin_cpu (tree fndecl, tree *args)
30837 {
30838 unsigned int i;
30839 enum ix86_builtins fn_code = (enum ix86_builtins)
30840 DECL_FUNCTION_CODE (fndecl);
30841 tree param_string_cst = NULL;
30842
30843 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
30844 enum processor_features
30845 {
30846 F_CMOV = 0,
30847 F_MMX,
30848 F_POPCNT,
30849 F_SSE,
30850 F_SSE2,
30851 F_SSE3,
30852 F_SSSE3,
30853 F_SSE4_1,
30854 F_SSE4_2,
30855 F_AVX,
30856 F_AVX2,
30857 F_MAX
30858 };
30859
30860 /* These are the values for vendor types, CPU types and subtypes
30861 in cpuinfo.c.  CPU types and subtypes should have the corresponding
30862 start value subtracted. */
30863 enum processor_model
30864 {
30865 M_INTEL = 1,
30866 M_AMD,
30867 M_CPU_TYPE_START,
30868 M_INTEL_ATOM,
30869 M_INTEL_CORE2,
30870 M_INTEL_COREI7,
30871 M_AMDFAM10H,
30872 M_AMDFAM15H,
30873 M_INTEL_SLM,
30874 M_CPU_SUBTYPE_START,
30875 M_INTEL_COREI7_NEHALEM,
30876 M_INTEL_COREI7_WESTMERE,
30877 M_INTEL_COREI7_SANDYBRIDGE,
30878 M_AMDFAM10H_BARCELONA,
30879 M_AMDFAM10H_SHANGHAI,
30880 M_AMDFAM10H_ISTANBUL,
30881 M_AMDFAM15H_BDVER1,
30882 M_AMDFAM15H_BDVER2,
30883 M_AMDFAM15H_BDVER3,
30884 M_AMDFAM15H_BDVER4
30885 };
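
/* A sketch of how the two cases handled below fold (derived from the
   code that follows; the names come from the enums above):

     __builtin_cpu_is ("nehalem")
       -> __cpu_model.__cpu_subtype
            == (M_INTEL_COREI7_NEHALEM - M_CPU_SUBTYPE_START)

     __builtin_cpu_supports ("avx2")
       -> __cpu_model.__cpu_features[0] & (1 << F_AVX2)  */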
30886
30887 static struct _arch_names_table
30888 {
30889 const char *const name;
30890 const enum processor_model model;
30891 }
30892 const arch_names_table[] =
30893 {
30894 {"amd", M_AMD},
30895 {"intel", M_INTEL},
30896 {"atom", M_INTEL_ATOM},
30897 {"slm", M_INTEL_SLM},
30898 {"core2", M_INTEL_CORE2},
30899 {"corei7", M_INTEL_COREI7},
30900 {"nehalem", M_INTEL_COREI7_NEHALEM},
30901 {"westmere", M_INTEL_COREI7_WESTMERE},
30902 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
30903 {"amdfam10h", M_AMDFAM10H},
30904 {"barcelona", M_AMDFAM10H_BARCELONA},
30905 {"shanghai", M_AMDFAM10H_SHANGHAI},
30906 {"istanbul", M_AMDFAM10H_ISTANBUL},
30907 {"amdfam15h", M_AMDFAM15H},
30908 {"bdver1", M_AMDFAM15H_BDVER1},
30909 {"bdver2", M_AMDFAM15H_BDVER2},
30910 {"bdver3", M_AMDFAM15H_BDVER3},
30911 {"bdver4", M_AMDFAM15H_BDVER4},
30912 };
30913
30914 static struct _isa_names_table
30915 {
30916 const char *const name;
30917 const enum processor_features feature;
30918 }
30919 const isa_names_table[] =
30920 {
30921 {"cmov", F_CMOV},
30922 {"mmx", F_MMX},
30923 {"popcnt", F_POPCNT},
30924 {"sse", F_SSE},
30925 {"sse2", F_SSE2},
30926 {"sse3", F_SSE3},
30927 {"ssse3", F_SSSE3},
30928 {"sse4.1", F_SSE4_1},
30929 {"sse4.2", F_SSE4_2},
30930 {"avx", F_AVX},
30931 {"avx2", F_AVX2}
30932 };
30933
30934 tree __processor_model_type = build_processor_model_struct ();
30935 tree __cpu_model_var = make_var_decl (__processor_model_type,
30936 "__cpu_model");
30937
30938
30939 varpool_add_new_variable (__cpu_model_var);
30940
30941 gcc_assert ((args != NULL) && (*args != NULL));
30942
30943 param_string_cst = *args;
30944 while (param_string_cst
30945 && TREE_CODE (param_string_cst) != STRING_CST)
30946 {
30947 /* *args must be an expr that can contain other EXPRs leading to a
30948 STRING_CST. */
30949 if (!EXPR_P (param_string_cst))
30950 {
30951 error ("Parameter to builtin must be a string constant or literal");
30952 return integer_zero_node;
30953 }
30954 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
30955 }
30956
30957 gcc_assert (param_string_cst);
30958
30959 if (fn_code == IX86_BUILTIN_CPU_IS)
30960 {
30961 tree ref;
30962 tree field;
30963 tree final;
30964
30965 unsigned int field_val = 0;
30966 unsigned int NUM_ARCH_NAMES
30967 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
30968
30969 for (i = 0; i < NUM_ARCH_NAMES; i++)
30970 if (strcmp (arch_names_table[i].name,
30971 TREE_STRING_POINTER (param_string_cst)) == 0)
30972 break;
30973
30974 if (i == NUM_ARCH_NAMES)
30975 {
30976 error ("Parameter to builtin not valid: %s",
30977 TREE_STRING_POINTER (param_string_cst));
30978 return integer_zero_node;
30979 }
30980
30981 field = TYPE_FIELDS (__processor_model_type);
30982 field_val = arch_names_table[i].model;
30983
30984 /* CPU types are stored in the next field. */
30985 if (field_val > M_CPU_TYPE_START
30986 && field_val < M_CPU_SUBTYPE_START)
30987 {
30988 field = DECL_CHAIN (field);
30989 field_val -= M_CPU_TYPE_START;
30990 }
30991
30992 /* CPU subtypes are stored in the next field. */
30993 if (field_val > M_CPU_SUBTYPE_START)
30994 {
30995 field = DECL_CHAIN (DECL_CHAIN (field));
30996 field_val -= M_CPU_SUBTYPE_START;
30997 }
30998
30999 /* Get the appropriate field in __cpu_model. */
31000 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
31001 field, NULL_TREE);
31002
31003 /* Check the value. */
31004 final = build2 (EQ_EXPR, unsigned_type_node, ref,
31005 build_int_cstu (unsigned_type_node, field_val));
31006 return build1 (CONVERT_EXPR, integer_type_node, final);
31007 }
31008 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
31009 {
31010 tree ref;
31011 tree array_elt;
31012 tree field;
31013 tree final;
31014
31015 unsigned int field_val = 0;
31016 unsigned int NUM_ISA_NAMES
31017 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
31018
31019 for (i = 0; i < NUM_ISA_NAMES; i++)
31020 if (strcmp (isa_names_table[i].name,
31021 TREE_STRING_POINTER (param_string_cst)) == 0)
31022 break;
31023
31024 if (i == NUM_ISA_NAMES)
31025 {
31026 error ("Parameter to builtin not valid: %s",
31027 TREE_STRING_POINTER (param_string_cst));
31028 return integer_zero_node;
31029 }
31030
31031 field = TYPE_FIELDS (__processor_model_type);
31032 /* Get the last field, which is __cpu_features. */
31033 while (DECL_CHAIN (field))
31034 field = DECL_CHAIN (field);
31035
31036 /* Get the appropriate field: __cpu_model.__cpu_features */
31037 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
31038 field, NULL_TREE);
31039
31040 /* Access the 0th element of __cpu_features array. */
31041 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
31042 integer_zero_node, NULL_TREE, NULL_TREE);
31043
31044 field_val = (1 << isa_names_table[i].feature);
31045 /* Return __cpu_model.__cpu_features[0] & field_val */
31046 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
31047 build_int_cstu (unsigned_type_node, field_val));
31048 return build1 (CONVERT_EXPR, integer_type_node, final);
31049 }
31050 gcc_unreachable ();
31051 }
31052
31053 static tree
31054 ix86_fold_builtin (tree fndecl, int n_args,
31055 tree *args, bool ignore ATTRIBUTE_UNUSED)
31056 {
31057 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
31058 {
31059 enum ix86_builtins fn_code = (enum ix86_builtins)
31060 DECL_FUNCTION_CODE (fndecl);
31061 if (fn_code == IX86_BUILTIN_CPU_IS
31062 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
31063 {
31064 gcc_assert (n_args == 1);
31065 return fold_builtin_cpu (fndecl, args);
31066 }
31067 }
31068
31069 #ifdef SUBTARGET_FOLD_BUILTIN
31070 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
31071 #endif
31072
31073 return NULL_TREE;
31074 }
31075
31076 /* Make builtins to detect cpu type and features supported. NAME is
31077 the builtin name, CODE is the builtin code, and FTYPE is the function
31078 type of the builtin. */
31079
31080 static void
31081 make_cpu_type_builtin (const char* name, int code,
31082 enum ix86_builtin_func_type ftype, bool is_const)
31083 {
31084 tree decl;
31085 tree type;
31086
31087 type = ix86_get_builtin_func_type (ftype);
31088 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
31089 NULL, NULL_TREE);
31090 gcc_assert (decl != NULL_TREE);
31091 ix86_builtins[(int) code] = decl;
31092 TREE_READONLY (decl) = is_const;
31093 }
31094
31095 /* Make builtins to get CPU type and features supported. The created
31096 builtins are:
31097
31098 __builtin_cpu_init (), to detect cpu type and features,
31099 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
31100 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
31101 */
31102
31103 static void
31104 ix86_init_platform_type_builtins (void)
31105 {
31106 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
31107 INT_FTYPE_VOID, false);
31108 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
31109 INT_FTYPE_PCCHAR, true);
31110 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
31111 INT_FTYPE_PCCHAR, true);
31112 }
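
/* Illustrative use of the builtins created above, from user code (the
   helper name use_avx2_path is hypothetical, not part of GCC):

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("corei7") && __builtin_cpu_supports ("avx2"))
       use_avx2_path ();

   The string arguments must match entries in arch_names_table and
   isa_names_table in fold_builtin_cpu above.  */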
31113
31114 /* Internal method for ix86_init_builtins. */
31115
31116 static void
31117 ix86_init_builtins_va_builtins_abi (void)
31118 {
31119 tree ms_va_ref, sysv_va_ref;
31120 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
31121 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
31122 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
31123 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
31124
31125 if (!TARGET_64BIT)
31126 return;
31127 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
31128 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
31129 ms_va_ref = build_reference_type (ms_va_list_type_node);
31130 sysv_va_ref =
31131 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
31132
31133 fnvoid_va_end_ms =
31134 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
31135 fnvoid_va_start_ms =
31136 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
31137 fnvoid_va_end_sysv =
31138 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
31139 fnvoid_va_start_sysv =
31140 build_varargs_function_type_list (void_type_node, sysv_va_ref,
31141 NULL_TREE);
31142 fnvoid_va_copy_ms =
31143 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
31144 NULL_TREE);
31145 fnvoid_va_copy_sysv =
31146 build_function_type_list (void_type_node, sysv_va_ref,
31147 sysv_va_ref, NULL_TREE);
31148
31149 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
31150 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
31151 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
31152 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
31153 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
31154 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
31155 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
31156 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
31157 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
31158 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
31159 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
31160 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
31161 }
31162
31163 static void
31164 ix86_init_builtin_types (void)
31165 {
31166 tree float128_type_node, float80_type_node;
31167
31168 /* The __float80 type. */
31169 float80_type_node = long_double_type_node;
31170 if (TYPE_MODE (float80_type_node) != XFmode)
31171 {
31172 /* The __float80 type. */
31173 float80_type_node = make_node (REAL_TYPE);
31174
31175 TYPE_PRECISION (float80_type_node) = 80;
31176 layout_type (float80_type_node);
31177 }
31178 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
31179
31180 /* The __float128 type. */
31181 float128_type_node = make_node (REAL_TYPE);
31182 TYPE_PRECISION (float128_type_node) = 128;
31183 layout_type (float128_type_node);
31184 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
31185
31186 /* This macro is built by i386-builtin-types.awk. */
31187 DEFINE_BUILTIN_PRIMITIVE_TYPES;
31188 }
31189
31190 static void
31191 ix86_init_builtins (void)
31192 {
31193 tree t;
31194
31195 ix86_init_builtin_types ();
31196
31197 /* Builtins to get CPU type and features. */
31198 ix86_init_platform_type_builtins ();
31199
31200 /* TFmode support builtins. */
31201 def_builtin_const (0, "__builtin_infq",
31202 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
31203 def_builtin_const (0, "__builtin_huge_valq",
31204 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
31205
31206 /* We will expand them to a normal call if SSE isn't available, since
31207 they are used by libgcc. */
31208 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
31209 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
31210 BUILT_IN_MD, "__fabstf2", NULL_TREE);
31211 TREE_READONLY (t) = 1;
31212 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
31213
31214 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
31215 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
31216 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
31217 TREE_READONLY (t) = 1;
31218 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
31219
31220 ix86_init_tm_builtins ();
31221 ix86_init_mmx_sse_builtins ();
31222
31223 if (TARGET_LP64)
31224 ix86_init_builtins_va_builtins_abi ();
31225
31226 #ifdef SUBTARGET_INIT_BUILTINS
31227 SUBTARGET_INIT_BUILTINS;
31228 #endif
31229 }
31230
31231 /* Return the ix86 builtin for CODE. */
31232
31233 static tree
31234 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
31235 {
31236 if (code >= IX86_BUILTIN_MAX)
31237 return error_mark_node;
31238
31239 return ix86_builtins[code];
31240 }
31241
31242 /* Errors in the source file can cause expand_expr to return const0_rtx
31243 where we expect a vector. To avoid crashing, use one of the vector
31244 clear instructions. */
31245 static rtx
31246 safe_vector_operand (rtx x, enum machine_mode mode)
31247 {
31248 if (x == const0_rtx)
31249 x = CONST0_RTX (mode);
31250 return x;
31251 }
31252
31253 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
31254
31255 static rtx
31256 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
31257 {
31258 rtx pat;
31259 tree arg0 = CALL_EXPR_ARG (exp, 0);
31260 tree arg1 = CALL_EXPR_ARG (exp, 1);
31261 rtx op0 = expand_normal (arg0);
31262 rtx op1 = expand_normal (arg1);
31263 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31264 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
31265 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
31266
31267 if (VECTOR_MODE_P (mode0))
31268 op0 = safe_vector_operand (op0, mode0);
31269 if (VECTOR_MODE_P (mode1))
31270 op1 = safe_vector_operand (op1, mode1);
31271
31272 if (optimize || !target
31273 || GET_MODE (target) != tmode
31274 || !insn_data[icode].operand[0].predicate (target, tmode))
31275 target = gen_reg_rtx (tmode);
31276
31277 if (GET_MODE (op1) == SImode && mode1 == TImode)
31278 {
31279 rtx x = gen_reg_rtx (V4SImode);
31280 emit_insn (gen_sse2_loadd (x, op1));
31281 op1 = gen_lowpart (TImode, x);
31282 }
31283
31284 if (!insn_data[icode].operand[1].predicate (op0, mode0))
31285 op0 = copy_to_mode_reg (mode0, op0);
31286 if (!insn_data[icode].operand[2].predicate (op1, mode1))
31287 op1 = copy_to_mode_reg (mode1, op1);
31288
31289 pat = GEN_FCN (icode) (target, op0, op1);
31290 if (! pat)
31291 return 0;
31292
31293 emit_insn (pat);
31294
31295 return target;
31296 }
31297
31298 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
31299
31300 static rtx
31301 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
31302 enum ix86_builtin_func_type m_type,
31303 enum rtx_code sub_code)
31304 {
31305 rtx pat;
31306 int i;
31307 int nargs;
31308 bool comparison_p = false;
31309 bool tf_p = false;
31310 bool last_arg_constant = false;
31311 int num_memory = 0;
31312 struct {
31313 rtx op;
31314 enum machine_mode mode;
31315 } args[4];
31316
31317 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31318
31319 switch (m_type)
31320 {
31321 case MULTI_ARG_4_DF2_DI_I:
31322 case MULTI_ARG_4_DF2_DI_I1:
31323 case MULTI_ARG_4_SF2_SI_I:
31324 case MULTI_ARG_4_SF2_SI_I1:
31325 nargs = 4;
31326 last_arg_constant = true;
31327 break;
31328
31329 case MULTI_ARG_3_SF:
31330 case MULTI_ARG_3_DF:
31331 case MULTI_ARG_3_SF2:
31332 case MULTI_ARG_3_DF2:
31333 case MULTI_ARG_3_DI:
31334 case MULTI_ARG_3_SI:
31335 case MULTI_ARG_3_SI_DI:
31336 case MULTI_ARG_3_HI:
31337 case MULTI_ARG_3_HI_SI:
31338 case MULTI_ARG_3_QI:
31339 case MULTI_ARG_3_DI2:
31340 case MULTI_ARG_3_SI2:
31341 case MULTI_ARG_3_HI2:
31342 case MULTI_ARG_3_QI2:
31343 nargs = 3;
31344 break;
31345
31346 case MULTI_ARG_2_SF:
31347 case MULTI_ARG_2_DF:
31348 case MULTI_ARG_2_DI:
31349 case MULTI_ARG_2_SI:
31350 case MULTI_ARG_2_HI:
31351 case MULTI_ARG_2_QI:
31352 nargs = 2;
31353 break;
31354
31355 case MULTI_ARG_2_DI_IMM:
31356 case MULTI_ARG_2_SI_IMM:
31357 case MULTI_ARG_2_HI_IMM:
31358 case MULTI_ARG_2_QI_IMM:
31359 nargs = 2;
31360 last_arg_constant = true;
31361 break;
31362
31363 case MULTI_ARG_1_SF:
31364 case MULTI_ARG_1_DF:
31365 case MULTI_ARG_1_SF2:
31366 case MULTI_ARG_1_DF2:
31367 case MULTI_ARG_1_DI:
31368 case MULTI_ARG_1_SI:
31369 case MULTI_ARG_1_HI:
31370 case MULTI_ARG_1_QI:
31371 case MULTI_ARG_1_SI_DI:
31372 case MULTI_ARG_1_HI_DI:
31373 case MULTI_ARG_1_HI_SI:
31374 case MULTI_ARG_1_QI_DI:
31375 case MULTI_ARG_1_QI_SI:
31376 case MULTI_ARG_1_QI_HI:
31377 nargs = 1;
31378 break;
31379
31380 case MULTI_ARG_2_DI_CMP:
31381 case MULTI_ARG_2_SI_CMP:
31382 case MULTI_ARG_2_HI_CMP:
31383 case MULTI_ARG_2_QI_CMP:
31384 nargs = 2;
31385 comparison_p = true;
31386 break;
31387
31388 case MULTI_ARG_2_SF_TF:
31389 case MULTI_ARG_2_DF_TF:
31390 case MULTI_ARG_2_DI_TF:
31391 case MULTI_ARG_2_SI_TF:
31392 case MULTI_ARG_2_HI_TF:
31393 case MULTI_ARG_2_QI_TF:
31394 nargs = 2;
31395 tf_p = true;
31396 break;
31397
31398 default:
31399 gcc_unreachable ();
31400 }
31401
31402 if (optimize || !target
31403 || GET_MODE (target) != tmode
31404 || !insn_data[icode].operand[0].predicate (target, tmode))
31405 target = gen_reg_rtx (tmode);
31406
31407 gcc_assert (nargs <= 4);
31408
31409 for (i = 0; i < nargs; i++)
31410 {
31411 tree arg = CALL_EXPR_ARG (exp, i);
31412 rtx op = expand_normal (arg);
31413 int adjust = (comparison_p) ? 1 : 0;
31414 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
31415
31416 if (last_arg_constant && i == nargs - 1)
31417 {
31418 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
31419 {
31420 enum insn_code new_icode = icode;
31421 switch (icode)
31422 {
31423 case CODE_FOR_xop_vpermil2v2df3:
31424 case CODE_FOR_xop_vpermil2v4sf3:
31425 case CODE_FOR_xop_vpermil2v4df3:
31426 case CODE_FOR_xop_vpermil2v8sf3:
31427 error ("the last argument must be a 2-bit immediate");
31428 return gen_reg_rtx (tmode);
31429 case CODE_FOR_xop_rotlv2di3:
31430 new_icode = CODE_FOR_rotlv2di3;
31431 goto xop_rotl;
31432 case CODE_FOR_xop_rotlv4si3:
31433 new_icode = CODE_FOR_rotlv4si3;
31434 goto xop_rotl;
31435 case CODE_FOR_xop_rotlv8hi3:
31436 new_icode = CODE_FOR_rotlv8hi3;
31437 goto xop_rotl;
31438 case CODE_FOR_xop_rotlv16qi3:
31439 new_icode = CODE_FOR_rotlv16qi3;
31440 xop_rotl:
31441 if (CONST_INT_P (op))
31442 {
31443 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
31444 op = GEN_INT (INTVAL (op) & mask);
31445 gcc_checking_assert
31446 (insn_data[icode].operand[i + 1].predicate (op, mode));
31447 }
31448 else
31449 {
31450 gcc_checking_assert
31451 (nargs == 2
31452 && insn_data[new_icode].operand[0].mode == tmode
31453 && insn_data[new_icode].operand[1].mode == tmode
31454 && insn_data[new_icode].operand[2].mode == mode
31455 && insn_data[new_icode].operand[0].predicate
31456 == insn_data[icode].operand[0].predicate
31457 && insn_data[new_icode].operand[1].predicate
31458 == insn_data[icode].operand[1].predicate);
31459 icode = new_icode;
31460 goto non_constant;
31461 }
31462 break;
31463 default:
31464 gcc_unreachable ();
31465 }
31466 }
31467 }
31468 else
31469 {
31470 non_constant:
31471 if (VECTOR_MODE_P (mode))
31472 op = safe_vector_operand (op, mode);
31473
31474 /* If we aren't optimizing, only allow one memory operand to be
31475 generated. */
31476 if (memory_operand (op, mode))
31477 num_memory++;
31478
31479 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
31480
31481 if (optimize
31482 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
31483 || num_memory > 1)
31484 op = force_reg (mode, op);
31485 }
31486
31487 args[i].op = op;
31488 args[i].mode = mode;
31489 }
31490
31491 switch (nargs)
31492 {
31493 case 1:
31494 pat = GEN_FCN (icode) (target, args[0].op);
31495 break;
31496
31497 case 2:
31498 if (tf_p)
31499 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
31500 GEN_INT ((int)sub_code));
31501 else if (! comparison_p)
31502 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31503 else
31504 {
31505 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
31506 args[0].op,
31507 args[1].op);
31508
31509 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
31510 }
31511 break;
31512
31513 case 3:
31514 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31515 break;
31516
31517 case 4:
31518 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
31519 break;
31520
31521 default:
31522 gcc_unreachable ();
31523 }
31524
31525 if (! pat)
31526 return 0;
31527
31528 emit_insn (pat);
31529 return target;
31530 }
31531
31532 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
31533 insns with vec_merge. */
31534
31535 static rtx
31536 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
31537 rtx target)
31538 {
31539 rtx pat;
31540 tree arg0 = CALL_EXPR_ARG (exp, 0);
31541 rtx op1, op0 = expand_normal (arg0);
31542 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31543 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
31544
31545 if (optimize || !target
31546 || GET_MODE (target) != tmode
31547 || !insn_data[icode].operand[0].predicate (target, tmode))
31548 target = gen_reg_rtx (tmode);
31549
31550 if (VECTOR_MODE_P (mode0))
31551 op0 = safe_vector_operand (op0, mode0);
31552
31553 if ((optimize && !register_operand (op0, mode0))
31554 || !insn_data[icode].operand[1].predicate (op0, mode0))
31555 op0 = copy_to_mode_reg (mode0, op0);
31556
31557 op1 = op0;
31558 if (!insn_data[icode].operand[2].predicate (op1, mode0))
31559 op1 = copy_to_mode_reg (mode0, op1);
31560
31561 pat = GEN_FCN (icode) (target, op0, op1);
31562 if (! pat)
31563 return 0;
31564 emit_insn (pat);
31565 return target;
31566 }
31567
31568 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
31569
31570 static rtx
31571 ix86_expand_sse_compare (const struct builtin_description *d,
31572 tree exp, rtx target, bool swap)
31573 {
31574 rtx pat;
31575 tree arg0 = CALL_EXPR_ARG (exp, 0);
31576 tree arg1 = CALL_EXPR_ARG (exp, 1);
31577 rtx op0 = expand_normal (arg0);
31578 rtx op1 = expand_normal (arg1);
31579 rtx op2;
31580 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31581 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31582 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31583 enum rtx_code comparison = d->comparison;
31584
31585 if (VECTOR_MODE_P (mode0))
31586 op0 = safe_vector_operand (op0, mode0);
31587 if (VECTOR_MODE_P (mode1))
31588 op1 = safe_vector_operand (op1, mode1);
31589
31590 /* Swap operands if we have a comparison that isn't available in
31591 hardware. */
31592 if (swap)
31593 {
31594 rtx tmp = gen_reg_rtx (mode1);
31595 emit_move_insn (tmp, op1);
31596 op1 = op0;
31597 op0 = tmp;
31598 }
31599
31600 if (optimize || !target
31601 || GET_MODE (target) != tmode
31602 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31603 target = gen_reg_rtx (tmode);
31604
31605 if ((optimize && !register_operand (op0, mode0))
31606 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
31607 op0 = copy_to_mode_reg (mode0, op0);
31608 if ((optimize && !register_operand (op1, mode1))
31609 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
31610 op1 = copy_to_mode_reg (mode1, op1);
31611
31612 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
31613 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31614 if (! pat)
31615 return 0;
31616 emit_insn (pat);
31617 return target;
31618 }
31619
31620 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
31621
31622 static rtx
31623 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
31624 rtx target)
31625 {
31626 rtx pat;
31627 tree arg0 = CALL_EXPR_ARG (exp, 0);
31628 tree arg1 = CALL_EXPR_ARG (exp, 1);
31629 rtx op0 = expand_normal (arg0);
31630 rtx op1 = expand_normal (arg1);
31631 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31632 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31633 enum rtx_code comparison = d->comparison;
31634
31635 if (VECTOR_MODE_P (mode0))
31636 op0 = safe_vector_operand (op0, mode0);
31637 if (VECTOR_MODE_P (mode1))
31638 op1 = safe_vector_operand (op1, mode1);
31639
31640 /* Swap operands if we have a comparison that isn't available in
31641 hardware. */
31642 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
31643 {
31644 rtx tmp = op1;
31645 op1 = op0;
31646 op0 = tmp;
31647 }
31648
31649 target = gen_reg_rtx (SImode);
31650 emit_move_insn (target, const0_rtx);
31651 target = gen_rtx_SUBREG (QImode, target, 0);
31652
31653 if ((optimize && !register_operand (op0, mode0))
31654 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31655 op0 = copy_to_mode_reg (mode0, op0);
31656 if ((optimize && !register_operand (op1, mode1))
31657 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31658 op1 = copy_to_mode_reg (mode1, op1);
31659
31660 pat = GEN_FCN (d->icode) (op0, op1);
31661 if (! pat)
31662 return 0;
31663 emit_insn (pat);
31664 emit_insn (gen_rtx_SET (VOIDmode,
31665 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31666 gen_rtx_fmt_ee (comparison, QImode,
31667 SET_DEST (pat),
31668 const0_rtx)));
31669
31670 return SUBREG_REG (target);
31671 }
31672
31673 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
31674
31675 static rtx
31676 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
31677 rtx target)
31678 {
31679 rtx pat;
31680 tree arg0 = CALL_EXPR_ARG (exp, 0);
31681 rtx op1, op0 = expand_normal (arg0);
31682 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31683 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31684
31685 if (optimize || target == 0
31686 || GET_MODE (target) != tmode
31687 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31688 target = gen_reg_rtx (tmode);
31689
31690 if (VECTOR_MODE_P (mode0))
31691 op0 = safe_vector_operand (op0, mode0);
31692
31693 if ((optimize && !register_operand (op0, mode0))
31694 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31695 op0 = copy_to_mode_reg (mode0, op0);
31696
31697 op1 = GEN_INT (d->comparison);
31698
31699 pat = GEN_FCN (d->icode) (target, op0, op1);
31700 if (! pat)
31701 return 0;
31702 emit_insn (pat);
31703 return target;
31704 }
31705
31706 static rtx
31707 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
31708 tree exp, rtx target)
31709 {
31710 rtx pat;
31711 tree arg0 = CALL_EXPR_ARG (exp, 0);
31712 tree arg1 = CALL_EXPR_ARG (exp, 1);
31713 rtx op0 = expand_normal (arg0);
31714 rtx op1 = expand_normal (arg1);
31715 rtx op2;
31716 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31717 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31718 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31719
31720 if (optimize || target == 0
31721 || GET_MODE (target) != tmode
31722 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31723 target = gen_reg_rtx (tmode);
31724
31725 op0 = safe_vector_operand (op0, mode0);
31726 op1 = safe_vector_operand (op1, mode1);
31727
31728 if ((optimize && !register_operand (op0, mode0))
31729 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31730 op0 = copy_to_mode_reg (mode0, op0);
31731 if ((optimize && !register_operand (op1, mode1))
31732 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31733 op1 = copy_to_mode_reg (mode1, op1);
31734
31735 op2 = GEN_INT (d->comparison);
31736
31737 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31738 if (! pat)
31739 return 0;
31740 emit_insn (pat);
31741 return target;
31742 }
31743
31744 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
31745
31746 static rtx
31747 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
31748 rtx target)
31749 {
31750 rtx pat;
31751 tree arg0 = CALL_EXPR_ARG (exp, 0);
31752 tree arg1 = CALL_EXPR_ARG (exp, 1);
31753 rtx op0 = expand_normal (arg0);
31754 rtx op1 = expand_normal (arg1);
31755 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31756 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31757 enum rtx_code comparison = d->comparison;
31758
31759 if (VECTOR_MODE_P (mode0))
31760 op0 = safe_vector_operand (op0, mode0);
31761 if (VECTOR_MODE_P (mode1))
31762 op1 = safe_vector_operand (op1, mode1);
31763
31764 target = gen_reg_rtx (SImode);
31765 emit_move_insn (target, const0_rtx);
31766 target = gen_rtx_SUBREG (QImode, target, 0);
31767
31768 if ((optimize && !register_operand (op0, mode0))
31769 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31770 op0 = copy_to_mode_reg (mode0, op0);
31771 if ((optimize && !register_operand (op1, mode1))
31772 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31773 op1 = copy_to_mode_reg (mode1, op1);
31774
31775 pat = GEN_FCN (d->icode) (op0, op1);
31776 if (! pat)
31777 return 0;
31778 emit_insn (pat);
31779 emit_insn (gen_rtx_SET (VOIDmode,
31780 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31781 gen_rtx_fmt_ee (comparison, QImode,
31782 SET_DEST (pat),
31783 const0_rtx)));
31784
31785 return SUBREG_REG (target);
31786 }
31787
31788 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
31789
31790 static rtx
31791 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
31792 tree exp, rtx target)
31793 {
31794 rtx pat;
31795 tree arg0 = CALL_EXPR_ARG (exp, 0);
31796 tree arg1 = CALL_EXPR_ARG (exp, 1);
31797 tree arg2 = CALL_EXPR_ARG (exp, 2);
31798 tree arg3 = CALL_EXPR_ARG (exp, 3);
31799 tree arg4 = CALL_EXPR_ARG (exp, 4);
31800 rtx scratch0, scratch1;
31801 rtx op0 = expand_normal (arg0);
31802 rtx op1 = expand_normal (arg1);
31803 rtx op2 = expand_normal (arg2);
31804 rtx op3 = expand_normal (arg3);
31805 rtx op4 = expand_normal (arg4);
31806 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
31807
31808 tmode0 = insn_data[d->icode].operand[0].mode;
31809 tmode1 = insn_data[d->icode].operand[1].mode;
31810 modev2 = insn_data[d->icode].operand[2].mode;
31811 modei3 = insn_data[d->icode].operand[3].mode;
31812 modev4 = insn_data[d->icode].operand[4].mode;
31813 modei5 = insn_data[d->icode].operand[5].mode;
31814 modeimm = insn_data[d->icode].operand[6].mode;
31815
31816 if (VECTOR_MODE_P (modev2))
31817 op0 = safe_vector_operand (op0, modev2);
31818 if (VECTOR_MODE_P (modev4))
31819 op2 = safe_vector_operand (op2, modev4);
31820
31821 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31822 op0 = copy_to_mode_reg (modev2, op0);
31823 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
31824 op1 = copy_to_mode_reg (modei3, op1);
31825 if ((optimize && !register_operand (op2, modev4))
31826 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
31827 op2 = copy_to_mode_reg (modev4, op2);
31828 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
31829 op3 = copy_to_mode_reg (modei5, op3);
31830
31831 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
31832 {
31833 error ("the fifth argument must be an 8-bit immediate");
31834 return const0_rtx;
31835 }
31836
31837 if (d->code == IX86_BUILTIN_PCMPESTRI128)
31838 {
31839 if (optimize || !target
31840 || GET_MODE (target) != tmode0
31841 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31842 target = gen_reg_rtx (tmode0);
31843
31844 scratch1 = gen_reg_rtx (tmode1);
31845
31846 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
31847 }
31848 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
31849 {
31850 if (optimize || !target
31851 || GET_MODE (target) != tmode1
31852 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31853 target = gen_reg_rtx (tmode1);
31854
31855 scratch0 = gen_reg_rtx (tmode0);
31856
31857 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
31858 }
31859 else
31860 {
31861 gcc_assert (d->flag);
31862
31863 scratch0 = gen_reg_rtx (tmode0);
31864 scratch1 = gen_reg_rtx (tmode1);
31865
31866 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
31867 }
31868
31869 if (! pat)
31870 return 0;
31871
31872 emit_insn (pat);
31873
31874 if (d->flag)
31875 {
31876 target = gen_reg_rtx (SImode);
31877 emit_move_insn (target, const0_rtx);
31878 target = gen_rtx_SUBREG (QImode, target, 0);
31879
31880 emit_insn
31881 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31882 gen_rtx_fmt_ee (EQ, QImode,
31883 gen_rtx_REG ((enum machine_mode) d->flag,
31884 FLAGS_REG),
31885 const0_rtx)));
31886 return SUBREG_REG (target);
31887 }
31888 else
31889 return target;
31890 }
31891
31892
31893 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
31894
31895 static rtx
31896 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
31897 tree exp, rtx target)
31898 {
31899 rtx pat;
31900 tree arg0 = CALL_EXPR_ARG (exp, 0);
31901 tree arg1 = CALL_EXPR_ARG (exp, 1);
31902 tree arg2 = CALL_EXPR_ARG (exp, 2);
31903 rtx scratch0, scratch1;
31904 rtx op0 = expand_normal (arg0);
31905 rtx op1 = expand_normal (arg1);
31906 rtx op2 = expand_normal (arg2);
31907 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
31908
31909 tmode0 = insn_data[d->icode].operand[0].mode;
31910 tmode1 = insn_data[d->icode].operand[1].mode;
31911 modev2 = insn_data[d->icode].operand[2].mode;
31912 modev3 = insn_data[d->icode].operand[3].mode;
31913 modeimm = insn_data[d->icode].operand[4].mode;
31914
31915 if (VECTOR_MODE_P (modev2))
31916 op0 = safe_vector_operand (op0, modev2);
31917 if (VECTOR_MODE_P (modev3))
31918 op1 = safe_vector_operand (op1, modev3);
31919
31920 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31921 op0 = copy_to_mode_reg (modev2, op0);
31922 if ((optimize && !register_operand (op1, modev3))
31923 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
31924 op1 = copy_to_mode_reg (modev3, op1);
31925
31926 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
31927 {
31928 error ("the third argument must be an 8-bit immediate");
31929 return const0_rtx;
31930 }
31931
31932 if (d->code == IX86_BUILTIN_PCMPISTRI128)
31933 {
31934 if (optimize || !target
31935 || GET_MODE (target) != tmode0
31936 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31937 target = gen_reg_rtx (tmode0);
31938
31939 scratch1 = gen_reg_rtx (tmode1);
31940
31941 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
31942 }
31943 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
31944 {
31945 if (optimize || !target
31946 || GET_MODE (target) != tmode1
31947 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31948 target = gen_reg_rtx (tmode1);
31949
31950 scratch0 = gen_reg_rtx (tmode0);
31951
31952 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
31953 }
31954 else
31955 {
31956 gcc_assert (d->flag);
31957
31958 scratch0 = gen_reg_rtx (tmode0);
31959 scratch1 = gen_reg_rtx (tmode1);
31960
31961 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
31962 }
31963
31964 if (! pat)
31965 return 0;
31966
31967 emit_insn (pat);
31968
31969 if (d->flag)
31970 {
31971 target = gen_reg_rtx (SImode);
31972 emit_move_insn (target, const0_rtx);
31973 target = gen_rtx_SUBREG (QImode, target, 0);
31974
31975 emit_insn
31976 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31977 gen_rtx_fmt_ee (EQ, QImode,
31978 gen_rtx_REG ((enum machine_mode) d->flag,
31979 FLAGS_REG),
31980 const0_rtx)));
31981 return SUBREG_REG (target);
31982 }
31983 else
31984 return target;
31985 }
31986
31987 /* Subroutine of ix86_expand_builtin to take care of insns with
31988 variable number of operands. */
31989
31990 static rtx
31991 ix86_expand_args_builtin (const struct builtin_description *d,
31992 tree exp, rtx target)
31993 {
31994 rtx pat, real_target;
31995 unsigned int i, nargs;
31996 unsigned int nargs_constant = 0;
31997 int num_memory = 0;
31998 struct
31999 {
32000 rtx op;
32001 enum machine_mode mode;
32002 } args[4];
32003 bool last_arg_count = false;
32004 enum insn_code icode = d->icode;
32005 const struct insn_data_d *insn_p = &insn_data[icode];
32006 enum machine_mode tmode = insn_p->operand[0].mode;
32007 enum machine_mode rmode = VOIDmode;
32008 bool swap = false;
32009 enum rtx_code comparison = d->comparison;
32010
32011 switch ((enum ix86_builtin_func_type) d->flag)
32012 {
32013 case V2DF_FTYPE_V2DF_ROUND:
32014 case V4DF_FTYPE_V4DF_ROUND:
32015 case V4SF_FTYPE_V4SF_ROUND:
32016 case V8SF_FTYPE_V8SF_ROUND:
32017 case V4SI_FTYPE_V4SF_ROUND:
32018 case V8SI_FTYPE_V8SF_ROUND:
32019 return ix86_expand_sse_round (d, exp, target);
32020 case V4SI_FTYPE_V2DF_V2DF_ROUND:
32021 case V8SI_FTYPE_V4DF_V4DF_ROUND:
32022 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
32023 case INT_FTYPE_V8SF_V8SF_PTEST:
32024 case INT_FTYPE_V4DI_V4DI_PTEST:
32025 case INT_FTYPE_V4DF_V4DF_PTEST:
32026 case INT_FTYPE_V4SF_V4SF_PTEST:
32027 case INT_FTYPE_V2DI_V2DI_PTEST:
32028 case INT_FTYPE_V2DF_V2DF_PTEST:
32029 return ix86_expand_sse_ptest (d, exp, target);
32030 case FLOAT128_FTYPE_FLOAT128:
32031 case FLOAT_FTYPE_FLOAT:
32032 case INT_FTYPE_INT:
32033 case UINT64_FTYPE_INT:
32034 case UINT16_FTYPE_UINT16:
32035 case INT64_FTYPE_INT64:
32036 case INT64_FTYPE_V4SF:
32037 case INT64_FTYPE_V2DF:
32038 case INT_FTYPE_V16QI:
32039 case INT_FTYPE_V8QI:
32040 case INT_FTYPE_V8SF:
32041 case INT_FTYPE_V4DF:
32042 case INT_FTYPE_V4SF:
32043 case INT_FTYPE_V2DF:
32044 case INT_FTYPE_V32QI:
32045 case V16QI_FTYPE_V16QI:
32046 case V8SI_FTYPE_V8SF:
32047 case V8SI_FTYPE_V4SI:
32048 case V8HI_FTYPE_V8HI:
32049 case V8HI_FTYPE_V16QI:
32050 case V8QI_FTYPE_V8QI:
32051 case V8SF_FTYPE_V8SF:
32052 case V8SF_FTYPE_V8SI:
32053 case V8SF_FTYPE_V4SF:
32054 case V8SF_FTYPE_V8HI:
32055 case V4SI_FTYPE_V4SI:
32056 case V4SI_FTYPE_V16QI:
32057 case V4SI_FTYPE_V4SF:
32058 case V4SI_FTYPE_V8SI:
32059 case V4SI_FTYPE_V8HI:
32060 case V4SI_FTYPE_V4DF:
32061 case V4SI_FTYPE_V2DF:
32062 case V4HI_FTYPE_V4HI:
32063 case V4DF_FTYPE_V4DF:
32064 case V4DF_FTYPE_V4SI:
32065 case V4DF_FTYPE_V4SF:
32066 case V4DF_FTYPE_V2DF:
32067 case V4SF_FTYPE_V4SF:
32068 case V4SF_FTYPE_V4SI:
32069 case V4SF_FTYPE_V8SF:
32070 case V4SF_FTYPE_V4DF:
32071 case V4SF_FTYPE_V8HI:
32072 case V4SF_FTYPE_V2DF:
32073 case V2DI_FTYPE_V2DI:
32074 case V2DI_FTYPE_V16QI:
32075 case V2DI_FTYPE_V8HI:
32076 case V2DI_FTYPE_V4SI:
32077 case V2DF_FTYPE_V2DF:
32078 case V2DF_FTYPE_V4SI:
32079 case V2DF_FTYPE_V4DF:
32080 case V2DF_FTYPE_V4SF:
32081 case V2DF_FTYPE_V2SI:
32082 case V2SI_FTYPE_V2SI:
32083 case V2SI_FTYPE_V4SF:
32084 case V2SI_FTYPE_V2SF:
32085 case V2SI_FTYPE_V2DF:
32086 case V2SF_FTYPE_V2SF:
32087 case V2SF_FTYPE_V2SI:
32088 case V32QI_FTYPE_V32QI:
32089 case V32QI_FTYPE_V16QI:
32090 case V16HI_FTYPE_V16HI:
32091 case V16HI_FTYPE_V8HI:
32092 case V8SI_FTYPE_V8SI:
32093 case V16HI_FTYPE_V16QI:
32094 case V8SI_FTYPE_V16QI:
32095 case V4DI_FTYPE_V16QI:
32096 case V8SI_FTYPE_V8HI:
32097 case V4DI_FTYPE_V8HI:
32098 case V4DI_FTYPE_V4SI:
32099 case V4DI_FTYPE_V2DI:
32100 nargs = 1;
32101 break;
32102 case V4SF_FTYPE_V4SF_VEC_MERGE:
32103 case V2DF_FTYPE_V2DF_VEC_MERGE:
32104 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
32105 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
32106 case V16QI_FTYPE_V16QI_V16QI:
32107 case V16QI_FTYPE_V8HI_V8HI:
32108 case V8QI_FTYPE_V8QI_V8QI:
32109 case V8QI_FTYPE_V4HI_V4HI:
32110 case V8HI_FTYPE_V8HI_V8HI:
32111 case V8HI_FTYPE_V16QI_V16QI:
32112 case V8HI_FTYPE_V4SI_V4SI:
32113 case V8SF_FTYPE_V8SF_V8SF:
32114 case V8SF_FTYPE_V8SF_V8SI:
32115 case V4SI_FTYPE_V4SI_V4SI:
32116 case V4SI_FTYPE_V8HI_V8HI:
32117 case V4SI_FTYPE_V4SF_V4SF:
32118 case V4SI_FTYPE_V2DF_V2DF:
32119 case V4HI_FTYPE_V4HI_V4HI:
32120 case V4HI_FTYPE_V8QI_V8QI:
32121 case V4HI_FTYPE_V2SI_V2SI:
32122 case V4DF_FTYPE_V4DF_V4DF:
32123 case V4DF_FTYPE_V4DF_V4DI:
32124 case V4SF_FTYPE_V4SF_V4SF:
32125 case V4SF_FTYPE_V4SF_V4SI:
32126 case V4SF_FTYPE_V4SF_V2SI:
32127 case V4SF_FTYPE_V4SF_V2DF:
32128 case V4SF_FTYPE_V4SF_DI:
32129 case V4SF_FTYPE_V4SF_SI:
32130 case V2DI_FTYPE_V2DI_V2DI:
32131 case V2DI_FTYPE_V16QI_V16QI:
32132 case V2DI_FTYPE_V4SI_V4SI:
32133 case V2UDI_FTYPE_V4USI_V4USI:
32134 case V2DI_FTYPE_V2DI_V16QI:
32135 case V2DI_FTYPE_V2DF_V2DF:
32136 case V2SI_FTYPE_V2SI_V2SI:
32137 case V2SI_FTYPE_V4HI_V4HI:
32138 case V2SI_FTYPE_V2SF_V2SF:
32139 case V2DF_FTYPE_V2DF_V2DF:
32140 case V2DF_FTYPE_V2DF_V4SF:
32141 case V2DF_FTYPE_V2DF_V2DI:
32142 case V2DF_FTYPE_V2DF_DI:
32143 case V2DF_FTYPE_V2DF_SI:
32144 case V2SF_FTYPE_V2SF_V2SF:
32145 case V1DI_FTYPE_V1DI_V1DI:
32146 case V1DI_FTYPE_V8QI_V8QI:
32147 case V1DI_FTYPE_V2SI_V2SI:
32148 case V32QI_FTYPE_V16HI_V16HI:
32149 case V16HI_FTYPE_V8SI_V8SI:
32150 case V32QI_FTYPE_V32QI_V32QI:
32151 case V16HI_FTYPE_V32QI_V32QI:
32152 case V16HI_FTYPE_V16HI_V16HI:
32153 case V8SI_FTYPE_V4DF_V4DF:
32154 case V8SI_FTYPE_V8SI_V8SI:
32155 case V8SI_FTYPE_V16HI_V16HI:
32156 case V4DI_FTYPE_V4DI_V4DI:
32157 case V4DI_FTYPE_V8SI_V8SI:
32158 case V4UDI_FTYPE_V8USI_V8USI:
32159 if (comparison == UNKNOWN)
32160 return ix86_expand_binop_builtin (icode, exp, target);
32161 nargs = 2;
32162 break;
32163 case V4SF_FTYPE_V4SF_V4SF_SWAP:
32164 case V2DF_FTYPE_V2DF_V2DF_SWAP:
32165 gcc_assert (comparison != UNKNOWN);
32166 nargs = 2;
32167 swap = true;
32168 break;
32169 case V16HI_FTYPE_V16HI_V8HI_COUNT:
32170 case V16HI_FTYPE_V16HI_SI_COUNT:
32171 case V8SI_FTYPE_V8SI_V4SI_COUNT:
32172 case V8SI_FTYPE_V8SI_SI_COUNT:
32173 case V4DI_FTYPE_V4DI_V2DI_COUNT:
32174 case V4DI_FTYPE_V4DI_INT_COUNT:
32175 case V8HI_FTYPE_V8HI_V8HI_COUNT:
32176 case V8HI_FTYPE_V8HI_SI_COUNT:
32177 case V4SI_FTYPE_V4SI_V4SI_COUNT:
32178 case V4SI_FTYPE_V4SI_SI_COUNT:
32179 case V4HI_FTYPE_V4HI_V4HI_COUNT:
32180 case V4HI_FTYPE_V4HI_SI_COUNT:
32181 case V2DI_FTYPE_V2DI_V2DI_COUNT:
32182 case V2DI_FTYPE_V2DI_SI_COUNT:
32183 case V2SI_FTYPE_V2SI_V2SI_COUNT:
32184 case V2SI_FTYPE_V2SI_SI_COUNT:
32185 case V1DI_FTYPE_V1DI_V1DI_COUNT:
32186 case V1DI_FTYPE_V1DI_SI_COUNT:
32187 nargs = 2;
32188 last_arg_count = true;
32189 break;
32190 case UINT64_FTYPE_UINT64_UINT64:
32191 case UINT_FTYPE_UINT_UINT:
32192 case UINT_FTYPE_UINT_USHORT:
32193 case UINT_FTYPE_UINT_UCHAR:
32194 case UINT16_FTYPE_UINT16_INT:
32195 case UINT8_FTYPE_UINT8_INT:
32196 nargs = 2;
32197 break;
32198 case V2DI_FTYPE_V2DI_INT_CONVERT:
32199 nargs = 2;
32200 rmode = V1TImode;
32201 nargs_constant = 1;
32202 break;
32203 case V4DI_FTYPE_V4DI_INT_CONVERT:
32204 nargs = 2;
32205 rmode = V2TImode;
32206 nargs_constant = 1;
32207 break;
32208 case V8HI_FTYPE_V8HI_INT:
32209 case V8HI_FTYPE_V8SF_INT:
32210 case V8HI_FTYPE_V4SF_INT:
32211 case V8SF_FTYPE_V8SF_INT:
32212 case V4SI_FTYPE_V4SI_INT:
32213 case V4SI_FTYPE_V8SI_INT:
32214 case V4HI_FTYPE_V4HI_INT:
32215 case V4DF_FTYPE_V4DF_INT:
32216 case V4SF_FTYPE_V4SF_INT:
32217 case V4SF_FTYPE_V8SF_INT:
32218 case V2DI_FTYPE_V2DI_INT:
32219 case V2DF_FTYPE_V2DF_INT:
32220 case V2DF_FTYPE_V4DF_INT:
32221 case V16HI_FTYPE_V16HI_INT:
32222 case V8SI_FTYPE_V8SI_INT:
32223 case V4DI_FTYPE_V4DI_INT:
32224 case V2DI_FTYPE_V4DI_INT:
32225 nargs = 2;
32226 nargs_constant = 1;
32227 break;
32228 case V16QI_FTYPE_V16QI_V16QI_V16QI:
32229 case V8SF_FTYPE_V8SF_V8SF_V8SF:
32230 case V4DF_FTYPE_V4DF_V4DF_V4DF:
32231 case V4SF_FTYPE_V4SF_V4SF_V4SF:
32232 case V2DF_FTYPE_V2DF_V2DF_V2DF:
32233 case V32QI_FTYPE_V32QI_V32QI_V32QI:
32234 nargs = 3;
32235 break;
32236 case V32QI_FTYPE_V32QI_V32QI_INT:
32237 case V16HI_FTYPE_V16HI_V16HI_INT:
32238 case V16QI_FTYPE_V16QI_V16QI_INT:
32239 case V4DI_FTYPE_V4DI_V4DI_INT:
32240 case V8HI_FTYPE_V8HI_V8HI_INT:
32241 case V8SI_FTYPE_V8SI_V8SI_INT:
32242 case V8SI_FTYPE_V8SI_V4SI_INT:
32243 case V8SF_FTYPE_V8SF_V8SF_INT:
32244 case V8SF_FTYPE_V8SF_V4SF_INT:
32245 case V4SI_FTYPE_V4SI_V4SI_INT:
32246 case V4DF_FTYPE_V4DF_V4DF_INT:
32247 case V4DF_FTYPE_V4DF_V2DF_INT:
32248 case V4SF_FTYPE_V4SF_V4SF_INT:
32249 case V2DI_FTYPE_V2DI_V2DI_INT:
32250 case V4DI_FTYPE_V4DI_V2DI_INT:
32251 case V2DF_FTYPE_V2DF_V2DF_INT:
32252 nargs = 3;
32253 nargs_constant = 1;
32254 break;
32255 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
32256 nargs = 3;
32257 rmode = V4DImode;
32258 nargs_constant = 1;
32259 break;
32260 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
32261 nargs = 3;
32262 rmode = V2DImode;
32263 nargs_constant = 1;
32264 break;
32265 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
32266 nargs = 3;
32267 rmode = DImode;
32268 nargs_constant = 1;
32269 break;
32270 case V2DI_FTYPE_V2DI_UINT_UINT:
32271 nargs = 3;
32272 nargs_constant = 2;
32273 break;
32274 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
32275 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
32276 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
32277 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
32278 nargs = 4;
32279 nargs_constant = 1;
32280 break;
32281 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
32282 nargs = 4;
32283 nargs_constant = 2;
32284 break;
32285 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
32286 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
32287 nargs = 4;
32288 break;
32289 default:
32290 gcc_unreachable ();
32291 }
32292
32293 gcc_assert (nargs <= ARRAY_SIZE (args));
32294
32295 if (comparison != UNKNOWN)
32296 {
32297 gcc_assert (nargs == 2);
32298 return ix86_expand_sse_compare (d, exp, target, swap);
32299 }
32300
32301 if (rmode == VOIDmode || rmode == tmode)
32302 {
32303 if (optimize
32304 || target == 0
32305 || GET_MODE (target) != tmode
32306 || !insn_p->operand[0].predicate (target, tmode))
32307 target = gen_reg_rtx (tmode);
32308 real_target = target;
32309 }
32310 else
32311 {
32312 real_target = gen_reg_rtx (tmode);
32313 target = simplify_gen_subreg (rmode, real_target, tmode, 0);
32314 }
32315
32316 for (i = 0; i < nargs; i++)
32317 {
32318 tree arg = CALL_EXPR_ARG (exp, i);
32319 rtx op = expand_normal (arg);
32320 enum machine_mode mode = insn_p->operand[i + 1].mode;
32321 bool match = insn_p->operand[i + 1].predicate (op, mode);
32322
32323 if (last_arg_count && (i + 1) == nargs)
32324 {
32325 /* SIMD shift insns take either an 8-bit immediate or a
32326 register as the count, but the builtin functions take an int.
32327 If the count doesn't match, we put it in a register. */
32328 if (!match)
32329 {
32330 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
32331 if (!insn_p->operand[i + 1].predicate (op, mode))
32332 op = copy_to_reg (op);
32333 }
32334 }
32335 else if ((nargs - i) <= nargs_constant)
32336 {
32337 if (!match)
32338 switch (icode)
32339 {
32340 case CODE_FOR_avx2_inserti128:
32341 case CODE_FOR_avx2_extracti128:
32342 error ("the last argument must be an 1-bit immediate");
32343 return const0_rtx;
32344
32345 case CODE_FOR_sse4_1_roundsd:
32346 case CODE_FOR_sse4_1_roundss:
32347
32348 case CODE_FOR_sse4_1_roundpd:
32349 case CODE_FOR_sse4_1_roundps:
32350 case CODE_FOR_avx_roundpd256:
32351 case CODE_FOR_avx_roundps256:
32352
32353 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
32354 case CODE_FOR_sse4_1_roundps_sfix:
32355 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
32356 case CODE_FOR_avx_roundps_sfix256:
32357
32358 case CODE_FOR_sse4_1_blendps:
32359 case CODE_FOR_avx_blendpd256:
32360 case CODE_FOR_avx_vpermilv4df:
32361 error ("the last argument must be a 4-bit immediate");
32362 return const0_rtx;
32363
32364 case CODE_FOR_sse4_1_blendpd:
32365 case CODE_FOR_avx_vpermilv2df:
32366 case CODE_FOR_xop_vpermil2v2df3:
32367 case CODE_FOR_xop_vpermil2v4sf3:
32368 case CODE_FOR_xop_vpermil2v4df3:
32369 case CODE_FOR_xop_vpermil2v8sf3:
32370 error ("the last argument must be a 2-bit immediate");
32371 return const0_rtx;
32372
32373 case CODE_FOR_avx_vextractf128v4df:
32374 case CODE_FOR_avx_vextractf128v8sf:
32375 case CODE_FOR_avx_vextractf128v8si:
32376 case CODE_FOR_avx_vinsertf128v4df:
32377 case CODE_FOR_avx_vinsertf128v8sf:
32378 case CODE_FOR_avx_vinsertf128v8si:
32379 error ("the last argument must be a 1-bit immediate");
32380 return const0_rtx;
32381
32382 case CODE_FOR_avx_vmcmpv2df3:
32383 case CODE_FOR_avx_vmcmpv4sf3:
32384 case CODE_FOR_avx_cmpv2df3:
32385 case CODE_FOR_avx_cmpv4sf3:
32386 case CODE_FOR_avx_cmpv4df3:
32387 case CODE_FOR_avx_cmpv8sf3:
32388 error ("the last argument must be a 5-bit immediate");
32389 return const0_rtx;
32390
32391 default:
32392 switch (nargs_constant)
32393 {
32394 case 2:
32395 if ((nargs - i) == nargs_constant)
32396 {
32397 error ("the next to last argument must be an 8-bit immediate");
32398 break;
32399 }
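/* FALLTHRU */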
32400 case 1:
32401 error ("the last argument must be an 8-bit immediate");
32402 break;
32403 default:
32404 gcc_unreachable ();
32405 }
32406 return const0_rtx;
32407 }
32408 }
32409 else
32410 {
32411 if (VECTOR_MODE_P (mode))
32412 op = safe_vector_operand (op, mode);
32413
32414 /* If we aren't optimizing, only allow one memory operand to
32415 be generated. */
32416 if (memory_operand (op, mode))
32417 num_memory++;
32418
32419 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
32420 {
32421 if (optimize || !match || num_memory > 1)
32422 op = copy_to_mode_reg (mode, op);
32423 }
32424 else
32425 {
32426 op = copy_to_reg (op);
32427 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
32428 }
32429 }
32430
32431 args[i].op = op;
32432 args[i].mode = mode;
32433 }
32434
32435 switch (nargs)
32436 {
32437 case 1:
32438 pat = GEN_FCN (icode) (real_target, args[0].op);
32439 break;
32440 case 2:
32441 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
32442 break;
32443 case 3:
32444 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
32445 args[2].op);
32446 break;
32447 case 4:
32448 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
32449 args[2].op, args[3].op);
32450 break;
32451 default:
32452 gcc_unreachable ();
32453 }
32454
32455 if (! pat)
32456 return 0;
32457
32458 emit_insn (pat);
32459 return target;
32460 }
32461
32462 /* Subroutine of ix86_expand_builtin to take care of special insns
32463 with variable number of operands. */
32464
32465 static rtx
32466 ix86_expand_special_args_builtin (const struct builtin_description *d,
32467 tree exp, rtx target)
32468 {
32469 tree arg;
32470 rtx pat, op;
32471 unsigned int i, nargs, arg_adjust, memory;
32472 struct
32473 {
32474 rtx op;
32475 enum machine_mode mode;
32476 } args[3];
32477 enum insn_code icode = d->icode;
32478 bool last_arg_constant = false;
32479 const struct insn_data_d *insn_p = &insn_data[icode];
32480 enum machine_mode tmode = insn_p->operand[0].mode;
32481 enum { load, store } klass;
32482
32483 switch ((enum ix86_builtin_func_type) d->flag)
32484 {
32485 case VOID_FTYPE_VOID:
32486 emit_insn (GEN_FCN (icode) (target));
32487 return 0;
32488 case VOID_FTYPE_UINT64:
32489 case VOID_FTYPE_UNSIGNED:
32490 nargs = 0;
32491 klass = store;
32492 memory = 0;
32493 break;
32494
32495 case INT_FTYPE_VOID:
32496 case UINT64_FTYPE_VOID:
32497 case UNSIGNED_FTYPE_VOID:
32498 nargs = 0;
32499 klass = load;
32500 memory = 0;
32501 break;
32502 case UINT64_FTYPE_PUNSIGNED:
32503 case V2DI_FTYPE_PV2DI:
32504 case V4DI_FTYPE_PV4DI:
32505 case V32QI_FTYPE_PCCHAR:
32506 case V16QI_FTYPE_PCCHAR:
32507 case V8SF_FTYPE_PCV4SF:
32508 case V8SF_FTYPE_PCFLOAT:
32509 case V4SF_FTYPE_PCFLOAT:
32510 case V4DF_FTYPE_PCV2DF:
32511 case V4DF_FTYPE_PCDOUBLE:
32512 case V2DF_FTYPE_PCDOUBLE:
32513 case VOID_FTYPE_PVOID:
32514 nargs = 1;
32515 klass = load;
32516 memory = 0;
32517 break;
32518 case VOID_FTYPE_PV2SF_V4SF:
32519 case VOID_FTYPE_PV4DI_V4DI:
32520 case VOID_FTYPE_PV2DI_V2DI:
32521 case VOID_FTYPE_PCHAR_V32QI:
32522 case VOID_FTYPE_PCHAR_V16QI:
32523 case VOID_FTYPE_PFLOAT_V8SF:
32524 case VOID_FTYPE_PFLOAT_V4SF:
32525 case VOID_FTYPE_PDOUBLE_V4DF:
32526 case VOID_FTYPE_PDOUBLE_V2DF:
32527 case VOID_FTYPE_PLONGLONG_LONGLONG:
32528 case VOID_FTYPE_PULONGLONG_ULONGLONG:
32529 case VOID_FTYPE_PINT_INT:
32530 nargs = 1;
32531 klass = store;
32532 /* Reserve memory operand for target. */
32533 memory = ARRAY_SIZE (args);
32534 break;
32535 case V4SF_FTYPE_V4SF_PCV2SF:
32536 case V2DF_FTYPE_V2DF_PCDOUBLE:
32537 nargs = 2;
32538 klass = load;
32539 memory = 1;
32540 break;
32541 case V8SF_FTYPE_PCV8SF_V8SI:
32542 case V4DF_FTYPE_PCV4DF_V4DI:
32543 case V4SF_FTYPE_PCV4SF_V4SI:
32544 case V2DF_FTYPE_PCV2DF_V2DI:
32545 case V8SI_FTYPE_PCV8SI_V8SI:
32546 case V4DI_FTYPE_PCV4DI_V4DI:
32547 case V4SI_FTYPE_PCV4SI_V4SI:
32548 case V2DI_FTYPE_PCV2DI_V2DI:
32549 nargs = 2;
32550 klass = load;
32551 memory = 0;
32552 break;
32553 case VOID_FTYPE_PV8SF_V8SI_V8SF:
32554 case VOID_FTYPE_PV4DF_V4DI_V4DF:
32555 case VOID_FTYPE_PV4SF_V4SI_V4SF:
32556 case VOID_FTYPE_PV2DF_V2DI_V2DF:
32557 case VOID_FTYPE_PV8SI_V8SI_V8SI:
32558 case VOID_FTYPE_PV4DI_V4DI_V4DI:
32559 case VOID_FTYPE_PV4SI_V4SI_V4SI:
32560 case VOID_FTYPE_PV2DI_V2DI_V2DI:
32561 nargs = 2;
32562 klass = store;
32563 /* Reserve memory operand for target. */
32564 memory = ARRAY_SIZE (args);
32565 break;
32566 case VOID_FTYPE_UINT_UINT_UINT:
32567 case VOID_FTYPE_UINT64_UINT_UINT:
32568 case UCHAR_FTYPE_UINT_UINT_UINT:
32569 case UCHAR_FTYPE_UINT64_UINT_UINT:
32570 nargs = 3;
32571 klass = load;
32572 memory = ARRAY_SIZE (args);
32573 last_arg_constant = true;
32574 break;
32575 default:
32576 gcc_unreachable ();
32577 }
32578
32579 gcc_assert (nargs <= ARRAY_SIZE (args));
32580
32581 if (klass == store)
32582 {
32583 arg = CALL_EXPR_ARG (exp, 0);
32584 op = expand_normal (arg);
32585 gcc_assert (target == 0);
32586 if (memory)
32587 {
32588 op = ix86_zero_extend_to_Pmode (op);
32589 target = gen_rtx_MEM (tmode, op);
32590 }
32591 else
32592 target = force_reg (tmode, op);
32593 arg_adjust = 1;
32594 }
32595 else
32596 {
32597 arg_adjust = 0;
32598 if (optimize
32599 || target == 0
32600 || !register_operand (target, tmode)
32601 || GET_MODE (target) != tmode)
32602 target = gen_reg_rtx (tmode);
32603 }
32604
32605 for (i = 0; i < nargs; i++)
32606 {
32607 enum machine_mode mode = insn_p->operand[i + 1].mode;
32608 bool match;
32609
32610 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
32611 op = expand_normal (arg);
32612 match = insn_p->operand[i + 1].predicate (op, mode);
32613
32614 if (last_arg_constant && (i + 1) == nargs)
32615 {
32616 if (!match)
32617 {
32618 if (icode == CODE_FOR_lwp_lwpvalsi3
32619 || icode == CODE_FOR_lwp_lwpinssi3
32620 || icode == CODE_FOR_lwp_lwpvaldi3
32621 || icode == CODE_FOR_lwp_lwpinsdi3)
32622 error ("the last argument must be a 32-bit immediate");
32623 else
32624 error ("the last argument must be an 8-bit immediate");
32625 return const0_rtx;
32626 }
32627 }
32628 else
32629 {
32630 if (i == memory)
32631 {
32632 /* This must be the memory operand. */
32633 op = ix86_zero_extend_to_Pmode (op);
32634 op = gen_rtx_MEM (mode, op);
32635 gcc_assert (GET_MODE (op) == mode
32636 || GET_MODE (op) == VOIDmode);
32637 }
32638 else
32639 {
32640 /* This must be a register. */
32641 if (VECTOR_MODE_P (mode))
32642 op = safe_vector_operand (op, mode);
32643
32644 gcc_assert (GET_MODE (op) == mode
32645 || GET_MODE (op) == VOIDmode);
32646 op = copy_to_mode_reg (mode, op);
32647 }
32648 }
32649
32650 args[i].op = op;
32651 args[i].mode = mode;
32652 }
32653
32654 switch (nargs)
32655 {
32656 case 0:
32657 pat = GEN_FCN (icode) (target);
32658 break;
32659 case 1:
32660 pat = GEN_FCN (icode) (target, args[0].op);
32661 break;
32662 case 2:
32663 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
32664 break;
32665 case 3:
32666 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
32667 break;
32668 default:
32669 gcc_unreachable ();
32670 }
32671
32672 if (! pat)
32673 return 0;
32674 emit_insn (pat);
32675 return klass == store ? 0 : target;
32676 }
32677
32678 /* Return the integer constant in ARG. Constrain it to be in the range
32679 of the subparts of VEC_TYPE; issue an error if not. */
32680
32681 static int
32682 get_element_number (tree vec_type, tree arg)
32683 {
32684 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
32685
32686 if (!tree_fits_uhwi_p (arg)
32687 || (elt = tree_to_uhwi (arg), elt > max))
32688 {
32689 error ("selector must be an integer constant in the range 0..%wi", max);
32690 return 0;
32691 }
32692
32693 return elt;
32694 }
32695
32696 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32697 ix86_expand_vector_init. We DO have language-level syntax for this, in
32698 the form of (type){ init-list }. Except that since we can't place emms
32699 instructions from inside the compiler, we can't allow the use of MMX
32700 registers unless the user explicitly asks for it. So we do *not* define
32701 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
32702 we have builtins invoked by mmintrin.h that gives us license to emit
32703 these sorts of instructions. */
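/* For instance, mmintrin.h wraps the V2SI variant roughly as follows
   (a sketch, not the verbatim header):

     __m64 _mm_set_pi32 (int __i1, int __i0)
     {
       return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
     }

   which reaches this function via IX86_BUILTIN_VEC_INIT_V2SI.  */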
32704
32705 static rtx
32706 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
32707 {
32708 enum machine_mode tmode = TYPE_MODE (type);
32709 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
32710 int i, n_elt = GET_MODE_NUNITS (tmode);
32711 rtvec v = rtvec_alloc (n_elt);
32712
32713 gcc_assert (VECTOR_MODE_P (tmode));
32714 gcc_assert (call_expr_nargs (exp) == n_elt);
32715
32716 for (i = 0; i < n_elt; ++i)
32717 {
32718 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
32719 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
32720 }
32721
32722 if (!target || !register_operand (target, tmode))
32723 target = gen_reg_rtx (tmode);
32724
32725 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
32726 return target;
32727 }
32728
32729 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32730 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
32731 had a language-level syntax for referencing vector elements. */
32732
32733 static rtx
32734 ix86_expand_vec_ext_builtin (tree exp, rtx target)
32735 {
32736 enum machine_mode tmode, mode0;
32737 tree arg0, arg1;
32738 int elt;
32739 rtx op0;
32740
32741 arg0 = CALL_EXPR_ARG (exp, 0);
32742 arg1 = CALL_EXPR_ARG (exp, 1);
32743
32744 op0 = expand_normal (arg0);
32745 elt = get_element_number (TREE_TYPE (arg0), arg1);
32746
32747 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32748 mode0 = TYPE_MODE (TREE_TYPE (arg0));
32749 gcc_assert (VECTOR_MODE_P (mode0));
32750
32751 op0 = force_reg (mode0, op0);
32752
32753 if (optimize || !target || !register_operand (target, tmode))
32754 target = gen_reg_rtx (tmode);
32755
32756 ix86_expand_vector_extract (true, target, op0, elt);
32757
32758 return target;
32759 }
32760
32761 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32762 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
32763 a language-level syntax for referencing vector elements. */
32764
32765 static rtx
32766 ix86_expand_vec_set_builtin (tree exp)
32767 {
32768 enum machine_mode tmode, mode1;
32769 tree arg0, arg1, arg2;
32770 int elt;
32771 rtx op0, op1, target;
32772
32773 arg0 = CALL_EXPR_ARG (exp, 0);
32774 arg1 = CALL_EXPR_ARG (exp, 1);
32775 arg2 = CALL_EXPR_ARG (exp, 2);
32776
32777 tmode = TYPE_MODE (TREE_TYPE (arg0));
32778 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32779 gcc_assert (VECTOR_MODE_P (tmode));
32780
32781 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
32782 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
32783 elt = get_element_number (TREE_TYPE (arg0), arg2);
32784
32785 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
32786 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
32787
32788 op0 = force_reg (tmode, op0);
32789 op1 = force_reg (mode1, op1);
32790
32791 /* OP0 is the source of these builtin functions and shouldn't be
32792 modified. Create a copy, use it and return it as target. */
32793 target = gen_reg_rtx (tmode);
32794 emit_move_insn (target, op0);
32795 ix86_expand_vector_set (true, target, op1, elt);
32796
32797 return target;
32798 }
32799
32800 /* Expand an expression EXP that calls a built-in function,
32801 with result going to TARGET if that's convenient
32802 (and in mode MODE if that's convenient).
32803 SUBTARGET may be used as the target for computing one of EXP's operands.
32804 IGNORE is nonzero if the value is to be ignored. */
32805
32806 static rtx
32807 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
32808 enum machine_mode mode, int ignore)
32809 {
32810 const struct builtin_description *d;
32811 size_t i;
32812 enum insn_code icode;
32813 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
32814 tree arg0, arg1, arg2, arg3, arg4;
32815 rtx op0, op1, op2, op3, op4, pat, insn;
32816 enum machine_mode mode0, mode1, mode2, mode3, mode4;
32817 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
32818
32819 /* For CPU builtins that can be folded, fold first and expand the fold. */
32820 switch (fcode)
32821 {
32822 case IX86_BUILTIN_CPU_INIT:
32823 {
32824 /* Make it call __cpu_indicator_init in libgcc. */
32825 tree call_expr, fndecl, type;
32826 type = build_function_type_list (integer_type_node, NULL_TREE);
32827 fndecl = build_fn_decl ("__cpu_indicator_init", type);
32828 call_expr = build_call_expr (fndecl, 0);
32829 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
32830 }
32831 case IX86_BUILTIN_CPU_IS:
32832 case IX86_BUILTIN_CPU_SUPPORTS:
32833 {
32834 tree arg0 = CALL_EXPR_ARG (exp, 0);
32835 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
32836 gcc_assert (fold_expr != NULL_TREE);
32837 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
32838 }
32839 }
32840
32841 /* Determine whether the builtin function is available under the current ISA.
32842 Originally the builtin was not created if it wasn't applicable to the
32843 current ISA based on the command line switches. With function specific
32844 options, we need to check in the context of the function making the call
32845 whether it is supported. */
32846 if (ix86_builtins_isa[fcode].isa
32847 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
32848 {
32849 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
32850 NULL, (enum fpmath_unit) 0, false);
32851
32852 if (!opts)
32853 error ("%qE needs unknown isa option", fndecl);
32854 else
32855 {
32856 gcc_assert (opts != NULL);
32857 error ("%qE needs isa option %s", fndecl, opts);
32858 free (opts);
32859 }
32860 return const0_rtx;
32861 }
32862
32863 switch (fcode)
32864 {
32865 case IX86_BUILTIN_MASKMOVQ:
32866 case IX86_BUILTIN_MASKMOVDQU:
32867 icode = (fcode == IX86_BUILTIN_MASKMOVQ
32868 ? CODE_FOR_mmx_maskmovq
32869 : CODE_FOR_sse2_maskmovdqu);
32870 /* Note the arg order is different from the operand order. */
32871 arg1 = CALL_EXPR_ARG (exp, 0);
32872 arg2 = CALL_EXPR_ARG (exp, 1);
32873 arg0 = CALL_EXPR_ARG (exp, 2);
32874 op0 = expand_normal (arg0);
32875 op1 = expand_normal (arg1);
32876 op2 = expand_normal (arg2);
32877 mode0 = insn_data[icode].operand[0].mode;
32878 mode1 = insn_data[icode].operand[1].mode;
32879 mode2 = insn_data[icode].operand[2].mode;
32880
32881 op0 = ix86_zero_extend_to_Pmode (op0);
32882 op0 = gen_rtx_MEM (mode1, op0);
32883
32884 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32885 op0 = copy_to_mode_reg (mode0, op0);
32886 if (!insn_data[icode].operand[1].predicate (op1, mode1))
32887 op1 = copy_to_mode_reg (mode1, op1);
32888 if (!insn_data[icode].operand[2].predicate (op2, mode2))
32889 op2 = copy_to_mode_reg (mode2, op2);
32890 pat = GEN_FCN (icode) (op0, op1, op2);
32891 if (! pat)
32892 return 0;
32893 emit_insn (pat);
32894 return 0;
32895
32896 case IX86_BUILTIN_LDMXCSR:
32897 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
32898 target = assign_386_stack_local (SImode, SLOT_TEMP);
32899 emit_move_insn (target, op0);
32900 emit_insn (gen_sse_ldmxcsr (target));
32901 return 0;
32902
32903 case IX86_BUILTIN_STMXCSR:
32904 target = assign_386_stack_local (SImode, SLOT_TEMP);
32905 emit_insn (gen_sse_stmxcsr (target));
32906 return copy_to_mode_reg (SImode, target);
32907
32908 case IX86_BUILTIN_CLFLUSH:
32909 arg0 = CALL_EXPR_ARG (exp, 0);
32910 op0 = expand_normal (arg0);
32911 icode = CODE_FOR_sse2_clflush;
32912 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32913 op0 = ix86_zero_extend_to_Pmode (op0);
32914
32915 emit_insn (gen_sse2_clflush (op0));
32916 return 0;
32917
32918 case IX86_BUILTIN_MONITOR:
32919 arg0 = CALL_EXPR_ARG (exp, 0);
32920 arg1 = CALL_EXPR_ARG (exp, 1);
32921 arg2 = CALL_EXPR_ARG (exp, 2);
32922 op0 = expand_normal (arg0);
32923 op1 = expand_normal (arg1);
32924 op2 = expand_normal (arg2);
32925 if (!REG_P (op0))
32926 op0 = ix86_zero_extend_to_Pmode (op0);
32927 if (!REG_P (op1))
32928 op1 = copy_to_mode_reg (SImode, op1);
32929 if (!REG_P (op2))
32930 op2 = copy_to_mode_reg (SImode, op2);
32931 emit_insn (ix86_gen_monitor (op0, op1, op2));
32932 return 0;
32933
32934 case IX86_BUILTIN_MWAIT:
32935 arg0 = CALL_EXPR_ARG (exp, 0);
32936 arg1 = CALL_EXPR_ARG (exp, 1);
32937 op0 = expand_normal (arg0);
32938 op1 = expand_normal (arg1);
32939 if (!REG_P (op0))
32940 op0 = copy_to_mode_reg (SImode, op0);
32941 if (!REG_P (op1))
32942 op1 = copy_to_mode_reg (SImode, op1);
32943 emit_insn (gen_sse3_mwait (op0, op1));
32944 return 0;
32945
32946 case IX86_BUILTIN_VEC_INIT_V2SI:
32947 case IX86_BUILTIN_VEC_INIT_V4HI:
32948 case IX86_BUILTIN_VEC_INIT_V8QI:
32949 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
32950
32951 case IX86_BUILTIN_VEC_EXT_V2DF:
32952 case IX86_BUILTIN_VEC_EXT_V2DI:
32953 case IX86_BUILTIN_VEC_EXT_V4SF:
32954 case IX86_BUILTIN_VEC_EXT_V4SI:
32955 case IX86_BUILTIN_VEC_EXT_V8HI:
32956 case IX86_BUILTIN_VEC_EXT_V2SI:
32957 case IX86_BUILTIN_VEC_EXT_V4HI:
32958 case IX86_BUILTIN_VEC_EXT_V16QI:
32959 return ix86_expand_vec_ext_builtin (exp, target);
32960
32961 case IX86_BUILTIN_VEC_SET_V2DI:
32962 case IX86_BUILTIN_VEC_SET_V4SF:
32963 case IX86_BUILTIN_VEC_SET_V4SI:
32964 case IX86_BUILTIN_VEC_SET_V8HI:
32965 case IX86_BUILTIN_VEC_SET_V4HI:
32966 case IX86_BUILTIN_VEC_SET_V16QI:
32967 return ix86_expand_vec_set_builtin (exp);
32968
32969 case IX86_BUILTIN_INFQ:
32970 case IX86_BUILTIN_HUGE_VALQ:
32971 {
32972 REAL_VALUE_TYPE inf;
32973 rtx tmp;
32974
32975 real_inf (&inf);
32976 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
32977
32978 tmp = validize_mem (force_const_mem (mode, tmp));
32979
32980 if (target == 0)
32981 target = gen_reg_rtx (mode);
32982
32983 emit_move_insn (target, tmp);
32984 return target;
32985 }
32986
32987 case IX86_BUILTIN_RDPMC:
32988 case IX86_BUILTIN_RDTSC:
32989 case IX86_BUILTIN_RDTSCP:
32990
32991 op0 = gen_reg_rtx (DImode);
32992 op1 = gen_reg_rtx (DImode);
32993
32994 if (fcode == IX86_BUILTIN_RDPMC)
32995 {
32996 arg0 = CALL_EXPR_ARG (exp, 0);
32997 op2 = expand_normal (arg0);
32998 if (!register_operand (op2, SImode))
32999 op2 = copy_to_mode_reg (SImode, op2);
33000
33001 insn = (TARGET_64BIT
33002 ? gen_rdpmc_rex64 (op0, op1, op2)
33003 : gen_rdpmc (op0, op2));
33004 emit_insn (insn);
33005 }
33006 else if (fcode == IX86_BUILTIN_RDTSC)
33007 {
33008 insn = (TARGET_64BIT
33009 ? gen_rdtsc_rex64 (op0, op1)
33010 : gen_rdtsc (op0));
33011 emit_insn (insn);
33012 }
33013 else
33014 {
33015 op2 = gen_reg_rtx (SImode);
33016
33017 insn = (TARGET_64BIT
33018 ? gen_rdtscp_rex64 (op0, op1, op2)
33019 : gen_rdtscp (op0, op2));
33020 emit_insn (insn);
33021
33022 arg0 = CALL_EXPR_ARG (exp, 0);
33023 op4 = expand_normal (arg0);
33024 if (!address_operand (op4, VOIDmode))
33025 {
33026 op4 = convert_memory_address (Pmode, op4);
33027 op4 = copy_addr_to_reg (op4);
33028 }
33029 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
33030 }
33031
33032 if (target == 0)
33033 {
33034 /* mode is VOIDmode if __builtin_rd* has been called
33035 without lhs. */
33036 if (mode == VOIDmode)
33037 return target;
33038 target = gen_reg_rtx (mode);
33039 }
33040
33041 if (TARGET_64BIT)
33042 {
33043 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
33044 op1, 1, OPTAB_DIRECT);
33045 op0 = expand_simple_binop (DImode, IOR, op0, op1,
33046 op0, 1, OPTAB_DIRECT);
33047 }
33048
33049 emit_move_insn (target, op0);
33050 return target;
33051
33052 case IX86_BUILTIN_FXSAVE:
33053 case IX86_BUILTIN_FXRSTOR:
33054 case IX86_BUILTIN_FXSAVE64:
33055 case IX86_BUILTIN_FXRSTOR64:
33056 case IX86_BUILTIN_FNSTENV:
33057 case IX86_BUILTIN_FLDENV:
33058 case IX86_BUILTIN_FNSTSW:
33059 mode0 = BLKmode;
33060 switch (fcode)
33061 {
33062 case IX86_BUILTIN_FXSAVE:
33063 icode = CODE_FOR_fxsave;
33064 break;
33065 case IX86_BUILTIN_FXRSTOR:
33066 icode = CODE_FOR_fxrstor;
33067 break;
33068 case IX86_BUILTIN_FXSAVE64:
33069 icode = CODE_FOR_fxsave64;
33070 break;
33071 case IX86_BUILTIN_FXRSTOR64:
33072 icode = CODE_FOR_fxrstor64;
33073 break;
33074 case IX86_BUILTIN_FNSTENV:
33075 icode = CODE_FOR_fnstenv;
33076 break;
33077 case IX86_BUILTIN_FLDENV:
33078 icode = CODE_FOR_fldenv;
33079 break;
33080 case IX86_BUILTIN_FNSTSW:
33081 icode = CODE_FOR_fnstsw;
33082 mode0 = HImode;
33083 break;
33084 default:
33085 gcc_unreachable ();
33086 }
33087
33088 arg0 = CALL_EXPR_ARG (exp, 0);
33089 op0 = expand_normal (arg0);
33090
33091 if (!address_operand (op0, VOIDmode))
33092 {
33093 op0 = convert_memory_address (Pmode, op0);
33094 op0 = copy_addr_to_reg (op0);
33095 }
33096 op0 = gen_rtx_MEM (mode0, op0);
33097
33098 pat = GEN_FCN (icode) (op0);
33099 if (pat)
33100 emit_insn (pat);
33101 return 0;
33102
33103 case IX86_BUILTIN_XSAVE:
33104 case IX86_BUILTIN_XRSTOR:
33105 case IX86_BUILTIN_XSAVE64:
33106 case IX86_BUILTIN_XRSTOR64:
33107 case IX86_BUILTIN_XSAVEOPT:
33108 case IX86_BUILTIN_XSAVEOPT64:
33109 arg0 = CALL_EXPR_ARG (exp, 0);
33110 arg1 = CALL_EXPR_ARG (exp, 1);
33111 op0 = expand_normal (arg0);
33112 op1 = expand_normal (arg1);
33113
33114 if (!address_operand (op0, VOIDmode))
33115 {
33116 op0 = convert_memory_address (Pmode, op0);
33117 op0 = copy_addr_to_reg (op0);
33118 }
33119 op0 = gen_rtx_MEM (BLKmode, op0);
33120
33121 op1 = force_reg (DImode, op1);
33122
33123 if (TARGET_64BIT)
33124 {
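/* In 64-bit mode the *_rex64 patterns take the xsave feature mask as
   two SImode halves (the instruction reads it from EDX:EAX), so peel
   off the high half here.  */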
33125 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
33126 NULL, 1, OPTAB_DIRECT);
33127 switch (fcode)
33128 {
33129 case IX86_BUILTIN_XSAVE:
33130 icode = CODE_FOR_xsave_rex64;
33131 break;
33132 case IX86_BUILTIN_XRSTOR:
33133 icode = CODE_FOR_xrstor_rex64;
33134 break;
33135 case IX86_BUILTIN_XSAVE64:
33136 icode = CODE_FOR_xsave64;
33137 break;
33138 case IX86_BUILTIN_XRSTOR64:
33139 icode = CODE_FOR_xrstor64;
33140 break;
33141 case IX86_BUILTIN_XSAVEOPT:
33142 icode = CODE_FOR_xsaveopt_rex64;
33143 break;
33144 case IX86_BUILTIN_XSAVEOPT64:
33145 icode = CODE_FOR_xsaveopt64;
33146 break;
33147 default:
33148 gcc_unreachable ();
33149 }
33150
33151 op2 = gen_lowpart (SImode, op2);
33152 op1 = gen_lowpart (SImode, op1);
33153 pat = GEN_FCN (icode) (op0, op1, op2);
33154 }
33155 else
33156 {
33157 switch (fcode)
33158 {
33159 case IX86_BUILTIN_XSAVE:
33160 icode = CODE_FOR_xsave;
33161 break;
33162 case IX86_BUILTIN_XRSTOR:
33163 icode = CODE_FOR_xrstor;
33164 break;
33165 case IX86_BUILTIN_XSAVEOPT:
33166 icode = CODE_FOR_xsaveopt;
33167 break;
33168 default:
33169 gcc_unreachable ();
33170 }
33171 pat = GEN_FCN (icode) (op0, op1);
33172 }
33173
33174 if (pat)
33175 emit_insn (pat);
33176 return 0;
33177
33178 case IX86_BUILTIN_LLWPCB:
33179 arg0 = CALL_EXPR_ARG (exp, 0);
33180 op0 = expand_normal (arg0);
33181 icode = CODE_FOR_lwp_llwpcb;
33182 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
33183 op0 = ix86_zero_extend_to_Pmode (op0);
33184 emit_insn (gen_lwp_llwpcb (op0));
33185 return 0;
33186
33187 case IX86_BUILTIN_SLWPCB:
33188 icode = CODE_FOR_lwp_slwpcb;
33189 if (!target
33190 || !insn_data[icode].operand[0].predicate (target, Pmode))
33191 target = gen_reg_rtx (Pmode);
33192 emit_insn (gen_lwp_slwpcb (target));
33193 return target;
33194
33195 case IX86_BUILTIN_BEXTRI32:
33196 case IX86_BUILTIN_BEXTRI64:
33197 arg0 = CALL_EXPR_ARG (exp, 0);
33198 arg1 = CALL_EXPR_ARG (exp, 1);
33199 op0 = expand_normal (arg0);
33200 op1 = expand_normal (arg1);
33201 icode = (fcode == IX86_BUILTIN_BEXTRI32
33202 ? CODE_FOR_tbm_bextri_si
33203 : CODE_FOR_tbm_bextri_di);
33204 if (!CONST_INT_P (op1))
33205 {
33206 error ("last argument must be an immediate");
33207 return const0_rtx;
33208 }
33209 else
33210 {
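/* The BEXTRI control immediate packs the field length in bits 15:8
   and the starting bit position in bits 7:0; split it into the two
   operands the pattern expects.  */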
33211 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
33212 unsigned char lsb_index = INTVAL (op1) & 0xFF;
33213 op1 = GEN_INT (length);
33214 op2 = GEN_INT (lsb_index);
33215 pat = GEN_FCN (icode) (target, op0, op1, op2);
33216 if (pat)
33217 emit_insn (pat);
33218 return target;
33219 }
33220
33221 case IX86_BUILTIN_RDRAND16_STEP:
33222 icode = CODE_FOR_rdrandhi_1;
33223 mode0 = HImode;
33224 goto rdrand_step;
33225
33226 case IX86_BUILTIN_RDRAND32_STEP:
33227 icode = CODE_FOR_rdrandsi_1;
33228 mode0 = SImode;
33229 goto rdrand_step;
33230
33231 case IX86_BUILTIN_RDRAND64_STEP:
33232 icode = CODE_FOR_rdranddi_1;
33233 mode0 = DImode;
33234
33235 rdrand_step:
33236 op0 = gen_reg_rtx (mode0);
33237 emit_insn (GEN_FCN (icode) (op0));
33238
33239 arg0 = CALL_EXPR_ARG (exp, 0);
33240 op1 = expand_normal (arg0);
33241 if (!address_operand (op1, VOIDmode))
33242 {
33243 op1 = convert_memory_address (Pmode, op1);
33244 op1 = copy_addr_to_reg (op1);
33245 }
33246 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
33247
33248 op1 = gen_reg_rtx (SImode);
33249 emit_move_insn (op1, CONST1_RTX (SImode));
33250
33251 /* Emit SImode conditional move. */
33252 if (mode0 == HImode)
33253 {
33254 op2 = gen_reg_rtx (SImode);
33255 emit_insn (gen_zero_extendhisi2 (op2, op0));
33256 }
33257 else if (mode0 == SImode)
33258 op2 = op0;
33259 else
33260 op2 = gen_rtx_SUBREG (SImode, op0, 0);
33261
33262 if (target == 0)
33263 target = gen_reg_rtx (SImode);
33264
33265 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
33266 const0_rtx);
33267 emit_insn (gen_rtx_SET (VOIDmode, target,
33268 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
33269 return target;
33270
33271 case IX86_BUILTIN_RDSEED16_STEP:
33272 icode = CODE_FOR_rdseedhi_1;
33273 mode0 = HImode;
33274 goto rdseed_step;
33275
33276 case IX86_BUILTIN_RDSEED32_STEP:
33277 icode = CODE_FOR_rdseedsi_1;
33278 mode0 = SImode;
33279 goto rdseed_step;
33280
33281 case IX86_BUILTIN_RDSEED64_STEP:
33282 icode = CODE_FOR_rdseeddi_1;
33283 mode0 = DImode;
33284
33285 rdseed_step:
33286 op0 = gen_reg_rtx (mode0);
33287 emit_insn (GEN_FCN (icode) (op0));
33288
33289 arg0 = CALL_EXPR_ARG (exp, 0);
33290 op1 = expand_normal (arg0);
33291 if (!address_operand (op1, VOIDmode))
33292 {
33293 op1 = convert_memory_address (Pmode, op1);
33294 op1 = copy_addr_to_reg (op1);
33295 }
33296 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
33297
33298 op2 = gen_reg_rtx (QImode);
33299
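/* rdseed sets the carry flag on success; materialize CF as a QImode
   0/1 value and zero-extend it into the result below.  */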
33300 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
33301 const0_rtx);
33302 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
33303
33304 if (target == 0)
33305 target = gen_reg_rtx (SImode);
33306
33307 emit_insn (gen_zero_extendqisi2 (target, op2));
33308 return target;
33309
33310 case IX86_BUILTIN_ADDCARRYX32:
33311 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
33312 mode0 = SImode;
33313 goto addcarryx;
33314
33315 case IX86_BUILTIN_ADDCARRYX64:
33316 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
33317 mode0 = DImode;
33318
33319 addcarryx:
33320 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
33321 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
33322 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
33323 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
33324
33325 op0 = gen_reg_rtx (QImode);
33326
33327 /* Generate CF from input operand. */
33328 op1 = expand_normal (arg0);
33329 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
33330 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
33331
33332 /* Generate the ADCX (or ADC) instruction to compute X + Y + CF. */
33333 op2 = expand_normal (arg1);
33334 op3 = expand_normal (arg2);
33335
33336 if (!REG_P (op2))
33337 op2 = copy_to_mode_reg (mode0, op2);
33338 if (!REG_P (op3))
33339 op3 = copy_to_mode_reg (mode0, op3);
33340
33341 op0 = gen_reg_rtx (mode0);
33342
33343 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
33344 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
33345 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
33346
33347 /* Store the result. */
33348 op4 = expand_normal (arg3);
33349 if (!address_operand (op4, VOIDmode))
33350 {
33351 op4 = convert_memory_address (Pmode, op4);
33352 op4 = copy_addr_to_reg (op4);
33353 }
33354 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
33355
33356 /* Return current CF value. */
33357 if (target == 0)
33358 target = gen_reg_rtx (QImode);
33359
33360 PUT_MODE (pat, QImode);
33361 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
33362 return target;
33363
33364 case IX86_BUILTIN_GATHERSIV2DF:
33365 icode = CODE_FOR_avx2_gathersiv2df;
33366 goto gather_gen;
33367 case IX86_BUILTIN_GATHERSIV4DF:
33368 icode = CODE_FOR_avx2_gathersiv4df;
33369 goto gather_gen;
33370 case IX86_BUILTIN_GATHERDIV2DF:
33371 icode = CODE_FOR_avx2_gatherdiv2df;
33372 goto gather_gen;
33373 case IX86_BUILTIN_GATHERDIV4DF:
33374 icode = CODE_FOR_avx2_gatherdiv4df;
33375 goto gather_gen;
33376 case IX86_BUILTIN_GATHERSIV4SF:
33377 icode = CODE_FOR_avx2_gathersiv4sf;
33378 goto gather_gen;
33379 case IX86_BUILTIN_GATHERSIV8SF:
33380 icode = CODE_FOR_avx2_gathersiv8sf;
33381 goto gather_gen;
33382 case IX86_BUILTIN_GATHERDIV4SF:
33383 icode = CODE_FOR_avx2_gatherdiv4sf;
33384 goto gather_gen;
33385 case IX86_BUILTIN_GATHERDIV8SF:
33386 icode = CODE_FOR_avx2_gatherdiv8sf;
33387 goto gather_gen;
33388 case IX86_BUILTIN_GATHERSIV2DI:
33389 icode = CODE_FOR_avx2_gathersiv2di;
33390 goto gather_gen;
33391 case IX86_BUILTIN_GATHERSIV4DI:
33392 icode = CODE_FOR_avx2_gathersiv4di;
33393 goto gather_gen;
33394 case IX86_BUILTIN_GATHERDIV2DI:
33395 icode = CODE_FOR_avx2_gatherdiv2di;
33396 goto gather_gen;
33397 case IX86_BUILTIN_GATHERDIV4DI:
33398 icode = CODE_FOR_avx2_gatherdiv4di;
33399 goto gather_gen;
33400 case IX86_BUILTIN_GATHERSIV4SI:
33401 icode = CODE_FOR_avx2_gathersiv4si;
33402 goto gather_gen;
33403 case IX86_BUILTIN_GATHERSIV8SI:
33404 icode = CODE_FOR_avx2_gathersiv8si;
33405 goto gather_gen;
33406 case IX86_BUILTIN_GATHERDIV4SI:
33407 icode = CODE_FOR_avx2_gatherdiv4si;
33408 goto gather_gen;
33409 case IX86_BUILTIN_GATHERDIV8SI:
33410 icode = CODE_FOR_avx2_gatherdiv8si;
33411 goto gather_gen;
33412 case IX86_BUILTIN_GATHERALTSIV4DF:
33413 icode = CODE_FOR_avx2_gathersiv4df;
33414 goto gather_gen;
33415 case IX86_BUILTIN_GATHERALTDIV8SF:
33416 icode = CODE_FOR_avx2_gatherdiv8sf;
33417 goto gather_gen;
33418 case IX86_BUILTIN_GATHERALTSIV4DI:
33419 icode = CODE_FOR_avx2_gathersiv4di;
33420 goto gather_gen;
33421 case IX86_BUILTIN_GATHERALTDIV8SI:
33422 icode = CODE_FOR_avx2_gatherdiv8si;
33423 goto gather_gen;
33424
33425 gather_gen:
33426 arg0 = CALL_EXPR_ARG (exp, 0);
33427 arg1 = CALL_EXPR_ARG (exp, 1);
33428 arg2 = CALL_EXPR_ARG (exp, 2);
33429 arg3 = CALL_EXPR_ARG (exp, 3);
33430 arg4 = CALL_EXPR_ARG (exp, 4);
33431 op0 = expand_normal (arg0);
33432 op1 = expand_normal (arg1);
33433 op2 = expand_normal (arg2);
33434 op3 = expand_normal (arg3);
33435 op4 = expand_normal (arg4);
33436 /* Note the arg order is different from the operand order. */
33437 mode0 = insn_data[icode].operand[1].mode;
33438 mode2 = insn_data[icode].operand[3].mode;
33439 mode3 = insn_data[icode].operand[4].mode;
33440 mode4 = insn_data[icode].operand[5].mode;
33441
33442 if (target == NULL_RTX
33443 || GET_MODE (target) != insn_data[icode].operand[0].mode)
33444 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
33445 else
33446 subtarget = target;
33447
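/* The GATHERALT variants receive an index vector (or source and mask
   vectors) twice as wide as the underlying pattern expects, so extract
   the low halves first.  */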
33448 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
33449 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
33450 {
33451 rtx half = gen_reg_rtx (V4SImode);
33452 if (!nonimmediate_operand (op2, V8SImode))
33453 op2 = copy_to_mode_reg (V8SImode, op2);
33454 emit_insn (gen_vec_extract_lo_v8si (half, op2));
33455 op2 = half;
33456 }
33457 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
33458 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
33459 {
33460 rtx (*gen) (rtx, rtx);
33461 rtx half = gen_reg_rtx (mode0);
33462 if (mode0 == V4SFmode)
33463 gen = gen_vec_extract_lo_v8sf;
33464 else
33465 gen = gen_vec_extract_lo_v8si;
33466 if (!nonimmediate_operand (op0, GET_MODE (op0)))
33467 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
33468 emit_insn (gen (half, op0));
33469 op0 = half;
33470 if (!nonimmediate_operand (op3, GET_MODE (op3)))
33471 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
33472 emit_insn (gen (half, op3));
33473 op3 = half;
33474 }
33475
33476 /* Force the memory operand to use only a base register here, but
33477 don't do this to the memory operands of other builtin
33478 functions. */
33479 op1 = ix86_zero_extend_to_Pmode (op1);
33480
33481 if (!insn_data[icode].operand[1].predicate (op0, mode0))
33482 op0 = copy_to_mode_reg (mode0, op0);
33483 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
33484 op1 = copy_to_mode_reg (Pmode, op1);
33485 if (!insn_data[icode].operand[3].predicate (op2, mode2))
33486 op2 = copy_to_mode_reg (mode2, op2);
33487 if (!insn_data[icode].operand[4].predicate (op3, mode3))
33488 op3 = copy_to_mode_reg (mode3, op3);
33489 if (!insn_data[icode].operand[5].predicate (op4, mode4))
33490 {
33491 error ("last argument must be scale 1, 2, 4, 8");
33492 return const0_rtx;
33493 }
33494
33495 /* Optimize. If mask is known to have all high bits set,
33496 replace op0 with pc_rtx to signal that the instruction
33497 overwrites the whole destination and doesn't use its
33498 previous contents. */
33499 if (optimize)
33500 {
33501 if (TREE_CODE (arg3) == VECTOR_CST)
33502 {
33503 unsigned int negative = 0;
33504 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
33505 {
33506 tree cst = VECTOR_CST_ELT (arg3, i);
33507 if (TREE_CODE (cst) == INTEGER_CST
33508 && tree_int_cst_sign_bit (cst))
33509 negative++;
33510 else if (TREE_CODE (cst) == REAL_CST
33511 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
33512 negative++;
33513 }
33514 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
33515 op0 = pc_rtx;
33516 }
33517 else if (TREE_CODE (arg3) == SSA_NAME)
33518 {
33519 /* Also recognize when the mask is like:
33520 __v2df src = _mm_setzero_pd ();
33521 __v2df mask = _mm_cmpeq_pd (src, src);
33522 or
33523 __v8sf src = _mm256_setzero_ps ();
33524 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
33525 as that is a cheaper way to load all ones into
33526 a register than having to load a constant from
33527 memory. */
33528 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
33529 if (is_gimple_call (def_stmt))
33530 {
33531 tree fndecl = gimple_call_fndecl (def_stmt);
33532 if (fndecl
33533 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33534 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
33535 {
33536 case IX86_BUILTIN_CMPPD:
33537 case IX86_BUILTIN_CMPPS:
33538 case IX86_BUILTIN_CMPPD256:
33539 case IX86_BUILTIN_CMPPS256:
33540 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
33541 break;
33542 /* FALLTHRU */
33543 case IX86_BUILTIN_CMPEQPD:
33544 case IX86_BUILTIN_CMPEQPS:
33545 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
33546 && initializer_zerop (gimple_call_arg (def_stmt,
33547 1)))
33548 op0 = pc_rtx;
33549 break;
33550 default:
33551 break;
33552 }
33553 }
33554 }
33555 }
33556
33557 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
33558 if (! pat)
33559 return const0_rtx;
33560 emit_insn (pat);
33561
33562 if (fcode == IX86_BUILTIN_GATHERDIV8SF
33563 || fcode == IX86_BUILTIN_GATHERDIV8SI)
33564 {
33565 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
33566 ? V4SFmode : V4SImode;
33567 if (target == NULL_RTX)
33568 target = gen_reg_rtx (tmode);
33569 if (tmode == V4SFmode)
33570 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
33571 else
33572 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
33573 }
33574 else
33575 target = subtarget;
33576
33577 return target;
33578
33579 case IX86_BUILTIN_XABORT:
33580 icode = CODE_FOR_xabort;
33581 arg0 = CALL_EXPR_ARG (exp, 0);
33582 op0 = expand_normal (arg0);
33583 mode0 = insn_data[icode].operand[0].mode;
33584 if (!insn_data[icode].operand[0].predicate (op0, mode0))
33585 {
33586 error ("the xabort's argument must be an 8-bit immediate");
33587 return const0_rtx;
33588 }
33589 emit_insn (gen_xabort (op0));
33590 return 0;
33591
33592 default:
33593 break;
33594 }
33595
33596 for (i = 0, d = bdesc_special_args;
33597 i < ARRAY_SIZE (bdesc_special_args);
33598 i++, d++)
33599 if (d->code == fcode)
33600 return ix86_expand_special_args_builtin (d, exp, target);
33601
33602 for (i = 0, d = bdesc_args;
33603 i < ARRAY_SIZE (bdesc_args);
33604 i++, d++)
33605 if (d->code == fcode)
33606 switch (fcode)
33607 {
33608 case IX86_BUILTIN_FABSQ:
33609 case IX86_BUILTIN_COPYSIGNQ:
33610 if (!TARGET_SSE)
33611 /* Emit a normal call if SSE isn't available. */
33612 return expand_call (exp, target, ignore);
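/* FALLTHRU */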
33613 default:
33614 return ix86_expand_args_builtin (d, exp, target);
33615 }
33616
33617 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
33618 if (d->code == fcode)
33619 return ix86_expand_sse_comi (d, exp, target);
33620
33621 for (i = 0, d = bdesc_pcmpestr;
33622 i < ARRAY_SIZE (bdesc_pcmpestr);
33623 i++, d++)
33624 if (d->code == fcode)
33625 return ix86_expand_sse_pcmpestr (d, exp, target);
33626
33627 for (i = 0, d = bdesc_pcmpistr;
33628 i < ARRAY_SIZE (bdesc_pcmpistr);
33629 i++, d++)
33630 if (d->code == fcode)
33631 return ix86_expand_sse_pcmpistr (d, exp, target);
33632
33633 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33634 if (d->code == fcode)
33635 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
33636 (enum ix86_builtin_func_type)
33637 d->flag, d->comparison);
33638
33639 gcc_unreachable ();
33640 }
33641
33642 /* Returns a function decl for a vectorized version of the builtin function
33643 FNDECL, taking input vector type TYPE_IN and producing result vector type
33644 TYPE_OUT, or NULL_TREE if it is not available. */
33645
33646 static tree
33647 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
33648 tree type_in)
33649 {
33650 enum machine_mode in_mode, out_mode;
33651 int in_n, out_n;
33652 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
33653
33654 if (TREE_CODE (type_out) != VECTOR_TYPE
33655 || TREE_CODE (type_in) != VECTOR_TYPE
33656 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
33657 return NULL_TREE;
33658
33659 out_mode = TYPE_MODE (TREE_TYPE (type_out));
33660 out_n = TYPE_VECTOR_SUBPARTS (type_out);
33661 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33662 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33663
33664 switch (fn)
33665 {
33666 case BUILT_IN_SQRT:
33667 if (out_mode == DFmode && in_mode == DFmode)
33668 {
33669 if (out_n == 2 && in_n == 2)
33670 return ix86_builtins[IX86_BUILTIN_SQRTPD];
33671 else if (out_n == 4 && in_n == 4)
33672 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
33673 }
33674 break;
33675
33676 case BUILT_IN_SQRTF:
33677 if (out_mode == SFmode && in_mode == SFmode)
33678 {
33679 if (out_n == 4 && in_n == 4)
33680 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
33681 else if (out_n == 8 && in_n == 8)
33682 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
33683 }
33684 break;
33685
33686 case BUILT_IN_IFLOOR:
33687 case BUILT_IN_LFLOOR:
33688 case BUILT_IN_LLFLOOR:
33689 /* The round insn does not trap on denormals. */
33690 if (flag_trapping_math || !TARGET_ROUND)
33691 break;
33692
33693 if (out_mode == SImode && in_mode == DFmode)
33694 {
33695 if (out_n == 4 && in_n == 2)
33696 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
33697 else if (out_n == 8 && in_n == 4)
33698 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
33699 }
33700 break;
33701
33702 case BUILT_IN_IFLOORF:
33703 case BUILT_IN_LFLOORF:
33704 case BUILT_IN_LLFLOORF:
33705 /* The round insn does not trap on denormals. */
33706 if (flag_trapping_math || !TARGET_ROUND)
33707 break;
33708
33709 if (out_mode == SImode && in_mode == SFmode)
33710 {
33711 if (out_n == 4 && in_n == 4)
33712 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
33713 else if (out_n == 8 && in_n == 8)
33714 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
33715 }
33716 break;
33717
33718 case BUILT_IN_ICEIL:
33719 case BUILT_IN_LCEIL:
33720 case BUILT_IN_LLCEIL:
33721 /* The round insn does not trap on denormals. */
33722 if (flag_trapping_math || !TARGET_ROUND)
33723 break;
33724
33725 if (out_mode == SImode && in_mode == DFmode)
33726 {
33727 if (out_n == 4 && in_n == 2)
33728 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
33729 else if (out_n == 8 && in_n == 4)
33730 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
33731 }
33732 break;
33733
33734 case BUILT_IN_ICEILF:
33735 case BUILT_IN_LCEILF:
33736 case BUILT_IN_LLCEILF:
33737 /* The round insn does not trap on denormals. */
33738 if (flag_trapping_math || !TARGET_ROUND)
33739 break;
33740
33741 if (out_mode == SImode && in_mode == SFmode)
33742 {
33743 if (out_n == 4 && in_n == 4)
33744 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
33745 else if (out_n == 8 && in_n == 8)
33746 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
33747 }
33748 break;
33749
33750 case BUILT_IN_IRINT:
33751 case BUILT_IN_LRINT:
33752 case BUILT_IN_LLRINT:
33753 if (out_mode == SImode && in_mode == DFmode)
33754 {
33755 if (out_n == 4 && in_n == 2)
33756 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
33757 else if (out_n == 8 && in_n == 4)
33758 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
33759 }
33760 break;
33761
33762 case BUILT_IN_IRINTF:
33763 case BUILT_IN_LRINTF:
33764 case BUILT_IN_LLRINTF:
33765 if (out_mode == SImode && in_mode == SFmode)
33766 {
33767 if (out_n == 4 && in_n == 4)
33768 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
33769 else if (out_n == 8 && in_n == 8)
33770 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
33771 }
33772 break;
33773
33774 case BUILT_IN_IROUND:
33775 case BUILT_IN_LROUND:
33776 case BUILT_IN_LLROUND:
33777 /* The round insn does not trap on denormals. */
33778 if (flag_trapping_math || !TARGET_ROUND)
33779 break;
33780
33781 if (out_mode == SImode && in_mode == DFmode)
33782 {
33783 if (out_n == 4 && in_n == 2)
33784 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
33785 else if (out_n == 8 && in_n == 4)
33786 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
33787 }
33788 break;
33789
33790 case BUILT_IN_IROUNDF:
33791 case BUILT_IN_LROUNDF:
33792 case BUILT_IN_LLROUNDF:
33793 /* The round insn does not trap on denormals. */
33794 if (flag_trapping_math || !TARGET_ROUND)
33795 break;
33796
33797 if (out_mode == SImode && in_mode == SFmode)
33798 {
33799 if (out_n == 4 && in_n == 4)
33800 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
33801 else if (out_n == 8 && in_n == 8)
33802 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
33803 }
33804 break;
33805
33806 case BUILT_IN_COPYSIGN:
33807 if (out_mode == DFmode && in_mode == DFmode)
33808 {
33809 if (out_n == 2 && in_n == 2)
33810 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
33811 else if (out_n == 4 && in_n == 4)
33812 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
33813 }
33814 break;
33815
33816 case BUILT_IN_COPYSIGNF:
33817 if (out_mode == SFmode && in_mode == SFmode)
33818 {
33819 if (out_n == 4 && in_n == 4)
33820 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
33821 else if (out_n == 8 && in_n == 8)
33822 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
33823 }
33824 break;
33825
33826 case BUILT_IN_FLOOR:
33827 /* The round insn does not trap on denormals. */
33828 if (flag_trapping_math || !TARGET_ROUND)
33829 break;
33830
33831 if (out_mode == DFmode && in_mode == DFmode)
33832 {
33833 if (out_n == 2 && in_n == 2)
33834 return ix86_builtins[IX86_BUILTIN_FLOORPD];
33835 else if (out_n == 4 && in_n == 4)
33836 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
33837 }
33838 break;
33839
33840 case BUILT_IN_FLOORF:
33841 /* The round insn does not trap on denormals. */
33842 if (flag_trapping_math || !TARGET_ROUND)
33843 break;
33844
33845 if (out_mode == SFmode && in_mode == SFmode)
33846 {
33847 if (out_n == 4 && in_n == 4)
33848 return ix86_builtins[IX86_BUILTIN_FLOORPS];
33849 else if (out_n == 8 && in_n == 8)
33850 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
33851 }
33852 break;
33853
33854 case BUILT_IN_CEIL:
33855 /* The round insn does not trap on denormals. */
33856 if (flag_trapping_math || !TARGET_ROUND)
33857 break;
33858
33859 if (out_mode == DFmode && in_mode == DFmode)
33860 {
33861 if (out_n == 2 && in_n == 2)
33862 return ix86_builtins[IX86_BUILTIN_CEILPD];
33863 else if (out_n == 4 && in_n == 4)
33864 return ix86_builtins[IX86_BUILTIN_CEILPD256];
33865 }
33866 break;
33867
33868 case BUILT_IN_CEILF:
33869 /* The round insn does not trap on denormals. */
33870 if (flag_trapping_math || !TARGET_ROUND)
33871 break;
33872
33873 if (out_mode == SFmode && in_mode == SFmode)
33874 {
33875 if (out_n == 4 && in_n == 4)
33876 return ix86_builtins[IX86_BUILTIN_CEILPS];
33877 else if (out_n == 8 && in_n == 8)
33878 return ix86_builtins[IX86_BUILTIN_CEILPS256];
33879 }
33880 break;
33881
33882 case BUILT_IN_TRUNC:
33883 /* The round insn does not trap on denormals. */
33884 if (flag_trapping_math || !TARGET_ROUND)
33885 break;
33886
33887 if (out_mode == DFmode && in_mode == DFmode)
33888 {
33889 if (out_n == 2 && in_n == 2)
33890 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
33891 else if (out_n == 4 && in_n == 4)
33892 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
33893 }
33894 break;
33895
33896 case BUILT_IN_TRUNCF:
33897 /* The round insn does not trap on denormals. */
33898 if (flag_trapping_math || !TARGET_ROUND)
33899 break;
33900
33901 if (out_mode == SFmode && in_mode == SFmode)
33902 {
33903 if (out_n == 4 && in_n == 4)
33904 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
33905 else if (out_n == 8 && in_n == 8)
33906 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
33907 }
33908 break;
33909
33910 case BUILT_IN_RINT:
33911 /* The round insn does not trap on denormals. */
33912 if (flag_trapping_math || !TARGET_ROUND)
33913 break;
33914
33915 if (out_mode == DFmode && in_mode == DFmode)
33916 {
33917 if (out_n == 2 && in_n == 2)
33918 return ix86_builtins[IX86_BUILTIN_RINTPD];
33919 else if (out_n == 4 && in_n == 4)
33920 return ix86_builtins[IX86_BUILTIN_RINTPD256];
33921 }
33922 break;
33923
33924 case BUILT_IN_RINTF:
33925 /* The round insn does not trap on denormals. */
33926 if (flag_trapping_math || !TARGET_ROUND)
33927 break;
33928
33929 if (out_mode == SFmode && in_mode == SFmode)
33930 {
33931 if (out_n == 4 && in_n == 4)
33932 return ix86_builtins[IX86_BUILTIN_RINTPS];
33933 else if (out_n == 8 && in_n == 8)
33934 return ix86_builtins[IX86_BUILTIN_RINTPS256];
33935 }
33936 break;
33937
33938 case BUILT_IN_ROUND:
33939 /* The round insn does not trap on denormals. */
33940 if (flag_trapping_math || !TARGET_ROUND)
33941 break;
33942
33943 if (out_mode == DFmode && in_mode == DFmode)
33944 {
33945 if (out_n == 2 && in_n == 2)
33946 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
33947 else if (out_n == 4 && in_n == 4)
33948 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
33949 }
33950 break;
33951
33952 case BUILT_IN_ROUNDF:
33953 /* The round insn does not trap on denormals. */
33954 if (flag_trapping_math || !TARGET_ROUND)
33955 break;
33956
33957 if (out_mode == SFmode && in_mode == SFmode)
33958 {
33959 if (out_n == 4 && in_n == 4)
33960 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
33961 else if (out_n == 8 && in_n == 8)
33962 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
33963 }
33964 break;
33965
33966 case BUILT_IN_FMA:
33967 if (out_mode == DFmode && in_mode == DFmode)
33968 {
33969 if (out_n == 2 && in_n == 2)
33970 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
33971 if (out_n == 4 && in_n == 4)
33972 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
33973 }
33974 break;
33975
33976 case BUILT_IN_FMAF:
33977 if (out_mode == SFmode && in_mode == SFmode)
33978 {
33979 if (out_n == 4 && in_n == 4)
33980 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
33981 if (out_n == 8 && in_n == 8)
33982 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
33983 }
33984 break;
33985
33986 default:
33987 break;
33988 }
33989
33990 /* Dispatch to a handler for a vectorization library. */
33991 if (ix86_veclib_handler)
33992 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
33993 type_in);
33994
33995 return NULL_TREE;
33996 }
33997
33998 /* Handler for an SVML-style interface to
33999 a library with vectorized intrinsics. */
34000
34001 static tree
34002 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
34003 {
34004 char name[20];
34005 tree fntype, new_fndecl, args;
34006 unsigned arity;
34007 const char *bname;
34008 enum machine_mode el_mode, in_mode;
34009 int n, in_n;
34010
34011 /* The SVML library is suitable for unsafe math only. */
34012 if (!flag_unsafe_math_optimizations)
34013 return NULL_TREE;
34014
34015 el_mode = TYPE_MODE (TREE_TYPE (type_out));
34016 n = TYPE_VECTOR_SUBPARTS (type_out);
34017 in_mode = TYPE_MODE (TREE_TYPE (type_in));
34018 in_n = TYPE_VECTOR_SUBPARTS (type_in);
34019 if (el_mode != in_mode
34020 || n != in_n)
34021 return NULL_TREE;
34022
34023 switch (fn)
34024 {
34025 case BUILT_IN_EXP:
34026 case BUILT_IN_LOG:
34027 case BUILT_IN_LOG10:
34028 case BUILT_IN_POW:
34029 case BUILT_IN_TANH:
34030 case BUILT_IN_TAN:
34031 case BUILT_IN_ATAN:
34032 case BUILT_IN_ATAN2:
34033 case BUILT_IN_ATANH:
34034 case BUILT_IN_CBRT:
34035 case BUILT_IN_SINH:
34036 case BUILT_IN_SIN:
34037 case BUILT_IN_ASINH:
34038 case BUILT_IN_ASIN:
34039 case BUILT_IN_COSH:
34040 case BUILT_IN_COS:
34041 case BUILT_IN_ACOSH:
34042 case BUILT_IN_ACOS:
34043 if (el_mode != DFmode || n != 2)
34044 return NULL_TREE;
34045 break;
34046
34047 case BUILT_IN_EXPF:
34048 case BUILT_IN_LOGF:
34049 case BUILT_IN_LOG10F:
34050 case BUILT_IN_POWF:
34051 case BUILT_IN_TANHF:
34052 case BUILT_IN_TANF:
34053 case BUILT_IN_ATANF:
34054 case BUILT_IN_ATAN2F:
34055 case BUILT_IN_ATANHF:
34056 case BUILT_IN_CBRTF:
34057 case BUILT_IN_SINHF:
34058 case BUILT_IN_SINF:
34059 case BUILT_IN_ASINHF:
34060 case BUILT_IN_ASINF:
34061 case BUILT_IN_COSHF:
34062 case BUILT_IN_COSF:
34063 case BUILT_IN_ACOSHF:
34064 case BUILT_IN_ACOSF:
34065 if (el_mode != SFmode || n != 4)
34066 return NULL_TREE;
34067 break;
34068
34069 default:
34070 return NULL_TREE;
34071 }
34072
34073 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
34074
34075 if (fn == BUILT_IN_LOGF)
34076 strcpy (name, "vmlsLn4");
34077 else if (fn == BUILT_IN_LOG)
34078 strcpy (name, "vmldLn2");
34079 else if (n == 4)
34080 {
34081 sprintf (name, "vmls%s", bname+10);
34082 name[strlen (name)-1] = '4';
34083 }
34084 else
34085 sprintf (name, "vmld%s2", bname+10);
34086
34087 /* Convert to uppercase. */
34088 name[4] &= ~0x20;
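/* E.g. BUILT_IN_SINF with 4-element vectors becomes "vmlsSin4" and
   BUILT_IN_SIN with 2-element vectors becomes "vmldSin2" (the
   "__builtin_" prefix is skipped via bname+10 above).  */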
34089
34090 arity = 0;
34091 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
34092 args;
34093 args = TREE_CHAIN (args))
34094 arity++;
34095
34096 if (arity == 1)
34097 fntype = build_function_type_list (type_out, type_in, NULL);
34098 else
34099 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
34100
34101 /* Build a function declaration for the vectorized function. */
34102 new_fndecl = build_decl (BUILTINS_LOCATION,
34103 FUNCTION_DECL, get_identifier (name), fntype);
34104 TREE_PUBLIC (new_fndecl) = 1;
34105 DECL_EXTERNAL (new_fndecl) = 1;
34106 DECL_IS_NOVOPS (new_fndecl) = 1;
34107 TREE_READONLY (new_fndecl) = 1;
34108
34109 return new_fndecl;
34110 }
34111
34112 /* Handler for an ACML-style interface to
34113 a library with vectorized intrinsics. */
34114
34115 static tree
34116 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
34117 {
34118 char name[20] = "__vr.._";
34119 tree fntype, new_fndecl, args;
34120 unsigned arity;
34121 const char *bname;
34122 enum machine_mode el_mode, in_mode;
34123 int n, in_n;
34124
34125 /* The ACML library is 64-bit only and suitable for unsafe math only, as
34126 it does not correctly support parts of IEEE arithmetic with the required
34127 precision, such as denormals. */
34128 if (!TARGET_64BIT
34129 || !flag_unsafe_math_optimizations)
34130 return NULL_TREE;
34131
34132 el_mode = TYPE_MODE (TREE_TYPE (type_out));
34133 n = TYPE_VECTOR_SUBPARTS (type_out);
34134 in_mode = TYPE_MODE (TREE_TYPE (type_in));
34135 in_n = TYPE_VECTOR_SUBPARTS (type_in);
34136 if (el_mode != in_mode
34137 || n != in_n)
34138 return NULL_TREE;
34139
34140 switch (fn)
34141 {
34142 case BUILT_IN_SIN:
34143 case BUILT_IN_COS:
34144 case BUILT_IN_EXP:
34145 case BUILT_IN_LOG:
34146 case BUILT_IN_LOG2:
34147 case BUILT_IN_LOG10:
34148 name[4] = 'd';
34149 name[5] = '2';
34150 if (el_mode != DFmode
34151 || n != 2)
34152 return NULL_TREE;
34153 break;
34154
34155 case BUILT_IN_SINF:
34156 case BUILT_IN_COSF:
34157 case BUILT_IN_EXPF:
34158 case BUILT_IN_POWF:
34159 case BUILT_IN_LOGF:
34160 case BUILT_IN_LOG2F:
34161 case BUILT_IN_LOG10F:
34162 name[4] = 's';
34163 name[5] = '4';
34164 if (el_mode != SFmode
34165 || n != 4)
34166 return NULL_TREE;
34167 break;
34168
34169 default:
34170 return NULL_TREE;
34171 }
34172
34173 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
34174 sprintf (name + 7, "%s", bname+10);
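/* E.g. BUILT_IN_SIN becomes "__vrd2_sin" and BUILT_IN_SINF becomes
   "__vrs4_sinf".  */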
34175
34176 arity = 0;
34177 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
34178 args;
34179 args = TREE_CHAIN (args))
34180 arity++;
34181
34182 if (arity == 1)
34183 fntype = build_function_type_list (type_out, type_in, NULL);
34184 else
34185 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
34186
34187 /* Build a function declaration for the vectorized function. */
34188 new_fndecl = build_decl (BUILTINS_LOCATION,
34189 FUNCTION_DECL, get_identifier (name), fntype);
34190 TREE_PUBLIC (new_fndecl) = 1;
34191 DECL_EXTERNAL (new_fndecl) = 1;
34192 DECL_IS_NOVOPS (new_fndecl) = 1;
34193 TREE_READONLY (new_fndecl) = 1;
34194
34195 return new_fndecl;
34196 }
34197
34198 /* Returns a decl of a function that implements gather load with
34199 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
34200 Return NULL_TREE if it is not available. */
34201
34202 static tree
34203 ix86_vectorize_builtin_gather (const_tree mem_vectype,
34204 const_tree index_type, int scale)
34205 {
34206 bool si;
34207 enum ix86_builtins code;
34208
34209 if (! TARGET_AVX2)
34210 return NULL_TREE;
34211
34212 if ((TREE_CODE (index_type) != INTEGER_TYPE
34213 && !POINTER_TYPE_P (index_type))
34214 || (TYPE_MODE (index_type) != SImode
34215 && TYPE_MODE (index_type) != DImode))
34216 return NULL_TREE;
34217
34218 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
34219 return NULL_TREE;
34220
34221 /* v*gather* insn sign extends index to pointer mode. */
34222 if (TYPE_PRECISION (index_type) < POINTER_SIZE
34223 && TYPE_UNSIGNED (index_type))
34224 return NULL_TREE;
34225
34226 if (scale <= 0
34227 || scale > 8
34228 || (scale & (scale - 1)) != 0)
34229 return NULL_TREE;
34230
34231 si = TYPE_MODE (index_type) == SImode;
34232 switch (TYPE_MODE (mem_vectype))
34233 {
34234 case V2DFmode:
34235 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
34236 break;
34237 case V4DFmode:
34238 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
34239 break;
34240 case V2DImode:
34241 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
34242 break;
34243 case V4DImode:
34244 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
34245 break;
34246 case V4SFmode:
34247 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
34248 break;
34249 case V8SFmode:
34250 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
34251 break;
34252 case V4SImode:
34253 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
34254 break;
34255 case V8SImode:
34256 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
34257 break;
34258 default:
34259 return NULL_TREE;
34260 }
34261
34262 return ix86_builtins[code];
34263 }
34264
34265 /* Returns a code for a target-specific builtin that implements
34266 reciprocal of the function, or NULL_TREE if not available. */
34267
34268 static tree
34269 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
34270 bool sqrt ATTRIBUTE_UNUSED)
34271 {
34272 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
34273 && flag_finite_math_only && !flag_trapping_math
34274 && flag_unsafe_math_optimizations))
34275 return NULL_TREE;
34276
34277 if (md_fn)
34278 /* Machine dependent builtins. */
34279 switch (fn)
34280 {
34281 /* Vectorized version of sqrt to rsqrt conversion. */
34282 case IX86_BUILTIN_SQRTPS_NR:
34283 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
34284
34285 case IX86_BUILTIN_SQRTPS_NR256:
34286 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
34287
34288 default:
34289 return NULL_TREE;
34290 }
34291 else
34292 /* Normal builtins. */
34293 switch (fn)
34294 {
34295 /* Sqrt to rsqrt conversion. */
34296 case BUILT_IN_SQRTF:
34297 return ix86_builtins[IX86_BUILTIN_RSQRTF];
34298
34299 default:
34300 return NULL_TREE;
34301 }
34302 }
34303 \f
34304 /* Helper for avx_vpermilps256_operand et al. This is also used by
34305 the expansion functions to turn the parallel back into a mask.
34306 The return value is 0 for no match and the imm8+1 for a match. */
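/* Worked example (derived from the V4SFmode arm below): for a PARALLEL
   selecting elements [3 2 1 0], the loop accumulates
   mask = 3<<0 | 2<<2 | 1<<4 | 0<<6 = 0x1b, so the function returns
   0x1c and the caller recovers the vpermilps immediate as 0x1b.  */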
34307
34308 int
34309 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
34310 {
34311 unsigned i, nelt = GET_MODE_NUNITS (mode);
34312 unsigned mask = 0;
34313 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
34314
34315 if (XVECLEN (par, 0) != (int) nelt)
34316 return 0;
34317
34318 /* Validate that all of the elements are constants, and not totally
34319 out of range. Copy the data into an integral array to make the
34320 subsequent checks easier. */
34321 for (i = 0; i < nelt; ++i)
34322 {
34323 rtx er = XVECEXP (par, 0, i);
34324 unsigned HOST_WIDE_INT ei;
34325
34326 if (!CONST_INT_P (er))
34327 return 0;
34328 ei = INTVAL (er);
34329 if (ei >= nelt)
34330 return 0;
34331 ipar[i] = ei;
34332 }
34333
34334 switch (mode)
34335 {
34336 case V4DFmode:
34337 /* In the 256-bit DFmode case, we can only move elements within
34338 a 128-bit lane. */
34339 for (i = 0; i < 2; ++i)
34340 {
34341 if (ipar[i] >= 2)
34342 return 0;
34343 mask |= ipar[i] << i;
34344 }
34345 for (i = 2; i < 4; ++i)
34346 {
34347 if (ipar[i] < 2)
34348 return 0;
34349 mask |= (ipar[i] - 2) << i;
34350 }
34351 break;
34352
34353 case V8SFmode:
34354 /* In the 256-bit SFmode case, we have full freedom of movement
34355 within the low 128-bit lane, but the high 128-bit lane must
34356 mirror the exact same pattern. */
34357 for (i = 0; i < 4; ++i)
34358 if (ipar[i] + 4 != ipar[i + 4])
34359 return 0;
34360 nelt = 4;
34361 /* FALLTHRU */
34362
34363 case V2DFmode:
34364 case V4SFmode:
34365 /* In the 128-bit case, we have full freedom in the placement of
34366 the elements from the source operand. */
34367 for (i = 0; i < nelt; ++i)
34368 mask |= ipar[i] << (i * (nelt / 2));
34369 break;
34370
34371 default:
34372 gcc_unreachable ();
34373 }
34374
34375 /* Make sure success has a non-zero value by adding one. */
34376 return mask + 1;
34377 }
34378
34379 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
34380 the expansion functions to turn the parallel back into a mask.
34381 The return value is 0 for no match and the imm8+1 for a match. */
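/* Worked example (derived from the code below): for V8SFmode, a PARALLEL
   selecting elements [4 5 6 7 8 9 10 11] (0-7 from the first operand,
   8-15 from the second) passes the contiguity checks with ipar[0] = 4 and
   ipar[4] = 8, giving mask = (4/4) << 0 | (8/4) << 4 = 0x21, so the
   function returns 0x22 and the imm8 is 0x21.  */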
34382
34383 int
34384 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
34385 {
34386 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
34387 unsigned mask = 0;
34388 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
34389
34390 if (XVECLEN (par, 0) != (int) nelt)
34391 return 0;
34392
34393 /* Validate that all of the elements are constants, and not totally
34394 out of range. Copy the data into an integral array to make the
34395 subsequent checks easier. */
34396 for (i = 0; i < nelt; ++i)
34397 {
34398 rtx er = XVECEXP (par, 0, i);
34399 unsigned HOST_WIDE_INT ei;
34400
34401 if (!CONST_INT_P (er))
34402 return 0;
34403 ei = INTVAL (er);
34404 if (ei >= 2 * nelt)
34405 return 0;
34406 ipar[i] = ei;
34407 }
34408
34409 /* Validate that each half of the permute selects consecutive elements. */
34410 for (i = 0; i < nelt2 - 1; ++i)
34411 if (ipar[i] + 1 != ipar[i + 1])
34412 return 0;
34413 for (i = nelt2; i < nelt - 1; ++i)
34414 if (ipar[i] + 1 != ipar[i + 1])
34415 return 0;
34416
34417 /* Reconstruct the mask. */
34418 for (i = 0; i < 2; ++i)
34419 {
34420 unsigned e = ipar[i * nelt2];
34421 if (e % nelt2)
34422 return 0;
34423 e /= nelt2;
34424 mask |= e << (i * 4);
34425 }
34426
34427 /* Make sure success has a non-zero value by adding one. */
34428 return mask + 1;
34429 }
34430 \f
34431 /* Store OPERAND to memory after reload is completed. This means
34432 that we can't easily use assign_stack_local. */
34433 rtx
34434 ix86_force_to_memory (enum machine_mode mode, rtx operand)
34435 {
34436 rtx result;
34437
34438 gcc_assert (reload_completed);
34439 if (ix86_using_red_zone ())
34440 {
34441 result = gen_rtx_MEM (mode,
34442 gen_rtx_PLUS (Pmode,
34443 stack_pointer_rtx,
34444 GEN_INT (-RED_ZONE_SIZE)));
34445 emit_move_insn (result, operand);
34446 }
34447 else if (TARGET_64BIT)
34448 {
34449 switch (mode)
34450 {
34451 case HImode:
34452 case SImode:
34453 operand = gen_lowpart (DImode, operand);
34454 /* FALLTHRU */
34455 case DImode:
34456 emit_insn (
34457 gen_rtx_SET (VOIDmode,
34458 gen_rtx_MEM (DImode,
34459 gen_rtx_PRE_DEC (DImode,
34460 stack_pointer_rtx)),
34461 operand));
34462 break;
34463 default:
34464 gcc_unreachable ();
34465 }
34466 result = gen_rtx_MEM (mode, stack_pointer_rtx);
34467 }
34468 else
34469 {
34470 switch (mode)
34471 {
34472 case DImode:
34473 {
34474 rtx operands[2];
34475 split_double_mode (mode, &operand, 1, operands, operands + 1);
34476 emit_insn (
34477 gen_rtx_SET (VOIDmode,
34478 gen_rtx_MEM (SImode,
34479 gen_rtx_PRE_DEC (Pmode,
34480 stack_pointer_rtx)),
34481 operands[1]));
34482 emit_insn (
34483 gen_rtx_SET (VOIDmode,
34484 gen_rtx_MEM (SImode,
34485 gen_rtx_PRE_DEC (Pmode,
34486 stack_pointer_rtx)),
34487 operands[0]));
34488 }
34489 break;
34490 case HImode:
34491 /* Store HImodes as SImodes. */
34492 operand = gen_lowpart (SImode, operand);
34493 /* FALLTHRU */
34494 case SImode:
34495 emit_insn (
34496 gen_rtx_SET (VOIDmode,
34497 gen_rtx_MEM (GET_MODE (operand),
34498 gen_rtx_PRE_DEC (SImode,
34499 stack_pointer_rtx)),
34500 operand));
34501 break;
34502 default:
34503 gcc_unreachable ();
34504 }
34505 result = gen_rtx_MEM (mode, stack_pointer_rtx);
34506 }
34507 return result;
34508 }
34509
34510 /* Free the operand from memory. */
34511 void
34512 ix86_free_from_memory (enum machine_mode mode)
34513 {
34514 if (!ix86_using_red_zone ())
34515 {
34516 int size;
34517
34518 if (mode == DImode || TARGET_64BIT)
34519 size = 8;
34520 else
34521 size = 4;
34522 /* Use LEA to deallocate stack space. In peephole2 it will be converted
34523 to a pop or add instruction if registers are available. */
34524 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
34525 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
34526 GEN_INT (size))));
34527 }
34528 }
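/* ix86_force_to_memory and ix86_free_from_memory are intended to be used as
   a pair: a caller (e.g. an x87 splitter in i386.md) spills a value with the
   former, uses the resulting stack slot, and then releases it with the
   latter.  Outside the red zone the slot is created by a push and released
   by the stack-pointer adjustment above.  */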
34529
34530 /* Return a register priority for hard reg REGNO. */
34531 static int
34532 ix86_register_priority (int hard_regno)
34533 {
34534 /* ebp and r13 as the base always want a displacement, r12 as the
34535 base always wants an index. So discourage their use in an
34536 address. */
34537 if (hard_regno == R12_REG || hard_regno == R13_REG)
34538 return 0;
34539 if (hard_regno == BP_REG)
34540 return 1;
34541 /* New x86-64 int registers result in bigger code size. Discourage
34542 them. */
34543 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
34544 return 2;
34545 /* New x86-64 SSE registers result in bigger code size. Discourage
34546 them. */
34547 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
34548 return 2;
34549 /* Usage of AX register results in smaller code. Prefer it. */
34550 if (hard_regno == 0)
34551 return 4;
34552 return 3;
34553 }
34554
34555 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
34556
34557 Put float CONST_DOUBLE in the constant pool instead of fp regs.
34558 QImode must go into class Q_REGS.
34559 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
34560 movdf to do mem-to-mem moves through integer regs. */
34561
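/* For instance, a nonzero DFmode CONST_DOUBLE destined for an SSE class
   falls into the CONSTANT_P check below and yields NO_REGS, so the constant
   is forced into the constant pool and loaded from memory instead.  */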
34562 static reg_class_t
34563 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
34564 {
34565 enum machine_mode mode = GET_MODE (x);
34566
34567 /* We're only allowed to return a subclass of CLASS. Many of the
34568 following checks fail for NO_REGS, so eliminate that early. */
34569 if (regclass == NO_REGS)
34570 return NO_REGS;
34571
34572 /* All classes can load zeros. */
34573 if (x == CONST0_RTX (mode))
34574 return regclass;
34575
34576 /* Force constants into memory if we are loading a (nonzero) constant into
34577 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
34578 instructions to load from a constant. */
34579 if (CONSTANT_P (x)
34580 && (MAYBE_MMX_CLASS_P (regclass)
34581 || MAYBE_SSE_CLASS_P (regclass)
34582 || MAYBE_MASK_CLASS_P (regclass)))
34583 return NO_REGS;
34584
34585 /* Prefer SSE regs only, if we can use them for math. */
34586 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
34587 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
34588
34589 /* Floating-point constants need more complex checks. */
34590 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
34591 {
34592 /* General regs can load everything. */
34593 if (reg_class_subset_p (regclass, GENERAL_REGS))
34594 return regclass;
34595
34596 /* Floats can load 0 and 1 plus some others. Note that we eliminated
34597 zero above. We only want to wind up preferring 80387 registers if
34598 we plan on doing computation with them. */
34599 if (TARGET_80387
34600 && standard_80387_constant_p (x) > 0)
34601 {
34602 /* Limit class to non-sse. */
34603 if (regclass == FLOAT_SSE_REGS)
34604 return FLOAT_REGS;
34605 if (regclass == FP_TOP_SSE_REGS)
34606 return FP_TOP_REG;
34607 if (regclass == FP_SECOND_SSE_REGS)
34608 return FP_SECOND_REG;
34609 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
34610 return regclass;
34611 }
34612
34613 return NO_REGS;
34614 }
34615
34616 /* Generally when we see PLUS here, it's the function invariant
34617 (plus soft-fp const_int), which can only be computed into general
34618 regs. */
34619 if (GET_CODE (x) == PLUS)
34620 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
34621
34622 /* QImode constants are easy to load, but non-constant QImode data
34623 must go into Q_REGS. */
34624 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
34625 {
34626 if (reg_class_subset_p (regclass, Q_REGS))
34627 return regclass;
34628 if (reg_class_subset_p (Q_REGS, regclass))
34629 return Q_REGS;
34630 return NO_REGS;
34631 }
34632
34633 return regclass;
34634 }
34635
34636 /* Discourage putting floating-point values in SSE registers unless
34637 SSE math is being used, and likewise for the 387 registers. */
34638 static reg_class_t
34639 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
34640 {
34641 enum machine_mode mode = GET_MODE (x);
34642
34643 /* Restrict the output reload class to the register bank that we are doing
34644 math on. If we would like not to return a subset of CLASS, reject this
34645 alternative: if reload cannot do this, it will still use its choice. */
34646 mode = GET_MODE (x);
34647 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
34648 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
34649
34650 if (X87_FLOAT_MODE_P (mode))
34651 {
34652 if (regclass == FP_TOP_SSE_REGS)
34653 return FP_TOP_REG;
34654 else if (regclass == FP_SECOND_SSE_REGS)
34655 return FP_SECOND_REG;
34656 else
34657 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
34658 }
34659
34660 return regclass;
34661 }
34662
34663 static reg_class_t
34664 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
34665 enum machine_mode mode, secondary_reload_info *sri)
34666 {
34667 /* Double-word spills from general registers to non-offsettable memory
34668 references (zero-extended addresses) require special handling. */
34669 if (TARGET_64BIT
34670 && MEM_P (x)
34671 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
34672 && INTEGER_CLASS_P (rclass)
34673 && !offsettable_memref_p (x))
34674 {
34675 sri->icode = (in_p
34676 ? CODE_FOR_reload_noff_load
34677 : CODE_FOR_reload_noff_store);
34678 /* Add the cost of moving address to a temporary. */
34679 sri->extra_cost = 1;
34680
34681 return NO_REGS;
34682 }
34683
34684 /* QImode spills from non-QI registers require an
34685 intermediate register on 32-bit targets. */
34686 if (mode == QImode
34687 && (MAYBE_MASK_CLASS_P (rclass)
34688 || (!TARGET_64BIT && !in_p
34689 && INTEGER_CLASS_P (rclass)
34690 && MAYBE_NON_Q_CLASS_P (rclass))))
34691 {
34692 int regno;
34693
34694 if (REG_P (x))
34695 regno = REGNO (x);
34696 else
34697 regno = -1;
34698
34699 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
34700 regno = true_regnum (x);
34701
34702 /* Return Q_REGS if the operand is in memory. */
34703 if (regno == -1)
34704 return Q_REGS;
34705 }
34706
34707 /* This condition handles corner case where an expression involving
34708 pointers gets vectorized. We're trying to use the address of a
34709 stack slot as a vector initializer.
34710
34711 (set (reg:V2DI 74 [ vect_cst_.2 ])
34712 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
34713
34714 Eventually frame gets turned into sp+offset like this:
34715
34716 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34717 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34718 (const_int 392 [0x188]))))
34719
34720 That later gets turned into:
34721
34722 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34723 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34724 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
34725
34726 We'll have the following reload recorded:
34727
34728 Reload 0: reload_in (DI) =
34729 (plus:DI (reg/f:DI 7 sp)
34730 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
34731 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34732 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
34733 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
34734 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34735 reload_reg_rtx: (reg:V2DI 22 xmm1)
34736
34737 Which isn't going to work since SSE instructions can't handle scalar
34738 additions. Returning GENERAL_REGS forces the addition into integer
34739 register and reload can handle subsequent reloads without problems. */
34740
34741 if (in_p && GET_CODE (x) == PLUS
34742 && SSE_CLASS_P (rclass)
34743 && SCALAR_INT_MODE_P (mode))
34744 return GENERAL_REGS;
34745
34746 return NO_REGS;
34747 }
34748
34749 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
34750
34751 static bool
34752 ix86_class_likely_spilled_p (reg_class_t rclass)
34753 {
34754 switch (rclass)
34755 {
34756 case AREG:
34757 case DREG:
34758 case CREG:
34759 case BREG:
34760 case AD_REGS:
34761 case SIREG:
34762 case DIREG:
34763 case SSE_FIRST_REG:
34764 case FP_TOP_REG:
34765 case FP_SECOND_REG:
34766 case BND_REGS:
34767 return true;
34768
34769 default:
34770 break;
34771 }
34772
34773 return false;
34774 }
34775
34776 /* If we are copying between general and FP registers, we need a memory
34777 location. The same is true for SSE and MMX registers.
34778
34779 To optimize register_move_cost performance, allow inline variant.
34780
34781 The macro can't work reliably when one of the CLASSES is a class containing
34782 registers from multiple units (SSE, MMX, integer). We avoid this by never
34783 combining those units in a single alternative in the machine description.
34784 Ensure that this constraint holds to avoid unexpected surprises.
34785
34786 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
34787 enforce these sanity checks. */
34788
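/* As an example, with -m32 (UNITS_PER_WORD == 4) and SSE2 enabled, a DImode
   copy between SSE_REGS and GENERAL_REGS reports that secondary memory is
   needed: the 8-byte mode is wider than a word, so it cannot be done with a
   single inter-unit move.  */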
34789 static inline bool
34790 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34791 enum machine_mode mode, int strict)
34792 {
34793 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
34794 return false;
34795 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
34796 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
34797 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
34798 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
34799 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
34800 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
34801 {
34802 gcc_assert (!strict || lra_in_progress);
34803 return true;
34804 }
34805
34806 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
34807 return true;
34808
34809 /* ??? This is a lie. We do have moves between mmx/general, and for
34810 mmx/sse2. But by saying we need secondary memory we discourage the
34811 register allocator from using the mmx registers unless needed. */
34812 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
34813 return true;
34814
34815 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34816 {
34817 /* SSE1 doesn't have any direct moves from other classes. */
34818 if (!TARGET_SSE2)
34819 return true;
34820
34821 /* If the target says that inter-unit moves are more expensive
34822 than moving through memory, then don't generate them. */
34823 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
34824 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
34825 return true;
34826
34827 /* Between SSE and general, we have moves no larger than word size. */
34828 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34829 return true;
34830 }
34831
34832 return false;
34833 }
34834
34835 bool
34836 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34837 enum machine_mode mode, int strict)
34838 {
34839 return inline_secondary_memory_needed (class1, class2, mode, strict);
34840 }
34841
34842 /* Implement the TARGET_CLASS_MAX_NREGS hook.
34843
34844 On the 80386, this is the size of MODE in words,
34845 except in the FP regs, where a single reg is always enough. */
34846
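/* For example, on a 32-bit target XCmode occupies 6 words in the integer
   classes (24 bytes / 4), but only 2 registers in the x87 or SSE classes,
   where one register per complex component is enough.  */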
34847 static unsigned char
34848 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
34849 {
34850 if (MAYBE_INTEGER_CLASS_P (rclass))
34851 {
34852 if (mode == XFmode)
34853 return (TARGET_64BIT ? 2 : 3);
34854 else if (mode == XCmode)
34855 return (TARGET_64BIT ? 4 : 6);
34856 else
34857 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
34858 }
34859 else
34860 {
34861 if (COMPLEX_MODE_P (mode))
34862 return 2;
34863 else
34864 return 1;
34865 }
34866 }
34867
34868 /* Return true if the registers in CLASS cannot represent the change from
34869 modes FROM to TO. */
34870
34871 bool
34872 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
34873 enum reg_class regclass)
34874 {
34875 if (from == to)
34876 return false;
34877
34878 /* x87 registers can't do subreg at all, as all values are reformatted
34879 to extended precision. */
34880 if (MAYBE_FLOAT_CLASS_P (regclass))
34881 return true;
34882
34883 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
34884 {
34885 /* Vector registers do not support QI or HImode loads. If we don't
34886 disallow a change to these modes, reload will assume it's ok to
34887 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
34888 the vec_dupv4hi pattern. */
34889 if (GET_MODE_SIZE (from) < 4)
34890 return true;
34891
34892 /* Vector registers do not support subreg with nonzero offsets, which
34893 are otherwise valid for integer registers. Since we can't see
34894 whether we have a nonzero offset from here, prohibit all
34895 nonparadoxical subregs changing size. */
34896 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
34897 return true;
34898 }
34899
34900 return false;
34901 }
34902
34903 /* Return the cost of moving data of mode M between a
34904 register and memory. A value of 2 is the default; this cost is
34905 relative to those in `REGISTER_MOVE_COST'.
34906
34907 This function is used extensively by register_move_cost that is used to
34908 build tables at startup. Make it inline in this case.
34909 When IN is 2, return maximum of in and out move cost.
34910
34911 If moving between registers and memory is more expensive than
34912 between two registers, you should define this macro to express the
34913 relative cost.
34914
34915 Also model the increased cost of moving QImode registers in non
34916 Q_REGS classes.
34917 */
34918 static inline int
34919 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
34920 int in)
34921 {
34922 int cost;
34923 if (FLOAT_CLASS_P (regclass))
34924 {
34925 int index;
34926 switch (mode)
34927 {
34928 case SFmode:
34929 index = 0;
34930 break;
34931 case DFmode:
34932 index = 1;
34933 break;
34934 case XFmode:
34935 index = 2;
34936 break;
34937 default:
34938 return 100;
34939 }
34940 if (in == 2)
34941 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
34942 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
34943 }
34944 if (SSE_CLASS_P (regclass))
34945 {
34946 int index;
34947 switch (GET_MODE_SIZE (mode))
34948 {
34949 case 4:
34950 index = 0;
34951 break;
34952 case 8:
34953 index = 1;
34954 break;
34955 case 16:
34956 index = 2;
34957 break;
34958 default:
34959 return 100;
34960 }
34961 if (in == 2)
34962 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
34963 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
34964 }
34965 if (MMX_CLASS_P (regclass))
34966 {
34967 int index;
34968 switch (GET_MODE_SIZE (mode))
34969 {
34970 case 4:
34971 index = 0;
34972 break;
34973 case 8:
34974 index = 1;
34975 break;
34976 default:
34977 return 100;
34978 }
34979 if (in == 2)
34980 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
34981 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
34982 }
34983 switch (GET_MODE_SIZE (mode))
34984 {
34985 case 1:
34986 if (Q_CLASS_P (regclass) || TARGET_64BIT)
34987 {
34988 if (!in)
34989 return ix86_cost->int_store[0];
34990 if (TARGET_PARTIAL_REG_DEPENDENCY
34991 && optimize_function_for_speed_p (cfun))
34992 cost = ix86_cost->movzbl_load;
34993 else
34994 cost = ix86_cost->int_load[0];
34995 if (in == 2)
34996 return MAX (cost, ix86_cost->int_store[0]);
34997 return cost;
34998 }
34999 else
35000 {
35001 if (in == 2)
35002 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
35003 if (in)
35004 return ix86_cost->movzbl_load;
35005 else
35006 return ix86_cost->int_store[0] + 4;
35007 }
35008 break;
35009 case 2:
35010 if (in == 2)
35011 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
35012 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
35013 default:
35014 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
35015 if (mode == TFmode)
35016 mode = XFmode;
35017 if (in == 2)
35018 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
35019 else if (in)
35020 cost = ix86_cost->int_load[2];
35021 else
35022 cost = ix86_cost->int_store[2];
35023 return (cost * (((int) GET_MODE_SIZE (mode)
35024 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
35025 }
35026 }
35027
35028 static int
35029 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
35030 bool in)
35031 {
35032 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
35033 }
35034
35035
35036 /* Return the cost of moving data from a register in class CLASS1 to
35037 one in class CLASS2.
35038
35039 It is not required that the cost always equal 2 when FROM is the same as TO;
35040 on some machines it is expensive to move between registers if they are not
35041 general registers. */
35042
35043 static int
35044 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
35045 reg_class_t class2_i)
35046 {
35047 enum reg_class class1 = (enum reg_class) class1_i;
35048 enum reg_class class2 = (enum reg_class) class2_i;
35049
35050 /* In case we require secondary memory, compute cost of the store followed
35051 by load. In order to avoid bad register allocation choices, we need
35052 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
35053
35054 if (inline_secondary_memory_needed (class1, class2, mode, 0))
35055 {
35056 int cost = 1;
35057
35058 cost += inline_memory_move_cost (mode, class1, 2);
35059 cost += inline_memory_move_cost (mode, class2, 2);
35060
35061 /* When copying from a general purpose register we may emit multiple
35062 stores followed by a single load, causing a memory size mismatch stall.
35063 Count this as an arbitrarily high cost of 20. */
35064 if (targetm.class_max_nregs (class1, mode)
35065 > targetm.class_max_nregs (class2, mode))
35066 cost += 20;
35067
35068 /* In the case of FP/MMX moves, the registers actually overlap, and we
35069 have to switch modes in order to treat them differently. */
35070 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
35071 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
35072 cost += 20;
35073
35074 return cost;
35075 }
35076
35077 /* Moves between SSE/MMX and integer unit are expensive. */
35078 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
35079 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
35080
35081 /* ??? By keeping the returned value relatively high, we limit the number
35082 of moves between integer and MMX/SSE registers for all targets.
35083 Additionally, a high value prevents a problem with x86_modes_tieable_p(),
35084 where integer modes in MMX/SSE registers are not tieable
35085 because of missing QImode and HImode moves to, from or between
35086 MMX/SSE registers. */
35087 return MAX (8, ix86_cost->mmxsse_to_integer);
35088
35089 if (MAYBE_FLOAT_CLASS_P (class1))
35090 return ix86_cost->fp_move;
35091 if (MAYBE_SSE_CLASS_P (class1))
35092 return ix86_cost->sse_move;
35093 if (MAYBE_MMX_CLASS_P (class1))
35094 return ix86_cost->mmx_move;
35095 return 2;
35096 }
35097
35098 /* Return TRUE if hard register REGNO can hold a value of machine-mode
35099 MODE. */
35100
35101 bool
35102 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
35103 {
35104 /* The flags register, and only the flags register, can hold CCmode values. */
35105 if (CC_REGNO_P (regno))
35106 return GET_MODE_CLASS (mode) == MODE_CC;
35107 if (GET_MODE_CLASS (mode) == MODE_CC
35108 || GET_MODE_CLASS (mode) == MODE_RANDOM
35109 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
35110 return false;
35111 if (STACK_REGNO_P (regno))
35112 return VALID_FP_MODE_P (mode);
35113 if (MASK_REGNO_P (regno))
35114 return VALID_MASK_REG_MODE (mode);
35115 if (BND_REGNO_P (regno))
35116 return VALID_BND_REG_MODE (mode);
35117 if (SSE_REGNO_P (regno))
35118 {
35119 /* We implement the move patterns for all vector modes into and
35120 out of SSE registers, even when no operation instructions
35121 are available. */
35122
35123 /* For AVX-512 we allow, regardless of regno:
35124 - XI mode
35125 - any 512-bit wide vector mode
35126 - any scalar mode. */
35127 if (TARGET_AVX512F
35128 && (mode == XImode
35129 || VALID_AVX512F_REG_MODE (mode)
35130 || VALID_AVX512F_SCALAR_MODE (mode)))
35131 return true;
35132
35133 /* xmm16-xmm31 are only available for AVX-512. */
35134 if (EXT_REX_SSE_REGNO_P (regno))
35135 return false;
35136
35137 /* OImode move is available only when AVX is enabled. */
35138 return ((TARGET_AVX && mode == OImode)
35139 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
35140 || VALID_SSE_REG_MODE (mode)
35141 || VALID_SSE2_REG_MODE (mode)
35142 || VALID_MMX_REG_MODE (mode)
35143 || VALID_MMX_REG_MODE_3DNOW (mode));
35144 }
35145 if (MMX_REGNO_P (regno))
35146 {
35147 /* We implement the move patterns for 3DNOW modes even in MMX mode,
35148 so if the register is available at all, then we can move data of
35149 the given mode into or out of it. */
35150 return (VALID_MMX_REG_MODE (mode)
35151 || VALID_MMX_REG_MODE_3DNOW (mode));
35152 }
35153
35154 if (mode == QImode)
35155 {
35156 /* Take care for QImode values - they can be in non-QI regs,
35157 but then they do cause partial register stalls. */
35158 if (ANY_QI_REGNO_P (regno))
35159 return true;
35160 if (!TARGET_PARTIAL_REG_STALL)
35161 return true;
35162 /* LRA checks if the hard register is OK for the given mode.
35163 QImode values can live in non-QI regs, so we allow all
35164 registers here. */
35165 if (lra_in_progress)
35166 return true;
35167 return !can_create_pseudo_p ();
35168 }
35169 /* We handle both integer and floats in the general purpose registers. */
35170 else if (VALID_INT_MODE_P (mode))
35171 return true;
35172 else if (VALID_FP_MODE_P (mode))
35173 return true;
35174 else if (VALID_DFP_MODE_P (mode))
35175 return true;
35176 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
35177 on to use that value in smaller contexts, this can easily force a
35178 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
35179 supporting DImode, allow it. */
35180 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
35181 return true;
35182
35183 return false;
35184 }
35185
35186 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
35187 tieable integer mode. */
35188
35189 static bool
35190 ix86_tieable_integer_mode_p (enum machine_mode mode)
35191 {
35192 switch (mode)
35193 {
35194 case HImode:
35195 case SImode:
35196 return true;
35197
35198 case QImode:
35199 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
35200
35201 case DImode:
35202 return TARGET_64BIT;
35203
35204 default:
35205 return false;
35206 }
35207 }
35208
35209 /* Return true if MODE1 is accessible in a register that can hold MODE2
35210 without copying. That is, all register classes that can hold MODE2
35211 can also hold MODE1. */
35212
35213 bool
35214 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
35215 {
35216 if (mode1 == mode2)
35217 return true;
35218
35219 if (ix86_tieable_integer_mode_p (mode1)
35220 && ix86_tieable_integer_mode_p (mode2))
35221 return true;
35222
35223 /* MODE2 being XFmode implies fp stack or general regs, which means we
35224 can tie any smaller floating point modes to it. Note that we do not
35225 tie this with TFmode. */
35226 if (mode2 == XFmode)
35227 return mode1 == SFmode || mode1 == DFmode;
35228
35229 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
35230 that we can tie it with SFmode. */
35231 if (mode2 == DFmode)
35232 return mode1 == SFmode;
35233
35234 /* If MODE2 is only appropriate for an SSE register, then tie with
35235 any other mode acceptable to SSE registers. */
35236 if (GET_MODE_SIZE (mode2) == 32
35237 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
35238 return (GET_MODE_SIZE (mode1) == 32
35239 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
35240 if (GET_MODE_SIZE (mode2) == 16
35241 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
35242 return (GET_MODE_SIZE (mode1) == 16
35243 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
35244
35245 /* If MODE2 is appropriate for an MMX register, then tie
35246 with any other mode acceptable to MMX registers. */
35247 if (GET_MODE_SIZE (mode2) == 8
35248 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
35249 return (GET_MODE_SIZE (mode1) == 8
35250 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
35251
35252 return false;
35253 }
35254
35255 /* Return the cost of moving between two registers of mode MODE. */
35256
35257 static int
35258 ix86_set_reg_reg_cost (enum machine_mode mode)
35259 {
35260 unsigned int units = UNITS_PER_WORD;
35261
35262 switch (GET_MODE_CLASS (mode))
35263 {
35264 default:
35265 break;
35266
35267 case MODE_CC:
35268 units = GET_MODE_SIZE (CCmode);
35269 break;
35270
35271 case MODE_FLOAT:
35272 if ((TARGET_SSE && mode == TFmode)
35273 || (TARGET_80387 && mode == XFmode)
35274 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
35275 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
35276 units = GET_MODE_SIZE (mode);
35277 break;
35278
35279 case MODE_COMPLEX_FLOAT:
35280 if ((TARGET_SSE && mode == TCmode)
35281 || (TARGET_80387 && mode == XCmode)
35282 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
35283 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
35284 units = GET_MODE_SIZE (mode);
35285 break;
35286
35287 case MODE_VECTOR_INT:
35288 case MODE_VECTOR_FLOAT:
35289 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
35290 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
35291 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
35292 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
35293 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
35294 units = GET_MODE_SIZE (mode);
35295 }
35296
35297 /* Return the cost of moving between two registers of mode MODE,
35298 assuming that the move will be in pieces of at most UNITS bytes. */
35299 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
35300 }
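/* For instance, a V8SFmode register-to-register copy costs COSTS_N_INSNS (1)
   when AVX is enabled (UNITS becomes the full 32-byte mode size), but
   COSTS_N_INSNS (4) on a 64-bit target without AVX, where the move is priced
   as four word-sized pieces.  */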
35301
35302 /* Compute a (partial) cost for rtx X. Return true if the complete
35303 cost has been computed, and false if subexpressions should be
35304 scanned. In either case, *TOTAL contains the cost result. */
35305
35306 static bool
35307 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
35308 bool speed)
35309 {
35310 enum rtx_code code = (enum rtx_code) code_i;
35311 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
35312 enum machine_mode mode = GET_MODE (x);
35313 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
35314
35315 switch (code)
35316 {
35317 case SET:
35318 if (register_operand (SET_DEST (x), VOIDmode)
35319 && reg_or_0_operand (SET_SRC (x), VOIDmode))
35320 {
35321 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
35322 return true;
35323 }
35324 return false;
35325
35326 case CONST_INT:
35327 case CONST:
35328 case LABEL_REF:
35329 case SYMBOL_REF:
35330 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
35331 *total = 3;
35332 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
35333 *total = 2;
35334 else if (flag_pic && SYMBOLIC_CONST (x)
35335 && (!TARGET_64BIT
35336 || (GET_CODE (x) != LABEL_REF
35337 && (GET_CODE (x) != SYMBOL_REF
35338 || !SYMBOL_REF_LOCAL_P (x)))))
35339 *total = 1;
35340 else
35341 *total = 0;
35342 return true;
35343
35344 case CONST_DOUBLE:
35345 if (mode == VOIDmode)
35346 {
35347 *total = 0;
35348 return true;
35349 }
35350 switch (standard_80387_constant_p (x))
35351 {
35352 case 1: /* 0.0 */
35353 *total = 1;
35354 return true;
35355 default: /* Other constants */
35356 *total = 2;
35357 return true;
35358 case 0:
35359 case -1:
35360 break;
35361 }
35362 if (SSE_FLOAT_MODE_P (mode))
35363 {
35364 case CONST_VECTOR:
35365 switch (standard_sse_constant_p (x))
35366 {
35367 case 0:
35368 break;
35369 case 1: /* 0: xor eliminates false dependency */
35370 *total = 0;
35371 return true;
35372 default: /* -1: cmp contains false dependency */
35373 *total = 1;
35374 return true;
35375 }
35376 }
35377 /* Fall back to (MEM (SYMBOL_REF)), since that's where
35378 it'll probably end up. Add a penalty for size. */
35379 *total = (COSTS_N_INSNS (1)
35380 + (flag_pic != 0 && !TARGET_64BIT)
35381 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
35382 return true;
35383
35384 case ZERO_EXTEND:
35385 /* The zero extension is often completely free on x86_64, so make
35386 it as cheap as possible. */
35387 if (TARGET_64BIT && mode == DImode
35388 && GET_MODE (XEXP (x, 0)) == SImode)
35389 *total = 1;
35390 else if (TARGET_ZERO_EXTEND_WITH_AND)
35391 *total = cost->add;
35392 else
35393 *total = cost->movzx;
35394 return false;
35395
35396 case SIGN_EXTEND:
35397 *total = cost->movsx;
35398 return false;
35399
35400 case ASHIFT:
35401 if (SCALAR_INT_MODE_P (mode)
35402 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
35403 && CONST_INT_P (XEXP (x, 1)))
35404 {
35405 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
35406 if (value == 1)
35407 {
35408 *total = cost->add;
35409 return false;
35410 }
35411 if ((value == 2 || value == 3)
35412 && cost->lea <= cost->shift_const)
35413 {
35414 *total = cost->lea;
35415 return false;
35416 }
35417 }
35418 /* FALLTHRU */
35419
35420 case ROTATE:
35421 case ASHIFTRT:
35422 case LSHIFTRT:
35423 case ROTATERT:
35424 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35425 {
35426 /* ??? Should be SSE vector operation cost. */
35427 /* At least for published AMD latencies, this really is the same
35428 as the latency for a simple fpu operation like fabs. */
35429 /* V*QImode is emulated with 1-11 insns. */
35430 if (mode == V16QImode || mode == V32QImode)
35431 {
35432 int count = 11;
35433 if (TARGET_XOP && mode == V16QImode)
35434 {
35435 /* For XOP we use vpshab, which requires a broadcast of the
35436 value to the variable shift insn. For constants this
35437 means a V16Q const in mem; even when we can perform the
35438 shift with one insn set the cost to prefer paddb. */
35439 if (CONSTANT_P (XEXP (x, 1)))
35440 {
35441 *total = (cost->fabs
35442 + rtx_cost (XEXP (x, 0), code, 0, speed)
35443 + (speed ? 2 : COSTS_N_BYTES (16)));
35444 return true;
35445 }
35446 count = 3;
35447 }
35448 else if (TARGET_SSSE3)
35449 count = 7;
35450 *total = cost->fabs * count;
35451 }
35452 else
35453 *total = cost->fabs;
35454 }
35455 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35456 {
35457 if (CONST_INT_P (XEXP (x, 1)))
35458 {
35459 if (INTVAL (XEXP (x, 1)) > 32)
35460 *total = cost->shift_const + COSTS_N_INSNS (2);
35461 else
35462 *total = cost->shift_const * 2;
35463 }
35464 else
35465 {
35466 if (GET_CODE (XEXP (x, 1)) == AND)
35467 *total = cost->shift_var * 2;
35468 else
35469 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
35470 }
35471 }
35472 else
35473 {
35474 if (CONST_INT_P (XEXP (x, 1)))
35475 *total = cost->shift_const;
35476 else if (GET_CODE (XEXP (x, 1)) == SUBREG
35477 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
35478 {
35479 /* Return the cost after shift-and truncation. */
35480 *total = cost->shift_var;
35481 return true;
35482 }
35483 else
35484 *total = cost->shift_var;
35485 }
35486 return false;
35487
35488 case FMA:
35489 {
35490 rtx sub;
35491
35492 gcc_assert (FLOAT_MODE_P (mode));
35493 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
35494
35495 /* ??? SSE scalar/vector cost should be used here. */
35496 /* ??? Bald assumption that fma has the same cost as fmul. */
35497 *total = cost->fmul;
35498 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
35499
35500 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
35501 sub = XEXP (x, 0);
35502 if (GET_CODE (sub) == NEG)
35503 sub = XEXP (sub, 0);
35504 *total += rtx_cost (sub, FMA, 0, speed);
35505
35506 sub = XEXP (x, 2);
35507 if (GET_CODE (sub) == NEG)
35508 sub = XEXP (sub, 0);
35509 *total += rtx_cost (sub, FMA, 2, speed);
35510 return true;
35511 }
35512
35513 case MULT:
35514 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35515 {
35516 /* ??? SSE scalar cost should be used here. */
35517 *total = cost->fmul;
35518 return false;
35519 }
35520 else if (X87_FLOAT_MODE_P (mode))
35521 {
35522 *total = cost->fmul;
35523 return false;
35524 }
35525 else if (FLOAT_MODE_P (mode))
35526 {
35527 /* ??? SSE vector cost should be used here. */
35528 *total = cost->fmul;
35529 return false;
35530 }
35531 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35532 {
35533 /* V*QImode is emulated with 7-13 insns. */
35534 if (mode == V16QImode || mode == V32QImode)
35535 {
35536 int extra = 11;
35537 if (TARGET_XOP && mode == V16QImode)
35538 extra = 5;
35539 else if (TARGET_SSSE3)
35540 extra = 6;
35541 *total = cost->fmul * 2 + cost->fabs * extra;
35542 }
35543 /* V*DImode is emulated with 5-8 insns. */
35544 else if (mode == V2DImode || mode == V4DImode)
35545 {
35546 if (TARGET_XOP && mode == V2DImode)
35547 *total = cost->fmul * 2 + cost->fabs * 3;
35548 else
35549 *total = cost->fmul * 3 + cost->fabs * 5;
35550 }
35551 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
35552 insns, including two PMULUDQ. */
35553 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
35554 *total = cost->fmul * 2 + cost->fabs * 5;
35555 else
35556 *total = cost->fmul;
35557 return false;
35558 }
35559 else
35560 {
35561 rtx op0 = XEXP (x, 0);
35562 rtx op1 = XEXP (x, 1);
35563 int nbits;
35564 if (CONST_INT_P (XEXP (x, 1)))
35565 {
35566 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
35567 for (nbits = 0; value != 0; value &= value - 1)
35568 nbits++;
35569 }
35570 else
35571 /* This is arbitrary. */
35572 nbits = 7;
35573
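/* For example, a multiplication by 20 (binary 10100) has two bits set, so
   nbits == 2 and the cost below is mult_init[MODE_INDEX (mode)]
   + 2 * mult_bit plus the costs of the two operands.  */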
35574 /* Compute costs correctly for widening multiplication. */
35575 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
35576 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
35577 == GET_MODE_SIZE (mode))
35578 {
35579 int is_mulwiden = 0;
35580 enum machine_mode inner_mode = GET_MODE (op0);
35581
35582 if (GET_CODE (op0) == GET_CODE (op1))
35583 is_mulwiden = 1, op1 = XEXP (op1, 0);
35584 else if (CONST_INT_P (op1))
35585 {
35586 if (GET_CODE (op0) == SIGN_EXTEND)
35587 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
35588 == INTVAL (op1);
35589 else
35590 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
35591 }
35592
35593 if (is_mulwiden)
35594 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
35595 }
35596
35597 *total = (cost->mult_init[MODE_INDEX (mode)]
35598 + nbits * cost->mult_bit
35599 + rtx_cost (op0, outer_code, opno, speed)
35600 + rtx_cost (op1, outer_code, opno, speed));
35601
35602 return true;
35603 }
35604
35605 case DIV:
35606 case UDIV:
35607 case MOD:
35608 case UMOD:
35609 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35610 /* ??? SSE cost should be used here. */
35611 *total = cost->fdiv;
35612 else if (X87_FLOAT_MODE_P (mode))
35613 *total = cost->fdiv;
35614 else if (FLOAT_MODE_P (mode))
35615 /* ??? SSE vector cost should be used here. */
35616 *total = cost->fdiv;
35617 else
35618 *total = cost->divide[MODE_INDEX (mode)];
35619 return false;
35620
35621 case PLUS:
35622 if (GET_MODE_CLASS (mode) == MODE_INT
35623 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
35624 {
35625 if (GET_CODE (XEXP (x, 0)) == PLUS
35626 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
35627 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
35628 && CONSTANT_P (XEXP (x, 1)))
35629 {
35630 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
35631 if (val == 2 || val == 4 || val == 8)
35632 {
35633 *total = cost->lea;
35634 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35635 outer_code, opno, speed);
35636 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
35637 outer_code, opno, speed);
35638 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35639 return true;
35640 }
35641 }
35642 else if (GET_CODE (XEXP (x, 0)) == MULT
35643 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
35644 {
35645 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
35646 if (val == 2 || val == 4 || val == 8)
35647 {
35648 *total = cost->lea;
35649 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35650 outer_code, opno, speed);
35651 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35652 return true;
35653 }
35654 }
35655 else if (GET_CODE (XEXP (x, 0)) == PLUS)
35656 {
35657 *total = cost->lea;
35658 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35659 outer_code, opno, speed);
35660 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35661 outer_code, opno, speed);
35662 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35663 return true;
35664 }
35665 }
35666 /* FALLTHRU */
35667
35668 case MINUS:
35669 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35670 {
35671 /* ??? SSE cost should be used here. */
35672 *total = cost->fadd;
35673 return false;
35674 }
35675 else if (X87_FLOAT_MODE_P (mode))
35676 {
35677 *total = cost->fadd;
35678 return false;
35679 }
35680 else if (FLOAT_MODE_P (mode))
35681 {
35682 /* ??? SSE vector cost should be used here. */
35683 *total = cost->fadd;
35684 return false;
35685 }
35686 /* FALLTHRU */
35687
35688 case AND:
35689 case IOR:
35690 case XOR:
35691 if (GET_MODE_CLASS (mode) == MODE_INT
35692 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35693 {
35694 *total = (cost->add * 2
35695 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
35696 << (GET_MODE (XEXP (x, 0)) != DImode))
35697 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
35698 << (GET_MODE (XEXP (x, 1)) != DImode)));
35699 return true;
35700 }
35701 /* FALLTHRU */
35702
35703 case NEG:
35704 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35705 {
35706 /* ??? SSE cost should be used here. */
35707 *total = cost->fchs;
35708 return false;
35709 }
35710 else if (X87_FLOAT_MODE_P (mode))
35711 {
35712 *total = cost->fchs;
35713 return false;
35714 }
35715 else if (FLOAT_MODE_P (mode))
35716 {
35717 /* ??? SSE vector cost should be used here. */
35718 *total = cost->fchs;
35719 return false;
35720 }
35721 /* FALLTHRU */
35722
35723 case NOT:
35724 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35725 {
35726 /* ??? Should be SSE vector operation cost. */
35727 /* At least for published AMD latencies, this really is the same
35728 as the latency for a simple fpu operation like fabs. */
35729 *total = cost->fabs;
35730 }
35731 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35732 *total = cost->add * 2;
35733 else
35734 *total = cost->add;
35735 return false;
35736
35737 case COMPARE:
35738 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
35739 && XEXP (XEXP (x, 0), 1) == const1_rtx
35740 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
35741 && XEXP (x, 1) == const0_rtx)
35742 {
35743 /* This kind of construct is implemented using test[bwl].
35744 Treat it as if we had an AND. */
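/* E.g. (compare (zero_extract (reg) (const_int 1) (const_int 3))
   (const_int 0)) corresponds to testb $8, reg: a single-bit test,
   costed like an AND of the register with a constant.  */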
35745 *total = (cost->add
35746 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
35747 + rtx_cost (const1_rtx, outer_code, opno, speed));
35748 return true;
35749 }
35750 return false;
35751
35752 case FLOAT_EXTEND:
35753 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
35754 *total = 0;
35755 return false;
35756
35757 case ABS:
35758 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35759 /* ??? SSE cost should be used here. */
35760 *total = cost->fabs;
35761 else if (X87_FLOAT_MODE_P (mode))
35762 *total = cost->fabs;
35763 else if (FLOAT_MODE_P (mode))
35764 /* ??? SSE vector cost should be used here. */
35765 *total = cost->fabs;
35766 return false;
35767
35768 case SQRT:
35769 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35770 /* ??? SSE cost should be used here. */
35771 *total = cost->fsqrt;
35772 else if (X87_FLOAT_MODE_P (mode))
35773 *total = cost->fsqrt;
35774 else if (FLOAT_MODE_P (mode))
35775 /* ??? SSE vector cost should be used here. */
35776 *total = cost->fsqrt;
35777 return false;
35778
35779 case UNSPEC:
35780 if (XINT (x, 1) == UNSPEC_TP)
35781 *total = 0;
35782 return false;
35783
35784 case VEC_SELECT:
35785 case VEC_CONCAT:
35786 case VEC_MERGE:
35787 case VEC_DUPLICATE:
35788 /* ??? Assume all of these vector manipulation patterns are
35789 recognizable. In which case they all pretty much have the
35790 same cost. */
35791 *total = cost->fabs;
35792 return true;
35793
35794 default:
35795 return false;
35796 }
35797 }
35798
35799 #if TARGET_MACHO
35800
35801 static int current_machopic_label_num;
35802
35803 /* Given a symbol name and its associated stub, write out the
35804 definition of the stub. */
35805
35806 void
35807 machopic_output_stub (FILE *file, const char *symb, const char *stub)
35808 {
35809 unsigned int length;
35810 char *binder_name, *symbol_name, lazy_ptr_name[32];
35811 int label = ++current_machopic_label_num;
35812
35813 /* For 64-bit we shouldn't get here. */
35814 gcc_assert (!TARGET_64BIT);
35815
35816 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
35817 symb = targetm.strip_name_encoding (symb);
35818
35819 length = strlen (stub);
35820 binder_name = XALLOCAVEC (char, length + 32);
35821 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
35822
35823 length = strlen (symb);
35824 symbol_name = XALLOCAVEC (char, length + 32);
35825 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
35826
35827 sprintf (lazy_ptr_name, "L%d$lz", label);
35828
35829 if (MACHOPIC_ATT_STUB)
35830 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
35831 else if (MACHOPIC_PURE)
35832 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
35833 else
35834 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
35835
35836 fprintf (file, "%s:\n", stub);
35837 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35838
35839 if (MACHOPIC_ATT_STUB)
35840 {
35841 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
35842 }
35843 else if (MACHOPIC_PURE)
35844 {
35845 /* PIC stub. */
35846 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35847 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
35848 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
35849 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
35850 label, lazy_ptr_name, label);
35851 fprintf (file, "\tjmp\t*%%ecx\n");
35852 }
35853 else
35854 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
35855
35856 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
35857 it needs no stub-binding-helper. */
35858 if (MACHOPIC_ATT_STUB)
35859 return;
35860
35861 fprintf (file, "%s:\n", binder_name);
35862
35863 if (MACHOPIC_PURE)
35864 {
35865 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
35866 fprintf (file, "\tpushl\t%%ecx\n");
35867 }
35868 else
35869 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
35870
35871 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
35872
35873 /* N.B. Keep the correspondence of these
35874 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
35875 old-pic/new-pic/non-pic stubs; altering this will break
35876 compatibility with existing dylibs. */
35877 if (MACHOPIC_PURE)
35878 {
35879 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35880 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
35881 }
35882 else
35883 /* 16-byte -mdynamic-no-pic stub. */
35884 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
35885
35886 fprintf (file, "%s:\n", lazy_ptr_name);
35887 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35888 fprintf (file, ASM_LONG "%s\n", binder_name);
35889 }
35890 #endif /* TARGET_MACHO */
35891
35892 /* Order the registers for register allocator. */
35893
35894 void
35895 x86_order_regs_for_local_alloc (void)
35896 {
35897 int pos = 0;
35898 int i;
35899
35900 /* First allocate the local general purpose registers. */
35901 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35902 if (GENERAL_REGNO_P (i) && call_used_regs[i])
35903 reg_alloc_order [pos++] = i;
35904
35905 /* Global general purpose registers. */
35906 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35907 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
35908 reg_alloc_order [pos++] = i;
35909
35910 /* x87 registers come first in case we are doing FP math
35911 using them. */
35912 if (!TARGET_SSE_MATH)
35913 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35914 reg_alloc_order [pos++] = i;
35915
35916 /* SSE registers. */
35917 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
35918 reg_alloc_order [pos++] = i;
35919 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
35920 reg_alloc_order [pos++] = i;
35921
35922 /* Extended REX SSE registers. */
35923 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
35924 reg_alloc_order [pos++] = i;
35925
35926 /* Mask registers. */
35927 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
35928 reg_alloc_order [pos++] = i;
35929
35930 /* MPX bound registers. */
35931 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
35932 reg_alloc_order [pos++] = i;
35933
35934 /* x87 registers. */
35935 if (TARGET_SSE_MATH)
35936 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35937 reg_alloc_order [pos++] = i;
35938
35939 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
35940 reg_alloc_order [pos++] = i;
35941
35942 /* Initialize the rest of the array, as we do not allocate some registers
35943 at all. */
35944 while (pos < FIRST_PSEUDO_REGISTER)
35945 reg_alloc_order [pos++] = 0;
35946 }
35947
35948 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
35949 in struct attribute_spec.handler. */
35950 static tree
35951 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
35952 tree args,
35953 int flags ATTRIBUTE_UNUSED,
35954 bool *no_add_attrs)
35955 {
35956 if (TREE_CODE (*node) != FUNCTION_TYPE
35957 && TREE_CODE (*node) != METHOD_TYPE
35958 && TREE_CODE (*node) != FIELD_DECL
35959 && TREE_CODE (*node) != TYPE_DECL)
35960 {
35961 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35962 name);
35963 *no_add_attrs = true;
35964 return NULL_TREE;
35965 }
35966 if (TARGET_64BIT)
35967 {
35968 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
35969 name);
35970 *no_add_attrs = true;
35971 return NULL_TREE;
35972 }
35973 if (is_attribute_p ("callee_pop_aggregate_return", name))
35974 {
35975 tree cst;
35976
35977 cst = TREE_VALUE (args);
35978 if (TREE_CODE (cst) != INTEGER_CST)
35979 {
35980 warning (OPT_Wattributes,
35981 "%qE attribute requires an integer constant argument",
35982 name);
35983 *no_add_attrs = true;
35984 }
35985 else if (compare_tree_int (cst, 0) != 0
35986 && compare_tree_int (cst, 1) != 0)
35987 {
35988 warning (OPT_Wattributes,
35989 "argument to %qE attribute is neither zero, nor one",
35990 name);
35991 *no_add_attrs = true;
35992 }
35993
35994 return NULL_TREE;
35995 }
35996
35997 return NULL_TREE;
35998 }
35999
36000 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
36001 struct attribute_spec.handler. */
36002 static tree
36003 ix86_handle_abi_attribute (tree *node, tree name,
36004 tree args ATTRIBUTE_UNUSED,
36005 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
36006 {
36007 if (TREE_CODE (*node) != FUNCTION_TYPE
36008 && TREE_CODE (*node) != METHOD_TYPE
36009 && TREE_CODE (*node) != FIELD_DECL
36010 && TREE_CODE (*node) != TYPE_DECL)
36011 {
36012 warning (OPT_Wattributes, "%qE attribute only applies to functions",
36013 name);
36014 *no_add_attrs = true;
36015 return NULL_TREE;
36016 }
36017
36018 /* Can combine regparm with all attributes but fastcall. */
36019 if (is_attribute_p ("ms_abi", name))
36020 {
36021 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
36022 {
36023 error ("ms_abi and sysv_abi attributes are not compatible");
36024 }
36025
36026 return NULL_TREE;
36027 }
36028 else if (is_attribute_p ("sysv_abi", name))
36029 {
36030 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
36031 {
36032 error ("ms_abi and sysv_abi attributes are not compatible");
36033 }
36034
36035 return NULL_TREE;
36036 }
36037
36038 return NULL_TREE;
36039 }
36040
36041 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
36042 struct attribute_spec.handler. */
36043 static tree
36044 ix86_handle_struct_attribute (tree *node, tree name,
36045 tree args ATTRIBUTE_UNUSED,
36046 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
36047 {
36048 tree *type = NULL;
36049 if (DECL_P (*node))
36050 {
36051 if (TREE_CODE (*node) == TYPE_DECL)
36052 type = &TREE_TYPE (*node);
36053 }
36054 else
36055 type = node;
36056
36057 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
36058 {
36059 warning (OPT_Wattributes, "%qE attribute ignored",
36060 name);
36061 *no_add_attrs = true;
36062 }
36063
36064 else if ((is_attribute_p ("ms_struct", name)
36065 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
36066 || ((is_attribute_p ("gcc_struct", name)
36067 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
36068 {
36069 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
36070 name);
36071 *no_add_attrs = true;
36072 }
36073
36074 return NULL_TREE;
36075 }
36076
36077 static tree
36078 ix86_handle_fndecl_attribute (tree *node, tree name,
36079 tree args ATTRIBUTE_UNUSED,
36080 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
36081 {
36082 if (TREE_CODE (*node) != FUNCTION_DECL)
36083 {
36084 warning (OPT_Wattributes, "%qE attribute only applies to functions",
36085 name);
36086 *no_add_attrs = true;
36087 }
36088 return NULL_TREE;
36089 }
36090
36091 static bool
36092 ix86_ms_bitfield_layout_p (const_tree record_type)
36093 {
36094 return ((TARGET_MS_BITFIELD_LAYOUT
36095 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
36096 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
36097 }
36098
36099 /* Returns an expression indicating where the this parameter is
36100 located on entry to the FUNCTION. */
36101
36102 static rtx
36103 x86_this_parameter (tree function)
36104 {
36105 tree type = TREE_TYPE (function);
36106 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
36107 int nregs;
36108
36109 if (TARGET_64BIT)
36110 {
36111 const int *parm_regs;
36112
36113 if (ix86_function_type_abi (type) == MS_ABI)
36114 parm_regs = x86_64_ms_abi_int_parameter_registers;
36115 else
36116 parm_regs = x86_64_int_parameter_registers;
36117 return gen_rtx_REG (Pmode, parm_regs[aggr]);
36118 }
36119
36120 nregs = ix86_function_regparm (type, function);
36121
36122 if (nregs > 0 && !stdarg_p (type))
36123 {
36124 int regno;
36125 unsigned int ccvt = ix86_get_callcvt (type);
36126
36127 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
36128 regno = aggr ? DX_REG : CX_REG;
36129 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
36130 {
36131 regno = CX_REG;
36132 if (aggr)
36133 return gen_rtx_MEM (SImode,
36134 plus_constant (Pmode, stack_pointer_rtx, 4));
36135 }
36136 else
36137 {
36138 regno = AX_REG;
36139 if (aggr)
36140 {
36141 regno = DX_REG;
36142 if (nregs == 1)
36143 return gen_rtx_MEM (SImode,
36144 plus_constant (Pmode,
36145 stack_pointer_rtx, 4));
36146 }
36147 }
36148 return gen_rtx_REG (SImode, regno);
36149 }
36150
36151 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
36152 aggr ? 8 : 4));
36153 }
36154
36155 /* Determine whether x86_output_mi_thunk can succeed. */
36156
36157 static bool
36158 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
36159 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
36160 HOST_WIDE_INT vcall_offset, const_tree function)
36161 {
36162 /* 64-bit can handle anything. */
36163 if (TARGET_64BIT)
36164 return true;
36165
36166 /* For 32-bit, everything's fine if we have one free register. */
36167 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
36168 return true;
36169
36170 /* Need a free register for vcall_offset. */
36171 if (vcall_offset)
36172 return false;
36173
36174 /* Need a free register for GOT references. */
36175 if (flag_pic && !targetm.binds_local_p (function))
36176 return false;
36177
36178 /* Otherwise ok. */
36179 return true;
36180 }
36181
36182 /* Output the assembler code for a thunk function. THUNK_DECL is the
36183 declaration for the thunk function itself, FUNCTION is the decl for
36184 the target function. DELTA is an immediate constant offset to be
36185 added to THIS. If VCALL_OFFSET is nonzero, the word at
36186 *(*this + vcall_offset) should be added to THIS. */
36187
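/* As a rough illustration: on a 32-bit target with no register parameters,
   DELTA == 4 and VCALL_OFFSET == 0, THIS lives at 4(%esp) and the emitted
   thunk is essentially
       addl $4, 4(%esp)
       jmp  function
   with the sibling call handled by the jump/call logic below.  */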
36188 static void
36189 x86_output_mi_thunk (FILE *file,
36190 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
36191 HOST_WIDE_INT vcall_offset, tree function)
36192 {
36193 rtx this_param = x86_this_parameter (function);
36194 rtx this_reg, tmp, fnaddr;
36195 unsigned int tmp_regno;
36196
36197 if (TARGET_64BIT)
36198 tmp_regno = R10_REG;
36199 else
36200 {
36201 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
36202 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
36203 tmp_regno = AX_REG;
36204 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
36205 tmp_regno = DX_REG;
36206 else
36207 tmp_regno = CX_REG;
36208 }
36209
36210 emit_note (NOTE_INSN_PROLOGUE_END);
36211
36212 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
36213 pull it in now and let DELTA benefit. */
36214 if (REG_P (this_param))
36215 this_reg = this_param;
36216 else if (vcall_offset)
36217 {
36218 /* Put the this parameter into %eax. */
36219 this_reg = gen_rtx_REG (Pmode, AX_REG);
36220 emit_move_insn (this_reg, this_param);
36221 }
36222 else
36223 this_reg = NULL_RTX;
36224
36225 /* Adjust the this parameter by a fixed constant. */
36226 if (delta)
36227 {
36228 rtx delta_rtx = GEN_INT (delta);
36229 rtx delta_dst = this_reg ? this_reg : this_param;
36230
36231 if (TARGET_64BIT)
36232 {
36233 if (!x86_64_general_operand (delta_rtx, Pmode))
36234 {
36235 tmp = gen_rtx_REG (Pmode, tmp_regno);
36236 emit_move_insn (tmp, delta_rtx);
36237 delta_rtx = tmp;
36238 }
36239 }
36240
36241 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
36242 }
36243
36244 /* Adjust the this parameter by a value stored in the vtable. */
36245 if (vcall_offset)
36246 {
36247 rtx vcall_addr, vcall_mem, this_mem;
36248
36249 tmp = gen_rtx_REG (Pmode, tmp_regno);
36250
36251 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
36252 if (Pmode != ptr_mode)
36253 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
36254 emit_move_insn (tmp, this_mem);
36255
36256 /* Adjust the this parameter. */
36257 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
36258 if (TARGET_64BIT
36259 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
36260 {
36261 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
36262 emit_move_insn (tmp2, GEN_INT (vcall_offset));
36263 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
36264 }
36265
36266 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
36267 if (Pmode != ptr_mode)
36268 emit_insn (gen_addsi_1_zext (this_reg,
36269 gen_rtx_REG (ptr_mode,
36270 REGNO (this_reg)),
36271 vcall_mem));
36272 else
36273 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
36274 }
36275
36276 /* If necessary, drop THIS back to its stack slot. */
36277 if (this_reg && this_reg != this_param)
36278 emit_move_insn (this_param, this_reg);
36279
36280 fnaddr = XEXP (DECL_RTL (function), 0);
36281 if (TARGET_64BIT)
36282 {
36283 if (!flag_pic || targetm.binds_local_p (function)
36284 || TARGET_PECOFF)
36285 ;
36286 else
36287 {
36288 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
36289 tmp = gen_rtx_CONST (Pmode, tmp);
36290 fnaddr = gen_rtx_MEM (Pmode, tmp);
36291 }
36292 }
36293 else
36294 {
36295 if (!flag_pic || targetm.binds_local_p (function))
36296 ;
36297 #if TARGET_MACHO
36298 else if (TARGET_MACHO)
36299 {
36300 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
36301 fnaddr = XEXP (fnaddr, 0);
36302 }
36303 #endif /* TARGET_MACHO */
36304 else
36305 {
36306 tmp = gen_rtx_REG (Pmode, CX_REG);
36307 output_set_got (tmp, NULL_RTX);
36308
36309 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
36310 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
36311 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
36312 }
36313 }
36314
36315 /* Our sibling call patterns do not allow memories, because we have no
36316 predicate that can distinguish between frame and non-frame memory.
36317 For our purposes here, we can get away with (ab)using a jump pattern,
36318 because we're going to do no optimization. */
36319 if (MEM_P (fnaddr))
36320 emit_jump_insn (gen_indirect_jump (fnaddr));
36321 else
36322 {
36323 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
36324 fnaddr = legitimize_pic_address (fnaddr,
36325 gen_rtx_REG (Pmode, tmp_regno));
36326
36327 if (!sibcall_insn_operand (fnaddr, word_mode))
36328 {
36329 tmp = gen_rtx_REG (word_mode, tmp_regno);
36330 if (GET_MODE (fnaddr) != word_mode)
36331 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
36332 emit_move_insn (tmp, fnaddr);
36333 fnaddr = tmp;
36334 }
36335
36336 tmp = gen_rtx_MEM (QImode, fnaddr);
36337 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
36338 tmp = emit_call_insn (tmp);
36339 SIBLING_CALL_P (tmp) = 1;
36340 }
36341 emit_barrier ();
36342
36343 /* Emit just enough of rest_of_compilation to get the insns emitted.
36344 Note that use_thunk calls assemble_start_function et al. */
36345 tmp = get_insns ();
36346 shorten_branches (tmp);
36347 final_start_function (tmp, file, 1);
36348 final (tmp, file, 1);
36349 final_end_function ();
36350 }
36351
36352 static void
36353 x86_file_start (void)
36354 {
36355 default_file_start ();
36356 #if TARGET_MACHO
36357 darwin_file_start ();
36358 #endif
36359 if (X86_FILE_START_VERSION_DIRECTIVE)
36360 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
36361 if (X86_FILE_START_FLTUSED)
36362 fputs ("\t.global\t__fltused\n", asm_out_file);
36363 if (ix86_asm_dialect == ASM_INTEL)
36364 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
36365 }
36366
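/* Descriptive note on the helper below: compute the alignment of FIELD given
   the otherwise COMPUTED alignment.  On 32-bit targets without -malign-double,
   DFmode/DCmode and integer-mode fields are capped at 32-bit alignment inside
   structures, which only matters in practice for 8-byte types such as double
   and long long; everything else keeps COMPUTED.  */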
36367 int
36368 x86_field_alignment (tree field, int computed)
36369 {
36370 enum machine_mode mode;
36371 tree type = TREE_TYPE (field);
36372
36373 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
36374 return computed;
36375 mode = TYPE_MODE (strip_array_types (type));
36376 if (mode == DFmode || mode == DCmode
36377 || GET_MODE_CLASS (mode) == MODE_INT
36378 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
36379 return MIN (32, computed);
36380 return computed;
36381 }
36382
36383 /* Output assembler code to FILE to increment profiler label # LABELNO
36384 for profiling a function entry. */
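/* For illustration (derived from the cases below, with the usual LPREFIX and
   mcount name; both vary per target): with -fPIC on a 64-bit target this
   emits roughly

	leaq	.LP0(%rip), %r11
	call	*mcount@GOTPCREL(%rip)

   whereas the non-PIC case is a direct `call mcount'.  */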
36385 void
36386 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
36387 {
36388 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
36389 : MCOUNT_NAME);
36390
36391 if (TARGET_64BIT)
36392 {
36393 #ifndef NO_PROFILE_COUNTERS
36394 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
36395 #endif
36396
36397 if (!TARGET_PECOFF && flag_pic)
36398 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
36399 else
36400 fprintf (file, "\tcall\t%s\n", mcount_name);
36401 }
36402 else if (flag_pic)
36403 {
36404 #ifndef NO_PROFILE_COUNTERS
36405 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
36406 LPREFIX, labelno);
36407 #endif
36408 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
36409 }
36410 else
36411 {
36412 #ifndef NO_PROFILE_COUNTERS
36413 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
36414 LPREFIX, labelno);
36415 #endif
36416 fprintf (file, "\tcall\t%s\n", mcount_name);
36417 }
36418 }
36419
36420 /* We don't have exact information about the insn sizes, but we may assume
36421 quite safely that we are informed about all 1 byte insns and memory
36422 address sizes. This is enough to eliminate unnecessary padding in
36423 99% of cases. */
36424
36425 static int
36426 min_insn_size (rtx insn)
36427 {
36428 int l = 0, len;
36429
36430 if (!INSN_P (insn) || !active_insn_p (insn))
36431 return 0;
36432
36433 /* Discard alignments we've emitted, and jump instructions. */
36434 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
36435 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
36436 return 0;
36437
36438 /* Important case - calls are always 5 bytes.
36439 It is common to have many calls in a row. */
36440 if (CALL_P (insn)
36441 && symbolic_reference_mentioned_p (PATTERN (insn))
36442 && !SIBLING_CALL_P (insn))
36443 return 5;
36444 len = get_attr_length (insn);
36445 if (len <= 1)
36446 return 1;
36447
36448 /* For normal instructions we rely on get_attr_length being exact,
36449 with a few exceptions. */
36450 if (!JUMP_P (insn))
36451 {
36452 enum attr_type type = get_attr_type (insn);
36453
36454 switch (type)
36455 {
36456 case TYPE_MULTI:
36457 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
36458 || asm_noperands (PATTERN (insn)) >= 0)
36459 return 0;
36460 break;
36461 case TYPE_OTHER:
36462 case TYPE_FCMP:
36463 break;
36464 default:
36465 /* Otherwise trust get_attr_length. */
36466 return len;
36467 }
36468
36469 l = get_attr_length_address (insn);
36470 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
36471 l = 4;
36472 }
36473 if (l)
36474 return 1+l;
36475 else
36476 return 2;
36477 }
36478
36479 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36480
36481 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
36482 16-byte window. */
36483
36484 static void
36485 ix86_avoid_jump_mispredicts (void)
36486 {
36487 rtx insn, start = get_insns ();
36488 int nbytes = 0, njumps = 0;
36489 int isjump = 0;
36490
36491 /* Look for all minimal intervals of instructions containing 4 jumps.
36492 The intervals are bounded by START and INSN. NBYTES is the total
36493 size of instructions in the interval including INSN and not including
36494 START. When NBYTES is smaller than 16, it is possible that the end
36495 of START and INSN end up in the same 16-byte page.
36496
36497 The smallest offset in the page at which INSN can start corresponds to
36498 START ending at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
36499 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
36500 */
36501 for (insn = start; insn; insn = NEXT_INSN (insn))
36502 {
36503 int min_size;
36504
36505 if (LABEL_P (insn))
36506 {
36507 int align = label_to_alignment (insn);
36508 int max_skip = label_to_max_skip (insn);
36509
36510 if (max_skip > 15)
36511 max_skip = 15;
36512 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
36513 already in the current 16-byte page, because otherwise
36514 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
36515 bytes to reach the 16-byte boundary. */
36516 if (align <= 0
36517 || (align <= 3 && max_skip != (1 << align) - 1))
36518 max_skip = 0;
36519 if (dump_file)
36520 fprintf (dump_file, "Label %i with max_skip %i\n",
36521 INSN_UID (insn), max_skip);
36522 if (max_skip)
36523 {
36524 while (nbytes + max_skip >= 16)
36525 {
36526 start = NEXT_INSN (start);
36527 if (JUMP_P (start) || CALL_P (start))
36528 njumps--, isjump = 1;
36529 else
36530 isjump = 0;
36531 nbytes -= min_insn_size (start);
36532 }
36533 }
36534 continue;
36535 }
36536
36537 min_size = min_insn_size (insn);
36538 nbytes += min_size;
36539 if (dump_file)
36540 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
36541 INSN_UID (insn), min_size);
36542 if (JUMP_P (insn) || CALL_P (insn))
36543 njumps++;
36544 else
36545 continue;
36546
36547 while (njumps > 3)
36548 {
36549 start = NEXT_INSN (start);
36550 if (JUMP_P (start) || CALL_P (start))
36551 njumps--, isjump = 1;
36552 else
36553 isjump = 0;
36554 nbytes -= min_insn_size (start);
36555 }
36556 gcc_assert (njumps >= 0);
36557 if (dump_file)
36558 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
36559 INSN_UID (start), INSN_UID (insn), nbytes);
36560
36561 if (njumps == 3 && isjump && nbytes < 16)
36562 {
36563 int padsize = 15 - nbytes + min_insn_size (insn);
36564
36565 if (dump_file)
36566 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
36567 INSN_UID (insn), padsize);
36568 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
36569 }
36570 }
36571 }
36572 #endif
36573
36574 /* The AMD Athlon works faster
36575 when RET is not the destination of a conditional jump or directly preceded
36576 by another jump instruction. We avoid the penalty by inserting a NOP just
36577 before the RET instruction in such cases. */
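/* A note on the replacement used below (an observation about the pattern, not
   original commentary): gen_simple_return_internal_long emits the "long"
   return form, which typically assembles as `rep ret', the two-byte return
   these cores predict better.  */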
36578 static void
36579 ix86_pad_returns (void)
36580 {
36581 edge e;
36582 edge_iterator ei;
36583
36584 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
36585 {
36586 basic_block bb = e->src;
36587 rtx ret = BB_END (bb);
36588 rtx prev;
36589 bool replace = false;
36590
36591 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
36592 || optimize_bb_for_size_p (bb))
36593 continue;
36594 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
36595 if (active_insn_p (prev) || LABEL_P (prev))
36596 break;
36597 if (prev && LABEL_P (prev))
36598 {
36599 edge e;
36600 edge_iterator ei;
36601
36602 FOR_EACH_EDGE (e, ei, bb->preds)
36603 if (EDGE_FREQUENCY (e) && e->src->index >= 0
36604 && !(e->flags & EDGE_FALLTHRU))
36605 {
36606 replace = true;
36607 break;
36608 }
36609 }
36610 if (!replace)
36611 {
36612 prev = prev_active_insn (ret);
36613 if (prev
36614 && ((JUMP_P (prev) && any_condjump_p (prev))
36615 || CALL_P (prev)))
36616 replace = true;
36617 /* Empty functions get a branch mispredict even when
36618 the jump destination is not visible to us. */
36619 if (!prev && !optimize_function_for_size_p (cfun))
36620 replace = true;
36621 }
36622 if (replace)
36623 {
36624 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
36625 delete_insn (ret);
36626 }
36627 }
36628 }
36629
36630 /* Count the minimum number of instructions in BB. Return 4 if the
36631 number of instructions >= 4. */
36632
36633 static int
36634 ix86_count_insn_bb (basic_block bb)
36635 {
36636 rtx insn;
36637 int insn_count = 0;
36638
36639 /* Count number of instructions in this block. Return 4 if the number
36640 of instructions >= 4. */
36641 FOR_BB_INSNS (bb, insn)
36642 {
36643 /* This only happens in exit blocks. */
36644 if (JUMP_P (insn)
36645 && ANY_RETURN_P (PATTERN (insn)))
36646 break;
36647
36648 if (NONDEBUG_INSN_P (insn)
36649 && GET_CODE (PATTERN (insn)) != USE
36650 && GET_CODE (PATTERN (insn)) != CLOBBER)
36651 {
36652 insn_count++;
36653 if (insn_count >= 4)
36654 return insn_count;
36655 }
36656 }
36657
36658 return insn_count;
36659 }
36660
36661
36662 /* Count the minimum number of instructions in code path in BB.
36663 Return 4 if the number of instructions >= 4. */
36664
36665 static int
36666 ix86_count_insn (basic_block bb)
36667 {
36668 edge e;
36669 edge_iterator ei;
36670 int min_prev_count;
36671
36672 /* Only bother counting instructions along paths with no
36673 more than 2 basic blocks between entry and exit. Given
36674 that BB has an edge to exit, determine if a predecessor
36675 of BB has an edge from entry. If so, compute the number
36676 of instructions in the predecessor block. If there
36677 happen to be multiple such blocks, compute the minimum. */
36678 min_prev_count = 4;
36679 FOR_EACH_EDGE (e, ei, bb->preds)
36680 {
36681 edge prev_e;
36682 edge_iterator prev_ei;
36683
36684 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
36685 {
36686 min_prev_count = 0;
36687 break;
36688 }
36689 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
36690 {
36691 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
36692 {
36693 int count = ix86_count_insn_bb (e->src);
36694 if (count < min_prev_count)
36695 min_prev_count = count;
36696 break;
36697 }
36698 }
36699 }
36700
36701 if (min_prev_count < 4)
36702 min_prev_count += ix86_count_insn_bb (bb);
36703
36704 return min_prev_count;
36705 }
36706
36707 /* Pad short function to 4 instructions. */
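/* Worked example (illustrative): a function whose body is just a return has
   an instruction count of 0 on the path from entry, so the code below emits
   2 * (4 - 0) = 8 NOPs before the epilogue; two NOPs count as one
   instruction here, hence the factor of two.  */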
36708
36709 static void
36710 ix86_pad_short_function (void)
36711 {
36712 edge e;
36713 edge_iterator ei;
36714
36715 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
36716 {
36717 rtx ret = BB_END (e->src);
36718 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
36719 {
36720 int insn_count = ix86_count_insn (e->src);
36721
36722 /* Pad short function. */
36723 if (insn_count < 4)
36724 {
36725 rtx insn = ret;
36726
36727 /* Find epilogue. */
36728 while (insn
36729 && (!NOTE_P (insn)
36730 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
36731 insn = PREV_INSN (insn);
36732
36733 if (!insn)
36734 insn = ret;
36735
36736 /* Two NOPs count as one instruction. */
36737 insn_count = 2 * (4 - insn_count);
36738 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
36739 }
36740 }
36741 }
36742 }
36743
36744 /* Fix up a Windows system unwinder issue. If an EH region falls through into
36745 the epilogue, the Windows system unwinder will apply epilogue logic and
36746 produce incorrect offsets. This can be avoided by adding a nop between
36747 the last insn that can throw and the first insn of the epilogue. */
36748
36749 static void
36750 ix86_seh_fixup_eh_fallthru (void)
36751 {
36752 edge e;
36753 edge_iterator ei;
36754
36755 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
36756 {
36757 rtx insn, next;
36758
36759 /* Find the beginning of the epilogue. */
36760 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
36761 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
36762 break;
36763 if (insn == NULL)
36764 continue;
36765
36766 /* We only care about preceding insns that can throw. */
36767 insn = prev_active_insn (insn);
36768 if (insn == NULL || !can_throw_internal (insn))
36769 continue;
36770
36771 /* Do not separate calls from their debug information. */
36772 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
36773 if (NOTE_P (next)
36774 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
36775 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
36776 insn = next;
36777 else
36778 break;
36779
36780 emit_insn_after (gen_nops (const1_rtx), insn);
36781 }
36782 }
36783
36784 /* Implement machine specific optimizations. We implement padding of returns
36785 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
36786 static void
36787 ix86_reorg (void)
36788 {
36789 /* We are freeing block_for_insn in the toplev to keep compatibility
36790 with old MDEP_REORGS that are not CFG based. Recompute it now. */
36791 compute_bb_for_insn ();
36792
36793 if (TARGET_SEH && current_function_has_exception_handlers ())
36794 ix86_seh_fixup_eh_fallthru ();
36795
36796 if (optimize && optimize_function_for_speed_p (cfun))
36797 {
36798 if (TARGET_PAD_SHORT_FUNCTION)
36799 ix86_pad_short_function ();
36800 else if (TARGET_PAD_RETURNS)
36801 ix86_pad_returns ();
36802 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36803 if (TARGET_FOUR_JUMP_LIMIT)
36804 ix86_avoid_jump_mispredicts ();
36805 #endif
36806 }
36807 }
36808
36809 /* Return nonzero when a QImode register that must be represented via a REX
36810 prefix is used. */
36811 bool
36812 x86_extended_QIreg_mentioned_p (rtx insn)
36813 {
36814 int i;
36815 extract_insn_cached (insn);
36816 for (i = 0; i < recog_data.n_operands; i++)
36817 if (GENERAL_REG_P (recog_data.operand[i])
36818 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
36819 return true;
36820 return false;
36821 }
36822
36823 /* Return nonzero when P points to a register encoded via a REX prefix.
36824 Called via for_each_rtx. */
36825 static int
36826 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
36827 {
36828 unsigned int regno;
36829 if (!REG_P (*p))
36830 return 0;
36831 regno = REGNO (*p);
36832 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
36833 }
36834
36835 /* Return true when INSN mentions a register that must be encoded using a
36836 REX prefix. */
36837 bool
36838 x86_extended_reg_mentioned_p (rtx insn)
36839 {
36840 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
36841 extended_reg_mentioned_1, NULL);
36842 }
36843
36844 /* If profitable, negate (without causing overflow) integer constant
36845 of mode MODE at location LOC. Return true in this case. */
36846 bool
36847 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
36848 {
36849 HOST_WIDE_INT val;
36850
36851 if (!CONST_INT_P (*loc))
36852 return false;
36853
36854 switch (mode)
36855 {
36856 case DImode:
36857 /* DImode x86_64 constants must fit in 32 bits. */
36858 gcc_assert (x86_64_immediate_operand (*loc, mode));
36859
36860 mode = SImode;
36861 break;
36862
36863 case SImode:
36864 case HImode:
36865 case QImode:
36866 break;
36867
36868 default:
36869 gcc_unreachable ();
36870 }
36871
36872 /* Avoid overflows. */
36873 if (mode_signbit_p (mode, *loc))
36874 return false;
36875
36876 val = INTVAL (*loc);
36877
36878 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
36879 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
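      /* Illustrative example (not part of the original comment):
	 `addl $128, %eax' needs a 32-bit immediate, while the equivalent
	 `subl $-128, %eax' fits in a sign-extended 8-bit immediate, so 128
	 is negated even though it is positive.  */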
36880 if ((val < 0 && val != -128)
36881 || val == 128)
36882 {
36883 *loc = GEN_INT (-val);
36884 return true;
36885 }
36886
36887 return false;
36888 }
36889
36890 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
36891 optabs would emit if we didn't have TFmode patterns. */
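/* Rough outline of the expansion below (a restatement, not extra semantics):
   if IN is non-negative we convert it directly; otherwise we compute
   i0 = (in >> 1) | (in & 1), i.e. halve the value rounding to odd so no low
   bit is lost to the final rounding, convert i0 to floating point, and
   return f0 + f0.  */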
36892
36893 void
36894 x86_emit_floatuns (rtx operands[2])
36895 {
36896 rtx neglab, donelab, i0, i1, f0, in, out;
36897 enum machine_mode mode, inmode;
36898
36899 inmode = GET_MODE (operands[1]);
36900 gcc_assert (inmode == SImode || inmode == DImode);
36901
36902 out = operands[0];
36903 in = force_reg (inmode, operands[1]);
36904 mode = GET_MODE (out);
36905 neglab = gen_label_rtx ();
36906 donelab = gen_label_rtx ();
36907 f0 = gen_reg_rtx (mode);
36908
36909 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
36910
36911 expand_float (out, in, 0);
36912
36913 emit_jump_insn (gen_jump (donelab));
36914 emit_barrier ();
36915
36916 emit_label (neglab);
36917
36918 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
36919 1, OPTAB_DIRECT);
36920 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
36921 1, OPTAB_DIRECT);
36922 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
36923
36924 expand_float (f0, i0, 0);
36925
36926 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
36927
36928 emit_label (donelab);
36929 }
36930 \f
36931 /* AVX512F does support 64-byte integer vector operations,
36932 thus the longest vector we are faced with is V64QImode. */
36933 #define MAX_VECT_LEN 64
36934
36935 struct expand_vec_perm_d
36936 {
36937 rtx target, op0, op1;
36938 unsigned char perm[MAX_VECT_LEN];
36939 enum machine_mode vmode;
36940 unsigned char nelt;
36941 bool one_operand_p;
36942 bool testing_p;
36943 };
36944
36945 static bool canonicalize_perm (struct expand_vec_perm_d *d);
36946 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
36947 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
36948
36949 /* Get a vector mode of the same size as the original but with elements
36950 twice as wide. This is only guaranteed to apply to integral vectors. */
36951
36952 static inline enum machine_mode
36953 get_mode_wider_vector (enum machine_mode o)
36954 {
36955 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
36956 enum machine_mode n = GET_MODE_WIDER_MODE (o);
36957 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
36958 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
36959 return n;
36960 }
36961
36962 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36963 with all elements equal to VAR. Return true if successful. */
36964
36965 static bool
36966 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
36967 rtx target, rtx val)
36968 {
36969 bool ok;
36970
36971 switch (mode)
36972 {
36973 case V2SImode:
36974 case V2SFmode:
36975 if (!mmx_ok)
36976 return false;
36977 /* FALLTHRU */
36978
36979 case V4DFmode:
36980 case V4DImode:
36981 case V8SFmode:
36982 case V8SImode:
36983 case V2DFmode:
36984 case V2DImode:
36985 case V4SFmode:
36986 case V4SImode:
36987 {
36988 rtx insn, dup;
36989
36990 /* First attempt to recognize VAL as-is. */
36991 dup = gen_rtx_VEC_DUPLICATE (mode, val);
36992 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
36993 if (recog_memoized (insn) < 0)
36994 {
36995 rtx seq;
36996 /* If that fails, force VAL into a register. */
36997
36998 start_sequence ();
36999 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
37000 seq = get_insns ();
37001 end_sequence ();
37002 if (seq)
37003 emit_insn_before (seq, insn);
37004
37005 ok = recog_memoized (insn) >= 0;
37006 gcc_assert (ok);
37007 }
37008 }
37009 return true;
37010
37011 case V4HImode:
37012 if (!mmx_ok)
37013 return false;
37014 if (TARGET_SSE || TARGET_3DNOW_A)
37015 {
37016 rtx x;
37017
37018 val = gen_lowpart (SImode, val);
37019 x = gen_rtx_TRUNCATE (HImode, val);
37020 x = gen_rtx_VEC_DUPLICATE (mode, x);
37021 emit_insn (gen_rtx_SET (VOIDmode, target, x));
37022 return true;
37023 }
37024 goto widen;
37025
37026 case V8QImode:
37027 if (!mmx_ok)
37028 return false;
37029 goto widen;
37030
37031 case V8HImode:
37032 if (TARGET_SSE2)
37033 {
37034 struct expand_vec_perm_d dperm;
37035 rtx tmp1, tmp2;
37036
37037 permute:
37038 memset (&dperm, 0, sizeof (dperm));
37039 dperm.target = target;
37040 dperm.vmode = mode;
37041 dperm.nelt = GET_MODE_NUNITS (mode);
37042 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
37043 dperm.one_operand_p = true;
37044
37045 /* Extend to SImode using a paradoxical SUBREG. */
37046 tmp1 = gen_reg_rtx (SImode);
37047 emit_move_insn (tmp1, gen_lowpart (SImode, val));
37048
37049 /* Insert the SImode value as low element of a V4SImode vector. */
37050 tmp2 = gen_reg_rtx (V4SImode);
37051 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
37052 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
37053
37054 ok = (expand_vec_perm_1 (&dperm)
37055 || expand_vec_perm_broadcast_1 (&dperm));
37056 gcc_assert (ok);
37057 return ok;
37058 }
37059 goto widen;
37060
37061 case V16QImode:
37062 if (TARGET_SSE2)
37063 goto permute;
37064 goto widen;
37065
37066 widen:
37067 /* Replicate the value once into the next wider mode and recurse. */
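      /* E.g. (illustrative): for V8QImode with byte value b this builds the
	 HImode value (b << 8) | b and recurses with V4HImode.  */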
37068 {
37069 enum machine_mode smode, wsmode, wvmode;
37070 rtx x;
37071
37072 smode = GET_MODE_INNER (mode);
37073 wvmode = get_mode_wider_vector (mode);
37074 wsmode = GET_MODE_INNER (wvmode);
37075
37076 val = convert_modes (wsmode, smode, val, true);
37077 x = expand_simple_binop (wsmode, ASHIFT, val,
37078 GEN_INT (GET_MODE_BITSIZE (smode)),
37079 NULL_RTX, 1, OPTAB_LIB_WIDEN);
37080 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
37081
37082 x = gen_reg_rtx (wvmode);
37083 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
37084 gcc_assert (ok);
37085 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
37086 return ok;
37087 }
37088
37089 case V16HImode:
37090 case V32QImode:
37091 {
37092 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
37093 rtx x = gen_reg_rtx (hvmode);
37094
37095 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
37096 gcc_assert (ok);
37097
37098 x = gen_rtx_VEC_CONCAT (mode, x, x);
37099 emit_insn (gen_rtx_SET (VOIDmode, target, x));
37100 }
37101 return true;
37102
37103 default:
37104 return false;
37105 }
37106 }
37107
37108 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
37109 whose ONE_VAR element is VAR, and other elements are zero. Return true
37110 if successful. */
37111
37112 static bool
37113 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
37114 rtx target, rtx var, int one_var)
37115 {
37116 enum machine_mode vsimode;
37117 rtx new_target;
37118 rtx x, tmp;
37119 bool use_vector_set = false;
37120
37121 switch (mode)
37122 {
37123 case V2DImode:
37124 /* For SSE4.1, we normally use vector set. But if the second
37125 element is zero and inter-unit moves are OK, we use movq
37126 instead. */
37127 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
37128 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
37129 && one_var == 0));
37130 break;
37131 case V16QImode:
37132 case V4SImode:
37133 case V4SFmode:
37134 use_vector_set = TARGET_SSE4_1;
37135 break;
37136 case V8HImode:
37137 use_vector_set = TARGET_SSE2;
37138 break;
37139 case V4HImode:
37140 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
37141 break;
37142 case V32QImode:
37143 case V16HImode:
37144 case V8SImode:
37145 case V8SFmode:
37146 case V4DFmode:
37147 use_vector_set = TARGET_AVX;
37148 break;
37149 case V4DImode:
37150 /* Use ix86_expand_vector_set in 64bit mode only. */
37151 use_vector_set = TARGET_AVX && TARGET_64BIT;
37152 break;
37153 default:
37154 break;
37155 }
37156
37157 if (use_vector_set)
37158 {
37159 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
37160 var = force_reg (GET_MODE_INNER (mode), var);
37161 ix86_expand_vector_set (mmx_ok, target, var, one_var);
37162 return true;
37163 }
37164
37165 switch (mode)
37166 {
37167 case V2SFmode:
37168 case V2SImode:
37169 if (!mmx_ok)
37170 return false;
37171 /* FALLTHRU */
37172
37173 case V2DFmode:
37174 case V2DImode:
37175 if (one_var != 0)
37176 return false;
37177 var = force_reg (GET_MODE_INNER (mode), var);
37178 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
37179 emit_insn (gen_rtx_SET (VOIDmode, target, x));
37180 return true;
37181
37182 case V4SFmode:
37183 case V4SImode:
37184 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
37185 new_target = gen_reg_rtx (mode);
37186 else
37187 new_target = target;
37188 var = force_reg (GET_MODE_INNER (mode), var);
37189 x = gen_rtx_VEC_DUPLICATE (mode, var);
37190 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
37191 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
37192 if (one_var != 0)
37193 {
37194 /* We need to shuffle the value to the correct position, so
37195 create a new pseudo to store the intermediate result. */
37196
37197 /* With SSE2, we can use the integer shuffle insns. */
37198 if (mode != V4SFmode && TARGET_SSE2)
37199 {
37200 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
37201 const1_rtx,
37202 GEN_INT (one_var == 1 ? 0 : 1),
37203 GEN_INT (one_var == 2 ? 0 : 1),
37204 GEN_INT (one_var == 3 ? 0 : 1)));
37205 if (target != new_target)
37206 emit_move_insn (target, new_target);
37207 return true;
37208 }
37209
37210 /* Otherwise convert the intermediate result to V4SFmode and
37211 use the SSE1 shuffle instructions. */
37212 if (mode != V4SFmode)
37213 {
37214 tmp = gen_reg_rtx (V4SFmode);
37215 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
37216 }
37217 else
37218 tmp = new_target;
37219
37220 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
37221 const1_rtx,
37222 GEN_INT (one_var == 1 ? 0 : 1),
37223 GEN_INT (one_var == 2 ? 0+4 : 1+4),
37224 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
37225
37226 if (mode != V4SFmode)
37227 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
37228 else if (tmp != target)
37229 emit_move_insn (target, tmp);
37230 }
37231 else if (target != new_target)
37232 emit_move_insn (target, new_target);
37233 return true;
37234
37235 case V8HImode:
37236 case V16QImode:
37237 vsimode = V4SImode;
37238 goto widen;
37239 case V4HImode:
37240 case V8QImode:
37241 if (!mmx_ok)
37242 return false;
37243 vsimode = V2SImode;
37244 goto widen;
37245 widen:
37246 if (one_var != 0)
37247 return false;
37248
37249 /* Zero extend the variable element to SImode and recurse. */
37250 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
37251
37252 x = gen_reg_rtx (vsimode);
37253 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
37254 var, one_var))
37255 gcc_unreachable ();
37256
37257 emit_move_insn (target, gen_lowpart (mode, x));
37258 return true;
37259
37260 default:
37261 return false;
37262 }
37263 }
37264
37265 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
37266 consisting of the values in VALS. It is known that all elements
37267 except ONE_VAR are constants. Return true if successful. */
37268
37269 static bool
37270 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
37271 rtx target, rtx vals, int one_var)
37272 {
37273 rtx var = XVECEXP (vals, 0, one_var);
37274 enum machine_mode wmode;
37275 rtx const_vec, x;
37276
37277 const_vec = copy_rtx (vals);
37278 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
37279 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
37280
37281 switch (mode)
37282 {
37283 case V2DFmode:
37284 case V2DImode:
37285 case V2SFmode:
37286 case V2SImode:
37287 /* For the two element vectors, it's just as easy to use
37288 the general case. */
37289 return false;
37290
37291 case V4DImode:
37292 /* Use ix86_expand_vector_set in 64bit mode only. */
37293 if (!TARGET_64BIT)
37294 return false;
37295 case V4DFmode:
37296 case V8SFmode:
37297 case V8SImode:
37298 case V16HImode:
37299 case V32QImode:
37300 case V4SFmode:
37301 case V4SImode:
37302 case V8HImode:
37303 case V4HImode:
37304 break;
37305
37306 case V16QImode:
37307 if (TARGET_SSE4_1)
37308 break;
37309 wmode = V8HImode;
37310 goto widen;
37311 case V8QImode:
37312 wmode = V4HImode;
37313 goto widen;
37314 widen:
37315 /* There's no way to set one QImode entry easily. Combine
37316 the variable value with its adjacent constant value, and
37317 promote to an HImode set. */
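      /* E.g. (illustrative): for a V8QImode vector whose only variable
	 element is element 3, we merge it with constant element 2 into the
	 HImode value (var << 8) | elt2 and set element 1 of the V4HImode
	 view instead.  */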
37318 x = XVECEXP (vals, 0, one_var ^ 1);
37319 if (one_var & 1)
37320 {
37321 var = convert_modes (HImode, QImode, var, true);
37322 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
37323 NULL_RTX, 1, OPTAB_LIB_WIDEN);
37324 x = GEN_INT (INTVAL (x) & 0xff);
37325 }
37326 else
37327 {
37328 var = convert_modes (HImode, QImode, var, true);
37329 x = gen_int_mode (INTVAL (x) << 8, HImode);
37330 }
37331 if (x != const0_rtx)
37332 var = expand_simple_binop (HImode, IOR, var, x, var,
37333 1, OPTAB_LIB_WIDEN);
37334
37335 x = gen_reg_rtx (wmode);
37336 emit_move_insn (x, gen_lowpart (wmode, const_vec));
37337 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
37338
37339 emit_move_insn (target, gen_lowpart (mode, x));
37340 return true;
37341
37342 default:
37343 return false;
37344 }
37345
37346 emit_move_insn (target, const_vec);
37347 ix86_expand_vector_set (mmx_ok, target, var, one_var);
37348 return true;
37349 }
37350
37351 /* A subroutine of ix86_expand_vector_init_general. Use vector
37352 concatenate to handle the most general case: all values variable,
37353 and none identical. */
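/* For example (illustrative): an 8-element V8SFmode build is done pairwise:
   four V2SFmode concats, then two V4SFmode concats of those, then a final
   V8SFmode concat of the two halves.  */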
37354
37355 static void
37356 ix86_expand_vector_init_concat (enum machine_mode mode,
37357 rtx target, rtx *ops, int n)
37358 {
37359 enum machine_mode cmode, hmode = VOIDmode;
37360 rtx first[8], second[4];
37361 rtvec v;
37362 int i, j;
37363
37364 switch (n)
37365 {
37366 case 2:
37367 switch (mode)
37368 {
37369 case V8SImode:
37370 cmode = V4SImode;
37371 break;
37372 case V8SFmode:
37373 cmode = V4SFmode;
37374 break;
37375 case V4DImode:
37376 cmode = V2DImode;
37377 break;
37378 case V4DFmode:
37379 cmode = V2DFmode;
37380 break;
37381 case V4SImode:
37382 cmode = V2SImode;
37383 break;
37384 case V4SFmode:
37385 cmode = V2SFmode;
37386 break;
37387 case V2DImode:
37388 cmode = DImode;
37389 break;
37390 case V2SImode:
37391 cmode = SImode;
37392 break;
37393 case V2DFmode:
37394 cmode = DFmode;
37395 break;
37396 case V2SFmode:
37397 cmode = SFmode;
37398 break;
37399 default:
37400 gcc_unreachable ();
37401 }
37402
37403 if (!register_operand (ops[1], cmode))
37404 ops[1] = force_reg (cmode, ops[1]);
37405 if (!register_operand (ops[0], cmode))
37406 ops[0] = force_reg (cmode, ops[0]);
37407 emit_insn (gen_rtx_SET (VOIDmode, target,
37408 gen_rtx_VEC_CONCAT (mode, ops[0],
37409 ops[1])));
37410 break;
37411
37412 case 4:
37413 switch (mode)
37414 {
37415 case V4DImode:
37416 cmode = V2DImode;
37417 break;
37418 case V4DFmode:
37419 cmode = V2DFmode;
37420 break;
37421 case V4SImode:
37422 cmode = V2SImode;
37423 break;
37424 case V4SFmode:
37425 cmode = V2SFmode;
37426 break;
37427 default:
37428 gcc_unreachable ();
37429 }
37430 goto half;
37431
37432 case 8:
37433 switch (mode)
37434 {
37435 case V8SImode:
37436 cmode = V2SImode;
37437 hmode = V4SImode;
37438 break;
37439 case V8SFmode:
37440 cmode = V2SFmode;
37441 hmode = V4SFmode;
37442 break;
37443 default:
37444 gcc_unreachable ();
37445 }
37446 goto half;
37447
37448 half:
37449 /* FIXME: We process inputs backward to help RA. PR 36222. */
37450 i = n - 1;
37451 j = (n >> 1) - 1;
37452 for (; i > 0; i -= 2, j--)
37453 {
37454 first[j] = gen_reg_rtx (cmode);
37455 v = gen_rtvec (2, ops[i - 1], ops[i]);
37456 ix86_expand_vector_init (false, first[j],
37457 gen_rtx_PARALLEL (cmode, v));
37458 }
37459
37460 n >>= 1;
37461 if (n > 2)
37462 {
37463 gcc_assert (hmode != VOIDmode);
37464 for (i = j = 0; i < n; i += 2, j++)
37465 {
37466 second[j] = gen_reg_rtx (hmode);
37467 ix86_expand_vector_init_concat (hmode, second [j],
37468 &first [i], 2);
37469 }
37470 n >>= 1;
37471 ix86_expand_vector_init_concat (mode, target, second, n);
37472 }
37473 else
37474 ix86_expand_vector_init_concat (mode, target, first, n);
37475 break;
37476
37477 default:
37478 gcc_unreachable ();
37479 }
37480 }
37481
37482 /* A subroutine of ix86_expand_vector_init_general. Use vector
37483 interleave to handle the most general case: all values variable,
37484 and none identical. */
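/* Sketch of the strategy (informal): each pair of scalar elements is loaded
   into the low part of an SSE register, and the registers are then merged
   with successively wider "interleave low" operations, roughly the
   punpcklwd / punpckldq / punpcklqdq sequence in the V16QImode case.  */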
37485
37486 static void
37487 ix86_expand_vector_init_interleave (enum machine_mode mode,
37488 rtx target, rtx *ops, int n)
37489 {
37490 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
37491 int i, j;
37492 rtx op0, op1;
37493 rtx (*gen_load_even) (rtx, rtx, rtx);
37494 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
37495 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
37496
37497 switch (mode)
37498 {
37499 case V8HImode:
37500 gen_load_even = gen_vec_setv8hi;
37501 gen_interleave_first_low = gen_vec_interleave_lowv4si;
37502 gen_interleave_second_low = gen_vec_interleave_lowv2di;
37503 inner_mode = HImode;
37504 first_imode = V4SImode;
37505 second_imode = V2DImode;
37506 third_imode = VOIDmode;
37507 break;
37508 case V16QImode:
37509 gen_load_even = gen_vec_setv16qi;
37510 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
37511 gen_interleave_second_low = gen_vec_interleave_lowv4si;
37512 inner_mode = QImode;
37513 first_imode = V8HImode;
37514 second_imode = V4SImode;
37515 third_imode = V2DImode;
37516 break;
37517 default:
37518 gcc_unreachable ();
37519 }
37520
37521 for (i = 0; i < n; i++)
37522 {
37523 /* Extend the odd element to SImode using a paradoxical SUBREG. */
37524 op0 = gen_reg_rtx (SImode);
37525 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
37526
37527 /* Insert the SImode value as low element of V4SImode vector. */
37528 op1 = gen_reg_rtx (V4SImode);
37529 op0 = gen_rtx_VEC_MERGE (V4SImode,
37530 gen_rtx_VEC_DUPLICATE (V4SImode,
37531 op0),
37532 CONST0_RTX (V4SImode),
37533 const1_rtx);
37534 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
37535
37536 /* Cast the V4SImode vector back to a vector in the original mode. */
37537 op0 = gen_reg_rtx (mode);
37538 emit_move_insn (op0, gen_lowpart (mode, op1));
37539
37540 /* Load even elements into the second position. */
37541 emit_insn (gen_load_even (op0,
37542 force_reg (inner_mode,
37543 ops [i + i + 1]),
37544 const1_rtx));
37545
37546 /* Cast vector to FIRST_IMODE vector. */
37547 ops[i] = gen_reg_rtx (first_imode);
37548 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
37549 }
37550
37551 /* Interleave low FIRST_IMODE vectors. */
37552 for (i = j = 0; i < n; i += 2, j++)
37553 {
37554 op0 = gen_reg_rtx (first_imode);
37555 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
37556
37557 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
37558 ops[j] = gen_reg_rtx (second_imode);
37559 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
37560 }
37561
37562 /* Interleave low SECOND_IMODE vectors. */
37563 switch (second_imode)
37564 {
37565 case V4SImode:
37566 for (i = j = 0; i < n / 2; i += 2, j++)
37567 {
37568 op0 = gen_reg_rtx (second_imode);
37569 emit_insn (gen_interleave_second_low (op0, ops[i],
37570 ops[i + 1]));
37571
37572 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
37573 vector. */
37574 ops[j] = gen_reg_rtx (third_imode);
37575 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
37576 }
37577 second_imode = V2DImode;
37578 gen_interleave_second_low = gen_vec_interleave_lowv2di;
37579 /* FALLTHRU */
37580
37581 case V2DImode:
37582 op0 = gen_reg_rtx (second_imode);
37583 emit_insn (gen_interleave_second_low (op0, ops[0],
37584 ops[1]));
37585
37586 /* Cast the SECOND_IMODE vector back to a vector of the original
37587 mode. */
37588 emit_insn (gen_rtx_SET (VOIDmode, target,
37589 gen_lowpart (mode, op0)));
37590 break;
37591
37592 default:
37593 gcc_unreachable ();
37594 }
37595 }
37596
37597 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
37598 all values variable, and none identical. */
37599
37600 static void
37601 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
37602 rtx target, rtx vals)
37603 {
37604 rtx ops[32], op0, op1;
37605 enum machine_mode half_mode = VOIDmode;
37606 int n, i;
37607
37608 switch (mode)
37609 {
37610 case V2SFmode:
37611 case V2SImode:
37612 if (!mmx_ok && !TARGET_SSE)
37613 break;
37614 /* FALLTHRU */
37615
37616 case V8SFmode:
37617 case V8SImode:
37618 case V4DFmode:
37619 case V4DImode:
37620 case V4SFmode:
37621 case V4SImode:
37622 case V2DFmode:
37623 case V2DImode:
37624 n = GET_MODE_NUNITS (mode);
37625 for (i = 0; i < n; i++)
37626 ops[i] = XVECEXP (vals, 0, i);
37627 ix86_expand_vector_init_concat (mode, target, ops, n);
37628 return;
37629
37630 case V32QImode:
37631 half_mode = V16QImode;
37632 goto half;
37633
37634 case V16HImode:
37635 half_mode = V8HImode;
37636 goto half;
37637
37638 half:
37639 n = GET_MODE_NUNITS (mode);
37640 for (i = 0; i < n; i++)
37641 ops[i] = XVECEXP (vals, 0, i);
37642 op0 = gen_reg_rtx (half_mode);
37643 op1 = gen_reg_rtx (half_mode);
37644 ix86_expand_vector_init_interleave (half_mode, op0, ops,
37645 n >> 2);
37646 ix86_expand_vector_init_interleave (half_mode, op1,
37647 &ops [n >> 1], n >> 2);
37648 emit_insn (gen_rtx_SET (VOIDmode, target,
37649 gen_rtx_VEC_CONCAT (mode, op0, op1)));
37650 return;
37651
37652 case V16QImode:
37653 if (!TARGET_SSE4_1)
37654 break;
37655 /* FALLTHRU */
37656
37657 case V8HImode:
37658 if (!TARGET_SSE2)
37659 break;
37660
37661 /* Don't use ix86_expand_vector_init_interleave if we can't
37662 move from GPR to SSE register directly. */
37663 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
37664 break;
37665
37666 n = GET_MODE_NUNITS (mode);
37667 for (i = 0; i < n; i++)
37668 ops[i] = XVECEXP (vals, 0, i);
37669 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
37670 return;
37671
37672 case V4HImode:
37673 case V8QImode:
37674 break;
37675
37676 default:
37677 gcc_unreachable ();
37678 }
37679
37680 {
37681 int i, j, n_elts, n_words, n_elt_per_word;
37682 enum machine_mode inner_mode;
37683 rtx words[4], shift;
37684
37685 inner_mode = GET_MODE_INNER (mode);
37686 n_elts = GET_MODE_NUNITS (mode);
37687 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
37688 n_elt_per_word = n_elts / n_words;
37689 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
37690
37691 for (i = 0; i < n_words; ++i)
37692 {
37693 rtx word = NULL_RTX;
37694
37695 for (j = 0; j < n_elt_per_word; ++j)
37696 {
37697 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
37698 elt = convert_modes (word_mode, inner_mode, elt, true);
37699
37700 if (j == 0)
37701 word = elt;
37702 else
37703 {
37704 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
37705 word, 1, OPTAB_LIB_WIDEN);
37706 word = expand_simple_binop (word_mode, IOR, word, elt,
37707 word, 1, OPTAB_LIB_WIDEN);
37708 }
37709 }
37710
37711 words[i] = word;
37712 }
37713
37714 if (n_words == 1)
37715 emit_move_insn (target, gen_lowpart (mode, words[0]));
37716 else if (n_words == 2)
37717 {
37718 rtx tmp = gen_reg_rtx (mode);
37719 emit_clobber (tmp);
37720 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
37721 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
37722 emit_move_insn (target, tmp);
37723 }
37724 else if (n_words == 4)
37725 {
37726 rtx tmp = gen_reg_rtx (V4SImode);
37727 gcc_assert (word_mode == SImode);
37728 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
37729 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
37730 emit_move_insn (target, gen_lowpart (mode, tmp));
37731 }
37732 else
37733 gcc_unreachable ();
37734 }
37735 }
37736
37737 /* Initialize vector TARGET via VALS. Suppress the use of MMX
37738 instructions unless MMX_OK is true. */
37739
37740 void
37741 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
37742 {
37743 enum machine_mode mode = GET_MODE (target);
37744 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37745 int n_elts = GET_MODE_NUNITS (mode);
37746 int n_var = 0, one_var = -1;
37747 bool all_same = true, all_const_zero = true;
37748 int i;
37749 rtx x;
37750
37751 for (i = 0; i < n_elts; ++i)
37752 {
37753 x = XVECEXP (vals, 0, i);
37754 if (!(CONST_INT_P (x)
37755 || GET_CODE (x) == CONST_DOUBLE
37756 || GET_CODE (x) == CONST_FIXED))
37757 n_var++, one_var = i;
37758 else if (x != CONST0_RTX (inner_mode))
37759 all_const_zero = false;
37760 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
37761 all_same = false;
37762 }
37763
37764 /* Constants are best loaded from the constant pool. */
37765 if (n_var == 0)
37766 {
37767 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
37768 return;
37769 }
37770
37771 /* If all values are identical, broadcast the value. */
37772 if (all_same
37773 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
37774 XVECEXP (vals, 0, 0)))
37775 return;
37776
37777 /* Values where only one field is non-constant are best loaded from
37778 the pool and overwritten via move later. */
37779 if (n_var == 1)
37780 {
37781 if (all_const_zero
37782 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
37783 XVECEXP (vals, 0, one_var),
37784 one_var))
37785 return;
37786
37787 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
37788 return;
37789 }
37790
37791 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
37792 }
37793
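/* Store scalar VAL into element ELT of vector TARGET. MMX instructions are
   used only when MMX_OK is true. */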
37794 void
37795 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
37796 {
37797 enum machine_mode mode = GET_MODE (target);
37798 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37799 enum machine_mode half_mode;
37800 bool use_vec_merge = false;
37801 rtx tmp;
37802 static rtx (*gen_extract[6][2]) (rtx, rtx)
37803 = {
37804 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
37805 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
37806 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
37807 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
37808 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
37809 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
37810 };
37811 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
37812 = {
37813 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
37814 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
37815 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
37816 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
37817 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
37818 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
37819 };
37820 int i, j, n;
37821
37822 switch (mode)
37823 {
37824 case V2SFmode:
37825 case V2SImode:
37826 if (mmx_ok)
37827 {
37828 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37829 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
37830 if (elt == 0)
37831 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37832 else
37833 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37834 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37835 return;
37836 }
37837 break;
37838
37839 case V2DImode:
37840 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
37841 if (use_vec_merge)
37842 break;
37843
37844 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37845 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
37846 if (elt == 0)
37847 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37848 else
37849 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37850 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37851 return;
37852
37853 case V2DFmode:
37854 {
37855 rtx op0, op1;
37856
37857 /* For the two element vectors, we implement a VEC_CONCAT with
37858 the extraction of the other element. */
37859
37860 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
37861 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
37862
37863 if (elt == 0)
37864 op0 = val, op1 = tmp;
37865 else
37866 op0 = tmp, op1 = val;
37867
37868 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
37869 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37870 }
37871 return;
37872
37873 case V4SFmode:
37874 use_vec_merge = TARGET_SSE4_1;
37875 if (use_vec_merge)
37876 break;
37877
37878 switch (elt)
37879 {
37880 case 0:
37881 use_vec_merge = true;
37882 break;
37883
37884 case 1:
37885 /* tmp = target = A B C D */
37886 tmp = copy_to_reg (target);
37887 /* target = A A B B */
37888 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
37889 /* target = X A B B */
37890 ix86_expand_vector_set (false, target, val, 0);
37891 /* target = A X C D */
37892 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37893 const1_rtx, const0_rtx,
37894 GEN_INT (2+4), GEN_INT (3+4)));
37895 return;
37896
37897 case 2:
37898 /* tmp = target = A B C D */
37899 tmp = copy_to_reg (target);
37900 /* tmp = X B C D */
37901 ix86_expand_vector_set (false, tmp, val, 0);
37902 /* target = A B X D */
37903 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37904 const0_rtx, const1_rtx,
37905 GEN_INT (0+4), GEN_INT (3+4)));
37906 return;
37907
37908 case 3:
37909 /* tmp = target = A B C D */
37910 tmp = copy_to_reg (target);
37911 /* tmp = X B C D */
37912 ix86_expand_vector_set (false, tmp, val, 0);
37913 /* target = A B C X */
37914 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37915 const0_rtx, const1_rtx,
37916 GEN_INT (2+4), GEN_INT (0+4)));
37917 return;
37918
37919 default:
37920 gcc_unreachable ();
37921 }
37922 break;
37923
37924 case V4SImode:
37925 use_vec_merge = TARGET_SSE4_1;
37926 if (use_vec_merge)
37927 break;
37928
37929 /* Element 0 handled by vec_merge below. */
37930 if (elt == 0)
37931 {
37932 use_vec_merge = true;
37933 break;
37934 }
37935
37936 if (TARGET_SSE2)
37937 {
37938 /* With SSE2, use integer shuffles to swap element 0 and ELT,
37939 store into element 0, then shuffle them back. */
37940
37941 rtx order[4];
37942
37943 order[0] = GEN_INT (elt);
37944 order[1] = const1_rtx;
37945 order[2] = const2_rtx;
37946 order[3] = GEN_INT (3);
37947 order[elt] = const0_rtx;
37948
37949 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37950 order[1], order[2], order[3]));
37951
37952 ix86_expand_vector_set (false, target, val, 0);
37953
37954 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37955 order[1], order[2], order[3]));
37956 }
37957 else
37958 {
37959 /* For SSE1, we have to reuse the V4SF code. */
37960 rtx t = gen_reg_rtx (V4SFmode);
37961 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
37962 emit_move_insn (target, gen_lowpart (mode, t));
37963 }
37964 return;
37965
37966 case V8HImode:
37967 use_vec_merge = TARGET_SSE2;
37968 break;
37969 case V4HImode:
37970 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37971 break;
37972
37973 case V16QImode:
37974 use_vec_merge = TARGET_SSE4_1;
37975 break;
37976
37977 case V8QImode:
37978 break;
37979
37980 case V32QImode:
37981 half_mode = V16QImode;
37982 j = 0;
37983 n = 16;
37984 goto half;
37985
37986 case V16HImode:
37987 half_mode = V8HImode;
37988 j = 1;
37989 n = 8;
37990 goto half;
37991
37992 case V8SImode:
37993 half_mode = V4SImode;
37994 j = 2;
37995 n = 4;
37996 goto half;
37997
37998 case V4DImode:
37999 half_mode = V2DImode;
38000 j = 3;
38001 n = 2;
38002 goto half;
38003
38004 case V8SFmode:
38005 half_mode = V4SFmode;
38006 j = 4;
38007 n = 4;
38008 goto half;
38009
38010 case V4DFmode:
38011 half_mode = V2DFmode;
38012 j = 5;
38013 n = 2;
38014 goto half;
38015
38016 half:
38017 /* Compute offset. */
38018 i = elt / n;
38019 elt %= n;
38020
38021 gcc_assert (i <= 1);
38022
38023 /* Extract the half. */
38024 tmp = gen_reg_rtx (half_mode);
38025 emit_insn (gen_extract[j][i] (tmp, target));
38026
38027 /* Put val in tmp at elt. */
38028 ix86_expand_vector_set (false, tmp, val, elt);
38029
38030 /* Put it back. */
38031 emit_insn (gen_insert[j][i] (target, target, tmp));
38032 return;
38033
38034 default:
38035 break;
38036 }
38037
38038 if (use_vec_merge)
38039 {
38040 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
38041 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
38042 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
38043 }
38044 else
38045 {
38046 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
38047
38048 emit_move_insn (mem, target);
38049
38050 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
38051 emit_move_insn (tmp, val);
38052
38053 emit_move_insn (target, mem);
38054 }
38055 }
38056
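/* Extract element ELT of vector VEC into scalar TARGET. MMX instructions are
   used only when MMX_OK is true. */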
38057 void
38058 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
38059 {
38060 enum machine_mode mode = GET_MODE (vec);
38061 enum machine_mode inner_mode = GET_MODE_INNER (mode);
38062 bool use_vec_extr = false;
38063 rtx tmp;
38064
38065 switch (mode)
38066 {
38067 case V2SImode:
38068 case V2SFmode:
38069 if (!mmx_ok)
38070 break;
38071 /* FALLTHRU */
38072
38073 case V2DFmode:
38074 case V2DImode:
38075 use_vec_extr = true;
38076 break;
38077
38078 case V4SFmode:
38079 use_vec_extr = TARGET_SSE4_1;
38080 if (use_vec_extr)
38081 break;
38082
38083 switch (elt)
38084 {
38085 case 0:
38086 tmp = vec;
38087 break;
38088
38089 case 1:
38090 case 3:
38091 tmp = gen_reg_rtx (mode);
38092 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
38093 GEN_INT (elt), GEN_INT (elt),
38094 GEN_INT (elt+4), GEN_INT (elt+4)));
38095 break;
38096
38097 case 2:
38098 tmp = gen_reg_rtx (mode);
38099 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
38100 break;
38101
38102 default:
38103 gcc_unreachable ();
38104 }
38105 vec = tmp;
38106 use_vec_extr = true;
38107 elt = 0;
38108 break;
38109
38110 case V4SImode:
38111 use_vec_extr = TARGET_SSE4_1;
38112 if (use_vec_extr)
38113 break;
38114
38115 if (TARGET_SSE2)
38116 {
38117 switch (elt)
38118 {
38119 case 0:
38120 tmp = vec;
38121 break;
38122
38123 case 1:
38124 case 3:
38125 tmp = gen_reg_rtx (mode);
38126 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
38127 GEN_INT (elt), GEN_INT (elt),
38128 GEN_INT (elt), GEN_INT (elt)));
38129 break;
38130
38131 case 2:
38132 tmp = gen_reg_rtx (mode);
38133 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
38134 break;
38135
38136 default:
38137 gcc_unreachable ();
38138 }
38139 vec = tmp;
38140 use_vec_extr = true;
38141 elt = 0;
38142 }
38143 else
38144 {
38145 /* For SSE1, we have to reuse the V4SF code. */
38146 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
38147 gen_lowpart (V4SFmode, vec), elt);
38148 return;
38149 }
38150 break;
38151
38152 case V8HImode:
38153 use_vec_extr = TARGET_SSE2;
38154 break;
38155 case V4HImode:
38156 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
38157 break;
38158
38159 case V16QImode:
38160 use_vec_extr = TARGET_SSE4_1;
38161 break;
38162
38163 case V8SFmode:
38164 if (TARGET_AVX)
38165 {
38166 tmp = gen_reg_rtx (V4SFmode);
38167 if (elt < 4)
38168 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
38169 else
38170 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
38171 ix86_expand_vector_extract (false, target, tmp, elt & 3);
38172 return;
38173 }
38174 break;
38175
38176 case V4DFmode:
38177 if (TARGET_AVX)
38178 {
38179 tmp = gen_reg_rtx (V2DFmode);
38180 if (elt < 2)
38181 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
38182 else
38183 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
38184 ix86_expand_vector_extract (false, target, tmp, elt & 1);
38185 return;
38186 }
38187 break;
38188
38189 case V32QImode:
38190 if (TARGET_AVX)
38191 {
38192 tmp = gen_reg_rtx (V16QImode);
38193 if (elt < 16)
38194 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
38195 else
38196 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
38197 ix86_expand_vector_extract (false, target, tmp, elt & 15);
38198 return;
38199 }
38200 break;
38201
38202 case V16HImode:
38203 if (TARGET_AVX)
38204 {
38205 tmp = gen_reg_rtx (V8HImode);
38206 if (elt < 8)
38207 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
38208 else
38209 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
38210 ix86_expand_vector_extract (false, target, tmp, elt & 7);
38211 return;
38212 }
38213 break;
38214
38215 case V8SImode:
38216 if (TARGET_AVX)
38217 {
38218 tmp = gen_reg_rtx (V4SImode);
38219 if (elt < 4)
38220 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
38221 else
38222 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
38223 ix86_expand_vector_extract (false, target, tmp, elt & 3);
38224 return;
38225 }
38226 break;
38227
38228 case V4DImode:
38229 if (TARGET_AVX)
38230 {
38231 tmp = gen_reg_rtx (V2DImode);
38232 if (elt < 2)
38233 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
38234 else
38235 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
38236 ix86_expand_vector_extract (false, target, tmp, elt & 1);
38237 return;
38238 }
38239 break;
38240
38241 case V8QImode:
38242 /* ??? Could extract the appropriate HImode element and shift. */
38243 default:
38244 break;
38245 }
38246
38247 if (use_vec_extr)
38248 {
38249 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
38250 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
38251
38252 /* Let the rtl optimizers know about the zero extension performed. */
38253 if (inner_mode == QImode || inner_mode == HImode)
38254 {
38255 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
38256 target = gen_lowpart (SImode, target);
38257 }
38258
38259 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
38260 }
38261 else
38262 {
38263 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
38264
38265 emit_move_insn (mem, vec);
38266
38267 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
38268 emit_move_insn (target, tmp);
38269 }
38270 }
38271
38272 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
38273 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
38274 The upper bits of DEST are undefined, though they shouldn't cause
38275 exceptions (some bits from src or all zeros are ok). */
38276
38277 static void
38278 emit_reduc_half (rtx dest, rtx src, int i)
38279 {
38280 rtx tem, d = dest;
38281 switch (GET_MODE (src))
38282 {
38283 case V4SFmode:
38284 if (i == 128)
38285 tem = gen_sse_movhlps (dest, src, src);
38286 else
38287 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
38288 GEN_INT (1 + 4), GEN_INT (1 + 4));
38289 break;
38290 case V2DFmode:
38291 tem = gen_vec_interleave_highv2df (dest, src, src);
38292 break;
38293 case V16QImode:
38294 case V8HImode:
38295 case V4SImode:
38296 case V2DImode:
38297 d = gen_reg_rtx (V1TImode);
38298 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
38299 GEN_INT (i / 2));
38300 break;
38301 case V8SFmode:
38302 if (i == 256)
38303 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
38304 else
38305 tem = gen_avx_shufps256 (dest, src, src,
38306 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
38307 break;
38308 case V4DFmode:
38309 if (i == 256)
38310 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
38311 else
38312 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
38313 break;
38314 case V32QImode:
38315 case V16HImode:
38316 case V8SImode:
38317 case V4DImode:
38318 if (i == 256)
38319 {
38320 if (GET_MODE (dest) != V4DImode)
38321 d = gen_reg_rtx (V4DImode);
38322 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
38323 gen_lowpart (V4DImode, src),
38324 const1_rtx);
38325 }
38326 else
38327 {
38328 d = gen_reg_rtx (V2TImode);
38329 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
38330 GEN_INT (i / 2));
38331 }
38332 break;
38333 default:
38334 gcc_unreachable ();
38335 }
38336 emit_insn (tem);
38337 if (d != dest)
38338 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
38339 }
38340
38341 /* Expand a vector reduction. FN is the binary pattern to reduce;
38342 DEST is the destination; IN is the input vector. */
38343
38344 void
38345 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
38346 {
38347 rtx half, dst, vec = in;
38348 enum machine_mode mode = GET_MODE (in);
38349 int i;
38350
38351 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
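  /* phminposuw computes the minimum of the eight unsigned words and places
     it in word 0 of the destination (with its index in word 1), which is
     exactly this reduction in a single instruction.  */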
38352 if (TARGET_SSE4_1
38353 && mode == V8HImode
38354 && fn == gen_uminv8hi3)
38355 {
38356 emit_insn (gen_sse4_1_phminposuw (dest, in));
38357 return;
38358 }
38359
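  /* Otherwise reduce by repeated halving: each emit_reduc_half call folds
     the upper half of the remaining I bits down onto the lower half, and FN
     combines the two halves.  After the last iteration the fully reduced
     value is in element 0 of DEST; the remaining elements are don't-cares.  */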
38360 for (i = GET_MODE_BITSIZE (mode);
38361 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
38362 i >>= 1)
38363 {
38364 half = gen_reg_rtx (mode);
38365 emit_reduc_half (half, vec, i);
38366 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
38367 dst = dest;
38368 else
38369 dst = gen_reg_rtx (mode);
38370 emit_insn (fn (dst, half, vec));
38371 vec = dst;
38372 }
38373 }
38374 \f
38375 /* Target hook for scalar_mode_supported_p. */
38376 static bool
38377 ix86_scalar_mode_supported_p (enum machine_mode mode)
38378 {
38379 if (DECIMAL_FLOAT_MODE_P (mode))
38380 return default_decimal_float_supported_p ();
38381 else if (mode == TFmode)
38382 return true;
38383 else
38384 return default_scalar_mode_supported_p (mode);
38385 }
38386
38387 /* Implements target hook vector_mode_supported_p. */
38388 static bool
38389 ix86_vector_mode_supported_p (enum machine_mode mode)
38390 {
38391 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
38392 return true;
38393 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
38394 return true;
38395 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
38396 return true;
38397 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
38398 return true;
38399 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
38400 return true;
38401 return false;
38402 }
38403
38404 /* Target hook for c_mode_for_suffix. */
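/* The 'q'/'Q' literal suffix selects __float128 (TFmode) and 'w'/'W'
   selects __float80 (XFmode).  */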
38405 static enum machine_mode
38406 ix86_c_mode_for_suffix (char suffix)
38407 {
38408 if (suffix == 'q')
38409 return TFmode;
38410 if (suffix == 'w')
38411 return XFmode;
38412
38413 return VOIDmode;
38414 }
38415
38416 /* Worker function for TARGET_MD_ASM_CLOBBERS.
38417
38418 We do this in the new i386 backend to maintain source compatibility
38419 with the old cc0-based compiler. */
38420
38421 static tree
38422 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
38423 tree inputs ATTRIBUTE_UNUSED,
38424 tree clobbers)
38425 {
38426 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
38427 clobbers);
38428 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
38429 clobbers);
38430 return clobbers;
38431 }
38432
38436 /* Implements the target hook targetm.asm.encode_section_info.  */
38434
38435 static void ATTRIBUTE_UNUSED
38436 ix86_encode_section_info (tree decl, rtx rtl, int first)
38437 {
38438 default_encode_section_info (decl, rtl, first);
38439
38440 if (TREE_CODE (decl) == VAR_DECL
38441 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
38442 && ix86_in_large_data_p (decl))
38443 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
38444 }
38445
38446 /* Worker function for REVERSE_CONDITION. */
38447
38448 enum rtx_code
38449 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
38450 {
38451 return (mode != CCFPmode && mode != CCFPUmode
38452 ? reverse_condition (code)
38453 : reverse_condition_maybe_unordered (code));
38454 }
38455
38456 /* Output code to perform an x87 FP register move, from OPERANDS[1]
38457 to OPERANDS[0]. */
38458
38459 const char *
38460 output_387_reg_move (rtx insn, rtx *operands)
38461 {
38462 if (REG_P (operands[0]))
38463 {
38464 if (REG_P (operands[1])
38465 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
38466 {
38467 if (REGNO (operands[0]) == FIRST_STACK_REG)
38468 return output_387_ffreep (operands, 0);
38469 return "fstp\t%y0";
38470 }
38471 if (STACK_TOP_P (operands[0]))
38472 return "fld%Z1\t%y1";
38473 return "fst\t%y0";
38474 }
38475 else if (MEM_P (operands[0]))
38476 {
38477 gcc_assert (REG_P (operands[1]));
38478 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
38479 return "fstp%Z0\t%y0";
38480 else
38481 {
38482 /* There is no non-popping store to memory for XFmode.
38483 So if we need one, follow the store with a load. */
38484 if (GET_MODE (operands[0]) == XFmode)
38485 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
38486 else
38487 return "fst%Z0\t%y0";
38488 }
38489 }
38490 else
38491 gcc_unreachable();
38492 }
38493
38494 /* Output code to perform a conditional jump to LABEL if the C2 flag in the
38495    FP status register is set.  */
38496
38497 void
38498 ix86_emit_fp_unordered_jump (rtx label)
38499 {
38500 rtx reg = gen_reg_rtx (HImode);
38501 rtx temp;
38502
38503 emit_insn (gen_x86_fnstsw_1 (reg));
38504
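  /* If SAHF is usable, copy the high byte of the FP status word into EFLAGS
     and branch on the resulting parity flag (C2 maps to PF); otherwise test
     the C2 bit (0x04 of the high status byte) directly.  */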
38505 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
38506 {
38507 emit_insn (gen_x86_sahf_1 (reg));
38508
38509 temp = gen_rtx_REG (CCmode, FLAGS_REG);
38510 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
38511 }
38512 else
38513 {
38514 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
38515
38516 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
38517 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
38518 }
38519
38520 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
38521 gen_rtx_LABEL_REF (VOIDmode, label),
38522 pc_rtx);
38523 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
38524
38525 emit_jump_insn (temp);
38526 predict_jump (REG_BR_PROB_BASE * 10 / 100);
38527 }
38528
38529 /* Output code to perform a log1p XFmode calculation. */
38530
38531 void ix86_emit_i387_log1p (rtx op0, rtx op1)
38532 {
38533 rtx label1 = gen_label_rtx ();
38534 rtx label2 = gen_label_rtx ();
38535
38536 rtx tmp = gen_reg_rtx (XFmode);
38537 rtx tmp2 = gen_reg_rtx (XFmode);
38538 rtx test;
38539
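  /* fyl2xp1 computes y * log2(x + 1), but is only specified for
     |x| < 1 - sqrt(2)/2 ~= 0.2929; within that range it is more accurate
     than computing log2 of 1 + x explicitly.  For larger |x|, fall back
     to fyl2x applied to 1.0 + x.  */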
38540 emit_insn (gen_absxf2 (tmp, op1));
38541 test = gen_rtx_GE (VOIDmode, tmp,
38542 CONST_DOUBLE_FROM_REAL_VALUE (
38543 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
38544 XFmode));
38545 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
38546
38547 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
38548 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
38549 emit_jump (label2);
38550
38551 emit_label (label1);
38552 emit_move_insn (tmp, CONST1_RTX (XFmode));
38553 emit_insn (gen_addxf3 (tmp, op1, tmp));
38554 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
38555 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
38556
38557 emit_label (label2);
38558 }
38559
38560 /* Output x87 code to compute OP0 = round (OP1), rounding halfway cases away from zero.  */
38561 void ix86_emit_i387_round (rtx op0, rtx op1)
38562 {
38563 enum machine_mode inmode = GET_MODE (op1);
38564 enum machine_mode outmode = GET_MODE (op0);
38565 rtx e1, e2, res, tmp, tmp1, half;
38566 rtx scratch = gen_reg_rtx (HImode);
38567 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
38568 rtx jump_label = gen_label_rtx ();
38569 rtx insn;
38570 rtx (*gen_abs) (rtx, rtx);
38571 rtx (*gen_neg) (rtx, rtx);
38572
38573 switch (inmode)
38574 {
38575 case SFmode:
38576 gen_abs = gen_abssf2;
38577 break;
38578 case DFmode:
38579 gen_abs = gen_absdf2;
38580 break;
38581 case XFmode:
38582 gen_abs = gen_absxf2;
38583 break;
38584 default:
38585 gcc_unreachable ();
38586 }
38587
38588 switch (outmode)
38589 {
38590 case SFmode:
38591 gen_neg = gen_negsf2;
38592 break;
38593 case DFmode:
38594 gen_neg = gen_negdf2;
38595 break;
38596 case XFmode:
38597 gen_neg = gen_negxf2;
38598 break;
38599 case HImode:
38600 gen_neg = gen_neghi2;
38601 break;
38602 case SImode:
38603 gen_neg = gen_negsi2;
38604 break;
38605 case DImode:
38606 gen_neg = gen_negdi2;
38607 break;
38608 default:
38609 gcc_unreachable ();
38610 }
38611
38612 e1 = gen_reg_rtx (inmode);
38613 e2 = gen_reg_rtx (inmode);
38614 res = gen_reg_rtx (outmode);
38615
38616 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
38617
38618 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
38619
38620 /* scratch = fxam(op1) */
38621 emit_insn (gen_rtx_SET (VOIDmode, scratch,
38622 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
38623 UNSPEC_FXAM)));
38624 /* e1 = fabs(op1) */
38625 emit_insn (gen_abs (e1, op1));
38626
38627 /* e2 = e1 + 0.5 */
38628 half = force_reg (inmode, half);
38629 emit_insn (gen_rtx_SET (VOIDmode, e2,
38630 gen_rtx_PLUS (inmode, e1, half)));
38631
38632 /* res = floor(e2) */
38633 if (inmode != XFmode)
38634 {
38635 tmp1 = gen_reg_rtx (XFmode);
38636
38637 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
38638 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
38639 }
38640 else
38641 tmp1 = e2;
38642
38643 switch (outmode)
38644 {
38645 case SFmode:
38646 case DFmode:
38647 {
38648 rtx tmp0 = gen_reg_rtx (XFmode);
38649
38650 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
38651
38652 emit_insn (gen_rtx_SET (VOIDmode, res,
38653 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
38654 UNSPEC_TRUNC_NOOP)));
38655 }
38656 break;
38657 case XFmode:
38658 emit_insn (gen_frndintxf2_floor (res, tmp1));
38659 break;
38660 case HImode:
38661 emit_insn (gen_lfloorxfhi2 (res, tmp1));
38662 break;
38663 case SImode:
38664 emit_insn (gen_lfloorxfsi2 (res, tmp1));
38665 break;
38666 case DImode:
38667 emit_insn (gen_lfloorxfdi2 (res, tmp1));
38668 break;
38669 default:
38670 gcc_unreachable ();
38671 }
38672
38673 /* flags = signbit(a) */
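  /* fxam left the sign of op1 in the C1 condition flag, i.e. bit 9 of the
     FP status word, which is bit 1 of the high byte held in SCRATCH.  */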
38674 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
38675
38676 /* if (flags) then res = -res */
38677 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
38678 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
38679 gen_rtx_LABEL_REF (VOIDmode, jump_label),
38680 pc_rtx);
38681 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38682 predict_jump (REG_BR_PROB_BASE * 50 / 100);
38683 JUMP_LABEL (insn) = jump_label;
38684
38685 emit_insn (gen_neg (res, res));
38686
38687 emit_label (jump_label);
38688 LABEL_NUSES (jump_label) = 1;
38689
38690 emit_move_insn (op0, res);
38691 }
38692
38693 /* Output code to perform a Newton-Raphson approximation of a single precision
38694    floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
38695
38696 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
38697 {
38698 rtx x0, x1, e0, e1;
38699
38700 x0 = gen_reg_rtx (mode);
38701 e0 = gen_reg_rtx (mode);
38702 e1 = gen_reg_rtx (mode);
38703 x1 = gen_reg_rtx (mode);
38704
38705 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
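  /* This is a single Newton-Raphson refinement step,
     x1 = x0 * (2.0 - b * x0), applied to the hardware rcp estimate
     x0 ~= 1/b, written so that e1 = 2*x0 can be computed in parallel
     with the b*x0*x0 dependency chain.  */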
38706
38707 b = force_reg (mode, b);
38708
38709 /* x0 = rcp(b) estimate */
38710 emit_insn (gen_rtx_SET (VOIDmode, x0,
38711 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
38712 UNSPEC_RCP)));
38713 /* e0 = x0 * b */
38714 emit_insn (gen_rtx_SET (VOIDmode, e0,
38715 gen_rtx_MULT (mode, x0, b)));
38716
38717 /* e0 = x0 * e0 */
38718 emit_insn (gen_rtx_SET (VOIDmode, e0,
38719 gen_rtx_MULT (mode, x0, e0)));
38720
38721 /* e1 = x0 + x0 */
38722 emit_insn (gen_rtx_SET (VOIDmode, e1,
38723 gen_rtx_PLUS (mode, x0, x0)));
38724
38725 /* x1 = e1 - e0 */
38726 emit_insn (gen_rtx_SET (VOIDmode, x1,
38727 gen_rtx_MINUS (mode, e1, e0)));
38728
38729 /* res = a * x1 */
38730 emit_insn (gen_rtx_SET (VOIDmode, res,
38731 gen_rtx_MULT (mode, a, x1)));
38732 }
38733
38734 /* Output code to perform a Newton-Raphson approximation of a
38735    single precision floating point [reciprocal] square root. */
38736
38737 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
38738 bool recip)
38739 {
38740 rtx x0, e0, e1, e2, e3, mthree, mhalf;
38741 REAL_VALUE_TYPE r;
38742
38743 x0 = gen_reg_rtx (mode);
38744 e0 = gen_reg_rtx (mode);
38745 e1 = gen_reg_rtx (mode);
38746 e2 = gen_reg_rtx (mode);
38747 e3 = gen_reg_rtx (mode);
38748
38749 real_from_integer (&r, VOIDmode, -3, -1, 0);
38750 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38751
38752 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
38753 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38754
38755 if (VECTOR_MODE_P (mode))
38756 {
38757 mthree = ix86_build_const_vector (mode, true, mthree);
38758 mhalf = ix86_build_const_vector (mode, true, mhalf);
38759 }
38760
38761 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
38762 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
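  /* Both forms are one Newton-Raphson step on the hardware estimate
     x0 ~= 1/sqrt(a):  rsqrt(a) ~= 0.5 * x0 * (3.0 - a * x0 * x0), with the
     signs folded into the -3.0 and -0.5 constants built above, and
     sqrt(a) recovered as a * rsqrt(a).  */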
38763
38764 a = force_reg (mode, a);
38765
38766 /* x0 = rsqrt(a) estimate */
38767 emit_insn (gen_rtx_SET (VOIDmode, x0,
38768 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
38769 UNSPEC_RSQRT)));
38770
38771   /* If a == 0.0, zero out the infinite rsqrt estimate so that sqrt (0.0) gives 0.0 instead of NaN.  */
38772 if (!recip)
38773 {
38774 rtx zero, mask;
38775
38776 zero = gen_reg_rtx (mode);
38777 mask = gen_reg_rtx (mode);
38778
38779 zero = force_reg (mode, CONST0_RTX(mode));
38780 emit_insn (gen_rtx_SET (VOIDmode, mask,
38781 gen_rtx_NE (mode, zero, a)));
38782
38783 emit_insn (gen_rtx_SET (VOIDmode, x0,
38784 gen_rtx_AND (mode, x0, mask)));
38785 }
38786
38787 /* e0 = x0 * a */
38788 emit_insn (gen_rtx_SET (VOIDmode, e0,
38789 gen_rtx_MULT (mode, x0, a)));
38790 /* e1 = e0 * x0 */
38791 emit_insn (gen_rtx_SET (VOIDmode, e1,
38792 gen_rtx_MULT (mode, e0, x0)));
38793
38794 /* e2 = e1 - 3. */
38795 mthree = force_reg (mode, mthree);
38796 emit_insn (gen_rtx_SET (VOIDmode, e2,
38797 gen_rtx_PLUS (mode, e1, mthree)));
38798
38799 mhalf = force_reg (mode, mhalf);
38800 if (recip)
38801 /* e3 = -.5 * x0 */
38802 emit_insn (gen_rtx_SET (VOIDmode, e3,
38803 gen_rtx_MULT (mode, x0, mhalf)));
38804 else
38805 /* e3 = -.5 * e0 */
38806 emit_insn (gen_rtx_SET (VOIDmode, e3,
38807 gen_rtx_MULT (mode, e0, mhalf)));
38808 /* ret = e2 * e3 */
38809 emit_insn (gen_rtx_SET (VOIDmode, res,
38810 gen_rtx_MULT (mode, e2, e3)));
38811 }
38812
38813 #ifdef TARGET_SOLARIS
38814 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
38815
38816 static void
38817 i386_solaris_elf_named_section (const char *name, unsigned int flags,
38818 tree decl)
38819 {
38820 /* With Binutils 2.15, the "@unwind" marker must be specified on
38821 every occurrence of the ".eh_frame" section, not just the first
38822 one. */
38823 if (TARGET_64BIT
38824 && strcmp (name, ".eh_frame") == 0)
38825 {
38826 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
38827 flags & SECTION_WRITE ? "aw" : "a");
38828 return;
38829 }
38830
38831 #ifndef USE_GAS
38832 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
38833 {
38834 solaris_elf_asm_comdat_section (name, flags, decl);
38835 return;
38836 }
38837 #endif
38838
38839 default_elf_asm_named_section (name, flags, decl);
38840 }
38841 #endif /* TARGET_SOLARIS */
38842
38843 /* Return the mangling of TYPE if it is an extended fundamental type. */
38844
38845 static const char *
38846 ix86_mangle_type (const_tree type)
38847 {
38848 type = TYPE_MAIN_VARIANT (type);
38849
38850 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
38851 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
38852 return NULL;
38853
38854 switch (TYPE_MODE (type))
38855 {
38856 case TFmode:
38857 /* __float128 is "g". */
38858 return "g";
38859 case XFmode:
38860 /* "long double" or __float80 is "e". */
38861 return "e";
38862 default:
38863 return NULL;
38864 }
38865 }
38866
38867 /* For 32-bit code we can save PIC register setup by using the
38868    __stack_chk_fail_local hidden function instead of calling
38869    __stack_chk_fail directly.  64-bit code doesn't need to set up any PIC
38870    register, so it is better to call __stack_chk_fail directly.  */
38871
38872 static tree ATTRIBUTE_UNUSED
38873 ix86_stack_protect_fail (void)
38874 {
38875 return TARGET_64BIT
38876 ? default_external_stack_protect_fail ()
38877 : default_hidden_stack_protect_fail ();
38878 }
38879
38880 /* Select a format to encode pointers in exception handling data. CODE
38881 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
38882 true if the symbol may be affected by dynamic relocations.
38883
38884 ??? All x86 object file formats are capable of representing this.
38885 After all, the relocation needed is the same as for the call insn.
38886 Whether or not a particular assembler allows us to enter such, I
38887 guess we'll have to see. */
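/* In short: with -fpic we use pc-relative encodings (sdata4, or sdata8 where
   the 64-bit medium/large PIC models may need a full 64-bit offset), adding
   DW_EH_PE_indirect when the symbol may be dynamically relocated; without PIC,
   the small model (and the medium model for code labels) fits udata4, and
   everything else falls back to an absolute pointer.  */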
38888 int
38889 asm_preferred_eh_data_format (int code, int global)
38890 {
38891 if (flag_pic)
38892 {
38893 int type = DW_EH_PE_sdata8;
38894 if (!TARGET_64BIT
38895 || ix86_cmodel == CM_SMALL_PIC
38896 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
38897 type = DW_EH_PE_sdata4;
38898 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
38899 }
38900 if (ix86_cmodel == CM_SMALL
38901 || (ix86_cmodel == CM_MEDIUM && code))
38902 return DW_EH_PE_udata4;
38903 return DW_EH_PE_absptr;
38904 }
38905 \f
38906 /* Copy the sign bit of SIGN onto the (positive) value ABS_VALUE, storing
38907    the result in RESULT.  If MASK is non-null, it is a mask that masks out
38908    the sign bit, as produced by ix86_expand_sse_fabs.  */
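/* For scalar SFmode this amounts to
     result = abs_value | (sign & 0x80000000);
   and analogously with the DFmode and vector sign-bit masks.  */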
38909 static void
38910 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
38911 {
38912 enum machine_mode mode = GET_MODE (sign);
38913 rtx sgn = gen_reg_rtx (mode);
38914 if (mask == NULL_RTX)
38915 {
38916 enum machine_mode vmode;
38917
38918 if (mode == SFmode)
38919 vmode = V4SFmode;
38920 else if (mode == DFmode)
38921 vmode = V2DFmode;
38922 else
38923 vmode = mode;
38924
38925 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
38926 if (!VECTOR_MODE_P (mode))
38927 {
38928 /* We need to generate a scalar mode mask in this case. */
38929 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38930 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38931 mask = gen_reg_rtx (mode);
38932 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38933 }
38934 }
38935 else
38936 mask = gen_rtx_NOT (mode, mask);
38937 emit_insn (gen_rtx_SET (VOIDmode, sgn,
38938 gen_rtx_AND (mode, mask, sign)));
38939 emit_insn (gen_rtx_SET (VOIDmode, result,
38940 gen_rtx_IOR (mode, abs_value, sgn)));
38941 }
38942
38943 /* Expand fabs (OP0) and return a new rtx that holds the result. The
38944 mask for masking out the sign-bit is stored in *SMASK, if that is
38945 non-null. */
38946 static rtx
38947 ix86_expand_sse_fabs (rtx op0, rtx *smask)
38948 {
38949 enum machine_mode vmode, mode = GET_MODE (op0);
38950 rtx xa, mask;
38951
38952 xa = gen_reg_rtx (mode);
38953 if (mode == SFmode)
38954 vmode = V4SFmode;
38955 else if (mode == DFmode)
38956 vmode = V2DFmode;
38957 else
38958 vmode = mode;
38959 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
38960 if (!VECTOR_MODE_P (mode))
38961 {
38962 /* We need to generate a scalar mode mask in this case. */
38963 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38964 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38965 mask = gen_reg_rtx (mode);
38966 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38967 }
38968 emit_insn (gen_rtx_SET (VOIDmode, xa,
38969 gen_rtx_AND (mode, op0, mask)));
38970
38971 if (smask)
38972 *smask = mask;
38973
38974 return xa;
38975 }
38976
38977 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
38978 swapping the operands if SWAP_OPERANDS is true. The expanded
38979 code is a forward jump to a newly created label in case the
38980 comparison is true. The generated label rtx is returned. */
38981 static rtx
38982 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
38983 bool swap_operands)
38984 {
38985 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
38986 rtx label, tmp;
38987
38988 if (swap_operands)
38989 {
38990 tmp = op0;
38991 op0 = op1;
38992 op1 = tmp;
38993 }
38994
38995 label = gen_label_rtx ();
38996 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
38997 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38998 gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
38999 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
39000 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
39001 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
39002 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
39003 JUMP_LABEL (tmp) = label;
39004
39005 return label;
39006 }
39007
39008 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
39009 using comparison code CODE. Operands are swapped for the comparison if
39010 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
39011 static rtx
39012 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
39013 bool swap_operands)
39014 {
39015 rtx (*insn)(rtx, rtx, rtx, rtx);
39016 enum machine_mode mode = GET_MODE (op0);
39017 rtx mask = gen_reg_rtx (mode);
39018
39019 if (swap_operands)
39020 {
39021 rtx tmp = op0;
39022 op0 = op1;
39023 op1 = tmp;
39024 }
39025
39026 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
39027
39028 emit_insn (insn (mask, op0, op1,
39029 gen_rtx_fmt_ee (code, mode, op0, op1)));
39030 return mask;
39031 }
39032
39033 /* Generate and return a rtx of mode MODE for 2**n where n is the number
39034 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
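/* Adding and then subtracting this constant from a nonnegative value smaller
   than 2**n rounds it to an integer in the current rounding mode, because the
   discarded fraction bits cannot be represented in the mantissa of the
   intermediate sum.  This is the trick used by the expanders below.  */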
39035 static rtx
39036 ix86_gen_TWO52 (enum machine_mode mode)
39037 {
39038 REAL_VALUE_TYPE TWO52r;
39039 rtx TWO52;
39040
39041 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
39042 TWO52 = const_double_from_real_value (TWO52r, mode);
39043 TWO52 = force_reg (mode, TWO52);
39044
39045 return TWO52;
39046 }
39047
39048 /* Expand SSE sequence for computing lround from OP1 storing
39049 into OP0. */
39050 void
39051 ix86_expand_lround (rtx op0, rtx op1)
39052 {
39053 /* C code for the stuff we're doing below:
39054 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
39055 return (long)tmp;
39056 */
39057 enum machine_mode mode = GET_MODE (op1);
39058 const struct real_format *fmt;
39059 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39060 rtx adj;
39061
39062 /* load nextafter (0.5, 0.0) */
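  /* Using the largest value strictly below 0.5 rather than 0.5 itself keeps
     inputs just under 0.5 (for which x + 0.5 would round up to 1.0) from
     being rounded away from zero; with the default round-to-nearest mode,
     0.5 itself still rounds to 1.  */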
39063 fmt = REAL_MODE_FORMAT (mode);
39064 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39065 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39066
39067 /* adj = copysign (0.5, op1) */
39068 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
39069 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
39070
39071 /* adj = op1 + adj */
39072 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
39073
39074 /* op0 = (imode)adj */
39075 expand_fix (op0, adj, 0);
39076 }
39077
39078 /* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
39079    into OP0.  */
39080 void
39081 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
39082 {
39083 /* C code for the stuff we're doing below (for do_floor):
39084 xi = (long)op1;
39085 xi -= (double)xi > op1 ? 1 : 0;
39086 return xi;
39087 */
39088 enum machine_mode fmode = GET_MODE (op1);
39089 enum machine_mode imode = GET_MODE (op0);
39090 rtx ireg, freg, label, tmp;
39091
39092 /* reg = (long)op1 */
39093 ireg = gen_reg_rtx (imode);
39094 expand_fix (ireg, op1, 0);
39095
39096 /* freg = (double)reg */
39097 freg = gen_reg_rtx (fmode);
39098 expand_float (freg, ireg, 0);
39099
39100 /* ireg = (freg > op1) ? ireg - 1 : ireg */
39101 label = ix86_expand_sse_compare_and_jump (UNLE,
39102 freg, op1, !do_floor);
39103 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
39104 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
39105 emit_move_insn (ireg, tmp);
39106
39107 emit_label (label);
39108 LABEL_NUSES (label) = 1;
39109
39110 emit_move_insn (op0, ireg);
39111 }
39112
39113 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
39114 result in OPERAND0. */
39115 void
39116 ix86_expand_rint (rtx operand0, rtx operand1)
39117 {
39118 /* C code for the stuff we're doing below:
39119 xa = fabs (operand1);
39120 if (!isless (xa, 2**52))
39121 return operand1;
39122 xa = xa + 2**52 - 2**52;
39123 return copysign (xa, operand1);
39124 */
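  /* Values too large to have a fraction (|x| >= 2**52 for DFmode, 2**23 for
     SFmode) are returned unchanged; otherwise the TWO52 add/subtract pair
     rounds in the current rounding mode, which is exactly rint's job.  */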
39125 enum machine_mode mode = GET_MODE (operand0);
39126 rtx res, xa, label, TWO52, mask;
39127
39128 res = gen_reg_rtx (mode);
39129 emit_move_insn (res, operand1);
39130
39131 /* xa = abs (operand1) */
39132 xa = ix86_expand_sse_fabs (res, &mask);
39133
39134 /* if (!isless (xa, TWO52)) goto label; */
39135 TWO52 = ix86_gen_TWO52 (mode);
39136 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39137
39138 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39139 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
39140
39141 ix86_sse_copysign_to_positive (res, xa, res, mask);
39142
39143 emit_label (label);
39144 LABEL_NUSES (label) = 1;
39145
39146 emit_move_insn (operand0, res);
39147 }
39148
39149 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
39150    into OPERAND0, without relying on the 64-bit-only cvttsd2siq truncation.  */
39151 void
39152 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
39153 {
39154 /* C code for the stuff we expand below.
39155 double xa = fabs (x), x2;
39156 if (!isless (xa, TWO52))
39157 return x;
39158 xa = xa + TWO52 - TWO52;
39159 x2 = copysign (xa, x);
39160 Compensate. Floor:
39161 if (x2 > x)
39162 x2 -= 1;
39163 Compensate. Ceil:
39164 if (x2 < x)
39165 x2 -= -1;
39166 return x2;
39167 */
39168 enum machine_mode mode = GET_MODE (operand0);
39169 rtx xa, TWO52, tmp, label, one, res, mask;
39170
39171 TWO52 = ix86_gen_TWO52 (mode);
39172
39173 /* Temporary for holding the result, initialized to the input
39174 operand to ease control flow. */
39175 res = gen_reg_rtx (mode);
39176 emit_move_insn (res, operand1);
39177
39178 /* xa = abs (operand1) */
39179 xa = ix86_expand_sse_fabs (res, &mask);
39180
39181 /* if (!isless (xa, TWO52)) goto label; */
39182 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39183
39184 /* xa = xa + TWO52 - TWO52; */
39185 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39186 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
39187
39188 /* xa = copysign (xa, operand1) */
39189 ix86_sse_copysign_to_positive (xa, xa, res, mask);
39190
39191 /* generate 1.0 or -1.0 */
39192 one = force_reg (mode,
39193 const_double_from_real_value (do_floor
39194 ? dconst1 : dconstm1, mode));
39195
39196 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
39197 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
39198 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39199 gen_rtx_AND (mode, one, tmp)));
39200 /* We always need to subtract here to preserve signed zero. */
39201 tmp = expand_simple_binop (mode, MINUS,
39202 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39203 emit_move_insn (res, tmp);
39204
39205 emit_label (label);
39206 LABEL_NUSES (label) = 1;
39207
39208 emit_move_insn (operand0, res);
39209 }
39210
39211 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
39212 into OPERAND0. */
39213 void
39214 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
39215 {
39216 /* C code for the stuff we expand below.
39217 double xa = fabs (x), x2;
39218 if (!isless (xa, TWO52))
39219 return x;
39220 x2 = (double)(long)x;
39221 Compensate. Floor:
39222 if (x2 > x)
39223 x2 -= 1;
39224 Compensate. Ceil:
39225 if (x2 < x)
39226 x2 += 1;
39227 if (HONOR_SIGNED_ZEROS (mode))
39228 return copysign (x2, x);
39229 return x2;
39230 */
39231 enum machine_mode mode = GET_MODE (operand0);
39232 rtx xa, xi, TWO52, tmp, label, one, res, mask;
39233
39234 TWO52 = ix86_gen_TWO52 (mode);
39235
39236 /* Temporary for holding the result, initialized to the input
39237 operand to ease control flow. */
39238 res = gen_reg_rtx (mode);
39239 emit_move_insn (res, operand1);
39240
39241 /* xa = abs (operand1) */
39242 xa = ix86_expand_sse_fabs (res, &mask);
39243
39244 /* if (!isless (xa, TWO52)) goto label; */
39245 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39246
39247 /* xa = (double)(long)x */
39248 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39249 expand_fix (xi, res, 0);
39250 expand_float (xa, xi, 0);
39251
39252 /* generate 1.0 */
39253 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
39254
39255 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
39256 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
39257 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39258 gen_rtx_AND (mode, one, tmp)));
39259 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
39260 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39261 emit_move_insn (res, tmp);
39262
39263 if (HONOR_SIGNED_ZEROS (mode))
39264 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
39265
39266 emit_label (label);
39267 LABEL_NUSES (label) = 1;
39268
39269 emit_move_insn (operand0, res);
39270 }
39271
39272 /* Expand SSE sequence for computing round from OPERAND1 storing
39273    into OPERAND0.  Sequence that works without relying on DImode truncation
39274    via cvttsd2siq, which is only available on 64-bit targets.  */
39275 void
39276 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
39277 {
39278 /* C code for the stuff we expand below.
39279 double xa = fabs (x), xa2, x2;
39280 if (!isless (xa, TWO52))
39281 return x;
39282 Using the absolute value and copying back sign makes
39283 -0.0 -> -0.0 correct.
39284 xa2 = xa + TWO52 - TWO52;
39285 Compensate.
39286 dxa = xa2 - xa;
39287 if (dxa <= -0.5)
39288 xa2 += 1;
39289 else if (dxa > 0.5)
39290 xa2 -= 1;
39291 x2 = copysign (xa2, x);
39292 return x2;
39293 */
39294 enum machine_mode mode = GET_MODE (operand0);
39295 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
39296
39297 TWO52 = ix86_gen_TWO52 (mode);
39298
39299 /* Temporary for holding the result, initialized to the input
39300 operand to ease control flow. */
39301 res = gen_reg_rtx (mode);
39302 emit_move_insn (res, operand1);
39303
39304 /* xa = abs (operand1) */
39305 xa = ix86_expand_sse_fabs (res, &mask);
39306
39307 /* if (!isless (xa, TWO52)) goto label; */
39308 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39309
39310 /* xa2 = xa + TWO52 - TWO52; */
39311 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39312 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
39313
39314 /* dxa = xa2 - xa; */
39315 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
39316
39317 /* generate 0.5, 1.0 and -0.5 */
39318 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
39319 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
39320 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
39321 0, OPTAB_DIRECT);
39322
39323 /* Compensate. */
39324 tmp = gen_reg_rtx (mode);
39325 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
39326 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
39327 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39328 gen_rtx_AND (mode, one, tmp)));
39329 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39330 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
39331 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
39332 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39333 gen_rtx_AND (mode, one, tmp)));
39334 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39335
39336 /* res = copysign (xa2, operand1) */
39337 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
39338
39339 emit_label (label);
39340 LABEL_NUSES (label) = 1;
39341
39342 emit_move_insn (operand0, res);
39343 }
39344
39345 /* Expand SSE sequence for computing trunc from OPERAND1 storing
39346 into OPERAND0. */
39347 void
39348 ix86_expand_trunc (rtx operand0, rtx operand1)
39349 {
39350 /* C code for SSE variant we expand below.
39351 double xa = fabs (x), x2;
39352 if (!isless (xa, TWO52))
39353 return x;
39354 x2 = (double)(long)x;
39355 if (HONOR_SIGNED_ZEROS (mode))
39356 return copysign (x2, x);
39357 return x2;
39358 */
39359 enum machine_mode mode = GET_MODE (operand0);
39360 rtx xa, xi, TWO52, label, res, mask;
39361
39362 TWO52 = ix86_gen_TWO52 (mode);
39363
39364 /* Temporary for holding the result, initialized to the input
39365 operand to ease control flow. */
39366 res = gen_reg_rtx (mode);
39367 emit_move_insn (res, operand1);
39368
39369 /* xa = abs (operand1) */
39370 xa = ix86_expand_sse_fabs (res, &mask);
39371
39372 /* if (!isless (xa, TWO52)) goto label; */
39373 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39374
39375 /* x = (double)(long)x */
39376 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39377 expand_fix (xi, res, 0);
39378 expand_float (res, xi, 0);
39379
39380 if (HONOR_SIGNED_ZEROS (mode))
39381 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
39382
39383 emit_label (label);
39384 LABEL_NUSES (label) = 1;
39385
39386 emit_move_insn (operand0, res);
39387 }
39388
39389 /* Expand SSE sequence for computing trunc from OPERAND1 storing
39390 into OPERAND0. */
39391 void
39392 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
39393 {
39394 enum machine_mode mode = GET_MODE (operand0);
39395 rtx xa, mask, TWO52, label, one, res, smask, tmp;
39396
39397 /* C code for SSE variant we expand below.
39398 double xa = fabs (x), x2;
39399 if (!isless (xa, TWO52))
39400 return x;
39401 xa2 = xa + TWO52 - TWO52;
39402 Compensate:
39403 if (xa2 > xa)
39404 xa2 -= 1.0;
39405 x2 = copysign (xa2, x);
39406 return x2;
39407 */
39408
39409 TWO52 = ix86_gen_TWO52 (mode);
39410
39411 /* Temporary for holding the result, initialized to the input
39412 operand to ease control flow. */
39413 res = gen_reg_rtx (mode);
39414 emit_move_insn (res, operand1);
39415
39416 /* xa = abs (operand1) */
39417 xa = ix86_expand_sse_fabs (res, &smask);
39418
39419 /* if (!isless (xa, TWO52)) goto label; */
39420 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39421
39422 /* res = xa + TWO52 - TWO52; */
39423 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39424 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
39425 emit_move_insn (res, tmp);
39426
39427 /* generate 1.0 */
39428 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
39429
39430 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
39431 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
39432 emit_insn (gen_rtx_SET (VOIDmode, mask,
39433 gen_rtx_AND (mode, mask, one)));
39434 tmp = expand_simple_binop (mode, MINUS,
39435 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
39436 emit_move_insn (res, tmp);
39437
39438 /* res = copysign (res, operand1) */
39439 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
39440
39441 emit_label (label);
39442 LABEL_NUSES (label) = 1;
39443
39444 emit_move_insn (operand0, res);
39445 }
39446
39447 /* Expand SSE sequence for computing round from OPERAND1 storing
39448 into OPERAND0. */
39449 void
39450 ix86_expand_round (rtx operand0, rtx operand1)
39451 {
39452 /* C code for the stuff we're doing below:
39453 double xa = fabs (x);
39454 if (!isless (xa, TWO52))
39455 return x;
39456 xa = (double)(long)(xa + nextafter (0.5, 0.0));
39457 return copysign (xa, x);
39458 */
39459 enum machine_mode mode = GET_MODE (operand0);
39460 rtx res, TWO52, xa, label, xi, half, mask;
39461 const struct real_format *fmt;
39462 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39463
39464 /* Temporary for holding the result, initialized to the input
39465 operand to ease control flow. */
39466 res = gen_reg_rtx (mode);
39467 emit_move_insn (res, operand1);
39468
39469 TWO52 = ix86_gen_TWO52 (mode);
39470 xa = ix86_expand_sse_fabs (res, &mask);
39471 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39472
39473 /* load nextafter (0.5, 0.0) */
39474 fmt = REAL_MODE_FORMAT (mode);
39475 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39476 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39477
39478 /* xa = xa + 0.5 */
39479 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
39480 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
39481
39482 /* xa = (double)(int64_t)xa */
39483 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39484 expand_fix (xi, xa, 0);
39485 expand_float (xa, xi, 0);
39486
39487 /* res = copysign (xa, operand1) */
39488 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
39489
39490 emit_label (label);
39491 LABEL_NUSES (label) = 1;
39492
39493 emit_move_insn (operand0, res);
39494 }
39495
39496 /* Expand SSE sequence for computing round
39497    from OP1 storing into OP0 using the SSE4.1 round insn.  */
39498 void
39499 ix86_expand_round_sse4 (rtx op0, rtx op1)
39500 {
39501 enum machine_mode mode = GET_MODE (op0);
39502 rtx e1, e2, res, half;
39503 const struct real_format *fmt;
39504 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39505 rtx (*gen_copysign) (rtx, rtx, rtx);
39506 rtx (*gen_round) (rtx, rtx, rtx);
39507
39508 switch (mode)
39509 {
39510 case SFmode:
39511 gen_copysign = gen_copysignsf3;
39512 gen_round = gen_sse4_1_roundsf2;
39513 break;
39514 case DFmode:
39515 gen_copysign = gen_copysigndf3;
39516 gen_round = gen_sse4_1_rounddf2;
39517 break;
39518 default:
39519 gcc_unreachable ();
39520 }
39521
39522 /* round (a) = trunc (a + copysign (0.5, a)) */
39523
39524 /* load nextafter (0.5, 0.0) */
39525 fmt = REAL_MODE_FORMAT (mode);
39526 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39527 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39528 half = const_double_from_real_value (pred_half, mode);
39529
39530 /* e1 = copysign (0.5, op1) */
39531 e1 = gen_reg_rtx (mode);
39532 emit_insn (gen_copysign (e1, half, op1));
39533
39534 /* e2 = op1 + e1 */
39535 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
39536
39537 /* res = trunc (e2) */
39538 res = gen_reg_rtx (mode);
39539 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
39540
39541 emit_move_insn (op0, res);
39542 }
39543 \f
39544
39545 /* Table of valid machine attributes. */
39546 static const struct attribute_spec ix86_attribute_table[] =
39547 {
39548 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
39549 affects_type_identity } */
39550 /* Stdcall attribute says callee is responsible for popping arguments
39551 if they are not variable. */
39552 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39553 true },
39554 /* Fastcall attribute says callee is responsible for popping arguments
39555 if they are not variable. */
39556 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39557 true },
39558 /* Thiscall attribute says callee is responsible for popping arguments
39559 if they are not variable. */
39560 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39561 true },
39562 /* Cdecl attribute says the callee is a normal C declaration */
39563 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39564 true },
39565 /* Regparm attribute specifies how many integer arguments are to be
39566 passed in registers. */
39567 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
39568 true },
39569 /* Sseregparm attribute says we are using x86_64 calling conventions
39570 for FP arguments. */
39571 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39572 true },
39573 /* The transactional memory builtins are implicitly regparm or fastcall
39574 depending on the ABI. Override the generic do-nothing attribute that
39575 these builtins were declared with. */
39576 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
39577 true },
39578 /* force_align_arg_pointer says this function realigns the stack at entry. */
39579 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
39580 false, true, true, ix86_handle_cconv_attribute, false },
39581 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39582 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
39583 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
39584 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
39585 false },
39586 #endif
39587 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
39588 false },
39589 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
39590 false },
39591 #ifdef SUBTARGET_ATTRIBUTE_TABLE
39592 SUBTARGET_ATTRIBUTE_TABLE,
39593 #endif
39594 /* ms_abi and sysv_abi calling convention function attributes. */
39595 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
39596 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
39597 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
39598 false },
39599 { "callee_pop_aggregate_return", 1, 1, false, true, true,
39600 ix86_handle_callee_pop_aggregate_return, true },
39601 /* End element. */
39602 { NULL, 0, 0, false, false, false, NULL, false }
39603 };
39604
39605 /* Implement targetm.vectorize.builtin_vectorization_cost. */
39606 static int
39607 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
39608 tree vectype,
39609 int misalign ATTRIBUTE_UNUSED)
39610 {
39611 unsigned elements;
39612
39613 switch (type_of_cost)
39614 {
39615 case scalar_stmt:
39616 return ix86_cost->scalar_stmt_cost;
39617
39618 case scalar_load:
39619 return ix86_cost->scalar_load_cost;
39620
39621 case scalar_store:
39622 return ix86_cost->scalar_store_cost;
39623
39624 case vector_stmt:
39625 return ix86_cost->vec_stmt_cost;
39626
39627 case vector_load:
39628 return ix86_cost->vec_align_load_cost;
39629
39630 case vector_store:
39631 return ix86_cost->vec_store_cost;
39632
39633 case vec_to_scalar:
39634 return ix86_cost->vec_to_scalar_cost;
39635
39636 case scalar_to_vec:
39637 return ix86_cost->scalar_to_vec_cost;
39638
39639 case unaligned_load:
39640 case unaligned_store:
39641 return ix86_cost->vec_unalign_load_cost;
39642
39643 case cond_branch_taken:
39644 return ix86_cost->cond_taken_branch_cost;
39645
39646 case cond_branch_not_taken:
39647 return ix86_cost->cond_not_taken_branch_cost;
39648
39649 case vec_perm:
39650 case vec_promote_demote:
39651 return ix86_cost->vec_stmt_cost;
39652
39653 case vec_construct:
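      /* Building a vector from scalar elements: charge roughly one
	 statement per pair of elements, plus one.  */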
39654 elements = TYPE_VECTOR_SUBPARTS (vectype);
39655 return elements / 2 + 1;
39656
39657 default:
39658 gcc_unreachable ();
39659 }
39660 }
39661
39662 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
39663 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
39664 insn every time. */
39665
39666 static GTY(()) rtx vselect_insn;
39667
39668 /* Initialize vselect_insn. */
39669
39670 static void
39671 init_vselect_insn (void)
39672 {
39673 unsigned i;
39674 rtx x;
39675
39676 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
39677 for (i = 0; i < MAX_VECT_LEN; ++i)
39678 XVECEXP (x, 0, i) = const0_rtx;
39679 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
39680 const0_rtx), x);
39681 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
39682 start_sequence ();
39683 vselect_insn = emit_insn (x);
39684 end_sequence ();
39685 }
39686
39687 /* Construct (set target (vec_select op0 (parallel perm))) and
39688 return true if that's a valid instruction in the active ISA. */
39689
39690 static bool
39691 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
39692 unsigned nelt, bool testing_p)
39693 {
39694 unsigned int i;
39695 rtx x, save_vconcat;
39696 int icode;
39697
39698 if (vselect_insn == NULL_RTX)
39699 init_vselect_insn ();
39700
39701 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
39702 PUT_NUM_ELEM (XVEC (x, 0), nelt);
39703 for (i = 0; i < nelt; ++i)
39704 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
39705 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39706 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
39707 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
39708 SET_DEST (PATTERN (vselect_insn)) = target;
39709 icode = recog_memoized (vselect_insn);
39710
39711 if (icode >= 0 && !testing_p)
39712 emit_insn (copy_rtx (PATTERN (vselect_insn)));
39713
39714 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
39715 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
39716 INSN_CODE (vselect_insn) = -1;
39717
39718 return icode >= 0;
39719 }
39720
39721 /* Similar, but generate a vec_concat from op0 and op1 as well. */
39722
39723 static bool
39724 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
39725 const unsigned char *perm, unsigned nelt,
39726 bool testing_p)
39727 {
39728 enum machine_mode v2mode;
39729 rtx x;
39730 bool ok;
39731
39732 if (vselect_insn == NULL_RTX)
39733 init_vselect_insn ();
39734
39735 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
39736 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39737 PUT_MODE (x, v2mode);
39738 XEXP (x, 0) = op0;
39739 XEXP (x, 1) = op1;
39740 ok = expand_vselect (target, x, perm, nelt, testing_p);
39741 XEXP (x, 0) = const0_rtx;
39742 XEXP (x, 1) = const0_rtx;
39743 return ok;
39744 }
39745
39746 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39747 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
39748
39749 static bool
39750 expand_vec_perm_blend (struct expand_vec_perm_d *d)
39751 {
39752 enum machine_mode vmode = d->vmode;
39753 unsigned i, mask, nelt = d->nelt;
39754 rtx target, op0, op1, x;
39755 rtx rperm[32], vperm;
39756
39757 if (d->one_operand_p)
39758 return false;
39759 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
39760 ;
39761 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
39762 ;
39763 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
39764 ;
39765 else
39766 return false;
39767
39768 /* This is a blend, not a permute. Elements must stay in their
39769 respective lanes. */
39770 for (i = 0; i < nelt; ++i)
39771 {
39772 unsigned e = d->perm[i];
39773 if (!(e == i || e == i + nelt))
39774 return false;
39775 }
39776
39777 if (d->testing_p)
39778 return true;
39779
39780 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
39781 decision should be extracted elsewhere, so that we only try that
39782 sequence once all budget==3 options have been tried. */
39783 target = d->target;
39784 op0 = d->op0;
39785 op1 = d->op1;
39786 mask = 0;
39787
39788 switch (vmode)
39789 {
39790 case V4DFmode:
39791 case V8SFmode:
39792 case V2DFmode:
39793 case V4SFmode:
39794 case V8HImode:
39795 case V8SImode:
39796 for (i = 0; i < nelt; ++i)
39797 mask |= (d->perm[i] >= nelt) << i;
39798 break;
39799
39800 case V2DImode:
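      /* There is no 64-bit integer blend with an immediate; use pblendw,
	 with each DImode element covering a group of four word positions
	 in the mask.  */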
39801 for (i = 0; i < 2; ++i)
39802 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
39803 vmode = V8HImode;
39804 goto do_subreg;
39805
39806 case V4SImode:
39807 for (i = 0; i < 4; ++i)
39808 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39809 vmode = V8HImode;
39810 goto do_subreg;
39811
39812 case V16QImode:
39813 /* See if bytes move in pairs so we can use pblendw with
39814 an immediate argument, rather than pblendvb with a vector
39815 argument. */
39816 for (i = 0; i < 16; i += 2)
39817 if (d->perm[i] + 1 != d->perm[i + 1])
39818 {
39819 use_pblendvb:
39820 for (i = 0; i < nelt; ++i)
39821 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
39822
39823 finish_pblendvb:
39824 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
39825 vperm = force_reg (vmode, vperm);
39826
39827 if (GET_MODE_SIZE (vmode) == 16)
39828 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
39829 else
39830 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
39831 if (target != d->target)
39832 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39833 return true;
39834 }
39835
39836 for (i = 0; i < 8; ++i)
39837 mask |= (d->perm[i * 2] >= 16) << i;
39838 vmode = V8HImode;
39839 /* FALLTHRU */
39840
39841 do_subreg:
39842 target = gen_reg_rtx (vmode);
39843 op0 = gen_lowpart (vmode, op0);
39844 op1 = gen_lowpart (vmode, op1);
39845 break;
39846
39847 case V32QImode:
39848 /* See if bytes move in pairs. If not, vpblendvb must be used. */
39849 for (i = 0; i < 32; i += 2)
39850 if (d->perm[i] + 1 != d->perm[i + 1])
39851 goto use_pblendvb;
39852 /* See if bytes move in quadruplets. If yes, vpblendd
39853 with immediate can be used. */
39854 for (i = 0; i < 32; i += 4)
39855 if (d->perm[i] + 2 != d->perm[i + 2])
39856 break;
39857 if (i < 32)
39858 {
39859 /* See if bytes move the same in both lanes. If yes,
39860 vpblendw with immediate can be used. */
39861 for (i = 0; i < 16; i += 2)
39862 if (d->perm[i] + 16 != d->perm[i + 16])
39863 goto use_pblendvb;
39864
39865 /* Use vpblendw. */
39866 for (i = 0; i < 16; ++i)
39867 mask |= (d->perm[i * 2] >= 32) << i;
39868 vmode = V16HImode;
39869 goto do_subreg;
39870 }
39871
39872 /* Use vpblendd. */
39873 for (i = 0; i < 8; ++i)
39874 mask |= (d->perm[i * 4] >= 32) << i;
39875 vmode = V8SImode;
39876 goto do_subreg;
39877
39878 case V16HImode:
39879 /* See if words move in pairs. If yes, vpblendd can be used. */
39880 for (i = 0; i < 16; i += 2)
39881 if (d->perm[i] + 1 != d->perm[i + 1])
39882 break;
39883 if (i < 16)
39884 {
39885 /* See if words move the same in both lanes. If not,
39886 vpblendvb must be used. */
39887 for (i = 0; i < 8; i++)
39888 if (d->perm[i] + 8 != d->perm[i + 8])
39889 {
39890 /* Use vpblendvb. */
39891 for (i = 0; i < 32; ++i)
39892 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
39893
39894 vmode = V32QImode;
39895 nelt = 32;
39896 target = gen_reg_rtx (vmode);
39897 op0 = gen_lowpart (vmode, op0);
39898 op1 = gen_lowpart (vmode, op1);
39899 goto finish_pblendvb;
39900 }
39901
39902 /* Use vpblendw. */
39903 for (i = 0; i < 16; ++i)
39904 mask |= (d->perm[i] >= 16) << i;
39905 break;
39906 }
39907
39908 /* Use vpblendd. */
39909 for (i = 0; i < 8; ++i)
39910 mask |= (d->perm[i * 2] >= 16) << i;
39911 vmode = V8SImode;
39912 goto do_subreg;
39913
39914 case V4DImode:
39915 /* Use vpblendd. */
39916 for (i = 0; i < 4; ++i)
39917 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39918 vmode = V8SImode;
39919 goto do_subreg;
39920
39921 default:
39922 gcc_unreachable ();
39923 }
39924
39925 /* This matches five different patterns with the different modes. */
39926 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
39927 x = gen_rtx_SET (VOIDmode, target, x);
39928 emit_insn (x);
39929 if (target != d->target)
39930 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39931
39932 return true;
39933 }
39934
39935 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39936 in terms of the variable form of vpermilps.
39937
39938 Note that we will have already failed the immediate input vpermilps,
39939 which requires that the high and low part shuffle be identical; the
39940 variable form doesn't require that. */
39941
39942 static bool
39943 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
39944 {
39945 rtx rperm[8], vperm;
39946 unsigned i;
39947
39948 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
39949 return false;
39950
39951 /* We can only permute within the 128-bit lane. */
39952 for (i = 0; i < 8; ++i)
39953 {
39954 unsigned e = d->perm[i];
39955 if (i < 4 ? e >= 4 : e < 4)
39956 return false;
39957 }
39958
39959 if (d->testing_p)
39960 return true;
39961
39962 for (i = 0; i < 8; ++i)
39963 {
39964 unsigned e = d->perm[i];
39965
39966 /* Within each 128-bit lane, the elements of op0 are numbered
39967 from 0 and the elements of op1 are numbered from 4. */
39968 if (e >= 8 + 4)
39969 e -= 8;
39970 else if (e >= 4)
39971 e -= 4;
39972
39973 rperm[i] = GEN_INT (e);
39974 }
39975
39976 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
39977 vperm = force_reg (V8SImode, vperm);
39978 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
39979
39980 return true;
39981 }
39982
39983 /* Return true if permutation D can be performed as VMODE permutation
39984 instead. */
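/* For example, a V16QImode permutation { 4,5,6,7, 0,1,2,3, 12,13,14,15,
   8,9,10,11 } only moves aligned 4-byte chunks, so it can equally well be
   done as the V4SImode permutation { 1, 0, 3, 2 }.  */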
39985
39986 static bool
39987 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
39988 {
39989 unsigned int i, j, chunk;
39990
39991 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
39992 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
39993 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
39994 return false;
39995
39996 if (GET_MODE_NUNITS (vmode) >= d->nelt)
39997 return true;
39998
39999 chunk = d->nelt / GET_MODE_NUNITS (vmode);
40000 for (i = 0; i < d->nelt; i += chunk)
40001 if (d->perm[i] & (chunk - 1))
40002 return false;
40003 else
40004 for (j = 1; j < chunk; ++j)
40005 if (d->perm[i] + j != d->perm[i + j])
40006 return false;
40007
40008 return true;
40009 }
40010
40011 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
40012 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
40013
40014 static bool
40015 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
40016 {
40017 unsigned i, nelt, eltsz, mask;
40018 unsigned char perm[32];
40019 enum machine_mode vmode = V16QImode;
40020 rtx rperm[32], vperm, target, op0, op1;
40021
40022 nelt = d->nelt;
40023
40024 if (!d->one_operand_p)
40025 {
40026 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
40027 {
40028 if (TARGET_AVX2
40029 && valid_perm_using_mode_p (V2TImode, d))
40030 {
40031 if (d->testing_p)
40032 return true;
40033
40034 /* Use vperm2i128 insn. The pattern uses
40035 V4DImode instead of V2TImode. */
40036 target = d->target;
40037 if (d->vmode != V4DImode)
40038 target = gen_reg_rtx (V4DImode);
40039 op0 = gen_lowpart (V4DImode, d->op0);
40040 op1 = gen_lowpart (V4DImode, d->op1);
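	      /* The vperm2i128 immediate picks one 128-bit chunk of the
		 concatenated inputs for each half of the result: bits 0-1
		 select the chunk for the low half, bits 4-5 for the high
		 half.  */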
40041 rperm[0]
40042 		= GEN_INT ((d->perm[0] / (nelt / 2))
40043 			   | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
40044 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
40045 if (target != d->target)
40046 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
40047 return true;
40048 }
40049 return false;
40050 }
40051 }
40052 else
40053 {
40054 if (GET_MODE_SIZE (d->vmode) == 16)
40055 {
40056 if (!TARGET_SSSE3)
40057 return false;
40058 }
40059 else if (GET_MODE_SIZE (d->vmode) == 32)
40060 {
40061 if (!TARGET_AVX2)
40062 return false;
40063
40064 /* V4DImode should be already handled through
40065 expand_vselect by vpermq instruction. */
40066 gcc_assert (d->vmode != V4DImode);
40067
40068 vmode = V32QImode;
40069 if (d->vmode == V8SImode
40070 || d->vmode == V16HImode
40071 || d->vmode == V32QImode)
40072 {
40073 /* First see if vpermq can be used for
40074 V8SImode/V16HImode/V32QImode. */
40075 if (valid_perm_using_mode_p (V4DImode, d))
40076 {
40077 for (i = 0; i < 4; i++)
40078 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
40079 if (d->testing_p)
40080 return true;
40081 target = gen_reg_rtx (V4DImode);
40082 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
40083 perm, 4, false))
40084 {
40085 emit_move_insn (d->target,
40086 gen_lowpart (d->vmode, target));
40087 return true;
40088 }
40089 return false;
40090 }
40091
40092 /* Next see if vpermd can be used. */
40093 if (valid_perm_using_mode_p (V8SImode, d))
40094 vmode = V8SImode;
40095 }
40096 /* Or if vpermps can be used. */
40097 else if (d->vmode == V8SFmode)
40098 vmode = V8SImode;
40099
40100 if (vmode == V32QImode)
40101 {
40102 	      /* vpshufb only works within a 128-bit lane; it cannot
40103 		 shuffle bytes across lanes.  */
40104 for (i = 0; i < nelt; ++i)
40105 if ((d->perm[i] ^ i) & (nelt / 2))
40106 return false;
40107 }
40108 }
40109 else
40110 return false;
40111 }
40112
40113 if (d->testing_p)
40114 return true;
40115
40116 if (vmode == V8SImode)
40117 for (i = 0; i < 8; ++i)
40118 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
40119 else
40120 {
40121 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40122 if (!d->one_operand_p)
40123 mask = 2 * nelt - 1;
40124 else if (vmode == V16QImode)
40125 mask = nelt - 1;
40126 else
40127 mask = nelt / 2 - 1;
40128
40129 for (i = 0; i < nelt; ++i)
40130 {
40131 unsigned j, e = d->perm[i] & mask;
40132 for (j = 0; j < eltsz; ++j)
40133 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
40134 }
40135 }
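/* For example (illustrative), reversing a V8HImode vector this way
   uses vmode == V16QImode with eltsz == 2 and mask == 7, so
   d->perm == { 7 6 5 4 3 2 1 0 } expands to the byte selector
   { 14 15 12 13 10 11 8 9 6 7 4 5 2 3 0 1 }.  */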
40136
40137 vperm = gen_rtx_CONST_VECTOR (vmode,
40138 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
40139 vperm = force_reg (vmode, vperm);
40140
40141 target = d->target;
40142 if (d->vmode != vmode)
40143 target = gen_reg_rtx (vmode);
40144 op0 = gen_lowpart (vmode, d->op0);
40145 if (d->one_operand_p)
40146 {
40147 if (vmode == V16QImode)
40148 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
40149 else if (vmode == V32QImode)
40150 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
40151 else if (vmode == V8SFmode)
40152 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
40153 else
40154 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
40155 }
40156 else
40157 {
40158 op1 = gen_lowpart (vmode, d->op1);
40159 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
40160 }
40161 if (target != d->target)
40162 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
40163
40164 return true;
40165 }
40166
40167 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
40168 in a single instruction. */
40169
40170 static bool
40171 expand_vec_perm_1 (struct expand_vec_perm_d *d)
40172 {
40173 unsigned i, nelt = d->nelt;
40174 unsigned char perm2[MAX_VECT_LEN];
40175
40176 /* Check plain VEC_SELECT first, because AVX has instructions that could
40177 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
40178 input where SEL+CONCAT may not. */
40179 if (d->one_operand_p)
40180 {
40181 int mask = nelt - 1;
40182 bool identity_perm = true;
40183 bool broadcast_perm = true;
40184
40185 for (i = 0; i < nelt; i++)
40186 {
40187 perm2[i] = d->perm[i] & mask;
40188 if (perm2[i] != i)
40189 identity_perm = false;
40190 if (perm2[i])
40191 broadcast_perm = false;
40192 }
40193
40194 if (identity_perm)
40195 {
40196 if (!d->testing_p)
40197 emit_move_insn (d->target, d->op0);
40198 return true;
40199 }
40200 else if (broadcast_perm && TARGET_AVX2)
40201 {
40202 /* Use vpbroadcast{b,w,d}. */
40203 rtx (*gen) (rtx, rtx) = NULL;
40204 switch (d->vmode)
40205 {
40206 case V32QImode:
40207 gen = gen_avx2_pbroadcastv32qi_1;
40208 break;
40209 case V16HImode:
40210 gen = gen_avx2_pbroadcastv16hi_1;
40211 break;
40212 case V8SImode:
40213 gen = gen_avx2_pbroadcastv8si_1;
40214 break;
40215 case V16QImode:
40216 gen = gen_avx2_pbroadcastv16qi;
40217 break;
40218 case V8HImode:
40219 gen = gen_avx2_pbroadcastv8hi;
40220 break;
40221 case V8SFmode:
40222 gen = gen_avx2_vec_dupv8sf_1;
40223 break;
40224 /* For other modes prefer other shuffles this function creates. */
40225 default: break;
40226 }
40227 if (gen != NULL)
40228 {
40229 if (!d->testing_p)
40230 emit_insn (gen (d->target, d->op0));
40231 return true;
40232 }
40233 }
40234
40235 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
40236 return true;
40237
40238 /* There are plenty of patterns in sse.md that are written for
40239 SEL+CONCAT and are not replicated for a single op. Perhaps
40240 that should be changed, to avoid the nastiness here. */
40241
40242 /* Recognize interleave style patterns, which means incrementing
40243 every other permutation operand. */
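/* For example (illustrative), the one operand V4SImode permutation
   { 0 0 1 1 } becomes perm2 == { 0 4 1 5 } over (op0, op0), which is
   punpckldq of op0 with itself.  */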
40244 for (i = 0; i < nelt; i += 2)
40245 {
40246 perm2[i] = d->perm[i] & mask;
40247 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
40248 }
40249 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
40250 d->testing_p))
40251 return true;
40252
40253 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
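/* For example (illustrative), the one operand V4SFmode permutation
   { 2 3 0 1 } becomes perm2 == { 2 3 4 5 } over (op0, op0), which
   shufps can handle.  */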
40254 if (nelt >= 4)
40255 {
40256 for (i = 0; i < nelt; i += 4)
40257 {
40258 perm2[i + 0] = d->perm[i + 0] & mask;
40259 perm2[i + 1] = d->perm[i + 1] & mask;
40260 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
40261 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
40262 }
40263
40264 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
40265 d->testing_p))
40266 return true;
40267 }
40268 }
40269
40270 /* Finally, try the fully general two operand permute. */
40271 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
40272 d->testing_p))
40273 return true;
40274
40275 /* Recognize interleave style patterns with reversed operands. */
40276 if (!d->one_operand_p)
40277 {
40278 for (i = 0; i < nelt; ++i)
40279 {
40280 unsigned e = d->perm[i];
40281 if (e >= nelt)
40282 e -= nelt;
40283 else
40284 e += nelt;
40285 perm2[i] = e;
40286 }
40287
40288 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
40289 d->testing_p))
40290 return true;
40291 }
40292
40293 /* Try the SSE4.1 blend variable merge instructions. */
40294 if (expand_vec_perm_blend (d))
40295 return true;
40296
40297 /* Try one of the AVX vpermil variable permutations. */
40298 if (expand_vec_perm_vpermil (d))
40299 return true;
40300
40301 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
40302 vpshufb, vpermd, vpermps or vpermq variable permutation. */
40303 if (expand_vec_perm_pshufb (d))
40304 return true;
40305
40306 return false;
40307 }
40308
40309 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
40310 in terms of a pair of pshuflw + pshufhw instructions. */
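/* For example (illustrative), the V8HImode permutation
   { 2 0 3 1 5 7 4 6 } keeps the low four indices below 4 and the high
   four at or above 4, so it should split into pshuflw
   { 2 0 3 1 4 5 6 7 } followed by pshufhw { 0 1 2 3 5 7 4 6 }.  */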
40311
40312 static bool
40313 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
40314 {
40315 unsigned char perm2[MAX_VECT_LEN];
40316 unsigned i;
40317 bool ok;
40318
40319 if (d->vmode != V8HImode || !d->one_operand_p)
40320 return false;
40321
40322 /* The two permutations only operate in 64-bit lanes. */
40323 for (i = 0; i < 4; ++i)
40324 if (d->perm[i] >= 4)
40325 return false;
40326 for (i = 4; i < 8; ++i)
40327 if (d->perm[i] < 4)
40328 return false;
40329
40330 if (d->testing_p)
40331 return true;
40332
40333 /* Emit the pshuflw. */
40334 memcpy (perm2, d->perm, 4);
40335 for (i = 4; i < 8; ++i)
40336 perm2[i] = i;
40337 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
40338 gcc_assert (ok);
40339
40340 /* Emit the pshufhw. */
40341 memcpy (perm2 + 4, d->perm + 4, 4);
40342 for (i = 0; i < 4; ++i)
40343 perm2[i] = i;
40344 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
40345 gcc_assert (ok);
40346
40347 return true;
40348 }
40349
40350 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40351 the permutation using the SSSE3 palignr instruction. This succeeds
40352 when all of the elements in PERM fit within one vector and we merely
40353 need to shift them down so that a single vector permutation has a
40354 chance to succeed. */
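/* For example (illustrative), the two operand V16QImode permutation
   { 1 2 ... 15 16 } has min == 1 and max == 16, so a single palignr
   by one byte of (op1:op0) already yields the result (the in_order
   case below).  */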
40355
40356 static bool
40357 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
40358 {
40359 unsigned i, nelt = d->nelt;
40360 unsigned min, max;
40361 bool in_order, ok;
40362 rtx shift, target;
40363 struct expand_vec_perm_d dcopy;
40364
40365 /* Even with AVX, palignr only operates on 128-bit vectors. */
40366 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
40367 return false;
40368
40369 min = nelt, max = 0;
40370 for (i = 0; i < nelt; ++i)
40371 {
40372 unsigned e = d->perm[i];
40373 if (e < min)
40374 min = e;
40375 if (e > max)
40376 max = e;
40377 }
40378 if (min == 0 || max - min >= nelt)
40379 return false;
40380
40381 /* Given that we have SSSE3, we know we'll be able to implement the
40382 single operand permutation after the palignr with pshufb. */
40383 if (d->testing_p)
40384 return true;
40385
40386 dcopy = *d;
40387 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
40388 target = gen_reg_rtx (TImode);
40389 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
40390 gen_lowpart (TImode, d->op0), shift));
40391
40392 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
40393 dcopy.one_operand_p = true;
40394
40395 in_order = true;
40396 for (i = 0; i < nelt; ++i)
40397 {
40398 unsigned e = dcopy.perm[i] - min;
40399 if (e != i)
40400 in_order = false;
40401 dcopy.perm[i] = e;
40402 }
40403
40404 /* Test for the degenerate case where the alignment by itself
40405 produces the desired permutation. */
40406 if (in_order)
40407 {
40408 emit_move_insn (d->target, dcopy.op0);
40409 return true;
40410 }
40411
40412 ok = expand_vec_perm_1 (&dcopy);
40413 gcc_assert (ok);
40414
40415 return ok;
40416 }
40417
40418 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
40419
40420 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40421 a two vector permutation into a single vector permutation by using
40422 an interleave operation to merge the vectors. */
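/* For example (illustrative), with only SSE2 the two operand V4SImode
   permutation { 1 5 0 4 } draws solely on the low halves of both
   operands, so dremap becomes punpckldq { 0 4 1 5 } and dfinal the
   single operand pshufd { 2 3 0 1 }.  */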
40423
40424 static bool
40425 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
40426 {
40427 struct expand_vec_perm_d dremap, dfinal;
40428 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
40429 unsigned HOST_WIDE_INT contents;
40430 unsigned char remap[2 * MAX_VECT_LEN];
40431 rtx seq;
40432 bool ok, same_halves = false;
40433
40434 if (GET_MODE_SIZE (d->vmode) == 16)
40435 {
40436 if (d->one_operand_p)
40437 return false;
40438 }
40439 else if (GET_MODE_SIZE (d->vmode) == 32)
40440 {
40441 if (!TARGET_AVX)
40442 return false;
40443 /* For 32-byte modes allow even d->one_operand_p.
40444 The lack of cross-lane shuffling in some instructions
40445 might prevent a single insn shuffle. */
40446 dfinal = *d;
40447 dfinal.testing_p = true;
40448 /* If expand_vec_perm_interleave3 can expand this into
40449 a 3 insn sequence, give up and let it be expanded as
40450 3 insn sequence. While that is one insn longer,
40451 it doesn't need a memory operand and in the common
40452 case that both interleave low and high permutations
40453 with the same operands are adjacent needs 4 insns
40454 for both after CSE. */
40455 if (expand_vec_perm_interleave3 (&dfinal))
40456 return false;
40457 }
40458 else
40459 return false;
40460
40461 /* Examine from whence the elements come. */
40462 contents = 0;
40463 for (i = 0; i < nelt; ++i)
40464 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
40465
40466 memset (remap, 0xff, sizeof (remap));
40467 dremap = *d;
40468
40469 if (GET_MODE_SIZE (d->vmode) == 16)
40470 {
40471 unsigned HOST_WIDE_INT h1, h2, h3, h4;
40472
40473 /* Split the two input vectors into 4 halves. */
40474 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
40475 h2 = h1 << nelt2;
40476 h3 = h2 << nelt2;
40477 h4 = h3 << nelt2;
40478
40479 /* If the elements are all from the low halves, use interleave low; do
40480 likewise for interleave high. If the elements are from mis-matched
40481 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
40482 if ((contents & (h1 | h3)) == contents)
40483 {
40484 /* punpckl* */
40485 for (i = 0; i < nelt2; ++i)
40486 {
40487 remap[i] = i * 2;
40488 remap[i + nelt] = i * 2 + 1;
40489 dremap.perm[i * 2] = i;
40490 dremap.perm[i * 2 + 1] = i + nelt;
40491 }
40492 if (!TARGET_SSE2 && d->vmode == V4SImode)
40493 dremap.vmode = V4SFmode;
40494 }
40495 else if ((contents & (h2 | h4)) == contents)
40496 {
40497 /* punpckh* */
40498 for (i = 0; i < nelt2; ++i)
40499 {
40500 remap[i + nelt2] = i * 2;
40501 remap[i + nelt + nelt2] = i * 2 + 1;
40502 dremap.perm[i * 2] = i + nelt2;
40503 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
40504 }
40505 if (!TARGET_SSE2 && d->vmode == V4SImode)
40506 dremap.vmode = V4SFmode;
40507 }
40508 else if ((contents & (h1 | h4)) == contents)
40509 {
40510 /* shufps */
40511 for (i = 0; i < nelt2; ++i)
40512 {
40513 remap[i] = i;
40514 remap[i + nelt + nelt2] = i + nelt2;
40515 dremap.perm[i] = i;
40516 dremap.perm[i + nelt2] = i + nelt + nelt2;
40517 }
40518 if (nelt != 4)
40519 {
40520 /* shufpd */
40521 dremap.vmode = V2DImode;
40522 dremap.nelt = 2;
40523 dremap.perm[0] = 0;
40524 dremap.perm[1] = 3;
40525 }
40526 }
40527 else if ((contents & (h2 | h3)) == contents)
40528 {
40529 /* shufps */
40530 for (i = 0; i < nelt2; ++i)
40531 {
40532 remap[i + nelt2] = i;
40533 remap[i + nelt] = i + nelt2;
40534 dremap.perm[i] = i + nelt2;
40535 dremap.perm[i + nelt2] = i + nelt;
40536 }
40537 if (nelt != 4)
40538 {
40539 /* shufpd */
40540 dremap.vmode = V2DImode;
40541 dremap.nelt = 2;
40542 dremap.perm[0] = 1;
40543 dremap.perm[1] = 2;
40544 }
40545 }
40546 else
40547 return false;
40548 }
40549 else
40550 {
40551 unsigned int nelt4 = nelt / 4, nzcnt = 0;
40552 unsigned HOST_WIDE_INT q[8];
40553 unsigned int nonzero_halves[4];
40554
40555 /* Split the two input vectors into 8 quarters. */
40556 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
40557 for (i = 1; i < 8; ++i)
40558 q[i] = q[0] << (nelt4 * i);
40559 for (i = 0; i < 4; ++i)
40560 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
40561 {
40562 nonzero_halves[nzcnt] = i;
40563 ++nzcnt;
40564 }
40565
40566 if (nzcnt == 1)
40567 {
40568 gcc_assert (d->one_operand_p);
40569 nonzero_halves[1] = nonzero_halves[0];
40570 same_halves = true;
40571 }
40572 else if (d->one_operand_p)
40573 {
40574 gcc_assert (nonzero_halves[0] == 0);
40575 gcc_assert (nonzero_halves[1] == 1);
40576 }
40577
40578 if (nzcnt <= 2)
40579 {
40580 if (d->perm[0] / nelt2 == nonzero_halves[1])
40581 {
40582 /* Attempt to increase the likelihood that dfinal
40583 shuffle will be intra-lane. */
40584 char tmph = nonzero_halves[0];
40585 nonzero_halves[0] = nonzero_halves[1];
40586 nonzero_halves[1] = tmph;
40587 }
40588
40589 /* vperm2f128 or vperm2i128. */
40590 for (i = 0; i < nelt2; ++i)
40591 {
40592 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
40593 remap[i + nonzero_halves[0] * nelt2] = i;
40594 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
40595 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
40596 }
40597
40598 if (d->vmode != V8SFmode
40599 && d->vmode != V4DFmode
40600 && d->vmode != V8SImode)
40601 {
40602 dremap.vmode = V8SImode;
40603 dremap.nelt = 8;
40604 for (i = 0; i < 4; ++i)
40605 {
40606 dremap.perm[i] = i + nonzero_halves[0] * 4;
40607 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
40608 }
40609 }
40610 }
40611 else if (d->one_operand_p)
40612 return false;
40613 else if (TARGET_AVX2
40614 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
40615 {
40616 /* vpunpckl* */
40617 for (i = 0; i < nelt4; ++i)
40618 {
40619 remap[i] = i * 2;
40620 remap[i + nelt] = i * 2 + 1;
40621 remap[i + nelt2] = i * 2 + nelt2;
40622 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
40623 dremap.perm[i * 2] = i;
40624 dremap.perm[i * 2 + 1] = i + nelt;
40625 dremap.perm[i * 2 + nelt2] = i + nelt2;
40626 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
40627 }
40628 }
40629 else if (TARGET_AVX2
40630 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
40631 {
40632 /* vpunpckh* */
40633 for (i = 0; i < nelt4; ++i)
40634 {
40635 remap[i + nelt4] = i * 2;
40636 remap[i + nelt + nelt4] = i * 2 + 1;
40637 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
40638 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
40639 dremap.perm[i * 2] = i + nelt4;
40640 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
40641 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
40642 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
40643 }
40644 }
40645 else
40646 return false;
40647 }
40648
40649 /* Use the remapping array set up above to move the elements from their
40650 swizzled locations into their final destinations. */
40651 dfinal = *d;
40652 for (i = 0; i < nelt; ++i)
40653 {
40654 unsigned e = remap[d->perm[i]];
40655 gcc_assert (e < nelt);
40656 /* If same_halves is true, both halves of the remapped vector are the
40657 same. Avoid cross-lane accesses if possible. */
40658 if (same_halves && i >= nelt2)
40659 {
40660 gcc_assert (e < nelt2);
40661 dfinal.perm[i] = e + nelt2;
40662 }
40663 else
40664 dfinal.perm[i] = e;
40665 }
40666 dremap.target = gen_reg_rtx (dremap.vmode);
40667 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
40668 dfinal.op1 = dfinal.op0;
40669 dfinal.one_operand_p = true;
40670
40671 /* Test if the final remap can be done with a single insn. For V4SFmode or
40672 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
40673 start_sequence ();
40674 ok = expand_vec_perm_1 (&dfinal);
40675 seq = get_insns ();
40676 end_sequence ();
40677
40678 if (!ok)
40679 return false;
40680
40681 if (d->testing_p)
40682 return true;
40683
40684 if (dremap.vmode != dfinal.vmode)
40685 {
40686 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
40687 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
40688 }
40689
40690 ok = expand_vec_perm_1 (&dremap);
40691 gcc_assert (ok);
40692
40693 emit_insn (seq);
40694 return true;
40695 }
40696
40697 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40698 a single vector cross-lane permutation into vpermq followed
40699 by any of the single insn permutations. */
40700
40701 static bool
40702 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
40703 {
40704 struct expand_vec_perm_d dremap, dfinal;
40705 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
40706 unsigned contents[2];
40707 bool ok;
40708
40709 if (!(TARGET_AVX2
40710 && (d->vmode == V32QImode || d->vmode == V16HImode)
40711 && d->one_operand_p))
40712 return false;
40713
40714 contents[0] = 0;
40715 contents[1] = 0;
40716 for (i = 0; i < nelt2; ++i)
40717 {
40718 contents[0] |= 1u << (d->perm[i] / nelt4);
40719 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
40720 }
40721
40722 for (i = 0; i < 2; ++i)
40723 {
40724 unsigned int cnt = 0;
40725 for (j = 0; j < 4; ++j)
40726 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
40727 return false;
40728 }
40729
40730 if (d->testing_p)
40731 return true;
40732
40733 dremap = *d;
40734 dremap.vmode = V4DImode;
40735 dremap.nelt = 4;
40736 dremap.target = gen_reg_rtx (V4DImode);
40737 dremap.op0 = gen_lowpart (V4DImode, d->op0);
40738 dremap.op1 = dremap.op0;
40739 dremap.one_operand_p = true;
40740 for (i = 0; i < 2; ++i)
40741 {
40742 unsigned int cnt = 0;
40743 for (j = 0; j < 4; ++j)
40744 if ((contents[i] & (1u << j)) != 0)
40745 dremap.perm[2 * i + cnt++] = j;
40746 for (; cnt < 2; ++cnt)
40747 dremap.perm[2 * i + cnt] = 0;
40748 }
40749
40750 dfinal = *d;
40751 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
40752 dfinal.op1 = dfinal.op0;
40753 dfinal.one_operand_p = true;
40754 for (i = 0, j = 0; i < nelt; ++i)
40755 {
40756 if (i == nelt2)
40757 j = 2;
40758 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
40759 if ((d->perm[i] / nelt4) == dremap.perm[j])
40760 ;
40761 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
40762 dfinal.perm[i] |= nelt4;
40763 else
40764 gcc_unreachable ();
40765 }
40766
40767 ok = expand_vec_perm_1 (&dremap);
40768 gcc_assert (ok);
40769
40770 ok = expand_vec_perm_1 (&dfinal);
40771 gcc_assert (ok);
40772
40773 return true;
40774 }
40775
40776 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
40777 a vector permutation using two instructions, vperm2f128 resp.
40778 vperm2i128 followed by any single in-lane permutation. */
40779
40780 static bool
40781 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
40782 {
40783 struct expand_vec_perm_d dfirst, dsecond;
40784 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
40785 bool ok;
40786
40787 if (!TARGET_AVX
40788 || GET_MODE_SIZE (d->vmode) != 32
40789 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
40790 return false;
40791
40792 dsecond = *d;
40793 dsecond.one_operand_p = false;
40794 dsecond.testing_p = true;
40795
40796 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
40797 immediate. For perm < 16 the second permutation uses
40798 d->op0 as first operand, for perm >= 16 it uses d->op1
40799 as first operand. The second operand is the result of
40800 vperm2[fi]128. */
40801 for (perm = 0; perm < 32; perm++)
40802 {
40803 /* Ignore permutations which do not move anything cross-lane. */
40804 if (perm < 16)
40805 {
40806 /* The second shuffle for e.g. V4DFmode has
40807 0123 and ABCD operands.
40808 Ignore AB23, as 23 is already in the second lane
40809 of the first operand. */
40810 if ((perm & 0xc) == (1 << 2)) continue;
40811 /* And 01CD, as 01 is in the first lane of the first
40812 operand. */
40813 if ((perm & 3) == 0) continue;
40814 /* And 4567, as then the vperm2[fi]128 doesn't change
40815 anything on the original 4567 second operand. */
40816 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
40817 }
40818 else
40819 {
40820 /* The second shuffle for e.g. V4DFmode has
40821 4567 and ABCD operands.
40822 Ignore AB67, as 67 is already in the second lane
40823 of the first operand. */
40824 if ((perm & 0xc) == (3 << 2)) continue;
40825 /* And 45CD, as 45 is in the first lane of the first
40826 operand. */
40827 if ((perm & 3) == 2) continue;
40828 /* And 0123, as then the vperm2[fi]128 doesn't change
40829 anything on the original 0123 first operand. */
40830 if ((perm & 0xf) == (1 << 2)) continue;
40831 }
40832
40833 for (i = 0; i < nelt; i++)
40834 {
40835 j = d->perm[i] / nelt2;
40836 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
40837 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
40838 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
40839 dsecond.perm[i] = d->perm[i] & (nelt - 1);
40840 else
40841 break;
40842 }
40843
40844 if (i == nelt)
40845 {
40846 start_sequence ();
40847 ok = expand_vec_perm_1 (&dsecond);
40848 end_sequence ();
40849 }
40850 else
40851 ok = false;
40852
40853 if (ok)
40854 {
40855 if (d->testing_p)
40856 return true;
40857
40858 /* Found a usable second shuffle. dfirst will be
40859 vperm2f128 on d->op0 and d->op1. */
40860 dsecond.testing_p = false;
40861 dfirst = *d;
40862 dfirst.target = gen_reg_rtx (d->vmode);
40863 for (i = 0; i < nelt; i++)
40864 dfirst.perm[i] = (i & (nelt2 - 1))
40865 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
40866
40867 ok = expand_vec_perm_1 (&dfirst);
40868 gcc_assert (ok);
40869
40870 /* And dsecond is some single insn shuffle, taking
40871 d->op0 and result of vperm2f128 (if perm < 16) or
40872 d->op1 and result of vperm2f128 (otherwise). */
40873 dsecond.op1 = dfirst.target;
40874 if (perm >= 16)
40875 dsecond.op0 = dfirst.op1;
40876
40877 ok = expand_vec_perm_1 (&dsecond);
40878 gcc_assert (ok);
40879
40880 return true;
40881 }
40882
40883 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
40884 if (d->one_operand_p)
40885 return false;
40886 }
40887
40888 return false;
40889 }
40890
40891 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40892 a two vector permutation using 2 intra-lane interleave insns
40893 and cross-lane shuffle for 32-byte vectors. */
40894
40895 static bool
40896 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
40897 {
40898 unsigned i, nelt;
40899 rtx (*gen) (rtx, rtx, rtx);
40900
40901 if (d->one_operand_p)
40902 return false;
40903 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
40904 ;
40905 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
40906 ;
40907 else
40908 return false;
40909
40910 nelt = d->nelt;
40911 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
40912 return false;
40913 for (i = 0; i < nelt; i += 2)
40914 if (d->perm[i] != d->perm[0] + i / 2
40915 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
40916 return false;
40917
40918 if (d->testing_p)
40919 return true;
40920
40921 switch (d->vmode)
40922 {
40923 case V32QImode:
40924 if (d->perm[0])
40925 gen = gen_vec_interleave_highv32qi;
40926 else
40927 gen = gen_vec_interleave_lowv32qi;
40928 break;
40929 case V16HImode:
40930 if (d->perm[0])
40931 gen = gen_vec_interleave_highv16hi;
40932 else
40933 gen = gen_vec_interleave_lowv16hi;
40934 break;
40935 case V8SImode:
40936 if (d->perm[0])
40937 gen = gen_vec_interleave_highv8si;
40938 else
40939 gen = gen_vec_interleave_lowv8si;
40940 break;
40941 case V4DImode:
40942 if (d->perm[0])
40943 gen = gen_vec_interleave_highv4di;
40944 else
40945 gen = gen_vec_interleave_lowv4di;
40946 break;
40947 case V8SFmode:
40948 if (d->perm[0])
40949 gen = gen_vec_interleave_highv8sf;
40950 else
40951 gen = gen_vec_interleave_lowv8sf;
40952 break;
40953 case V4DFmode:
40954 if (d->perm[0])
40955 gen = gen_vec_interleave_highv4df;
40956 else
40957 gen = gen_vec_interleave_lowv4df;
40958 break;
40959 default:
40960 gcc_unreachable ();
40961 }
40962
40963 emit_insn (gen (d->target, d->op0, d->op1));
40964 return true;
40965 }
40966
40967 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
40968 a single vector permutation using a single intra-lane vector
40969 permutation, vperm2f128 swapping the lanes and vblend* insn blending
40970 the non-swapped and swapped vectors together. */
40971
40972 static bool
40973 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
40974 {
40975 struct expand_vec_perm_d dfirst, dsecond;
40976 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
40977 rtx seq;
40978 bool ok;
40979 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
40980
40981 if (!TARGET_AVX
40982 || TARGET_AVX2
40983 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
40984 || !d->one_operand_p)
40985 return false;
40986
40987 dfirst = *d;
40988 for (i = 0; i < nelt; i++)
40989 dfirst.perm[i] = 0xff;
40990 for (i = 0, msk = 0; i < nelt; i++)
40991 {
40992 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
40993 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
40994 return false;
40995 dfirst.perm[j] = d->perm[i];
40996 if (j != i)
40997 msk |= (1 << i);
40998 }
40999 for (i = 0; i < nelt; i++)
41000 if (dfirst.perm[i] == 0xff)
41001 dfirst.perm[i] = i;
41002
41003 if (!d->testing_p)
41004 dfirst.target = gen_reg_rtx (dfirst.vmode);
41005
41006 start_sequence ();
41007 ok = expand_vec_perm_1 (&dfirst);
41008 seq = get_insns ();
41009 end_sequence ();
41010
41011 if (!ok)
41012 return false;
41013
41014 if (d->testing_p)
41015 return true;
41016
41017 emit_insn (seq);
41018
41019 dsecond = *d;
41020 dsecond.op0 = dfirst.target;
41021 dsecond.op1 = dfirst.target;
41022 dsecond.one_operand_p = true;
41023 dsecond.target = gen_reg_rtx (dsecond.vmode);
41024 for (i = 0; i < nelt; i++)
41025 dsecond.perm[i] = i ^ nelt2;
41026
41027 ok = expand_vec_perm_1 (&dsecond);
41028 gcc_assert (ok);
41029
41030 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
41031 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
41032 return true;
41033 }
41034
41035 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
41036 permutation using two vperm2f128, followed by a vshufpd insn blending
41037 the two vectors together. */
41038
41039 static bool
41040 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
41041 {
41042 struct expand_vec_perm_d dfirst, dsecond, dthird;
41043 bool ok;
41044
41045 if (!TARGET_AVX || (d->vmode != V4DFmode))
41046 return false;
41047
41048 if (d->testing_p)
41049 return true;
41050
41051 dfirst = *d;
41052 dsecond = *d;
41053 dthird = *d;
41054
41055 dfirst.perm[0] = (d->perm[0] & ~1);
41056 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
41057 dfirst.perm[2] = (d->perm[2] & ~1);
41058 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
41059 dsecond.perm[0] = (d->perm[1] & ~1);
41060 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
41061 dsecond.perm[2] = (d->perm[3] & ~1);
41062 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
41063 dthird.perm[0] = (d->perm[0] % 2);
41064 dthird.perm[1] = (d->perm[1] % 2) + 4;
41065 dthird.perm[2] = (d->perm[2] % 2) + 2;
41066 dthird.perm[3] = (d->perm[3] % 2) + 6;
41067
41068 dfirst.target = gen_reg_rtx (dfirst.vmode);
41069 dsecond.target = gen_reg_rtx (dsecond.vmode);
41070 dthird.op0 = dfirst.target;
41071 dthird.op1 = dsecond.target;
41072 dthird.one_operand_p = false;
41073
41074 canonicalize_perm (&dfirst);
41075 canonicalize_perm (&dsecond);
41076
41077 ok = expand_vec_perm_1 (&dfirst)
41078 && expand_vec_perm_1 (&dsecond)
41079 && expand_vec_perm_1 (&dthird);
41080
41081 gcc_assert (ok);
41082
41083 return true;
41084 }
41085
41086 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
41087 permutation with two pshufb insns and an ior. We should have already
41088 failed all two instruction sequences. */
41089
41090 static bool
41091 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
41092 {
41093 rtx rperm[2][16], vperm, l, h, op, m128;
41094 unsigned int i, nelt, eltsz;
41095
41096 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
41097 return false;
41098 gcc_assert (!d->one_operand_p);
41099
41100 nelt = d->nelt;
41101 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41102
41103 /* Generate two permutation masks. If the required element is within
41104 the given vector it is shuffled into the proper lane. If the required
41105 element is in the other vector, force a zero into the lane by setting
41106 bit 7 in the permutation mask. */
41107 m128 = GEN_INT (-128);
41108 for (i = 0; i < nelt; ++i)
41109 {
41110 unsigned j, e = d->perm[i];
41111 unsigned which = (e >= nelt);
41112 if (e >= nelt)
41113 e -= nelt;
41114
41115 for (j = 0; j < eltsz; ++j)
41116 {
41117 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
41118 rperm[1-which][i*eltsz + j] = m128;
41119 }
41120 }
41121
41122 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
41123 vperm = force_reg (V16QImode, vperm);
41124
41125 l = gen_reg_rtx (V16QImode);
41126 op = gen_lowpart (V16QImode, d->op0);
41127 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
41128
41129 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
41130 vperm = force_reg (V16QImode, vperm);
41131
41132 h = gen_reg_rtx (V16QImode);
41133 op = gen_lowpart (V16QImode, d->op1);
41134 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
41135
41136 op = d->target;
41137 if (d->vmode != V16QImode)
41138 op = gen_reg_rtx (V16QImode);
41139 emit_insn (gen_iorv16qi3 (op, l, h));
41140 if (op != d->target)
41141 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41142
41143 return true;
41144 }
41145
41146 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
41147 with two vpshufb insns, vpermq and vpor. We should have already failed
41148 all two or three instruction sequences. */
41149
41150 static bool
41151 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
41152 {
41153 rtx rperm[2][32], vperm, l, h, hp, op, m128;
41154 unsigned int i, nelt, eltsz;
41155
41156 if (!TARGET_AVX2
41157 || !d->one_operand_p
41158 || (d->vmode != V32QImode && d->vmode != V16HImode))
41159 return false;
41160
41161 if (d->testing_p)
41162 return true;
41163
41164 nelt = d->nelt;
41165 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41166
41167 /* Generate two permutation masks. If the required element is within
41168 the same lane, it is shuffled in. If the required element is from the
41169 other lane, force a zero by setting bit 7 in the permutation mask.
41170 In the other mask, elements requested from the other lane are
41171 non-negative, but are also moved to the other lane, so that the
41172 result of vpshufb can have the two V2TImode halves
41173 swapped. */
41174 m128 = GEN_INT (-128);
41175 for (i = 0; i < nelt; ++i)
41176 {
41177 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41178 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
41179
41180 for (j = 0; j < eltsz; ++j)
41181 {
41182 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
41183 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
41184 }
41185 }
41186
41187 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
41188 vperm = force_reg (V32QImode, vperm);
41189
41190 h = gen_reg_rtx (V32QImode);
41191 op = gen_lowpart (V32QImode, d->op0);
41192 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
41193
41194 /* Swap the 128-bit lanes of h into hp. */
41195 hp = gen_reg_rtx (V4DImode);
41196 op = gen_lowpart (V4DImode, h);
41197 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
41198 const1_rtx));
41199
41200 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
41201 vperm = force_reg (V32QImode, vperm);
41202
41203 l = gen_reg_rtx (V32QImode);
41204 op = gen_lowpart (V32QImode, d->op0);
41205 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
41206
41207 op = d->target;
41208 if (d->vmode != V32QImode)
41209 op = gen_reg_rtx (V32QImode);
41210 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
41211 if (op != d->target)
41212 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41213
41214 return true;
41215 }
41216
41217 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
41218 and extract-odd permutations of two V32QImode or V16HImode operands
41219 with two vpshufb insns, vpor and vpermq. We should have already
41220 failed all two or three instruction sequences. */
41221
41222 static bool
41223 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
41224 {
41225 rtx rperm[2][32], vperm, l, h, ior, op, m128;
41226 unsigned int i, nelt, eltsz;
41227
41228 if (!TARGET_AVX2
41229 || d->one_operand_p
41230 || (d->vmode != V32QImode && d->vmode != V16HImode))
41231 return false;
41232
41233 for (i = 0; i < d->nelt; ++i)
41234 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
41235 return false;
41236
41237 if (d->testing_p)
41238 return true;
41239
41240 nelt = d->nelt;
41241 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41242
41243 /* Generate two permutation masks. In the first permutation mask
41244 the first quarter will contain indexes for the first half
41245 of the op0, the second quarter will contain bit 7 set, third quarter
41246 will contain indexes for the second half of the op0 and the
41247 last quarter bit 7 set. In the second permutation mask
41248 the first quarter will contain bit 7 set, the second quarter
41249 indexes for the first half of the op1, the third quarter bit 7 set
41250 and last quarter indexes for the second half of the op1.
41251 I.e. the first mask e.g. for V32QImode extract even will be:
41252 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
41253 (all values masked with 0xf except for -128) and second mask
41254 for extract even will be
41255 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
41256 m128 = GEN_INT (-128);
41257 for (i = 0; i < nelt; ++i)
41258 {
41259 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41260 unsigned which = d->perm[i] >= nelt;
41261 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
41262
41263 for (j = 0; j < eltsz; ++j)
41264 {
41265 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
41266 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
41267 }
41268 }
41269
41270 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
41271 vperm = force_reg (V32QImode, vperm);
41272
41273 l = gen_reg_rtx (V32QImode);
41274 op = gen_lowpart (V32QImode, d->op0);
41275 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
41276
41277 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
41278 vperm = force_reg (V32QImode, vperm);
41279
41280 h = gen_reg_rtx (V32QImode);
41281 op = gen_lowpart (V32QImode, d->op1);
41282 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
41283
41284 ior = gen_reg_rtx (V32QImode);
41285 emit_insn (gen_iorv32qi3 (ior, l, h));
41286
41287 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
41288 op = gen_reg_rtx (V4DImode);
41289 ior = gen_lowpart (V4DImode, ior);
41290 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
41291 const1_rtx, GEN_INT (3)));
41292 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41293
41294 return true;
41295 }
41296
41297 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
41298 and extract-odd permutations. */
41299
41300 static bool
41301 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
41302 {
41303 rtx t1, t2, t3, t4, t5;
41304
41305 switch (d->vmode)
41306 {
41307 case V4DFmode:
41308 t1 = gen_reg_rtx (V4DFmode);
41309 t2 = gen_reg_rtx (V4DFmode);
41310
41311 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
41312 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
41313 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
41314
41315 /* Now an unpck[lh]pd will produce the result required. */
41316 if (odd)
41317 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
41318 else
41319 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
41320 emit_insn (t3);
41321 break;
41322
41323 case V8SFmode:
41324 {
41325 int mask = odd ? 0xdd : 0x88;
41326
41327 t1 = gen_reg_rtx (V8SFmode);
41328 t2 = gen_reg_rtx (V8SFmode);
41329 t3 = gen_reg_rtx (V8SFmode);
41330
41331 /* Shuffle within the 128-bit lanes to produce:
41332 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
41333 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
41334 GEN_INT (mask)));
41335
41336 /* Shuffle the lanes around to produce:
41337 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
41338 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
41339 GEN_INT (0x3)));
41340
41341 /* Shuffle within the 128-bit lanes to produce:
41342 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
41343 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
41344
41345 /* Shuffle within the 128-bit lanes to produce:
41346 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
41347 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
41348
41349 /* Shuffle the lanes around to produce:
41350 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
41351 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
41352 GEN_INT (0x20)));
41353 }
41354 break;
41355
41356 case V2DFmode:
41357 case V4SFmode:
41358 case V2DImode:
41359 case V4SImode:
41360 /* These are always directly implementable by expand_vec_perm_1. */
41361 gcc_unreachable ();
41362
41363 case V8HImode:
41364 if (TARGET_SSSE3)
41365 return expand_vec_perm_pshufb2 (d);
41366 else
41367 {
41368 /* We need 2*log2(N)-1 operations to achieve odd/even
41369 with interleave. */
41370 t1 = gen_reg_rtx (V8HImode);
41371 t2 = gen_reg_rtx (V8HImode);
41372 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
41373 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
41374 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
41375 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
41376 if (odd)
41377 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
41378 else
41379 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
41380 emit_insn (t3);
41381 }
41382 break;
41383
41384 case V16QImode:
41385 if (TARGET_SSSE3)
41386 return expand_vec_perm_pshufb2 (d);
41387 else
41388 {
41389 t1 = gen_reg_rtx (V16QImode);
41390 t2 = gen_reg_rtx (V16QImode);
41391 t3 = gen_reg_rtx (V16QImode);
41392 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
41393 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
41394 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
41395 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
41396 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
41397 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
41398 if (odd)
41399 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
41400 else
41401 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
41402 emit_insn (t3);
41403 }
41404 break;
41405
41406 case V16HImode:
41407 case V32QImode:
41408 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
41409
41410 case V4DImode:
41411 if (!TARGET_AVX2)
41412 {
41413 struct expand_vec_perm_d d_copy = *d;
41414 d_copy.vmode = V4DFmode;
41415 d_copy.target = gen_reg_rtx (V4DFmode);
41416 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
41417 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
41418 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
41419 {
41420 if (!d->testing_p)
41421 emit_move_insn (d->target,
41422 gen_lowpart (V4DImode, d_copy.target));
41423 return true;
41424 }
41425 return false;
41426 }
41427
41428 t1 = gen_reg_rtx (V4DImode);
41429 t2 = gen_reg_rtx (V4DImode);
41430
41431 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
41432 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
41433 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
41434
41435 /* Now an vpunpck[lh]qdq will produce the result required. */
41436 if (odd)
41437 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
41438 else
41439 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
41440 emit_insn (t3);
41441 break;
41442
41443 case V8SImode:
41444 if (!TARGET_AVX2)
41445 {
41446 struct expand_vec_perm_d d_copy = *d;
41447 d_copy.vmode = V8SFmode;
41448 d_copy.target = gen_reg_rtx (V8SFmode);
41449 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
41450 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
41451 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
41452 {
41453 if (!d->testing_p)
41454 emit_move_insn (d->target,
41455 gen_lowpart (V8SImode, d_copy.target));
41456 return true;
41457 }
41458 return false;
41459 }
41460
41461 t1 = gen_reg_rtx (V8SImode);
41462 t2 = gen_reg_rtx (V8SImode);
41463 t3 = gen_reg_rtx (V4DImode);
41464 t4 = gen_reg_rtx (V4DImode);
41465 t5 = gen_reg_rtx (V4DImode);
41466
41467 /* Shuffle the lanes around into
41468 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
41469 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
41470 gen_lowpart (V4DImode, d->op1),
41471 GEN_INT (0x20)));
41472 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
41473 gen_lowpart (V4DImode, d->op1),
41474 GEN_INT (0x31)));
41475
41476 /* Swap the 2nd and 3rd position in each lane into
41477 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
41478 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
41479 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
41480 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
41481 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
41482
41483 /* Now an vpunpck[lh]qdq will produce
41484 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
41485 if (odd)
41486 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
41487 gen_lowpart (V4DImode, t2));
41488 else
41489 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
41490 gen_lowpart (V4DImode, t2));
41491 emit_insn (t3);
41492 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
41493 break;
41494
41495 default:
41496 gcc_unreachable ();
41497 }
41498
41499 return true;
41500 }
41501
41502 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
41503 extract-even and extract-odd permutations. */
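/* For example (illustrative), for V8HImode the extract-even selector
   is { 0 2 4 6 8 10 12 14 } and the extract-odd selector
   { 1 3 5 7 9 11 13 15 }.  */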
41504
41505 static bool
41506 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
41507 {
41508 unsigned i, odd, nelt = d->nelt;
41509
41510 odd = d->perm[0];
41511 if (odd != 0 && odd != 1)
41512 return false;
41513
41514 for (i = 1; i < nelt; ++i)
41515 if (d->perm[i] != 2 * i + odd)
41516 return false;
41517
41518 return expand_vec_perm_even_odd_1 (d, odd);
41519 }
41520
41521 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
41522 permutations. We assume that expand_vec_perm_1 has already failed. */
41523
41524 static bool
41525 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
41526 {
41527 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
41528 enum machine_mode vmode = d->vmode;
41529 unsigned char perm2[4];
41530 rtx op0 = d->op0, dest;
41531 bool ok;
41532
41533 switch (vmode)
41534 {
41535 case V4DFmode:
41536 case V8SFmode:
41537 /* These are special-cased in sse.md so that we can optionally
41538 use the vbroadcast instruction. They expand to two insns
41539 if the input happens to be in a register. */
41540 gcc_unreachable ();
41541
41542 case V2DFmode:
41543 case V2DImode:
41544 case V4SFmode:
41545 case V4SImode:
41546 /* These are always implementable using standard shuffle patterns. */
41547 gcc_unreachable ();
41548
41549 case V8HImode:
41550 case V16QImode:
41551 /* These can be implemented via interleave. We save one insn by
41552 stopping once we have promoted to V4SImode and then use pshufd. */
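/* For example (illustrative), broadcasting element 5 of a V8HImode
   vector: elt >= nelt2, so one vec_interleave_highv8hi of op0 with
   itself yields { x4 x4 x5 x5 x6 x6 x7 x7 }; viewed as V4SImode, a
   pshufd broadcast of element 1 then replicates x5 everywhere.  */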
41553 do
41554 {
41555 rtx dest;
41556 rtx (*gen) (rtx, rtx, rtx)
41557 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
41558 : gen_vec_interleave_lowv8hi;
41559
41560 if (elt >= nelt2)
41561 {
41562 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
41563 : gen_vec_interleave_highv8hi;
41564 elt -= nelt2;
41565 }
41566 nelt2 /= 2;
41567
41568 dest = gen_reg_rtx (vmode);
41569 emit_insn (gen (dest, op0, op0));
41570 vmode = get_mode_wider_vector (vmode);
41571 op0 = gen_lowpart (vmode, dest);
41572 }
41573 while (vmode != V4SImode);
41574
41575 memset (perm2, elt, 4);
41576 dest = gen_reg_rtx (V4SImode);
41577 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
41578 gcc_assert (ok);
41579 if (!d->testing_p)
41580 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
41581 return true;
41582
41583 case V32QImode:
41584 case V16HImode:
41585 case V8SImode:
41586 case V4DImode:
41587 /* For AVX2 broadcasts of the first element vpbroadcast* or
41588 vpermq should be used by expand_vec_perm_1. */
41589 gcc_assert (!TARGET_AVX2 || d->perm[0]);
41590 return false;
41591
41592 default:
41593 gcc_unreachable ();
41594 }
41595 }
41596
41597 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
41598 broadcast permutations. */
41599
41600 static bool
41601 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
41602 {
41603 unsigned i, elt, nelt = d->nelt;
41604
41605 if (!d->one_operand_p)
41606 return false;
41607
41608 elt = d->perm[0];
41609 for (i = 1; i < nelt; ++i)
41610 if (d->perm[i] != elt)
41611 return false;
41612
41613 return expand_vec_perm_broadcast_1 (d);
41614 }
41615
41616 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
41617 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
41618 all the shorter instruction sequences. */
41619
41620 static bool
41621 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
41622 {
41623 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
41624 unsigned int i, nelt, eltsz;
41625 bool used[4];
41626
41627 if (!TARGET_AVX2
41628 || d->one_operand_p
41629 || (d->vmode != V32QImode && d->vmode != V16HImode))
41630 return false;
41631
41632 if (d->testing_p)
41633 return true;
41634
41635 nelt = d->nelt;
41636 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41637
41638 /* Generate 4 permutation masks. If the required element is within
41639 the same lane, it is shuffled in. If the required element is from the
41640 other lane, force a zero by setting bit 7 in the permutation mask.
41641 In the other mask, elements requested from the other lane are
41642 non-negative, but are also moved to the other lane, so that the
41643 result of vpshufb can have the two V2TImode halves
41644 swapped. */
41645 m128 = GEN_INT (-128);
41646 for (i = 0; i < 32; ++i)
41647 {
41648 rperm[0][i] = m128;
41649 rperm[1][i] = m128;
41650 rperm[2][i] = m128;
41651 rperm[3][i] = m128;
41652 }
41653 used[0] = false;
41654 used[1] = false;
41655 used[2] = false;
41656 used[3] = false;
41657 for (i = 0; i < nelt; ++i)
41658 {
41659 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41660 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
41661 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
41662
41663 for (j = 0; j < eltsz; ++j)
41664 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
41665 used[which] = true;
41666 }
41667
41668 for (i = 0; i < 2; ++i)
41669 {
41670 if (!used[2 * i + 1])
41671 {
41672 h[i] = NULL_RTX;
41673 continue;
41674 }
41675 vperm = gen_rtx_CONST_VECTOR (V32QImode,
41676 gen_rtvec_v (32, rperm[2 * i + 1]));
41677 vperm = force_reg (V32QImode, vperm);
41678 h[i] = gen_reg_rtx (V32QImode);
41679 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41680 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
41681 }
41682
41683 /* Swap the 128-bit lanes of h[X]. */
41684 for (i = 0; i < 2; ++i)
41685 {
41686 if (h[i] == NULL_RTX)
41687 continue;
41688 op = gen_reg_rtx (V4DImode);
41689 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
41690 const2_rtx, GEN_INT (3), const0_rtx,
41691 const1_rtx));
41692 h[i] = gen_lowpart (V32QImode, op);
41693 }
41694
41695 for (i = 0; i < 2; ++i)
41696 {
41697 if (!used[2 * i])
41698 {
41699 l[i] = NULL_RTX;
41700 continue;
41701 }
41702 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
41703 vperm = force_reg (V32QImode, vperm);
41704 l[i] = gen_reg_rtx (V32QImode);
41705 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41706 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
41707 }
41708
41709 for (i = 0; i < 2; ++i)
41710 {
41711 if (h[i] && l[i])
41712 {
41713 op = gen_reg_rtx (V32QImode);
41714 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
41715 l[i] = op;
41716 }
41717 else if (h[i])
41718 l[i] = h[i];
41719 }
41720
41721 gcc_assert (l[0] && l[1]);
41722 op = d->target;
41723 if (d->vmode != V32QImode)
41724 op = gen_reg_rtx (V32QImode);
41725 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
41726 if (op != d->target)
41727 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41728 return true;
41729 }
41730
41731 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
41732 With all of the interface bits taken care of, perform the expansion
41733 in D and return true on success. */
41734
41735 static bool
41736 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
41737 {
41738 /* Try a single instruction expansion. */
41739 if (expand_vec_perm_1 (d))
41740 return true;
41741
41742 /* Try sequences of two instructions. */
41743
41744 if (expand_vec_perm_pshuflw_pshufhw (d))
41745 return true;
41746
41747 if (expand_vec_perm_palignr (d))
41748 return true;
41749
41750 if (expand_vec_perm_interleave2 (d))
41751 return true;
41752
41753 if (expand_vec_perm_broadcast (d))
41754 return true;
41755
41756 if (expand_vec_perm_vpermq_perm_1 (d))
41757 return true;
41758
41759 if (expand_vec_perm_vperm2f128 (d))
41760 return true;
41761
41762 /* Try sequences of three instructions. */
41763
41764 if (expand_vec_perm_2vperm2f128_vshuf (d))
41765 return true;
41766
41767 if (expand_vec_perm_pshufb2 (d))
41768 return true;
41769
41770 if (expand_vec_perm_interleave3 (d))
41771 return true;
41772
41773 if (expand_vec_perm_vperm2f128_vblend (d))
41774 return true;
41775
41776 /* Try sequences of four instructions. */
41777
41778 if (expand_vec_perm_vpshufb2_vpermq (d))
41779 return true;
41780
41781 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
41782 return true;
41783
41784 /* ??? Look for narrow permutations whose element orderings would
41785 allow the promotion to a wider mode. */
41786
41787 /* ??? Look for sequences of interleave or a wider permute that place
41788 the data into the correct lanes for a half-vector shuffle like
41789 pshuf[lh]w or vpermilps. */
41790
41791 /* ??? Look for sequences of interleave that produce the desired results.
41792 The combinatorics of punpck[lh] get pretty ugly... */
41793
41794 if (expand_vec_perm_even_odd (d))
41795 return true;
41796
41797 /* Even longer sequences. */
41798 if (expand_vec_perm_vpshufb4_vpermq2 (d))
41799 return true;
41800
41801 return false;
41802 }
41803
41804 /* If a permutation only uses one operand, make it clear. Returns true
41805 if the permutation references both operands. */
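/* For example (illustrative), a V4SImode selector { 4 5 6 7 } only
   references the second operand (which == 2), so it is folded to
   { 0 1 2 3 } with d->op0 = d->op1, while { 0 5 2 7 } references both
   (which == 3) and is left alone unless the operands are rtx_equal_p.  */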
41806
41807 static bool
41808 canonicalize_perm (struct expand_vec_perm_d *d)
41809 {
41810 int i, which, nelt = d->nelt;
41811
41812 for (i = which = 0; i < nelt; ++i)
41813 which |= (d->perm[i] < nelt ? 1 : 2);
41814
41815 d->one_operand_p = true;
41816 switch (which)
41817 {
41818 default:
41819 gcc_unreachable();
41820
41821 case 3:
41822 if (!rtx_equal_p (d->op0, d->op1))
41823 {
41824 d->one_operand_p = false;
41825 break;
41826 }
41827 /* The elements of PERM do not suggest that only the first operand
41828 is used, but both operands are identical. Allow easier matching
41829 of the permutation by folding the permutation into the single
41830 input vector. */
41831 /* FALLTHRU */
41832
41833 case 2:
41834 for (i = 0; i < nelt; ++i)
41835 d->perm[i] &= nelt - 1;
41836 d->op0 = d->op1;
41837 break;
41838
41839 case 1:
41840 d->op1 = d->op0;
41841 break;
41842 }
41843
41844 return (which == 3);
41845 }
41846
41847 bool
41848 ix86_expand_vec_perm_const (rtx operands[4])
41849 {
41850 struct expand_vec_perm_d d;
41851 unsigned char perm[MAX_VECT_LEN];
41852 int i, nelt;
41853 bool two_args;
41854 rtx sel;
41855
41856 d.target = operands[0];
41857 d.op0 = operands[1];
41858 d.op1 = operands[2];
41859 sel = operands[3];
41860
41861 d.vmode = GET_MODE (d.target);
41862 gcc_assert (VECTOR_MODE_P (d.vmode));
41863 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41864 d.testing_p = false;
41865
41866 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
41867 gcc_assert (XVECLEN (sel, 0) == nelt);
41868 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
41869
41870 for (i = 0; i < nelt; ++i)
41871 {
41872 rtx e = XVECEXP (sel, 0, i);
41873 int ei = INTVAL (e) & (2 * nelt - 1);
41874 d.perm[i] = ei;
41875 perm[i] = ei;
41876 }
41877
41878 two_args = canonicalize_perm (&d);
41879
41880 if (ix86_expand_vec_perm_const_1 (&d))
41881 return true;
41882
41883 /* If the selector says both arguments are needed, but the operands are the
41884 same, the above tried to expand with one_operand_p and flattened selector.
41885 If that didn't work, retry without one_operand_p; we succeeded with that
41886 during testing. */
41887 if (two_args && d.one_operand_p)
41888 {
41889 d.one_operand_p = false;
41890 memcpy (d.perm, perm, sizeof (perm));
41891 return ix86_expand_vec_perm_const_1 (&d);
41892 }
41893
41894 return false;
41895 }
41896
41897 /* Implement targetm.vectorize.vec_perm_const_ok. */
41898
41899 static bool
41900 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
41901 const unsigned char *sel)
41902 {
41903 struct expand_vec_perm_d d;
41904 unsigned int i, nelt, which;
41905 bool ret;
41906
41907 d.vmode = vmode;
41908 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41909 d.testing_p = true;
41910
41911 /* Given sufficient ISA support we can just return true here
41912 for selected vector modes. */
41913 if (GET_MODE_SIZE (d.vmode) == 16)
41914 {
41915 /* All implementable with a single vpperm insn. */
41916 if (TARGET_XOP)
41917 return true;
41918 /* All implementable with 2 pshufb + 1 ior. */
41919 if (TARGET_SSSE3)
41920 return true;
41921 /* All implementable with shufpd or unpck[lh]pd. */
41922 if (d.nelt == 2)
41923 return true;
41924 }
41925
41926 /* Extract the values from the vector CST into the permutation
41927 array in D. */
41928 memcpy (d.perm, sel, nelt);
41929 for (i = which = 0; i < nelt; ++i)
41930 {
41931 unsigned char e = d.perm[i];
41932 gcc_assert (e < 2 * nelt);
41933 which |= (e < nelt ? 1 : 2);
41934 }
41935
41936 /* For all elements from second vector, fold the elements to first. */
41937 if (which == 2)
41938 for (i = 0; i < nelt; ++i)
41939 d.perm[i] -= nelt;
41940
41941 /* Check whether the mask can be applied to the vector type. */
41942 d.one_operand_p = (which != 3);
41943
41944 /* Implementable with shufps or pshufd. */
41945 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
41946 return true;
41947
41948 /* Otherwise we have to go through the motions and see if we can
41949 figure out how to generate the requested permutation. */
41950 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
41951 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
41952 if (!d.one_operand_p)
41953 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
41954
41955 start_sequence ();
41956 ret = ix86_expand_vec_perm_const_1 (&d);
41957 end_sequence ();
41958
41959 return ret;
41960 }
41961
41962 void
41963 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
41964 {
41965 struct expand_vec_perm_d d;
41966 unsigned i, nelt;
41967
41968 d.target = targ;
41969 d.op0 = op0;
41970 d.op1 = op1;
41971 d.vmode = GET_MODE (targ);
41972 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41973 d.one_operand_p = false;
41974 d.testing_p = false;
41975
41976 for (i = 0; i < nelt; ++i)
41977 d.perm[i] = i * 2 + odd;
41978
41979 /* We'll either be able to implement the permutation directly... */
41980 if (expand_vec_perm_1 (&d))
41981 return;
41982
41983 /* ... or we use the special-case patterns. */
41984 expand_vec_perm_even_odd_1 (&d, odd);
41985 }
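
/* Illustrative sketch (not part of the original file): a scalar model of
   the selector built above.  Element i of the result is element i*2+odd of
   the concatenation of OP0 and OP1; e.g. for V4SImode and odd = 1 the
   selector is {1,3,5,7}.  The plain-array model only shows which elements
   are picked, not the instructions that are actually emitted.  */

static void
model_extract_even_odd (const int *op0, const int *op1, int *dst,
                        unsigned nelt, unsigned odd)
{
  unsigned i;
  for (i = 0; i < nelt; ++i)
    {
      unsigned sel = i * 2 + odd;       /* Same value as d.perm[i] above.  */
      dst[i] = sel < nelt ? op0[sel] : op1[sel - nelt];
    }
}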
41986
41987 static void
41988 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
41989 {
41990 struct expand_vec_perm_d d;
41991 unsigned i, nelt, base;
41992 bool ok;
41993
41994 d.target = targ;
41995 d.op0 = op0;
41996 d.op1 = op1;
41997 d.vmode = GET_MODE (targ);
41998 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41999 d.one_operand_p = false;
42000 d.testing_p = false;
42001
42002 base = high_p ? nelt / 2 : 0;
42003 for (i = 0; i < nelt / 2; ++i)
42004 {
42005 d.perm[i * 2] = i + base;
42006 d.perm[i * 2 + 1] = i + base + nelt;
42007 }
42008
42009 /* Note that for AVX this isn't one instruction. */
42010 ok = ix86_expand_vec_perm_const_1 (&d);
42011 gcc_assert (ok);
42012 }
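
/* Illustrative sketch (not part of the original file): the selector built
   above interleaves corresponding halves of OP0 and OP1.  For V4SImode,
   high_p == false yields the selector {0,4,1,5} (low halves) and
   high_p == true yields {2,6,3,7} (high halves), matching the element
   order of the punpckl/punpckh instructions.  */

static void
model_interleave (const int *op0, const int *op1, int *dst,
                  unsigned nelt, int high_p)
{
  unsigned i, base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      dst[i * 2] = op0[base + i];       /* d.perm[i*2]     = i + base         */
      dst[i * 2 + 1] = op1[base + i];   /* d.perm[i*2 + 1] = i + base + nelt  */
    }
}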
42013
42014
42015 /* Expand a vector operation CODE for a V*QImode in terms of the
42016 same operation on V*HImode. */
42017
42018 void
42019 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
42020 {
42021 enum machine_mode qimode = GET_MODE (dest);
42022 enum machine_mode himode;
42023 rtx (*gen_il) (rtx, rtx, rtx);
42024 rtx (*gen_ih) (rtx, rtx, rtx);
42025 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
42026 struct expand_vec_perm_d d;
42027 bool ok, full_interleave;
42028 bool uns_p = false;
42029 int i;
42030
42031 switch (qimode)
42032 {
42033 case V16QImode:
42034 himode = V8HImode;
42035 gen_il = gen_vec_interleave_lowv16qi;
42036 gen_ih = gen_vec_interleave_highv16qi;
42037 break;
42038 case V32QImode:
42039 himode = V16HImode;
42040 gen_il = gen_avx2_interleave_lowv32qi;
42041 gen_ih = gen_avx2_interleave_highv32qi;
42042 break;
42043 default:
42044 gcc_unreachable ();
42045 }
42046
42047 op2_l = op2_h = op2;
42048 switch (code)
42049 {
42050 case MULT:
42051 /* Unpack data such that we've got a source byte in each low byte of
42052 each word. We don't care what goes into the high byte of each word.
42053 Rather than trying to get zero in there, it is most convenient to
42054 let it be a copy of the low byte. */
42055 op2_l = gen_reg_rtx (qimode);
42056 op2_h = gen_reg_rtx (qimode);
42057 emit_insn (gen_il (op2_l, op2, op2));
42058 emit_insn (gen_ih (op2_h, op2, op2));
42059 /* FALLTHRU */
42060
42061 op1_l = gen_reg_rtx (qimode);
42062 op1_h = gen_reg_rtx (qimode);
42063 emit_insn (gen_il (op1_l, op1, op1));
42064 emit_insn (gen_ih (op1_h, op1, op1));
42065 full_interleave = qimode == V16QImode;
42066 break;
42067
42068 case ASHIFT:
42069 case LSHIFTRT:
42070 uns_p = true;
42071 /* FALLTHRU */
42072 case ASHIFTRT:
42073 op1_l = gen_reg_rtx (himode);
42074 op1_h = gen_reg_rtx (himode);
42075 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
42076 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
42077 full_interleave = true;
42078 break;
42079 default:
42080 gcc_unreachable ();
42081 }
42082
42083 /* Perform the operation. */
42084 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
42085 1, OPTAB_DIRECT);
42086 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
42087 1, OPTAB_DIRECT);
42088 gcc_assert (res_l && res_h);
42089
42090 /* Merge the data back into the right place. */
42091 d.target = dest;
42092 d.op0 = gen_lowpart (qimode, res_l);
42093 d.op1 = gen_lowpart (qimode, res_h);
42094 d.vmode = qimode;
42095 d.nelt = GET_MODE_NUNITS (qimode);
42096 d.one_operand_p = false;
42097 d.testing_p = false;
42098
42099 if (full_interleave)
42100 {
42101 /* For SSE2, we used a full interleave, so the desired
42102 results are in the even elements. */
42103 for (i = 0; i < 32; ++i)
42104 d.perm[i] = i * 2;
42105 }
42106 else
42107 {
42108 /* For AVX, the interleave used above was not cross-lane. So the
42109 extraction is of the even elements, but with the second and third quarters swapped.
42110 Happily, that is even one insn shorter than even extraction. */
42111 for (i = 0; i < 32; ++i)
42112 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
42113 }
42114
42115 ok = ix86_expand_vec_perm_const_1 (&d);
42116 gcc_assert (ok);
42117
42118 set_unique_reg_note (get_last_insn (), REG_EQUAL,
42119 gen_rtx_fmt_ee (code, qimode, op1, op2));
42120 }
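
/* Illustrative sketch (not part of the original file): why the MULT case
   above can leave a copy of the low byte in the high byte of each word.
   The low 8 bits of a 16-bit product depend only on the low 8 bits of the
   operands, so after the V*HImode multiply the low byte of every word
   already holds the correct QImode product; the permutation then just
   gathers those bytes.  */

static unsigned char
model_byte_mult (unsigned char a, unsigned char b)
{
  unsigned short wa = (unsigned short) ((a << 8) | a);  /* word with copy   */
  unsigned short wb = (unsigned short) ((b << 8) | b);
  unsigned short prod = (unsigned short) (wa * wb);     /* HImode multiply  */
  return (unsigned char) prod;                          /* == (a * b) & 0xff */
}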
42121
42122 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
42123 if op is CONST_VECTOR with all odd elements equal to their
42124 preceding element. */
42125
42126 static bool
42127 const_vector_equal_evenodd_p (rtx op)
42128 {
42129 enum machine_mode mode = GET_MODE (op);
42130 int i, nunits = GET_MODE_NUNITS (mode);
42131 if (GET_CODE (op) != CONST_VECTOR
42132 || nunits != CONST_VECTOR_NUNITS (op))
42133 return false;
42134 for (i = 0; i < nunits; i += 2)
42135 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
42136 return false;
42137 return true;
42138 }
42139
42140 void
42141 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
42142 bool uns_p, bool odd_p)
42143 {
42144 enum machine_mode mode = GET_MODE (op1);
42145 enum machine_mode wmode = GET_MODE (dest);
42146 rtx x;
42147 rtx orig_op1 = op1, orig_op2 = op2;
42148
42149 if (!nonimmediate_operand (op1, mode))
42150 op1 = force_reg (mode, op1);
42151 if (!nonimmediate_operand (op2, mode))
42152 op2 = force_reg (mode, op2);
42153
42154 /* We only play even/odd games with vectors of SImode. */
42155 gcc_assert (mode == V4SImode || mode == V8SImode);
42156
42157 /* If we're looking for the odd results, shift those members down to
42158 the even slots. For some cpus this is faster than a PSHUFD. */
42159 if (odd_p)
42160 {
42161 /* For XOP use vpmacsdqh, but only for smult, as it is only
42162 signed. */
42163 if (TARGET_XOP && mode == V4SImode && !uns_p)
42164 {
42165 x = force_reg (wmode, CONST0_RTX (wmode));
42166 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
42167 return;
42168 }
42169
42170 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
42171 if (!const_vector_equal_evenodd_p (orig_op1))
42172 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
42173 x, NULL, 1, OPTAB_DIRECT);
42174 if (!const_vector_equal_evenodd_p (orig_op2))
42175 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
42176 x, NULL, 1, OPTAB_DIRECT);
42177 op1 = gen_lowpart (mode, op1);
42178 op2 = gen_lowpart (mode, op2);
42179 }
42180
42181 if (mode == V8SImode)
42182 {
42183 if (uns_p)
42184 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
42185 else
42186 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
42187 }
42188 else if (uns_p)
42189 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
42190 else if (TARGET_SSE4_1)
42191 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
42192 else
42193 {
42194 rtx s1, s2, t0, t1, t2;
42195
42196 /* The easiest way to implement this without PMULDQ is to go through
42197 the motions as if we were performing a full 64-bit multiply, except
42198 that we need to do less shuffling of the elements. */
42199
42200 /* Compute the sign-extension, aka highparts, of the two operands. */
42201 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
42202 op1, pc_rtx, pc_rtx);
42203 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
42204 op2, pc_rtx, pc_rtx);
42205
42206 /* Multiply LO(A) * HI(B), and vice-versa. */
42207 t1 = gen_reg_rtx (wmode);
42208 t2 = gen_reg_rtx (wmode);
42209 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
42210 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
42211
42212 /* Multiply LO(A) * LO(B). */
42213 t0 = gen_reg_rtx (wmode);
42214 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
42215
42216 /* Combine and shift the highparts into place. */
42217 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
42218 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
42219 1, OPTAB_DIRECT);
42220
42221 /* Combine high and low parts. */
42222 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
42223 return;
42224 }
42225 emit_insn (x);
42226 }
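
/* Illustrative sketch (not part of the original file): the PMULDQ-less
   fallback above relies on the identity below, where the comparison
   results S1/S2 play the role of the operands' sign "highparts".  The
   scalar model assumes <stdint.h> and GCC's modulo-2^64 conversion from
   unsigned to signed 64-bit values.  */

#include <stdint.h>

static int64_t
model_signed_widen_mult (int32_t a, int32_t b)
{
  uint64_t lo = (uint64_t) (uint32_t) a * (uint32_t) b;   /* unsigned widen */
  uint32_t hi = (a < 0 ? (uint32_t) b : 0)                /* sign(a) * b    */
                + (b < 0 ? (uint32_t) a : 0);             /* sign(b) * a    */
  return (int64_t) (lo - ((uint64_t) hi << 32));          /* == (int64_t) a * b */
}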
42227
42228 void
42229 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
42230 bool uns_p, bool high_p)
42231 {
42232 enum machine_mode wmode = GET_MODE (dest);
42233 enum machine_mode mode = GET_MODE (op1);
42234 rtx t1, t2, t3, t4, mask;
42235
42236 switch (mode)
42237 {
42238 case V4SImode:
42239 t1 = gen_reg_rtx (mode);
42240 t2 = gen_reg_rtx (mode);
42241 if (TARGET_XOP && !uns_p)
42242 {
42243 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
42244 shuffle the elements once so that all elements are in the right
42245 place for immediate use: { A C B D }. */
42246 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
42247 const1_rtx, GEN_INT (3)));
42248 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
42249 const1_rtx, GEN_INT (3)));
42250 }
42251 else
42252 {
42253 /* Put the elements into place for the multiply. */
42254 ix86_expand_vec_interleave (t1, op1, op1, high_p);
42255 ix86_expand_vec_interleave (t2, op2, op2, high_p);
42256 high_p = false;
42257 }
42258 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
42259 break;
42260
42261 case V8SImode:
42262 /* Shuffle the elements between the lanes. After this we
42263 have { A B E F | C D G H } for each operand. */
42264 t1 = gen_reg_rtx (V4DImode);
42265 t2 = gen_reg_rtx (V4DImode);
42266 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
42267 const0_rtx, const2_rtx,
42268 const1_rtx, GEN_INT (3)));
42269 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
42270 const0_rtx, const2_rtx,
42271 const1_rtx, GEN_INT (3)));
42272
42273 /* Shuffle the elements within the lanes. After this we
42274 have { A A B B | C C D D } or { E E F F | G G H H }. */
42275 t3 = gen_reg_rtx (V8SImode);
42276 t4 = gen_reg_rtx (V8SImode);
42277 mask = GEN_INT (high_p
42278 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
42279 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
42280 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
42281 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
42282
42283 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
42284 break;
42285
42286 case V8HImode:
42287 case V16HImode:
42288 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
42289 uns_p, OPTAB_DIRECT);
42290 t2 = expand_binop (mode,
42291 uns_p ? umul_highpart_optab : smul_highpart_optab,
42292 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
42293 gcc_assert (t1 && t2);
42294
42295 t3 = gen_reg_rtx (mode);
42296 ix86_expand_vec_interleave (t3, t1, t2, high_p);
42297 emit_move_insn (dest, gen_lowpart (wmode, t3));
42298 break;
42299
42300 case V16QImode:
42301 case V32QImode:
42302 t1 = gen_reg_rtx (wmode);
42303 t2 = gen_reg_rtx (wmode);
42304 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
42305 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
42306
42307 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
42308 break;
42309
42310 default:
42311 gcc_unreachable ();
42312 }
42313 }
42314
42315 void
42316 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
42317 {
42318 rtx res_1, res_2, res_3, res_4;
42319
42320 res_1 = gen_reg_rtx (V4SImode);
42321 res_2 = gen_reg_rtx (V4SImode);
42322 res_3 = gen_reg_rtx (V2DImode);
42323 res_4 = gen_reg_rtx (V2DImode);
42324 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
42325 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
42326
42327 /* Move the results in element 2 down to element 1; we don't care
42328 what goes in elements 2 and 3. Then we can merge the parts
42329 back together with an interleave.
42330
42331 Note that two other sequences were tried:
42332 (1) Use interleaves at the start instead of psrldq, which allows
42333 us to use a single shufps to merge things back at the end.
42334 (2) Use shufps here to combine the two vectors, then pshufd to
42335 put the elements in the correct order.
42336 In both cases the cost of the reformatting stall was too high
42337 and the overall sequence slower. */
42338
42339 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
42340 const0_rtx, const2_rtx,
42341 const0_rtx, const0_rtx));
42342 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
42343 const0_rtx, const2_rtx,
42344 const0_rtx, const0_rtx));
42345 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
42346
42347 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
42348 }
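
/* Illustrative sketch (not part of the original file): each SImode lane of
   the result is simply the low 32 bits of the corresponding unsigned
   widened product, which is why only the low halves of the two even/odd
   PMULUDQ results have to be shuffled back together above.  */

static unsigned int
model_mulv4si_lane (unsigned int a, unsigned int b)
{
  unsigned long long wide = (unsigned long long) a * b;  /* widened product */
  return (unsigned int) wide;                            /* low half only   */
}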
42349
42350 void
42351 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
42352 {
42353 enum machine_mode mode = GET_MODE (op0);
42354 rtx t1, t2, t3, t4, t5, t6;
42355
42356 if (TARGET_XOP && mode == V2DImode)
42357 {
42358 /* op1: A,B,C,D, op2: E,F,G,H */
42359 op1 = gen_lowpart (V4SImode, op1);
42360 op2 = gen_lowpart (V4SImode, op2);
42361
42362 t1 = gen_reg_rtx (V4SImode);
42363 t2 = gen_reg_rtx (V4SImode);
42364 t3 = gen_reg_rtx (V2DImode);
42365 t4 = gen_reg_rtx (V2DImode);
42366
42367 /* t1: B,A,D,C */
42368 emit_insn (gen_sse2_pshufd_1 (t1, op1,
42369 GEN_INT (1),
42370 GEN_INT (0),
42371 GEN_INT (3),
42372 GEN_INT (2)));
42373
42374 /* t2: (B*E),(A*F),(D*G),(C*H) */
42375 emit_insn (gen_mulv4si3 (t2, t1, op2));
42376
42377 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
42378 emit_insn (gen_xop_phadddq (t3, t2));
42379
42380 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
42381 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
42382
42383 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
42384 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
42385 }
42386 else
42387 {
42388 enum machine_mode nmode;
42389 rtx (*umul) (rtx, rtx, rtx);
42390
42391 if (mode == V2DImode)
42392 {
42393 umul = gen_vec_widen_umult_even_v4si;
42394 nmode = V4SImode;
42395 }
42396 else if (mode == V4DImode)
42397 {
42398 umul = gen_vec_widen_umult_even_v8si;
42399 nmode = V8SImode;
42400 }
42401 else
42402 gcc_unreachable ();
42403
42404
42405 /* Multiply low parts. */
42406 t1 = gen_reg_rtx (mode);
42407 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
42408
42409 /* Shift input vectors right 32 bits so we can multiply high parts. */
42410 t6 = GEN_INT (32);
42411 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
42412 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
42413
42414 /* Multiply high parts by low parts. */
42415 t4 = gen_reg_rtx (mode);
42416 t5 = gen_reg_rtx (mode);
42417 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
42418 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
42419
42420 /* Combine and shift the highparts back. */
42421 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
42422 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
42423
42424 /* Combine high and low parts. */
42425 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
42426 }
42427
42428 set_unique_reg_note (get_last_insn (), REG_EQUAL,
42429 gen_rtx_MULT (mode, op1, op2));
42430 }
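
/* Illustrative sketch (not part of the original file): the non-XOP path
   above uses the schoolbook decomposition, dropping the hi(a)*hi(b) term
   because it only contributes to bits 64 and up:

     (a * b) mod 2^64
       = lo(a)*lo(b) + ((hi(a)*lo(b) + lo(a)*hi(b)) << 32)   (mod 2^64).

   A per-lane scalar model, assuming <stdint.h>:  */

#include <stdint.h>

static uint64_t
model_mulv2di_lane (uint64_t a, uint64_t b)
{
  uint64_t lo_lo = (uint64_t) (uint32_t) a * (uint32_t) b;
  uint64_t hi_lo = (uint64_t) (uint32_t) (a >> 32) * (uint32_t) b;
  uint64_t lo_hi = (uint64_t) (uint32_t) a * (uint32_t) (b >> 32);
  return lo_lo + ((hi_lo + lo_hi) << 32);   /* wraps modulo 2^64 */
}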
42431
42432 /* Return 1 if control transfer instruction INSN
42433 should be encoded with the bnd prefix.
42434 If insn is NULL then return 1 when control
42435 transfer instructions should be prefixed with
42436 bnd by default for the current function. */
42437
42438 bool
42439 ix86_bnd_prefixed_insn_p (rtx insn ATTRIBUTE_UNUSED)
42440 {
42441 return false;
42442 }
42443
42444 /* Calculate integer abs() using only SSE2 instructions. */
42445
42446 void
42447 ix86_expand_sse2_abs (rtx target, rtx input)
42448 {
42449 enum machine_mode mode = GET_MODE (target);
42450 rtx tmp0, tmp1, x;
42451
42452 switch (mode)
42453 {
42454 /* For 32-bit signed integer X, the best way to calculate the absolute
42455 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
42456 case V4SImode:
42457 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
42458 GEN_INT (GET_MODE_BITSIZE
42459 (GET_MODE_INNER (mode)) - 1),
42460 NULL, 0, OPTAB_DIRECT);
42461 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
42462 NULL, 0, OPTAB_DIRECT);
42463 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
42464 target, 0, OPTAB_DIRECT);
42465 break;
42466
42467 /* For 16-bit signed integer X, the best way to calculate the absolute
42468 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
42469 case V8HImode:
42470 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
42471
42472 x = expand_simple_binop (mode, SMAX, tmp0, input,
42473 target, 0, OPTAB_DIRECT);
42474 break;
42475
42476 /* For 8-bit signed integer X, the best way to calculate the absolute
42477 value of X is min ((unsigned char) X, (unsigned char) (-X)),
42478 as SSE2 provides the PMINUB insn. */
42479 case V16QImode:
42480 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
42481
42482 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
42483 target, 0, OPTAB_DIRECT);
42484 break;
42485
42486 default:
42487 gcc_unreachable ();
42488 }
42489
42490 if (x != target)
42491 emit_move_insn (target, x);
42492 }
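
/* Illustrative sketch (not part of the original file): scalar form of the
   V4SImode branch above.  With s = x >> 31 (an arithmetic shift, as GCC
   implements it), s is 0 for non-negative x and -1 for negative x, so
   (x ^ s) - s flips the bits and adds one exactly when x is negative.
   The subtraction is done in unsigned arithmetic so the model, like the
   vector code, simply wraps for INT_MIN.  */

static int
model_abs32 (int x)
{
  int s = x >> 31;                              /* 0 or -1 */
  return (int) (((unsigned int) x ^ (unsigned int) s) - (unsigned int) s);
}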
42493
42494 /* Expand an insert into a vector register through pinsr insn.
42495 Return true if successful. */
42496
42497 bool
42498 ix86_expand_pinsr (rtx *operands)
42499 {
42500 rtx dst = operands[0];
42501 rtx src = operands[3];
42502
42503 unsigned int size = INTVAL (operands[1]);
42504 unsigned int pos = INTVAL (operands[2]);
42505
42506 if (GET_CODE (dst) == SUBREG)
42507 {
42508 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
42509 dst = SUBREG_REG (dst);
42510 }
42511
42512 if (GET_CODE (src) == SUBREG)
42513 src = SUBREG_REG (src);
42514
42515 switch (GET_MODE (dst))
42516 {
42517 case V16QImode:
42518 case V8HImode:
42519 case V4SImode:
42520 case V2DImode:
42521 {
42522 enum machine_mode srcmode, dstmode;
42523 rtx (*pinsr)(rtx, rtx, rtx, rtx);
42524
42525 srcmode = mode_for_size (size, MODE_INT, 0);
42526
42527 switch (srcmode)
42528 {
42529 case QImode:
42530 if (!TARGET_SSE4_1)
42531 return false;
42532 dstmode = V16QImode;
42533 pinsr = gen_sse4_1_pinsrb;
42534 break;
42535
42536 case HImode:
42537 if (!TARGET_SSE2)
42538 return false;
42539 dstmode = V8HImode;
42540 pinsr = gen_sse2_pinsrw;
42541 break;
42542
42543 case SImode:
42544 if (!TARGET_SSE4_1)
42545 return false;
42546 dstmode = V4SImode;
42547 pinsr = gen_sse4_1_pinsrd;
42548 break;
42549
42550 case DImode:
42551 gcc_assert (TARGET_64BIT);
42552 if (!TARGET_SSE4_1)
42553 return false;
42554 dstmode = V2DImode;
42555 pinsr = gen_sse4_1_pinsrq;
42556 break;
42557
42558 default:
42559 return false;
42560 }
42561
42562 rtx d = dst;
42563 if (GET_MODE (dst) != dstmode)
42564 d = gen_reg_rtx (dstmode);
42565 src = gen_lowpart (srcmode, src);
42566
42567 pos /= size;
42568
42569 emit_insn (pinsr (d, gen_lowpart (dstmode, dst), src,
42570 GEN_INT (1 << pos)));
42571 if (d != dst)
42572 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
42573 return true;
42574 }
42575
42576 default:
42577 return false;
42578 }
42579 }
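
/* Illustrative sketch (not part of the original file): a user-level view of
   what a single pinsrd does.  The SSE4.1 intrinsic below (from
   <smmintrin.h>, assuming -msse4.1) replaces one SImode element of a
   vector; the intrinsic itself is routed through its own builtin, so this
   is only an illustration of the insertion the expander emits.  */

#include <smmintrin.h>

static __m128i
set_third_lane (__m128i v, int x)
{
  return _mm_insert_epi32 (v, x, 2);    /* replace element 2 */
}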
42580 \f
42581 /* This function returns the calling-ABI-specific va_list type node.
42582 It returns the FNDECL-specific va_list type. */
42583
42584 static tree
42585 ix86_fn_abi_va_list (tree fndecl)
42586 {
42587 if (!TARGET_64BIT)
42588 return va_list_type_node;
42589 gcc_assert (fndecl != NULL_TREE);
42590
42591 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
42592 return ms_va_list_type_node;
42593 else
42594 return sysv_va_list_type_node;
42595 }
42596
42597 /* Returns the canonical va_list type specified by TYPE. If there
42598 is no valid TYPE provided, it returns NULL_TREE. */
42599
42600 static tree
42601 ix86_canonical_va_list_type (tree type)
42602 {
42603 tree wtype, htype;
42604
42605 /* Resolve references and pointers to va_list type. */
42606 if (TREE_CODE (type) == MEM_REF)
42607 type = TREE_TYPE (type);
42608 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
42609 type = TREE_TYPE (type);
42610 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
42611 type = TREE_TYPE (type);
42612
42613 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
42614 {
42615 wtype = va_list_type_node;
42616 gcc_assert (wtype != NULL_TREE);
42617 htype = type;
42618 if (TREE_CODE (wtype) == ARRAY_TYPE)
42619 {
42620 /* If va_list is an array type, the argument may have decayed
42621 to a pointer type, e.g. by being passed to another function.
42622 In that case, unwrap both types so that we can compare the
42623 underlying records. */
42624 if (TREE_CODE (htype) == ARRAY_TYPE
42625 || POINTER_TYPE_P (htype))
42626 {
42627 wtype = TREE_TYPE (wtype);
42628 htype = TREE_TYPE (htype);
42629 }
42630 }
42631 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42632 return va_list_type_node;
42633 wtype = sysv_va_list_type_node;
42634 gcc_assert (wtype != NULL_TREE);
42635 htype = type;
42636 if (TREE_CODE (wtype) == ARRAY_TYPE)
42637 {
42638 /* If va_list is an array type, the argument may have decayed
42639 to a pointer type, e.g. by being passed to another function.
42640 In that case, unwrap both types so that we can compare the
42641 underlying records. */
42642 if (TREE_CODE (htype) == ARRAY_TYPE
42643 || POINTER_TYPE_P (htype))
42644 {
42645 wtype = TREE_TYPE (wtype);
42646 htype = TREE_TYPE (htype);
42647 }
42648 }
42649 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42650 return sysv_va_list_type_node;
42651 wtype = ms_va_list_type_node;
42652 gcc_assert (wtype != NULL_TREE);
42653 htype = type;
42654 if (TREE_CODE (wtype) == ARRAY_TYPE)
42655 {
42656 /* If va_list is an array type, the argument may have decayed
42657 to a pointer type, e.g. by being passed to another function.
42658 In that case, unwrap both types so that we can compare the
42659 underlying records. */
42660 if (TREE_CODE (htype) == ARRAY_TYPE
42661 || POINTER_TYPE_P (htype))
42662 {
42663 wtype = TREE_TYPE (wtype);
42664 htype = TREE_TYPE (htype);
42665 }
42666 }
42667 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42668 return ms_va_list_type_node;
42669 return NULL_TREE;
42670 }
42671 return std_canonical_va_list_type (type);
42672 }
42673
42674 /* Iterate through the target-specific builtin types for va_list.
42675 IDX denotes the iterator, *PTREE is set to the result type of
42676 the va_list builtin, and *PNAME to its internal type.
42677 Returns zero if there is no element for this index, otherwise
42678 IDX should be increased upon the next call.
42679 Note, do not iterate a base builtin's name like __builtin_va_list.
42680 Used from c_common_nodes_and_builtins. */
42681
42682 static int
42683 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
42684 {
42685 if (TARGET_64BIT)
42686 {
42687 switch (idx)
42688 {
42689 default:
42690 break;
42691
42692 case 0:
42693 *ptree = ms_va_list_type_node;
42694 *pname = "__builtin_ms_va_list";
42695 return 1;
42696
42697 case 1:
42698 *ptree = sysv_va_list_type_node;
42699 *pname = "__builtin_sysv_va_list";
42700 return 1;
42701 }
42702 }
42703
42704 return 0;
42705 }
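
/* Illustrative sketch (not part of the original file): the user-visible
   side of the two 64-bit va_list flavours enumerated above.  A function
   using the Microsoft calling convention takes its variable arguments
   through the ms_va_list type; builtin names are those used by the
   mingw-w64 headers, and exact availability depends on the GCC version.  */

static int __attribute__ ((ms_abi))
sum_ints_ms (int count, ...)
{
  __builtin_ms_va_list ap;
  int i, total = 0;

  __builtin_ms_va_start (ap, count);
  for (i = 0; i < count; i++)
    total += __builtin_va_arg (ap, int);
  __builtin_ms_va_end (ap);
  return total;
}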
42706
42707 #undef TARGET_SCHED_DISPATCH
42708 #define TARGET_SCHED_DISPATCH has_dispatch
42709 #undef TARGET_SCHED_DISPATCH_DO
42710 #define TARGET_SCHED_DISPATCH_DO do_dispatch
42711 #undef TARGET_SCHED_REASSOCIATION_WIDTH
42712 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
42713 #undef TARGET_SCHED_REORDER
42714 #define TARGET_SCHED_REORDER ix86_sched_reorder
42715 #undef TARGET_SCHED_ADJUST_PRIORITY
42716 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
42717 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
42718 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
42719 ix86_dependencies_evaluation_hook
42720
42721 /* The size of the dispatch window is the total number of bytes of
42722 object code allowed in a window. */
42723 #define DISPATCH_WINDOW_SIZE 16
42724
42725 /* Number of dispatch windows considered for scheduling. */
42726 #define MAX_DISPATCH_WINDOWS 3
42727
42728 /* Maximum number of instructions in a window. */
42729 #define MAX_INSN 4
42730
42731 /* Maximum number of immediate operands in a window. */
42732 #define MAX_IMM 4
42733
42734 /* Maximum number of immediate bits allowed in a window. */
42735 #define MAX_IMM_SIZE 128
42736
42737 /* Maximum number of 32 bit immediates allowed in a window. */
42738 #define MAX_IMM_32 4
42739
42740 /* Maximum number of 64 bit immediates allowed in a window. */
42741 #define MAX_IMM_64 2
42742
42743 /* Maximum total of loads or prefetches allowed in a window. */
42744 #define MAX_LOAD 2
42745
42746 /* Maximum total of stores allowed in a window. */
42747 #define MAX_STORE 1
42748
42749 #undef BIG
42750 #define BIG 100
42751
42752
42753 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
42754 enum dispatch_group {
42755 disp_no_group = 0,
42756 disp_load,
42757 disp_store,
42758 disp_load_store,
42759 disp_prefetch,
42760 disp_imm,
42761 disp_imm_32,
42762 disp_imm_64,
42763 disp_branch,
42764 disp_cmp,
42765 disp_jcc,
42766 disp_last
42767 };
42768
42769 /* Number of allowable groups in a dispatch window. It is an array
42770 indexed by the dispatch_group enum. 100 is used as a big number
42771 because the number of these kinds of operations does not have any
42772 effect on the dispatch window, but we need entries for them for other
42773 reasons in the table. */
42774 static unsigned int num_allowable_groups[disp_last] = {
42775 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
42776 };
42777
42778 char group_name[disp_last + 1][16] = {
42779 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
42780 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
42781 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
42782 };
42783
42784 /* Instruction path. */
42785 enum insn_path {
42786 no_path = 0,
42787 path_single, /* Single micro op. */
42788 path_double, /* Double micro op. */
42789 path_multi, /* Instructions with more than 2 micro ops. */
42790 last_path
42791 };
42792
42793 /* sched_insn_info defines a window of the instructions scheduled in
42794 the basic block. It contains a pointer to the insn_info table and
42795 the instruction scheduled.
42796
42797 Windows are allocated for each basic block and are linked
42798 together. */
42799 typedef struct sched_insn_info_s {
42800 rtx insn;
42801 enum dispatch_group group;
42802 enum insn_path path;
42803 int byte_len;
42804 int imm_bytes;
42805 } sched_insn_info;
42806
42807 /* Linked list of dispatch windows. This is a two-way list of
42808 dispatch windows of a basic block. It contains information about
42809 the number of uops in the window and the total number of
42810 instructions and of bytes in the object code for this dispatch
42811 window. */
42812 typedef struct dispatch_windows_s {
42813 int num_insn; /* Number of insn in the window. */
42814 int num_uops; /* Number of uops in the window. */
42815 int window_size; /* Number of bytes in the window. */
42816 int window_num; /* Window number, either 0 or 1. */
42817 int num_imm; /* Number of immediates in an insn. */
42818 int num_imm_32; /* Number of 32 bit immediates in an insn. */
42819 int num_imm_64; /* Number of 64 bit immediates in an insn. */
42820 int imm_size; /* Total immediates in the window. */
42821 int num_loads; /* Total memory loads in the window. */
42822 int num_stores; /* Total memory stores in the window. */
42823 int violation; /* Violation exists in window. */
42824 sched_insn_info *window; /* Pointer to the window. */
42825 struct dispatch_windows_s *next;
42826 struct dispatch_windows_s *prev;
42827 } dispatch_windows;
42828
42829 /* Immediate values used in an insn. */
42830 typedef struct imm_info_s
42831 {
42832 int imm;
42833 int imm32;
42834 int imm64;
42835 } imm_info;
42836
42837 static dispatch_windows *dispatch_window_list;
42838 static dispatch_windows *dispatch_window_list1;
42839
42840 /* Get dispatch group of insn. */
42841
42842 static enum dispatch_group
42843 get_mem_group (rtx insn)
42844 {
42845 enum attr_memory memory;
42846
42847 if (INSN_CODE (insn) < 0)
42848 return disp_no_group;
42849 memory = get_attr_memory (insn);
42850 if (memory == MEMORY_STORE)
42851 return disp_store;
42852
42853 if (memory == MEMORY_LOAD)
42854 return disp_load;
42855
42856 if (memory == MEMORY_BOTH)
42857 return disp_load_store;
42858
42859 return disp_no_group;
42860 }
42861
42862 /* Return true if insn is a compare instruction. */
42863
42864 static bool
42865 is_cmp (rtx insn)
42866 {
42867 enum attr_type type;
42868
42869 type = get_attr_type (insn);
42870 return (type == TYPE_TEST
42871 || type == TYPE_ICMP
42872 || type == TYPE_FCMP
42873 || GET_CODE (PATTERN (insn)) == COMPARE);
42874 }
42875
42876 /* Return true if a dispatch violation was encountered. */
42877
42878 static bool
42879 dispatch_violation (void)
42880 {
42881 if (dispatch_window_list->next)
42882 return dispatch_window_list->next->violation;
42883 return dispatch_window_list->violation;
42884 }
42885
42886 /* Return true if insn is a branch instruction. */
42887
42888 static bool
42889 is_branch (rtx insn)
42890 {
42891 return (CALL_P (insn) || JUMP_P (insn));
42892 }
42893
42894 /* Return true if insn is a prefetch instruction. */
42895
42896 static bool
42897 is_prefetch (rtx insn)
42898 {
42899 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
42900 }
42901
42902 /* This function initializes a dispatch window and the list container holding a
42903 pointer to the window. */
42904
42905 static void
42906 init_window (int window_num)
42907 {
42908 int i;
42909 dispatch_windows *new_list;
42910
42911 if (window_num == 0)
42912 new_list = dispatch_window_list;
42913 else
42914 new_list = dispatch_window_list1;
42915
42916 new_list->num_insn = 0;
42917 new_list->num_uops = 0;
42918 new_list->window_size = 0;
42919 new_list->next = NULL;
42920 new_list->prev = NULL;
42921 new_list->window_num = window_num;
42922 new_list->num_imm = 0;
42923 new_list->num_imm_32 = 0;
42924 new_list->num_imm_64 = 0;
42925 new_list->imm_size = 0;
42926 new_list->num_loads = 0;
42927 new_list->num_stores = 0;
42928 new_list->violation = false;
42929
42930 for (i = 0; i < MAX_INSN; i++)
42931 {
42932 new_list->window[i].insn = NULL;
42933 new_list->window[i].group = disp_no_group;
42934 new_list->window[i].path = no_path;
42935 new_list->window[i].byte_len = 0;
42936 new_list->window[i].imm_bytes = 0;
42937 }
42938 return;
42939 }
42940
42941 /* This function allocates and initializes a dispatch window and the
42942 list container holding a pointer to the window. */
42943
42944 static dispatch_windows *
42945 allocate_window (void)
42946 {
42947 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
42948 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
42949
42950 return new_list;
42951 }
42952
42953 /* This routine initializes the dispatch scheduling information. It
42954 initiates building dispatch scheduler tables and constructs the
42955 first dispatch window. */
42956
42957 static void
42958 init_dispatch_sched (void)
42959 {
42960 /* Allocate a dispatch list and a window. */
42961 dispatch_window_list = allocate_window ();
42962 dispatch_window_list1 = allocate_window ();
42963 init_window (0);
42964 init_window (1);
42965 }
42966
42967 /* This function returns true if a branch is detected. End of a basic block
42968 does not have to be a branch, but here we assume only branches end a
42969 window. */
42970
42971 static bool
42972 is_end_basic_block (enum dispatch_group group)
42973 {
42974 return group == disp_branch;
42975 }
42976
42977 /* This function is called when the end of window processing is reached. */
42978
42979 static void
42980 process_end_window (void)
42981 {
42982 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
42983 if (dispatch_window_list->next)
42984 {
42985 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
42986 gcc_assert (dispatch_window_list->window_size
42987 + dispatch_window_list1->window_size <= 48);
42988 init_window (1);
42989 }
42990 init_window (0);
42991 }
42992
42993 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
42994 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
42995 for 48 bytes of instructions. Note that these windows are not dispatch
42996 windows whose sizes are DISPATCH_WINDOW_SIZE. */
42997
42998 static dispatch_windows *
42999 allocate_next_window (int window_num)
43000 {
43001 if (window_num == 0)
43002 {
43003 if (dispatch_window_list->next)
43004 init_window (1);
43005 init_window (0);
43006 return dispatch_window_list;
43007 }
43008
43009 dispatch_window_list->next = dispatch_window_list1;
43010 dispatch_window_list1->prev = dispatch_window_list;
43011
43012 return dispatch_window_list1;
43013 }
43014
43015 /* Increment the immediate operand counters for a sub-rtx of an instruction. */
43016
43017 static int
43018 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
43019 {
43020 if (*in_rtx == 0)
43021 return 0;
43022
43023 switch ( GET_CODE (*in_rtx))
43024 {
43025 case CONST:
43026 case SYMBOL_REF:
43027 case CONST_INT:
43028 (imm_values->imm)++;
43029 if (x86_64_immediate_operand (*in_rtx, SImode))
43030 (imm_values->imm32)++;
43031 else
43032 (imm_values->imm64)++;
43033 break;
43034
43035 case CONST_DOUBLE:
43036 (imm_values->imm)++;
43037 (imm_values->imm64)++;
43038 break;
43039
43040 case CODE_LABEL:
43041 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
43042 {
43043 (imm_values->imm)++;
43044 (imm_values->imm32)++;
43045 }
43046 break;
43047
43048 default:
43049 break;
43050 }
43051
43052 return 0;
43053 }
43054
43055 /* Compute number of immediate operands of an instruction. */
43056
43057 static void
43058 find_constant (rtx in_rtx, imm_info *imm_values)
43059 {
43060 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
43061 (rtx_function) find_constant_1, (void *) imm_values);
43062 }
43063
43064 /* Return total size of immediate operands of an instruction along with number
43065 of corresponding immediate operands. It initializes its parameters to zero
43066 before calling FIND_CONSTANT.
43067 INSN is the input instruction. IMM is the total of immediates.
43068 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
43069 bit immediates. */
43070
43071 static int
43072 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
43073 {
43074 imm_info imm_values = {0, 0, 0};
43075
43076 find_constant (insn, &imm_values);
43077 *imm = imm_values.imm;
43078 *imm32 = imm_values.imm32;
43079 *imm64 = imm_values.imm64;
43080 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
43081 }
43082
43083 /* This function indicates whether an instruction has any immediate
43084 operand. */
43085
43086 static bool
43087 has_immediate (rtx insn)
43088 {
43089 int num_imm_operand;
43090 int num_imm32_operand;
43091 int num_imm64_operand;
43092
43093 if (insn)
43094 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43095 &num_imm64_operand);
43096 return false;
43097 }
43098
43099 /* Return single or double path for instructions. */
43100
43101 static enum insn_path
43102 get_insn_path (rtx insn)
43103 {
43104 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
43105
43106 if ((int)path == 0)
43107 return path_single;
43108
43109 if ((int)path == 1)
43110 return path_double;
43111
43112 return path_multi;
43113 }
43114
43115 /* Return insn dispatch group. */
43116
43117 static enum dispatch_group
43118 get_insn_group (rtx insn)
43119 {
43120 enum dispatch_group group = get_mem_group (insn);
43121 if (group)
43122 return group;
43123
43124 if (is_branch (insn))
43125 return disp_branch;
43126
43127 if (is_cmp (insn))
43128 return disp_cmp;
43129
43130 if (has_immediate (insn))
43131 return disp_imm;
43132
43133 if (is_prefetch (insn))
43134 return disp_prefetch;
43135
43136 return disp_no_group;
43137 }
43138
43139 /* Count number of GROUP restricted instructions in a dispatch
43140 window WINDOW_LIST. */
43141
43142 static int
43143 count_num_restricted (rtx insn, dispatch_windows *window_list)
43144 {
43145 enum dispatch_group group = get_insn_group (insn);
43146 int imm_size;
43147 int num_imm_operand;
43148 int num_imm32_operand;
43149 int num_imm64_operand;
43150
43151 if (group == disp_no_group)
43152 return 0;
43153
43154 if (group == disp_imm)
43155 {
43156 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43157 &num_imm64_operand);
43158 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
43159 || num_imm_operand + window_list->num_imm > MAX_IMM
43160 || (num_imm32_operand > 0
43161 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
43162 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
43163 || (num_imm64_operand > 0
43164 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
43165 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
43166 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
43167 && num_imm64_operand > 0
43168 && ((window_list->num_imm_64 > 0
43169 && window_list->num_insn >= 2)
43170 || window_list->num_insn >= 3)))
43171 return BIG;
43172
43173 return 1;
43174 }
43175
43176 if ((group == disp_load_store
43177 && (window_list->num_loads >= MAX_LOAD
43178 || window_list->num_stores >= MAX_STORE))
43179 || ((group == disp_load
43180 || group == disp_prefetch)
43181 && window_list->num_loads >= MAX_LOAD)
43182 || (group == disp_store
43183 && window_list->num_stores >= MAX_STORE))
43184 return BIG;
43185
43186 return 1;
43187 }
43188
43189 /* This function returns true if insn satisfies dispatch rules on the
43190 last window scheduled. */
43191
43192 static bool
43193 fits_dispatch_window (rtx insn)
43194 {
43195 dispatch_windows *window_list = dispatch_window_list;
43196 dispatch_windows *window_list_next = dispatch_window_list->next;
43197 unsigned int num_restrict;
43198 enum dispatch_group group = get_insn_group (insn);
43199 enum insn_path path = get_insn_path (insn);
43200 int sum;
43201
43202 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
43203 instructions should be given the lowest priority in the
43204 scheduling process in Haifa scheduler to make sure they will be
43205 scheduled in the same dispatch window as the reference to them. */
43206 if (group == disp_jcc || group == disp_cmp)
43207 return false;
43208
43209 /* Check nonrestricted. */
43210 if (group == disp_no_group || group == disp_branch)
43211 return true;
43212
43213 /* Get last dispatch window. */
43214 if (window_list_next)
43215 window_list = window_list_next;
43216
43217 if (window_list->window_num == 1)
43218 {
43219 sum = window_list->prev->window_size + window_list->window_size;
43220
43221 if (sum == 32
43222 || (min_insn_size (insn) + sum) >= 48)
43223 /* Window 1 is full. Go for next window. */
43224 return true;
43225 }
43226
43227 num_restrict = count_num_restricted (insn, window_list);
43228
43229 if (num_restrict > num_allowable_groups[group])
43230 return false;
43231
43232 /* See if it fits in the first window. */
43233 if (window_list->window_num == 0)
43234 {
43235 /* The first window should have only single and double path
43236 uops. */
43237 if (path == path_double
43238 && (window_list->num_uops + 2) > MAX_INSN)
43239 return false;
43240 else if (path != path_single)
43241 return false;
43242 }
43243 return true;
43244 }
43245
43246 /* Add an instruction INSN with NUM_UOPS micro-operations to the
43247 dispatch window WINDOW_LIST. */
43248
43249 static void
43250 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
43251 {
43252 int byte_len = min_insn_size (insn);
43253 int num_insn = window_list->num_insn;
43254 int imm_size;
43255 sched_insn_info *window = window_list->window;
43256 enum dispatch_group group = get_insn_group (insn);
43257 enum insn_path path = get_insn_path (insn);
43258 int num_imm_operand;
43259 int num_imm32_operand;
43260 int num_imm64_operand;
43261
43262 if (!window_list->violation && group != disp_cmp
43263 && !fits_dispatch_window (insn))
43264 window_list->violation = true;
43265
43266 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43267 &num_imm64_operand);
43268
43269 /* Initialize window with new instruction. */
43270 window[num_insn].insn = insn;
43271 window[num_insn].byte_len = byte_len;
43272 window[num_insn].group = group;
43273 window[num_insn].path = path;
43274 window[num_insn].imm_bytes = imm_size;
43275
43276 window_list->window_size += byte_len;
43277 window_list->num_insn = num_insn + 1;
43278 window_list->num_uops = window_list->num_uops + num_uops;
43279 window_list->imm_size += imm_size;
43280 window_list->num_imm += num_imm_operand;
43281 window_list->num_imm_32 += num_imm32_operand;
43282 window_list->num_imm_64 += num_imm64_operand;
43283
43284 if (group == disp_store)
43285 window_list->num_stores += 1;
43286 else if (group == disp_load
43287 || group == disp_prefetch)
43288 window_list->num_loads += 1;
43289 else if (group == disp_load_store)
43290 {
43291 window_list->num_stores += 1;
43292 window_list->num_loads += 1;
43293 }
43294 }
43295
43296 /* Adds a scheduled instruction, INSN, to the current dispatch window.
43297 If the total bytes of instructions or the number of instructions in
43298 the window exceed the allowable limit, it allocates a new window. */
43299
43300 static void
43301 add_to_dispatch_window (rtx insn)
43302 {
43303 int byte_len;
43304 dispatch_windows *window_list;
43305 dispatch_windows *next_list;
43306 dispatch_windows *window0_list;
43307 enum insn_path path;
43308 enum dispatch_group insn_group;
43309 bool insn_fits;
43310 int num_insn;
43311 int num_uops;
43312 int window_num;
43313 int insn_num_uops;
43314 int sum;
43315
43316 if (INSN_CODE (insn) < 0)
43317 return;
43318
43319 byte_len = min_insn_size (insn);
43320 window_list = dispatch_window_list;
43321 next_list = window_list->next;
43322 path = get_insn_path (insn);
43323 insn_group = get_insn_group (insn);
43324
43325 /* Get the last dispatch window. */
43326 if (next_list)
43327 window_list = dispatch_window_list->next;
43328
43329 if (path == path_single)
43330 insn_num_uops = 1;
43331 else if (path == path_double)
43332 insn_num_uops = 2;
43333 else
43334 insn_num_uops = (int) path;
43335
43336 /* If current window is full, get a new window.
43337 Window number zero is full if MAX_INSN uops are scheduled in it.
43338 Window number one is full if window zero's bytes plus window
43339 one's bytes equal 32, or if adding the bytes of the new instruction
43340 to the total makes it greater than 48, or if it already has MAX_INSN
43341 instructions in it. */
43342 num_insn = window_list->num_insn;
43343 num_uops = window_list->num_uops;
43344 window_num = window_list->window_num;
43345 insn_fits = fits_dispatch_window (insn);
43346
43347 if (num_insn >= MAX_INSN
43348 || num_uops + insn_num_uops > MAX_INSN
43349 || !(insn_fits))
43350 {
43351 window_num = ~window_num & 1;
43352 window_list = allocate_next_window (window_num);
43353 }
43354
43355 if (window_num == 0)
43356 {
43357 add_insn_window (insn, window_list, insn_num_uops);
43358 if (window_list->num_insn >= MAX_INSN
43359 && insn_group == disp_branch)
43360 {
43361 process_end_window ();
43362 return;
43363 }
43364 }
43365 else if (window_num == 1)
43366 {
43367 window0_list = window_list->prev;
43368 sum = window0_list->window_size + window_list->window_size;
43369 if (sum == 32
43370 || (byte_len + sum) >= 48)
43371 {
43372 process_end_window ();
43373 window_list = dispatch_window_list;
43374 }
43375
43376 add_insn_window (insn, window_list, insn_num_uops);
43377 }
43378 else
43379 gcc_unreachable ();
43380
43381 if (is_end_basic_block (insn_group))
43382 {
43383 /* End of basic block is reached; do end-of-basic-block processing. */
43384 process_end_window ();
43385 return;
43386 }
43387 }
43388
43389 /* Print the dispatch window, WINDOW_NUM, to FILE. */
43390
43391 DEBUG_FUNCTION static void
43392 debug_dispatch_window_file (FILE *file, int window_num)
43393 {
43394 dispatch_windows *list;
43395 int i;
43396
43397 if (window_num == 0)
43398 list = dispatch_window_list;
43399 else
43400 list = dispatch_window_list1;
43401
43402 fprintf (file, "Window #%d:\n", list->window_num);
43403 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
43404 list->num_insn, list->num_uops, list->window_size);
43405 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
43406 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
43407
43408 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
43409 list->num_stores);
43410 fprintf (file, " insn info:\n");
43411
43412 for (i = 0; i < MAX_INSN; i++)
43413 {
43414 if (!list->window[i].insn)
43415 break;
43416 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
43417 i, group_name[list->window[i].group],
43418 i, (void *)list->window[i].insn,
43419 i, list->window[i].path,
43420 i, list->window[i].byte_len,
43421 i, list->window[i].imm_bytes);
43422 }
43423 }
43424
43425 /* Print to stdout a dispatch window. */
43426
43427 DEBUG_FUNCTION void
43428 debug_dispatch_window (int window_num)
43429 {
43430 debug_dispatch_window_file (stdout, window_num);
43431 }
43432
43433 /* Print INSN dispatch information to FILE. */
43434
43435 DEBUG_FUNCTION static void
43436 debug_insn_dispatch_info_file (FILE *file, rtx insn)
43437 {
43438 int byte_len;
43439 enum insn_path path;
43440 enum dispatch_group group;
43441 int imm_size;
43442 int num_imm_operand;
43443 int num_imm32_operand;
43444 int num_imm64_operand;
43445
43446 if (INSN_CODE (insn) < 0)
43447 return;
43448
43449 byte_len = min_insn_size (insn);
43450 path = get_insn_path (insn);
43451 group = get_insn_group (insn);
43452 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43453 &num_imm64_operand);
43454
43455 fprintf (file, " insn info:\n");
43456 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
43457 group_name[group], path, byte_len);
43458 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
43459 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
43460 }
43461
43462 /* Print to STDOUT the status of the ready list with respect to
43463 dispatch windows. */
43464
43465 DEBUG_FUNCTION void
43466 debug_ready_dispatch (void)
43467 {
43468 int i;
43469 int no_ready = number_in_ready ();
43470
43471 fprintf (stdout, "Number of ready: %d\n", no_ready);
43472
43473 for (i = 0; i < no_ready; i++)
43474 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
43475 }
43476
43477 /* This routine is the driver of the dispatch scheduler. */
43478
43479 static void
43480 do_dispatch (rtx insn, int mode)
43481 {
43482 if (mode == DISPATCH_INIT)
43483 init_dispatch_sched ();
43484 else if (mode == ADD_TO_DISPATCH_WINDOW)
43485 add_to_dispatch_window (insn);
43486 }
43487
43488 /* Return TRUE if Dispatch Scheduling is supported. */
43489
43490 static bool
43491 has_dispatch (rtx insn, int action)
43492 {
43493 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3 || TARGET_BDVER4)
43494 && flag_dispatch_scheduler)
43495 switch (action)
43496 {
43497 default:
43498 return false;
43499
43500 case IS_DISPATCH_ON:
43501 return true;
43502 break;
43503
43504 case IS_CMP:
43505 return is_cmp (insn);
43506
43507 case DISPATCH_VIOLATION:
43508 return dispatch_violation ();
43509
43510 case FITS_DISPATCH_WINDOW:
43511 return fits_dispatch_window (insn);
43512 }
43513
43514 return false;
43515 }
43516
43517 /* Implementation of reassociation_width target hook used by
43518 reassoc phase to identify parallelism level in reassociated
43519 tree. The statement's tree_code is passed in OPC. The arguments'
43520 type is passed in MODE.
43521
43522 Currently parallel reassociation is enabled for Atom
43523 processors only and we set reassociation width to be 2
43524 because Atom may issue up to 2 instructions per cycle.
43525
43526 Return value should be fixed if parallel reassociation is
43527 enabled for other processors. */
43528
43529 static int
43530 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
43531 enum machine_mode mode)
43532 {
43533 int res = 1;
43534
43535 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
43536 res = 2;
43537 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
43538 res = 2;
43539
43540 return res;
43541 }
43542
43543 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
43544 place emms and femms instructions. */
43545
43546 static enum machine_mode
43547 ix86_preferred_simd_mode (enum machine_mode mode)
43548 {
43549 if (!TARGET_SSE)
43550 return word_mode;
43551
43552 switch (mode)
43553 {
43554 case QImode:
43555 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
43556 case HImode:
43557 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
43558 case SImode:
43559 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
43560 case DImode:
43561 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
43562
43563 case SFmode:
43564 if (TARGET_AVX && !TARGET_PREFER_AVX128)
43565 return V8SFmode;
43566 else
43567 return V4SFmode;
43568
43569 case DFmode:
43570 if (!TARGET_VECTORIZE_DOUBLE)
43571 return word_mode;
43572 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
43573 return V4DFmode;
43574 else if (TARGET_SSE2)
43575 return V2DFmode;
43576 /* FALLTHRU */
43577
43578 default:
43579 return word_mode;
43580 }
43581 }
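
/* Illustrative sketch (not part of the original file): the hook above is
   consulted when the vectorizer picks a vector mode for a loop like the
   one below.  Under -O3 -mavx the SFmode elements map to V8SFmode (ymm);
   adding -mprefer-avx128 sets TARGET_PREFER_AVX128 and the same loop is
   vectorized with V4SFmode (xmm) instead.  */

static void
scale_array (float *a, float s, int n)
{
  int i;
  for (i = 0; i < n; i++)
    a[i] *= s;
}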
43582
43583 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
43584 vectors. */
43585
43586 static unsigned int
43587 ix86_autovectorize_vector_sizes (void)
43588 {
43589 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
43590 }
43591
43592 \f
43593
43594 /* Return class of registers which could be used for pseudo of MODE
43595 and of class RCLASS for spilling instead of memory. Return NO_REGS
43596 if it is not possible or not profitable. */
43597 static reg_class_t
43598 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
43599 {
43600 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
43601 && (mode == SImode || (TARGET_64BIT && mode == DImode))
43602 && INTEGER_CLASS_P (rclass))
43603 return ALL_SSE_REGS;
43604 return NO_REGS;
43605 }
43606
43607 /* Implement targetm.vectorize.init_cost. */
43608
43609 static void *
43610 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
43611 {
43612 unsigned *cost = XNEWVEC (unsigned, 3);
43613 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
43614 return cost;
43615 }
43616
43617 /* Implement targetm.vectorize.add_stmt_cost. */
43618
43619 static unsigned
43620 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
43621 struct _stmt_vec_info *stmt_info, int misalign,
43622 enum vect_cost_model_location where)
43623 {
43624 unsigned *cost = (unsigned *) data;
43625 unsigned retval = 0;
43626
43627 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
43628 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
43629
43630 /* Statements in an inner loop relative to the loop being
43631 vectorized are weighted more heavily. The value here is
43632 arbitrary and could potentially be improved with analysis. */
43633 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
43634 count *= 50; /* FIXME. */
43635
43636 retval = (unsigned) (count * stmt_cost);
43637 cost[where] += retval;
43638
43639 return retval;
43640 }
43641
43642 /* Implement targetm.vectorize.finish_cost. */
43643
43644 static void
43645 ix86_finish_cost (void *data, unsigned *prologue_cost,
43646 unsigned *body_cost, unsigned *epilogue_cost)
43647 {
43648 unsigned *cost = (unsigned *) data;
43649 *prologue_cost = cost[vect_prologue];
43650 *body_cost = cost[vect_body];
43651 *epilogue_cost = cost[vect_epilogue];
43652 }
43653
43654 /* Implement targetm.vectorize.destroy_cost_data. */
43655
43656 static void
43657 ix86_destroy_cost_data (void *data)
43658 {
43659 free (data);
43660 }
43661
43662 /* Validate target specific memory model bits in VAL. */
43663
43664 static unsigned HOST_WIDE_INT
43665 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
43666 {
43667 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
43668 bool strong;
43669
43670 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
43671 |MEMMODEL_MASK)
43672 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
43673 {
43674 warning (OPT_Winvalid_memory_model,
43675 "Unknown architecture specific memory model");
43676 return MEMMODEL_SEQ_CST;
43677 }
43678 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
43679 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
43680 {
43681 warning (OPT_Winvalid_memory_model,
43682 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
43683 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
43684 }
43685 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
43686 {
43687 warning (OPT_Winvalid_memory_model,
43688 "HLE_RELEASE not used with RELEASE or stronger memory model");
43689 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
43690 }
43691 return val;
43692 }
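
/* Illustrative sketch (not part of the original file): the HLE bits
   validated above arrive through the memory-model argument of the __atomic
   builtins.  The lock-elided spinlock below follows the example in the GCC
   manual; it assumes -mhle (which provides the __ATOMIC_HLE_ACQUIRE and
   __ATOMIC_HLE_RELEASE macros) and <xmmintrin.h> for _mm_pause.  */

#include <xmmintrin.h>

static int lock_var;

static void
elided_lock_acquire (void)
{
  while (__atomic_exchange_n (&lock_var, 1,
                              __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
    _mm_pause ();       /* spin, aborting the elided transaction */
}

static void
elided_lock_release (void)
{
  __atomic_store_n (&lock_var, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);
}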
43693
43694 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
43695
43696 static bool
43697 ix86_float_exceptions_rounding_supported_p (void)
43698 {
43699 /* For x87 floating point with standard excess precision handling,
43700 there is no adddf3 pattern (since x87 floating point only has
43701 XFmode operations) so the default hook implementation gets this
43702 wrong. */
43703 return TARGET_80387 || TARGET_SSE_MATH;
43704 }
43705
43706 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
43707
43708 static void
43709 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
43710 {
43711 if (!TARGET_80387 && !TARGET_SSE_MATH)
43712 return;
43713 tree exceptions_var = create_tmp_var (integer_type_node, NULL);
43714 if (TARGET_80387)
43715 {
43716 tree fenv_index_type = build_index_type (size_int (6));
43717 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
43718 tree fenv_var = create_tmp_var (fenv_type, NULL);
43719 mark_addressable (fenv_var);
43720 tree fenv_ptr = build_pointer_type (fenv_type);
43721 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
43722 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
43723 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
43724 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
43725 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
43726 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
43727 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
43728 tree hold_fnclex = build_call_expr (fnclex, 0);
43729 *hold = build2 (COMPOUND_EXPR, void_type_node, hold_fnstenv,
43730 hold_fnclex);
43731 *clear = build_call_expr (fnclex, 0);
43732 tree sw_var = create_tmp_var (short_unsigned_type_node, NULL);
43733 mark_addressable (sw_var);
43734 tree su_ptr = build_pointer_type (short_unsigned_type_node);
43735 tree sw_addr = build1 (ADDR_EXPR, su_ptr, sw_var);
43736 tree fnstsw_call = build_call_expr (fnstsw, 1, sw_addr);
43737 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
43738 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
43739 exceptions_var, exceptions_x87);
43740 *update = build2 (COMPOUND_EXPR, integer_type_node,
43741 fnstsw_call, update_mod);
43742 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
43743 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
43744 }
43745 if (TARGET_SSE_MATH)
43746 {
43747 tree mxcsr_orig_var = create_tmp_var (unsigned_type_node, NULL);
43748 tree mxcsr_mod_var = create_tmp_var (unsigned_type_node, NULL);
43749 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
43750 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
43751 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
43752 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
43753 mxcsr_orig_var, stmxcsr_hold_call);
43754 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
43755 mxcsr_orig_var,
43756 build_int_cst (unsigned_type_node, 0x1f80));
43757 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
43758 build_int_cst (unsigned_type_node, 0xffffffc0));
43759 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
43760 mxcsr_mod_var, hold_mod_val);
43761 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
43762 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
43763 hold_assign_orig, hold_assign_mod);
43764 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
43765 ldmxcsr_hold_call);
43766 if (*hold)
43767 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
43768 else
43769 *hold = hold_all;
43770 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
43771 if (*clear)
43772 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
43773 ldmxcsr_clear_call);
43774 else
43775 *clear = ldmxcsr_clear_call;
43776 tree stmxcsr_update_call = build_call_expr (stmxcsr, 0);
43777 tree exceptions_sse = fold_convert (integer_type_node,
43778 stmxcsr_update_call);
43779 if (*update)
43780 {
43781 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
43782 exceptions_var, exceptions_sse);
43783 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
43784 exceptions_var, exceptions_mod);
43785 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
43786 exceptions_assign);
43787 }
43788 else
43789 *update = build2 (MODIFY_EXPR, integer_type_node,
43790 exceptions_var, exceptions_sse);
43791 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
43792 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
43793 ldmxcsr_update_call);
43794 }
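  /* Whatever was collected above is re-raised here; *UPDATE therefore
     ends with a call to __atomic_feraiseexcept on the accumulated
     flags, after the per-unit environments have been restored.  */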
43795 tree atomic_feraiseexcept
43796 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
43797 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
43798 1, exceptions_var);
43799 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
43800 atomic_feraiseexcept_call);
43801 }
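/* As a rough sketch (variable names here are illustrative, not the
   exact trees the C front end builds), an atomic compound assignment
   such as "x /= y" on an _Atomic double expands along the lines of:

	<*hold>;
	old = __atomic_load_n (&x, __ATOMIC_SEQ_CST);
	do
	  newval = old / y;			// may raise exceptions
	while (!__atomic_compare_exchange_n (&x, &old, newval, ...)
	       && (<*clear>, 1));		// discard a failed try's flags
	<*update>;				// restore env, re-raise flags

   so only the iteration whose result is actually stored contributes
   exceptions, as C11 requires.  */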
43802
43803 /* Initialize the GCC target structure. */
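/* Each #undef/#define pair below overrides one hook's default from
   target-def.h; TARGET_INITIALIZER then expands to an aggregate
   initializer built from the macro values in effect where targetm is
   defined at the end of this file.  */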
43804 #undef TARGET_RETURN_IN_MEMORY
43805 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
43806
43807 #undef TARGET_LEGITIMIZE_ADDRESS
43808 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
43809
43810 #undef TARGET_ATTRIBUTE_TABLE
43811 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
43812 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
43813 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
43814 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
43815 # undef TARGET_MERGE_DECL_ATTRIBUTES
43816 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
43817 #endif
43818
43819 #undef TARGET_COMP_TYPE_ATTRIBUTES
43820 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
43821
43822 #undef TARGET_INIT_BUILTINS
43823 #define TARGET_INIT_BUILTINS ix86_init_builtins
43824 #undef TARGET_BUILTIN_DECL
43825 #define TARGET_BUILTIN_DECL ix86_builtin_decl
43826 #undef TARGET_EXPAND_BUILTIN
43827 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
43828
43829 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
43830 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
43831 ix86_builtin_vectorized_function
43832
43833 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
43834 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
43835
43836 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
43837 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
43838
43839 #undef TARGET_VECTORIZE_BUILTIN_GATHER
43840 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
43841
43842 #undef TARGET_BUILTIN_RECIPROCAL
43843 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
43844
43845 #undef TARGET_ASM_FUNCTION_EPILOGUE
43846 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
43847
43848 #undef TARGET_ENCODE_SECTION_INFO
43849 #ifndef SUBTARGET_ENCODE_SECTION_INFO
43850 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
43851 #else
43852 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
43853 #endif
43854
43855 #undef TARGET_ASM_OPEN_PAREN
43856 #define TARGET_ASM_OPEN_PAREN ""
43857 #undef TARGET_ASM_CLOSE_PAREN
43858 #define TARGET_ASM_CLOSE_PAREN ""
43859
43860 #undef TARGET_ASM_BYTE_OP
43861 #define TARGET_ASM_BYTE_OP ASM_BYTE
43862
43863 #undef TARGET_ASM_ALIGNED_HI_OP
43864 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
43865 #undef TARGET_ASM_ALIGNED_SI_OP
43866 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
43867 #ifdef ASM_QUAD
43868 #undef TARGET_ASM_ALIGNED_DI_OP
43869 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
43870 #endif
43871
43872 #undef TARGET_PROFILE_BEFORE_PROLOGUE
43873 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
43874
43875 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
43876 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
43877
43878 #undef TARGET_ASM_UNALIGNED_HI_OP
43879 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
43880 #undef TARGET_ASM_UNALIGNED_SI_OP
43881 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
43882 #undef TARGET_ASM_UNALIGNED_DI_OP
43883 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
43884
43885 #undef TARGET_PRINT_OPERAND
43886 #define TARGET_PRINT_OPERAND ix86_print_operand
43887 #undef TARGET_PRINT_OPERAND_ADDRESS
43888 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
43889 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
43890 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
43891 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
43892 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
43893
43894 #undef TARGET_SCHED_INIT_GLOBAL
43895 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
43896 #undef TARGET_SCHED_ADJUST_COST
43897 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
43898 #undef TARGET_SCHED_ISSUE_RATE
43899 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
43900 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
43901 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
43902 ia32_multipass_dfa_lookahead
43903 #undef TARGET_SCHED_MACRO_FUSION_P
43904 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
43905 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
43906 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
43907
43908 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
43909 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
43910
43911 #undef TARGET_MEMMODEL_CHECK
43912 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
43913
43914 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
43915 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
43916
43917 #ifdef HAVE_AS_TLS
43918 #undef TARGET_HAVE_TLS
43919 #define TARGET_HAVE_TLS true
43920 #endif
43921 #undef TARGET_CANNOT_FORCE_CONST_MEM
43922 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
43923 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
43924 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
43925
43926 #undef TARGET_DELEGITIMIZE_ADDRESS
43927 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
43928
43929 #undef TARGET_MS_BITFIELD_LAYOUT_P
43930 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
43931
43932 #if TARGET_MACHO
43933 #undef TARGET_BINDS_LOCAL_P
43934 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
43935 #endif
43936 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
43937 #undef TARGET_BINDS_LOCAL_P
43938 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
43939 #endif
43940
43941 #undef TARGET_ASM_OUTPUT_MI_THUNK
43942 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
43943 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
43944 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
43945
43946 #undef TARGET_ASM_FILE_START
43947 #define TARGET_ASM_FILE_START x86_file_start
43948
43949 #undef TARGET_OPTION_OVERRIDE
43950 #define TARGET_OPTION_OVERRIDE ix86_option_override
43951
43952 #undef TARGET_REGISTER_MOVE_COST
43953 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
43954 #undef TARGET_MEMORY_MOVE_COST
43955 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
43956 #undef TARGET_RTX_COSTS
43957 #define TARGET_RTX_COSTS ix86_rtx_costs
43958 #undef TARGET_ADDRESS_COST
43959 #define TARGET_ADDRESS_COST ix86_address_cost
43960
43961 #undef TARGET_FIXED_CONDITION_CODE_REGS
43962 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
43963 #undef TARGET_CC_MODES_COMPATIBLE
43964 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
43965
43966 #undef TARGET_MACHINE_DEPENDENT_REORG
43967 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
43968
43969 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
43970 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
43971
43972 #undef TARGET_BUILD_BUILTIN_VA_LIST
43973 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
43974
43975 #undef TARGET_FOLD_BUILTIN
43976 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
43977
43978 #undef TARGET_COMPARE_VERSION_PRIORITY
43979 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
43980
43981 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
43982 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
43983 ix86_generate_version_dispatcher_body
43984
43985 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
43986 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
43987 ix86_get_function_versions_dispatcher
43988
43989 #undef TARGET_ENUM_VA_LIST_P
43990 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
43991
43992 #undef TARGET_FN_ABI_VA_LIST
43993 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
43994
43995 #undef TARGET_CANONICAL_VA_LIST_TYPE
43996 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
43997
43998 #undef TARGET_EXPAND_BUILTIN_VA_START
43999 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
44000
44001 #undef TARGET_MD_ASM_CLOBBERS
44002 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
44003
44004 #undef TARGET_PROMOTE_PROTOTYPES
44005 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
44006 #undef TARGET_STRUCT_VALUE_RTX
44007 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
44008 #undef TARGET_SETUP_INCOMING_VARARGS
44009 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
44010 #undef TARGET_MUST_PASS_IN_STACK
44011 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
44012 #undef TARGET_FUNCTION_ARG_ADVANCE
44013 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
44014 #undef TARGET_FUNCTION_ARG
44015 #define TARGET_FUNCTION_ARG ix86_function_arg
44016 #undef TARGET_FUNCTION_ARG_BOUNDARY
44017 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
44018 #undef TARGET_PASS_BY_REFERENCE
44019 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
44020 #undef TARGET_INTERNAL_ARG_POINTER
44021 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
44022 #undef TARGET_UPDATE_STACK_BOUNDARY
44023 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
44024 #undef TARGET_GET_DRAP_RTX
44025 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
44026 #undef TARGET_STRICT_ARGUMENT_NAMING
44027 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
44028 #undef TARGET_STATIC_CHAIN
44029 #define TARGET_STATIC_CHAIN ix86_static_chain
44030 #undef TARGET_TRAMPOLINE_INIT
44031 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
44032 #undef TARGET_RETURN_POPS_ARGS
44033 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
44034
44035 #undef TARGET_LEGITIMATE_COMBINED_INSN
44036 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
44037
44038 #undef TARGET_ASAN_SHADOW_OFFSET
44039 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
44040
44041 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
44042 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
44043
44044 #undef TARGET_SCALAR_MODE_SUPPORTED_P
44045 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
44046
44047 #undef TARGET_VECTOR_MODE_SUPPORTED_P
44048 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
44049
44050 #undef TARGET_C_MODE_FOR_SUFFIX
44051 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
44052
44053 #ifdef HAVE_AS_TLS
44054 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
44055 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
44056 #endif
44057
44058 #ifdef SUBTARGET_INSERT_ATTRIBUTES
44059 #undef TARGET_INSERT_ATTRIBUTES
44060 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
44061 #endif
44062
44063 #undef TARGET_MANGLE_TYPE
44064 #define TARGET_MANGLE_TYPE ix86_mangle_type
44065
44066 #if !TARGET_MACHO
44067 #undef TARGET_STACK_PROTECT_FAIL
44068 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
44069 #endif
44070
44071 #undef TARGET_FUNCTION_VALUE
44072 #define TARGET_FUNCTION_VALUE ix86_function_value
44073
44074 #undef TARGET_FUNCTION_VALUE_REGNO_P
44075 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
44076
44077 #undef TARGET_PROMOTE_FUNCTION_MODE
44078 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
44079
44080 #undef TARGET_MEMBER_TYPE_FORCES_BLK
44081 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
44082
44083 #undef TARGET_INSTANTIATE_DECLS
44084 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
44085
44086 #undef TARGET_SECONDARY_RELOAD
44087 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
44088
44089 #undef TARGET_CLASS_MAX_NREGS
44090 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
44091
44092 #undef TARGET_PREFERRED_RELOAD_CLASS
44093 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
44094 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
44095 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
44096 #undef TARGET_CLASS_LIKELY_SPILLED_P
44097 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
44098
44099 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
44100 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
44101 ix86_builtin_vectorization_cost
44102 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
44103 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
44104 ix86_vectorize_vec_perm_const_ok
44105 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
44106 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
44107 ix86_preferred_simd_mode
44108 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
44109 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
44110 ix86_autovectorize_vector_sizes
44111 #undef TARGET_VECTORIZE_INIT_COST
44112 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
44113 #undef TARGET_VECTORIZE_ADD_STMT_COST
44114 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
44115 #undef TARGET_VECTORIZE_FINISH_COST
44116 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
44117 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
44118 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
44119
44120 #undef TARGET_SET_CURRENT_FUNCTION
44121 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
44122
44123 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
44124 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
44125
44126 #undef TARGET_OPTION_SAVE
44127 #define TARGET_OPTION_SAVE ix86_function_specific_save
44128
44129 #undef TARGET_OPTION_RESTORE
44130 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
44131
44132 #undef TARGET_OPTION_PRINT
44133 #define TARGET_OPTION_PRINT ix86_function_specific_print
44134
44135 #undef TARGET_OPTION_FUNCTION_VERSIONS
44136 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
44137
44138 #undef TARGET_CAN_INLINE_P
44139 #define TARGET_CAN_INLINE_P ix86_can_inline_p
44140
44141 #undef TARGET_EXPAND_TO_RTL_HOOK
44142 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
44143
44144 #undef TARGET_LEGITIMATE_ADDRESS_P
44145 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
44146
44147 #undef TARGET_LRA_P
44148 #define TARGET_LRA_P hook_bool_void_true
44149
44150 #undef TARGET_REGISTER_PRIORITY
44151 #define TARGET_REGISTER_PRIORITY ix86_register_priority
44152
44153 #undef TARGET_REGISTER_USAGE_LEVELING_P
44154 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
44155
44156 #undef TARGET_LEGITIMATE_CONSTANT_P
44157 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
44158
44159 #undef TARGET_FRAME_POINTER_REQUIRED
44160 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
44161
44162 #undef TARGET_CAN_ELIMINATE
44163 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
44164
44165 #undef TARGET_EXTRA_LIVE_ON_ENTRY
44166 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
44167
44168 #undef TARGET_ASM_CODE_END
44169 #define TARGET_ASM_CODE_END ix86_code_end
44170
44171 #undef TARGET_CONDITIONAL_REGISTER_USAGE
44172 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
44173
44174 #if TARGET_MACHO
44175 #undef TARGET_INIT_LIBFUNCS
44176 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
44177 #endif
44178
44179 #undef TARGET_SPILL_CLASS
44180 #define TARGET_SPILL_CLASS ix86_spill_class
44181
44182 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
44183 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
44184 ix86_float_exceptions_rounding_supported_p
44185
44186 struct gcc_target targetm = TARGET_INITIALIZER;
44187 \f
44188 #include "gt-i386.h"