1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "stringpool.h"
27 #include "attribs.h"
28 #include "calls.h"
29 #include "stor-layout.h"
30 #include "varasm.h"
31 #include "tm_p.h"
32 #include "regs.h"
33 #include "hard-reg-set.h"
34 #include "insn-config.h"
35 #include "conditions.h"
36 #include "output.h"
37 #include "insn-codes.h"
38 #include "insn-attr.h"
39 #include "flags.h"
40 #include "except.h"
41 #include "function.h"
42 #include "recog.h"
43 #include "expr.h"
44 #include "optabs.h"
45 #include "diagnostic-core.h"
46 #include "toplev.h"
47 #include "basic-block.h"
48 #include "ggc.h"
49 #include "target.h"
50 #include "target-def.h"
51 #include "common/common-target.h"
52 #include "langhooks.h"
53 #include "reload.h"
54 #include "cgraph.h"
55 #include "gimple.h"
56 #include "gimplify.h"
57 #include "dwarf2.h"
58 #include "df.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "debug.h"
63 #include "sched-int.h"
64 #include "sbitmap.h"
65 #include "fibheap.h"
66 #include "opts.h"
67 #include "diagnostic.h"
68 #include "dumpfile.h"
69 #include "tree-pass.h"
70 #include "wide-int.h"
71 #include "context.h"
72 #include "pass_manager.h"
73
74 static rtx legitimize_dllimport_symbol (rtx, bool);
75 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
76 static rtx legitimize_pe_coff_symbol (rtx, bool);
77
78 #ifndef CHECK_STACK_LIMIT
79 #define CHECK_STACK_LIMIT (-1)
80 #endif
81
82 /* Return index of given mode in mult and division cost tables. */
83 #define MODE_INDEX(mode) \
84 ((mode) == QImode ? 0 \
85 : (mode) == HImode ? 1 \
86 : (mode) == SImode ? 2 \
87 : (mode) == DImode ? 3 \
88 : 4)
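/* Editorial sketch (not part of the original sources): MODE_INDEX is the
   index used to pick the per-mode entry out of the multiply and divide
   cost arrays in the tables below.  The helper below is hypothetical, and
   the field name mult_init is an assumption about the processor_costs
   layout declared in i386.h.  */

static inline int
example_mult_init_cost (const struct processor_costs *cost,
			enum machine_mode mode)
{
  /* QImode -> 0, HImode -> 1, SImode -> 2, DImode -> 3; every other mode
     (e.g. TImode) falls into the "other" slot at index 4.  */
  return cost->mult_init[MODE_INDEX (mode)];
}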
89
90 /* Processor costs (relative to an add) */
91 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
92 #define COSTS_N_BYTES(N) ((N) * 2)
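/* Worked example (editorial, relying only on the assumption stated above):
   COSTS_N_INSNS (1) == 4 is the cost of one add on the speed scale, and a
   2-byte add gives COSTS_N_BYTES (2) == 4 on the size scale, so the
   baseline instruction carries the same weight on both scales.  An entry
   of COSTS_N_BYTES (3) in the size table below therefore charges an
   operation as if it were 3 bytes, i.e. 1.5 adds.  */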
93
94 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
95
96 static stringop_algs ix86_size_memcpy[2] = {
97 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
98 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
99 static stringop_algs ix86_size_memset[2] = {
100 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
101 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
102
103 const
104 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
105 COSTS_N_BYTES (2), /* cost of an add instruction */
106 COSTS_N_BYTES (3), /* cost of a lea instruction */
107 COSTS_N_BYTES (2), /* variable shift costs */
108 COSTS_N_BYTES (3), /* constant shift costs */
109 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
110 COSTS_N_BYTES (3), /* HI */
111 COSTS_N_BYTES (3), /* SI */
112 COSTS_N_BYTES (3), /* DI */
113 COSTS_N_BYTES (5)}, /* other */
114 0, /* cost of multiply per each bit set */
115 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
116 COSTS_N_BYTES (3), /* HI */
117 COSTS_N_BYTES (3), /* SI */
118 COSTS_N_BYTES (3), /* DI */
119 COSTS_N_BYTES (5)}, /* other */
120 COSTS_N_BYTES (3), /* cost of movsx */
121 COSTS_N_BYTES (3), /* cost of movzx */
122 0, /* "large" insn */
123 2, /* MOVE_RATIO */
124 2, /* cost for loading QImode using movzbl */
125 {2, 2, 2}, /* cost of loading integer registers
126 in QImode, HImode and SImode.
127 Relative to reg-reg move (2). */
128 {2, 2, 2}, /* cost of storing integer registers */
129 2, /* cost of reg,reg fld/fst */
130 {2, 2, 2}, /* cost of loading fp registers
131 in SFmode, DFmode and XFmode */
132 {2, 2, 2}, /* cost of storing fp registers
133 in SFmode, DFmode and XFmode */
134 3, /* cost of moving MMX register */
135 {3, 3}, /* cost of loading MMX registers
136 in SImode and DImode */
137 {3, 3}, /* cost of storing MMX registers
138 in SImode and DImode */
139 3, /* cost of moving SSE register */
140 {3, 3, 3}, /* cost of loading SSE registers
141 in SImode, DImode and TImode */
142 {3, 3, 3}, /* cost of storing SSE registers
143 in SImode, DImode and TImode */
144 3, /* MMX or SSE register to integer */
145 0, /* size of l1 cache */
146 0, /* size of l2 cache */
147 0, /* size of prefetch block */
148 0, /* number of parallel prefetches */
149 2, /* Branch cost */
150 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
151 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
152 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
153 COSTS_N_BYTES (2), /* cost of FABS instruction. */
154 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
155 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
156 ix86_size_memcpy,
157 ix86_size_memset,
158 1, /* scalar_stmt_cost. */
159 1, /* scalar load_cost. */
160 1, /* scalar_store_cost. */
161 1, /* vec_stmt_cost. */
162 1, /* vec_to_scalar_cost. */
163 1, /* scalar_to_vec_cost. */
164 1, /* vec_align_load_cost. */
165 1, /* vec_unalign_load_cost. */
166 1, /* vec_store_cost. */
167 1, /* cond_taken_branch_cost. */
168 1, /* cond_not_taken_branch_cost. */
169 };
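/* Editorial sketch of how a table like the one above is consumed.  Later
   in this file a pointer to the table matching the active -mtune (or -Os)
   setting is installed and the cost hooks read individual fields from it;
   the helper below is hypothetical, and the field names add and lea are
   assumptions about the processor_costs layout in i386.h.  */

static inline int
example_addition_cost (const struct processor_costs *cost, bool use_lea)
{
  /* For ix86_size_cost both fields are byte counts; for the speed-tuned
     tables below they are COSTS_N_INSNS units.  */
  return use_lea ? cost->lea : cost->add;
}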
170
171 /* Processor costs (relative to an add) */
172 static stringop_algs i386_memcpy[2] = {
173 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
174 DUMMY_STRINGOP_ALGS};
175 static stringop_algs i386_memset[2] = {
176 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
177 DUMMY_STRINGOP_ALGS};
178
179 static const
180 struct processor_costs i386_cost = { /* 386 specific costs */
181 COSTS_N_INSNS (1), /* cost of an add instruction */
182 COSTS_N_INSNS (1), /* cost of a lea instruction */
183 COSTS_N_INSNS (3), /* variable shift costs */
184 COSTS_N_INSNS (2), /* constant shift costs */
185 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
186 COSTS_N_INSNS (6), /* HI */
187 COSTS_N_INSNS (6), /* SI */
188 COSTS_N_INSNS (6), /* DI */
189 COSTS_N_INSNS (6)}, /* other */
190 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
191 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
192 COSTS_N_INSNS (23), /* HI */
193 COSTS_N_INSNS (23), /* SI */
194 COSTS_N_INSNS (23), /* DI */
195 COSTS_N_INSNS (23)}, /* other */
196 COSTS_N_INSNS (3), /* cost of movsx */
197 COSTS_N_INSNS (2), /* cost of movzx */
198 15, /* "large" insn */
199 3, /* MOVE_RATIO */
200 4, /* cost for loading QImode using movzbl */
201 {2, 4, 2}, /* cost of loading integer registers
202 in QImode, HImode and SImode.
203 Relative to reg-reg move (2). */
204 {2, 4, 2}, /* cost of storing integer registers */
205 2, /* cost of reg,reg fld/fst */
206 {8, 8, 8}, /* cost of loading fp registers
207 in SFmode, DFmode and XFmode */
208 {8, 8, 8}, /* cost of storing fp registers
209 in SFmode, DFmode and XFmode */
210 2, /* cost of moving MMX register */
211 {4, 8}, /* cost of loading MMX registers
212 in SImode and DImode */
213 {4, 8}, /* cost of storing MMX registers
214 in SImode and DImode */
215 2, /* cost of moving SSE register */
216 {4, 8, 16}, /* cost of loading SSE registers
217 in SImode, DImode and TImode */
218 {4, 8, 16}, /* cost of storing SSE registers
219 in SImode, DImode and TImode */
220 3, /* MMX or SSE register to integer */
221 0, /* size of l1 cache */
222 0, /* size of l2 cache */
223 0, /* size of prefetch block */
224 0, /* number of parallel prefetches */
225 1, /* Branch cost */
226 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
227 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
228 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
229 COSTS_N_INSNS (22), /* cost of FABS instruction. */
230 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
231 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
232 i386_memcpy,
233 i386_memset,
234 1, /* scalar_stmt_cost. */
235 1, /* scalar load_cost. */
236 1, /* scalar_store_cost. */
237 1, /* vec_stmt_cost. */
238 1, /* vec_to_scalar_cost. */
239 1, /* scalar_to_vec_cost. */
240 1, /* vec_align_load_cost. */
241 2, /* vec_unalign_load_cost. */
242 1, /* vec_store_cost. */
243 3, /* cond_taken_branch_cost. */
244 1, /* cond_not_taken_branch_cost. */
245 };
246
247 static stringop_algs i486_memcpy[2] = {
248 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
249 DUMMY_STRINGOP_ALGS};
250 static stringop_algs i486_memset[2] = {
251 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
252 DUMMY_STRINGOP_ALGS};
253
254 static const
255 struct processor_costs i486_cost = { /* 486 specific costs */
256 COSTS_N_INSNS (1), /* cost of an add instruction */
257 COSTS_N_INSNS (1), /* cost of a lea instruction */
258 COSTS_N_INSNS (3), /* variable shift costs */
259 COSTS_N_INSNS (2), /* constant shift costs */
260 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
261 COSTS_N_INSNS (12), /* HI */
262 COSTS_N_INSNS (12), /* SI */
263 COSTS_N_INSNS (12), /* DI */
264 COSTS_N_INSNS (12)}, /* other */
265 1, /* cost of multiply per each bit set */
266 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
267 COSTS_N_INSNS (40), /* HI */
268 COSTS_N_INSNS (40), /* SI */
269 COSTS_N_INSNS (40), /* DI */
270 COSTS_N_INSNS (40)}, /* other */
271 COSTS_N_INSNS (3), /* cost of movsx */
272 COSTS_N_INSNS (2), /* cost of movzx */
273 15, /* "large" insn */
274 3, /* MOVE_RATIO */
275 4, /* cost for loading QImode using movzbl */
276 {2, 4, 2}, /* cost of loading integer registers
277 in QImode, HImode and SImode.
278 Relative to reg-reg move (2). */
279 {2, 4, 2}, /* cost of storing integer registers */
280 2, /* cost of reg,reg fld/fst */
281 {8, 8, 8}, /* cost of loading fp registers
282 in SFmode, DFmode and XFmode */
283 {8, 8, 8}, /* cost of storing fp registers
284 in SFmode, DFmode and XFmode */
285 2, /* cost of moving MMX register */
286 {4, 8}, /* cost of loading MMX registers
287 in SImode and DImode */
288 {4, 8}, /* cost of storing MMX registers
289 in SImode and DImode */
290 2, /* cost of moving SSE register */
291 {4, 8, 16}, /* cost of loading SSE registers
292 in SImode, DImode and TImode */
293 {4, 8, 16}, /* cost of storing SSE registers
294 in SImode, DImode and TImode */
295 3, /* MMX or SSE register to integer */
296 4, /* size of l1 cache. 486 has 8kB cache
297 shared for code and data, so 4kB is
298 not really precise. */
299 4, /* size of l2 cache */
300 0, /* size of prefetch block */
301 0, /* number of parallel prefetches */
302 1, /* Branch cost */
303 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
304 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
305 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
306 COSTS_N_INSNS (3), /* cost of FABS instruction. */
307 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
308 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
309 i486_memcpy,
310 i486_memset,
311 1, /* scalar_stmt_cost. */
312 1, /* scalar load_cost. */
313 1, /* scalar_store_cost. */
314 1, /* vec_stmt_cost. */
315 1, /* vec_to_scalar_cost. */
316 1, /* scalar_to_vec_cost. */
317 1, /* vec_align_load_cost. */
318 2, /* vec_unalign_load_cost. */
319 1, /* vec_store_cost. */
320 3, /* cond_taken_branch_cost. */
321 1, /* cond_not_taken_branch_cost. */
322 };
323
324 static stringop_algs pentium_memcpy[2] = {
325 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
326 DUMMY_STRINGOP_ALGS};
327 static stringop_algs pentium_memset[2] = {
328 {libcall, {{-1, rep_prefix_4_byte, false}}},
329 DUMMY_STRINGOP_ALGS};
330
331 static const
332 struct processor_costs pentium_cost = {
333 COSTS_N_INSNS (1), /* cost of an add instruction */
334 COSTS_N_INSNS (1), /* cost of a lea instruction */
335 COSTS_N_INSNS (4), /* variable shift costs */
336 COSTS_N_INSNS (1), /* constant shift costs */
337 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
338 COSTS_N_INSNS (11), /* HI */
339 COSTS_N_INSNS (11), /* SI */
340 COSTS_N_INSNS (11), /* DI */
341 COSTS_N_INSNS (11)}, /* other */
342 0, /* cost of multiply per each bit set */
343 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
344 COSTS_N_INSNS (25), /* HI */
345 COSTS_N_INSNS (25), /* SI */
346 COSTS_N_INSNS (25), /* DI */
347 COSTS_N_INSNS (25)}, /* other */
348 COSTS_N_INSNS (3), /* cost of movsx */
349 COSTS_N_INSNS (2), /* cost of movzx */
350 8, /* "large" insn */
351 6, /* MOVE_RATIO */
352 6, /* cost for loading QImode using movzbl */
353 {2, 4, 2}, /* cost of loading integer registers
354 in QImode, HImode and SImode.
355 Relative to reg-reg move (2). */
356 {2, 4, 2}, /* cost of storing integer registers */
357 2, /* cost of reg,reg fld/fst */
358 {2, 2, 6}, /* cost of loading fp registers
359 in SFmode, DFmode and XFmode */
360 {4, 4, 6}, /* cost of storing fp registers
361 in SFmode, DFmode and XFmode */
362 8, /* cost of moving MMX register */
363 {8, 8}, /* cost of loading MMX registers
364 in SImode and DImode */
365 {8, 8}, /* cost of storing MMX registers
366 in SImode and DImode */
367 2, /* cost of moving SSE register */
368 {4, 8, 16}, /* cost of loading SSE registers
369 in SImode, DImode and TImode */
370 {4, 8, 16}, /* cost of storing SSE registers
371 in SImode, DImode and TImode */
372 3, /* MMX or SSE register to integer */
373 8, /* size of l1 cache. */
374 8, /* size of l2 cache */
375 0, /* size of prefetch block */
376 0, /* number of parallel prefetches */
377 2, /* Branch cost */
378 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
379 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
380 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
381 COSTS_N_INSNS (1), /* cost of FABS instruction. */
382 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
383 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
384 pentium_memcpy,
385 pentium_memset,
386 1, /* scalar_stmt_cost. */
387 1, /* scalar load_cost. */
388 1, /* scalar_store_cost. */
389 1, /* vec_stmt_cost. */
390 1, /* vec_to_scalar_cost. */
391 1, /* scalar_to_vec_cost. */
392 1, /* vec_align_load_cost. */
393 2, /* vec_unalign_load_cost. */
394 1, /* vec_store_cost. */
395 3, /* cond_taken_branch_cost. */
396 1, /* cond_not_taken_branch_cost. */
397 };
398
399 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
400 (we ensure the alignment). For small blocks an inline loop is still a
401 noticeable win; for bigger blocks either rep movsl or rep movsb is
402 the way to go. Rep movsb apparently has a more expensive startup time
403 in the CPU, but after 4K the difference is down in the noise. */
404 static stringop_algs pentiumpro_memcpy[2] = {
405 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
406 {8192, rep_prefix_4_byte, false},
407 {-1, rep_prefix_1_byte, false}}},
408 DUMMY_STRINGOP_ALGS};
409 static stringop_algs pentiumpro_memset[2] = {
410 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
411 {8192, rep_prefix_4_byte, false},
412 {-1, libcall, false}}},
413 DUMMY_STRINGOP_ALGS};
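/* Editorial sketch of how stringop_algs tables such as the two above are
   read: each {max, alg, noalign} entry names an algorithm for block sizes
   up to max, with max == -1 meaning "no upper bound", while the leading
   algorithm covers blocks of unknown size.  The helper is hypothetical and
   the field names follow the assumed stringop_algs layout in i386.h; the
   real decision logic later in this file also weighs alignment and
   command-line overrides.  */

static enum stringop_alg
example_pick_stringop_alg (const struct stringop_algs *algs,
			   HOST_WIDE_INT expected_size)
{
  unsigned int i;

  if (expected_size < 0)
    return algs->unknown_size;
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    if (algs->size[i].max == -1 || expected_size <= algs->size[i].max)
      return algs->size[i].alg;
  return libcall;
}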
414 static const
415 struct processor_costs pentiumpro_cost = {
416 COSTS_N_INSNS (1), /* cost of an add instruction */
417 COSTS_N_INSNS (1), /* cost of a lea instruction */
418 COSTS_N_INSNS (1), /* variable shift costs */
419 COSTS_N_INSNS (1), /* constant shift costs */
420 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
421 COSTS_N_INSNS (4), /* HI */
422 COSTS_N_INSNS (4), /* SI */
423 COSTS_N_INSNS (4), /* DI */
424 COSTS_N_INSNS (4)}, /* other */
425 0, /* cost of multiply per each bit set */
426 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
427 COSTS_N_INSNS (17), /* HI */
428 COSTS_N_INSNS (17), /* SI */
429 COSTS_N_INSNS (17), /* DI */
430 COSTS_N_INSNS (17)}, /* other */
431 COSTS_N_INSNS (1), /* cost of movsx */
432 COSTS_N_INSNS (1), /* cost of movzx */
433 8, /* "large" insn */
434 6, /* MOVE_RATIO */
435 2, /* cost for loading QImode using movzbl */
436 {4, 4, 4}, /* cost of loading integer registers
437 in QImode, HImode and SImode.
438 Relative to reg-reg move (2). */
439 {2, 2, 2}, /* cost of storing integer registers */
440 2, /* cost of reg,reg fld/fst */
441 {2, 2, 6}, /* cost of loading fp registers
442 in SFmode, DFmode and XFmode */
443 {4, 4, 6}, /* cost of storing fp registers
444 in SFmode, DFmode and XFmode */
445 2, /* cost of moving MMX register */
446 {2, 2}, /* cost of loading MMX registers
447 in SImode and DImode */
448 {2, 2}, /* cost of storing MMX registers
449 in SImode and DImode */
450 2, /* cost of moving SSE register */
451 {2, 2, 8}, /* cost of loading SSE registers
452 in SImode, DImode and TImode */
453 {2, 2, 8}, /* cost of storing SSE registers
454 in SImode, DImode and TImode */
455 3, /* MMX or SSE register to integer */
456 8, /* size of l1 cache. */
457 256, /* size of l2 cache */
458 32, /* size of prefetch block */
459 6, /* number of parallel prefetches */
460 2, /* Branch cost */
461 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
462 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
463 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
464 COSTS_N_INSNS (2), /* cost of FABS instruction. */
465 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
466 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
467 pentiumpro_memcpy,
468 pentiumpro_memset,
469 1, /* scalar_stmt_cost. */
470 1, /* scalar load_cost. */
471 1, /* scalar_store_cost. */
472 1, /* vec_stmt_cost. */
473 1, /* vec_to_scalar_cost. */
474 1, /* scalar_to_vec_cost. */
475 1, /* vec_align_load_cost. */
476 2, /* vec_unalign_load_cost. */
477 1, /* vec_store_cost. */
478 3, /* cond_taken_branch_cost. */
479 1, /* cond_not_taken_branch_cost. */
480 };
481
482 static stringop_algs geode_memcpy[2] = {
483 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
484 DUMMY_STRINGOP_ALGS};
485 static stringop_algs geode_memset[2] = {
486 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
487 DUMMY_STRINGOP_ALGS};
488 static const
489 struct processor_costs geode_cost = {
490 COSTS_N_INSNS (1), /* cost of an add instruction */
491 COSTS_N_INSNS (1), /* cost of a lea instruction */
492 COSTS_N_INSNS (2), /* variable shift costs */
493 COSTS_N_INSNS (1), /* constant shift costs */
494 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
495 COSTS_N_INSNS (4), /* HI */
496 COSTS_N_INSNS (7), /* SI */
497 COSTS_N_INSNS (7), /* DI */
498 COSTS_N_INSNS (7)}, /* other */
499 0, /* cost of multiply per each bit set */
500 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
501 COSTS_N_INSNS (23), /* HI */
502 COSTS_N_INSNS (39), /* SI */
503 COSTS_N_INSNS (39), /* DI */
504 COSTS_N_INSNS (39)}, /* other */
505 COSTS_N_INSNS (1), /* cost of movsx */
506 COSTS_N_INSNS (1), /* cost of movzx */
507 8, /* "large" insn */
508 4, /* MOVE_RATIO */
509 1, /* cost for loading QImode using movzbl */
510 {1, 1, 1}, /* cost of loading integer registers
511 in QImode, HImode and SImode.
512 Relative to reg-reg move (2). */
513 {1, 1, 1}, /* cost of storing integer registers */
514 1, /* cost of reg,reg fld/fst */
515 {1, 1, 1}, /* cost of loading fp registers
516 in SFmode, DFmode and XFmode */
517 {4, 6, 6}, /* cost of storing fp registers
518 in SFmode, DFmode and XFmode */
519
520 1, /* cost of moving MMX register */
521 {1, 1}, /* cost of loading MMX registers
522 in SImode and DImode */
523 {1, 1}, /* cost of storing MMX registers
524 in SImode and DImode */
525 1, /* cost of moving SSE register */
526 {1, 1, 1}, /* cost of loading SSE registers
527 in SImode, DImode and TImode */
528 {1, 1, 1}, /* cost of storing SSE registers
529 in SImode, DImode and TImode */
530 1, /* MMX or SSE register to integer */
531 64, /* size of l1 cache. */
532 128, /* size of l2 cache. */
533 32, /* size of prefetch block */
534 1, /* number of parallel prefetches */
535 1, /* Branch cost */
536 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
537 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
538 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
539 COSTS_N_INSNS (1), /* cost of FABS instruction. */
540 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
541 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
542 geode_memcpy,
543 geode_memset,
544 1, /* scalar_stmt_cost. */
545 1, /* scalar load_cost. */
546 1, /* scalar_store_cost. */
547 1, /* vec_stmt_cost. */
548 1, /* vec_to_scalar_cost. */
549 1, /* scalar_to_vec_cost. */
550 1, /* vec_align_load_cost. */
551 2, /* vec_unalign_load_cost. */
552 1, /* vec_store_cost. */
553 3, /* cond_taken_branch_cost. */
554 1, /* cond_not_taken_branch_cost. */
555 };
556
557 static stringop_algs k6_memcpy[2] = {
558 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
559 DUMMY_STRINGOP_ALGS};
560 static stringop_algs k6_memset[2] = {
561 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
562 DUMMY_STRINGOP_ALGS};
563 static const
564 struct processor_costs k6_cost = {
565 COSTS_N_INSNS (1), /* cost of an add instruction */
566 COSTS_N_INSNS (2), /* cost of a lea instruction */
567 COSTS_N_INSNS (1), /* variable shift costs */
568 COSTS_N_INSNS (1), /* constant shift costs */
569 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
570 COSTS_N_INSNS (3), /* HI */
571 COSTS_N_INSNS (3), /* SI */
572 COSTS_N_INSNS (3), /* DI */
573 COSTS_N_INSNS (3)}, /* other */
574 0, /* cost of multiply per each bit set */
575 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
576 COSTS_N_INSNS (18), /* HI */
577 COSTS_N_INSNS (18), /* SI */
578 COSTS_N_INSNS (18), /* DI */
579 COSTS_N_INSNS (18)}, /* other */
580 COSTS_N_INSNS (2), /* cost of movsx */
581 COSTS_N_INSNS (2), /* cost of movzx */
582 8, /* "large" insn */
583 4, /* MOVE_RATIO */
584 3, /* cost for loading QImode using movzbl */
585 {4, 5, 4}, /* cost of loading integer registers
586 in QImode, HImode and SImode.
587 Relative to reg-reg move (2). */
588 {2, 3, 2}, /* cost of storing integer registers */
589 4, /* cost of reg,reg fld/fst */
590 {6, 6, 6}, /* cost of loading fp registers
591 in SFmode, DFmode and XFmode */
592 {4, 4, 4}, /* cost of storing fp registers
593 in SFmode, DFmode and XFmode */
594 2, /* cost of moving MMX register */
595 {2, 2}, /* cost of loading MMX registers
596 in SImode and DImode */
597 {2, 2}, /* cost of storing MMX registers
598 in SImode and DImode */
599 2, /* cost of moving SSE register */
600 {2, 2, 8}, /* cost of loading SSE registers
601 in SImode, DImode and TImode */
602 {2, 2, 8}, /* cost of storing SSE registers
603 in SImode, DImode and TImode */
604 6, /* MMX or SSE register to integer */
605 32, /* size of l1 cache. */
606 32, /* size of l2 cache. Some models
607 have integrated l2 cache, but
608 optimizing for k6 is not important
609 enough to worry about that. */
610 32, /* size of prefetch block */
611 1, /* number of parallel prefetches */
612 1, /* Branch cost */
613 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
614 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
615 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
616 COSTS_N_INSNS (2), /* cost of FABS instruction. */
617 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
618 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
619 k6_memcpy,
620 k6_memset,
621 1, /* scalar_stmt_cost. */
622 1, /* scalar load_cost. */
623 1, /* scalar_store_cost. */
624 1, /* vec_stmt_cost. */
625 1, /* vec_to_scalar_cost. */
626 1, /* scalar_to_vec_cost. */
627 1, /* vec_align_load_cost. */
628 2, /* vec_unalign_load_cost. */
629 1, /* vec_store_cost. */
630 3, /* cond_taken_branch_cost. */
631 1, /* cond_not_taken_branch_cost. */
632 };
633
634 /* For some reason, Athlon deals better with the REP prefix (relative to
635 loops) than K8 does. Alignment becomes important after 8 bytes for memcpy
636 and 128 bytes for memset. */
637 static stringop_algs athlon_memcpy[2] = {
638 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
639 DUMMY_STRINGOP_ALGS};
640 static stringop_algs athlon_memset[2] = {
641 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
642 DUMMY_STRINGOP_ALGS};
643 static const
644 struct processor_costs athlon_cost = {
645 COSTS_N_INSNS (1), /* cost of an add instruction */
646 COSTS_N_INSNS (2), /* cost of a lea instruction */
647 COSTS_N_INSNS (1), /* variable shift costs */
648 COSTS_N_INSNS (1), /* constant shift costs */
649 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
650 COSTS_N_INSNS (5), /* HI */
651 COSTS_N_INSNS (5), /* SI */
652 COSTS_N_INSNS (5), /* DI */
653 COSTS_N_INSNS (5)}, /* other */
654 0, /* cost of multiply per each bit set */
655 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
656 COSTS_N_INSNS (26), /* HI */
657 COSTS_N_INSNS (42), /* SI */
658 COSTS_N_INSNS (74), /* DI */
659 COSTS_N_INSNS (74)}, /* other */
660 COSTS_N_INSNS (1), /* cost of movsx */
661 COSTS_N_INSNS (1), /* cost of movzx */
662 8, /* "large" insn */
663 9, /* MOVE_RATIO */
664 4, /* cost for loading QImode using movzbl */
665 {3, 4, 3}, /* cost of loading integer registers
666 in QImode, HImode and SImode.
667 Relative to reg-reg move (2). */
668 {3, 4, 3}, /* cost of storing integer registers */
669 4, /* cost of reg,reg fld/fst */
670 {4, 4, 12}, /* cost of loading fp registers
671 in SFmode, DFmode and XFmode */
672 {6, 6, 8}, /* cost of storing fp registers
673 in SFmode, DFmode and XFmode */
674 2, /* cost of moving MMX register */
675 {4, 4}, /* cost of loading MMX registers
676 in SImode and DImode */
677 {4, 4}, /* cost of storing MMX registers
678 in SImode and DImode */
679 2, /* cost of moving SSE register */
680 {4, 4, 6}, /* cost of loading SSE registers
681 in SImode, DImode and TImode */
682 {4, 4, 5}, /* cost of storing SSE registers
683 in SImode, DImode and TImode */
684 5, /* MMX or SSE register to integer */
685 64, /* size of l1 cache. */
686 256, /* size of l2 cache. */
687 64, /* size of prefetch block */
688 6, /* number of parallel prefetches */
689 5, /* Branch cost */
690 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
691 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
692 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
693 COSTS_N_INSNS (2), /* cost of FABS instruction. */
694 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
695 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
696 athlon_memcpy,
697 athlon_memset,
698 1, /* scalar_stmt_cost. */
699 1, /* scalar load_cost. */
700 1, /* scalar_store_cost. */
701 1, /* vec_stmt_cost. */
702 1, /* vec_to_scalar_cost. */
703 1, /* scalar_to_vec_cost. */
704 1, /* vec_align_load_cost. */
705 2, /* vec_unalign_load_cost. */
706 1, /* vec_store_cost. */
707 3, /* cond_taken_branch_cost. */
708 1, /* cond_not_taken_branch_cost. */
709 };
710
711 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
712 small blocks it is better to use a loop. For large blocks, a libcall can
713 do non-temporal accesses and beat inlined code considerably. */
714 static stringop_algs k8_memcpy[2] = {
715 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
716 {-1, rep_prefix_4_byte, false}}},
717 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
718 {-1, libcall, false}}}};
719 static stringop_algs k8_memset[2] = {
720 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
721 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
722 {libcall, {{48, unrolled_loop, false},
723 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
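/* Editorial note: each of these stringop tables is a two-element array;
   the assumption here (a hypothetical helper, matching how the K8 entries
   above differ) is that element 0 describes 32-bit code and element 1
   64-bit code, selected by TARGET_64BIT when a memcpy/memset strategy is
   chosen.  */

static inline const struct stringop_algs *
example_select_k8_memcpy_algs (void)
{
  return &k8_memcpy[TARGET_64BIT ? 1 : 0];
}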
724 static const
725 struct processor_costs k8_cost = {
726 COSTS_N_INSNS (1), /* cost of an add instruction */
727 COSTS_N_INSNS (2), /* cost of a lea instruction */
728 COSTS_N_INSNS (1), /* variable shift costs */
729 COSTS_N_INSNS (1), /* constant shift costs */
730 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
731 COSTS_N_INSNS (4), /* HI */
732 COSTS_N_INSNS (3), /* SI */
733 COSTS_N_INSNS (4), /* DI */
734 COSTS_N_INSNS (5)}, /* other */
735 0, /* cost of multiply per each bit set */
736 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
737 COSTS_N_INSNS (26), /* HI */
738 COSTS_N_INSNS (42), /* SI */
739 COSTS_N_INSNS (74), /* DI */
740 COSTS_N_INSNS (74)}, /* other */
741 COSTS_N_INSNS (1), /* cost of movsx */
742 COSTS_N_INSNS (1), /* cost of movzx */
743 8, /* "large" insn */
744 9, /* MOVE_RATIO */
745 4, /* cost for loading QImode using movzbl */
746 {3, 4, 3}, /* cost of loading integer registers
747 in QImode, HImode and SImode.
748 Relative to reg-reg move (2). */
749 {3, 4, 3}, /* cost of storing integer registers */
750 4, /* cost of reg,reg fld/fst */
751 {4, 4, 12}, /* cost of loading fp registers
752 in SFmode, DFmode and XFmode */
753 {6, 6, 8}, /* cost of storing fp registers
754 in SFmode, DFmode and XFmode */
755 2, /* cost of moving MMX register */
756 {3, 3}, /* cost of loading MMX registers
757 in SImode and DImode */
758 {4, 4}, /* cost of storing MMX registers
759 in SImode and DImode */
760 2, /* cost of moving SSE register */
761 {4, 3, 6}, /* cost of loading SSE registers
762 in SImode, DImode and TImode */
763 {4, 4, 5}, /* cost of storing SSE registers
764 in SImode, DImode and TImode */
765 5, /* MMX or SSE register to integer */
766 64, /* size of l1 cache. */
767 512, /* size of l2 cache. */
768 64, /* size of prefetch block */
769 /* New AMD processors never drop prefetches; if they cannot be performed
770 immediately, they are queued. We set the number of simultaneous prefetches
771 to a large constant to reflect this (leaving the number of prefetches
772 entirely unlimited is probably not a good idea either, as their execution
773 also takes some time). */
774 100, /* number of parallel prefetches */
775 3, /* Branch cost */
776 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
777 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
778 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
779 COSTS_N_INSNS (2), /* cost of FABS instruction. */
780 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
781 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
782
783 k8_memcpy,
784 k8_memset,
785 4, /* scalar_stmt_cost. */
786 2, /* scalar load_cost. */
787 2, /* scalar_store_cost. */
788 5, /* vec_stmt_cost. */
789 0, /* vec_to_scalar_cost. */
790 2, /* scalar_to_vec_cost. */
791 2, /* vec_align_load_cost. */
792 3, /* vec_unalign_load_cost. */
793 3, /* vec_store_cost. */
794 3, /* cond_taken_branch_cost. */
795 2, /* cond_not_taken_branch_cost. */
796 };
797
798 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
799 very small blocks it is better to use a loop. For large blocks, a libcall can
800 do non-temporal accesses and beat inlined code considerably. */
801 static stringop_algs amdfam10_memcpy[2] = {
802 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
803 {-1, rep_prefix_4_byte, false}}},
804 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
805 {-1, libcall, false}}}};
806 static stringop_algs amdfam10_memset[2] = {
807 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
808 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
809 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
810 {-1, libcall, false}}}};
811 struct processor_costs amdfam10_cost = {
812 COSTS_N_INSNS (1), /* cost of an add instruction */
813 COSTS_N_INSNS (2), /* cost of a lea instruction */
814 COSTS_N_INSNS (1), /* variable shift costs */
815 COSTS_N_INSNS (1), /* constant shift costs */
816 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
817 COSTS_N_INSNS (4), /* HI */
818 COSTS_N_INSNS (3), /* SI */
819 COSTS_N_INSNS (4), /* DI */
820 COSTS_N_INSNS (5)}, /* other */
821 0, /* cost of multiply per each bit set */
822 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
823 COSTS_N_INSNS (35), /* HI */
824 COSTS_N_INSNS (51), /* SI */
825 COSTS_N_INSNS (83), /* DI */
826 COSTS_N_INSNS (83)}, /* other */
827 COSTS_N_INSNS (1), /* cost of movsx */
828 COSTS_N_INSNS (1), /* cost of movzx */
829 8, /* "large" insn */
830 9, /* MOVE_RATIO */
831 4, /* cost for loading QImode using movzbl */
832 {3, 4, 3}, /* cost of loading integer registers
833 in QImode, HImode and SImode.
834 Relative to reg-reg move (2). */
835 {3, 4, 3}, /* cost of storing integer registers */
836 4, /* cost of reg,reg fld/fst */
837 {4, 4, 12}, /* cost of loading fp registers
838 in SFmode, DFmode and XFmode */
839 {6, 6, 8}, /* cost of storing fp registers
840 in SFmode, DFmode and XFmode */
841 2, /* cost of moving MMX register */
842 {3, 3}, /* cost of loading MMX registers
843 in SImode and DImode */
844 {4, 4}, /* cost of storing MMX registers
845 in SImode and DImode */
846 2, /* cost of moving SSE register */
847 {4, 4, 3}, /* cost of loading SSE registers
848 in SImode, DImode and TImode */
849 {4, 4, 5}, /* cost of storing SSE registers
850 in SImode, DImode and TImode */
851 3, /* MMX or SSE register to integer */
852 /* On K8:
853 MOVD reg64, xmmreg Double FSTORE 4
854 MOVD reg32, xmmreg Double FSTORE 4
855 On AMDFAM10:
856 MOVD reg64, xmmreg Double FADD 3
857 1/1 1/1
858 MOVD reg32, xmmreg Double FADD 3
859 1/1 1/1 */
860 64, /* size of l1 cache. */
861 512, /* size of l2 cache. */
862 64, /* size of prefetch block */
863 /* New AMD processors never drop prefetches; if they cannot be performed
864 immediately, they are queued. We set the number of simultaneous prefetches
865 to a large constant to reflect this (leaving the number of prefetches
866 entirely unlimited is probably not a good idea either, as their execution
867 also takes some time). */
868 100, /* number of parallel prefetches */
869 2, /* Branch cost */
870 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
871 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
872 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
873 COSTS_N_INSNS (2), /* cost of FABS instruction. */
874 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
875 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
876
877 amdfam10_memcpy,
878 amdfam10_memset,
879 4, /* scalar_stmt_cost. */
880 2, /* scalar load_cost. */
881 2, /* scalar_store_cost. */
882 6, /* vec_stmt_cost. */
883 0, /* vec_to_scalar_cost. */
884 2, /* scalar_to_vec_cost. */
885 2, /* vec_align_load_cost. */
886 2, /* vec_unalign_load_cost. */
887 2, /* vec_store_cost. */
888 2, /* cond_taken_branch_cost. */
889 1, /* cond_not_taken_branch_cost. */
890 };
891
892 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
893 very small blocks it is better to use a loop. For large blocks, a libcall
894 can do non-temporal accesses and beat inlined code considerably. */
895 static stringop_algs bdver1_memcpy[2] = {
896 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
897 {-1, rep_prefix_4_byte, false}}},
898 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
899 {-1, libcall, false}}}};
900 static stringop_algs bdver1_memset[2] = {
901 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
902 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
903 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
904 {-1, libcall, false}}}};
905
906 const struct processor_costs bdver1_cost = {
907 COSTS_N_INSNS (1), /* cost of an add instruction */
908 COSTS_N_INSNS (1), /* cost of a lea instruction */
909 COSTS_N_INSNS (1), /* variable shift costs */
910 COSTS_N_INSNS (1), /* constant shift costs */
911 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
912 COSTS_N_INSNS (4), /* HI */
913 COSTS_N_INSNS (4), /* SI */
914 COSTS_N_INSNS (6), /* DI */
915 COSTS_N_INSNS (6)}, /* other */
916 0, /* cost of multiply per each bit set */
917 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
918 COSTS_N_INSNS (35), /* HI */
919 COSTS_N_INSNS (51), /* SI */
920 COSTS_N_INSNS (83), /* DI */
921 COSTS_N_INSNS (83)}, /* other */
922 COSTS_N_INSNS (1), /* cost of movsx */
923 COSTS_N_INSNS (1), /* cost of movzx */
924 8, /* "large" insn */
925 9, /* MOVE_RATIO */
926 4, /* cost for loading QImode using movzbl */
927 {5, 5, 4}, /* cost of loading integer registers
928 in QImode, HImode and SImode.
929 Relative to reg-reg move (2). */
930 {4, 4, 4}, /* cost of storing integer registers */
931 2, /* cost of reg,reg fld/fst */
932 {5, 5, 12}, /* cost of loading fp registers
933 in SFmode, DFmode and XFmode */
934 {4, 4, 8}, /* cost of storing fp registers
935 in SFmode, DFmode and XFmode */
936 2, /* cost of moving MMX register */
937 {4, 4}, /* cost of loading MMX registers
938 in SImode and DImode */
939 {4, 4}, /* cost of storing MMX registers
940 in SImode and DImode */
941 2, /* cost of moving SSE register */
942 {4, 4, 4}, /* cost of loading SSE registers
943 in SImode, DImode and TImode */
944 {4, 4, 4}, /* cost of storing SSE registers
945 in SImode, DImode and TImode */
946 2, /* MMX or SSE register to integer */
947 /* On K8:
948 MOVD reg64, xmmreg Double FSTORE 4
949 MOVD reg32, xmmreg Double FSTORE 4
950 On AMDFAM10:
951 MOVD reg64, xmmreg Double FADD 3
952 1/1 1/1
953 MOVD reg32, xmmreg Double FADD 3
954 1/1 1/1 */
955 16, /* size of l1 cache. */
956 2048, /* size of l2 cache. */
957 64, /* size of prefetch block */
958 /* New AMD processors never drop prefetches; if they cannot be performed
959 immediately, they are queued. We set the number of simultaneous prefetches
960 to a large constant to reflect this (leaving the number of prefetches
961 entirely unlimited is probably not a good idea either, as their execution
962 also takes some time). */
963 100, /* number of parallel prefetches */
964 2, /* Branch cost */
965 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
966 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
967 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
968 COSTS_N_INSNS (2), /* cost of FABS instruction. */
969 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
970 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
971
972 bdver1_memcpy,
973 bdver1_memset,
974 6, /* scalar_stmt_cost. */
975 4, /* scalar load_cost. */
976 4, /* scalar_store_cost. */
977 6, /* vec_stmt_cost. */
978 0, /* vec_to_scalar_cost. */
979 2, /* scalar_to_vec_cost. */
980 4, /* vec_align_load_cost. */
981 4, /* vec_unalign_load_cost. */
982 4, /* vec_store_cost. */
983 2, /* cond_taken_branch_cost. */
984 1, /* cond_not_taken_branch_cost. */
985 };
986
987 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
988 very small blocks it is better to use a loop. For large blocks, a libcall
989 can do non-temporal accesses and beat inlined code considerably. */
990
991 static stringop_algs bdver2_memcpy[2] = {
992 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
993 {-1, rep_prefix_4_byte, false}}},
994 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
995 {-1, libcall, false}}}};
996 static stringop_algs bdver2_memset[2] = {
997 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
998 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
999 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1000 {-1, libcall, false}}}};
1001
1002 const struct processor_costs bdver2_cost = {
1003 COSTS_N_INSNS (1), /* cost of an add instruction */
1004 COSTS_N_INSNS (1), /* cost of a lea instruction */
1005 COSTS_N_INSNS (1), /* variable shift costs */
1006 COSTS_N_INSNS (1), /* constant shift costs */
1007 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1008 COSTS_N_INSNS (4), /* HI */
1009 COSTS_N_INSNS (4), /* SI */
1010 COSTS_N_INSNS (6), /* DI */
1011 COSTS_N_INSNS (6)}, /* other */
1012 0, /* cost of multiply per each bit set */
1013 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1014 COSTS_N_INSNS (35), /* HI */
1015 COSTS_N_INSNS (51), /* SI */
1016 COSTS_N_INSNS (83), /* DI */
1017 COSTS_N_INSNS (83)}, /* other */
1018 COSTS_N_INSNS (1), /* cost of movsx */
1019 COSTS_N_INSNS (1), /* cost of movzx */
1020 8, /* "large" insn */
1021 9, /* MOVE_RATIO */
1022 4, /* cost for loading QImode using movzbl */
1023 {5, 5, 4}, /* cost of loading integer registers
1024 in QImode, HImode and SImode.
1025 Relative to reg-reg move (2). */
1026 {4, 4, 4}, /* cost of storing integer registers */
1027 2, /* cost of reg,reg fld/fst */
1028 {5, 5, 12}, /* cost of loading fp registers
1029 in SFmode, DFmode and XFmode */
1030 {4, 4, 8}, /* cost of storing fp registers
1031 in SFmode, DFmode and XFmode */
1032 2, /* cost of moving MMX register */
1033 {4, 4}, /* cost of loading MMX registers
1034 in SImode and DImode */
1035 {4, 4}, /* cost of storing MMX registers
1036 in SImode and DImode */
1037 2, /* cost of moving SSE register */
1038 {4, 4, 4}, /* cost of loading SSE registers
1039 in SImode, DImode and TImode */
1040 {4, 4, 4}, /* cost of storing SSE registers
1041 in SImode, DImode and TImode */
1042 2, /* MMX or SSE register to integer */
1043 /* On K8:
1044 MOVD reg64, xmmreg Double FSTORE 4
1045 MOVD reg32, xmmreg Double FSTORE 4
1046 On AMDFAM10:
1047 MOVD reg64, xmmreg Double FADD 3
1048 1/1 1/1
1049 MOVD reg32, xmmreg Double FADD 3
1050 1/1 1/1 */
1051 16, /* size of l1 cache. */
1052 2048, /* size of l2 cache. */
1053 64, /* size of prefetch block */
1054 /* New AMD processors never drop prefetches; if they cannot be performed
1055 immediately, they are queued. We set the number of simultaneous prefetches
1056 to a large constant to reflect this (leaving the number of prefetches
1057 entirely unlimited is probably not a good idea either, as their execution
1058 also takes some time). */
1059 100, /* number of parallel prefetches */
1060 2, /* Branch cost */
1061 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1062 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1063 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1064 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1065 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1066 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1067
1068 bdver2_memcpy,
1069 bdver2_memset,
1070 6, /* scalar_stmt_cost. */
1071 4, /* scalar load_cost. */
1072 4, /* scalar_store_cost. */
1073 6, /* vec_stmt_cost. */
1074 0, /* vec_to_scalar_cost. */
1075 2, /* scalar_to_vec_cost. */
1076 4, /* vec_align_load_cost. */
1077 4, /* vec_unalign_load_cost. */
1078 4, /* vec_store_cost. */
1079 2, /* cond_taken_branch_cost. */
1080 1, /* cond_not_taken_branch_cost. */
1081 };
1082
1083
1084 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1085 very small blocks it is better to use a loop. For large blocks, a libcall
1086 can do non-temporal accesses and beat inlined code considerably. */
1087 static stringop_algs bdver3_memcpy[2] = {
1088 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1089 {-1, rep_prefix_4_byte, false}}},
1090 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1091 {-1, libcall, false}}}};
1092 static stringop_algs bdver3_memset[2] = {
1093 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1094 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1095 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1096 {-1, libcall, false}}}};
1097 struct processor_costs bdver3_cost = {
1098 COSTS_N_INSNS (1), /* cost of an add instruction */
1099 COSTS_N_INSNS (1), /* cost of a lea instruction */
1100 COSTS_N_INSNS (1), /* variable shift costs */
1101 COSTS_N_INSNS (1), /* constant shift costs */
1102 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1103 COSTS_N_INSNS (4), /* HI */
1104 COSTS_N_INSNS (4), /* SI */
1105 COSTS_N_INSNS (6), /* DI */
1106 COSTS_N_INSNS (6)}, /* other */
1107 0, /* cost of multiply per each bit set */
1108 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1109 COSTS_N_INSNS (35), /* HI */
1110 COSTS_N_INSNS (51), /* SI */
1111 COSTS_N_INSNS (83), /* DI */
1112 COSTS_N_INSNS (83)}, /* other */
1113 COSTS_N_INSNS (1), /* cost of movsx */
1114 COSTS_N_INSNS (1), /* cost of movzx */
1115 8, /* "large" insn */
1116 9, /* MOVE_RATIO */
1117 4, /* cost for loading QImode using movzbl */
1118 {5, 5, 4}, /* cost of loading integer registers
1119 in QImode, HImode and SImode.
1120 Relative to reg-reg move (2). */
1121 {4, 4, 4}, /* cost of storing integer registers */
1122 2, /* cost of reg,reg fld/fst */
1123 {5, 5, 12}, /* cost of loading fp registers
1124 in SFmode, DFmode and XFmode */
1125 {4, 4, 8}, /* cost of storing fp registers
1126 in SFmode, DFmode and XFmode */
1127 2, /* cost of moving MMX register */
1128 {4, 4}, /* cost of loading MMX registers
1129 in SImode and DImode */
1130 {4, 4}, /* cost of storing MMX registers
1131 in SImode and DImode */
1132 2, /* cost of moving SSE register */
1133 {4, 4, 4}, /* cost of loading SSE registers
1134 in SImode, DImode and TImode */
1135 {4, 4, 4}, /* cost of storing SSE registers
1136 in SImode, DImode and TImode */
1137 2, /* MMX or SSE register to integer */
1138 16, /* size of l1 cache. */
1139 2048, /* size of l2 cache. */
1140 64, /* size of prefetch block */
1141 /* New AMD processors never drop prefetches; if they cannot be performed
1142 immediately, they are queued. We set the number of simultaneous prefetches
1143 to a large constant to reflect this (leaving the number of prefetches
1144 entirely unlimited is probably not a good idea either, as their execution
1145 also takes some time). */
1146 100, /* number of parallel prefetches */
1147 2, /* Branch cost */
1148 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1149 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1150 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1151 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1152 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1153 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1154
1155 bdver3_memcpy,
1156 bdver3_memset,
1157 6, /* scalar_stmt_cost. */
1158 4, /* scalar load_cost. */
1159 4, /* scalar_store_cost. */
1160 6, /* vec_stmt_cost. */
1161 0, /* vec_to_scalar_cost. */
1162 2, /* scalar_to_vec_cost. */
1163 4, /* vec_align_load_cost. */
1164 4, /* vec_unalign_load_cost. */
1165 4, /* vec_store_cost. */
1166 2, /* cond_taken_branch_cost. */
1167 1, /* cond_not_taken_branch_cost. */
1168 };
1169
1170 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1171 very small blocks it is better to use a loop. For large blocks, a libcall
1172 can do non-temporal accesses and beat inlined code considerably. */
1173 static stringop_algs bdver4_memcpy[2] = {
1174 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1175 {-1, rep_prefix_4_byte, false}}},
1176 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1177 {-1, libcall, false}}}};
1178 static stringop_algs bdver4_memset[2] = {
1179 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1180 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1181 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1182 {-1, libcall, false}}}};
1183 struct processor_costs bdver4_cost = {
1184 COSTS_N_INSNS (1), /* cost of an add instruction */
1185 COSTS_N_INSNS (1), /* cost of a lea instruction */
1186 COSTS_N_INSNS (1), /* variable shift costs */
1187 COSTS_N_INSNS (1), /* constant shift costs */
1188 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1189 COSTS_N_INSNS (4), /* HI */
1190 COSTS_N_INSNS (4), /* SI */
1191 COSTS_N_INSNS (6), /* DI */
1192 COSTS_N_INSNS (6)}, /* other */
1193 0, /* cost of multiply per each bit set */
1194 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1195 COSTS_N_INSNS (35), /* HI */
1196 COSTS_N_INSNS (51), /* SI */
1197 COSTS_N_INSNS (83), /* DI */
1198 COSTS_N_INSNS (83)}, /* other */
1199 COSTS_N_INSNS (1), /* cost of movsx */
1200 COSTS_N_INSNS (1), /* cost of movzx */
1201 8, /* "large" insn */
1202 9, /* MOVE_RATIO */
1203 4, /* cost for loading QImode using movzbl */
1204 {5, 5, 4}, /* cost of loading integer registers
1205 in QImode, HImode and SImode.
1206 Relative to reg-reg move (2). */
1207 {4, 4, 4}, /* cost of storing integer registers */
1208 2, /* cost of reg,reg fld/fst */
1209 {5, 5, 12}, /* cost of loading fp registers
1210 in SFmode, DFmode and XFmode */
1211 {4, 4, 8}, /* cost of storing fp registers
1212 in SFmode, DFmode and XFmode */
1213 2, /* cost of moving MMX register */
1214 {4, 4}, /* cost of loading MMX registers
1215 in SImode and DImode */
1216 {4, 4}, /* cost of storing MMX registers
1217 in SImode and DImode */
1218 2, /* cost of moving SSE register */
1219 {4, 4, 4}, /* cost of loading SSE registers
1220 in SImode, DImode and TImode */
1221 {4, 4, 4}, /* cost of storing SSE registers
1222 in SImode, DImode and TImode */
1223 2, /* MMX or SSE register to integer */
1224 16, /* size of l1 cache. */
1225 2048, /* size of l2 cache. */
1226 64, /* size of prefetch block */
1227 /* New AMD processors never drop prefetches; if they cannot be performed
1228 immediately, they are queued. We set the number of simultaneous prefetches
1229 to a large constant to reflect this (leaving the number of prefetches
1230 entirely unlimited is probably not a good idea either, as their execution
1231 also takes some time). */
1232 100, /* number of parallel prefetches */
1233 2, /* Branch cost */
1234 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1235 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1236 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1237 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1238 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1239 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1240
1241 bdver4_memcpy,
1242 bdver4_memset,
1243 6, /* scalar_stmt_cost. */
1244 4, /* scalar load_cost. */
1245 4, /* scalar_store_cost. */
1246 6, /* vec_stmt_cost. */
1247 0, /* vec_to_scalar_cost. */
1248 2, /* scalar_to_vec_cost. */
1249 4, /* vec_align_load_cost. */
1250 4, /* vec_unalign_load_cost. */
1251 4, /* vec_store_cost. */
1252 2, /* cond_taken_branch_cost. */
1253 1, /* cond_not_taken_branch_cost. */
1254 };
1255
1256 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1257 very small blocks it is better to use a loop. For large blocks, a libcall can
1258 do non-temporal accesses and beat inlined code considerably. */
1259 static stringop_algs btver1_memcpy[2] = {
1260 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1261 {-1, rep_prefix_4_byte, false}}},
1262 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1263 {-1, libcall, false}}}};
1264 static stringop_algs btver1_memset[2] = {
1265 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1266 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1267 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1268 {-1, libcall, false}}}};
1269 const struct processor_costs btver1_cost = {
1270 COSTS_N_INSNS (1), /* cost of an add instruction */
1271 COSTS_N_INSNS (2), /* cost of a lea instruction */
1272 COSTS_N_INSNS (1), /* variable shift costs */
1273 COSTS_N_INSNS (1), /* constant shift costs */
1274 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1275 COSTS_N_INSNS (4), /* HI */
1276 COSTS_N_INSNS (3), /* SI */
1277 COSTS_N_INSNS (4), /* DI */
1278 COSTS_N_INSNS (5)}, /* other */
1279 0, /* cost of multiply per each bit set */
1280 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1281 COSTS_N_INSNS (35), /* HI */
1282 COSTS_N_INSNS (51), /* SI */
1283 COSTS_N_INSNS (83), /* DI */
1284 COSTS_N_INSNS (83)}, /* other */
1285 COSTS_N_INSNS (1), /* cost of movsx */
1286 COSTS_N_INSNS (1), /* cost of movzx */
1287 8, /* "large" insn */
1288 9, /* MOVE_RATIO */
1289 4, /* cost for loading QImode using movzbl */
1290 {3, 4, 3}, /* cost of loading integer registers
1291 in QImode, HImode and SImode.
1292 Relative to reg-reg move (2). */
1293 {3, 4, 3}, /* cost of storing integer registers */
1294 4, /* cost of reg,reg fld/fst */
1295 {4, 4, 12}, /* cost of loading fp registers
1296 in SFmode, DFmode and XFmode */
1297 {6, 6, 8}, /* cost of storing fp registers
1298 in SFmode, DFmode and XFmode */
1299 2, /* cost of moving MMX register */
1300 {3, 3}, /* cost of loading MMX registers
1301 in SImode and DImode */
1302 {4, 4}, /* cost of storing MMX registers
1303 in SImode and DImode */
1304 2, /* cost of moving SSE register */
1305 {4, 4, 3}, /* cost of loading SSE registers
1306 in SImode, DImode and TImode */
1307 {4, 4, 5}, /* cost of storing SSE registers
1308 in SImode, DImode and TImode */
1309 3, /* MMX or SSE register to integer */
1310 /* On K8:
1311 MOVD reg64, xmmreg Double FSTORE 4
1312 MOVD reg32, xmmreg Double FSTORE 4
1313 On AMDFAM10:
1314 MOVD reg64, xmmreg Double FADD 3
1315 1/1 1/1
1316 MOVD reg32, xmmreg Double FADD 3
1317 1/1 1/1 */
1318 32, /* size of l1 cache. */
1319 512, /* size of l2 cache. */
1320 64, /* size of prefetch block */
1321 100, /* number of parallel prefetches */
1322 2, /* Branch cost */
1323 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1324 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1325 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1326 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1327 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1328 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1329
1330 btver1_memcpy,
1331 btver1_memset,
1332 4, /* scalar_stmt_cost. */
1333 2, /* scalar load_cost. */
1334 2, /* scalar_store_cost. */
1335 6, /* vec_stmt_cost. */
1336 0, /* vec_to_scalar_cost. */
1337 2, /* scalar_to_vec_cost. */
1338 2, /* vec_align_load_cost. */
1339 2, /* vec_unalign_load_cost. */
1340 2, /* vec_store_cost. */
1341 2, /* cond_taken_branch_cost. */
1342 1, /* cond_not_taken_branch_cost. */
1343 };
1344
1345 static stringop_algs btver2_memcpy[2] = {
1346 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1347 {-1, rep_prefix_4_byte, false}}},
1348 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1349 {-1, libcall, false}}}};
1350 static stringop_algs btver2_memset[2] = {
1351 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1352 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1353 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1354 {-1, libcall, false}}}};
1355 const struct processor_costs btver2_cost = {
1356 COSTS_N_INSNS (1), /* cost of an add instruction */
1357 COSTS_N_INSNS (2), /* cost of a lea instruction */
1358 COSTS_N_INSNS (1), /* variable shift costs */
1359 COSTS_N_INSNS (1), /* constant shift costs */
1360 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1361 COSTS_N_INSNS (4), /* HI */
1362 COSTS_N_INSNS (3), /* SI */
1363 COSTS_N_INSNS (4), /* DI */
1364 COSTS_N_INSNS (5)}, /* other */
1365 0, /* cost of multiply per each bit set */
1366 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1367 COSTS_N_INSNS (35), /* HI */
1368 COSTS_N_INSNS (51), /* SI */
1369 COSTS_N_INSNS (83), /* DI */
1370 COSTS_N_INSNS (83)}, /* other */
1371 COSTS_N_INSNS (1), /* cost of movsx */
1372 COSTS_N_INSNS (1), /* cost of movzx */
1373 8, /* "large" insn */
1374 9, /* MOVE_RATIO */
1375 4, /* cost for loading QImode using movzbl */
1376 {3, 4, 3}, /* cost of loading integer registers
1377 in QImode, HImode and SImode.
1378 Relative to reg-reg move (2). */
1379 {3, 4, 3}, /* cost of storing integer registers */
1380 4, /* cost of reg,reg fld/fst */
1381 {4, 4, 12}, /* cost of loading fp registers
1382 in SFmode, DFmode and XFmode */
1383 {6, 6, 8}, /* cost of storing fp registers
1384 in SFmode, DFmode and XFmode */
1385 2, /* cost of moving MMX register */
1386 {3, 3}, /* cost of loading MMX registers
1387 in SImode and DImode */
1388 {4, 4}, /* cost of storing MMX registers
1389 in SImode and DImode */
1390 2, /* cost of moving SSE register */
1391 {4, 4, 3}, /* cost of loading SSE registers
1392 in SImode, DImode and TImode */
1393 {4, 4, 5}, /* cost of storing SSE registers
1394 in SImode, DImode and TImode */
1395 3, /* MMX or SSE register to integer */
1396 /* On K8:
1397 MOVD reg64, xmmreg Double FSTORE 4
1398 MOVD reg32, xmmreg Double FSTORE 4
1399 On AMDFAM10:
1400 MOVD reg64, xmmreg Double FADD 3
1401 1/1 1/1
1402 MOVD reg32, xmmreg Double FADD 3
1403 1/1 1/1 */
1404 32, /* size of l1 cache. */
1405 2048, /* size of l2 cache. */
1406 64, /* size of prefetch block */
1407 100, /* number of parallel prefetches */
1408 2, /* Branch cost */
1409 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1410 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1411 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1412 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1413 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1414 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1415 btver2_memcpy,
1416 btver2_memset,
1417 4, /* scalar_stmt_cost. */
1418 2, /* scalar_load_cost. */
1419 2, /* scalar_store_cost. */
1420 6, /* vec_stmt_cost. */
1421 0, /* vec_to_scalar_cost. */
1422 2, /* scalar_to_vec_cost. */
1423 2, /* vec_align_load_cost. */
1424 2, /* vec_unalign_load_cost. */
1425 2, /* vec_store_cost. */
1426 2, /* cond_taken_branch_cost. */
1427 1, /* cond_not_taken_branch_cost. */
1428 };
1429
1430 static stringop_algs pentium4_memcpy[2] = {
1431 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1432 DUMMY_STRINGOP_ALGS};
1433 static stringop_algs pentium4_memset[2] = {
1434 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1435 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1436 DUMMY_STRINGOP_ALGS};
1437
1438 static const
1439 struct processor_costs pentium4_cost = {
1440 COSTS_N_INSNS (1), /* cost of an add instruction */
1441 COSTS_N_INSNS (3), /* cost of a lea instruction */
1442 COSTS_N_INSNS (4), /* variable shift costs */
1443 COSTS_N_INSNS (4), /* constant shift costs */
1444 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1445 COSTS_N_INSNS (15), /* HI */
1446 COSTS_N_INSNS (15), /* SI */
1447 COSTS_N_INSNS (15), /* DI */
1448 COSTS_N_INSNS (15)}, /* other */
1449 0, /* cost of multiply per each bit set */
1450 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1451 COSTS_N_INSNS (56), /* HI */
1452 COSTS_N_INSNS (56), /* SI */
1453 COSTS_N_INSNS (56), /* DI */
1454 COSTS_N_INSNS (56)}, /* other */
1455 COSTS_N_INSNS (1), /* cost of movsx */
1456 COSTS_N_INSNS (1), /* cost of movzx */
1457 16, /* "large" insn */
1458 6, /* MOVE_RATIO */
1459 2, /* cost for loading QImode using movzbl */
1460 {4, 5, 4}, /* cost of loading integer registers
1461 in QImode, HImode and SImode.
1462 Relative to reg-reg move (2). */
1463 {2, 3, 2}, /* cost of storing integer registers */
1464 2, /* cost of reg,reg fld/fst */
1465 {2, 2, 6}, /* cost of loading fp registers
1466 in SFmode, DFmode and XFmode */
1467 {4, 4, 6}, /* cost of storing fp registers
1468 in SFmode, DFmode and XFmode */
1469 2, /* cost of moving MMX register */
1470 {2, 2}, /* cost of loading MMX registers
1471 in SImode and DImode */
1472 {2, 2}, /* cost of storing MMX registers
1473 in SImode and DImode */
1474 12, /* cost of moving SSE register */
1475 {12, 12, 12}, /* cost of loading SSE registers
1476 in SImode, DImode and TImode */
1477 {2, 2, 8}, /* cost of storing SSE registers
1478 in SImode, DImode and TImode */
1479 10, /* MMX or SSE register to integer */
1480 8, /* size of l1 cache. */
1481 256, /* size of l2 cache. */
1482 64, /* size of prefetch block */
1483 6, /* number of parallel prefetches */
1484 2, /* Branch cost */
1485 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1486 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1487 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1488 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1489 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1490 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1491 pentium4_memcpy,
1492 pentium4_memset,
1493 1, /* scalar_stmt_cost. */
1494 1, /* scalar_load_cost. */
1495 1, /* scalar_store_cost. */
1496 1, /* vec_stmt_cost. */
1497 1, /* vec_to_scalar_cost. */
1498 1, /* scalar_to_vec_cost. */
1499 1, /* vec_align_load_cost. */
1500 2, /* vec_unalign_load_cost. */
1501 1, /* vec_store_cost. */
1502 3, /* cond_taken_branch_cost. */
1503 1, /* cond_not_taken_branch_cost. */
1504 };
1505
1506 static stringop_algs nocona_memcpy[2] = {
1507 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1508 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1509 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1510
1511 static stringop_algs nocona_memset[2] = {
1512 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1513 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1514 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1515 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1516
1517 static const
1518 struct processor_costs nocona_cost = {
1519 COSTS_N_INSNS (1), /* cost of an add instruction */
1520 COSTS_N_INSNS (1), /* cost of a lea instruction */
1521 COSTS_N_INSNS (1), /* variable shift costs */
1522 COSTS_N_INSNS (1), /* constant shift costs */
1523 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1524 COSTS_N_INSNS (10), /* HI */
1525 COSTS_N_INSNS (10), /* SI */
1526 COSTS_N_INSNS (10), /* DI */
1527 COSTS_N_INSNS (10)}, /* other */
1528 0, /* cost of multiply per each bit set */
1529 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1530 COSTS_N_INSNS (66), /* HI */
1531 COSTS_N_INSNS (66), /* SI */
1532 COSTS_N_INSNS (66), /* DI */
1533 COSTS_N_INSNS (66)}, /* other */
1534 COSTS_N_INSNS (1), /* cost of movsx */
1535 COSTS_N_INSNS (1), /* cost of movzx */
1536 16, /* "large" insn */
1537 17, /* MOVE_RATIO */
1538 4, /* cost for loading QImode using movzbl */
1539 {4, 4, 4}, /* cost of loading integer registers
1540 in QImode, HImode and SImode.
1541 Relative to reg-reg move (2). */
1542 {4, 4, 4}, /* cost of storing integer registers */
1543 3, /* cost of reg,reg fld/fst */
1544 {12, 12, 12}, /* cost of loading fp registers
1545 in SFmode, DFmode and XFmode */
1546 {4, 4, 4}, /* cost of storing fp registers
1547 in SFmode, DFmode and XFmode */
1548 6, /* cost of moving MMX register */
1549 {12, 12}, /* cost of loading MMX registers
1550 in SImode and DImode */
1551 {12, 12}, /* cost of storing MMX registers
1552 in SImode and DImode */
1553 6, /* cost of moving SSE register */
1554 {12, 12, 12}, /* cost of loading SSE registers
1555 in SImode, DImode and TImode */
1556 {12, 12, 12}, /* cost of storing SSE registers
1557 in SImode, DImode and TImode */
1558 8, /* MMX or SSE register to integer */
1559 8, /* size of l1 cache. */
1560 1024, /* size of l2 cache. */
1561 128, /* size of prefetch block */
1562 8, /* number of parallel prefetches */
1563 1, /* Branch cost */
1564 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1565 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1566 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1567 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1568 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1569 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1570 nocona_memcpy,
1571 nocona_memset,
1572 1, /* scalar_stmt_cost. */
1573 1, /* scalar_load_cost. */
1574 1, /* scalar_store_cost. */
1575 1, /* vec_stmt_cost. */
1576 1, /* vec_to_scalar_cost. */
1577 1, /* scalar_to_vec_cost. */
1578 1, /* vec_align_load_cost. */
1579 2, /* vec_unalign_load_cost. */
1580 1, /* vec_store_cost. */
1581 3, /* cond_taken_branch_cost. */
1582 1, /* cond_not_taken_branch_cost. */
1583 };
1584
1585 static stringop_algs atom_memcpy[2] = {
1586 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1587 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1588 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1589 static stringop_algs atom_memset[2] = {
1590 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1591 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1592 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1593 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1594 static const
1595 struct processor_costs atom_cost = {
1596 COSTS_N_INSNS (1), /* cost of an add instruction */
1597 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1598 COSTS_N_INSNS (1), /* variable shift costs */
1599 COSTS_N_INSNS (1), /* constant shift costs */
1600 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1601 COSTS_N_INSNS (4), /* HI */
1602 COSTS_N_INSNS (3), /* SI */
1603 COSTS_N_INSNS (4), /* DI */
1604 COSTS_N_INSNS (2)}, /* other */
1605 0, /* cost of multiply per each bit set */
1606 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1607 COSTS_N_INSNS (26), /* HI */
1608 COSTS_N_INSNS (42), /* SI */
1609 COSTS_N_INSNS (74), /* DI */
1610 COSTS_N_INSNS (74)}, /* other */
1611 COSTS_N_INSNS (1), /* cost of movsx */
1612 COSTS_N_INSNS (1), /* cost of movzx */
1613 8, /* "large" insn */
1614 17, /* MOVE_RATIO */
1615 4, /* cost for loading QImode using movzbl */
1616 {4, 4, 4}, /* cost of loading integer registers
1617 in QImode, HImode and SImode.
1618 Relative to reg-reg move (2). */
1619 {4, 4, 4}, /* cost of storing integer registers */
1620 4, /* cost of reg,reg fld/fst */
1621 {12, 12, 12}, /* cost of loading fp registers
1622 in SFmode, DFmode and XFmode */
1623 {6, 6, 8}, /* cost of storing fp registers
1624 in SFmode, DFmode and XFmode */
1625 2, /* cost of moving MMX register */
1626 {8, 8}, /* cost of loading MMX registers
1627 in SImode and DImode */
1628 {8, 8}, /* cost of storing MMX registers
1629 in SImode and DImode */
1630 2, /* cost of moving SSE register */
1631 {8, 8, 8}, /* cost of loading SSE registers
1632 in SImode, DImode and TImode */
1633 {8, 8, 8}, /* cost of storing SSE registers
1634 in SImode, DImode and TImode */
1635 5, /* MMX or SSE register to integer */
1636 32, /* size of l1 cache. */
1637 256, /* size of l2 cache. */
1638 64, /* size of prefetch block */
1639 6, /* number of parallel prefetches */
1640 3, /* Branch cost */
1641 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1642 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1643 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1644 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1645 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1646 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1647 atom_memcpy,
1648 atom_memset,
1649 1, /* scalar_stmt_cost. */
1650 1, /* scalar_load_cost. */
1651 1, /* scalar_store_cost. */
1652 1, /* vec_stmt_cost. */
1653 1, /* vec_to_scalar_cost. */
1654 1, /* scalar_to_vec_cost. */
1655 1, /* vec_align_load_cost. */
1656 2, /* vec_unalign_load_cost. */
1657 1, /* vec_store_cost. */
1658 3, /* cond_taken_branch_cost. */
1659 1, /* cond_not_taken_branch_cost. */
1660 };
1661
1662 static stringop_algs slm_memcpy[2] = {
1663 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1664 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1665 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1666 static stringop_algs slm_memset[2] = {
1667 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1668 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1669 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1670 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1671 static const
1672 struct processor_costs slm_cost = {
1673 COSTS_N_INSNS (1), /* cost of an add instruction */
1674 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1675 COSTS_N_INSNS (1), /* variable shift costs */
1676 COSTS_N_INSNS (1), /* constant shift costs */
1677 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1678 COSTS_N_INSNS (4), /* HI */
1679 COSTS_N_INSNS (3), /* SI */
1680 COSTS_N_INSNS (4), /* DI */
1681 COSTS_N_INSNS (2)}, /* other */
1682 0, /* cost of multiply per each bit set */
1683 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1684 COSTS_N_INSNS (26), /* HI */
1685 COSTS_N_INSNS (42), /* SI */
1686 COSTS_N_INSNS (74), /* DI */
1687 COSTS_N_INSNS (74)}, /* other */
1688 COSTS_N_INSNS (1), /* cost of movsx */
1689 COSTS_N_INSNS (1), /* cost of movzx */
1690 8, /* "large" insn */
1691 17, /* MOVE_RATIO */
1692 4, /* cost for loading QImode using movzbl */
1693 {4, 4, 4}, /* cost of loading integer registers
1694 in QImode, HImode and SImode.
1695 Relative to reg-reg move (2). */
1696 {4, 4, 4}, /* cost of storing integer registers */
1697 4, /* cost of reg,reg fld/fst */
1698 {12, 12, 12}, /* cost of loading fp registers
1699 in SFmode, DFmode and XFmode */
1700 {6, 6, 8}, /* cost of storing fp registers
1701 in SFmode, DFmode and XFmode */
1702 2, /* cost of moving MMX register */
1703 {8, 8}, /* cost of loading MMX registers
1704 in SImode and DImode */
1705 {8, 8}, /* cost of storing MMX registers
1706 in SImode and DImode */
1707 2, /* cost of moving SSE register */
1708 {8, 8, 8}, /* cost of loading SSE registers
1709 in SImode, DImode and TImode */
1710 {8, 8, 8}, /* cost of storing SSE registers
1711 in SImode, DImode and TImode */
1712 5, /* MMX or SSE register to integer */
1713 32, /* size of l1 cache. */
1714 256, /* size of l2 cache. */
1715 64, /* size of prefetch block */
1716 6, /* number of parallel prefetches */
1717 3, /* Branch cost */
1718 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1719 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1720 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1721 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1722 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1723 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1724 slm_memcpy,
1725 slm_memset,
1726 1, /* scalar_stmt_cost. */
1727 1, /* scalar_load_cost. */
1728 1, /* scalar_store_cost. */
1729 1, /* vec_stmt_cost. */
1730 1, /* vec_to_scalar_cost. */
1731 1, /* scalar_to_vec_cost. */
1732 1, /* vec_align_load_cost. */
1733 2, /* vec_unalign_load_cost. */
1734 1, /* vec_store_cost. */
1735 3, /* cond_taken_branch_cost. */
1736 1, /* cond_not_taken_branch_cost. */
1737 };
1738
1739 /* Generic should produce code tuned for Core-i7 (and newer chips)
1740 and btver1 (and newer chips). */
1741
1742 static stringop_algs generic_memcpy[2] = {
1743 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1744 {-1, libcall, false}}},
1745 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1746 {-1, libcall, false}}}};
1747 static stringop_algs generic_memset[2] = {
1748 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1749 {-1, libcall, false}}},
1750 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1751 {-1, libcall, false}}}};
1752 static const
1753 struct processor_costs generic_cost = {
1754 COSTS_N_INSNS (1), /* cost of an add instruction */
1755 /* On all chips taken into consideration, lea takes 2 cycles or more. With
1756 this cost, however, our current implementation of synth_mult results in
1757 the use of unnecessary temporary registers, causing regressions on several
1758 SPECfp benchmarks. */
1759 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1760 COSTS_N_INSNS (1), /* variable shift costs */
1761 COSTS_N_INSNS (1), /* constant shift costs */
1762 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1763 COSTS_N_INSNS (4), /* HI */
1764 COSTS_N_INSNS (3), /* SI */
1765 COSTS_N_INSNS (4), /* DI */
1766 COSTS_N_INSNS (2)}, /* other */
1767 0, /* cost of multiply per each bit set */
1768 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1769 COSTS_N_INSNS (26), /* HI */
1770 COSTS_N_INSNS (42), /* SI */
1771 COSTS_N_INSNS (74), /* DI */
1772 COSTS_N_INSNS (74)}, /* other */
1773 COSTS_N_INSNS (1), /* cost of movsx */
1774 COSTS_N_INSNS (1), /* cost of movzx */
1775 8, /* "large" insn */
1776 17, /* MOVE_RATIO */
1777 4, /* cost for loading QImode using movzbl */
1778 {4, 4, 4}, /* cost of loading integer registers
1779 in QImode, HImode and SImode.
1780 Relative to reg-reg move (2). */
1781 {4, 4, 4}, /* cost of storing integer registers */
1782 4, /* cost of reg,reg fld/fst */
1783 {12, 12, 12}, /* cost of loading fp registers
1784 in SFmode, DFmode and XFmode */
1785 {6, 6, 8}, /* cost of storing fp registers
1786 in SFmode, DFmode and XFmode */
1787 2, /* cost of moving MMX register */
1788 {8, 8}, /* cost of loading MMX registers
1789 in SImode and DImode */
1790 {8, 8}, /* cost of storing MMX registers
1791 in SImode and DImode */
1792 2, /* cost of moving SSE register */
1793 {8, 8, 8}, /* cost of loading SSE registers
1794 in SImode, DImode and TImode */
1795 {8, 8, 8}, /* cost of storing SSE registers
1796 in SImode, DImode and TImode */
1797 5, /* MMX or SSE register to integer */
1798 32, /* size of l1 cache. */
1799 512, /* size of l2 cache. */
1800 64, /* size of prefetch block */
1801 6, /* number of parallel prefetches */
1802 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1803 value is increased to the perhaps more appropriate value of 5. */
1804 3, /* Branch cost */
1805 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1806 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1807 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1808 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1809 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1810 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1811 generic_memcpy,
1812 generic_memset,
1813 1, /* scalar_stmt_cost. */
1814 1, /* scalar_load_cost. */
1815 1, /* scalar_store_cost. */
1816 1, /* vec_stmt_cost. */
1817 1, /* vec_to_scalar_cost. */
1818 1, /* scalar_to_vec_cost. */
1819 1, /* vec_align_load_cost. */
1820 2, /* vec_unalign_load_cost. */
1821 1, /* vec_store_cost. */
1822 3, /* cond_taken_branch_cost. */
1823 1, /* cond_not_taken_branch_cost. */
1824 };
1825
1826 /* core_cost should produce code tuned for the Core family of CPUs. */
1827 static stringop_algs core_memcpy[2] = {
1828 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1829 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1830 {-1, libcall, false}}}};
1831 static stringop_algs core_memset[2] = {
1832 {libcall, {{6, loop_1_byte, true},
1833 {24, loop, true},
1834 {8192, rep_prefix_4_byte, true},
1835 {-1, libcall, false}}},
1836 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1837 {-1, libcall, false}}}};
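/* How to read these stringop tables (an illustrative note, not part of the
   original sources): each stringop_algs initializer gives the algorithm used
   when the size is unknown, followed by {max_size, alg, noalign} ranges tried
   in order for known sizes; index [0] of each pair is assumed to be used for
   32-bit code and [1] for 64-bit code, matching the memcpy[TARGET_64BIT != 0]
   lookups further below.  So in core_memset above, a 64-bit memset of at most
   24 bytes expands as a loop, sizes up to 512 use rep_prefix_8_byte, and
   larger or unknown sizes fall back to a libcall.  */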
1838
1839 static const
1840 struct processor_costs core_cost = {
1841 COSTS_N_INSNS (1), /* cost of an add instruction */
1842 /* On all chips taken into consideration, lea takes 2 cycles or more. With
1843 this cost, however, our current implementation of synth_mult results in
1844 the use of unnecessary temporary registers, causing regressions on several
1845 SPECfp benchmarks. */
1846 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1847 COSTS_N_INSNS (1), /* variable shift costs */
1848 COSTS_N_INSNS (1), /* constant shift costs */
1849 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1850 COSTS_N_INSNS (4), /* HI */
1851 COSTS_N_INSNS (3), /* SI */
1852 COSTS_N_INSNS (4), /* DI */
1853 COSTS_N_INSNS (2)}, /* other */
1854 0, /* cost of multiply per each bit set */
1855 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1856 COSTS_N_INSNS (26), /* HI */
1857 COSTS_N_INSNS (42), /* SI */
1858 COSTS_N_INSNS (74), /* DI */
1859 COSTS_N_INSNS (74)}, /* other */
1860 COSTS_N_INSNS (1), /* cost of movsx */
1861 COSTS_N_INSNS (1), /* cost of movzx */
1862 8, /* "large" insn */
1863 17, /* MOVE_RATIO */
1864 4, /* cost for loading QImode using movzbl */
1865 {4, 4, 4}, /* cost of loading integer registers
1866 in QImode, HImode and SImode.
1867 Relative to reg-reg move (2). */
1868 {4, 4, 4}, /* cost of storing integer registers */
1869 4, /* cost of reg,reg fld/fst */
1870 {12, 12, 12}, /* cost of loading fp registers
1871 in SFmode, DFmode and XFmode */
1872 {6, 6, 8}, /* cost of storing fp registers
1873 in SFmode, DFmode and XFmode */
1874 2, /* cost of moving MMX register */
1875 {8, 8}, /* cost of loading MMX registers
1876 in SImode and DImode */
1877 {8, 8}, /* cost of storing MMX registers
1878 in SImode and DImode */
1879 2, /* cost of moving SSE register */
1880 {8, 8, 8}, /* cost of loading SSE registers
1881 in SImode, DImode and TImode */
1882 {8, 8, 8}, /* cost of storing SSE registers
1883 in SImode, DImode and TImode */
1884 5, /* MMX or SSE register to integer */
1885 64, /* size of l1 cache. */
1886 512, /* size of l2 cache. */
1887 64, /* size of prefetch block */
1888 6, /* number of parallel prefetches */
1889 /* FIXME perhaps more appropriate value is 5. */
1890 3, /* Branch cost */
1891 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1892 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1893 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1894 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1895 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1896 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1897 core_memcpy,
1898 core_memset,
1899 1, /* scalar_stmt_cost. */
1900 1, /* scalar_load_cost. */
1901 1, /* scalar_store_cost. */
1902 1, /* vec_stmt_cost. */
1903 1, /* vec_to_scalar_cost. */
1904 1, /* scalar_to_vec_cost. */
1905 1, /* vec_align_load_cost. */
1906 2, /* vec_unalign_load_cost. */
1907 1, /* vec_store_cost. */
1908 3, /* cond_taken_branch_cost. */
1909 1, /* cond_not_taken_branch_cost. */
1910 };
1911
1912
1913 /* Set by -mtune. */
1914 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1915
1916 /* Set by -mtune or -Os. */
1917 const struct processor_costs *ix86_cost = &pentium_cost;
1918
1919 /* Processor feature/optimization bitmasks. */
1920 #define m_386 (1<<PROCESSOR_I386)
1921 #define m_486 (1<<PROCESSOR_I486)
1922 #define m_PENT (1<<PROCESSOR_PENTIUM)
1923 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1924 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1925 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1926 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1927 #define m_CORE2 (1<<PROCESSOR_CORE2)
1928 #define m_COREI7 (1<<PROCESSOR_COREI7)
1929 #define m_COREI7_AVX (1<<PROCESSOR_COREI7_AVX)
1930 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1931 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_COREI7_AVX | m_HASWELL)
1932 #define m_ATOM (1<<PROCESSOR_ATOM)
1933 #define m_SLM (1<<PROCESSOR_SLM)
1934
1935 #define m_GEODE (1<<PROCESSOR_GEODE)
1936 #define m_K6 (1<<PROCESSOR_K6)
1937 #define m_K6_GEODE (m_K6 | m_GEODE)
1938 #define m_K8 (1<<PROCESSOR_K8)
1939 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1940 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1941 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1942 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1943 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1944 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1945 #define m_BDVER4 (1<<PROCESSOR_BDVER4)
1946 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1947 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1948 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
1949 #define m_BTVER (m_BTVER1 | m_BTVER2)
1950 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1951
1952 #define m_GENERIC (1<<PROCESSOR_GENERIC)
1953
1954 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
1955 #undef DEF_TUNE
1956 #define DEF_TUNE(tune, name, selector) name,
1957 #include "x86-tune.def"
1958 #undef DEF_TUNE
1959 };
1960
1961 /* Feature tests against the various tunings. */
1962 unsigned char ix86_tune_features[X86_TUNE_LAST];
1963
1964 /* Feature tests against the various tunings used to create ix86_tune_features
1965 based on the processor mask. */
1966 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1967 #undef DEF_TUNE
1968 #define DEF_TUNE(tune, name, selector) selector,
1969 #include "x86-tune.def"
1970 #undef DEF_TUNE
1971 };
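/* Illustrative sketch of how the two arrays above are filled in (the entry
   below is only an example of the expected shape; x86-tune.def holds the
   real list):

     DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
	       m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
	       | m_AMD_MULTIPLE | m_GENERIC)

   contributes the name "schedule" to ix86_tune_feature_names and the
   processor-mask selector to initial_ix86_tune_features, so the feature is
   enabled for a given -mtune whenever (1u << ix86_tune) intersects the
   selector (see set_ix86_tune_features below).  */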
1972
1973 /* Feature tests against the various architecture variations. */
1974 unsigned char ix86_arch_features[X86_ARCH_LAST];
1975
1976 /* Feature tests against the various architecture variations, used to create
1977 ix86_arch_features based on the processor mask. */
1978 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
1979 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
1980 ~(m_386 | m_486 | m_PENT | m_K6),
1981
1982 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
1983 ~m_386,
1984
1985 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1986 ~(m_386 | m_486),
1987
1988 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
1989 ~m_386,
1990
1991 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
1992 ~m_386,
1993 };
1994
1995 /* In case the average insn count for single function invocation is
1996 lower than this constant, emit fast (but longer) prologue and
1997 epilogue code. */
1998 #define FAST_PROLOGUE_INSN_COUNT 20
1999
2000 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2001 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2002 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2003 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2004
2005 /* Array of the smallest class containing reg number REGNO, indexed by
2006 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2007
2008 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2009 {
2010 /* ax, dx, cx, bx */
2011 AREG, DREG, CREG, BREG,
2012 /* si, di, bp, sp */
2013 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2014 /* FP registers */
2015 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2016 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2017 /* arg pointer */
2018 NON_Q_REGS,
2019 /* flags, fpsr, fpcr, frame */
2020 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2021 /* SSE registers */
2022 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2023 SSE_REGS, SSE_REGS,
2024 /* MMX registers */
2025 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2026 MMX_REGS, MMX_REGS,
2027 /* REX registers */
2028 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2029 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2030 /* SSE REX registers */
2031 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2032 SSE_REGS, SSE_REGS,
2033 /* AVX-512 SSE registers */
2034 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2035 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2036 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2037 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2038 /* Mask registers. */
2039 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2040 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2041 /* MPX bound registers */
2042 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2043 };
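/* Illustrative example (not from the original sources): with the table
   above, REGNO_REG_CLASS (AX_REG) yields AREG, while REGNO_REG_CLASS (SP_REG)
   yields NON_Q_REGS since %sp has no byte-addressable low part in 32-bit
   mode.  */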
2044
2045 /* The "default" register map used in 32bit mode. */
2046
2047 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2048 {
2049 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2050 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2051 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2052 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2053 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2054 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2055 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2056 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2057 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2058 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2059 101, 102, 103, 104, /* bound registers */
2060 };
2061
2062 /* The "default" register map used in 64bit mode. */
2063
2064 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2065 {
2066 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2067 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2068 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2069 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2070 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2071 8, 9, 10, 11, 12, 13, 14, 15, /* extended integer registers */
2072 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2073 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2074 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2075 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2076 126, 127, 128, 129, /* bound registers */
2077 };
2078
2079 /* Define the register numbers to be used in Dwarf debugging information.
2080 The SVR4 reference port C compiler uses the following register numbers
2081 in its Dwarf output code:
2082 0 for %eax (gcc regno = 0)
2083 1 for %ecx (gcc regno = 2)
2084 2 for %edx (gcc regno = 1)
2085 3 for %ebx (gcc regno = 3)
2086 4 for %esp (gcc regno = 7)
2087 5 for %ebp (gcc regno = 6)
2088 6 for %esi (gcc regno = 4)
2089 7 for %edi (gcc regno = 5)
2090 The following three DWARF register numbers are never generated by
2091 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2092 believes these numbers have these meanings.
2093 8 for %eip (no gcc equivalent)
2094 9 for %eflags (gcc regno = 17)
2095 10 for %trapno (no gcc equivalent)
2096 It is not at all clear how we should number the FP stack registers
2097 for the x86 architecture. If the version of SDB on x86/svr4 were
2098 a bit less brain dead with respect to floating-point then we would
2099 have a precedent to follow with respect to DWARF register numbers
2100 for x86 FP registers, but the SDB on x86/svr4 is so completely
2101 broken with respect to FP registers that it is hardly worth thinking
2102 of it as something to strive for compatibility with.
2103 The version of x86/svr4 SDB I have at the moment does (partially)
2104 seem to believe that DWARF register number 11 is associated with
2105 the x86 register %st(0), but that's about all. Higher DWARF
2106 register numbers don't seem to be associated with anything in
2107 particular, and even for DWARF regno 11, SDB only seems to under-
2108 stand that it should say that a variable lives in %st(0) (when
2109 asked via an `=' command) if we said it was in DWARF regno 11,
2110 but SDB still prints garbage when asked for the value of the
2111 variable in question (via a `/' command).
2112 (Also note that the labels SDB prints for various FP stack regs
2113 when doing an `x' command are all wrong.)
2114 Note that these problems generally don't affect the native SVR4
2115 C compiler because it doesn't allow the use of -O with -g and
2116 because when it is *not* optimizing, it allocates a memory
2117 location for each floating-point variable, and the memory
2118 location is what gets described in the DWARF AT_location
2119 attribute for the variable in question.
2120 Regardless of the severe mental illness of the x86/svr4 SDB, we
2121 do something sensible here and we use the following DWARF
2122 register numbers. Note that these are all stack-top-relative
2123 numbers.
2124 11 for %st(0) (gcc regno = 8)
2125 12 for %st(1) (gcc regno = 9)
2126 13 for %st(2) (gcc regno = 10)
2127 14 for %st(3) (gcc regno = 11)
2128 15 for %st(4) (gcc regno = 12)
2129 16 for %st(5) (gcc regno = 13)
2130 17 for %st(6) (gcc regno = 14)
2131 18 for %st(7) (gcc regno = 15)
2132 */
2133 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2134 {
2135 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2136 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2137 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2138 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2139 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2140 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2141 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2142 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2143 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2144 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2145 -1, -1, -1, -1, /* bound registers */
2146 };
2147
2148 /* Define parameter passing and return registers. */
2149
2150 static int const x86_64_int_parameter_registers[6] =
2151 {
2152 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2153 };
2154
2155 static int const x86_64_ms_abi_int_parameter_registers[4] =
2156 {
2157 CX_REG, DX_REG, R8_REG, R9_REG
2158 };
2159
2160 static int const x86_64_int_return_registers[4] =
2161 {
2162 AX_REG, DX_REG, DI_REG, SI_REG
2163 };
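/* Illustrative example (not part of the original sources): under the SysV
   AMD64 convention encoded by the tables above, a call f (a, b, c) with
   integer arguments passes a, b and c in DI_REG, SI_REG and DX_REG
   (%rdi, %rsi, %rdx) and returns an integer result in AX_REG (%rax); the
   MS ABI instead starts its argument sequence with CX_REG and DX_REG.  */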
2164
2165 /* Additional registers that are clobbered by SYSV calls. */
2166
2167 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2168 {
2169 SI_REG, DI_REG,
2170 XMM6_REG, XMM7_REG,
2171 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2172 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2173 };
2174
2175 /* Define the structure for the machine field in struct function. */
2176
2177 struct GTY(()) stack_local_entry {
2178 unsigned short mode;
2179 unsigned short n;
2180 rtx rtl;
2181 struct stack_local_entry *next;
2182 };
2183
2184 /* Structure describing stack frame layout.
2185 Stack grows downward:
2186
2187 [arguments]
2188 <- ARG_POINTER
2189 saved pc
2190
2191 saved static chain if ix86_static_chain_on_stack
2192
2193 saved frame pointer if frame_pointer_needed
2194 <- HARD_FRAME_POINTER
2195 [saved regs]
2196 <- regs_save_offset
2197 [padding0]
2198
2199 [saved SSE regs]
2200 <- sse_regs_save_offset
2201 [padding1] |
2202 | <- FRAME_POINTER
2203 [va_arg registers] |
2204 |
2205 [frame] |
2206 |
2207 [padding2] | = to_allocate
2208 <- STACK_POINTER
2209 */
2210 struct ix86_frame
2211 {
2212 int nsseregs;
2213 int nregs;
2214 int va_arg_size;
2215 int red_zone_size;
2216 int outgoing_arguments_size;
2217
2218 /* The offsets relative to ARG_POINTER. */
2219 HOST_WIDE_INT frame_pointer_offset;
2220 HOST_WIDE_INT hard_frame_pointer_offset;
2221 HOST_WIDE_INT stack_pointer_offset;
2222 HOST_WIDE_INT hfp_save_offset;
2223 HOST_WIDE_INT reg_save_offset;
2224 HOST_WIDE_INT sse_reg_save_offset;
2225
2226 /* When save_regs_using_mov is set, emit prologue using
2227 move instead of push instructions. */
2228 bool save_regs_using_mov;
2229 };
2230
2231 /* Which cpu are we scheduling for. */
2232 enum attr_cpu ix86_schedule;
2233
2234 /* Which cpu are we optimizing for. */
2235 enum processor_type ix86_tune;
2236
2237 /* Which instruction set architecture to use. */
2238 enum processor_type ix86_arch;
2239
2240 /* True if processor has SSE prefetch instruction. */
2241 unsigned char x86_prefetch_sse;
2242
2243 /* -mstackrealign option */
2244 static const char ix86_force_align_arg_pointer_string[]
2245 = "force_align_arg_pointer";
2246
2247 static rtx (*ix86_gen_leave) (void);
2248 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2249 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2250 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2251 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2252 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2253 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2254 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2255 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2256 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2257 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2258 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2259
2260 /* Preferred alignment for stack boundary in bits. */
2261 unsigned int ix86_preferred_stack_boundary;
2262
2263 /* Alignment for incoming stack boundary in bits specified at
2264 command line. */
2265 static unsigned int ix86_user_incoming_stack_boundary;
2266
2267 /* Default alignment for incoming stack boundary in bits. */
2268 static unsigned int ix86_default_incoming_stack_boundary;
2269
2270 /* Alignment for incoming stack boundary in bits. */
2271 unsigned int ix86_incoming_stack_boundary;
2272
2273 /* Calling abi specific va_list type nodes. */
2274 static GTY(()) tree sysv_va_list_type_node;
2275 static GTY(()) tree ms_va_list_type_node;
2276
2277 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2278 char internal_label_prefix[16];
2279 int internal_label_prefix_len;
2280
2281 /* Fence to use after loop using movnt. */
2282 tree x86_mfence;
2283
2284 /* Register class used for passing a given 64-bit part of the argument.
2285 These represent classes as documented by the psABI, with the exception
2286 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2287 uses SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
2288
2289 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2290 whenever possible (upper half does contain padding). */
2291 enum x86_64_reg_class
2292 {
2293 X86_64_NO_CLASS,
2294 X86_64_INTEGER_CLASS,
2295 X86_64_INTEGERSI_CLASS,
2296 X86_64_SSE_CLASS,
2297 X86_64_SSESF_CLASS,
2298 X86_64_SSEDF_CLASS,
2299 X86_64_SSEUP_CLASS,
2300 X86_64_X87_CLASS,
2301 X86_64_X87UP_CLASS,
2302 X86_64_COMPLEX_X87_CLASS,
2303 X86_64_MEMORY_CLASS
2304 };
2305
2306 #define MAX_CLASSES 4
2307
2308 /* Table of constants used by fldpi, fldln2, etc.... */
2309 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2310 static bool ext_80387_constants_init = 0;
2311
2312 \f
2313 static struct machine_function * ix86_init_machine_status (void);
2314 static rtx ix86_function_value (const_tree, const_tree, bool);
2315 static bool ix86_function_value_regno_p (const unsigned int);
2316 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2317 const_tree);
2318 static rtx ix86_static_chain (const_tree, bool);
2319 static int ix86_function_regparm (const_tree, const_tree);
2320 static void ix86_compute_frame_layout (struct ix86_frame *);
2321 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2322 rtx, rtx, int);
2323 static void ix86_add_new_builtins (HOST_WIDE_INT);
2324 static tree ix86_canonical_va_list_type (tree);
2325 static void predict_jump (int);
2326 static unsigned int split_stack_prologue_scratch_regno (void);
2327 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2328
2329 enum ix86_function_specific_strings
2330 {
2331 IX86_FUNCTION_SPECIFIC_ARCH,
2332 IX86_FUNCTION_SPECIFIC_TUNE,
2333 IX86_FUNCTION_SPECIFIC_MAX
2334 };
2335
2336 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2337 const char *, enum fpmath_unit, bool);
2338 static void ix86_function_specific_save (struct cl_target_option *,
2339 struct gcc_options *opts);
2340 static void ix86_function_specific_restore (struct gcc_options *opts,
2341 struct cl_target_option *);
2342 static void ix86_function_specific_print (FILE *, int,
2343 struct cl_target_option *);
2344 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2345 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2346 struct gcc_options *,
2347 struct gcc_options *,
2348 struct gcc_options *);
2349 static bool ix86_can_inline_p (tree, tree);
2350 static void ix86_set_current_function (tree);
2351 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2352
2353 static enum calling_abi ix86_function_abi (const_tree);
2354
2355 \f
2356 #ifndef SUBTARGET32_DEFAULT_CPU
2357 #define SUBTARGET32_DEFAULT_CPU "i386"
2358 #endif
2359
2360 /* Whether -mtune= or -march= were specified */
2361 static int ix86_tune_defaulted;
2362 static int ix86_arch_specified;
2363
2364 /* Vectorization library interface and handlers. */
2365 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2366
2367 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2368 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2369
2370 /* Processor target table, indexed by processor number */
2371 struct ptt
2372 {
2373 const struct processor_costs *cost; /* Processor costs */
2374 const int align_loop; /* Default alignments. */
2375 const int align_loop_max_skip;
2376 const int align_jump;
2377 const int align_jump_max_skip;
2378 const int align_func;
2379 };
2380
2381 static const struct ptt processor_target_table[PROCESSOR_max] =
2382 {
2383 {&i386_cost, 4, 3, 4, 3, 4},
2384 {&i486_cost, 16, 15, 16, 15, 16},
2385 {&pentium_cost, 16, 7, 16, 7, 16},
2386 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2387 {&geode_cost, 0, 0, 0, 0, 0},
2388 {&k6_cost, 32, 7, 32, 7, 32},
2389 {&athlon_cost, 16, 7, 16, 7, 16},
2390 {&pentium4_cost, 0, 0, 0, 0, 0},
2391 {&k8_cost, 16, 7, 16, 7, 16},
2392 {&nocona_cost, 0, 0, 0, 0, 0},
2393 /* Core 2 */
2394 {&core_cost, 16, 10, 16, 10, 16},
2395 /* Core i7 */
2396 {&core_cost, 16, 10, 16, 10, 16},
2397 /* Core i7 avx */
2398 {&core_cost, 16, 10, 16, 10, 16},
2399 /* Core avx2 */
2400 {&core_cost, 16, 10, 16, 10, 16},
2401 {&generic_cost, 16, 10, 16, 10, 16},
2402 {&amdfam10_cost, 32, 24, 32, 7, 32},
2403 {&bdver1_cost, 16, 10, 16, 7, 11},
2404 {&bdver2_cost, 16, 10, 16, 7, 11},
2405 {&bdver3_cost, 16, 10, 16, 7, 11},
2406 {&bdver4_cost, 16, 10, 16, 7, 11},
2407 {&btver1_cost, 16, 10, 16, 7, 11},
2408 {&btver2_cost, 16, 10, 16, 7, 11},
2409 {&atom_cost, 16, 15, 16, 7, 16},
2410 {&slm_cost, 16, 15, 16, 7, 16}
2411 };
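/* Reading the table above (illustrative): the Core 2 row
   {&core_cost, 16, 10, 16, 10, 16} selects core_cost and requests 16-byte
   alignment for loops, jump targets and functions, skipping at most 10
   bytes of padding for the loop and jump alignments.  */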
2412
2413 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2414 {
2415 "generic",
2416 "i386",
2417 "i486",
2418 "pentium",
2419 "pentium-mmx",
2420 "pentiumpro",
2421 "pentium2",
2422 "pentium3",
2423 "pentium4",
2424 "pentium-m",
2425 "prescott",
2426 "nocona",
2427 "core2",
2428 "corei7",
2429 "corei7-avx",
2430 "core-avx2",
2431 "atom",
2432 "slm",
2433 "geode",
2434 "k6",
2435 "k6-2",
2436 "k6-3",
2437 "athlon",
2438 "athlon-4",
2439 "k8",
2440 "amdfam10",
2441 "bdver1",
2442 "bdver2",
2443 "bdver3",
2444 "bdver4",
2445 "btver1",
2446 "btver2"
2447 };
2448 \f
2449 static bool
2450 gate_insert_vzeroupper (void)
2451 {
2452 return TARGET_AVX && !TARGET_AVX512F && TARGET_VZEROUPPER;
2453 }
2454
2455 static unsigned int
2456 rest_of_handle_insert_vzeroupper (void)
2457 {
2458 int i;
2459
2460 /* vzeroupper instructions are inserted immediately after reload to
2461 account for possible spills from 256-bit registers. The pass
2462 reuses the mode switching infrastructure by re-running the mode
2463 insertion pass, so disable entities that have already been processed. */
2464 for (i = 0; i < MAX_386_ENTITIES; i++)
2465 ix86_optimize_mode_switching[i] = 0;
2466
2467 ix86_optimize_mode_switching[AVX_U128] = 1;
2468
2469 /* Call optimize_mode_switching. */
2470 g->get_passes ()->execute_pass_mode_switching ();
2471 return 0;
2472 }
2473
2474 namespace {
2475
2476 const pass_data pass_data_insert_vzeroupper =
2477 {
2478 RTL_PASS, /* type */
2479 "vzeroupper", /* name */
2480 OPTGROUP_NONE, /* optinfo_flags */
2481 true, /* has_gate */
2482 true, /* has_execute */
2483 TV_NONE, /* tv_id */
2484 0, /* properties_required */
2485 0, /* properties_provided */
2486 0, /* properties_destroyed */
2487 0, /* todo_flags_start */
2488 ( TODO_df_finish | TODO_verify_rtl_sharing | 0 ), /* todo_flags_finish */
2489 };
2490
2491 class pass_insert_vzeroupper : public rtl_opt_pass
2492 {
2493 public:
2494 pass_insert_vzeroupper(gcc::context *ctxt)
2495 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2496 {}
2497
2498 /* opt_pass methods: */
2499 bool gate () { return gate_insert_vzeroupper (); }
2500 unsigned int execute () { return rest_of_handle_insert_vzeroupper (); }
2501
2502 }; // class pass_insert_vzeroupper
2503
2504 } // anon namespace
2505
2506 rtl_opt_pass *
2507 make_pass_insert_vzeroupper (gcc::context *ctxt)
2508 {
2509 return new pass_insert_vzeroupper (ctxt);
2510 }
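/* A minimal sketch of how the pass above is expected to be registered (the
   exact call site is assumed to live elsewhere in this file, in the option
   override code; register_pass_info, register_pass and PASS_POS_INSERT_AFTER
   come from the pass manager):

     struct register_pass_info insert_vzeroupper_info
       = { make_pass_insert_vzeroupper (g), "reload",
	   1, PASS_POS_INSERT_AFTER };
     register_pass (&insert_vzeroupper_info);

   so that the pass runs once, right after reload, consistent with the
   comment in rest_of_handle_insert_vzeroupper above.  */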
2511
2512 /* Return true if a red-zone is in use. */
2513
2514 static inline bool
2515 ix86_using_red_zone (void)
2516 {
2517 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2518 }
2519 \f
2520 /* Return a string that documents the current -m options. The caller is
2521 responsible for freeing the string. */
2522
2523 static char *
2524 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2525 const char *tune, enum fpmath_unit fpmath,
2526 bool add_nl_p)
2527 {
2528 struct ix86_target_opts
2529 {
2530 const char *option; /* option string */
2531 HOST_WIDE_INT mask; /* isa mask options */
2532 };
2533
2534 /* This table is ordered so that options like -msse4.2 that imply
2535 preceding options are matched first. */
2536 static struct ix86_target_opts isa_opts[] =
2537 {
2538 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2539 { "-mfma", OPTION_MASK_ISA_FMA },
2540 { "-mxop", OPTION_MASK_ISA_XOP },
2541 { "-mlwp", OPTION_MASK_ISA_LWP },
2542 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2543 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2544 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2545 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2546 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2547 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2548 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2549 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2550 { "-msse3", OPTION_MASK_ISA_SSE3 },
2551 { "-msse2", OPTION_MASK_ISA_SSE2 },
2552 { "-msse", OPTION_MASK_ISA_SSE },
2553 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2554 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2555 { "-mmmx", OPTION_MASK_ISA_MMX },
2556 { "-mabm", OPTION_MASK_ISA_ABM },
2557 { "-mbmi", OPTION_MASK_ISA_BMI },
2558 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2559 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2560 { "-mhle", OPTION_MASK_ISA_HLE },
2561 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2562 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2563 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2564 { "-madx", OPTION_MASK_ISA_ADX },
2565 { "-mtbm", OPTION_MASK_ISA_TBM },
2566 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2567 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2568 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2569 { "-maes", OPTION_MASK_ISA_AES },
2570 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2571 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2572 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2573 { "-mf16c", OPTION_MASK_ISA_F16C },
2574 { "-mrtm", OPTION_MASK_ISA_RTM },
2575 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2576 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2577 { "-mmpx", OPTION_MASK_ISA_MPX },
2578 };
2579
2580 /* Flag options. */
2581 static struct ix86_target_opts flag_opts[] =
2582 {
2583 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2584 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2585 { "-m80387", MASK_80387 },
2586 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2587 { "-malign-double", MASK_ALIGN_DOUBLE },
2588 { "-mcld", MASK_CLD },
2589 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2590 { "-mieee-fp", MASK_IEEE_FP },
2591 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2592 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2593 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2594 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2595 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2596 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2597 { "-mno-red-zone", MASK_NO_RED_ZONE },
2598 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2599 { "-mrecip", MASK_RECIP },
2600 { "-mrtd", MASK_RTD },
2601 { "-msseregparm", MASK_SSEREGPARM },
2602 { "-mstack-arg-probe", MASK_STACK_PROBE },
2603 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2604 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2605 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2606 { "-mvzeroupper", MASK_VZEROUPPER },
2607 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2608 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2609 { "-mprefer-avx128", MASK_PREFER_AVX128},
2610 };
2611
2612 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2613
2614 char isa_other[40];
2615 char target_other[40];
2616 unsigned num = 0;
2617 unsigned i, j;
2618 char *ret;
2619 char *ptr;
2620 size_t len;
2621 size_t line_len;
2622 size_t sep_len;
2623 const char *abi;
2624
2625 memset (opts, '\0', sizeof (opts));
2626
2627 /* Add -march= option. */
2628 if (arch)
2629 {
2630 opts[num][0] = "-march=";
2631 opts[num++][1] = arch;
2632 }
2633
2634 /* Add -mtune= option. */
2635 if (tune)
2636 {
2637 opts[num][0] = "-mtune=";
2638 opts[num++][1] = tune;
2639 }
2640
2641 /* Add -m32/-m64/-mx32. */
2642 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2643 {
2644 if ((isa & OPTION_MASK_ABI_64) != 0)
2645 abi = "-m64";
2646 else
2647 abi = "-mx32";
2648 isa &= ~ (OPTION_MASK_ISA_64BIT
2649 | OPTION_MASK_ABI_64
2650 | OPTION_MASK_ABI_X32);
2651 }
2652 else
2653 abi = "-m32";
2654 opts[num++][0] = abi;
2655
2656 /* Pick out the ISA options that are set. */
2657 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2658 {
2659 if ((isa & isa_opts[i].mask) != 0)
2660 {
2661 opts[num++][0] = isa_opts[i].option;
2662 isa &= ~ isa_opts[i].mask;
2663 }
2664 }
2665
2666 if (isa && add_nl_p)
2667 {
2668 opts[num++][0] = isa_other;
2669 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2670 isa);
2671 }
2672
2673 /* Add flag options. */
2674 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2675 {
2676 if ((flags & flag_opts[i].mask) != 0)
2677 {
2678 opts[num++][0] = flag_opts[i].option;
2679 flags &= ~ flag_opts[i].mask;
2680 }
2681 }
2682
2683 if (flags && add_nl_p)
2684 {
2685 opts[num++][0] = target_other;
2686 sprintf (target_other, "(other flags: %#x)", flags);
2687 }
2688
2689 /* Add the -mfpmath= option. */
2690 if (fpmath)
2691 {
2692 opts[num][0] = "-mfpmath=";
2693 switch ((int) fpmath)
2694 {
2695 case FPMATH_387:
2696 opts[num++][1] = "387";
2697 break;
2698
2699 case FPMATH_SSE:
2700 opts[num++][1] = "sse";
2701 break;
2702
2703 case FPMATH_387 | FPMATH_SSE:
2704 opts[num++][1] = "sse+387";
2705 break;
2706
2707 default:
2708 gcc_unreachable ();
2709 }
2710 }
2711
2712 /* Any options? */
2713 if (num == 0)
2714 return NULL;
2715
2716 gcc_assert (num < ARRAY_SIZE (opts));
2717
2718 /* Size the string. */
2719 len = 0;
2720 sep_len = (add_nl_p) ? 3 : 1;
2721 for (i = 0; i < num; i++)
2722 {
2723 len += sep_len;
2724 for (j = 0; j < 2; j++)
2725 if (opts[i][j])
2726 len += strlen (opts[i][j]);
2727 }
2728
2729 /* Build the string. */
2730 ret = ptr = (char *) xmalloc (len);
2731 line_len = 0;
2732
2733 for (i = 0; i < num; i++)
2734 {
2735 size_t len2[2];
2736
2737 for (j = 0; j < 2; j++)
2738 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2739
2740 if (i != 0)
2741 {
2742 *ptr++ = ' ';
2743 line_len++;
2744
2745 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2746 {
2747 *ptr++ = '\\';
2748 *ptr++ = '\n';
2749 line_len = 0;
2750 }
2751 }
2752
2753 for (j = 0; j < 2; j++)
2754 if (opts[i][j])
2755 {
2756 memcpy (ptr, opts[i][j], len2[j]);
2757 ptr += len2[j];
2758 line_len += len2[j];
2759 }
2760 }
2761
2762 *ptr = '\0';
2763 gcc_assert (ret + len >= ptr);
2764
2765 return ret;
2766 }
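/* For illustration only (not generated output): given -m64 -march=corei7
   -msse4.2 -mfpmath=sse, the function above would build a string roughly of
   the form "-march=corei7 -m64 -msse4.2 -msse4.1 ... -mfpmath=sse", i.e.
   -march=/-mtune= first, then the ABI switch, then the ISA and flag options
   in table order, and -mfpmath= last, with "\\\n" breaks inserted when
   ADD_NL_P is set and a line would exceed 70 characters.  */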
2767
2768 /* Return true if profiling code should be emitted before the
2769 prologue, otherwise return false.
2770 Note: for x86 with "hotfix", a sorry () diagnostic is issued. */
2771 static bool
2772 ix86_profile_before_prologue (void)
2773 {
2774 return flag_fentry != 0;
2775 }
2776
2777 /* Function that is callable from the debugger to print the current
2778 options. */
2779 void ATTRIBUTE_UNUSED
2780 ix86_debug_options (void)
2781 {
2782 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2783 ix86_arch_string, ix86_tune_string,
2784 ix86_fpmath, true);
2785
2786 if (opts)
2787 {
2788 fprintf (stderr, "%s\n\n", opts);
2789 free (opts);
2790 }
2791 else
2792 fputs ("<no options>\n\n", stderr);
2793
2794 return;
2795 }
2796
2797 static const char *stringop_alg_names[] = {
2798 #define DEF_ENUM
2799 #define DEF_ALG(alg, name) #name,
2800 #include "stringop.def"
2801 #undef DEF_ENUM
2802 #undef DEF_ALG
2803 };
2804
2805 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
2806 The string is of the following form (or a comma-separated list of such entries):
2807
2808 strategy_alg:max_size:[align|noalign]
2809
2810 where the full size range for the strategy is either [0, max_size] or
2811 [min_size, max_size], in which min_size is the max_size + 1 of the
2812 preceding range. The last size range must have max_size == -1.
2813
2814 Examples:
2815
2816 1.
2817 -mmemcpy-strategy=libcall:-1:noalign
2818
2819 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
2820
2821
2822 2.
2823 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
2824
2825 This tells the compiler to use the following strategy for memset:
2826 1) when the expected size is between [1, 16], use rep_8byte strategy;
2827 2) when the size is between [17, 2048], use vector_loop;
2828 3) when the size is > 2048, use libcall. */
2829
2830 struct stringop_size_range
2831 {
2832 int max;
2833 stringop_alg alg;
2834 bool noalign;
2835 };
2836
2837 static void
2838 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
2839 {
2840 const struct stringop_algs *default_algs;
2841 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
2842 char *curr_range_str, *next_range_str;
2843 int i = 0, n = 0;
2844
2845 if (is_memset)
2846 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
2847 else
2848 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
2849
2850 curr_range_str = strategy_str;
2851
2852 do
2853 {
2854 int maxs;
2855 stringop_alg alg;
2856 char alg_name[128];
2857 char align[16];
2858 next_range_str = strchr (curr_range_str, ',');
2859 if (next_range_str)
2860 *next_range_str++ = '\0';
2861
2862 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
2863 alg_name, &maxs, align))
2864 {
2865 error ("wrong arg %s to option %s", curr_range_str,
2866 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2867 return;
2868 }
2869
2870 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
2871 {
2872 error ("size ranges of option %s should be increasing",
2873 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2874 return;
2875 }
2876
2877 for (i = 0; i < last_alg; i++)
2878 {
2879 if (!strcmp (alg_name, stringop_alg_names[i]))
2880 {
2881 alg = (stringop_alg) i;
2882 break;
2883 }
2884 }
2885
2886 if (i == last_alg)
2887 {
2888 error ("wrong stringop strategy name %s specified for option %s",
2889 alg_name,
2890 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2891 return;
2892 }
2893
2894 input_ranges[n].max = maxs;
2895 input_ranges[n].alg = alg;
2896 if (!strcmp (align, "align"))
2897 input_ranges[n].noalign = false;
2898 else if (!strcmp (align, "noalign"))
2899 input_ranges[n].noalign = true;
2900 else
2901 {
2902 error ("unknown alignment %s specified for option %s",
2903 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2904 return;
2905 }
2906 n++;
2907 curr_range_str = next_range_str;
2908 }
2909 while (curr_range_str);
2910
2911 if (input_ranges[n - 1].max != -1)
2912 {
2913 error ("the max value for the last size range should be -1"
2914 " for option %s",
2915 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2916 return;
2917 }
2918
2919 if (n > MAX_STRINGOP_ALGS)
2920 {
2921 error ("too many size ranges specified in option %s",
2922 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2923 return;
2924 }
2925
2926 /* Now override the default algs array. */
2927 for (i = 0; i < n; i++)
2928 {
2929 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
2930 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
2931 = input_ranges[i].alg;
2932 *const_cast<int *>(&default_algs->size[i].noalign)
2933 = input_ranges[i].noalign;
2934 }
2935 }
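/* Worked example (illustrative): with
   -mmemset-strategy=vector_loop:2048:align,libcall:-1:noalign the loop above
   parses two ranges, {max = 2048, alg = vector_loop, noalign = false} and
   {max = -1, alg = libcall, noalign = true}, and overwrites the first two
   entries of the default 32-bit or 64-bit memset table accordingly.  */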
2936
2937 \f
2938 /* Parse the -mtune-ctrl= option. When DUMP is true,
2939 print the features that are explicitly set. */
2940
2941 static void
2942 parse_mtune_ctrl_str (bool dump)
2943 {
2944 if (!ix86_tune_ctrl_string)
2945 return;
2946
2947 char *next_feature_string = NULL;
2948 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
2949 char *orig = curr_feature_string;
2950 int i;
2951 do
2952 {
2953 bool clear = false;
2954
2955 next_feature_string = strchr (curr_feature_string, ',');
2956 if (next_feature_string)
2957 *next_feature_string++ = '\0';
2958 if (*curr_feature_string == '^')
2959 {
2960 curr_feature_string++;
2961 clear = true;
2962 }
2963 for (i = 0; i < X86_TUNE_LAST; i++)
2964 {
2965 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
2966 {
2967 ix86_tune_features[i] = !clear;
2968 if (dump)
2969 fprintf (stderr, "Explicitly %s feature %s\n",
2970 clear ? "clear" : "set", ix86_tune_feature_names[i]);
2971 break;
2972 }
2973 }
2974 if (i == X86_TUNE_LAST)
2975 error ("Unknown parameter to option -mtune-ctrl: %s",
2976 clear ? curr_feature_string - 1 : curr_feature_string);
2977 curr_feature_string = next_feature_string;
2978 }
2979 while (curr_feature_string);
2980 free (orig);
2981 }
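/* Illustrative usage (the feature names here are placeholders; the real
   names come from x86-tune.def via ix86_tune_feature_names):
   -mtune-ctrl=some_feature,^other_feature sets ix86_tune_features[] for
   "some_feature" and clears it for "other_feature"; an unrecognized name is
   reported with error ().  */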
2982
2983 /* Helper function to set ix86_tune_features. IX86_TUNE is the
2984 processor type. */
2985
2986 static void
2987 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
2988 {
2989 unsigned int ix86_tune_mask = 1u << ix86_tune;
2990 int i;
2991
2992 for (i = 0; i < X86_TUNE_LAST; ++i)
2993 {
2994 if (ix86_tune_no_default)
2995 ix86_tune_features[i] = 0;
2996 else
2997 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
2998 }
2999
3000 if (dump)
3001 {
3002 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3003 for (i = 0; i < X86_TUNE_LAST; i++)
3004 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3005 ix86_tune_features[i] ? "on" : "off");
3006 }
3007
3008 parse_mtune_ctrl_str (dump);
3009 }
3010
3011
3012 /* Override various settings based on options. If MAIN_ARGS_P, the
3013 options are from the command line, otherwise they are from
3014 attributes. */
3015
3016 static void
3017 ix86_option_override_internal (bool main_args_p,
3018 struct gcc_options *opts,
3019 struct gcc_options *opts_set)
3020 {
3021 int i;
3022 unsigned int ix86_arch_mask;
3023 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3024 const char *prefix;
3025 const char *suffix;
3026 const char *sw;
3027
3028 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3029 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3030 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3031 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3032 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3033 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3034 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3035 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3036 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3037 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3038 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3039 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3040 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3041 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3042 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3043 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3044 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3045 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3046 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3047 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3048 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3049 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3050 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3051 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3052 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3053 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3054 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3055 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3056 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3057 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3058 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3059 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3060 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3061 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3062 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3063 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3064 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3065 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3066 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3067 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
3068 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
3069 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
3070 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
3071 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
3072 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
3073
3074 /* If this reaches 64, we need to widen the struct pta flags field below. */
3075
3076 static struct pta
3077 {
3078 const char *const name; /* processor name or nickname. */
3079 const enum processor_type processor;
3080 const enum attr_cpu schedule;
3081 const unsigned HOST_WIDE_INT flags;
3082 }
3083 const processor_alias_table[] =
3084 {
3085 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3086 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3087 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3088 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3089 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3090 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3091 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3092 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3093 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3094 PTA_MMX | PTA_SSE | PTA_FXSR},
3095 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3096 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3097 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3098 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3099 PTA_MMX | PTA_SSE | PTA_FXSR},
3100 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3101 PTA_MMX | PTA_SSE | PTA_FXSR},
3102 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3103 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3104 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3105 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3106 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3107 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3108 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3109 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3110 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3111 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3112 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3113 {"core2", PROCESSOR_CORE2, CPU_CORE2,
3114 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3115 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
3116 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
3117 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
3118 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR},
3119 {"corei7-avx", PROCESSOR_COREI7_AVX, CPU_COREI7,
3120 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3121 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3122 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
3123 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3124 {"core-avx-i", PROCESSOR_COREI7_AVX, CPU_COREI7,
3125 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3126 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3127 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3128 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3129 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
3130 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3131 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3132 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3133 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3134 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
3135 | PTA_XSAVEOPT},
3136 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3137 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3138 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
3139 {"slm", PROCESSOR_SLM, CPU_SLM,
3140 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3141 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_MOVBE
3142 | PTA_FXSR},
3143 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3144 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3145 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3146 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3147 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3148 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3149 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3150 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3151 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3152 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3153 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3154 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3155 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3156 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3157 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3158 {"x86-64", PROCESSOR_K8, CPU_K8,
3159 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3160 {"k8", PROCESSOR_K8, CPU_K8,
3161 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3162 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3163 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3164 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3165 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3166 {"opteron", PROCESSOR_K8, CPU_K8,
3167 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3168 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3169 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3170 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3171 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3172 {"athlon64", PROCESSOR_K8, CPU_K8,
3173 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3174 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3175 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3176 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3177 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3178 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3179 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3180 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3181 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3182 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3183 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3184 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3185 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3186 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3187 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3188 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3189 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3190 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3191 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3192 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3193 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3194 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3195 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3196 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3197 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3198 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3199 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3200 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3201 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3202 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3203 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3204 | PTA_XSAVEOPT | PTA_FSGSBASE},
3205 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3206 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3207 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3208 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3209 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3210 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3211 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE},
3212 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3213 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3214 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3215 | PTA_FXSR | PTA_XSAVE},
3216 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3217 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3218 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3219 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3220 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3221 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3222
3223 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3224 PTA_64BIT
3225 | PTA_HLE /* flags are only used for -march switch. */ },
3226 };
3227
3228 /* -mrecip options. */
3229 static struct
3230 {
3231 const char *string; /* option name */
3232 unsigned int mask; /* mask bits to set */
3233 }
3234 const recip_options[] =
3235 {
3236 { "all", RECIP_MASK_ALL },
3237 { "none", RECIP_MASK_NONE },
3238 { "div", RECIP_MASK_DIV },
3239 { "sqrt", RECIP_MASK_SQRT },
3240 { "vec-div", RECIP_MASK_VEC_DIV },
3241 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3242 };
3243
3244 int const pta_size = ARRAY_SIZE (processor_alias_table);
3245
3246 /* Set up prefix/suffix so the error messages refer to either the command
3247 line argument, or the attribute(target). */
3248 if (main_args_p)
3249 {
3250 prefix = "-m";
3251 suffix = "";
3252 sw = "switch";
3253 }
3254 else
3255 {
3256 prefix = "option(\"";
3257 suffix = "\")";
3258 sw = "attribute";
3259 }
3260
3261 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3262 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3263 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3264 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3265 #ifdef TARGET_BI_ARCH
3266 else
3267 {
3268 #if TARGET_BI_ARCH == 1
3269 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3270 is on and OPTION_MASK_ABI_X32 is off. We turn off
3271 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3272 -mx32. */
3273 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3274 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3275 #else
3276 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3277 on and OPTION_MASK_ABI_64 is off. We turn off
3278 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3279 -m64. */
3280 if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3281 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3282 #endif
3283 }
3284 #endif
3285
3286 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3287 {
3288 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3289 OPTION_MASK_ABI_64 for TARGET_X32. */
3290 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3291 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3292 }
3293 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3294 {
3295 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3296 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3297 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3298 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3299 }
3300
3301 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3302 SUBTARGET_OVERRIDE_OPTIONS;
3303 #endif
3304
3305 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3306 SUBSUBTARGET_OVERRIDE_OPTIONS;
3307 #endif
3308
3309 /* -fPIC is the default for x86_64. */
3310 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3311 opts->x_flag_pic = 2;
3312
3313 /* Need to check -mtune=generic first. */
3314 if (opts->x_ix86_tune_string)
3315 {
3316 if (!strcmp (opts->x_ix86_tune_string, "generic")
3317 || !strcmp (opts->x_ix86_tune_string, "i686")
3318 /* As special support for cross compilers we read -mtune=native
3319 as -mtune=generic. With native compilers we won't see the
3320 -mtune=native, as it was changed by the driver. */
3321 || !strcmp (opts->x_ix86_tune_string, "native"))
3322 {
3323 opts->x_ix86_tune_string = "generic";
3324 }
3325 /* If this call is for setting the option attribute, allow the
3326 generic that was previously set. */
3327 else if (!main_args_p
3328 && !strcmp (opts->x_ix86_tune_string, "generic"))
3329 ;
3330 else if (!strncmp (opts->x_ix86_tune_string, "generic", 7))
3331 error ("bad value (%s) for %stune=%s %s",
3332 opts->x_ix86_tune_string, prefix, suffix, sw);
3333 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3334 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3335 "%stune=k8%s or %stune=generic%s instead as appropriate",
3336 prefix, suffix, prefix, suffix, prefix, suffix);
3337 }
3338 else
3339 {
3340 if (opts->x_ix86_arch_string)
3341 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3342 if (!opts->x_ix86_tune_string)
3343 {
3344 opts->x_ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3345 ix86_tune_defaulted = 1;
3346 }
3347
3348 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3349 or defaulted. We need to use a sensible tune option. */
3350 if (!strcmp (opts->x_ix86_tune_string, "generic")
3351 || !strcmp (opts->x_ix86_tune_string, "x86-64")
3352 || !strcmp (opts->x_ix86_tune_string, "i686"))
3353 {
3354 opts->x_ix86_tune_string = "generic";
3355 }
3356 }
3357
3358 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3359 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3360 {
3361 /* rep; movq isn't available in 32-bit code. */
3362 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3363 opts->x_ix86_stringop_alg = no_stringop;
3364 }
3365
3366 if (!opts->x_ix86_arch_string)
3367 opts->x_ix86_arch_string
3368 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3369 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3370 else
3371 ix86_arch_specified = 1;
3372
3373 if (opts_set->x_ix86_pmode)
3374 {
3375 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3376 && opts->x_ix86_pmode == PMODE_SI)
3377 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3378 && opts->x_ix86_pmode == PMODE_DI))
3379 error ("address mode %qs not supported in the %s bit mode",
3380 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3381 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3382 }
3383 else
3384 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3385 ? PMODE_DI : PMODE_SI;
3386
3387 if (!opts_set->x_ix86_abi)
3388 opts->x_ix86_abi = DEFAULT_ABI;
3389
3390 /* For targets using the MS ABI, enable ms-extensions unless it was
3391 explicitly turned off. For a non-MS ABI we turn this
3392 option off. */
3393 if (!opts_set->x_flag_ms_extensions)
3394 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3395
3396 if (opts_set->x_ix86_cmodel)
3397 {
3398 switch (opts->x_ix86_cmodel)
3399 {
3400 case CM_SMALL:
3401 case CM_SMALL_PIC:
3402 if (opts->x_flag_pic)
3403 opts->x_ix86_cmodel = CM_SMALL_PIC;
3404 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3405 error ("code model %qs not supported in the %s bit mode",
3406 "small", "32");
3407 break;
3408
3409 case CM_MEDIUM:
3410 case CM_MEDIUM_PIC:
3411 if (opts->x_flag_pic)
3412 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3413 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3414 error ("code model %qs not supported in the %s bit mode",
3415 "medium", "32");
3416 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3417 error ("code model %qs not supported in x32 mode",
3418 "medium");
3419 break;
3420
3421 case CM_LARGE:
3422 case CM_LARGE_PIC:
3423 if (opts->x_flag_pic)
3424 opts->x_ix86_cmodel = CM_LARGE_PIC;
3425 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3426 error ("code model %qs not supported in the %s bit mode",
3427 "large", "32");
3428 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3429 error ("code model %qs not supported in x32 mode",
3430 "large");
3431 break;
3432
3433 case CM_32:
3434 if (opts->x_flag_pic)
3435 error ("code model %s does not support PIC mode", "32");
3436 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3437 error ("code model %qs not supported in the %s bit mode",
3438 "32", "64");
3439 break;
3440
3441 case CM_KERNEL:
3442 if (opts->x_flag_pic)
3443 {
3444 error ("code model %s does not support PIC mode", "kernel");
3445 opts->x_ix86_cmodel = CM_32;
3446 }
3447 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3448 error ("code model %qs not supported in the %s bit mode",
3449 "kernel", "32");
3450 break;
3451
3452 default:
3453 gcc_unreachable ();
3454 }
3455 }
3456 else
3457 {
3458 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3459 use of rip-relative addressing. This eliminates fixups that
3460 would otherwise be needed if this object is to be placed in a
3461 DLL, and is essentially just as efficient as direct addressing. */
3462 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3463 && (TARGET_RDOS || TARGET_PECOFF))
3464 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3465 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3466 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3467 else
3468 opts->x_ix86_cmodel = CM_32;
3469 }
3470 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3471 {
3472 error ("-masm=intel not supported in this configuration");
3473 opts->x_ix86_asm_dialect = ASM_ATT;
3474 }
3475 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3476 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3477 sorry ("%i-bit mode not compiled in",
3478 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3479
3480 for (i = 0; i < pta_size; i++)
3481 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3482 {
3483 ix86_schedule = processor_alias_table[i].schedule;
3484 ix86_arch = processor_alias_table[i].processor;
3485 /* Default cpu tuning to the architecture. */
3486 ix86_tune = ix86_arch;
3487
3488 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3489 && !(processor_alias_table[i].flags & PTA_64BIT))
3490 error ("CPU you selected does not support x86-64 "
3491 "instruction set");
3492
3493 if (processor_alias_table[i].flags & PTA_MMX
3494 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3495 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3496 if (processor_alias_table[i].flags & PTA_3DNOW
3497 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3498 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3499 if (processor_alias_table[i].flags & PTA_3DNOW_A
3500 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3501 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3502 if (processor_alias_table[i].flags & PTA_SSE
3503 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3504 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3505 if (processor_alias_table[i].flags & PTA_SSE2
3506 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3507 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3508 if (processor_alias_table[i].flags & PTA_SSE3
3509 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3510 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3511 if (processor_alias_table[i].flags & PTA_SSSE3
3512 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3513 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3514 if (processor_alias_table[i].flags & PTA_SSE4_1
3515 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3516 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3517 if (processor_alias_table[i].flags & PTA_SSE4_2
3518 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3519 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3520 if (processor_alias_table[i].flags & PTA_AVX
3521 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3522 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3523 if (processor_alias_table[i].flags & PTA_AVX2
3524 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3525 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3526 if (processor_alias_table[i].flags & PTA_FMA
3527 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3528 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3529 if (processor_alias_table[i].flags & PTA_SSE4A
3530 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3531 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3532 if (processor_alias_table[i].flags & PTA_FMA4
3533 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3534 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3535 if (processor_alias_table[i].flags & PTA_XOP
3536 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3537 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3538 if (processor_alias_table[i].flags & PTA_LWP
3539 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3540 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3541 if (processor_alias_table[i].flags & PTA_ABM
3542 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3543 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3544 if (processor_alias_table[i].flags & PTA_BMI
3545 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3546 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3547 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3548 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3549 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3550 if (processor_alias_table[i].flags & PTA_TBM
3551 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3552 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3553 if (processor_alias_table[i].flags & PTA_BMI2
3554 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3555 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3556 if (processor_alias_table[i].flags & PTA_CX16
3557 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3558 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3559 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3560 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3561 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3562 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
3563 && (processor_alias_table[i].flags & PTA_NO_SAHF))
3564 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3565 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3566 if (processor_alias_table[i].flags & PTA_MOVBE
3567 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3568 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3569 if (processor_alias_table[i].flags & PTA_AES
3570 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3571 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
3572 if (processor_alias_table[i].flags & PTA_PCLMUL
3573 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3574 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3575 if (processor_alias_table[i].flags & PTA_FSGSBASE
3576 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3577 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3578 if (processor_alias_table[i].flags & PTA_RDRND
3579 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3580 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3581 if (processor_alias_table[i].flags & PTA_F16C
3582 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3583 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3584 if (processor_alias_table[i].flags & PTA_RTM
3585 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3586 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3587 if (processor_alias_table[i].flags & PTA_HLE
3588 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3589 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3590 if (processor_alias_table[i].flags & PTA_PRFCHW
3591 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3592 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3593 if (processor_alias_table[i].flags & PTA_RDSEED
3594 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3595 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3596 if (processor_alias_table[i].flags & PTA_ADX
3597 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3598 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3599 if (processor_alias_table[i].flags & PTA_FXSR
3600 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3601 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3602 if (processor_alias_table[i].flags & PTA_XSAVE
3603 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3604 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3605 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3606 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3607 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3608 if (processor_alias_table[i].flags & PTA_AVX512F
3609 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
3610 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
3611 if (processor_alias_table[i].flags & PTA_AVX512ER
3612 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
3613 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
3614 if (processor_alias_table[i].flags & PTA_AVX512PF
3615 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
3616 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
3617 if (processor_alias_table[i].flags & PTA_AVX512CD
3618 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
3619 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
3620 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3621 x86_prefetch_sse = true;
3622
3623 break;
3624 }
3625
3626 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3627 error ("generic CPU can be used only for %stune=%s %s",
3628 prefix, suffix, sw);
3629 else if (!strncmp (opts->x_ix86_arch_string, "generic", 7) || i == pta_size)
3630 error ("bad value (%s) for %sarch=%s %s",
3631 opts->x_ix86_arch_string, prefix, suffix, sw);
3632
3633 ix86_arch_mask = 1u << ix86_arch;
3634 for (i = 0; i < X86_ARCH_LAST; ++i)
3635 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3636
3637 for (i = 0; i < pta_size; i++)
3638 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
3639 {
3640 ix86_schedule = processor_alias_table[i].schedule;
3641 ix86_tune = processor_alias_table[i].processor;
3642 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3643 {
3644 if (!(processor_alias_table[i].flags & PTA_64BIT))
3645 {
3646 if (ix86_tune_defaulted)
3647 {
3648 opts->x_ix86_tune_string = "x86-64";
3649 for (i = 0; i < pta_size; i++)
3650 if (! strcmp (opts->x_ix86_tune_string,
3651 processor_alias_table[i].name))
3652 break;
3653 ix86_schedule = processor_alias_table[i].schedule;
3654 ix86_tune = processor_alias_table[i].processor;
3655 }
3656 else
3657 error ("CPU you selected does not support x86-64 "
3658 "instruction set");
3659 }
3660 }
3661 /* Intel CPUs have always interpreted SSE prefetch instructions as
3662 NOPs; so, we can enable SSE prefetch instructions even when
3663 -mtune (rather than -march) points us to a processor that has them.
3664 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3665 higher processors. */
3666 if (TARGET_CMOV
3667 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3668 x86_prefetch_sse = true;
3669 break;
3670 }
3671
3672 if (ix86_tune_specified && i == pta_size)
3673 error ("bad value (%s) for %stune=%s %s",
3674 opts->x_ix86_tune_string, prefix, suffix, sw);
3675
3676 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
3677
3678 #ifndef USE_IX86_FRAME_POINTER
3679 #define USE_IX86_FRAME_POINTER 0
3680 #endif
3681
3682 #ifndef USE_X86_64_FRAME_POINTER
3683 #define USE_X86_64_FRAME_POINTER 0
3684 #endif
3685
3686 /* Set the default values for switches whose default depends on TARGET_64BIT
3687 in case they weren't overwritten by command line options. */
3688 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3689 {
3690 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
3691 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3692 if (opts->x_flag_asynchronous_unwind_tables == 2)
3693 opts->x_flag_unwind_tables
3694 = opts->x_flag_asynchronous_unwind_tables = 1;
3695 if (opts->x_flag_pcc_struct_return == 2)
3696 opts->x_flag_pcc_struct_return = 0;
3697 }
3698 else
3699 {
3700 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
3701 opts->x_flag_omit_frame_pointer
3702 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
3703 if (opts->x_flag_asynchronous_unwind_tables == 2)
3704 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3705 if (opts->x_flag_pcc_struct_return == 2)
3706 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3707 }
3708
3709 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3710 if (opts->x_optimize_size)
3711 ix86_cost = &ix86_size_cost;
3712 else
3713 ix86_cost = ix86_tune_cost;
3714
3715 /* Arrange to set up i386_stack_locals for all functions. */
3716 init_machine_status = ix86_init_machine_status;
3717
3718 /* Validate -mregparm= value. */
3719 if (opts_set->x_ix86_regparm)
3720 {
3721 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3722 warning (0, "-mregparm is ignored in 64-bit mode");
3723 if (opts->x_ix86_regparm > REGPARM_MAX)
3724 {
3725 error ("-mregparm=%d is not between 0 and %d",
3726 opts->x_ix86_regparm, REGPARM_MAX);
3727 opts->x_ix86_regparm = 0;
3728 }
3729 }
3730 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3731 opts->x_ix86_regparm = REGPARM_MAX;
3732
3733 /* Default align_* from the processor table. */
3734 if (opts->x_align_loops == 0)
3735 {
3736 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3737 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3738 }
3739 if (opts->x_align_jumps == 0)
3740 {
3741 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3742 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3743 }
3744 if (opts->x_align_functions == 0)
3745 {
3746 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3747 }
3748
3749 /* Provide default for -mbranch-cost= value. */
3750 if (!opts_set->x_ix86_branch_cost)
3751 opts->x_ix86_branch_cost = ix86_cost->branch_cost;
3752
3753 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3754 {
3755 opts->x_target_flags
3756 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
3757
3758 /* Enable by default the SSE and MMX builtins. Do allow the user to
3759 explicitly disable any of these. In particular, disabling SSE and
3760 MMX for kernel code is extremely useful. */
3761 if (!ix86_arch_specified)
3762 opts->x_ix86_isa_flags
3763 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3764 | TARGET_SUBTARGET64_ISA_DEFAULT)
3765 & ~opts->x_ix86_isa_flags_explicit);
3766
3767 if (TARGET_RTD_P (opts->x_target_flags))
3768 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3769 }
3770 else
3771 {
3772 opts->x_target_flags
3773 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
3774
3775 if (!ix86_arch_specified)
3776 opts->x_ix86_isa_flags
3777 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
3778
3779 /* The i386 ABI does not specify a red zone. It still makes sense to use
3780 one when the programmer takes care to keep the stack from being destroyed. */
3781 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
3782 opts->x_target_flags |= MASK_NO_RED_ZONE;
3783 }
3784
3785 /* Keep nonleaf frame pointers. */
3786 if (opts->x_flag_omit_frame_pointer)
3787 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3788 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
3789 opts->x_flag_omit_frame_pointer = 1;
3790
3791 /* If we're doing fast math, we don't care about comparison order
3792 wrt NaNs. This lets us use a shorter comparison sequence. */
3793 if (opts->x_flag_finite_math_only)
3794 opts->x_target_flags &= ~MASK_IEEE_FP;
3795
3796 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3797 since the insns won't need emulation. */
3798 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
3799 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
3800
3801 /* Likewise, if the target doesn't have a 387, or we've specified
3802 software floating point, don't use 387 inline intrinsics. */
3803 if (!TARGET_80387_P (opts->x_target_flags))
3804 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
3805
3806 /* Turn on MMX builtins for -msse. */
3807 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
3808 opts->x_ix86_isa_flags
3809 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
3810
3811 /* Enable SSE prefetch. */
3812 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
3813 || (TARGET_PRFCHW && !TARGET_3DNOW_P (opts->x_ix86_isa_flags)))
3814 x86_prefetch_sse = true;
3815
3816 /* Enable prefetch{,w} instructions for -m3dnow. */
3817 if (TARGET_3DNOW_P (opts->x_ix86_isa_flags))
3818 opts->x_ix86_isa_flags
3819 |= OPTION_MASK_ISA_PRFCHW & ~opts->x_ix86_isa_flags_explicit;
3820
3821 /* Enable popcnt instruction for -msse4.2 or -mabm. */
3822 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
3823 || TARGET_ABM_P (opts->x_ix86_isa_flags))
3824 opts->x_ix86_isa_flags
3825 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
3826
3827 /* Enable lzcnt instruction for -mabm. */
3828 if (TARGET_ABM_P (opts->x_ix86_isa_flags))
3829 opts->x_ix86_isa_flags
3830 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
3831
3832 /* Validate -mpreferred-stack-boundary= value or default it to
3833 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3834 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3835 if (opts_set->x_ix86_preferred_stack_boundary_arg)
3836 {
3837 int min = (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3838 ? (TARGET_SSE_P (opts->x_ix86_isa_flags) ? 4 : 3) : 2);
3839 int max = (TARGET_SEH ? 4 : 12);
3840
3841 if (opts->x_ix86_preferred_stack_boundary_arg < min
3842 || opts->x_ix86_preferred_stack_boundary_arg > max)
3843 {
3844 if (min == max)
3845 error ("-mpreferred-stack-boundary is not supported "
3846 "for this target");
3847 else
3848 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3849 opts->x_ix86_preferred_stack_boundary_arg, min, max);
3850 }
3851 else
3852 ix86_preferred_stack_boundary
3853 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3854 }
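/* Worked example (illustrative only): with BITS_PER_UNIT == 8, a value of
   -mpreferred-stack-boundary=4 yields (1 << 4) * 8 = 128 bits, i.e. 16-byte
   stack alignment; the 32-bit minimum of 2 corresponds to 4 bytes and the
   non-SEH maximum of 12 to 4096 bytes.  */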
3855
3856 /* Set the default value for -mstackrealign. */
3857 if (opts->x_ix86_force_align_arg_pointer == -1)
3858 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3859
3860 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3861
3862 /* Validate -mincoming-stack-boundary= value or default it to
3863 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3864 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3865 if (opts_set->x_ix86_incoming_stack_boundary_arg)
3866 {
3867 if (ix86_incoming_stack_boundary_arg
3868 < (TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2)
3869 || ix86_incoming_stack_boundary_arg > 12)
3870 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3871 ix86_incoming_stack_boundary_arg,
3872 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2);
3873 else
3874 {
3875 ix86_user_incoming_stack_boundary
3876 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3877 ix86_incoming_stack_boundary
3878 = ix86_user_incoming_stack_boundary;
3879 }
3880 }
3881
3882 /* Accept -msseregparm only if at least SSE support is enabled. */
3883 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
3884 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
3885 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3886
3887 if (opts_set->x_ix86_fpmath)
3888 {
3889 if (opts->x_ix86_fpmath & FPMATH_SSE)
3890 {
3891 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
3892 {
3893 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3894 opts->x_ix86_fpmath = FPMATH_387;
3895 }
3896 else if ((opts->x_ix86_fpmath & FPMATH_387)
3897 && !TARGET_80387_P (opts->x_target_flags))
3898 {
3899 warning (0, "387 instruction set disabled, using SSE arithmetics");
3900 opts->x_ix86_fpmath = FPMATH_SSE;
3901 }
3902 }
3903 }
3904 /* For all chips supporting SSE2, -mfpmath=sse performs better than
3905 -mfpmath=387. The latter is nevertheless the default on many targets,
3906 since the extra 80-bit precision of temporaries is considered part of
3907 the ABI. Override that default at least for -ffast-math.
3908 TODO: -mfpmath=both seems to produce equally fast code with slightly
3909 smaller binaries, but it is not clear whether register allocation is
3910 ready for this setting.
3911 Also, -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
3912 codegen. We may switch to 387 with -ffast-math for size-optimized
3913 functions. */
3914 else if (fast_math_flags_set_p (&global_options)
3915 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
3916 ix86_fpmath = FPMATH_SSE;
3917 else
3918 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
3919
3920 /* If the i387 is disabled, then do not return values in it. */
3921 if (!TARGET_80387_P (opts->x_target_flags))
3922 opts->x_target_flags &= ~MASK_FLOAT_RETURNS;
3923
3924 /* Use an external vectorized library when vectorizing intrinsics. */
3925 if (opts_set->x_ix86_veclibabi_type)
3926 switch (opts->x_ix86_veclibabi_type)
3927 {
3928 case ix86_veclibabi_type_svml:
3929 ix86_veclib_handler = ix86_veclibabi_svml;
3930 break;
3931
3932 case ix86_veclibabi_type_acml:
3933 ix86_veclib_handler = ix86_veclibabi_acml;
3934 break;
3935
3936 default:
3937 gcc_unreachable ();
3938 }
3939
3940 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
3941 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
3942 && !opts->x_optimize_size)
3943 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3944
3945 /* If stack probes are required, the space used for large function
3946 arguments on the stack must also be probed, so enable
3947 -maccumulate-outgoing-args so this happens in the prologue. */
3948 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
3949 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3950 {
3951 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
3952 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3953 "for correctness", prefix, suffix);
3954 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3955 }
3956
3957 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3958 {
3959 char *p;
3960 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3961 p = strchr (internal_label_prefix, 'X');
3962 internal_label_prefix_len = p - internal_label_prefix;
3963 *p = '\0';
3964 }
3965
3966 /* When a scheduling description is not available, disable the scheduler
3967 pass so it won't slow down compilation and make x87 code slower. */
3968 if (!TARGET_SCHEDULE)
3969 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
3970
3971 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3972 ix86_tune_cost->simultaneous_prefetches,
3973 opts->x_param_values,
3974 opts_set->x_param_values);
3975 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3976 ix86_tune_cost->prefetch_block,
3977 opts->x_param_values,
3978 opts_set->x_param_values);
3979 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3980 ix86_tune_cost->l1_cache_size,
3981 opts->x_param_values,
3982 opts_set->x_param_values);
3983 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3984 ix86_tune_cost->l2_cache_size,
3985 opts->x_param_values,
3986 opts_set->x_param_values);
3987
3988 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3989 if (opts->x_flag_prefetch_loop_arrays < 0
3990 && HAVE_prefetch
3991 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
3992 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3993 opts->x_flag_prefetch_loop_arrays = 1;
3994
3995 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3996 can be optimized to ap = __builtin_next_arg (0). */
3997 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
3998 targetm.expand_builtin_va_start = NULL;
3999
4000 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4001 {
4002 ix86_gen_leave = gen_leave_rex64;
4003 if (Pmode == DImode)
4004 {
4005 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4006 ix86_gen_tls_local_dynamic_base_64
4007 = gen_tls_local_dynamic_base_64_di;
4008 }
4009 else
4010 {
4011 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4012 ix86_gen_tls_local_dynamic_base_64
4013 = gen_tls_local_dynamic_base_64_si;
4014 }
4015 }
4016 else
4017 ix86_gen_leave = gen_leave;
4018
4019 if (Pmode == DImode)
4020 {
4021 ix86_gen_add3 = gen_adddi3;
4022 ix86_gen_sub3 = gen_subdi3;
4023 ix86_gen_sub3_carry = gen_subdi3_carry;
4024 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4025 ix86_gen_andsp = gen_anddi3;
4026 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4027 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4028 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4029 ix86_gen_monitor = gen_sse3_monitor_di;
4030 }
4031 else
4032 {
4033 ix86_gen_add3 = gen_addsi3;
4034 ix86_gen_sub3 = gen_subsi3;
4035 ix86_gen_sub3_carry = gen_subsi3_carry;
4036 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4037 ix86_gen_andsp = gen_andsi3;
4038 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4039 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4040 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4041 ix86_gen_monitor = gen_sse3_monitor_si;
4042 }
4043
4044 #ifdef USE_IX86_CLD
4045 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4046 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4047 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4048 #endif
4049
4050 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
4051 {
4052 if (opts->x_flag_fentry > 0)
4053 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4054 "with -fpic");
4055 opts->x_flag_fentry = 0;
4056 }
4057 else if (TARGET_SEH)
4058 {
4059 if (opts->x_flag_fentry == 0)
4060 sorry ("-mno-fentry isn%'t compatible with SEH");
4061 opts->x_flag_fentry = 1;
4062 }
4063 else if (opts->x_flag_fentry < 0)
4064 {
4065 #if defined(PROFILE_BEFORE_PROLOGUE)
4066 opts->x_flag_fentry = 1;
4067 #else
4068 opts->x_flag_fentry = 0;
4069 #endif
4070 }
4071
4072 /* When not optimizing for size, enable the vzeroupper optimization for
4073 TARGET_AVX with -fexpensive-optimizations and split 32-byte
4074 AVX unaligned loads/stores. */
4075 if (!opts->x_optimize_size)
4076 {
4077 if (flag_expensive_optimizations
4078 && !(opts_set->x_target_flags & MASK_VZEROUPPER))
4079 opts->x_target_flags |= MASK_VZEROUPPER;
4080 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4081 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4082 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4083 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4084 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4085 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4086 /* Enable 128-bit AVX instruction generation
4087 for the auto-vectorizer. */
4088 if (TARGET_AVX128_OPTIMAL
4089 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
4090 opts->x_target_flags |= MASK_PREFER_AVX128;
4091 }
4092
4093 if (opts->x_ix86_recip_name)
4094 {
4095 char *p = ASTRDUP (opts->x_ix86_recip_name);
4096 char *q;
4097 unsigned int mask, i;
4098 bool invert;
4099
4100 while ((q = strtok (p, ",")) != NULL)
4101 {
4102 p = NULL;
4103 if (*q == '!')
4104 {
4105 invert = true;
4106 q++;
4107 }
4108 else
4109 invert = false;
4110
4111 if (!strcmp (q, "default"))
4112 mask = RECIP_MASK_ALL;
4113 else
4114 {
4115 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4116 if (!strcmp (q, recip_options[i].string))
4117 {
4118 mask = recip_options[i].mask;
4119 break;
4120 }
4121
4122 if (i == ARRAY_SIZE (recip_options))
4123 {
4124 error ("unknown option for -mrecip=%s", q);
4125 invert = false;
4126 mask = RECIP_MASK_NONE;
4127 }
4128 }
4129
4130 opts->x_recip_mask_explicit |= mask;
4131 if (invert)
4132 opts->x_recip_mask &= ~mask;
4133 else
4134 opts->x_recip_mask |= mask;
4135 }
4136 }
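/* Illustrative note (not from the original source): per the loop above,
   -mrecip= accepts a comma-separated list of the recip_options names, with a
   leading '!' inverting an entry; e.g. -mrecip=all,!sqrt enables every
   reciprocal approximation except the scalar square root.  */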
4137
4138 if (TARGET_RECIP_P (opts->x_target_flags))
4139 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4140 else if (opts_set->x_target_flags & MASK_RECIP)
4141 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
4142
4143 /* Default long double to 64-bit for Bionic. */
4144 if (TARGET_HAS_BIONIC
4145 && !(opts_set->x_target_flags & MASK_LONG_DOUBLE_64))
4146 opts->x_target_flags |= MASK_LONG_DOUBLE_64;
4147
4148 /* Save the initial options in case the user does function specific
4149 options. */
4150 if (main_args_p)
4151 target_option_default_node = target_option_current_node
4152 = build_target_option_node (opts);
4153
4154 /* Handle stack protector */
4155 if (!opts_set->x_ix86_stack_protector_guard)
4156 opts->x_ix86_stack_protector_guard
4157 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4158
4159 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4160 if (opts->x_ix86_tune_memcpy_strategy)
4161 {
4162 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4163 ix86_parse_stringop_strategy_string (str, false);
4164 free (str);
4165 }
4166
4167 if (opts->x_ix86_tune_memset_strategy)
4168 {
4169 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4170 ix86_parse_stringop_strategy_string (str, true);
4171 free (str);
4172 }
4173 }
4174
4175 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4176
4177 static void
4178 ix86_option_override (void)
4179 {
4180 opt_pass *pass_insert_vzeroupper = make_pass_insert_vzeroupper (g);
4181 static struct register_pass_info insert_vzeroupper_info
4182 = { pass_insert_vzeroupper, "reload",
4183 1, PASS_POS_INSERT_AFTER
4184 };
4185
4186 ix86_option_override_internal (true, &global_options, &global_options_set);
4187
4188
4189 /* This needs to be done at startup. It's convenient to do it here. */
4190 register_pass (&insert_vzeroupper_info);
4191 }
4192
4193 /* Update register usage after having seen the compiler flags. */
4194
4195 static void
4196 ix86_conditional_register_usage (void)
4197 {
4198 int i, c_mask;
4199 unsigned int j;
4200
4201 /* The PIC register, if it exists, is fixed. */
4202 j = PIC_OFFSET_TABLE_REGNUM;
4203 if (j != INVALID_REGNUM)
4204 fixed_regs[j] = call_used_regs[j] = 1;
4205
4206 /* For 32-bit targets, squash the REX registers. */
4207 if (! TARGET_64BIT)
4208 {
4209 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4210 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4211 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4212 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4213 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4214 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4215 }
4216
4217 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4218 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
4219 : TARGET_64BIT ? (1 << 2)
4220 : (1 << 1));
4221
4222 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4223
4224 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4225 {
4226 /* Set/reset conditionally defined registers from
4227 CALL_USED_REGISTERS initializer. */
4228 if (call_used_regs[i] > 1)
4229 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4230
4231 /* Calculate registers of CLOBBERED_REGS register set
4232 as call used registers from GENERAL_REGS register set. */
4233 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4234 && call_used_regs[i])
4235 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4236 }
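/* Hypothetical example (values invented for illustration): if an entry in the
   CALL_USED_REGISTERS initializer were 6 (binary 110, bits 1 and 2 set), the
   masking above would treat that register as call-used for the 32-bit ABI
   (c_mask 1 << 1) and the 64-bit SysV ABI (c_mask 1 << 2), but as call-saved
   for the 64-bit MS ABI (c_mask 1 << 3).  */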
4237
4238 /* If MMX is disabled, squash the registers. */
4239 if (! TARGET_MMX)
4240 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4241 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4242 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4243
4244 /* If SSE is disabled, squash the registers. */
4245 if (! TARGET_SSE)
4246 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4247 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4248 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4249
4250 /* If the FPU is disabled, squash the registers. */
4251 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4252 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4253 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4254 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4255
4256 /* If AVX512F is disabled, squash the registers. */
4257 if (! TARGET_AVX512F)
4258 {
4259 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4260 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4261
4262 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4263 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4264 }
4265
4266 /* If MPX is disabled, squash the registers. */
4267 if (! TARGET_MPX)
4268 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
4269 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4270 }
4271
4272 \f
4273 /* Save the current options */
4274
4275 static void
4276 ix86_function_specific_save (struct cl_target_option *ptr,
4277 struct gcc_options *opts)
4278 {
4279 ptr->arch = ix86_arch;
4280 ptr->schedule = ix86_schedule;
4281 ptr->tune = ix86_tune;
4282 ptr->branch_cost = ix86_branch_cost;
4283 ptr->tune_defaulted = ix86_tune_defaulted;
4284 ptr->arch_specified = ix86_arch_specified;
4285 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
4286 ptr->x_ix86_target_flags_explicit = opts->x_ix86_target_flags_explicit;
4287 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
4288
4289 /* The fields are char but the variables are not; make sure the
4290 values fit in the fields. */
4291 gcc_assert (ptr->arch == ix86_arch);
4292 gcc_assert (ptr->schedule == ix86_schedule);
4293 gcc_assert (ptr->tune == ix86_tune);
4294 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4295 }
4296
4297 /* Restore the current options */
4298
4299 static void
4300 ix86_function_specific_restore (struct gcc_options *opts,
4301 struct cl_target_option *ptr)
4302 {
4303 enum processor_type old_tune = ix86_tune;
4304 enum processor_type old_arch = ix86_arch;
4305 unsigned int ix86_arch_mask;
4306 int i;
4307
4308 ix86_arch = (enum processor_type) ptr->arch;
4309 ix86_schedule = (enum attr_cpu) ptr->schedule;
4310 ix86_tune = (enum processor_type) ptr->tune;
4311 opts->x_ix86_branch_cost = ptr->branch_cost;
4312 ix86_tune_defaulted = ptr->tune_defaulted;
4313 ix86_arch_specified = ptr->arch_specified;
4314 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4315 opts->x_ix86_target_flags_explicit = ptr->x_ix86_target_flags_explicit;
4316 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
4317
4318 /* Recreate the arch feature tests if the arch changed */
4319 if (old_arch != ix86_arch)
4320 {
4321 ix86_arch_mask = 1u << ix86_arch;
4322 for (i = 0; i < X86_ARCH_LAST; ++i)
4323 ix86_arch_features[i]
4324 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4325 }
4326
4327 /* Recreate the tune optimization tests */
4328 if (old_tune != ix86_tune)
4329 set_ix86_tune_features (ix86_tune, false);
4330 }
4331
4332 /* Print the current options */
4333
4334 static void
4335 ix86_function_specific_print (FILE *file, int indent,
4336 struct cl_target_option *ptr)
4337 {
4338 char *target_string
4339 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4340 NULL, NULL, ptr->x_ix86_fpmath, false);
4341
4342 fprintf (file, "%*sarch = %d (%s)\n",
4343 indent, "",
4344 ptr->arch,
4345 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4346 ? cpu_names[ptr->arch]
4347 : "<unknown>"));
4348
4349 fprintf (file, "%*stune = %d (%s)\n",
4350 indent, "",
4351 ptr->tune,
4352 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4353 ? cpu_names[ptr->tune]
4354 : "<unknown>"));
4355
4356 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4357
4358 if (target_string)
4359 {
4360 fprintf (file, "%*s%s\n", indent, "", target_string);
4361 free (target_string);
4362 }
4363 }
4364
4365 \f
4366 /* Inner function to process the attribute ((target (...))): take an argument
4367 and set the current options from it. If the argument is a list, recursively
4368 go over each entry. */
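/* Illustrative usage (not from the original source): the strings handled here
   are the ones a user writes in a target attribute, e.g. something like

     __attribute__((target("sse4.2,no-avx,arch=core2")))
     static int foo (int x) { return x + 1; }

   where foo is a hypothetical function.  A "no-" prefix is recognized below
   and clears the option instead of setting it; "arch=" and "tune=" are the
   string options from the attrs table.  */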
4369
4370 static bool
4371 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4372 struct gcc_options *opts,
4373 struct gcc_options *opts_set,
4374 struct gcc_options *enum_opts_set)
4375 {
4376 char *next_optstr;
4377 bool ret = true;
4378
4379 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4380 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4381 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4382 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4383 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
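/* Worked expansion (illustrative only): IX86_ATTR_ISA ("avx", OPT_mavx)
   expands to { "avx", 3, ix86_opt_isa, OPT_mavx, 0 }, since
   sizeof ("avx") - 1 == 3.  */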
4384
4385 enum ix86_opt_type
4386 {
4387 ix86_opt_unknown,
4388 ix86_opt_yes,
4389 ix86_opt_no,
4390 ix86_opt_str,
4391 ix86_opt_enum,
4392 ix86_opt_isa
4393 };
4394
4395 static const struct
4396 {
4397 const char *string;
4398 size_t len;
4399 enum ix86_opt_type type;
4400 int opt;
4401 int mask;
4402 } attrs[] = {
4403 /* isa options */
4404 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4405 IX86_ATTR_ISA ("abm", OPT_mabm),
4406 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4407 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4408 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4409 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4410 IX86_ATTR_ISA ("aes", OPT_maes),
4411 IX86_ATTR_ISA ("avx", OPT_mavx),
4412 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4413 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
4414 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
4415 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
4416 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
4417 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4418 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4419 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4420 IX86_ATTR_ISA ("sse", OPT_msse),
4421 IX86_ATTR_ISA ("sse2", OPT_msse2),
4422 IX86_ATTR_ISA ("sse3", OPT_msse3),
4423 IX86_ATTR_ISA ("sse4", OPT_msse4),
4424 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4425 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4426 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4427 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4428 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4429 IX86_ATTR_ISA ("fma", OPT_mfma),
4430 IX86_ATTR_ISA ("xop", OPT_mxop),
4431 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4432 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4433 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4434 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4435 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4436 IX86_ATTR_ISA ("hle", OPT_mhle),
4437 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4438 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4439 IX86_ATTR_ISA ("adx", OPT_madx),
4440 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4441 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4442 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4443
4444 /* enum options */
4445 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4446
4447 /* string options */
4448 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4449 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4450
4451 /* flag options */
4452 IX86_ATTR_YES ("cld",
4453 OPT_mcld,
4454 MASK_CLD),
4455
4456 IX86_ATTR_NO ("fancy-math-387",
4457 OPT_mfancy_math_387,
4458 MASK_NO_FANCY_MATH_387),
4459
4460 IX86_ATTR_YES ("ieee-fp",
4461 OPT_mieee_fp,
4462 MASK_IEEE_FP),
4463
4464 IX86_ATTR_YES ("inline-all-stringops",
4465 OPT_minline_all_stringops,
4466 MASK_INLINE_ALL_STRINGOPS),
4467
4468 IX86_ATTR_YES ("inline-stringops-dynamically",
4469 OPT_minline_stringops_dynamically,
4470 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4471
4472 IX86_ATTR_NO ("align-stringops",
4473 OPT_mno_align_stringops,
4474 MASK_NO_ALIGN_STRINGOPS),
4475
4476 IX86_ATTR_YES ("recip",
4477 OPT_mrecip,
4478 MASK_RECIP),
4479
4480 };
4481
4482 /* If this is a list, recurse to get the options. */
4483 if (TREE_CODE (args) == TREE_LIST)
4484 {
4485 bool ret = true;
4486
4487 for (; args; args = TREE_CHAIN (args))
4488 if (TREE_VALUE (args)
4489 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4490 p_strings, opts, opts_set,
4491 enum_opts_set))
4492 ret = false;
4493
4494 return ret;
4495 }
4496
4497 else if (TREE_CODE (args) != STRING_CST)
4498 {
4499 error ("attribute %<target%> argument not a string");
4500 return false;
4501 }
4502
4503 /* Handle multiple arguments separated by commas. */
4504 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4505
4506 while (next_optstr && *next_optstr != '\0')
4507 {
4508 char *p = next_optstr;
4509 char *orig_p = p;
4510 char *comma = strchr (next_optstr, ',');
4511 const char *opt_string;
4512 size_t len, opt_len;
4513 int opt;
4514 bool opt_set_p;
4515 char ch;
4516 unsigned i;
4517 enum ix86_opt_type type = ix86_opt_unknown;
4518 int mask = 0;
4519
4520 if (comma)
4521 {
4522 *comma = '\0';
4523 len = comma - next_optstr;
4524 next_optstr = comma + 1;
4525 }
4526 else
4527 {
4528 len = strlen (p);
4529 next_optstr = NULL;
4530 }
4531
4532 /* Recognize no-xxx. */
4533 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4534 {
4535 opt_set_p = false;
4536 p += 3;
4537 len -= 3;
4538 }
4539 else
4540 opt_set_p = true;
4541
4542 /* Find the option. */
4543 ch = *p;
4544 opt = N_OPTS;
4545 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4546 {
4547 type = attrs[i].type;
4548 opt_len = attrs[i].len;
4549 if (ch == attrs[i].string[0]
4550 && ((type != ix86_opt_str && type != ix86_opt_enum)
4551 ? len == opt_len
4552 : len > opt_len)
4553 && memcmp (p, attrs[i].string, opt_len) == 0)
4554 {
4555 opt = attrs[i].opt;
4556 mask = attrs[i].mask;
4557 opt_string = attrs[i].string;
4558 break;
4559 }
4560 }
4561
4562 /* Process the option. */
4563 if (opt == N_OPTS)
4564 {
4565 error ("attribute(target(\"%s\")) is unknown", orig_p);
4566 ret = false;
4567 }
4568
4569 else if (type == ix86_opt_isa)
4570 {
4571 struct cl_decoded_option decoded;
4572
4573 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4574 ix86_handle_option (opts, opts_set,
4575 &decoded, input_location);
4576 }
4577
4578 else if (type == ix86_opt_yes || type == ix86_opt_no)
4579 {
4580 if (type == ix86_opt_no)
4581 opt_set_p = !opt_set_p;
4582
4583 if (opt_set_p)
4584 opts->x_target_flags |= mask;
4585 else
4586 opts->x_target_flags &= ~mask;
4587 }
4588
4589 else if (type == ix86_opt_str)
4590 {
4591 if (p_strings[opt])
4592 {
4593 error ("option(\"%s\") was already specified", opt_string);
4594 ret = false;
4595 }
4596 else
4597 p_strings[opt] = xstrdup (p + opt_len);
4598 }
4599
4600 else if (type == ix86_opt_enum)
4601 {
4602 bool arg_ok;
4603 int value;
4604
4605 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4606 if (arg_ok)
4607 set_option (opts, enum_opts_set, opt, value,
4608 p + opt_len, DK_UNSPECIFIED, input_location,
4609 global_dc);
4610 else
4611 {
4612 error ("attribute(target(\"%s\")) is unknown", orig_p);
4613 ret = false;
4614 }
4615 }
4616
4617 else
4618 gcc_unreachable ();
4619 }
4620
4621 return ret;
4622 }
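/* Illustrative example (hypothetical user code): the parser accepts
   comma-separated option strings such as

       __attribute__((target ("arch=core2,sse4.2,no-avx,fpmath=sse")))
       int hypothetical_fn (int);

   where a leading "no-" clears an ISA or flag option, "arch=" and
   "tune=" are string options, and "fpmath=" is an enum option.  */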
4623
4624 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4625
4626 tree
4627 ix86_valid_target_attribute_tree (tree args,
4628 struct gcc_options *opts,
4629 struct gcc_options *opts_set)
4630 {
4631 const char *orig_arch_string = ix86_arch_string;
4632 const char *orig_tune_string = ix86_tune_string;
4633 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
4634 int orig_tune_defaulted = ix86_tune_defaulted;
4635 int orig_arch_specified = ix86_arch_specified;
4636 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4637 tree t = NULL_TREE;
4638 int i;
4639 struct cl_target_option *def
4640 = TREE_TARGET_OPTION (target_option_default_node);
4641 struct gcc_options enum_opts_set;
4642
4643 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4644
4645 /* Process each of the options on the chain. */
4646 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
4647 opts_set, &enum_opts_set))
4648 return error_mark_node;
4649
4650 /* If the changed options are different from the default, rerun
4651 ix86_option_override_internal, and then save the options away.
4652 The string options are attribute options, and will be undone
4653 when we copy the save structure. */
4654 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
4655 || opts->x_target_flags != def->x_target_flags
4656 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4657 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4658 || enum_opts_set.x_ix86_fpmath)
4659 {
4660 /* If we are using the default tune= or arch=, undo the string assigned,
4661 and use the default. */
4662 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4663 opts->x_ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4664 else if (!orig_arch_specified)
4665 opts->x_ix86_arch_string = NULL;
4666
4667 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4668 opts->x_ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4669 else if (orig_tune_defaulted)
4670 opts->x_ix86_tune_string = NULL;
4671
4672 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4673 if (enum_opts_set.x_ix86_fpmath)
4674 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
4675 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4676 && TARGET_SSE_P (opts->x_ix86_isa_flags))
4677 {
4678 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4679 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
4680 }
4681
4682 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4683 ix86_option_override_internal (false, opts, opts_set);
4684
4685 /* Add any builtin functions with the new isa if any. */
4686 ix86_add_new_builtins (opts->x_ix86_isa_flags);
4687
4688 /* Save the current options unless we are validating options for
4689 #pragma. */
4690 t = build_target_option_node (opts);
4691
4692 opts->x_ix86_arch_string = orig_arch_string;
4693 opts->x_ix86_tune_string = orig_tune_string;
4694 opts_set->x_ix86_fpmath = orig_fpmath_set;
4695
4696 /* Free up memory allocated to hold the strings */
4697 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4698 free (option_strings[i]);
4699 }
4700
4701 return t;
4702 }
4703
4704 /* Hook to validate attribute((target("string"))). */
4705
4706 static bool
4707 ix86_valid_target_attribute_p (tree fndecl,
4708 tree ARG_UNUSED (name),
4709 tree args,
4710 int ARG_UNUSED (flags))
4711 {
4712 struct gcc_options func_options;
4713 tree new_target, new_optimize;
4714 bool ret = true;
4715
4716 /* attribute((target("default"))) does nothing, beyond
4717 affecting multi-versioning. */
4718 if (TREE_VALUE (args)
4719 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4720 && TREE_CHAIN (args) == NULL_TREE
4721 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4722 return true;
4723
4724 tree old_optimize = build_optimization_node (&global_options);
4725
4726 /* Get the optimization options of the current function. */
4727 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4728
4729 if (!func_optimize)
4730 func_optimize = old_optimize;
4731
4732 /* Init func_options. */
4733 memset (&func_options, 0, sizeof (func_options));
4734 init_options_struct (&func_options, NULL);
4735 lang_hooks.init_options_struct (&func_options);
4736
4737 cl_optimization_restore (&func_options,
4738 TREE_OPTIMIZATION (func_optimize));
4739
4740 /* Initialize func_options to the default before its target options can
4741 be set. */
4742 cl_target_option_restore (&func_options,
4743 TREE_TARGET_OPTION (target_option_default_node));
4744
4745 new_target = ix86_valid_target_attribute_tree (args, &func_options,
4746 &global_options_set);
4747
4748 new_optimize = build_optimization_node (&func_options);
4749
4750 if (new_target == error_mark_node)
4751 ret = false;
4752
4753 else if (fndecl && new_target)
4754 {
4755 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4756
4757 if (old_optimize != new_optimize)
4758 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4759 }
4760
4761 return ret;
4762 }
4763
4764 \f
4765 /* Hook to determine if one function can safely inline another. */
4766
4767 static bool
4768 ix86_can_inline_p (tree caller, tree callee)
4769 {
4770 bool ret = false;
4771 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4772 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4773
4774 /* If callee has no option attributes, then it is ok to inline. */
4775 if (!callee_tree)
4776 ret = true;
4777
4778 /* If caller has no option attributes, but callee does then it is not ok to
4779 inline. */
4780 else if (!caller_tree)
4781 ret = false;
4782
4783 else
4784 {
4785 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4786 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4787
4788 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
4789 function can inline an SSE2 function but an SSE2 function can't inline
4790 an SSE4 function. */
4791 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4792 != callee_opts->x_ix86_isa_flags)
4793 ret = false;
4794
4795 /* See if we have the same non-isa options. */
4796 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4797 ret = false;
4798
4799 /* See if arch, tune, etc. are the same. */
4800 else if (caller_opts->arch != callee_opts->arch)
4801 ret = false;
4802
4803 else if (caller_opts->tune != callee_opts->tune)
4804 ret = false;
4805
4806 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4807 ret = false;
4808
4809 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4810 ret = false;
4811
4812 else
4813 ret = true;
4814 }
4815
4816 return ret;
4817 }
4818
4819 \f
4820 /* Remember the last target of ix86_set_current_function. */
4821 static GTY(()) tree ix86_previous_fndecl;
4822
4823 /* Invalidate ix86_previous_fndecl cache. */
4824 void
4825 ix86_reset_previous_fndecl (void)
4826 {
4827 ix86_previous_fndecl = NULL_TREE;
4828 }
4829
4830 /* Establish appropriate back-end context for processing the function
4831 FNDECL. The argument might be NULL to indicate processing at top
4832 level, outside of any function scope. */
4833 static void
4834 ix86_set_current_function (tree fndecl)
4835 {
4836 /* Only change the context if the function changes. This hook is called
4837 several times in the course of compiling a function, and we don't want to
4838 slow things down too much or call target_reinit when it isn't safe. */
4839 if (fndecl && fndecl != ix86_previous_fndecl)
4840 {
4841 tree old_tree = (ix86_previous_fndecl
4842 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4843 : NULL_TREE);
4844
4845 tree new_tree = (fndecl
4846 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4847 : NULL_TREE);
4848
4849 ix86_previous_fndecl = fndecl;
4850 if (old_tree == new_tree)
4851 ;
4852
4853 else if (new_tree)
4854 {
4855 cl_target_option_restore (&global_options,
4856 TREE_TARGET_OPTION (new_tree));
4857 target_reinit ();
4858 }
4859
4860 else if (old_tree)
4861 {
4862 struct cl_target_option *def
4863 = TREE_TARGET_OPTION (target_option_current_node);
4864
4865 cl_target_option_restore (&global_options, def);
4866 target_reinit ();
4867 }
4868 }
4869 }
4870
4871 \f
4872 /* Return true if this goes in large data/bss. */
4873
4874 static bool
4875 ix86_in_large_data_p (tree exp)
4876 {
4877 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4878 return false;
4879
4880 /* Functions are never large data. */
4881 if (TREE_CODE (exp) == FUNCTION_DECL)
4882 return false;
4883
4884 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4885 {
4886 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4887 if (strcmp (section, ".ldata") == 0
4888 || strcmp (section, ".lbss") == 0)
4889 return true;
4890 return false;
4891 }
4892 else
4893 {
4894 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4895
4896 /* If this is an incomplete type with size 0, then we can't put it
4897 in data because it might be too big when completed. */
4898 if (!size || size > ix86_section_threshold)
4899 return true;
4900 }
4901
4902 return false;
4903 }
4904
4905 /* Switch to the appropriate section for output of DECL.
4906 DECL is either a `VAR_DECL' node or a constant of some sort.
4907 RELOC indicates whether forming the initial value of DECL requires
4908 link-time relocations. */
4909
4910 ATTRIBUTE_UNUSED static section *
4911 x86_64_elf_select_section (tree decl, int reloc,
4912 unsigned HOST_WIDE_INT align)
4913 {
4914 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4915 && ix86_in_large_data_p (decl))
4916 {
4917 const char *sname = NULL;
4918 unsigned int flags = SECTION_WRITE;
4919 switch (categorize_decl_for_section (decl, reloc))
4920 {
4921 case SECCAT_DATA:
4922 sname = ".ldata";
4923 break;
4924 case SECCAT_DATA_REL:
4925 sname = ".ldata.rel";
4926 break;
4927 case SECCAT_DATA_REL_LOCAL:
4928 sname = ".ldata.rel.local";
4929 break;
4930 case SECCAT_DATA_REL_RO:
4931 sname = ".ldata.rel.ro";
4932 break;
4933 case SECCAT_DATA_REL_RO_LOCAL:
4934 sname = ".ldata.rel.ro.local";
4935 break;
4936 case SECCAT_BSS:
4937 sname = ".lbss";
4938 flags |= SECTION_BSS;
4939 break;
4940 case SECCAT_RODATA:
4941 case SECCAT_RODATA_MERGE_STR:
4942 case SECCAT_RODATA_MERGE_STR_INIT:
4943 case SECCAT_RODATA_MERGE_CONST:
4944 sname = ".lrodata";
4945 flags = 0;
4946 break;
4947 case SECCAT_SRODATA:
4948 case SECCAT_SDATA:
4949 case SECCAT_SBSS:
4950 gcc_unreachable ();
4951 case SECCAT_TEXT:
4952 case SECCAT_TDATA:
4953 case SECCAT_TBSS:
4954 /* We don't split these for the medium model. Place them into
4955 default sections and hope for the best. */
4956 break;
4957 }
4958 if (sname)
4959 {
4960 /* We might get called with string constants, but get_named_section
4961 doesn't like them as they are not DECLs. Also, we need to set
4962 flags in that case. */
4963 if (!DECL_P (decl))
4964 return get_section (sname, flags, NULL);
4965 return get_named_section (decl, sname, reloc);
4966 }
4967 }
4968 return default_elf_select_section (decl, reloc, align);
4969 }
4970
4971 /* Select a set of attributes for section NAME based on the properties
4972 of DECL and whether or not RELOC indicates that DECL's initializer
4973 might contain runtime relocations. */
4974
4975 static unsigned int ATTRIBUTE_UNUSED
4976 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
4977 {
4978 unsigned int flags = default_section_type_flags (decl, name, reloc);
4979
4980 if (decl == NULL_TREE
4981 && (strcmp (name, ".ldata.rel.ro") == 0
4982 || strcmp (name, ".ldata.rel.ro.local") == 0))
4983 flags |= SECTION_RELRO;
4984
4985 if (strcmp (name, ".lbss") == 0
4986 || strncmp (name, ".lbss.", 5) == 0
4987 || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
4988 flags |= SECTION_BSS;
4989
4990 return flags;
4991 }
4992
4993 /* Build up a unique section name, expressed as a
4994 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4995 RELOC indicates whether the initial value of EXP requires
4996 link-time relocations. */
4997
4998 static void ATTRIBUTE_UNUSED
4999 x86_64_elf_unique_section (tree decl, int reloc)
5000 {
5001 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
5002 && ix86_in_large_data_p (decl))
5003 {
5004 const char *prefix = NULL;
5005 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
5006 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
5007
5008 switch (categorize_decl_for_section (decl, reloc))
5009 {
5010 case SECCAT_DATA:
5011 case SECCAT_DATA_REL:
5012 case SECCAT_DATA_REL_LOCAL:
5013 case SECCAT_DATA_REL_RO:
5014 case SECCAT_DATA_REL_RO_LOCAL:
5015 prefix = one_only ? ".ld" : ".ldata";
5016 break;
5017 case SECCAT_BSS:
5018 prefix = one_only ? ".lb" : ".lbss";
5019 break;
5020 case SECCAT_RODATA:
5021 case SECCAT_RODATA_MERGE_STR:
5022 case SECCAT_RODATA_MERGE_STR_INIT:
5023 case SECCAT_RODATA_MERGE_CONST:
5024 prefix = one_only ? ".lr" : ".lrodata";
5025 break;
5026 case SECCAT_SRODATA:
5027 case SECCAT_SDATA:
5028 case SECCAT_SBSS:
5029 gcc_unreachable ();
5030 case SECCAT_TEXT:
5031 case SECCAT_TDATA:
5032 case SECCAT_TBSS:
5033 /* We don't split these for the medium model. Place them into
5034 default sections and hope for the best. */
5035 break;
5036 }
5037 if (prefix)
5038 {
5039 const char *name, *linkonce;
5040 char *string;
5041
5042 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
5043 name = targetm.strip_name_encoding (name);
5044
5045 /* If we're using one_only, then there needs to be a .gnu.linkonce
5046 prefix to the section name. */
5047 linkonce = one_only ? ".gnu.linkonce" : "";
5048
5049 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
5050
5051 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
5052 return;
5053 }
5054 }
5055 default_unique_section (decl, reloc);
5056 }
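/* Illustrative example: assuming -mcmodel=medium and -fdata-sections, a
   hypothetical object larger than ix86_section_threshold, e.g.

       static int hyp_big_table[1 << 20];

   is given a unique section name such as ".lbss.hyp_big_table" (or
   ".gnu.linkonce.lb.hyp_big_table" when one_only applies) instead of a
   plain ".bss"-based name.  */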
5057
5058 #ifdef COMMON_ASM_OP
5059 /* This says how to output assembler code to declare an
5060 uninitialized external linkage data object.
5061
5062 For the medium model on x86-64 we need to use the .largecomm directive
5063 for large objects. */
5064 void
5065 x86_elf_aligned_common (FILE *file,
5066 const char *name, unsigned HOST_WIDE_INT size,
5067 int align)
5068 {
5069 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
5070 && size > (unsigned int)ix86_section_threshold)
5071 fputs (".largecomm\t", file);
5072 else
5073 fputs (COMMON_ASM_OP, file);
5074 assemble_name (file, name);
5075 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
5076 size, align / BITS_PER_UNIT);
5077 }
5078 #endif
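/* Illustrative example: assuming -mcmodel=medium, a hypothetical common
   object bigger than ix86_section_threshold, e.g.

       int hyp_shared_buf[200000];

   is announced with ".largecomm hyp_shared_buf,800000,<align>" rather
   than with the usual COMMON_ASM_OP (".comm") directive.  */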
5079
5080 /* Utility function for targets to use in implementing
5081 ASM_OUTPUT_ALIGNED_BSS. */
5082
5083 void
5084 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
5085 const char *name, unsigned HOST_WIDE_INT size,
5086 int align)
5087 {
5088 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
5089 && size > (unsigned int)ix86_section_threshold)
5090 switch_to_section (get_named_section (decl, ".lbss", 0));
5091 else
5092 switch_to_section (bss_section);
5093 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
5094 #ifdef ASM_DECLARE_OBJECT_NAME
5095 last_assemble_variable_decl = decl;
5096 ASM_DECLARE_OBJECT_NAME (file, name, decl);
5097 #else
5098 /* Standard thing is just output label for the object. */
5099 ASM_OUTPUT_LABEL (file, name);
5100 #endif /* ASM_DECLARE_OBJECT_NAME */
5101 ASM_OUTPUT_SKIP (file, size ? size : 1);
5102 }
5103 \f
5104 /* Decide whether we must probe the stack before any space allocation
5105 on this target. It's essentially TARGET_STACK_PROBE except when
5106 -fstack-check causes the stack to be already probed differently. */
5107
5108 bool
5109 ix86_target_stack_probe (void)
5110 {
5111 /* Do not probe the stack twice if static stack checking is enabled. */
5112 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5113 return false;
5114
5115 return TARGET_STACK_PROBE;
5116 }
5117 \f
5118 /* Decide whether we can make a sibling call to a function. DECL is the
5119 declaration of the function being targeted by the call and EXP is the
5120 CALL_EXPR representing the call. */
5121
5122 static bool
5123 ix86_function_ok_for_sibcall (tree decl, tree exp)
5124 {
5125 tree type, decl_or_type;
5126 rtx a, b;
5127
5128 /* If we are generating position-independent code, we cannot sibcall
5129 optimize any indirect call, or a direct call to a global function,
5130 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
5131 if (!TARGET_MACHO
5132 && !TARGET_64BIT
5133 && flag_pic
5134 && (!decl || !targetm.binds_local_p (decl)))
5135 return false;
5136
5137 /* If we need to align the outgoing stack, then sibcalling would
5138 unalign the stack, which may break the called function. */
5139 if (ix86_minimum_incoming_stack_boundary (true)
5140 < PREFERRED_STACK_BOUNDARY)
5141 return false;
5142
5143 if (decl)
5144 {
5145 decl_or_type = decl;
5146 type = TREE_TYPE (decl);
5147 }
5148 else
5149 {
5150 /* We're looking at the CALL_EXPR, we need the type of the function. */
5151 type = CALL_EXPR_FN (exp); /* pointer expression */
5152 type = TREE_TYPE (type); /* pointer type */
5153 type = TREE_TYPE (type); /* function type */
5154 decl_or_type = type;
5155 }
5156
5157 /* Check that the return value locations are the same. Like
5158 if we are returning floats on the 80387 register stack, we cannot
5159 make a sibcall from a function that doesn't return a float to a
5160 function that does or, conversely, from a function that does return
5161 a float to a function that doesn't; the necessary stack adjustment
5162 would not be executed. This is also the place we notice
5163 differences in the return value ABI. Note that it is ok for one
5164 of the functions to have void return type as long as the return
5165 value of the other is passed in a register. */
5166 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5167 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5168 cfun->decl, false);
5169 if (STACK_REG_P (a) || STACK_REG_P (b))
5170 {
5171 if (!rtx_equal_p (a, b))
5172 return false;
5173 }
5174 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5175 ;
5176 else if (!rtx_equal_p (a, b))
5177 return false;
5178
5179 if (TARGET_64BIT)
5180 {
5181 /* The SYSV ABI has more call-clobbered registers;
5182 disallow sibcalls from MS to SYSV. */
5183 if (cfun->machine->call_abi == MS_ABI
5184 && ix86_function_type_abi (type) == SYSV_ABI)
5185 return false;
5186 }
5187 else
5188 {
5189 /* If this call is indirect, we'll need to be able to use a
5190 call-clobbered register for the address of the target function.
5191 Make sure that all such registers are not used for passing
5192 parameters. Note that DLLIMPORT functions are indirect. */
5193 if (!decl
5194 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5195 {
5196 if (ix86_function_regparm (type, NULL) >= 3)
5197 {
5198 /* ??? Need to count the actual number of registers to be used,
5199 not the possible number of registers. Fix later. */
5200 return false;
5201 }
5202 }
5203 }
5204
5205 /* Otherwise okay. That also includes certain types of indirect calls. */
5206 return true;
5207 }
5208
5209 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5210 and "sseregparm" calling convention attributes;
5211 arguments as in struct attribute_spec.handler. */
5212
5213 static tree
5214 ix86_handle_cconv_attribute (tree *node, tree name,
5215 tree args,
5216 int flags ATTRIBUTE_UNUSED,
5217 bool *no_add_attrs)
5218 {
5219 if (TREE_CODE (*node) != FUNCTION_TYPE
5220 && TREE_CODE (*node) != METHOD_TYPE
5221 && TREE_CODE (*node) != FIELD_DECL
5222 && TREE_CODE (*node) != TYPE_DECL)
5223 {
5224 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5225 name);
5226 *no_add_attrs = true;
5227 return NULL_TREE;
5228 }
5229
5230 /* Can combine regparm with all attributes but fastcall, and thiscall. */
5231 if (is_attribute_p ("regparm", name))
5232 {
5233 tree cst;
5234
5235 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5236 {
5237 error ("fastcall and regparm attributes are not compatible");
5238 }
5239
5240 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5241 {
5242 error ("regparam and thiscall attributes are not compatible");
5243 }
5244
5245 cst = TREE_VALUE (args);
5246 if (TREE_CODE (cst) != INTEGER_CST)
5247 {
5248 warning (OPT_Wattributes,
5249 "%qE attribute requires an integer constant argument",
5250 name);
5251 *no_add_attrs = true;
5252 }
5253 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5254 {
5255 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5256 name, REGPARM_MAX);
5257 *no_add_attrs = true;
5258 }
5259
5260 return NULL_TREE;
5261 }
5262
5263 if (TARGET_64BIT)
5264 {
5265 /* Do not warn when emulating the MS ABI. */
5266 if ((TREE_CODE (*node) != FUNCTION_TYPE
5267 && TREE_CODE (*node) != METHOD_TYPE)
5268 || ix86_function_type_abi (*node) != MS_ABI)
5269 warning (OPT_Wattributes, "%qE attribute ignored",
5270 name);
5271 *no_add_attrs = true;
5272 return NULL_TREE;
5273 }
5274
5275 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5276 if (is_attribute_p ("fastcall", name))
5277 {
5278 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5279 {
5280 error ("fastcall and cdecl attributes are not compatible");
5281 }
5282 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5283 {
5284 error ("fastcall and stdcall attributes are not compatible");
5285 }
5286 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5287 {
5288 error ("fastcall and regparm attributes are not compatible");
5289 }
5290 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5291 {
5292 error ("fastcall and thiscall attributes are not compatible");
5293 }
5294 }
5295
5296 /* Can combine stdcall with fastcall (redundant), regparm and
5297 sseregparm. */
5298 else if (is_attribute_p ("stdcall", name))
5299 {
5300 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5301 {
5302 error ("stdcall and cdecl attributes are not compatible");
5303 }
5304 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5305 {
5306 error ("stdcall and fastcall attributes are not compatible");
5307 }
5308 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5309 {
5310 error ("stdcall and thiscall attributes are not compatible");
5311 }
5312 }
5313
5314 /* Can combine cdecl with regparm and sseregparm. */
5315 else if (is_attribute_p ("cdecl", name))
5316 {
5317 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5318 {
5319 error ("stdcall and cdecl attributes are not compatible");
5320 }
5321 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5322 {
5323 error ("fastcall and cdecl attributes are not compatible");
5324 }
5325 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5326 {
5327 error ("cdecl and thiscall attributes are not compatible");
5328 }
5329 }
5330 else if (is_attribute_p ("thiscall", name))
5331 {
5332 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5333 warning (OPT_Wattributes, "%qE attribute is used for a non-class method",
5334 name);
5335 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5336 {
5337 error ("stdcall and thiscall attributes are not compatible");
5338 }
5339 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5340 {
5341 error ("fastcall and thiscall attributes are not compatible");
5342 }
5343 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5344 {
5345 error ("cdecl and thiscall attributes are not compatible");
5346 }
5347 }
5348
5349 /* Can combine sseregparm with all attributes. */
5350
5351 return NULL_TREE;
5352 }
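/* Illustrative example (hypothetical declarations): the handler above
   accepts compatible combinations such as

       extern void __attribute__((stdcall, regparm (2))) hyp_f (int, int);

   but diagnoses incompatible pairs, e.g.

       extern void __attribute__((fastcall, cdecl)) hyp_g (int);

   which reports "fastcall and cdecl attributes are not compatible".  */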
5353
5354 /* The transactional memory builtins are implicitly regparm or fastcall
5355 depending on the ABI. Override the generic do-nothing attribute that
5356 these builtins were declared with, and replace it with one of the two
5357 attributes that we expect elsewhere. */
5358
5359 static tree
5360 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5361 tree args ATTRIBUTE_UNUSED,
5362 int flags, bool *no_add_attrs)
5363 {
5364 tree alt;
5365
5366 /* In no case do we want to add the placeholder attribute. */
5367 *no_add_attrs = true;
5368
5369 /* The 64-bit ABI is unchanged for transactional memory. */
5370 if (TARGET_64BIT)
5371 return NULL_TREE;
5372
5373 /* ??? Is there a better way to validate 32-bit Windows? We have
5374 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5375 if (CHECK_STACK_LIMIT > 0)
5376 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5377 else
5378 {
5379 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5380 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5381 }
5382 decl_attributes (node, alt, flags);
5383
5384 return NULL_TREE;
5385 }
5386
5387 /* This function determines from TYPE the calling-convention. */
5388
5389 unsigned int
5390 ix86_get_callcvt (const_tree type)
5391 {
5392 unsigned int ret = 0;
5393 bool is_stdarg;
5394 tree attrs;
5395
5396 if (TARGET_64BIT)
5397 return IX86_CALLCVT_CDECL;
5398
5399 attrs = TYPE_ATTRIBUTES (type);
5400 if (attrs != NULL_TREE)
5401 {
5402 if (lookup_attribute ("cdecl", attrs))
5403 ret |= IX86_CALLCVT_CDECL;
5404 else if (lookup_attribute ("stdcall", attrs))
5405 ret |= IX86_CALLCVT_STDCALL;
5406 else if (lookup_attribute ("fastcall", attrs))
5407 ret |= IX86_CALLCVT_FASTCALL;
5408 else if (lookup_attribute ("thiscall", attrs))
5409 ret |= IX86_CALLCVT_THISCALL;
5410
5411 /* Regparm isn't allowed for thiscall and fastcall. */
5412 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5413 {
5414 if (lookup_attribute ("regparm", attrs))
5415 ret |= IX86_CALLCVT_REGPARM;
5416 if (lookup_attribute ("sseregparm", attrs))
5417 ret |= IX86_CALLCVT_SSEREGPARM;
5418 }
5419
5420 if (IX86_BASE_CALLCVT(ret) != 0)
5421 return ret;
5422 }
5423
5424 is_stdarg = stdarg_p (type);
5425 if (TARGET_RTD && !is_stdarg)
5426 return IX86_CALLCVT_STDCALL | ret;
5427
5428 if (ret != 0
5429 || is_stdarg
5430 || TREE_CODE (type) != METHOD_TYPE
5431 || ix86_function_type_abi (type) != MS_ABI)
5432 return IX86_CALLCVT_CDECL | ret;
5433
5434 return IX86_CALLCVT_THISCALL;
5435 }
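/* Illustrative example: for a hypothetical 32-bit declaration

       extern void __attribute__((fastcall)) hyp_cb (int);

   ix86_get_callcvt on its function type returns IX86_CALLCVT_FASTCALL,
   while an attribute-free prototype (without -mrtd) yields
   IX86_CALLCVT_CDECL.  */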
5436
5437 /* Return 0 if the attributes for two types are incompatible, 1 if they
5438 are compatible, and 2 if they are nearly compatible (which causes a
5439 warning to be generated). */
5440
5441 static int
5442 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5443 {
5444 unsigned int ccvt1, ccvt2;
5445
5446 if (TREE_CODE (type1) != FUNCTION_TYPE
5447 && TREE_CODE (type1) != METHOD_TYPE)
5448 return 1;
5449
5450 ccvt1 = ix86_get_callcvt (type1);
5451 ccvt2 = ix86_get_callcvt (type2);
5452 if (ccvt1 != ccvt2)
5453 return 0;
5454 if (ix86_function_regparm (type1, NULL)
5455 != ix86_function_regparm (type2, NULL))
5456 return 0;
5457
5458 return 1;
5459 }
5460 \f
5461 /* Return the regparm value for a function with the indicated TYPE and DECL.
5462 DECL may be NULL when calling function indirectly
5463 or considering a libcall. */
5464
5465 static int
5466 ix86_function_regparm (const_tree type, const_tree decl)
5467 {
5468 tree attr;
5469 int regparm;
5470 unsigned int ccvt;
5471
5472 if (TARGET_64BIT)
5473 return (ix86_function_type_abi (type) == SYSV_ABI
5474 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5475 ccvt = ix86_get_callcvt (type);
5476 regparm = ix86_regparm;
5477
5478 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5479 {
5480 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5481 if (attr)
5482 {
5483 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5484 return regparm;
5485 }
5486 }
5487 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5488 return 2;
5489 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5490 return 1;
5491
5492 /* Use register calling convention for local functions when possible. */
5493 if (decl
5494 && TREE_CODE (decl) == FUNCTION_DECL
5495 && optimize
5496 && !(profile_flag && !flag_fentry))
5497 {
5498 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5499 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5500 if (i && i->local && i->can_change_signature)
5501 {
5502 int local_regparm, globals = 0, regno;
5503
5504 /* Make sure no regparm register is taken by a
5505 fixed register variable. */
5506 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5507 if (fixed_regs[local_regparm])
5508 break;
5509
5510 /* We don't want to use regparm(3) for nested functions as
5511 these use a static chain pointer in the third argument. */
5512 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5513 local_regparm = 2;
5514
5515 /* In 32-bit mode save a register for the split stack. */
5516 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5517 local_regparm = 2;
5518
5519 /* Each fixed register usage increases register pressure,
5520 so fewer registers should be used for argument passing.
5521 This functionality can be overridden by an explicit
5522 regparm value. */
5523 for (regno = AX_REG; regno <= DI_REG; regno++)
5524 if (fixed_regs[regno])
5525 globals++;
5526
5527 local_regparm
5528 = globals < local_regparm ? local_regparm - globals : 0;
5529
5530 if (local_regparm > regparm)
5531 regparm = local_regparm;
5532 }
5533 }
5534
5535 return regparm;
5536 }
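/* Illustrative example: for a hypothetical 32-bit declaration

       extern int __attribute__((regparm (3))) hyp_add (int, int, int);

   ix86_function_regparm returns 3, so the first three integer arguments
   are passed in registers (EAX, EDX, ECX); a fastcall function yields 2
   and a thiscall function yields 1.  */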
5537
5538 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5539 DFmode (2) arguments in SSE registers for a function with the
5540 indicated TYPE and DECL. DECL may be NULL when calling function
5541 indirectly or considering a libcall. Otherwise return 0. */
5542
5543 static int
5544 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5545 {
5546 gcc_assert (!TARGET_64BIT);
5547
5548 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5549 by the sseregparm attribute. */
5550 if (TARGET_SSEREGPARM
5551 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5552 {
5553 if (!TARGET_SSE)
5554 {
5555 if (warn)
5556 {
5557 if (decl)
5558 error ("calling %qD with attribute sseregparm without "
5559 "SSE/SSE2 enabled", decl);
5560 else
5561 error ("calling %qT with attribute sseregparm without "
5562 "SSE/SSE2 enabled", type);
5563 }
5564 return 0;
5565 }
5566
5567 return 2;
5568 }
5569
5570 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5571 (and DFmode for SSE2) arguments in SSE registers. */
5572 if (decl && TARGET_SSE_MATH && optimize
5573 && !(profile_flag && !flag_fentry))
5574 {
5575 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5576 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5577 if (i && i->local && i->can_change_signature)
5578 return TARGET_SSE2 ? 2 : 1;
5579 }
5580
5581 return 0;
5582 }
5583
5584 /* Return true if EAX is live at the start of the function. Used by
5585 ix86_expand_prologue to determine if we need special help before
5586 calling allocate_stack_worker. */
5587
5588 static bool
5589 ix86_eax_live_at_start_p (void)
5590 {
5591 /* Cheat. Don't bother working forward from ix86_function_regparm
5592 to the function type to whether an actual argument is located in
5593 eax. Instead just look at cfg info, which is still close enough
5594 to correct at this point. This gives false positives for broken
5595 functions that might use uninitialized data that happens to be
5596 allocated in eax, but who cares? */
5597 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
5598 }
5599
5600 static bool
5601 ix86_keep_aggregate_return_pointer (tree fntype)
5602 {
5603 tree attr;
5604
5605 if (!TARGET_64BIT)
5606 {
5607 attr = lookup_attribute ("callee_pop_aggregate_return",
5608 TYPE_ATTRIBUTES (fntype));
5609 if (attr)
5610 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5611
5612 /* For 32-bit MS-ABI the default is to keep aggregate
5613 return pointer. */
5614 if (ix86_function_type_abi (fntype) == MS_ABI)
5615 return true;
5616 }
5617 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5618 }
5619
5620 /* Value is the number of bytes of arguments automatically
5621 popped when returning from a subroutine call.
5622 FUNDECL is the declaration node of the function (as a tree),
5623 FUNTYPE is the data type of the function (as a tree),
5624 or for a library call it is an identifier node for the subroutine name.
5625 SIZE is the number of bytes of arguments passed on the stack.
5626
5627 On the 80386, the RTD insn may be used to pop them if the number
5628 of args is fixed, but if the number is variable then the caller
5629 must pop them all. RTD can't be used for library calls now
5630 because the library is compiled with the Unix compiler.
5631 Use of RTD is a selectable option, since it is incompatible with
5632 standard Unix calling sequences. If the option is not selected,
5633 the caller must always pop the args.
5634
5635 The attribute stdcall is equivalent to RTD on a per module basis. */
5636
5637 static int
5638 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5639 {
5640 unsigned int ccvt;
5641
5642 /* None of the 64-bit ABIs pop arguments. */
5643 if (TARGET_64BIT)
5644 return 0;
5645
5646 ccvt = ix86_get_callcvt (funtype);
5647
5648 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5649 | IX86_CALLCVT_THISCALL)) != 0
5650 && ! stdarg_p (funtype))
5651 return size;
5652
5653 /* Lose any fake structure return argument if it is passed on the stack. */
5654 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5655 && !ix86_keep_aggregate_return_pointer (funtype))
5656 {
5657 int nregs = ix86_function_regparm (funtype, fundecl);
5658 if (nregs == 0)
5659 return GET_MODE_SIZE (Pmode);
5660 }
5661
5662 return 0;
5663 }
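/* Illustrative example: for a hypothetical 32-bit declaration

       extern void __attribute__((stdcall)) hyp_f (int, int);

   the hook above returns SIZE (8 bytes here), so the callee pops its own
   arguments (e.g. via "ret $8"); a cdecl or stdarg function returns 0
   and the caller pops instead.  */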
5664
5665 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5666
5667 static bool
5668 ix86_legitimate_combined_insn (rtx insn)
5669 {
5670 /* Check operand constraints in case hard registers were propagated
5671 into insn pattern. This check prevents combine pass from
5672 generating insn patterns with invalid hard register operands.
5673 These invalid insns can eventually confuse reload to error out
5674 with a spill failure. See also PRs 46829 and 46843. */
5675 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5676 {
5677 int i;
5678
5679 extract_insn (insn);
5680 preprocess_constraints ();
5681
5682 for (i = 0; i < recog_data.n_operands; i++)
5683 {
5684 rtx op = recog_data.operand[i];
5685 enum machine_mode mode = GET_MODE (op);
5686 struct operand_alternative *op_alt;
5687 int offset = 0;
5688 bool win;
5689 int j;
5690
5691 /* A unary operator may be accepted by the predicate, but it
5692 is irrelevant for matching constraints. */
5693 if (UNARY_P (op))
5694 op = XEXP (op, 0);
5695
5696 if (GET_CODE (op) == SUBREG)
5697 {
5698 if (REG_P (SUBREG_REG (op))
5699 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5700 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5701 GET_MODE (SUBREG_REG (op)),
5702 SUBREG_BYTE (op),
5703 GET_MODE (op));
5704 op = SUBREG_REG (op);
5705 }
5706
5707 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5708 continue;
5709
5710 op_alt = recog_op_alt[i];
5711
5712 /* Operand has no constraints, anything is OK. */
5713 win = !recog_data.n_alternatives;
5714
5715 for (j = 0; j < recog_data.n_alternatives; j++)
5716 {
5717 if (op_alt[j].anything_ok
5718 || (op_alt[j].matches != -1
5719 && operands_match_p
5720 (recog_data.operand[i],
5721 recog_data.operand[op_alt[j].matches]))
5722 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5723 {
5724 win = true;
5725 break;
5726 }
5727 }
5728
5729 if (!win)
5730 return false;
5731 }
5732 }
5733
5734 return true;
5735 }
5736 \f
5737 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5738
5739 static unsigned HOST_WIDE_INT
5740 ix86_asan_shadow_offset (void)
5741 {
5742 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
5743 : HOST_WIDE_INT_C (0x7fff8000))
5744 : (HOST_WIDE_INT_1 << 29);
5745 }
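/* Illustrative note (standard AddressSanitizer convention, not spelled
   out above): shadow address = (application address >> 3) + the offset
   returned by this hook; 0x7fff8000 is the usual offset for x86-64
   Linux, with distinct values for Darwin/LP64 and 32-bit targets.  */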
5746 \f
5747 /* Argument support functions. */
5748
5749 /* Return true when register may be used to pass function parameters. */
5750 bool
5751 ix86_function_arg_regno_p (int regno)
5752 {
5753 int i;
5754 const int *parm_regs;
5755
5756 if (!TARGET_64BIT)
5757 {
5758 if (TARGET_MACHO)
5759 return (regno < REGPARM_MAX
5760 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5761 else
5762 return (regno < REGPARM_MAX
5763 || (TARGET_MMX && MMX_REGNO_P (regno)
5764 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5765 || (TARGET_SSE && SSE_REGNO_P (regno)
5766 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5767 }
5768
5769 if (TARGET_SSE && SSE_REGNO_P (regno)
5770 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5771 return true;
5772
5773 /* TODO: The function should depend on current function ABI but
5774 builtins.c would need updating then. Therefore we use the
5775 default ABI. */
5776
5777 /* RAX is used as hidden argument to va_arg functions. */
5778 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5779 return true;
5780
5781 if (ix86_abi == MS_ABI)
5782 parm_regs = x86_64_ms_abi_int_parameter_registers;
5783 else
5784 parm_regs = x86_64_int_parameter_registers;
5785 for (i = 0; i < (ix86_abi == MS_ABI
5786 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5787 if (regno == parm_regs[i])
5788 return true;
5789 return false;
5790 }
5791
5792 /* Return if we do not know how to pass TYPE solely in registers. */
5793
5794 static bool
5795 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5796 {
5797 if (must_pass_in_stack_var_size_or_pad (mode, type))
5798 return true;
5799
5800 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5801 The layout_type routine is crafty and tries to trick us into passing
5802 currently unsupported vector types on the stack by using TImode. */
5803 return (!TARGET_64BIT && mode == TImode
5804 && type && TREE_CODE (type) != VECTOR_TYPE);
5805 }
5806
5807 /* Return the size, in bytes, of the area reserved for arguments passed
5808 in registers for the function represented by FNDECL, depending on the
5809 ABI used. */
5810 int
5811 ix86_reg_parm_stack_space (const_tree fndecl)
5812 {
5813 enum calling_abi call_abi = SYSV_ABI;
5814 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5815 call_abi = ix86_function_abi (fndecl);
5816 else
5817 call_abi = ix86_function_type_abi (fndecl);
5818 if (TARGET_64BIT && call_abi == MS_ABI)
5819 return 32;
5820 return 0;
5821 }
5822
5823 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5824 call ABI used. */
5825 enum calling_abi
5826 ix86_function_type_abi (const_tree fntype)
5827 {
5828 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5829 {
5830 enum calling_abi abi = ix86_abi;
5831 if (abi == SYSV_ABI)
5832 {
5833 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5834 abi = MS_ABI;
5835 }
5836 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5837 abi = SYSV_ABI;
5838 return abi;
5839 }
5840 return ix86_abi;
5841 }
5842
5843 /* We add this as a workaround in order to use libc_has_function
5844 hook in i386.md. */
5845 bool
5846 ix86_libc_has_function (enum function_class fn_class)
5847 {
5848 return targetm.libc_has_function (fn_class);
5849 }
5850
5851 static bool
5852 ix86_function_ms_hook_prologue (const_tree fn)
5853 {
5854 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5855 {
5856 if (decl_function_context (fn) != NULL_TREE)
5857 error_at (DECL_SOURCE_LOCATION (fn),
5858 "ms_hook_prologue is not compatible with nested function");
5859 else
5860 return true;
5861 }
5862 return false;
5863 }
5864
5865 static enum calling_abi
5866 ix86_function_abi (const_tree fndecl)
5867 {
5868 if (! fndecl)
5869 return ix86_abi;
5870 return ix86_function_type_abi (TREE_TYPE (fndecl));
5871 }
5872
5873 /* Return SYSV_ABI or MS_ABI, depending on CFUN, specifying the
5874 call ABI used. */
5875 enum calling_abi
5876 ix86_cfun_abi (void)
5877 {
5878 if (! cfun)
5879 return ix86_abi;
5880 return cfun->machine->call_abi;
5881 }
5882
5883 /* Write the extra assembler code needed to declare a function properly. */
5884
5885 void
5886 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5887 tree decl)
5888 {
5889 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5890
5891 if (is_ms_hook)
5892 {
5893 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5894 unsigned int filler_cc = 0xcccccccc;
5895
5896 for (i = 0; i < filler_count; i += 4)
5897 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5898 }
5899
5900 #ifdef SUBTARGET_ASM_UNWIND_INIT
5901 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5902 #endif
5903
5904 ASM_OUTPUT_LABEL (asm_out_file, fname);
5905
5906 /* Output magic byte marker, if hot-patch attribute is set. */
5907 if (is_ms_hook)
5908 {
5909 if (TARGET_64BIT)
5910 {
5911 /* leaq [%rsp + 0], %rsp */
5912 asm_fprintf (asm_out_file, ASM_BYTE
5913 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5914 }
5915 else
5916 {
5917 /* movl.s %edi, %edi
5918 push %ebp
5919 movl.s %esp, %ebp */
5920 asm_fprintf (asm_out_file, ASM_BYTE
5921 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5922 }
5923 }
5924 }
5925
5926 /* regclass.c */
5927 extern void init_regs (void);
5928
5929 /* Implementation of the call ABI switching target hook. The call
5930 register sets specific to FNDECL are selected. See also
5931 ix86_conditional_register_usage for more details. */
5932 void
5933 ix86_call_abi_override (const_tree fndecl)
5934 {
5935 if (fndecl == NULL_TREE)
5936 cfun->machine->call_abi = ix86_abi;
5937 else
5938 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5939 }
5940
5941 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5942 Avoid expensive re-initialization of init_regs each time we switch function
5943 context, since this is needed only during RTL expansion. */
5944 static void
5945 ix86_maybe_switch_abi (void)
5946 {
5947 if (TARGET_64BIT &&
5948 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5949 reinit_regs ();
5950 }
5951
5952 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5953 for a call to a function whose data type is FNTYPE.
5954 For a library call, FNTYPE is 0. */
5955
5956 void
5957 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5958 tree fntype, /* tree ptr for function decl */
5959 rtx libname, /* SYMBOL_REF of library name or 0 */
5960 tree fndecl,
5961 int caller)
5962 {
5963 struct cgraph_local_info *i;
5964
5965 memset (cum, 0, sizeof (*cum));
5966
5967 if (fndecl)
5968 {
5969 i = cgraph_local_info (fndecl);
5970 cum->call_abi = ix86_function_abi (fndecl);
5971 }
5972 else
5973 {
5974 i = NULL;
5975 cum->call_abi = ix86_function_type_abi (fntype);
5976 }
5977
5978 cum->caller = caller;
5979
5980 /* Set up the number of registers to use for passing arguments. */
5981
5982 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5983 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5984 "or subtarget optimization implying it");
5985 cum->nregs = ix86_regparm;
5986 if (TARGET_64BIT)
5987 {
5988 cum->nregs = (cum->call_abi == SYSV_ABI
5989 ? X86_64_REGPARM_MAX
5990 : X86_64_MS_REGPARM_MAX);
5991 }
5992 if (TARGET_SSE)
5993 {
5994 cum->sse_nregs = SSE_REGPARM_MAX;
5995 if (TARGET_64BIT)
5996 {
5997 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5998 ? X86_64_SSE_REGPARM_MAX
5999 : X86_64_MS_SSE_REGPARM_MAX);
6000 }
6001 }
6002 if (TARGET_MMX)
6003 cum->mmx_nregs = MMX_REGPARM_MAX;
6004 cum->warn_avx = true;
6005 cum->warn_sse = true;
6006 cum->warn_mmx = true;
6007
6008 /* Because the type might mismatch between caller and callee, we need to
6009 use the actual type of the function for local calls.
6010 FIXME: cgraph_analyze can be told to actually record whether a function
6011 uses va_start, so for local functions maybe_vaarg can be made more
6012 aggressive, helping K&R code.
6013 FIXME: once the type system is fixed, we won't need this code anymore. */
6014 if (i && i->local && i->can_change_signature)
6015 fntype = TREE_TYPE (fndecl);
6016 cum->maybe_vaarg = (fntype
6017 ? (!prototype_p (fntype) || stdarg_p (fntype))
6018 : !libname);
6019
6020 if (!TARGET_64BIT)
6021 {
6022 /* If there are variable arguments, then we won't pass anything
6023 in registers in 32-bit mode. */
6024 if (stdarg_p (fntype))
6025 {
6026 cum->nregs = 0;
6027 cum->sse_nregs = 0;
6028 cum->mmx_nregs = 0;
6029 cum->warn_avx = 0;
6030 cum->warn_sse = 0;
6031 cum->warn_mmx = 0;
6032 return;
6033 }
6034
6035 /* Use ecx and edx registers if function has fastcall attribute,
6036 else look for regparm information. */
6037 if (fntype)
6038 {
6039 unsigned int ccvt = ix86_get_callcvt (fntype);
6040 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6041 {
6042 cum->nregs = 1;
6043 cum->fastcall = 1; /* Same first register as in fastcall. */
6044 }
6045 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6046 {
6047 cum->nregs = 2;
6048 cum->fastcall = 1;
6049 }
6050 else
6051 cum->nregs = ix86_function_regparm (fntype, fndecl);
6052 }
6053
6054 /* Set up the number of SSE registers used for passing SFmode
6055 and DFmode arguments. Warn for mismatching ABI. */
6056 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
6057 }
6058 }
6059
6060 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
6061 But in the case of vector types, it is some vector mode.
6062
6063 When we have only some of our vector isa extensions enabled, then there
6064 are some modes for which vector_mode_supported_p is false. For these
6065 modes, the generic vector support in gcc will choose some non-vector mode
6066 in order to implement the type. By computing the natural mode, we'll
6067 select the proper ABI location for the operand and not depend on whatever
6068 the middle-end decides to do with these vector types.
6069
6070 The middle-end can't deal with vector types > 16 bytes. In this
6071 case, we return the original mode and warn about an ABI change if CUM
6072 isn't NULL. */
6073
6074 static enum machine_mode
6075 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
6076 {
6077 enum machine_mode mode = TYPE_MODE (type);
6078
6079 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
6080 {
6081 HOST_WIDE_INT size = int_size_in_bytes (type);
6082 if ((size == 8 || size == 16 || size == 32)
6083 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
6084 && TYPE_VECTOR_SUBPARTS (type) > 1)
6085 {
6086 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
6087
6088 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
6089 mode = MIN_MODE_VECTOR_FLOAT;
6090 else
6091 mode = MIN_MODE_VECTOR_INT;
6092
6093 /* Get the mode which has this inner mode and number of units. */
6094 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
6095 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
6096 && GET_MODE_INNER (mode) == innermode)
6097 {
6098 if (size == 32 && !TARGET_AVX)
6099 {
6100 static bool warnedavx;
6101
6102 if (cum
6103 && !warnedavx
6104 && cum->warn_avx)
6105 {
6106 warnedavx = true;
6107 warning (0, "AVX vector argument without AVX "
6108 "enabled changes the ABI");
6109 }
6110 return TYPE_MODE (type);
6111 }
6112 else if ((size == 8 || size == 16) && !TARGET_SSE)
6113 {
6114 static bool warnedsse;
6115
6116 if (cum
6117 && !warnedsse
6118 && cum->warn_sse)
6119 {
6120 warnedsse = true;
6121 warning (0, "SSE vector argument without SSE "
6122 "enabled changes the ABI");
6123 }
6124 return mode;
6125 }
6126 else
6127 return mode;
6128 }
6129
6130 gcc_unreachable ();
6131 }
6132 }
6133
6134 return mode;
6135 }
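/* Illustrative example: for a hypothetical generic vector type

       typedef float hyp_v4sf __attribute__((vector_size (16)));

   the natural mode is V4SFmode, so the argument is given an SSE register
   slot even when the middle-end fell back to a non-vector TYPE_MODE;
   if SSE is disabled, the ABI-change warning above is emitted first.  */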
6136
6137 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
6138 this may not agree with the mode that the type system has chosen for the
6139 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
6140 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
6141
6142 static rtx
6143 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
6144 unsigned int regno)
6145 {
6146 rtx tmp;
6147
6148 if (orig_mode != BLKmode)
6149 tmp = gen_rtx_REG (orig_mode, regno);
6150 else
6151 {
6152 tmp = gen_rtx_REG (mode, regno);
6153 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
6154 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
6155 }
6156
6157 return tmp;
6158 }
6159
6160 /* x86-64 register passing implementation. See the x86-64 ABI for details.
6161 The goal of this code is to classify each 8 bytes of an incoming argument
6162 by register class and assign registers accordingly. */
6163
6164 /* Return the union class of CLASS1 and CLASS2.
6165 See the x86-64 PS ABI for details. */
6166
6167 static enum x86_64_reg_class
6168 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
6169 {
6170 /* Rule #1: If both classes are equal, this is the resulting class. */
6171 if (class1 == class2)
6172 return class1;
6173
6174 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
6175 the other class. */
6176 if (class1 == X86_64_NO_CLASS)
6177 return class2;
6178 if (class2 == X86_64_NO_CLASS)
6179 return class1;
6180
6181 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
6182 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
6183 return X86_64_MEMORY_CLASS;
6184
6185 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
6186 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
6187 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
6188 return X86_64_INTEGERSI_CLASS;
6189 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
6190 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
6191 return X86_64_INTEGER_CLASS;
6192
6193 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
6194 MEMORY is used. */
6195 if (class1 == X86_64_X87_CLASS
6196 || class1 == X86_64_X87UP_CLASS
6197 || class1 == X86_64_COMPLEX_X87_CLASS
6198 || class2 == X86_64_X87_CLASS
6199 || class2 == X86_64_X87UP_CLASS
6200 || class2 == X86_64_COMPLEX_X87_CLASS)
6201 return X86_64_MEMORY_CLASS;
6202
6203 /* Rule #6: Otherwise class SSE is used. */
6204 return X86_64_SSE_CLASS;
6205 }
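/* Illustrative example: merging X86_64_NO_CLASS with X86_64_SSE_CLASS
   yields X86_64_SSE_CLASS (rule #2), while merging X86_64_INTEGER_CLASS
   with X86_64_SSE_CLASS yields X86_64_INTEGER_CLASS (rule #4). That is
   how a hypothetical

       union hyp_u { long l; double d; };

   ends up as a single X86_64_INTEGER_CLASS eightbyte and is passed in a
   general-purpose register.  */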
6206
6207 /* Classify the argument of type TYPE and mode MODE.
6208 CLASSES will be filled by the register class used to pass each word
6209 of the operand. The number of words is returned. In case the parameter
6210 should be passed in memory, 0 is returned. As a special case for zero
6211 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6212
6213 BIT_OFFSET is used internally for handling records and specifies the
6214 offset in bits modulo 256, to avoid overflow cases.
6215
6216 See the x86-64 PS ABI for details.
6217 */
6218
6219 static int
6220 classify_argument (enum machine_mode mode, const_tree type,
6221 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6222 {
6223 HOST_WIDE_INT bytes =
6224 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6225 int words
6226 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6227
6228 /* Variable sized entities are always passed/returned in memory. */
6229 if (bytes < 0)
6230 return 0;
6231
6232 if (mode != VOIDmode
6233 && targetm.calls.must_pass_in_stack (mode, type))
6234 return 0;
6235
6236 if (type && AGGREGATE_TYPE_P (type))
6237 {
6238 int i;
6239 tree field;
6240 enum x86_64_reg_class subclasses[MAX_CLASSES];
6241
6242 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6243 if (bytes > 32)
6244 return 0;
6245
6246 for (i = 0; i < words; i++)
6247 classes[i] = X86_64_NO_CLASS;
6248
6249 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
6250 signal the memory class, so handle this as a special case. */
6251 if (!words)
6252 {
6253 classes[0] = X86_64_NO_CLASS;
6254 return 1;
6255 }
6256
6257 /* Classify each field of record and merge classes. */
6258 switch (TREE_CODE (type))
6259 {
6260 case RECORD_TYPE:
6261 /* And now merge the fields of structure. */
6262 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6263 {
6264 if (TREE_CODE (field) == FIELD_DECL)
6265 {
6266 int num;
6267
6268 if (TREE_TYPE (field) == error_mark_node)
6269 continue;
6270
6271 /* Bitfields are always classified as integer. Handle them
6272 early, since later code would consider them to be
6273 misaligned integers. */
6274 if (DECL_BIT_FIELD (field))
6275 {
6276 for (i = (int_bit_position (field)
6277 + (bit_offset % 64)) / 8 / 8;
6278 i < ((int_bit_position (field) + (bit_offset % 64))
6279 + tree_to_shwi (DECL_SIZE (field))
6280 + 63) / 8 / 8; i++)
6281 classes[i] =
6282 merge_classes (X86_64_INTEGER_CLASS,
6283 classes[i]);
6284 }
6285 else
6286 {
6287 int pos;
6288
6289 type = TREE_TYPE (field);
6290
6291 /* Flexible array member is ignored. */
6292 if (TYPE_MODE (type) == BLKmode
6293 && TREE_CODE (type) == ARRAY_TYPE
6294 && TYPE_SIZE (type) == NULL_TREE
6295 && TYPE_DOMAIN (type) != NULL_TREE
6296 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6297 == NULL_TREE))
6298 {
6299 static bool warned;
6300
6301 if (!warned && warn_psabi)
6302 {
6303 warned = true;
6304 inform (input_location,
6305 "the ABI of passing struct with"
6306 " a flexible array member has"
6307 " changed in GCC 4.4");
6308 }
6309 continue;
6310 }
6311 num = classify_argument (TYPE_MODE (type), type,
6312 subclasses,
6313 (int_bit_position (field)
6314 + bit_offset) % 256);
6315 if (!num)
6316 return 0;
6317 pos = (int_bit_position (field)
6318 + (bit_offset % 64)) / 8 / 8;
6319 for (i = 0; i < num && (i + pos) < words; i++)
6320 classes[i + pos] =
6321 merge_classes (subclasses[i], classes[i + pos]);
6322 }
6323 }
6324 }
6325 break;
6326
6327 case ARRAY_TYPE:
6328 /* Arrays are handled as small records. */
6329 {
6330 int num;
6331 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6332 TREE_TYPE (type), subclasses, bit_offset);
6333 if (!num)
6334 return 0;
6335
6336 /* The partial classes are now full classes. */
6337 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6338 subclasses[0] = X86_64_SSE_CLASS;
6339 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6340 && !((bit_offset % 64) == 0 && bytes == 4))
6341 subclasses[0] = X86_64_INTEGER_CLASS;
6342
6343 for (i = 0; i < words; i++)
6344 classes[i] = subclasses[i % num];
6345
6346 break;
6347 }
6348 case UNION_TYPE:
6349 case QUAL_UNION_TYPE:
6350 /* Unions are similar to RECORD_TYPE but offset is always 0.
6351 */
6352 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6353 {
6354 if (TREE_CODE (field) == FIELD_DECL)
6355 {
6356 int num;
6357
6358 if (TREE_TYPE (field) == error_mark_node)
6359 continue;
6360
6361 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6362 TREE_TYPE (field), subclasses,
6363 bit_offset);
6364 if (!num)
6365 return 0;
6366 for (i = 0; i < num; i++)
6367 classes[i] = merge_classes (subclasses[i], classes[i]);
6368 }
6369 }
6370 break;
6371
6372 default:
6373 gcc_unreachable ();
6374 }
6375
6376 if (words > 2)
6377 {
6378 /* When size > 16 bytes, the aggregate is passed in memory
6379 unless the first word is X86_64_SSE_CLASS and all the
6380 remaining words are X86_64_SSEUP_CLASS; check that here
6381 and fall back to memory otherwise. */
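 /* For example, a 32-byte struct containing a single __m256 classifies
    as { SSE, SSEUP, SSEUP, SSEUP } and stays in a YMM register, while
    struct { double a, b, c; } (24 bytes) fails this test and goes to
    memory.  */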
6382 if (classes[0] != X86_64_SSE_CLASS)
6383 return 0;
6384
6385 for (i = 1; i < words; i++)
6386 if (classes[i] != X86_64_SSEUP_CLASS)
6387 return 0;
6388 }
6389
6390 /* Final merger cleanup. */
6391 for (i = 0; i < words; i++)
6392 {
6393 /* If one class is MEMORY, everything should be passed in
6394 memory. */
6395 if (classes[i] == X86_64_MEMORY_CLASS)
6396 return 0;
6397
6398 /* X86_64_SSEUP_CLASS should always be preceded by
6399 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6400 if (classes[i] == X86_64_SSEUP_CLASS
6401 && classes[i - 1] != X86_64_SSE_CLASS
6402 && classes[i - 1] != X86_64_SSEUP_CLASS)
6403 {
6404 /* The first one should never be X86_64_SSEUP_CLASS. */
6405 gcc_assert (i != 0);
6406 classes[i] = X86_64_SSE_CLASS;
6407 }
6408
6409 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6410 everything should be passed in memory. */
6411 if (classes[i] == X86_64_X87UP_CLASS
6412 && (classes[i - 1] != X86_64_X87_CLASS))
6413 {
6414 static bool warned;
6415
6416 /* The first one should never be X86_64_X87UP_CLASS. */
6417 gcc_assert (i != 0);
6418 if (!warned && warn_psabi)
6419 {
6420 warned = true;
6421 inform (input_location,
6422 "the ABI of passing union with long double"
6423 " has changed in GCC 4.4");
6424 }
6425 return 0;
6426 }
6427 }
6428 return words;
6429 }
6430
6431 /* Compute the alignment needed. We align all types to their natural
6432 boundaries, with the exception of XFmode, which is aligned to 64 bits. */
6433 if (mode != VOIDmode && mode != BLKmode)
6434 {
6435 int mode_alignment = GET_MODE_BITSIZE (mode);
6436
6437 if (mode == XFmode)
6438 mode_alignment = 128;
6439 else if (mode == XCmode)
6440 mode_alignment = 256;
6441 if (COMPLEX_MODE_P (mode))
6442 mode_alignment /= 2;
6443 /* Misaligned fields are always returned in memory. */
6444 if (bit_offset % mode_alignment)
6445 return 0;
6446 }
6447
6448 /* For V1xx modes, just use the base mode. */
6449 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6450 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6451 mode = GET_MODE_INNER (mode);
6452
6453 /* Classification of atomic types. */
6454 switch (mode)
6455 {
6456 case SDmode:
6457 case DDmode:
6458 classes[0] = X86_64_SSE_CLASS;
6459 return 1;
6460 case TDmode:
6461 classes[0] = X86_64_SSE_CLASS;
6462 classes[1] = X86_64_SSEUP_CLASS;
6463 return 2;
6464 case DImode:
6465 case SImode:
6466 case HImode:
6467 case QImode:
6468 case CSImode:
6469 case CHImode:
6470 case CQImode:
6471 {
6472 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6473
6474 if (size <= 32)
6475 {
6476 classes[0] = X86_64_INTEGERSI_CLASS;
6477 return 1;
6478 }
6479 else if (size <= 64)
6480 {
6481 classes[0] = X86_64_INTEGER_CLASS;
6482 return 1;
6483 }
6484 else if (size <= 64+32)
6485 {
6486 classes[0] = X86_64_INTEGER_CLASS;
6487 classes[1] = X86_64_INTEGERSI_CLASS;
6488 return 2;
6489 }
6490 else if (size <= 64+64)
6491 {
6492 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6493 return 2;
6494 }
6495 else
6496 gcc_unreachable ();
6497 }
6498 case CDImode:
6499 case TImode:
6500 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6501 return 2;
6502 case COImode:
6503 case OImode:
6504 /* OImode shouldn't be used directly. */
6505 gcc_unreachable ();
6506 case CTImode:
6507 return 0;
6508 case SFmode:
6509 if (!(bit_offset % 64))
6510 classes[0] = X86_64_SSESF_CLASS;
6511 else
6512 classes[0] = X86_64_SSE_CLASS;
6513 return 1;
6514 case DFmode:
6515 classes[0] = X86_64_SSEDF_CLASS;
6516 return 1;
6517 case XFmode:
6518 classes[0] = X86_64_X87_CLASS;
6519 classes[1] = X86_64_X87UP_CLASS;
6520 return 2;
6521 case TFmode:
6522 classes[0] = X86_64_SSE_CLASS;
6523 classes[1] = X86_64_SSEUP_CLASS;
6524 return 2;
6525 case SCmode:
6526 classes[0] = X86_64_SSE_CLASS;
6527 if (!(bit_offset % 64))
6528 return 1;
6529 else
6530 {
6531 static bool warned;
6532
6533 if (!warned && warn_psabi)
6534 {
6535 warned = true;
6536 inform (input_location,
6537 "the ABI of passing structure with complex float"
6538 " member has changed in GCC 4.4");
6539 }
6540 classes[1] = X86_64_SSESF_CLASS;
6541 return 2;
6542 }
6543 case DCmode:
6544 classes[0] = X86_64_SSEDF_CLASS;
6545 classes[1] = X86_64_SSEDF_CLASS;
6546 return 2;
6547 case XCmode:
6548 classes[0] = X86_64_COMPLEX_X87_CLASS;
6549 return 1;
6550 case TCmode:
6551 /* This mode is larger than 16 bytes. */
6552 return 0;
6553 case V8SFmode:
6554 case V8SImode:
6555 case V32QImode:
6556 case V16HImode:
6557 case V4DFmode:
6558 case V4DImode:
6559 classes[0] = X86_64_SSE_CLASS;
6560 classes[1] = X86_64_SSEUP_CLASS;
6561 classes[2] = X86_64_SSEUP_CLASS;
6562 classes[3] = X86_64_SSEUP_CLASS;
6563 return 4;
6564 case V4SFmode:
6565 case V4SImode:
6566 case V16QImode:
6567 case V8HImode:
6568 case V2DFmode:
6569 case V2DImode:
6570 classes[0] = X86_64_SSE_CLASS;
6571 classes[1] = X86_64_SSEUP_CLASS;
6572 return 2;
6573 case V1TImode:
6574 case V1DImode:
6575 case V2SFmode:
6576 case V2SImode:
6577 case V4HImode:
6578 case V8QImode:
6579 classes[0] = X86_64_SSE_CLASS;
6580 return 1;
6581 case BLKmode:
6582 case VOIDmode:
6583 return 0;
6584 default:
6585 gcc_assert (VECTOR_MODE_P (mode));
6586
6587 if (bytes > 16)
6588 return 0;
6589
6590 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6591
6592 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6593 classes[0] = X86_64_INTEGERSI_CLASS;
6594 else
6595 classes[0] = X86_64_INTEGER_CLASS;
6596 classes[1] = X86_64_INTEGER_CLASS;
6597 return 1 + (bytes > 8);
6598 }
6599 }
6600
6601 /* Examine the argument and set the number of registers required in each
6602 class. Return 0 iff the parameter should be passed in memory. */
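 /* For example, struct { double d; int i; } yields *int_nregs == 1 and
    *sse_nregs == 1 and a nonzero return; a 24-byte struct of three
    doubles yields a return of 0, i.e. it is passed in memory.  */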
6603 static int
6604 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6605 int *int_nregs, int *sse_nregs)
6606 {
6607 enum x86_64_reg_class regclass[MAX_CLASSES];
6608 int n = classify_argument (mode, type, regclass, 0);
6609
6610 *int_nregs = 0;
6611 *sse_nregs = 0;
6612 if (!n)
6613 return 0;
6614 for (n--; n >= 0; n--)
6615 switch (regclass[n])
6616 {
6617 case X86_64_INTEGER_CLASS:
6618 case X86_64_INTEGERSI_CLASS:
6619 (*int_nregs)++;
6620 break;
6621 case X86_64_SSE_CLASS:
6622 case X86_64_SSESF_CLASS:
6623 case X86_64_SSEDF_CLASS:
6624 (*sse_nregs)++;
6625 break;
6626 case X86_64_NO_CLASS:
6627 case X86_64_SSEUP_CLASS:
6628 break;
6629 case X86_64_X87_CLASS:
6630 case X86_64_X87UP_CLASS:
6631 if (!in_return)
6632 return 0;
6633 break;
6634 case X86_64_COMPLEX_X87_CLASS:
6635 return in_return ? 2 : 0;
6636 case X86_64_MEMORY_CLASS:
6637 gcc_unreachable ();
6638 }
6639 return 1;
6640 }
6641
6642 /* Construct container for the argument used by GCC interface. See
6643 FUNCTION_ARG for the detailed description. */
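 /* As an illustrative sketch: for struct { long l; double d; } passed as
    the first argument, classify_argument yields { INTEGER, SSEDF }, so
    the container is a PARALLEL of two EXPR_LISTs, (reg:DI di) at byte
    offset 0 and (reg:DF xmm0) at byte offset 8.  */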
6644
6645 static rtx
6646 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6647 const_tree type, int in_return, int nintregs, int nsseregs,
6648 const int *intreg, int sse_regno)
6649 {
6650 /* The following variables hold the static issued_error state. */
6651 static bool issued_sse_arg_error;
6652 static bool issued_sse_ret_error;
6653 static bool issued_x87_ret_error;
6654
6655 enum machine_mode tmpmode;
6656 int bytes =
6657 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6658 enum x86_64_reg_class regclass[MAX_CLASSES];
6659 int n;
6660 int i;
6661 int nexps = 0;
6662 int needed_sseregs, needed_intregs;
6663 rtx exp[MAX_CLASSES];
6664 rtx ret;
6665
6666 n = classify_argument (mode, type, regclass, 0);
6667 if (!n)
6668 return NULL;
6669 if (!examine_argument (mode, type, in_return, &needed_intregs,
6670 &needed_sseregs))
6671 return NULL;
6672 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6673 return NULL;
6674
6675 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6676 some less clueful developer tries to use floating-point anyway. */
6677 if (needed_sseregs && !TARGET_SSE)
6678 {
6679 if (in_return)
6680 {
6681 if (!issued_sse_ret_error)
6682 {
6683 error ("SSE register return with SSE disabled");
6684 issued_sse_ret_error = true;
6685 }
6686 }
6687 else if (!issued_sse_arg_error)
6688 {
6689 error ("SSE register argument with SSE disabled");
6690 issued_sse_arg_error = true;
6691 }
6692 return NULL;
6693 }
6694
6695 /* Likewise, error if the ABI requires us to return values in the
6696 x87 registers and the user specified -mno-80387. */
6697 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
6698 for (i = 0; i < n; i++)
6699 if (regclass[i] == X86_64_X87_CLASS
6700 || regclass[i] == X86_64_X87UP_CLASS
6701 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6702 {
6703 if (!issued_x87_ret_error)
6704 {
6705 error ("x87 register return with x87 disabled");
6706 issued_x87_ret_error = true;
6707 }
6708 return NULL;
6709 }
6710
6711 /* First construct the simple cases. Avoid SCmode, since we want to use
6712 a single register to pass this type. */
6713 if (n == 1 && mode != SCmode)
6714 switch (regclass[0])
6715 {
6716 case X86_64_INTEGER_CLASS:
6717 case X86_64_INTEGERSI_CLASS:
6718 return gen_rtx_REG (mode, intreg[0]);
6719 case X86_64_SSE_CLASS:
6720 case X86_64_SSESF_CLASS:
6721 case X86_64_SSEDF_CLASS:
6722 if (mode != BLKmode)
6723 return gen_reg_or_parallel (mode, orig_mode,
6724 SSE_REGNO (sse_regno));
6725 break;
6726 case X86_64_X87_CLASS:
6727 case X86_64_COMPLEX_X87_CLASS:
6728 return gen_rtx_REG (mode, FIRST_STACK_REG);
6729 case X86_64_NO_CLASS:
6730 /* Zero sized array, struct or class. */
6731 return NULL;
6732 default:
6733 gcc_unreachable ();
6734 }
6735 if (n == 2
6736 && regclass[0] == X86_64_SSE_CLASS
6737 && regclass[1] == X86_64_SSEUP_CLASS
6738 && mode != BLKmode)
6739 return gen_reg_or_parallel (mode, orig_mode,
6740 SSE_REGNO (sse_regno));
6741 if (n == 4
6742 && regclass[0] == X86_64_SSE_CLASS
6743 && regclass[1] == X86_64_SSEUP_CLASS
6744 && regclass[2] == X86_64_SSEUP_CLASS
6745 && regclass[3] == X86_64_SSEUP_CLASS
6746 && mode != BLKmode)
6747 return gen_reg_or_parallel (mode, orig_mode,
6748 SSE_REGNO (sse_regno));
6749 if (n == 2
6750 && regclass[0] == X86_64_X87_CLASS
6751 && regclass[1] == X86_64_X87UP_CLASS)
6752 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6753
6754 if (n == 2
6755 && regclass[0] == X86_64_INTEGER_CLASS
6756 && regclass[1] == X86_64_INTEGER_CLASS
6757 && (mode == CDImode || mode == TImode || mode == TFmode)
6758 && intreg[0] + 1 == intreg[1])
6759 return gen_rtx_REG (mode, intreg[0]);
6760
6761 /* Otherwise figure out the entries of the PARALLEL. */
6762 for (i = 0; i < n; i++)
6763 {
6764 int pos;
6765
6766 switch (regclass[i])
6767 {
6768 case X86_64_NO_CLASS:
6769 break;
6770 case X86_64_INTEGER_CLASS:
6771 case X86_64_INTEGERSI_CLASS:
6772 /* Merge TImodes on aligned occasions here too. */
6773 if (i * 8 + 8 > bytes)
6774 tmpmode
6775 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6776 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6777 tmpmode = SImode;
6778 else
6779 tmpmode = DImode;
6780 /* We've requested 24 bytes for which we
6781 don't have a mode. Use DImode. */
6782 if (tmpmode == BLKmode)
6783 tmpmode = DImode;
6784 exp [nexps++]
6785 = gen_rtx_EXPR_LIST (VOIDmode,
6786 gen_rtx_REG (tmpmode, *intreg),
6787 GEN_INT (i*8));
6788 intreg++;
6789 break;
6790 case X86_64_SSESF_CLASS:
6791 exp [nexps++]
6792 = gen_rtx_EXPR_LIST (VOIDmode,
6793 gen_rtx_REG (SFmode,
6794 SSE_REGNO (sse_regno)),
6795 GEN_INT (i*8));
6796 sse_regno++;
6797 break;
6798 case X86_64_SSEDF_CLASS:
6799 exp [nexps++]
6800 = gen_rtx_EXPR_LIST (VOIDmode,
6801 gen_rtx_REG (DFmode,
6802 SSE_REGNO (sse_regno)),
6803 GEN_INT (i*8));
6804 sse_regno++;
6805 break;
6806 case X86_64_SSE_CLASS:
6807 pos = i;
6808 switch (n)
6809 {
6810 case 1:
6811 tmpmode = DImode;
6812 break;
6813 case 2:
6814 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6815 {
6816 tmpmode = TImode;
6817 i++;
6818 }
6819 else
6820 tmpmode = DImode;
6821 break;
6822 case 4:
6823 gcc_assert (i == 0
6824 && regclass[1] == X86_64_SSEUP_CLASS
6825 && regclass[2] == X86_64_SSEUP_CLASS
6826 && regclass[3] == X86_64_SSEUP_CLASS);
6827 tmpmode = OImode;
6828 i += 3;
6829 break;
6830 default:
6831 gcc_unreachable ();
6832 }
6833 exp [nexps++]
6834 = gen_rtx_EXPR_LIST (VOIDmode,
6835 gen_rtx_REG (tmpmode,
6836 SSE_REGNO (sse_regno)),
6837 GEN_INT (pos*8));
6838 sse_regno++;
6839 break;
6840 default:
6841 gcc_unreachable ();
6842 }
6843 }
6844
6845 /* Empty aligned struct, union or class. */
6846 if (nexps == 0)
6847 return NULL;
6848
6849 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6850 for (i = 0; i < nexps; i++)
6851 XVECEXP (ret, 0, i) = exp [i];
6852 return ret;
6853 }
6854
6855 /* Update the data in CUM to advance over an argument of mode MODE
6856 and data type TYPE. (TYPE is null for libcalls where that information
6857 may not be available.) */
6858
6859 static void
6860 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6861 const_tree type, HOST_WIDE_INT bytes,
6862 HOST_WIDE_INT words)
6863 {
6864 switch (mode)
6865 {
6866 default:
6867 break;
6868
6869 case BLKmode:
6870 if (bytes < 0)
6871 break;
6872 /* FALLTHRU */
6873
6874 case DImode:
6875 case SImode:
6876 case HImode:
6877 case QImode:
6878 cum->words += words;
6879 cum->nregs -= words;
6880 cum->regno += words;
6881
6882 if (cum->nregs <= 0)
6883 {
6884 cum->nregs = 0;
6885 cum->regno = 0;
6886 }
6887 break;
6888
6889 case OImode:
6890 /* OImode shouldn't be used directly. */
6891 gcc_unreachable ();
6892
6893 case DFmode:
6894 if (cum->float_in_sse < 2)
6895 break;
6896 case SFmode:
6897 if (cum->float_in_sse < 1)
6898 break;
6899 /* FALLTHRU */
6900
6901 case V8SFmode:
6902 case V8SImode:
6903 case V32QImode:
6904 case V16HImode:
6905 case V4DFmode:
6906 case V4DImode:
6907 case TImode:
6908 case V16QImode:
6909 case V8HImode:
6910 case V4SImode:
6911 case V2DImode:
6912 case V4SFmode:
6913 case V2DFmode:
6914 if (!type || !AGGREGATE_TYPE_P (type))
6915 {
6916 cum->sse_words += words;
6917 cum->sse_nregs -= 1;
6918 cum->sse_regno += 1;
6919 if (cum->sse_nregs <= 0)
6920 {
6921 cum->sse_nregs = 0;
6922 cum->sse_regno = 0;
6923 }
6924 }
6925 break;
6926
6927 case V8QImode:
6928 case V4HImode:
6929 case V2SImode:
6930 case V2SFmode:
6931 case V1TImode:
6932 case V1DImode:
6933 if (!type || !AGGREGATE_TYPE_P (type))
6934 {
6935 cum->mmx_words += words;
6936 cum->mmx_nregs -= 1;
6937 cum->mmx_regno += 1;
6938 if (cum->mmx_nregs <= 0)
6939 {
6940 cum->mmx_nregs = 0;
6941 cum->mmx_regno = 0;
6942 }
6943 }
6944 break;
6945 }
6946 }
6947
6948 static void
6949 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6950 const_tree type, HOST_WIDE_INT words, bool named)
6951 {
6952 int int_nregs, sse_nregs;
6953
6954 /* Unnamed 256bit vector mode parameters are passed on stack. */
6955 if (!named && VALID_AVX256_REG_MODE (mode))
6956 return;
6957
6958 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6959 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6960 {
6961 cum->nregs -= int_nregs;
6962 cum->sse_nregs -= sse_nregs;
6963 cum->regno += int_nregs;
6964 cum->sse_regno += sse_nregs;
6965 }
6966 else
6967 {
6968 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6969 cum->words = (cum->words + align - 1) & ~(align - 1);
6970 cum->words += words;
6971 }
6972 }
6973
6974 static void
6975 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6976 HOST_WIDE_INT words)
6977 {
6978 /* Otherwise, this should be passed indirect. */
6979 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6980
6981 cum->words += words;
6982 if (cum->nregs > 0)
6983 {
6984 cum->nregs -= 1;
6985 cum->regno += 1;
6986 }
6987 }
6988
6989 /* Update the data in CUM to advance over an argument of mode MODE and
6990 data type TYPE. (TYPE is null for libcalls where that information
6991 may not be available.) */
6992
6993 static void
6994 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6995 const_tree type, bool named)
6996 {
6997 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6998 HOST_WIDE_INT bytes, words;
6999
7000 if (mode == BLKmode)
7001 bytes = int_size_in_bytes (type);
7002 else
7003 bytes = GET_MODE_SIZE (mode);
7004 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7005
7006 if (type)
7007 mode = type_natural_mode (type, NULL);
7008
7009 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7010 function_arg_advance_ms_64 (cum, bytes, words);
7011 else if (TARGET_64BIT)
7012 function_arg_advance_64 (cum, mode, type, words, named);
7013 else
7014 function_arg_advance_32 (cum, mode, type, bytes, words);
7015 }
7016
7017 /* Define where to put the arguments to a function.
7018 Value is zero to push the argument on the stack,
7019 or a hard register in which to store the argument.
7020
7021 MODE is the argument's machine mode.
7022 TYPE is the data type of the argument (as a tree).
7023 This is null for libcalls where that information may
7024 not be available.
7025 CUM is a variable of type CUMULATIVE_ARGS which gives info about
7026 the preceding args and about the function being called.
7027 NAMED is nonzero if this argument is a named parameter
7028 (otherwise it is an extra parameter matching an ellipsis). */
7029
7030 static rtx
7031 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7032 enum machine_mode orig_mode, const_tree type,
7033 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
7034 {
7035 static bool warnedsse, warnedmmx;
7036
7037 /* Avoid the AL settings for the Unix64 ABI. */
7038 if (mode == VOIDmode)
7039 return constm1_rtx;
7040
7041 switch (mode)
7042 {
7043 default:
7044 break;
7045
7046 case BLKmode:
7047 if (bytes < 0)
7048 break;
7049 /* FALLTHRU */
7050 case DImode:
7051 case SImode:
7052 case HImode:
7053 case QImode:
7054 if (words <= cum->nregs)
7055 {
7056 int regno = cum->regno;
7057
7058 /* Fastcall allocates the first two DWORD (SImode) or
7059 smaller arguments to ECX and EDX if it isn't an
7060 aggregate type. */
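 /* E.g. __attribute__((fastcall)) int f (int a, int b, int c)
    receives A in %ecx, B in %edx and C on the stack.  */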
7061 if (cum->fastcall)
7062 {
7063 if (mode == BLKmode
7064 || mode == DImode
7065 || (type && AGGREGATE_TYPE_P (type)))
7066 break;
7067
7068 /* ECX not EAX is the first allocated register. */
7069 if (regno == AX_REG)
7070 regno = CX_REG;
7071 }
7072 return gen_rtx_REG (mode, regno);
7073 }
7074 break;
7075
7076 case DFmode:
7077 if (cum->float_in_sse < 2)
7078 break;
7079 case SFmode:
7080 if (cum->float_in_sse < 1)
7081 break;
7082 /* FALLTHRU */
7083 case TImode:
7084 /* In 32bit, we pass TImode in xmm registers. */
7085 case V16QImode:
7086 case V8HImode:
7087 case V4SImode:
7088 case V2DImode:
7089 case V4SFmode:
7090 case V2DFmode:
7091 if (!type || !AGGREGATE_TYPE_P (type))
7092 {
7093 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
7094 {
7095 warnedsse = true;
7096 warning (0, "SSE vector argument without SSE enabled "
7097 "changes the ABI");
7098 }
7099 if (cum->sse_nregs)
7100 return gen_reg_or_parallel (mode, orig_mode,
7101 cum->sse_regno + FIRST_SSE_REG);
7102 }
7103 break;
7104
7105 case OImode:
7106 /* OImode shouldn't be used directly. */
7107 gcc_unreachable ();
7108
7109 case V8SFmode:
7110 case V8SImode:
7111 case V32QImode:
7112 case V16HImode:
7113 case V4DFmode:
7114 case V4DImode:
7115 if (!type || !AGGREGATE_TYPE_P (type))
7116 {
7117 if (cum->sse_nregs)
7118 return gen_reg_or_parallel (mode, orig_mode,
7119 cum->sse_regno + FIRST_SSE_REG);
7120 }
7121 break;
7122
7123 case V8QImode:
7124 case V4HImode:
7125 case V2SImode:
7126 case V2SFmode:
7127 case V1TImode:
7128 case V1DImode:
7129 if (!type || !AGGREGATE_TYPE_P (type))
7130 {
7131 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
7132 {
7133 warnedmmx = true;
7134 warning (0, "MMX vector argument without MMX enabled "
7135 "changes the ABI");
7136 }
7137 if (cum->mmx_nregs)
7138 return gen_reg_or_parallel (mode, orig_mode,
7139 cum->mmx_regno + FIRST_MMX_REG);
7140 }
7141 break;
7142 }
7143
7144 return NULL_RTX;
7145 }
7146
7147 static rtx
7148 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7149 enum machine_mode orig_mode, const_tree type, bool named)
7150 {
7151 /* Handle a hidden AL argument containing number of registers
7152 for varargs x86-64 functions. */
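 /* For example, a call such as printf ("%f", x) sets %al to 1 because
    one SSE register (%xmm0) carries an argument, while a variadic call
    with no floating-point arguments sets %al to 0.  */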
7153 if (mode == VOIDmode)
7154 return GEN_INT (cum->maybe_vaarg
7155 ? (cum->sse_nregs < 0
7156 ? X86_64_SSE_REGPARM_MAX
7157 : cum->sse_regno)
7158 : -1);
7159
7160 switch (mode)
7161 {
7162 default:
7163 break;
7164
7165 case V8SFmode:
7166 case V8SImode:
7167 case V32QImode:
7168 case V16HImode:
7169 case V4DFmode:
7170 case V4DImode:
7171 /* Unnamed 256bit vector mode parameters are passed on stack. */
7172 if (!named)
7173 return NULL;
7174 break;
7175 }
7176
7177 return construct_container (mode, orig_mode, type, 0, cum->nregs,
7178 cum->sse_nregs,
7179 &x86_64_int_parameter_registers [cum->regno],
7180 cum->sse_regno);
7181 }
7182
7183 static rtx
7184 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7185 enum machine_mode orig_mode, bool named,
7186 HOST_WIDE_INT bytes)
7187 {
7188 unsigned int regno;
7189
7190 /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call.
7191 We use a value of -2 to specify that the current function call is MS_ABI. */
7192 if (mode == VOIDmode)
7193 return GEN_INT (-2);
7194
7195 /* If we've run out of registers, it goes on the stack. */
7196 if (cum->nregs == 0)
7197 return NULL_RTX;
7198
7199 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
7200
7201 /* Only floating point modes are passed in anything but integer regs. */
7202 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7203 {
7204 if (named)
7205 regno = cum->regno + FIRST_SSE_REG;
7206 else
7207 {
7208 rtx t1, t2;
7209
7210 /* Unnamed floating parameters are passed in both the
7211 SSE and integer registers. */
7212 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7213 t2 = gen_rtx_REG (mode, regno);
7214 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7215 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7216 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7217 }
7218 }
7219 /* Handle aggregate types passed in a register. */
7220 if (orig_mode == BLKmode)
7221 {
7222 if (bytes > 0 && bytes <= 8)
7223 mode = (bytes > 4 ? DImode : SImode);
7224 if (mode == BLKmode)
7225 mode = DImode;
7226 }
7227
7228 return gen_reg_or_parallel (mode, orig_mode, regno);
7229 }
7230
7231 /* Return where to put the arguments to a function.
7232 Return zero to push the argument on the stack, or a hard register in which to store the argument.
7233
7234 MODE is the argument's machine mode. TYPE is the data type of the
7235 argument. It is null for libcalls where that information may not be
7236 available. CUM gives information about the preceding args and about
7237 the function being called. NAMED is nonzero if this argument is a
7238 named parameter (otherwise it is an extra parameter matching an
7239 ellipsis). */
7240
7241 static rtx
7242 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7243 const_tree type, bool named)
7244 {
7245 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7246 enum machine_mode mode = omode;
7247 HOST_WIDE_INT bytes, words;
7248 rtx arg;
7249
7250 if (mode == BLKmode)
7251 bytes = int_size_in_bytes (type);
7252 else
7253 bytes = GET_MODE_SIZE (mode);
7254 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7255
7256 /* To simplify the code below, represent vector types with a vector mode
7257 even if MMX/SSE are not active. */
7258 if (type && TREE_CODE (type) == VECTOR_TYPE)
7259 mode = type_natural_mode (type, cum);
7260
7261 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7262 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7263 else if (TARGET_64BIT)
7264 arg = function_arg_64 (cum, mode, omode, type, named);
7265 else
7266 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7267
7268 return arg;
7269 }
7270
7271 /* A C expression that indicates when an argument must be passed by
7272 reference. If nonzero for an argument, a copy of that argument is
7273 made in memory and a pointer to the argument is passed instead of
7274 the argument itself. The pointer is passed in whatever way is
7275 appropriate for passing a pointer to that type. */
7276
7277 static bool
7278 ix86_pass_by_reference (cumulative_args_t cum_v, enum machine_mode mode,
7279 const_tree type, bool named ATTRIBUTE_UNUSED)
7280 {
7281 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7282
7283 /* See Windows x64 Software Convention. */
7284 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7285 {
7286 int msize = (int) GET_MODE_SIZE (mode);
7287 if (type)
7288 {
7289 /* Arrays are passed by reference. */
7290 if (TREE_CODE (type) == ARRAY_TYPE)
7291 return true;
7292
7293 if (AGGREGATE_TYPE_P (type))
7294 {
7295 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7296 are passed by reference. */
7297 msize = int_size_in_bytes (type);
7298 }
7299 }
7300
7301 /* __m128 is passed by reference. */
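 /* That is, any argument whose size is not 1, 2, 4 or 8 bytes, e.g.
    __m128 or a 3-byte struct, is passed by reference here, while an
    8-byte struct is still passed by value.  */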
7302 switch (msize) {
7303 case 1: case 2: case 4: case 8:
7304 break;
7305 default:
7306 return true;
7307 }
7308 }
7309 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7310 return true;
7311
7312 return false;
7313 }
7314
7315 /* Return true when TYPE should be 128bit aligned for 32bit argument
7316 passing ABI. XXX: This function is obsolete and is only used for
7317 checking psABI compatibility with previous versions of GCC. */
7318
7319 static bool
7320 ix86_compat_aligned_value_p (const_tree type)
7321 {
7322 enum machine_mode mode = TYPE_MODE (type);
7323 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7324 || mode == TDmode
7325 || mode == TFmode
7326 || mode == TCmode)
7327 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7328 return true;
7329 if (TYPE_ALIGN (type) < 128)
7330 return false;
7331
7332 if (AGGREGATE_TYPE_P (type))
7333 {
7334 /* Walk the aggregates recursively. */
7335 switch (TREE_CODE (type))
7336 {
7337 case RECORD_TYPE:
7338 case UNION_TYPE:
7339 case QUAL_UNION_TYPE:
7340 {
7341 tree field;
7342
7343 /* Walk all the structure fields. */
7344 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7345 {
7346 if (TREE_CODE (field) == FIELD_DECL
7347 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7348 return true;
7349 }
7350 break;
7351 }
7352
7353 case ARRAY_TYPE:
7354 /* Just in case some language passes arrays by value. */
7355 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7356 return true;
7357 break;
7358
7359 default:
7360 gcc_unreachable ();
7361 }
7362 }
7363 return false;
7364 }
7365
7366 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7367 XXX: This function is obsolete and is only used for checking psABI
7368 compatibility with previous versions of GCC. */
7369
7370 static unsigned int
7371 ix86_compat_function_arg_boundary (enum machine_mode mode,
7372 const_tree type, unsigned int align)
7373 {
7374 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7375 natural boundaries. */
7376 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7377 {
7378 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7379 make an exception for SSE modes since these require 128bit
7380 alignment.
7381
7382 The handling here differs from field_alignment. ICC aligns MMX
7383 arguments to 4 byte boundaries, while structure fields are aligned
7384 to 8 byte boundaries. */
7385 if (!type)
7386 {
7387 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7388 align = PARM_BOUNDARY;
7389 }
7390 else
7391 {
7392 if (!ix86_compat_aligned_value_p (type))
7393 align = PARM_BOUNDARY;
7394 }
7395 }
7396 if (align > BIGGEST_ALIGNMENT)
7397 align = BIGGEST_ALIGNMENT;
7398 return align;
7399 }
7400
7401 /* Return true when TYPE should be 128bit aligned for 32bit argument
7402 passing ABI. */
7403
7404 static bool
7405 ix86_contains_aligned_value_p (const_tree type)
7406 {
7407 enum machine_mode mode = TYPE_MODE (type);
7408
7409 if (mode == XFmode || mode == XCmode)
7410 return false;
7411
7412 if (TYPE_ALIGN (type) < 128)
7413 return false;
7414
7415 if (AGGREGATE_TYPE_P (type))
7416 {
7417 /* Walk the aggregates recursively. */
7418 switch (TREE_CODE (type))
7419 {
7420 case RECORD_TYPE:
7421 case UNION_TYPE:
7422 case QUAL_UNION_TYPE:
7423 {
7424 tree field;
7425
7426 /* Walk all the structure fields. */
7427 for (field = TYPE_FIELDS (type);
7428 field;
7429 field = DECL_CHAIN (field))
7430 {
7431 if (TREE_CODE (field) == FIELD_DECL
7432 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7433 return true;
7434 }
7435 break;
7436 }
7437
7438 case ARRAY_TYPE:
7439 /* Just in case some language passes arrays by value. */
7440 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7441 return true;
7442 break;
7443
7444 default:
7445 gcc_unreachable ();
7446 }
7447 }
7448 else
7449 return TYPE_ALIGN (type) >= 128;
7450
7451 return false;
7452 }
7453
7454 /* Gives the alignment boundary, in bits, of an argument with the
7455 specified mode and type. */
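 /* For example, on 32-bit targets a __m128 argument reports a 128-bit
    boundary, whereas a plain double argument reports PARM_BOUNDARY
    (32 bits).  */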
7456
7457 static unsigned int
7458 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7459 {
7460 unsigned int align;
7461 if (type)
7462 {
7463 /* Since calls use the main variant of the type, convert TYPE to
7464 its main variant. */
7465 type = TYPE_MAIN_VARIANT (type);
7466 align = TYPE_ALIGN (type);
7467 }
7468 else
7469 align = GET_MODE_ALIGNMENT (mode);
7470 if (align < PARM_BOUNDARY)
7471 align = PARM_BOUNDARY;
7472 else
7473 {
7474 static bool warned;
7475 unsigned int saved_align = align;
7476
7477 if (!TARGET_64BIT)
7478 {
7479 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7480 if (!type)
7481 {
7482 if (mode == XFmode || mode == XCmode)
7483 align = PARM_BOUNDARY;
7484 }
7485 else if (!ix86_contains_aligned_value_p (type))
7486 align = PARM_BOUNDARY;
7487
7488 if (align < 128)
7489 align = PARM_BOUNDARY;
7490 }
7491
7492 if (warn_psabi
7493 && !warned
7494 && align != ix86_compat_function_arg_boundary (mode, type,
7495 saved_align))
7496 {
7497 warned = true;
7498 inform (input_location,
7499 "The ABI for passing parameters with %d-byte"
7500 " alignment has changed in GCC 4.6",
7501 align / BITS_PER_UNIT);
7502 }
7503 }
7504
7505 return align;
7506 }
7507
7508 /* Return true if N is a possible register number of function value. */
7509
7510 static bool
7511 ix86_function_value_regno_p (const unsigned int regno)
7512 {
7513 switch (regno)
7514 {
7515 case AX_REG:
7516 case DX_REG:
7517 return true;
7518 case DI_REG:
7519 case SI_REG:
7520 return TARGET_64BIT && ix86_abi != MS_ABI;
7521
7522 /* Complex values are returned in %st(0)/%st(1) pair. */
7523 case ST0_REG:
7524 case ST1_REG:
7525 /* TODO: The function should depend on current function ABI but
7526 builtins.c would need updating then. Therefore we use the
7527 default ABI. */
7528 if (TARGET_64BIT && ix86_abi == MS_ABI)
7529 return false;
7530 return TARGET_FLOAT_RETURNS_IN_80387;
7531
7532 /* Complex values are returned in %xmm0/%xmm1 pair. */
7533 case XMM0_REG:
7534 case XMM1_REG:
7535 return TARGET_SSE;
7536
7537 case MM0_REG:
7538 if (TARGET_MACHO || TARGET_64BIT)
7539 return false;
7540 return TARGET_MMX;
7541 }
7542
7543 return false;
7544 }
7545
7546 /* Define how to find the value returned by a function.
7547 VALTYPE is the data type of the value (as a tree).
7548 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7549 otherwise, FUNC is 0. */
7550
7551 static rtx
7552 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7553 const_tree fntype, const_tree fn)
7554 {
7555 unsigned int regno;
7556
7557 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7558 we normally prevent this case when mmx is not available. However
7559 some ABIs may require the result to be returned like DImode. */
7560 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7561 regno = FIRST_MMX_REG;
7562
7563 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7564 we prevent this case when sse is not available. However some ABIs
7565 may require the result to be returned like integer TImode. */
7566 else if (mode == TImode
7567 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7568 regno = FIRST_SSE_REG;
7569
7570 /* 32-byte vector modes in %ymm0. */
7571 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7572 regno = FIRST_SSE_REG;
7573
7574 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7575 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7576 regno = FIRST_FLOAT_REG;
7577 else
7578 /* Most things go in %eax. */
7579 regno = AX_REG;
7580
7581 /* Override FP return register with %xmm0 for local functions when
7582 SSE math is enabled or for functions with sseregparm attribute. */
7583 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7584 {
7585 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7586 if ((sse_level >= 1 && mode == SFmode)
7587 || (sse_level == 2 && mode == DFmode))
7588 regno = FIRST_SSE_REG;
7589 }
7590
7591 /* OImode shouldn't be used directly. */
7592 gcc_assert (mode != OImode);
7593
7594 return gen_rtx_REG (orig_mode, regno);
7595 }
7596
7597 static rtx
7598 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7599 const_tree valtype)
7600 {
7601 rtx ret;
7602
7603 /* Handle libcalls, which don't provide a type node. */
7604 if (valtype == NULL)
7605 {
7606 unsigned int regno;
7607
7608 switch (mode)
7609 {
7610 case SFmode:
7611 case SCmode:
7612 case DFmode:
7613 case DCmode:
7614 case TFmode:
7615 case SDmode:
7616 case DDmode:
7617 case TDmode:
7618 regno = FIRST_SSE_REG;
7619 break;
7620 case XFmode:
7621 case XCmode:
7622 regno = FIRST_FLOAT_REG;
7623 break;
7624 case TCmode:
7625 return NULL;
7626 default:
7627 regno = AX_REG;
7628 }
7629
7630 return gen_rtx_REG (mode, regno);
7631 }
7632 else if (POINTER_TYPE_P (valtype))
7633 {
7634 /* Pointers are always returned in word_mode. */
7635 mode = word_mode;
7636 }
7637
7638 ret = construct_container (mode, orig_mode, valtype, 1,
7639 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7640 x86_64_int_return_registers, 0);
7641
7642 /* For zero sized structures, construct_container returns NULL, but we
7643 need to keep the rest of the compiler happy by returning a meaningful value. */
7644 if (!ret)
7645 ret = gen_rtx_REG (orig_mode, AX_REG);
7646
7647 return ret;
7648 }
7649
7650 static rtx
7651 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7652 const_tree valtype)
7653 {
7654 unsigned int regno = AX_REG;
7655
7656 if (TARGET_SSE)
7657 {
7658 switch (GET_MODE_SIZE (mode))
7659 {
7660 case 16:
7661 if (valtype != NULL_TREE
7662 && !VECTOR_INTEGER_TYPE_P (valtype)
7664 && !INTEGRAL_TYPE_P (valtype)
7665 && !VECTOR_FLOAT_TYPE_P (valtype))
7666 break;
7667 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7668 && !COMPLEX_MODE_P (mode))
7669 regno = FIRST_SSE_REG;
7670 break;
7671 case 8:
7672 case 4:
7673 if (mode == SFmode || mode == DFmode)
7674 regno = FIRST_SSE_REG;
7675 break;
7676 default:
7677 break;
7678 }
7679 }
7680 return gen_rtx_REG (orig_mode, regno);
7681 }
7682
7683 static rtx
7684 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7685 enum machine_mode orig_mode, enum machine_mode mode)
7686 {
7687 const_tree fn, fntype;
7688
7689 fn = NULL_TREE;
7690 if (fntype_or_decl && DECL_P (fntype_or_decl))
7691 fn = fntype_or_decl;
7692 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7693
7694 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7695 return function_value_ms_64 (orig_mode, mode, valtype);
7696 else if (TARGET_64BIT)
7697 return function_value_64 (orig_mode, mode, valtype);
7698 else
7699 return function_value_32 (orig_mode, mode, fntype, fn);
7700 }
7701
7702 static rtx
7703 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7704 bool outgoing ATTRIBUTE_UNUSED)
7705 {
7706 enum machine_mode mode, orig_mode;
7707
7708 orig_mode = TYPE_MODE (valtype);
7709 mode = type_natural_mode (valtype, NULL);
7710 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7711 }
7712
7713 /* Pointer function arguments and return values are promoted to
7714 word_mode. */
7715
7716 static enum machine_mode
7717 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7718 int *punsignedp, const_tree fntype,
7719 int for_return)
7720 {
7721 if (type != NULL_TREE && POINTER_TYPE_P (type))
7722 {
7723 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7724 return word_mode;
7725 }
7726 return default_promote_function_mode (type, mode, punsignedp, fntype,
7727 for_return);
7728 }
7729
7730 /* Return true if a structure, union or array with MODE containing FIELD
7731 should be accessed using BLKmode. */
7732
7733 static bool
7734 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7735 {
7736 /* Union with XFmode must be in BLKmode. */
7737 return (mode == XFmode
7738 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7739 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7740 }
7741
7742 rtx
7743 ix86_libcall_value (enum machine_mode mode)
7744 {
7745 return ix86_function_value_1 (NULL, NULL, mode, mode);
7746 }
7747
7748 /* Return true iff type is returned in memory. */
7749
7750 static bool ATTRIBUTE_UNUSED
7751 return_in_memory_32 (const_tree type, enum machine_mode mode)
7752 {
7753 HOST_WIDE_INT size;
7754
7755 if (mode == BLKmode)
7756 return true;
7757
7758 size = int_size_in_bytes (type);
7759
7760 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7761 return false;
7762
7763 if (VECTOR_MODE_P (mode) || mode == TImode)
7764 {
7765 /* User-created vectors small enough to fit in EAX. */
7766 if (size < 8)
7767 return false;
7768
7769 /* MMX/3dNow values are returned in MM0,
7770 except when it doesn't exist or the ABI prescribes otherwise. */
7771 if (size == 8)
7772 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7773
7774 /* SSE values are returned in XMM0, except when it doesn't exist. */
7775 if (size == 16)
7776 return !TARGET_SSE;
7777
7778 /* AVX values are returned in YMM0, except when it doesn't exist. */
7779 if (size == 32)
7780 return !TARGET_AVX;
7781 }
7782
7783 if (mode == XFmode)
7784 return false;
7785
7786 if (size > 12)
7787 return true;
7788
7789 /* OImode shouldn't be used directly. */
7790 gcc_assert (mode != OImode);
7791
7792 return false;
7793 }
7794
7795 static bool ATTRIBUTE_UNUSED
7796 return_in_memory_64 (const_tree type, enum machine_mode mode)
7797 {
7798 int needed_intregs, needed_sseregs;
7799 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7800 }
7801
7802 static bool ATTRIBUTE_UNUSED
7803 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7804 {
7805 HOST_WIDE_INT size = int_size_in_bytes (type);
7806
7807 /* __m128 is returned in xmm0. */
7808 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7809 || VECTOR_FLOAT_TYPE_P (type))
7810 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7811 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7812 return false;
7813
7814 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7815 return size != 1 && size != 2 && size != 4 && size != 8;
7816 }
7817
7818 static bool
7819 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7820 {
7821 #ifdef SUBTARGET_RETURN_IN_MEMORY
7822 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7823 #else
7824 const enum machine_mode mode = type_natural_mode (type, NULL);
7825
7826 if (TARGET_64BIT)
7827 {
7828 if (ix86_function_type_abi (fntype) == MS_ABI)
7829 return return_in_memory_ms_64 (type, mode);
7830 else
7831 return return_in_memory_64 (type, mode);
7832 }
7833 else
7834 return return_in_memory_32 (type, mode);
7835 #endif
7836 }
7837
7838 /* When returning SSE vector types, we have a choice of either
7839 (1) being abi incompatible with a -march switch, or
7840 (2) generating an error.
7841 Given no good solution, I think the safest thing is one warning.
7842 The user won't be able to use -Werror, but....
7843
7844 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7845 called in response to actually generating a caller or callee that
7846 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7847 via aggregate_value_p for general type probing from tree-ssa. */
7848
7849 static rtx
7850 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7851 {
7852 static bool warnedsse, warnedmmx;
7853
7854 if (!TARGET_64BIT && type)
7855 {
7856 /* Look at the return type of the function, not the function type. */
7857 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7858
7859 if (!TARGET_SSE && !warnedsse)
7860 {
7861 if (mode == TImode
7862 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7863 {
7864 warnedsse = true;
7865 warning (0, "SSE vector return without SSE enabled "
7866 "changes the ABI");
7867 }
7868 }
7869
7870 if (!TARGET_MMX && !warnedmmx)
7871 {
7872 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7873 {
7874 warnedmmx = true;
7875 warning (0, "MMX vector return without MMX enabled "
7876 "changes the ABI");
7877 }
7878 }
7879 }
7880
7881 return NULL;
7882 }
7883
7884 \f
7885 /* Create the va_list data type. */
7886
7887 /* Returns the calling convention specific va_list data type.
7888 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
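 /* As an illustration, the SysV record built below corresponds to the
    familiar C-level layout:

      typedef struct __va_list_tag {
        unsigned int gp_offset;
        unsigned int fp_offset;
        void *overflow_arg_area;
        void *reg_save_area;
      } __va_list_tag;

    with va_list itself being a one-element array of this record.
    gp_offset and fp_offset count bytes into reg_save_area; the GPR slots
    come first, followed by the SSE register slots.  */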
7889
7890 static tree
7891 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7892 {
7893 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7894
7895 /* For i386 we use a plain pointer to the argument area. */
7896 if (!TARGET_64BIT || abi == MS_ABI)
7897 return build_pointer_type (char_type_node);
7898
7899 record = lang_hooks.types.make_type (RECORD_TYPE);
7900 type_decl = build_decl (BUILTINS_LOCATION,
7901 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7902
7903 f_gpr = build_decl (BUILTINS_LOCATION,
7904 FIELD_DECL, get_identifier ("gp_offset"),
7905 unsigned_type_node);
7906 f_fpr = build_decl (BUILTINS_LOCATION,
7907 FIELD_DECL, get_identifier ("fp_offset"),
7908 unsigned_type_node);
7909 f_ovf = build_decl (BUILTINS_LOCATION,
7910 FIELD_DECL, get_identifier ("overflow_arg_area"),
7911 ptr_type_node);
7912 f_sav = build_decl (BUILTINS_LOCATION,
7913 FIELD_DECL, get_identifier ("reg_save_area"),
7914 ptr_type_node);
7915
7916 va_list_gpr_counter_field = f_gpr;
7917 va_list_fpr_counter_field = f_fpr;
7918
7919 DECL_FIELD_CONTEXT (f_gpr) = record;
7920 DECL_FIELD_CONTEXT (f_fpr) = record;
7921 DECL_FIELD_CONTEXT (f_ovf) = record;
7922 DECL_FIELD_CONTEXT (f_sav) = record;
7923
7924 TYPE_STUB_DECL (record) = type_decl;
7925 TYPE_NAME (record) = type_decl;
7926 TYPE_FIELDS (record) = f_gpr;
7927 DECL_CHAIN (f_gpr) = f_fpr;
7928 DECL_CHAIN (f_fpr) = f_ovf;
7929 DECL_CHAIN (f_ovf) = f_sav;
7930
7931 layout_type (record);
7932
7933 /* The correct type is an array type of one element. */
7934 return build_array_type (record, build_index_type (size_zero_node));
7935 }
7936
7937 /* Set up the builtin va_list data type and, for 64-bit, the additional
7938 calling convention specific va_list data types. */
7939
7940 static tree
7941 ix86_build_builtin_va_list (void)
7942 {
7943 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7944
7945 /* Initialize abi specific va_list builtin types. */
7946 if (TARGET_64BIT)
7947 {
7948 tree t;
7949 if (ix86_abi == MS_ABI)
7950 {
7951 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7952 if (TREE_CODE (t) != RECORD_TYPE)
7953 t = build_variant_type_copy (t);
7954 sysv_va_list_type_node = t;
7955 }
7956 else
7957 {
7958 t = ret;
7959 if (TREE_CODE (t) != RECORD_TYPE)
7960 t = build_variant_type_copy (t);
7961 sysv_va_list_type_node = t;
7962 }
7963 if (ix86_abi != MS_ABI)
7964 {
7965 t = ix86_build_builtin_va_list_abi (MS_ABI);
7966 if (TREE_CODE (t) != RECORD_TYPE)
7967 t = build_variant_type_copy (t);
7968 ms_va_list_type_node = t;
7969 }
7970 else
7971 {
7972 t = ret;
7973 if (TREE_CODE (t) != RECORD_TYPE)
7974 t = build_variant_type_copy (t);
7975 ms_va_list_type_node = t;
7976 }
7977 }
7978
7979 return ret;
7980 }
7981
7982 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7983
7984 static void
7985 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7986 {
7987 rtx save_area, mem;
7988 alias_set_type set;
7989 int i, max;
7990
7991 /* GPR size of varargs save area. */
7992 if (cfun->va_list_gpr_size)
7993 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7994 else
7995 ix86_varargs_gpr_size = 0;
7996
7997 /* FPR size of varargs save area. We don't need it if we don't pass
7998 anything in SSE registers. */
7999 if (TARGET_SSE && cfun->va_list_fpr_size)
8000 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
8001 else
8002 ix86_varargs_fpr_size = 0;
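 /* With the SysV limits (X86_64_REGPARM_MAX == 6 general-purpose
    registers, X86_64_SSE_REGPARM_MAX == 8 SSE registers) the full save
    area is 6 * 8 == 48 bytes of GPRs followed by 8 * 16 == 128 bytes of
    SSE registers.  */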
8003
8004 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
8005 return;
8006
8007 save_area = frame_pointer_rtx;
8008 set = get_varargs_alias_set ();
8009
8010 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
8011 if (max > X86_64_REGPARM_MAX)
8012 max = X86_64_REGPARM_MAX;
8013
8014 for (i = cum->regno; i < max; i++)
8015 {
8016 mem = gen_rtx_MEM (word_mode,
8017 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
8018 MEM_NOTRAP_P (mem) = 1;
8019 set_mem_alias_set (mem, set);
8020 emit_move_insn (mem,
8021 gen_rtx_REG (word_mode,
8022 x86_64_int_parameter_registers[i]));
8023 }
8024
8025 if (ix86_varargs_fpr_size)
8026 {
8027 enum machine_mode smode;
8028 rtx label, test;
8029
8030 /* Now emit code to save SSE registers. The AX parameter contains number
8031 of SSE parameter registers used to call this function, though all we
8032 actually check here is the zero/non-zero status. */
8033
8034 label = gen_label_rtx ();
8035 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
8036 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
8037 label));
8038
8039 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
8040 we used movdqa (i.e. TImode) instead? Perhaps even better would
8041 be if we could determine the real mode of the data, via a hook
8042 into pass_stdarg. Ignore all that for now. */
8043 smode = V4SFmode;
8044 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
8045 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
8046
8047 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
8048 if (max > X86_64_SSE_REGPARM_MAX)
8049 max = X86_64_SSE_REGPARM_MAX;
8050
8051 for (i = cum->sse_regno; i < max; ++i)
8052 {
8053 mem = plus_constant (Pmode, save_area,
8054 i * 16 + ix86_varargs_gpr_size);
8055 mem = gen_rtx_MEM (smode, mem);
8056 MEM_NOTRAP_P (mem) = 1;
8057 set_mem_alias_set (mem, set);
8058 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
8059
8060 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
8061 }
8062
8063 emit_label (label);
8064 }
8065 }
8066
8067 static void
8068 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
8069 {
8070 alias_set_type set = get_varargs_alias_set ();
8071 int i;
8072
8073 /* Reset to zero, as there might be a sysv vaarg used
8074 before. */
8075 ix86_varargs_gpr_size = 0;
8076 ix86_varargs_fpr_size = 0;
8077
8078 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
8079 {
8080 rtx reg, mem;
8081
8082 mem = gen_rtx_MEM (Pmode,
8083 plus_constant (Pmode, virtual_incoming_args_rtx,
8084 i * UNITS_PER_WORD));
8085 MEM_NOTRAP_P (mem) = 1;
8086 set_mem_alias_set (mem, set);
8087
8088 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
8089 emit_move_insn (mem, reg);
8090 }
8091 }
8092
8093 static void
8094 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
8095 tree type, int *pretend_size ATTRIBUTE_UNUSED,
8096 int no_rtl)
8097 {
8098 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8099 CUMULATIVE_ARGS next_cum;
8100 tree fntype;
8101
8102 /* This argument doesn't appear to be used anymore. Which is good,
8103 because the old code here didn't suppress rtl generation. */
8104 gcc_assert (!no_rtl);
8105
8106 if (!TARGET_64BIT)
8107 return;
8108
8109 fntype = TREE_TYPE (current_function_decl);
8110
8111 /* For varargs, we do not want to skip the dummy va_dcl argument.
8112 For stdargs, we do want to skip the last named argument. */
8113 next_cum = *cum;
8114 if (stdarg_p (fntype))
8115 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
8116 true);
8117
8118 if (cum->call_abi == MS_ABI)
8119 setup_incoming_varargs_ms_64 (&next_cum);
8120 else
8121 setup_incoming_varargs_64 (&next_cum);
8122 }
8123
8124 /* Check whether TYPE is the char * kind of va_list. */
8125
8126 static bool
8127 is_va_list_char_pointer (tree type)
8128 {
8129 tree canonic;
8130
8131 /* For 32-bit it is always true. */
8132 if (!TARGET_64BIT)
8133 return true;
8134 canonic = ix86_canonical_va_list_type (type);
8135 return (canonic == ms_va_list_type_node
8136 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
8137 }
8138
8139 /* Implement va_start. */
8140
8141 static void
8142 ix86_va_start (tree valist, rtx nextarg)
8143 {
8144 HOST_WIDE_INT words, n_gpr, n_fpr;
8145 tree f_gpr, f_fpr, f_ovf, f_sav;
8146 tree gpr, fpr, ovf, sav, t;
8147 tree type;
8148 rtx ovf_rtx;
8149
8150 if (flag_split_stack
8151 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8152 {
8153 unsigned int scratch_regno;
8154
8155 /* When we are splitting the stack, we can't refer to the stack
8156 arguments using internal_arg_pointer, because they may be on
8157 the old stack. The split stack prologue will arrange to
8158 leave a pointer to the old stack arguments in a scratch
8159 register, which we here copy to a pseudo-register. The split
8160 stack prologue can't set the pseudo-register directly because
8161 it (the prologue) runs before any registers have been saved. */
8162
8163 scratch_regno = split_stack_prologue_scratch_regno ();
8164 if (scratch_regno != INVALID_REGNUM)
8165 {
8166 rtx reg, seq;
8167
8168 reg = gen_reg_rtx (Pmode);
8169 cfun->machine->split_stack_varargs_pointer = reg;
8170
8171 start_sequence ();
8172 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
8173 seq = get_insns ();
8174 end_sequence ();
8175
8176 push_topmost_sequence ();
8177 emit_insn_after (seq, entry_of_function ());
8178 pop_topmost_sequence ();
8179 }
8180 }
8181
8182 /* Only 64bit target needs something special. */
8183 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8184 {
8185 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8186 std_expand_builtin_va_start (valist, nextarg);
8187 else
8188 {
8189 rtx va_r, next;
8190
8191 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
8192 next = expand_binop (ptr_mode, add_optab,
8193 cfun->machine->split_stack_varargs_pointer,
8194 crtl->args.arg_offset_rtx,
8195 NULL_RTX, 0, OPTAB_LIB_WIDEN);
8196 convert_move (va_r, next, 0);
8197 }
8198 return;
8199 }
8200
8201 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8202 f_fpr = DECL_CHAIN (f_gpr);
8203 f_ovf = DECL_CHAIN (f_fpr);
8204 f_sav = DECL_CHAIN (f_ovf);
8205
8206 valist = build_simple_mem_ref (valist);
8207 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8208 /* The following should be folded into the MEM_REF offset. */
8209 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8210 f_gpr, NULL_TREE);
8211 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8212 f_fpr, NULL_TREE);
8213 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8214 f_ovf, NULL_TREE);
8215 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8216 f_sav, NULL_TREE);
8217
8218 /* Count number of gp and fp argument registers used. */
8219 words = crtl->args.info.words;
8220 n_gpr = crtl->args.info.regno;
8221 n_fpr = crtl->args.info.sse_regno;
8222
8223 if (cfun->va_list_gpr_size)
8224 {
8225 type = TREE_TYPE (gpr);
8226 t = build2 (MODIFY_EXPR, type,
8227 gpr, build_int_cst (type, n_gpr * 8));
8228 TREE_SIDE_EFFECTS (t) = 1;
8229 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8230 }
8231
8232 if (TARGET_SSE && cfun->va_list_fpr_size)
8233 {
8234 type = TREE_TYPE (fpr);
8235 t = build2 (MODIFY_EXPR, type, fpr,
8236 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8237 TREE_SIDE_EFFECTS (t) = 1;
8238 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8239 }
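 /* E.g. for void f (int a, double b, ...) one GPR and one SSE register
    were consumed by the named arguments, so gp_offset starts at 8 and
    fp_offset at 6 * 8 + 16 == 64.  */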
8240
8241 /* Find the overflow area. */
8242 type = TREE_TYPE (ovf);
8243 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8244 ovf_rtx = crtl->args.internal_arg_pointer;
8245 else
8246 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8247 t = make_tree (type, ovf_rtx);
8248 if (words != 0)
8249 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8250 t = build2 (MODIFY_EXPR, type, ovf, t);
8251 TREE_SIDE_EFFECTS (t) = 1;
8252 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8253
8254 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8255 {
8256 /* Find the register save area.
8257 The function prologue saves it right above the stack frame. */
8258 type = TREE_TYPE (sav);
8259 t = make_tree (type, frame_pointer_rtx);
8260 if (!ix86_varargs_gpr_size)
8261 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8262 t = build2 (MODIFY_EXPR, type, sav, t);
8263 TREE_SIDE_EFFECTS (t) = 1;
8264 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8265 }
8266 }
8267
8268 /* Implement va_arg. */
8269
8270 static tree
8271 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8272 gimple_seq *post_p)
8273 {
8274 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8275 tree f_gpr, f_fpr, f_ovf, f_sav;
8276 tree gpr, fpr, ovf, sav, t;
8277 int size, rsize;
8278 tree lab_false, lab_over = NULL_TREE;
8279 tree addr, t2;
8280 rtx container;
8281 int indirect_p = 0;
8282 tree ptrtype;
8283 enum machine_mode nat_mode;
8284 unsigned int arg_boundary;
8285
8286 /* Only 64bit target needs something special. */
8287 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8288 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8289
8290 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8291 f_fpr = DECL_CHAIN (f_gpr);
8292 f_ovf = DECL_CHAIN (f_fpr);
8293 f_sav = DECL_CHAIN (f_ovf);
8294
8295 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8296 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8297 valist = build_va_arg_indirect_ref (valist);
8298 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8299 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8300 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8301
8302 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8303 if (indirect_p)
8304 type = build_pointer_type (type);
8305 size = int_size_in_bytes (type);
8306 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8307
8308 nat_mode = type_natural_mode (type, NULL);
8309 switch (nat_mode)
8310 {
8311 case V8SFmode:
8312 case V8SImode:
8313 case V32QImode:
8314 case V16HImode:
8315 case V4DFmode:
8316 case V4DImode:
8317 /* Unnamed 256bit vector mode parameters are passed on stack. */
8318 if (!TARGET_64BIT_MS_ABI)
8319 {
8320 container = NULL;
8321 break;
8322 }
8323
8324 default:
8325 container = construct_container (nat_mode, TYPE_MODE (type),
8326 type, 0, X86_64_REGPARM_MAX,
8327 X86_64_SSE_REGPARM_MAX, intreg,
8328 0);
8329 break;
8330 }
8331
8332 /* Pull the value out of the saved registers. */
8333
8334 addr = create_tmp_var (ptr_type_node, "addr");
8335
8336 if (container)
8337 {
8338 int needed_intregs, needed_sseregs;
8339 bool need_temp;
8340 tree int_addr, sse_addr;
8341
8342 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8343 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8344
8345 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8346
8347 need_temp = (!REG_P (container)
8348 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8349 || TYPE_ALIGN (type) > 128));
8350
8351 /* In case we are passing a structure, verify that it is a consecutive
8352 block in the register save area. If not, we need to do moves. */
8353 if (!need_temp && !REG_P (container))
8354 {
8355 /* Verify that all registers are strictly consecutive. */
8356 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8357 {
8358 int i;
8359
8360 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8361 {
8362 rtx slot = XVECEXP (container, 0, i);
8363 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8364 || INTVAL (XEXP (slot, 1)) != i * 16)
8365 need_temp = 1;
8366 }
8367 }
8368 else
8369 {
8370 int i;
8371
8372 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8373 {
8374 rtx slot = XVECEXP (container, 0, i);
8375 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8376 || INTVAL (XEXP (slot, 1)) != i * 8)
8377 need_temp = 1;
8378 }
8379 }
8380 }
8381 if (!need_temp)
8382 {
8383 int_addr = addr;
8384 sse_addr = addr;
8385 }
8386 else
8387 {
8388 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8389 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8390 }
8391
8392 /* First ensure that we fit completely in registers. */
8393 if (needed_intregs)
8394 {
8395 t = build_int_cst (TREE_TYPE (gpr),
8396 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8397 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8398 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8399 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8400 gimplify_and_add (t, pre_p);
8401 }
8402 if (needed_sseregs)
8403 {
8404 t = build_int_cst (TREE_TYPE (fpr),
8405 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8406 + X86_64_REGPARM_MAX * 8);
8407 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8408 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8409 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8410 gimplify_and_add (t, pre_p);
8411 }
8412
8413 /* Compute index to start of area used for integer regs. */
8414 if (needed_intregs)
8415 {
8416 /* int_addr = gpr + sav; */
8417 t = fold_build_pointer_plus (sav, gpr);
8418 gimplify_assign (int_addr, t, pre_p);
8419 }
8420 if (needed_sseregs)
8421 {
8422 /* sse_addr = fpr + sav; */
8423 t = fold_build_pointer_plus (sav, fpr);
8424 gimplify_assign (sse_addr, t, pre_p);
8425 }
8426 if (need_temp)
8427 {
8428 int i, prev_size = 0;
8429 tree temp = create_tmp_var (type, "va_arg_tmp");
8430
8431 /* addr = &temp; */
8432 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8433 gimplify_assign (addr, t, pre_p);
8434
8435 for (i = 0; i < XVECLEN (container, 0); i++)
8436 {
8437 rtx slot = XVECEXP (container, 0, i);
8438 rtx reg = XEXP (slot, 0);
8439 enum machine_mode mode = GET_MODE (reg);
8440 tree piece_type;
8441 tree addr_type;
8442 tree daddr_type;
8443 tree src_addr, src;
8444 int src_offset;
8445 tree dest_addr, dest;
8446 int cur_size = GET_MODE_SIZE (mode);
8447
8448 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8449 prev_size = INTVAL (XEXP (slot, 1));
8450 if (prev_size + cur_size > size)
8451 {
8452 cur_size = size - prev_size;
8453 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8454 if (mode == BLKmode)
8455 mode = QImode;
8456 }
8457 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8458 if (mode == GET_MODE (reg))
8459 addr_type = build_pointer_type (piece_type);
8460 else
8461 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8462 true);
8463 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8464 true);
8465
8466 if (SSE_REGNO_P (REGNO (reg)))
8467 {
8468 src_addr = sse_addr;
8469 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8470 }
8471 else
8472 {
8473 src_addr = int_addr;
8474 src_offset = REGNO (reg) * 8;
8475 }
8476 src_addr = fold_convert (addr_type, src_addr);
8477 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8478
8479 dest_addr = fold_convert (daddr_type, addr);
8480 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8481 if (cur_size == GET_MODE_SIZE (mode))
8482 {
8483 src = build_va_arg_indirect_ref (src_addr);
8484 dest = build_va_arg_indirect_ref (dest_addr);
8485
8486 gimplify_assign (dest, src, pre_p);
8487 }
8488 else
8489 {
8490 tree copy
8491 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8492 3, dest_addr, src_addr,
8493 size_int (cur_size));
8494 gimplify_and_add (copy, pre_p);
8495 }
8496 prev_size += cur_size;
8497 }
8498 }
8499
8500 if (needed_intregs)
8501 {
8502 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8503 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8504 gimplify_assign (gpr, t, pre_p);
8505 }
8506
8507 if (needed_sseregs)
8508 {
8509 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8510 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8511 gimplify_assign (fpr, t, pre_p);
8512 }
8513
8514 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8515
8516 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8517 }
8518
8519 /* ... otherwise out of the overflow area. */
8520
8521 /* When we align a parameter on the stack for the caller, if the
8522 parameter alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will
8523 be aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here
8524 with the caller. */
8525 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8526 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8527 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8528
8529 /* Care for on-stack alignment if needed. */
8530 if (arg_boundary <= 64 || size == 0)
8531 t = ovf;
8532 else
8533 {
8534 HOST_WIDE_INT align = arg_boundary / 8;
8535 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8536 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8537 build_int_cst (TREE_TYPE (t), -align));
8538 }
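/* The two statements above compute t = (ovf + align - 1) & -align, i.e.
   they round the overflow-area pointer up to the next multiple of the
   argument alignment before it is stored into ADDR.  For example, with
   align == 32 a pointer ending in ...0x28 becomes ...0x40.  */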
8539
8540 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8541 gimplify_assign (addr, t, pre_p);
8542
8543 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8544 gimplify_assign (unshare_expr (ovf), t, pre_p);
8545
8546 if (container)
8547 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8548
8549 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8550 addr = fold_convert (ptrtype, addr);
8551
8552 if (indirect_p)
8553 addr = build_va_arg_indirect_ref (addr);
8554 return build_va_arg_indirect_ref (addr);
8555 }
8556 \f
8557 /* Return true if OPNUM's MEM should be matched
8558 in movabs* patterns. */
8559
8560 bool
8561 ix86_check_movabs (rtx insn, int opnum)
8562 {
8563 rtx set, mem;
8564
8565 set = PATTERN (insn);
8566 if (GET_CODE (set) == PARALLEL)
8567 set = XVECEXP (set, 0, 0);
8568 gcc_assert (GET_CODE (set) == SET);
8569 mem = XEXP (set, opnum);
8570 while (GET_CODE (mem) == SUBREG)
8571 mem = SUBREG_REG (mem);
8572 gcc_assert (MEM_P (mem));
8573 return volatile_ok || !MEM_VOLATILE_P (mem);
8574 }
8575 \f
8576 /* Initialize the table of extra 80387 mathematical constants. */
8577
8578 static void
8579 init_ext_80387_constants (void)
8580 {
8581 static const char * cst[5] =
8582 {
8583 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8584 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8585 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8586 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8587 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8588 };
8589 int i;
8590
8591 for (i = 0; i < 5; i++)
8592 {
8593 real_from_string (&ext_80387_constants_table[i], cst[i]);
8594 /* Ensure each constant is rounded to XFmode precision. */
8595 real_convert (&ext_80387_constants_table[i],
8596 XFmode, &ext_80387_constants_table[i]);
8597 }
8598
8599 ext_80387_constants_init = 1;
8600 }
8601
8602 /* Return non-zero if the constant is something that
8603 can be loaded with a special instruction. */
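/* The return value encodes which instruction loads the constant (see
   standard_80387_constant_opcode below):
     -1  not an 80387 constant       5  fldl2e  (log2(e))
      0  no special instruction      6  fldl2t  (log2(10))
      1  fldz    (+0.0)              7  fldpi   (pi)
      2  fld1    (+1.0)              8  -0.0  (fldz;fchs)
      3  fldlg2  (log10(2))          9  -1.0  (fld1;fchs)
      4  fldln2  (ln(2))  */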
8604
8605 int
8606 standard_80387_constant_p (rtx x)
8607 {
8608 enum machine_mode mode = GET_MODE (x);
8609
8610 REAL_VALUE_TYPE r;
8611
8612 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8613 return -1;
8614
8615 if (x == CONST0_RTX (mode))
8616 return 1;
8617 if (x == CONST1_RTX (mode))
8618 return 2;
8619
8620 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8621
8622 /* For XFmode constants, try to find a special 80387 instruction when
8623 optimizing for size or on those CPUs that benefit from them. */
8624 if (mode == XFmode
8625 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8626 {
8627 int i;
8628
8629 if (! ext_80387_constants_init)
8630 init_ext_80387_constants ();
8631
8632 for (i = 0; i < 5; i++)
8633 if (real_identical (&r, &ext_80387_constants_table[i]))
8634 return i + 3;
8635 }
8636
8637 /* A load of the constant -0.0 or -1.0 will be split into an
8638 fldz;fchs or fld1;fchs sequence. */
8639 if (real_isnegzero (&r))
8640 return 8;
8641 if (real_identical (&r, &dconstm1))
8642 return 9;
8643
8644 return 0;
8645 }
8646
8647 /* Return the opcode of the special instruction to be used to load
8648 the constant X. */
8649
8650 const char *
8651 standard_80387_constant_opcode (rtx x)
8652 {
8653 switch (standard_80387_constant_p (x))
8654 {
8655 case 1:
8656 return "fldz";
8657 case 2:
8658 return "fld1";
8659 case 3:
8660 return "fldlg2";
8661 case 4:
8662 return "fldln2";
8663 case 5:
8664 return "fldl2e";
8665 case 6:
8666 return "fldl2t";
8667 case 7:
8668 return "fldpi";
8669 case 8:
8670 case 9:
8671 return "#";
8672 default:
8673 gcc_unreachable ();
8674 }
8675 }
8676
8677 /* Return the CONST_DOUBLE representing the 80387 constant that is
8678 loaded by the specified special instruction. The argument IDX
8679 matches the return value from standard_80387_constant_p. */
8680
8681 rtx
8682 standard_80387_constant_rtx (int idx)
8683 {
8684 int i;
8685
8686 if (! ext_80387_constants_init)
8687 init_ext_80387_constants ();
8688
8689 switch (idx)
8690 {
8691 case 3:
8692 case 4:
8693 case 5:
8694 case 6:
8695 case 7:
8696 i = idx - 3;
8697 break;
8698
8699 default:
8700 gcc_unreachable ();
8701 }
8702
8703 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8704 XFmode);
8705 }
8706
8707 /* Return 1 if X is all zeros and 2 if X is all ones, in a supported
8708 SSE/AVX vector mode. */
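/* A return value of 1 corresponds to the xorps/pxor zeroing idiom and 2
   to the pcmpeqd all-ones idiom emitted by standard_sse_constant_opcode
   below.  */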
8709
8710 int
8711 standard_sse_constant_p (rtx x)
8712 {
8713 enum machine_mode mode = GET_MODE (x);
8714
8715 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8716 return 1;
8717 if (vector_all_ones_operand (x, mode))
8718 switch (mode)
8719 {
8720 case V16QImode:
8721 case V8HImode:
8722 case V4SImode:
8723 case V2DImode:
8724 if (TARGET_SSE2)
8725 return 2;
8726 case V32QImode:
8727 case V16HImode:
8728 case V8SImode:
8729 case V4DImode:
8730 if (TARGET_AVX2)
8731 return 2;
8732 default:
8733 break;
8734 }
8735
8736 return 0;
8737 }
8738
8739 /* Return the opcode of the special instruction to be used to load
8740 the constant X. */
8741
8742 const char *
8743 standard_sse_constant_opcode (rtx insn, rtx x)
8744 {
8745 switch (standard_sse_constant_p (x))
8746 {
8747 case 1:
8748 switch (get_attr_mode (insn))
8749 {
8750 case MODE_TI:
8751 return "%vpxor\t%0, %d0";
8752 case MODE_V2DF:
8753 return "%vxorpd\t%0, %d0";
8754 case MODE_V4SF:
8755 return "%vxorps\t%0, %d0";
8756
8757 case MODE_OI:
8758 return "vpxor\t%x0, %x0, %x0";
8759 case MODE_V4DF:
8760 return "vxorpd\t%x0, %x0, %x0";
8761 case MODE_V8SF:
8762 return "vxorps\t%x0, %x0, %x0";
8763
8764 default:
8765 break;
8766 }
8767
8768 case 2:
8769 if (get_attr_mode (insn) == MODE_XI
8770 || get_attr_mode (insn) == MODE_V8DF
8771 || get_attr_mode (insn) == MODE_V16SF)
8772 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
8773 if (TARGET_AVX)
8774 return "vpcmpeqd\t%0, %0, %0";
8775 else
8776 return "pcmpeqd\t%0, %0";
8777
8778 default:
8779 break;
8780 }
8781 gcc_unreachable ();
8782 }
8783
8784 /* Return true if OP contains a symbol reference. */
8785
8786 bool
8787 symbolic_reference_mentioned_p (rtx op)
8788 {
8789 const char *fmt;
8790 int i;
8791
8792 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8793 return true;
8794
8795 fmt = GET_RTX_FORMAT (GET_CODE (op));
8796 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8797 {
8798 if (fmt[i] == 'E')
8799 {
8800 int j;
8801
8802 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8803 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8804 return true;
8805 }
8806
8807 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8808 return true;
8809 }
8810
8811 return false;
8812 }
8813
8814 /* Return true if it is appropriate to emit `ret' instructions in the
8815 body of a function. Do this only if the epilogue is simple, needing a
8816 couple of insns. Prior to reloading, we can't tell how many registers
8817 must be saved, so return false then. Return false if there is no frame
8818 marker to de-allocate. */
8819
8820 bool
8821 ix86_can_use_return_insn_p (void)
8822 {
8823 struct ix86_frame frame;
8824
8825 if (! reload_completed || frame_pointer_needed)
8826 return 0;
8827
8828 /* Don't allow more than 32k pop, since that's all we can do
8829 with one instruction. */
8830 if (crtl->args.pops_args && crtl->args.size >= 32768)
8831 return 0;
8832
8833 ix86_compute_frame_layout (&frame);
8834 return (frame.stack_pointer_offset == UNITS_PER_WORD
8835 && (frame.nregs + frame.nsseregs) == 0);
8836 }
8837 \f
8838 /* Value should be nonzero if functions must have frame pointers.
8839 Zero means the frame pointer need not be set up (and parms may
8840 be accessed via the stack pointer) in functions that seem suitable. */
8841
8842 static bool
8843 ix86_frame_pointer_required (void)
8844 {
8845 /* If we accessed previous frames, then the generated code expects
8846 to be able to access the saved ebp value in our frame. */
8847 if (cfun->machine->accesses_prev_frame)
8848 return true;
8849
8850 /* Several x86 OSes need a frame pointer for other reasons,
8851 usually pertaining to setjmp. */
8852 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8853 return true;
8854
8855 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
8856 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8857 return true;
8858
8859 /* With Win64 SEH, very large frames need a frame pointer, as the
8860 maximum stack allocation is 4GB. */
8861 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8862 return true;
8863
8864 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8865 turns off the frame pointer by default. Turn it back on now if
8866 we've not got a leaf function. */
8867 if (TARGET_OMIT_LEAF_FRAME_POINTER
8868 && (!crtl->is_leaf
8869 || ix86_current_function_calls_tls_descriptor))
8870 return true;
8871
8872 if (crtl->profile && !flag_fentry)
8873 return true;
8874
8875 return false;
8876 }
8877
8878 /* Record that the current function accesses previous call frames. */
8879
8880 void
8881 ix86_setup_frame_addresses (void)
8882 {
8883 cfun->machine->accesses_prev_frame = 1;
8884 }
8885 \f
8886 #ifndef USE_HIDDEN_LINKONCE
8887 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8888 # define USE_HIDDEN_LINKONCE 1
8889 # else
8890 # define USE_HIDDEN_LINKONCE 0
8891 # endif
8892 #endif
8893
8894 static int pic_labels_used;
8895
8896 /* Fills in the label name that should be used for a pc thunk for
8897 the given register. */
8898
8899 static void
8900 get_pc_thunk_name (char name[32], unsigned int regno)
8901 {
8902 gcc_assert (!TARGET_64BIT);
8903
8904 if (USE_HIDDEN_LINKONCE)
8905 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8906 else
8907 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8908 }
8909
8910
8911 /* This function generates, for -fpic, the pc thunks: each thunk loads
8912 its register with the return address of the caller and then returns. */
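/* Each emitted thunk has the form (32-bit AT&T syntax, illustrative):
       __x86.get_pc_thunk.bx:
           movl  (%esp), %ebx
           ret
   i.e. it copies its own return address, which is the address of the
   instruction following the call, into the chosen register.  */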
8913
8914 static void
8915 ix86_code_end (void)
8916 {
8917 rtx xops[2];
8918 int regno;
8919
8920 for (regno = AX_REG; regno <= SP_REG; regno++)
8921 {
8922 char name[32];
8923 tree decl;
8924
8925 if (!(pic_labels_used & (1 << regno)))
8926 continue;
8927
8928 get_pc_thunk_name (name, regno);
8929
8930 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8931 get_identifier (name),
8932 build_function_type_list (void_type_node, NULL_TREE));
8933 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8934 NULL_TREE, void_type_node);
8935 TREE_PUBLIC (decl) = 1;
8936 TREE_STATIC (decl) = 1;
8937 DECL_IGNORED_P (decl) = 1;
8938
8939 #if TARGET_MACHO
8940 if (TARGET_MACHO)
8941 {
8942 switch_to_section (darwin_sections[text_coal_section]);
8943 fputs ("\t.weak_definition\t", asm_out_file);
8944 assemble_name (asm_out_file, name);
8945 fputs ("\n\t.private_extern\t", asm_out_file);
8946 assemble_name (asm_out_file, name);
8947 putc ('\n', asm_out_file);
8948 ASM_OUTPUT_LABEL (asm_out_file, name);
8949 DECL_WEAK (decl) = 1;
8950 }
8951 else
8952 #endif
8953 if (USE_HIDDEN_LINKONCE)
8954 {
8955 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8956
8957 targetm.asm_out.unique_section (decl, 0);
8958 switch_to_section (get_named_section (decl, NULL, 0));
8959
8960 targetm.asm_out.globalize_label (asm_out_file, name);
8961 fputs ("\t.hidden\t", asm_out_file);
8962 assemble_name (asm_out_file, name);
8963 putc ('\n', asm_out_file);
8964 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8965 }
8966 else
8967 {
8968 switch_to_section (text_section);
8969 ASM_OUTPUT_LABEL (asm_out_file, name);
8970 }
8971
8972 DECL_INITIAL (decl) = make_node (BLOCK);
8973 current_function_decl = decl;
8974 init_function_start (decl);
8975 first_function_block_is_cold = false;
8976 /* Make sure unwind info is emitted for the thunk if needed. */
8977 final_start_function (emit_barrier (), asm_out_file, 1);
8978
8979 /* Pad stack IP move with 4 instructions (two NOPs count
8980 as one instruction). */
8981 if (TARGET_PAD_SHORT_FUNCTION)
8982 {
8983 int i = 8;
8984
8985 while (i--)
8986 fputs ("\tnop\n", asm_out_file);
8987 }
8988
8989 xops[0] = gen_rtx_REG (Pmode, regno);
8990 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8991 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8992 output_asm_insn ("%!ret", NULL);
8993 final_end_function ();
8994 init_insn_lengths ();
8995 free_after_compilation (cfun);
8996 set_cfun (NULL);
8997 current_function_decl = NULL;
8998 }
8999
9000 if (flag_split_stack)
9001 file_end_indicate_split_stack ();
9002 }
9003
9004 /* Emit code for the SET_GOT patterns. */
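/* For the common 32-bit ELF PIC case the emitted sequence looks like
   (illustrative):
       call  __x86.get_pc_thunk.bx
       addl  $_GLOBAL_OFFSET_TABLE_, %ebx
   while for -fno-pic a plain mov of a local label is used instead.  */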
9005
9006 const char *
9007 output_set_got (rtx dest, rtx label)
9008 {
9009 rtx xops[3];
9010
9011 xops[0] = dest;
9012
9013 if (TARGET_VXWORKS_RTP && flag_pic)
9014 {
9015 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
9016 xops[2] = gen_rtx_MEM (Pmode,
9017 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
9018 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
9019
9020 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
9021 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
9022 an unadorned address. */
9023 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
9024 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
9025 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
9026 return "";
9027 }
9028
9029 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
9030
9031 if (!flag_pic)
9032 {
9033 if (TARGET_MACHO)
9034 /* We don't need a pic base, we're not producing pic. */
9035 gcc_unreachable ();
9036
9037 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
9038 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
9039 targetm.asm_out.internal_label (asm_out_file, "L",
9040 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
9041 }
9042 else
9043 {
9044 char name[32];
9045 get_pc_thunk_name (name, REGNO (dest));
9046 pic_labels_used |= 1 << REGNO (dest);
9047
9048 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
9049 xops[2] = gen_rtx_MEM (QImode, xops[2]);
9050 output_asm_insn ("%!call\t%X2", xops);
9051
9052 #if TARGET_MACHO
9053 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
9054 This is what will be referenced by the Mach-O PIC subsystem. */
9055 if (machopic_should_output_picbase_label () || !label)
9056 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
9057
9058 /* When we are restoring the pic base at the site of a nonlocal label,
9059 and we decided to emit the pic base above, we will still output a
9060 local label used for calculating the correction offset (even though
9061 the offset will be 0 in that case). */
9062 if (label)
9063 targetm.asm_out.internal_label (asm_out_file, "L",
9064 CODE_LABEL_NUMBER (label));
9065 #endif
9066 }
9067
9068 if (!TARGET_MACHO)
9069 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
9070
9071 return "";
9072 }
9073
9074 /* Generate a "push" pattern for input ARG. */
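/* The RTL produced is, e.g. on a 64-bit target (illustrative):
       (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI arg))
   with the frame-state field sp_offset (and cfa_offset, when the CFA is
   the stack pointer) updated accordingly.  */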
9075
9076 static rtx
9077 gen_push (rtx arg)
9078 {
9079 struct machine_function *m = cfun->machine;
9080
9081 if (m->fs.cfa_reg == stack_pointer_rtx)
9082 m->fs.cfa_offset += UNITS_PER_WORD;
9083 m->fs.sp_offset += UNITS_PER_WORD;
9084
9085 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9086 arg = gen_rtx_REG (word_mode, REGNO (arg));
9087
9088 return gen_rtx_SET (VOIDmode,
9089 gen_rtx_MEM (word_mode,
9090 gen_rtx_PRE_DEC (Pmode,
9091 stack_pointer_rtx)),
9092 arg);
9093 }
9094
9095 /* Generate a "pop" pattern for input ARG. */
9096
9097 static rtx
9098 gen_pop (rtx arg)
9099 {
9100 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9101 arg = gen_rtx_REG (word_mode, REGNO (arg));
9102
9103 return gen_rtx_SET (VOIDmode,
9104 arg,
9105 gen_rtx_MEM (word_mode,
9106 gen_rtx_POST_INC (Pmode,
9107 stack_pointer_rtx)));
9108 }
9109
9110 /* Return the number of an unused call-clobbered register available for
9111 the entire function, or INVALID_REGNUM if there is none. */
9112
9113 static unsigned int
9114 ix86_select_alt_pic_regnum (void)
9115 {
9116 if (crtl->is_leaf
9117 && !crtl->profile
9118 && !ix86_current_function_calls_tls_descriptor)
9119 {
9120 int i, drap;
9121 /* Can't use the same register for both PIC and DRAP. */
9122 if (crtl->drap_reg)
9123 drap = REGNO (crtl->drap_reg);
9124 else
9125 drap = -1;
9126 for (i = 2; i >= 0; --i)
9127 if (i != drap && !df_regs_ever_live_p (i))
9128 return i;
9129 }
9130
9131 return INVALID_REGNUM;
9132 }
9133
9134 /* Return TRUE if we need to save REGNO. */
9135
9136 static bool
9137 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
9138 {
9139 if (pic_offset_table_rtx
9140 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
9141 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
9142 || crtl->profile
9143 || crtl->calls_eh_return
9144 || crtl->uses_const_pool
9145 || cfun->has_nonlocal_label))
9146 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
9147
9148 if (crtl->calls_eh_return && maybe_eh_return)
9149 {
9150 unsigned i;
9151 for (i = 0; ; i++)
9152 {
9153 unsigned test = EH_RETURN_DATA_REGNO (i);
9154 if (test == INVALID_REGNUM)
9155 break;
9156 if (test == regno)
9157 return true;
9158 }
9159 }
9160
9161 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
9162 return true;
9163
9164 return (df_regs_ever_live_p (regno)
9165 && !call_used_regs[regno]
9166 && !fixed_regs[regno]
9167 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
9168 }
9169
9170 /* Return the number of saved general purpose registers. */
9171
9172 static int
9173 ix86_nsaved_regs (void)
9174 {
9175 int nregs = 0;
9176 int regno;
9177
9178 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9179 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9180 nregs ++;
9181 return nregs;
9182 }
9183
9184 /* Return the number of saved SSE registers. */
9185
9186 static int
9187 ix86_nsaved_sseregs (void)
9188 {
9189 int nregs = 0;
9190 int regno;
9191
9192 if (!TARGET_64BIT_MS_ABI)
9193 return 0;
9194 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9195 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9196 nregs ++;
9197 return nregs;
9198 }
9199
9200 /* Given FROM and TO register numbers, say whether this elimination is
9201 allowed. If stack alignment is needed, we can only replace argument
9202 pointer with hard frame pointer, or replace frame pointer with stack
9203 pointer. Otherwise, frame pointer elimination is automatically
9204 handled and all other eliminations are valid. */
9205
9206 static bool
9207 ix86_can_eliminate (const int from, const int to)
9208 {
9209 if (stack_realign_fp)
9210 return ((from == ARG_POINTER_REGNUM
9211 && to == HARD_FRAME_POINTER_REGNUM)
9212 || (from == FRAME_POINTER_REGNUM
9213 && to == STACK_POINTER_REGNUM));
9214 else
9215 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
9216 }
9217
9218 /* Return the offset between two registers, one to be eliminated, and the other
9219 its replacement, at the start of a routine. */
9220
9221 HOST_WIDE_INT
9222 ix86_initial_elimination_offset (int from, int to)
9223 {
9224 struct ix86_frame frame;
9225 ix86_compute_frame_layout (&frame);
9226
9227 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9228 return frame.hard_frame_pointer_offset;
9229 else if (from == FRAME_POINTER_REGNUM
9230 && to == HARD_FRAME_POINTER_REGNUM)
9231 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9232 else
9233 {
9234 gcc_assert (to == STACK_POINTER_REGNUM);
9235
9236 if (from == ARG_POINTER_REGNUM)
9237 return frame.stack_pointer_offset;
9238
9239 gcc_assert (from == FRAME_POINTER_REGNUM);
9240 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9241 }
9242 }
9243
9244 /* In a dynamically-aligned function, we can't know the offset from
9245 stack pointer to frame pointer, so we must ensure that setjmp
9246 eliminates fp against the hard fp (%ebp) rather than trying to
9247 index from %esp up to the top of the frame across a gap that is
9248 of unknown (at compile-time) size. */
9249 static rtx
9250 ix86_builtin_setjmp_frame_value (void)
9251 {
9252 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9253 }
9254
9255 /* When using -fsplit-stack, the allocation routines set a field in
9256 the TCB to the bottom of the stack plus this much space, measured
9257 in bytes. */
9258
9259 #define SPLIT_STACK_AVAILABLE 256
9260
9261 /* Fill the ix86_frame structure describing the frame of the current function. */
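/* The offsets computed below walk the frame downwards from the CFA in
   this order (a sketch; some areas may be empty):
       return address
       [pushed static chain]
       [saved frame pointer]            <- hard_frame_pointer_offset
       saved GP registers               <- reg_save_offset
       saved SSE registers (16-aligned) <- sse_reg_save_offset
       va_arg register save area
       local variables                  <- frame_pointer_offset
       outgoing argument area           <- stack_pointer_offset  */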
9262
9263 static void
9264 ix86_compute_frame_layout (struct ix86_frame *frame)
9265 {
9266 unsigned HOST_WIDE_INT stack_alignment_needed;
9267 HOST_WIDE_INT offset;
9268 unsigned HOST_WIDE_INT preferred_alignment;
9269 HOST_WIDE_INT size = get_frame_size ();
9270 HOST_WIDE_INT to_allocate;
9271
9272 frame->nregs = ix86_nsaved_regs ();
9273 frame->nsseregs = ix86_nsaved_sseregs ();
9274
9275 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9276 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9277
9278 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
9279 except in function prologues and leaf functions. */
9280 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9281 && (!crtl->is_leaf || cfun->calls_alloca != 0
9282 || ix86_current_function_calls_tls_descriptor))
9283 {
9284 preferred_alignment = 16;
9285 stack_alignment_needed = 16;
9286 crtl->preferred_stack_boundary = 128;
9287 crtl->stack_alignment_needed = 128;
9288 }
9289
9290 gcc_assert (!size || stack_alignment_needed);
9291 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9292 gcc_assert (preferred_alignment <= stack_alignment_needed);
9293
9294 /* For SEH we have to limit the amount of code movement into the prologue.
9295 At present we do this via a BLOCKAGE, at which point there's very little
9296 scheduling that can be done, which means that there's very little point
9297 in doing anything except PUSHs. */
9298 if (TARGET_SEH)
9299 cfun->machine->use_fast_prologue_epilogue = false;
9300
9301 /* During reload iterations the number of registers saved can change.
9302 Recompute the value as needed. Do not recompute when the number of
9303 registers didn't change, as reload makes multiple calls to this function
9304 and does not expect the decision to change within a single iteration. */
9305 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))
9306 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9307 {
9308 int count = frame->nregs;
9309 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9310
9311 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9312
9313 /* The fast prologue uses moves instead of pushes to save registers. This
9314 is significantly longer, but it also executes faster, as modern hardware
9315 can execute the moves in parallel but can't do that for push/pop.
9316
9317 Be careful about choosing which prologue to emit: when the function takes
9318 many instructions to execute, we may use the slow version, as well as
9319 when the function is known to be outside a hot spot (this is known with
9320 feedback only). Weight the size of the function by the number of
9321 registers to save, as it is cheap to use one or two push instructions
9322 but very slow to use many of them. */
9323 if (count)
9324 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9325 if (node->frequency < NODE_FREQUENCY_NORMAL
9326 || (flag_branch_probabilities
9327 && node->frequency < NODE_FREQUENCY_HOT))
9328 cfun->machine->use_fast_prologue_epilogue = false;
9329 else
9330 cfun->machine->use_fast_prologue_epilogue
9331 = !expensive_function_p (count);
9332 }
9333
9334 frame->save_regs_using_mov
9335 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9336 /* If static stack checking is enabled and done with probes,
9337 the registers need to be saved before allocating the frame. */
9338 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9339
9340 /* Skip return address. */
9341 offset = UNITS_PER_WORD;
9342
9343 /* Skip pushed static chain. */
9344 if (ix86_static_chain_on_stack)
9345 offset += UNITS_PER_WORD;
9346
9347 /* Skip saved base pointer. */
9348 if (frame_pointer_needed)
9349 offset += UNITS_PER_WORD;
9350 frame->hfp_save_offset = offset;
9351
9352 /* The traditional frame pointer location is at the top of the frame. */
9353 frame->hard_frame_pointer_offset = offset;
9354
9355 /* Register save area */
9356 offset += frame->nregs * UNITS_PER_WORD;
9357 frame->reg_save_offset = offset;
9358
9359 /* On SEH target, registers are pushed just before the frame pointer
9360 location. */
9361 if (TARGET_SEH)
9362 frame->hard_frame_pointer_offset = offset;
9363
9364 /* Align and set SSE register save area. */
9365 if (frame->nsseregs)
9366 {
9367 /* The only ABI that has saved SSE registers (Win64) also has a
9368 16-byte aligned default stack, and thus we don't need to be
9369 within the re-aligned local stack frame to save them. */
9370 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9371 offset = (offset + 16 - 1) & -16;
9372 offset += frame->nsseregs * 16;
9373 }
9374 frame->sse_reg_save_offset = offset;
9375
9376 /* The re-aligned stack starts here. Values before this point are not
9377 directly comparable with values below this point. In order to make
9378 sure that no value happens to be the same before and after, force
9379 the alignment computation below to add a non-zero value. */
9380 if (stack_realign_fp)
9381 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9382
9383 /* Va-arg area */
9384 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9385 offset += frame->va_arg_size;
9386
9387 /* Align start of frame for local function. */
9388 if (stack_realign_fp
9389 || offset != frame->sse_reg_save_offset
9390 || size != 0
9391 || !crtl->is_leaf
9392 || cfun->calls_alloca
9393 || ix86_current_function_calls_tls_descriptor)
9394 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9395
9396 /* Frame pointer points here. */
9397 frame->frame_pointer_offset = offset;
9398
9399 offset += size;
9400
9401 /* Add the outgoing arguments area. It can be skipped if we eliminated
9402 all the function calls as dead code.
9403 Skipping is, however, impossible when the function calls alloca: the
9404 alloca expander assumes that the last crtl->outgoing_args_size bytes
9405 of the stack frame are unused. */
9406 if (ACCUMULATE_OUTGOING_ARGS
9407 && (!crtl->is_leaf || cfun->calls_alloca
9408 || ix86_current_function_calls_tls_descriptor))
9409 {
9410 offset += crtl->outgoing_args_size;
9411 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9412 }
9413 else
9414 frame->outgoing_arguments_size = 0;
9415
9416 /* Align stack boundary. Only needed if we're calling another function
9417 or using alloca. */
9418 if (!crtl->is_leaf || cfun->calls_alloca
9419 || ix86_current_function_calls_tls_descriptor)
9420 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9421
9422 /* We've reached end of stack frame. */
9423 frame->stack_pointer_offset = offset;
9424
9425 /* Size prologue needs to allocate. */
9426 to_allocate = offset - frame->sse_reg_save_offset;
9427
9428 if ((!to_allocate && frame->nregs <= 1)
9429 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9430 frame->save_regs_using_mov = false;
9431
9432 if (ix86_using_red_zone ()
9433 && crtl->sp_is_unchanging
9434 && crtl->is_leaf
9435 && !ix86_current_function_calls_tls_descriptor)
9436 {
9437 frame->red_zone_size = to_allocate;
9438 if (frame->save_regs_using_mov)
9439 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9440 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9441 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9442 }
9443 else
9444 frame->red_zone_size = 0;
9445 frame->stack_pointer_offset -= frame->red_zone_size;
9446
9447 /* The SEH frame pointer location is near the bottom of the frame.
9448 This is enforced by the fact that the difference between the
9449 stack pointer and the frame pointer is limited to 240 bytes in
9450 the unwind data structure. */
9451 if (TARGET_SEH)
9452 {
9453 HOST_WIDE_INT diff;
9454
9455 /* If we can leave the frame pointer where it is, do so. Also, returns
9456 the establisher frame for __builtin_frame_address (0). */
9457 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9458 if (diff <= SEH_MAX_FRAME_SIZE
9459 && (diff > 240 || (diff & 15) != 0)
9460 && !crtl->accesses_prior_frames)
9461 {
9462 /* Ideally we'd determine what portion of the local stack frame
9463 (within the constraint of the lowest 240) is most heavily used.
9464 But without that complication, simply bias the frame pointer
9465 by 128 bytes so as to maximize the amount of the local stack
9466 frame that is addressable with 8-bit offsets. */
9467 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9468 }
9469 }
9470 }
9471
9472 /* This is semi-inlined memory_address_length, but simplified
9473 since we know that we're always dealing with reg+offset, and
9474 to avoid having to create and discard all that rtl. */
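/* For example (x86-64, illustrative): 0(%rbx) encodes with no
   displacement at all (len 0), 0(%rbp) still needs a disp8 of zero
   (len 1), 0(%rsp) needs a SIB byte (len 1), and any offset outside
   [-128, 127] needs a disp32 (len 4, plus 1 more for %rsp/%r12).  */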
9475
9476 static inline int
9477 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9478 {
9479 int len = 4;
9480
9481 if (offset == 0)
9482 {
9483 /* EBP and R13 cannot be encoded without an offset. */
9484 len = (regno == BP_REG || regno == R13_REG);
9485 }
9486 else if (IN_RANGE (offset, -128, 127))
9487 len = 1;
9488
9489 /* ESP and R12 must be encoded with a SIB byte. */
9490 if (regno == SP_REG || regno == R12_REG)
9491 len++;
9492
9493 return len;
9494 }
9495
9496 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9497 The valid base registers are taken from CFUN->MACHINE->FS. */
9498
9499 static rtx
9500 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9501 {
9502 const struct machine_function *m = cfun->machine;
9503 rtx base_reg = NULL;
9504 HOST_WIDE_INT base_offset = 0;
9505
9506 if (m->use_fast_prologue_epilogue)
9507 {
9508 /* Choose the base register most likely to allow the most scheduling
9509 opportunities. Generally FP is valid throughout the function,
9510 while DRAP must be reloaded within the epilogue. But choose either
9511 over the SP due to increased encoding size. */
9512
9513 if (m->fs.fp_valid)
9514 {
9515 base_reg = hard_frame_pointer_rtx;
9516 base_offset = m->fs.fp_offset - cfa_offset;
9517 }
9518 else if (m->fs.drap_valid)
9519 {
9520 base_reg = crtl->drap_reg;
9521 base_offset = 0 - cfa_offset;
9522 }
9523 else if (m->fs.sp_valid)
9524 {
9525 base_reg = stack_pointer_rtx;
9526 base_offset = m->fs.sp_offset - cfa_offset;
9527 }
9528 }
9529 else
9530 {
9531 HOST_WIDE_INT toffset;
9532 int len = 16, tlen;
9533
9534 /* Choose the base register with the smallest address encoding.
9535 With a tie, choose FP > DRAP > SP. */
9536 if (m->fs.sp_valid)
9537 {
9538 base_reg = stack_pointer_rtx;
9539 base_offset = m->fs.sp_offset - cfa_offset;
9540 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9541 }
9542 if (m->fs.drap_valid)
9543 {
9544 toffset = 0 - cfa_offset;
9545 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9546 if (tlen <= len)
9547 {
9548 base_reg = crtl->drap_reg;
9549 base_offset = toffset;
9550 len = tlen;
9551 }
9552 }
9553 if (m->fs.fp_valid)
9554 {
9555 toffset = m->fs.fp_offset - cfa_offset;
9556 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9557 if (tlen <= len)
9558 {
9559 base_reg = hard_frame_pointer_rtx;
9560 base_offset = toffset;
9561 len = tlen;
9562 }
9563 }
9564 }
9565 gcc_assert (base_reg != NULL);
9566
9567 return plus_constant (Pmode, base_reg, base_offset);
9568 }
9569
9570 /* Emit code to save registers in the prologue. */
9571
9572 static void
9573 ix86_emit_save_regs (void)
9574 {
9575 unsigned int regno;
9576 rtx insn;
9577
9578 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9579 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9580 {
9581 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9582 RTX_FRAME_RELATED_P (insn) = 1;
9583 }
9584 }
9585
9586 /* Emit a single register save at CFA - CFA_OFFSET. */
9587
9588 static void
9589 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9590 HOST_WIDE_INT cfa_offset)
9591 {
9592 struct machine_function *m = cfun->machine;
9593 rtx reg = gen_rtx_REG (mode, regno);
9594 rtx mem, addr, base, insn;
9595
9596 addr = choose_baseaddr (cfa_offset);
9597 mem = gen_frame_mem (mode, addr);
9598
9599 /* For SSE saves, we need to indicate the 128-bit alignment. */
9600 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9601
9602 insn = emit_move_insn (mem, reg);
9603 RTX_FRAME_RELATED_P (insn) = 1;
9604
9605 base = addr;
9606 if (GET_CODE (base) == PLUS)
9607 base = XEXP (base, 0);
9608 gcc_checking_assert (REG_P (base));
9609
9610 /* When saving registers into a re-aligned local stack frame, avoid
9611 any tricky guessing by dwarf2out. */
9612 if (m->fs.realigned)
9613 {
9614 gcc_checking_assert (stack_realign_drap);
9615
9616 if (regno == REGNO (crtl->drap_reg))
9617 {
9618 /* A bit of a hack. We force the DRAP register to be saved in
9619 the re-aligned stack frame, which provides us with a copy
9620 of the CFA that will last past the prologue. Install it. */
9621 gcc_checking_assert (cfun->machine->fs.fp_valid);
9622 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9623 cfun->machine->fs.fp_offset - cfa_offset);
9624 mem = gen_rtx_MEM (mode, addr);
9625 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9626 }
9627 else
9628 {
9629 /* The frame pointer is a stable reference within the
9630 aligned frame. Use it. */
9631 gcc_checking_assert (cfun->machine->fs.fp_valid);
9632 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9633 cfun->machine->fs.fp_offset - cfa_offset);
9634 mem = gen_rtx_MEM (mode, addr);
9635 add_reg_note (insn, REG_CFA_EXPRESSION,
9636 gen_rtx_SET (VOIDmode, mem, reg));
9637 }
9638 }
9639
9640 /* The memory may not be relative to the current CFA register,
9641 which means that we may need to generate a new pattern for
9642 use by the unwind info. */
9643 else if (base != m->fs.cfa_reg)
9644 {
9645 addr = plus_constant (Pmode, m->fs.cfa_reg,
9646 m->fs.cfa_offset - cfa_offset);
9647 mem = gen_rtx_MEM (mode, addr);
9648 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9649 }
9650 }
9651
9652 /* Emit code to save registers using MOV insns.
9653 First register is stored at CFA - CFA_OFFSET. */
9654 static void
9655 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9656 {
9657 unsigned int regno;
9658
9659 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9660 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9661 {
9662 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9663 cfa_offset -= UNITS_PER_WORD;
9664 }
9665 }
9666
9667 /* Emit code to save SSE registers using MOV insns.
9668 First register is stored at CFA - CFA_OFFSET. */
9669 static void
9670 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9671 {
9672 unsigned int regno;
9673
9674 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9675 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9676 {
9677 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9678 cfa_offset -= 16;
9679 }
9680 }
9681
9682 static GTY(()) rtx queued_cfa_restores;
9683
9684 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
9685 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9686 Don't add the note if the previously saved value will be left untouched
9687 within the stack red-zone until return, as unwinders can find the same value
9688 in the register and on the stack. */
9689
9690 static void
9691 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9692 {
9693 if (!crtl->shrink_wrapped
9694 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9695 return;
9696
9697 if (insn)
9698 {
9699 add_reg_note (insn, REG_CFA_RESTORE, reg);
9700 RTX_FRAME_RELATED_P (insn) = 1;
9701 }
9702 else
9703 queued_cfa_restores
9704 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9705 }
9706
9707 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9708
9709 static void
9710 ix86_add_queued_cfa_restore_notes (rtx insn)
9711 {
9712 rtx last;
9713 if (!queued_cfa_restores)
9714 return;
9715 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9716 ;
9717 XEXP (last, 1) = REG_NOTES (insn);
9718 REG_NOTES (insn) = queued_cfa_restores;
9719 queued_cfa_restores = NULL_RTX;
9720 RTX_FRAME_RELATED_P (insn) = 1;
9721 }
9722
9723 /* Expand prologue or epilogue stack adjustment.
9724 The pattern exists to put a dependency on all ebp-based memory accesses.
9725 STYLE should be negative if instructions should be marked as frame related,
9726 zero if the %r11 register is live and cannot be freely used, and positive
9727 otherwise. */
9728
9729 static void
9730 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9731 int style, bool set_cfa)
9732 {
9733 struct machine_function *m = cfun->machine;
9734 rtx insn;
9735 bool add_frame_related_expr = false;
9736
9737 if (Pmode == SImode)
9738 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9739 else if (x86_64_immediate_operand (offset, DImode))
9740 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9741 else
9742 {
9743 rtx tmp;
9744 /* r11 is used by indirect sibcall return as well, set before the
9745 epilogue and used after the epilogue. */
9746 if (style)
9747 tmp = gen_rtx_REG (DImode, R11_REG);
9748 else
9749 {
9750 gcc_assert (src != hard_frame_pointer_rtx
9751 && dest != hard_frame_pointer_rtx);
9752 tmp = hard_frame_pointer_rtx;
9753 }
9754 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9755 if (style < 0)
9756 add_frame_related_expr = true;
9757
9758 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9759 }
9760
9761 insn = emit_insn (insn);
9762 if (style >= 0)
9763 ix86_add_queued_cfa_restore_notes (insn);
9764
9765 if (set_cfa)
9766 {
9767 rtx r;
9768
9769 gcc_assert (m->fs.cfa_reg == src);
9770 m->fs.cfa_offset += INTVAL (offset);
9771 m->fs.cfa_reg = dest;
9772
9773 r = gen_rtx_PLUS (Pmode, src, offset);
9774 r = gen_rtx_SET (VOIDmode, dest, r);
9775 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9776 RTX_FRAME_RELATED_P (insn) = 1;
9777 }
9778 else if (style < 0)
9779 {
9780 RTX_FRAME_RELATED_P (insn) = 1;
9781 if (add_frame_related_expr)
9782 {
9783 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9784 r = gen_rtx_SET (VOIDmode, dest, r);
9785 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9786 }
9787 }
9788
9789 if (dest == stack_pointer_rtx)
9790 {
9791 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9792 bool valid = m->fs.sp_valid;
9793
9794 if (src == hard_frame_pointer_rtx)
9795 {
9796 valid = m->fs.fp_valid;
9797 ooffset = m->fs.fp_offset;
9798 }
9799 else if (src == crtl->drap_reg)
9800 {
9801 valid = m->fs.drap_valid;
9802 ooffset = 0;
9803 }
9804 else
9805 {
9806 /* Else there are two possibilities: SP itself, which we set
9807 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
9808 taken care of by hand along the eh_return path. */
9809 gcc_checking_assert (src == stack_pointer_rtx
9810 || offset == const0_rtx);
9811 }
9812
9813 m->fs.sp_offset = ooffset - INTVAL (offset);
9814 m->fs.sp_valid = valid;
9815 }
9816 }
9817
9818 /* Find an available register to be used as the dynamic realign argument
9819 pointer register. Such a register will be written in the prologue and
9820 used at the beginning of the body, so it must not be
9821 1. a parameter passing register.
9822 2. the GOT pointer.
9823 We reuse the static-chain register if it is available. Otherwise, we
9824 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9825 shorter encoding.
9826
9827 Return: the regno of chosen register. */
9828
9829 static unsigned int
9830 find_drap_reg (void)
9831 {
9832 tree decl = cfun->decl;
9833
9834 if (TARGET_64BIT)
9835 {
9836 /* Use R13 for a nested function or a function that needs a static chain.
9837 Since a function with a tail call may use any caller-saved
9838 register in the epilogue, DRAP must not use a caller-saved
9839 register in that case. */
9840 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9841 return R13_REG;
9842
9843 return R10_REG;
9844 }
9845 else
9846 {
9847 /* Use DI for a nested function or a function that needs a static chain.
9848 Since a function with a tail call may use any caller-saved
9849 register in the epilogue, DRAP must not use a caller-saved
9850 register in that case. */
9851 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9852 return DI_REG;
9853
9854 /* Reuse static chain register if it isn't used for parameter
9855 passing. */
9856 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9857 {
9858 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9859 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9860 return CX_REG;
9861 }
9862 return DI_REG;
9863 }
9864 }
9865
9866 /* Return minimum incoming stack alignment. */
9867
9868 static unsigned int
9869 ix86_minimum_incoming_stack_boundary (bool sibcall)
9870 {
9871 unsigned int incoming_stack_boundary;
9872
9873 /* Prefer the one specified at command line. */
9874 if (ix86_user_incoming_stack_boundary)
9875 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9876 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9877 if -mstackrealign is used, this isn't a sibcall check, and the
9878 estimated stack alignment is 128 bits. */
9879 else if (!sibcall
9880 && !TARGET_64BIT
9881 && ix86_force_align_arg_pointer
9882 && crtl->stack_alignment_estimated == 128)
9883 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9884 else
9885 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9886
9887 /* Incoming stack alignment can be changed on individual functions
9888 via force_align_arg_pointer attribute. We use the smallest
9889 incoming stack boundary. */
9890 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9891 && lookup_attribute (ix86_force_align_arg_pointer_string,
9892 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9893 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9894
9895 /* The incoming stack frame has to be aligned at least at
9896 parm_stack_boundary. */
9897 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9898 incoming_stack_boundary = crtl->parm_stack_boundary;
9899
9900 /* The stack at the entrance of main is aligned by the runtime. We use
9901 the smallest incoming stack boundary. */
9902 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9903 && DECL_NAME (current_function_decl)
9904 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9905 && DECL_FILE_SCOPE_P (current_function_decl))
9906 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9907
9908 return incoming_stack_boundary;
9909 }
9910
9911 /* Update incoming stack boundary and estimated stack alignment. */
9912
9913 static void
9914 ix86_update_stack_boundary (void)
9915 {
9916 ix86_incoming_stack_boundary
9917 = ix86_minimum_incoming_stack_boundary (false);
9918
9919 /* x86_64 varargs need 16-byte stack alignment for the register save
9920 area. */
9921 if (TARGET_64BIT
9922 && cfun->stdarg
9923 && crtl->stack_alignment_estimated < 128)
9924 crtl->stack_alignment_estimated = 128;
9925 }
9926
9927 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9928 needed or an rtx for DRAP otherwise. */
9929
9930 static rtx
9931 ix86_get_drap_rtx (void)
9932 {
9933 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9934 crtl->need_drap = true;
9935
9936 if (stack_realign_drap)
9937 {
9938 /* Assign DRAP to vDRAP and return vDRAP. */
9939 unsigned int regno = find_drap_reg ();
9940 rtx drap_vreg;
9941 rtx arg_ptr;
9942 rtx seq, insn;
9943
9944 arg_ptr = gen_rtx_REG (Pmode, regno);
9945 crtl->drap_reg = arg_ptr;
9946
9947 start_sequence ();
9948 drap_vreg = copy_to_reg (arg_ptr);
9949 seq = get_insns ();
9950 end_sequence ();
9951
9952 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9953 if (!optimize)
9954 {
9955 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9956 RTX_FRAME_RELATED_P (insn) = 1;
9957 }
9958 return drap_vreg;
9959 }
9960 else
9961 return NULL;
9962 }
9963
9964 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9965
9966 static rtx
9967 ix86_internal_arg_pointer (void)
9968 {
9969 return virtual_incoming_args_rtx;
9970 }
9971
9972 struct scratch_reg {
9973 rtx reg;
9974 bool saved;
9975 };
9976
9977 /* Return a short-lived scratch register for use on function entry.
9978 In 32-bit mode, it is valid only after the registers are saved
9979 in the prologue. This register must be released by means of
9980 release_scratch_register_on_entry once it is dead. */
9981
9982 static void
9983 get_scratch_register_on_entry (struct scratch_reg *sr)
9984 {
9985 int regno;
9986
9987 sr->saved = false;
9988
9989 if (TARGET_64BIT)
9990 {
9991 /* We always use R11 in 64-bit mode. */
9992 regno = R11_REG;
9993 }
9994 else
9995 {
9996 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9997 bool fastcall_p
9998 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9999 bool thiscall_p
10000 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
10001 bool static_chain_p = DECL_STATIC_CHAIN (decl);
10002 int regparm = ix86_function_regparm (fntype, decl);
10003 int drap_regno
10004 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
10005
10006 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
10007 for the static chain register. */
10008 if ((regparm < 1 || (fastcall_p && !static_chain_p))
10009 && drap_regno != AX_REG)
10010 regno = AX_REG;
10011 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
10012 for the static chain register. */
10013 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
10014 regno = AX_REG;
10015 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
10016 regno = DX_REG;
10017 /* ecx is the static chain register. */
10018 else if (regparm < 3 && !fastcall_p && !thiscall_p
10019 && !static_chain_p
10020 && drap_regno != CX_REG)
10021 regno = CX_REG;
10022 else if (ix86_save_reg (BX_REG, true))
10023 regno = BX_REG;
10024 /* esi is the static chain register. */
10025 else if (!(regparm == 3 && static_chain_p)
10026 && ix86_save_reg (SI_REG, true))
10027 regno = SI_REG;
10028 else if (ix86_save_reg (DI_REG, true))
10029 regno = DI_REG;
10030 else
10031 {
10032 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
10033 sr->saved = true;
10034 }
10035 }
10036
10037 sr->reg = gen_rtx_REG (Pmode, regno);
10038 if (sr->saved)
10039 {
10040 rtx insn = emit_insn (gen_push (sr->reg));
10041 RTX_FRAME_RELATED_P (insn) = 1;
10042 }
10043 }
10044
10045 /* Release a scratch register obtained from the preceding function. */
10046
10047 static void
10048 release_scratch_register_on_entry (struct scratch_reg *sr)
10049 {
10050 if (sr->saved)
10051 {
10052 struct machine_function *m = cfun->machine;
10053 rtx x, insn = emit_insn (gen_pop (sr->reg));
10054
10055 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
10056 RTX_FRAME_RELATED_P (insn) = 1;
10057 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
10058 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10059 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
10060 m->fs.sp_offset -= UNITS_PER_WORD;
10061 }
10062 }
10063
10064 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
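/* With the default STACK_CHECK_PROBE_INTERVAL_EXP of 12 this is 4096
   bytes, i.e. one probe per page (the default exponent is an assumption
   here; targets may override it).  */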
10065
10066 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
10067
10068 static void
10069 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
10070 {
10071 /* We skip the probe for the first interval + a small dope of 4 words and
10072 probe that many bytes past the specified size to maintain a protection
10073 area at the bottom of the stack. */
10074 const int dope = 4 * UNITS_PER_WORD;
10075 rtx size_rtx = GEN_INT (size), last;
10076
10077 /* See if we have a constant small number of probes to generate. If so,
10078 that's the easy case. The run-time loop is made up of 11 insns in the
10079 generic case while the compile-time loop is made up of 3+2*(n-1) insns
10080 for n # of intervals. */
10081 if (size <= 5 * PROBE_INTERVAL)
10082 {
10083 HOST_WIDE_INT i, adjust;
10084 bool first_probe = true;
10085
10086 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
10087 values of N from 1 until it exceeds SIZE. If only one probe is
10088 needed, this will not generate any code. Then adjust and probe
10089 to PROBE_INTERVAL + SIZE. */
10090 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10091 {
10092 if (first_probe)
10093 {
10094 adjust = 2 * PROBE_INTERVAL + dope;
10095 first_probe = false;
10096 }
10097 else
10098 adjust = PROBE_INTERVAL;
10099
10100 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10101 plus_constant (Pmode, stack_pointer_rtx,
10102 -adjust)));
10103 emit_stack_probe (stack_pointer_rtx);
10104 }
10105
10106 if (first_probe)
10107 adjust = size + PROBE_INTERVAL + dope;
10108 else
10109 adjust = size + PROBE_INTERVAL - i;
10110
10111 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10112 plus_constant (Pmode, stack_pointer_rtx,
10113 -adjust)));
10114 emit_stack_probe (stack_pointer_rtx);
10115
10116 /* Adjust back to account for the additional first interval. */
10117 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10118 plus_constant (Pmode, stack_pointer_rtx,
10119 PROBE_INTERVAL + dope)));
10120 }
10121
10122 /* Otherwise, do the same as above, but in a loop. Note that we must be
10123 extra careful with variables wrapping around because we might be at
10124 the very top (or the very bottom) of the address space and we have
10125 to be able to handle this case properly; in particular, we use an
10126 equality test for the loop condition. */
10127 else
10128 {
10129 HOST_WIDE_INT rounded_size;
10130 struct scratch_reg sr;
10131
10132 get_scratch_register_on_entry (&sr);
10133
10134
10135 /* Step 1: round SIZE to the previous multiple of the interval. */
10136
10137 rounded_size = size & -PROBE_INTERVAL;
10138
10139
10140 /* Step 2: compute initial and final value of the loop counter. */
10141
10142 /* SP = SP_0 + PROBE_INTERVAL. */
10143 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10144 plus_constant (Pmode, stack_pointer_rtx,
10145 - (PROBE_INTERVAL + dope))));
10146
10147 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
10148 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
10149 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
10150 gen_rtx_PLUS (Pmode, sr.reg,
10151 stack_pointer_rtx)));
10152
10153
10154 /* Step 3: the loop
10155
10156 while (SP != LAST_ADDR)
10157 {
10158 SP = SP + PROBE_INTERVAL
10159 probe at SP
10160 }
10161
10162 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
10163 values of N from 1 until it is equal to ROUNDED_SIZE. */
10164
10165 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
10166
10167
10168 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
10169 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
10170
10171 if (size != rounded_size)
10172 {
10173 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10174 plus_constant (Pmode, stack_pointer_rtx,
10175 rounded_size - size)));
10176 emit_stack_probe (stack_pointer_rtx);
10177 }
10178
10179 /* Adjust back to account for the additional first interval. */
10180 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10181 plus_constant (Pmode, stack_pointer_rtx,
10182 PROBE_INTERVAL + dope)));
10183
10184 release_scratch_register_on_entry (&sr);
10185 }
10186
10187 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
10188
10189 /* Even if the stack pointer isn't the CFA register, we need to correctly
10190 describe the adjustments made to it, in particular differentiate the
10191 frame-related ones from the frame-unrelated ones. */
10192 if (size > 0)
10193 {
10194 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
10195 XVECEXP (expr, 0, 0)
10196 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10197 plus_constant (Pmode, stack_pointer_rtx, -size));
10198 XVECEXP (expr, 0, 1)
10199 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10200 plus_constant (Pmode, stack_pointer_rtx,
10201 PROBE_INTERVAL + dope + size));
10202 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
10203 RTX_FRAME_RELATED_P (last) = 1;
10204
10205 cfun->machine->fs.sp_offset += size;
10206 }
10207
10208 /* Make sure nothing is scheduled before we are done. */
10209 emit_insn (gen_blockage ());
10210 }
10211
10212 /* Adjust the stack pointer up to REG while probing it. */
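/* With the default 4096-byte PROBE_INTERVAL and REG in %eax, the loop
   emitted below looks roughly like this in 32-bit AT&T syntax (label
   names are illustrative):

	.LPSRL0:	cmpl	%eax, %esp
			je	.LPSRE0
			subl	$4096, %esp
			orl	$0, (%esp)
			jmp	.LPSRL0
	.LPSRE0:

   The "orl $0" is the probe proper: a harmless read-modify-write of the
   word at the new stack pointer that faults if the page is unmapped.  */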
10213
10214 const char *
10215 output_adjust_stack_and_probe (rtx reg)
10216 {
10217 static int labelno = 0;
10218 char loop_lab[32], end_lab[32];
10219 rtx xops[2];
10220
10221 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10222 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10223
10224 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10225
10226 /* Jump to END_LAB if SP == LAST_ADDR. */
10227 xops[0] = stack_pointer_rtx;
10228 xops[1] = reg;
10229 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10230 fputs ("\tje\t", asm_out_file);
10231 assemble_name_raw (asm_out_file, end_lab);
10232 fputc ('\n', asm_out_file);
10233
10234 /* SP = SP + PROBE_INTERVAL. */
10235 xops[1] = GEN_INT (PROBE_INTERVAL);
10236 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10237
10238 /* Probe at SP. */
10239 xops[1] = const0_rtx;
10240 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10241
10242 fprintf (asm_out_file, "\tjmp\t");
10243 assemble_name_raw (asm_out_file, loop_lab);
10244 fputc ('\n', asm_out_file);
10245
10246 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10247
10248 return "";
10249 }
10250
10251 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10252 inclusive. These are offsets from the current stack pointer. */
10253
10254 static void
10255 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10256 {
10257 /* See if we have a constant small number of probes to generate. If so,
10258 that's the easy case. The run-time loop is made up of 7 insns in the
10259 generic case while the compile-time loop is made up of n insns for n #
10260 of intervals. */
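/* For instance, probing 7 intervals inline takes 7 insns, the same as the
   7-insn run-time loop, hence the 7 * PROBE_INTERVAL cutoff below.  */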
10261 if (size <= 7 * PROBE_INTERVAL)
10262 {
10263 HOST_WIDE_INT i;
10264
10265 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10266 it exceeds SIZE. If only one probe is needed, this will not
10267 generate any code. Then probe at FIRST + SIZE. */
10268 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10269 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10270 -(first + i)));
10271
10272 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10273 -(first + size)));
10274 }
10275
10276 /* Otherwise, do the same as above, but in a loop. Note that we must be
10277 extra careful with variables wrapping around because we might be at
10278 the very top (or the very bottom) of the address space and we have
10279 to be able to handle this case properly; in particular, we use an
10280 equality test for the loop condition. */
10281 else
10282 {
10283 HOST_WIDE_INT rounded_size, last;
10284 struct scratch_reg sr;
10285
10286 get_scratch_register_on_entry (&sr);
10287
10288
10289 /* Step 1: round SIZE to the previous multiple of the interval. */
10290
10291 rounded_size = size & -PROBE_INTERVAL;
10292
10293
10294 /* Step 2: compute initial and final value of the loop counter. */
10295
10296 /* TEST_OFFSET = FIRST. */
10297 emit_move_insn (sr.reg, GEN_INT (-first));
10298
10299 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10300 last = first + rounded_size;
10301
10302
10303 /* Step 3: the loop
10304
10305 while (TEST_ADDR != LAST_ADDR)
10306 {
10307 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10308 probe at TEST_ADDR
10309 }
10310
10311 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10312 until it is equal to ROUNDED_SIZE. */
10313
10314 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10315
10316
10317 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10318 that SIZE is equal to ROUNDED_SIZE. */
10319
10320 if (size != rounded_size)
10321 emit_stack_probe (plus_constant (Pmode,
10322 gen_rtx_PLUS (Pmode,
10323 stack_pointer_rtx,
10324 sr.reg),
10325 rounded_size - size));
10326
10327 release_scratch_register_on_entry (&sr);
10328 }
10329
10330 /* Make sure nothing is scheduled before we are done. */
10331 emit_insn (gen_blockage ());
10332 }
10333
10334 /* Probe a range of stack addresses from REG to END, inclusive. These are
10335 offsets from the current stack pointer. */
10336
10337 const char *
10338 output_probe_stack_range (rtx reg, rtx end)
10339 {
10340 static int labelno = 0;
10341 char loop_lab[32], end_lab[32];
10342 rtx xops[3];
10343
10344 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10345 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10346
10347 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10348
10349 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10350 xops[0] = reg;
10351 xops[1] = end;
10352 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10353 fputs ("\tje\t", asm_out_file);
10354 assemble_name_raw (asm_out_file, end_lab);
10355 fputc ('\n', asm_out_file);
10356
10357 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10358 xops[1] = GEN_INT (PROBE_INTERVAL);
10359 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10360
10361 /* Probe at TEST_ADDR. */
10362 xops[0] = stack_pointer_rtx;
10363 xops[1] = reg;
10364 xops[2] = const0_rtx;
10365 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
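/* As in output_adjust_stack_and_probe, the "or $0" is the probe itself;
   here the probed address is SP plus the negative offset held in REG.  */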
10366
10367 fprintf (asm_out_file, "\tjmp\t");
10368 assemble_name_raw (asm_out_file, loop_lab);
10369 fputc ('\n', asm_out_file);
10370
10371 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10372
10373 return "";
10374 }
10375
10376 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10377 to be generated in correct form. */
10378 static void
10379 ix86_finalize_stack_realign_flags (void)
10380 {
10381 /* Check if stack realignment is really needed after reload, and
10382 store the result in cfun. */
10383 unsigned int incoming_stack_boundary
10384 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10385 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10386 unsigned int stack_realign = (incoming_stack_boundary
10387 < (crtl->is_leaf
10388 ? crtl->max_used_stack_slot_alignment
10389 : crtl->stack_alignment_needed));
10390
10391 if (crtl->stack_realign_finalized)
10392 {
10393 /* After stack_realign_needed is finalized, we can no longer
10394 change it. */
10395 gcc_assert (crtl->stack_realign_needed == stack_realign);
10396 return;
10397 }
10398
10399 /* If the only reason for frame_pointer_needed is that we conservatively
10400 assumed stack realignment might be needed, but in the end nothing that
10401 needed the stack alignment had been spilled, clear frame_pointer_needed
10402 and say we don't need stack realignment. */
10403 if (stack_realign
10404 && !crtl->need_drap
10405 && frame_pointer_needed
10406 && crtl->is_leaf
10407 && flag_omit_frame_pointer
10408 && crtl->sp_is_unchanging
10409 && !ix86_current_function_calls_tls_descriptor
10410 && !crtl->accesses_prior_frames
10411 && !cfun->calls_alloca
10412 && !crtl->calls_eh_return
10413 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10414 && !ix86_frame_pointer_required ()
10415 && get_frame_size () == 0
10416 && ix86_nsaved_sseregs () == 0
10417 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10418 {
10419 HARD_REG_SET set_up_by_prologue, prologue_used;
10420 basic_block bb;
10421
10422 CLEAR_HARD_REG_SET (prologue_used);
10423 CLEAR_HARD_REG_SET (set_up_by_prologue);
10424 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10425 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10426 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10427 HARD_FRAME_POINTER_REGNUM);
10428 FOR_EACH_BB (bb)
10429 {
10430 rtx insn;
10431 FOR_BB_INSNS (bb, insn)
10432 if (NONDEBUG_INSN_P (insn)
10433 && requires_stack_frame_p (insn, prologue_used,
10434 set_up_by_prologue))
10435 {
10436 crtl->stack_realign_needed = stack_realign;
10437 crtl->stack_realign_finalized = true;
10438 return;
10439 }
10440 }
10441
10442 frame_pointer_needed = false;
10443 stack_realign = false;
10444 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10445 crtl->stack_alignment_needed = incoming_stack_boundary;
10446 crtl->stack_alignment_estimated = incoming_stack_boundary;
10447 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10448 crtl->preferred_stack_boundary = incoming_stack_boundary;
10449 df_finish_pass (true);
10450 df_scan_alloc (NULL);
10451 df_scan_blocks ();
10452 df_compute_regs_ever_live (true);
10453 df_analyze ();
10454 }
10455
10456 crtl->stack_realign_needed = stack_realign;
10457 crtl->stack_realign_finalized = true;
10458 }
10459
10460 /* Expand the prologue into a bunch of separate insns. */
10461
10462 void
10463 ix86_expand_prologue (void)
10464 {
10465 struct machine_function *m = cfun->machine;
10466 rtx insn, t;
10467 bool pic_reg_used;
10468 struct ix86_frame frame;
10469 HOST_WIDE_INT allocate;
10470 bool int_registers_saved;
10471 bool sse_registers_saved;
10472
10473 ix86_finalize_stack_realign_flags ();
10474
10475 /* DRAP should not coexist with stack_realign_fp */
10476 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10477
10478 memset (&m->fs, 0, sizeof (m->fs));
10479
10480 /* Initialize CFA state for before the prologue. */
10481 m->fs.cfa_reg = stack_pointer_rtx;
10482 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10483
10484 /* Track SP offset to the CFA. We continue tracking this after we've
10485 swapped the CFA register away from SP. In the case of re-alignment
10486 this is fudged; we're interested in offsets within the local frame. */
10487 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10488 m->fs.sp_valid = true;
10489
10490 ix86_compute_frame_layout (&frame);
10491
10492 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10493 {
10494 /* We should have already generated an error for any use of
10495 ms_hook on a nested function. */
10496 gcc_checking_assert (!ix86_static_chain_on_stack);
10497
10498 /* Check if profiling is active and we shall use the
10499 profiling-before-prologue variant. If so, sorry. */
10500 if (crtl->profile && flag_fentry != 0)
10501 sorry ("ms_hook_prologue attribute isn%'t compatible "
10502 "with -mfentry for 32-bit");
10503
10504 /* In ix86_asm_output_function_label we emitted:
10505 8b ff movl.s %edi,%edi
10506 55 push %ebp
10507 8b ec movl.s %esp,%ebp
10508
10509 This matches the hookable function prologue in Win32 API
10510 functions in Microsoft Windows XP Service Pack 2 and newer.
10511 Wine uses this to enable Windows apps to hook the Win32 API
10512 functions provided by Wine.
10513
10514 What that means is that we've already set up the frame pointer. */
10515
10516 if (frame_pointer_needed
10517 && !(crtl->drap_reg && crtl->stack_realign_needed))
10518 {
10519 rtx push, mov;
10520
10521 /* We've decided to use the frame pointer already set up.
10522 Describe this to the unwinder by pretending that both
10523 push and mov insns happen right here.
10524
10525 Putting the unwind info here at the end of the ms_hook
10526 is done so that we can make absolutely certain we get
10527 the required byte sequence at the start of the function,
10528 rather than relying on an assembler that can produce
10529 the exact encoding required.
10530
10531 However it does mean (in the unpatched case) that we have
10532 a 1 insn window where the asynchronous unwind info is
10533 incorrect. However, if we placed the unwind info at
10534 its correct location we would have incorrect unwind info
10535 in the patched case. Which is probably all moot since
10536 I don't expect Wine generates dwarf2 unwind info for the
10537 system libraries that use this feature. */
10538
10539 insn = emit_insn (gen_blockage ());
10540
10541 push = gen_push (hard_frame_pointer_rtx);
10542 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10543 stack_pointer_rtx);
10544 RTX_FRAME_RELATED_P (push) = 1;
10545 RTX_FRAME_RELATED_P (mov) = 1;
10546
10547 RTX_FRAME_RELATED_P (insn) = 1;
10548 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10549 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10550
10551 /* Note that gen_push incremented m->fs.cfa_offset, even
10552 though we didn't emit the push insn here. */
10553 m->fs.cfa_reg = hard_frame_pointer_rtx;
10554 m->fs.fp_offset = m->fs.cfa_offset;
10555 m->fs.fp_valid = true;
10556 }
10557 else
10558 {
10559 /* The frame pointer is not needed so pop %ebp again.
10560 This leaves us with a pristine state. */
10561 emit_insn (gen_pop (hard_frame_pointer_rtx));
10562 }
10563 }
10564
10565 /* The first insn of a function that accepts its static chain on the
10566 stack is to push the register that would be filled in by a direct
10567 call. This insn will be skipped by the trampoline. */
10568 else if (ix86_static_chain_on_stack)
10569 {
10570 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10571 emit_insn (gen_blockage ());
10572
10573 /* We don't want to interpret this push insn as a register save,
10574 only as a stack adjustment. The real copy of the register as
10575 a save will be done later, if needed. */
10576 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10577 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10578 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10579 RTX_FRAME_RELATED_P (insn) = 1;
10580 }
10581
10582 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10583 DRAP is needed and stack realignment is really needed after reload. */
10584 if (stack_realign_drap)
10585 {
10586 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10587
10588 /* Only need to push parameter pointer reg if it is caller saved. */
10589 if (!call_used_regs[REGNO (crtl->drap_reg)])
10590 {
10591 /* Push arg pointer reg */
10592 insn = emit_insn (gen_push (crtl->drap_reg));
10593 RTX_FRAME_RELATED_P (insn) = 1;
10594 }
10595
10596 /* Grab the argument pointer. */
10597 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10598 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10599 RTX_FRAME_RELATED_P (insn) = 1;
10600 m->fs.cfa_reg = crtl->drap_reg;
10601 m->fs.cfa_offset = 0;
10602
10603 /* Align the stack. */
10604 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10605 stack_pointer_rtx,
10606 GEN_INT (-align_bytes)));
10607 RTX_FRAME_RELATED_P (insn) = 1;
10608
10609 /* Replicate the return address on the stack so that return
10610 address can be reached via (argp - 1) slot. This is needed
10611 to implement macro RETURN_ADDR_RTX and intrinsic function
10612 expand_builtin_return_addr etc. */
10613 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10614 t = gen_frame_mem (word_mode, t);
10615 insn = emit_insn (gen_push (t));
10616 RTX_FRAME_RELATED_P (insn) = 1;
10617
10618 /* For the purposes of frame and register save area addressing,
10619 we've started over with a new frame. */
10620 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10621 m->fs.realigned = true;
10622 }
10623
10624 int_registers_saved = (frame.nregs == 0);
10625 sse_registers_saved = (frame.nsseregs == 0);
10626
10627 if (frame_pointer_needed && !m->fs.fp_valid)
10628 {
10629 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10630 slower on all targets. Also sdb doesn't like it. */
10631 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10632 RTX_FRAME_RELATED_P (insn) = 1;
10633
10634 /* Push registers now, before setting the frame pointer
10635 on SEH target. */
10636 if (!int_registers_saved
10637 && TARGET_SEH
10638 && !frame.save_regs_using_mov)
10639 {
10640 ix86_emit_save_regs ();
10641 int_registers_saved = true;
10642 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10643 }
10644
10645 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10646 {
10647 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10648 RTX_FRAME_RELATED_P (insn) = 1;
10649
10650 if (m->fs.cfa_reg == stack_pointer_rtx)
10651 m->fs.cfa_reg = hard_frame_pointer_rtx;
10652 m->fs.fp_offset = m->fs.sp_offset;
10653 m->fs.fp_valid = true;
10654 }
10655 }
10656
10657 if (!int_registers_saved)
10658 {
10659 /* If saving registers via PUSH, do so now. */
10660 if (!frame.save_regs_using_mov)
10661 {
10662 ix86_emit_save_regs ();
10663 int_registers_saved = true;
10664 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10665 }
10666
10667 /* When using the red zone we may start register saving before allocating
10668 the stack frame, saving one cycle of the prologue. However, avoid
10669 doing this if we have to probe the stack; at least on x86_64 the
10670 stack probe can turn into a call that clobbers a red zone location. */
10671 else if (ix86_using_red_zone ()
10672 && (! TARGET_STACK_PROBE
10673 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10674 {
10675 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10676 int_registers_saved = true;
10677 }
10678 }
10679
10680 if (stack_realign_fp)
10681 {
10682 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10683 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10684
10685 /* The computation of the size of the re-aligned stack frame means
10686 that we must allocate the size of the register save area before
10687 performing the actual alignment. Otherwise we cannot guarantee
10688 that there's enough storage above the realignment point. */
10689 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10690 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10691 GEN_INT (m->fs.sp_offset
10692 - frame.sse_reg_save_offset),
10693 -1, false);
10694
10695 /* Align the stack. */
10696 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10697 stack_pointer_rtx,
10698 GEN_INT (-align_bytes)));
10699
10700 /* For the purposes of register save area addressing, the stack
10701 pointer is no longer valid. As for the value of sp_offset,
10702 see ix86_compute_frame_layout, which we need to match in order
10703 to pass verification of stack_pointer_offset at the end. */
10704 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10705 m->fs.sp_valid = false;
10706 }
10707
10708 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10709
10710 if (flag_stack_usage_info)
10711 {
10712 /* We start to count from ARG_POINTER. */
10713 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10714
10715 /* If it was realigned, take into account the fake frame. */
10716 if (stack_realign_drap)
10717 {
10718 if (ix86_static_chain_on_stack)
10719 stack_size += UNITS_PER_WORD;
10720
10721 if (!call_used_regs[REGNO (crtl->drap_reg)])
10722 stack_size += UNITS_PER_WORD;
10723
10724 /* This over-estimates by 1 minimal-stack-alignment-unit but
10725 mitigates that by counting in the new return address slot. */
10726 current_function_dynamic_stack_size
10727 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10728 }
10729
10730 current_function_static_stack_size = stack_size;
10731 }
10732
10733 /* On SEH target with very large frame size, allocate an area to save
10734 SSE registers (as the very large allocation won't be described). */
10735 if (TARGET_SEH
10736 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10737 && !sse_registers_saved)
10738 {
10739 HOST_WIDE_INT sse_size =
10740 frame.sse_reg_save_offset - frame.reg_save_offset;
10741
10742 gcc_assert (int_registers_saved);
10743
10744 /* No need to do stack checking as the area will be immediately
10745 written. */
10746 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10747 GEN_INT (-sse_size), -1,
10748 m->fs.cfa_reg == stack_pointer_rtx);
10749 allocate -= sse_size;
10750 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10751 sse_registers_saved = true;
10752 }
10753
10754 /* The stack has already been decremented by the instruction calling us
10755 so probe if the size is non-negative to preserve the protection area. */
10756 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10757 {
10758 /* We expect the registers to be saved when probes are used. */
10759 gcc_assert (int_registers_saved);
10760
10761 if (STACK_CHECK_MOVING_SP)
10762 {
10763 if (!(crtl->is_leaf && !cfun->calls_alloca
10764 && allocate <= PROBE_INTERVAL))
10765 {
10766 ix86_adjust_stack_and_probe (allocate);
10767 allocate = 0;
10768 }
10769 }
10770 else
10771 {
10772 HOST_WIDE_INT size = allocate;
10773
10774 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10775 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10776
10777 if (TARGET_STACK_PROBE)
10778 {
10779 if (crtl->is_leaf && !cfun->calls_alloca)
10780 {
10781 if (size > PROBE_INTERVAL)
10782 ix86_emit_probe_stack_range (0, size);
10783 }
10784 else
10785 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10786 }
10787 else
10788 {
10789 if (crtl->is_leaf && !cfun->calls_alloca)
10790 {
10791 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
10792 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
10793 size - STACK_CHECK_PROTECT);
10794 }
10795 else
10796 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10797 }
10798 }
10799 }
10800
10801 if (allocate == 0)
10802 ;
10803 else if (!ix86_target_stack_probe ()
10804 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10805 {
10806 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10807 GEN_INT (-allocate), -1,
10808 m->fs.cfa_reg == stack_pointer_rtx);
10809 }
10810 else
10811 {
10812 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10813 rtx r10 = NULL;
10814 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10815 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10816 bool eax_live = false;
10817 bool r10_live = false;
10818
10819 if (TARGET_64BIT)
10820 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10821 if (!TARGET_64BIT_MS_ABI)
10822 eax_live = ix86_eax_live_at_start_p ();
10823
10824 /* Note that SEH directives need to continue tracking the stack
10825 pointer even after the frame pointer has been set up. */
10826 if (eax_live)
10827 {
10828 insn = emit_insn (gen_push (eax));
10829 allocate -= UNITS_PER_WORD;
10830 if (sp_is_cfa_reg || TARGET_SEH)
10831 {
10832 if (sp_is_cfa_reg)
10833 m->fs.cfa_offset += UNITS_PER_WORD;
10834 RTX_FRAME_RELATED_P (insn) = 1;
10835 }
10836 }
10837
10838 if (r10_live)
10839 {
10840 r10 = gen_rtx_REG (Pmode, R10_REG);
10841 insn = emit_insn (gen_push (r10));
10842 allocate -= UNITS_PER_WORD;
10843 if (sp_is_cfa_reg || TARGET_SEH)
10844 {
10845 if (sp_is_cfa_reg)
10846 m->fs.cfa_offset += UNITS_PER_WORD;
10847 RTX_FRAME_RELATED_P (insn) = 1;
10848 }
10849 }
10850
10851 emit_move_insn (eax, GEN_INT (allocate));
10852 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10853
10854 /* Use the fact that AX still contains ALLOCATE. */
10855 adjust_stack_insn = (Pmode == DImode
10856 ? gen_pro_epilogue_adjust_stack_di_sub
10857 : gen_pro_epilogue_adjust_stack_si_sub);
10858
10859 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10860 stack_pointer_rtx, eax));
10861
10862 if (sp_is_cfa_reg || TARGET_SEH)
10863 {
10864 if (sp_is_cfa_reg)
10865 m->fs.cfa_offset += allocate;
10866 RTX_FRAME_RELATED_P (insn) = 1;
10867 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10868 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10869 plus_constant (Pmode, stack_pointer_rtx,
10870 -allocate)));
10871 }
10872 m->fs.sp_offset += allocate;
10873
10874 if (r10_live && eax_live)
10875 {
10876 t = choose_baseaddr (m->fs.sp_offset - allocate);
10877 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10878 gen_frame_mem (word_mode, t));
10879 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10880 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10881 gen_frame_mem (word_mode, t));
10882 }
10883 else if (eax_live || r10_live)
10884 {
10885 t = choose_baseaddr (m->fs.sp_offset - allocate);
10886 emit_move_insn (gen_rtx_REG (word_mode,
10887 (eax_live ? AX_REG : R10_REG)),
10888 gen_frame_mem (word_mode, t));
10889 }
10890 }
10891 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10892
10893 /* If we haven't already set up the frame pointer, do so now. */
10894 if (frame_pointer_needed && !m->fs.fp_valid)
10895 {
10896 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10897 GEN_INT (frame.stack_pointer_offset
10898 - frame.hard_frame_pointer_offset));
10899 insn = emit_insn (insn);
10900 RTX_FRAME_RELATED_P (insn) = 1;
10901 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10902
10903 if (m->fs.cfa_reg == stack_pointer_rtx)
10904 m->fs.cfa_reg = hard_frame_pointer_rtx;
10905 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10906 m->fs.fp_valid = true;
10907 }
10908
10909 if (!int_registers_saved)
10910 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10911 if (!sse_registers_saved)
10912 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10913
10914 pic_reg_used = false;
10915 /* We don't use the PIC register for the pe-coff target. */
10916 if (pic_offset_table_rtx
10917 && !TARGET_PECOFF
10918 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10919 || crtl->profile))
10920 {
10921 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10922
10923 if (alt_pic_reg_used != INVALID_REGNUM)
10924 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10925
10926 pic_reg_used = true;
10927 }
10928
10929 if (pic_reg_used)
10930 {
10931 if (TARGET_64BIT)
10932 {
10933 if (ix86_cmodel == CM_LARGE_PIC)
10934 {
10935 rtx label, tmp_reg;
10936
10937 gcc_assert (Pmode == DImode);
10938 label = gen_label_rtx ();
10939 emit_label (label);
10940 LABEL_PRESERVE_P (label) = 1;
10941 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10942 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10943 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10944 label));
10945 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10946 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10947 pic_offset_table_rtx, tmp_reg));
10948 }
10949 else
10950 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10951 }
10952 else
10953 {
10954 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10955 RTX_FRAME_RELATED_P (insn) = 1;
10956 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10957 }
10958 }
10959
10960 /* In the pic_reg_used case, make sure that the got load isn't deleted
10961 when mcount needs it. Blockage to avoid call movement across mcount
10962 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10963 note. */
10964 if (crtl->profile && !flag_fentry && pic_reg_used)
10965 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10966
10967 if (crtl->drap_reg && !crtl->stack_realign_needed)
10968 {
10969 /* vDRAP is set up, but after reload it turns out stack realignment
10970 isn't necessary; here we emit prologue code to set up DRAP
10971 without the stack realignment adjustment. */
10972 t = choose_baseaddr (0);
10973 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10974 }
10975
10976 /* Prevent instructions from being scheduled into the register save push
10977 sequence when access to the redzone area is done through the frame
10978 pointer. The offset between the frame pointer and the stack pointer is
10979 calculated relative to the value of the stack pointer at the end of the
10980 function prologue, and moving instructions that access the redzone area
10981 via the frame pointer inside the push sequence violates this assumption. */
10982 if (frame_pointer_needed && frame.red_zone_size)
10983 emit_insn (gen_memory_blockage ());
10984
10985 /* Emit cld instruction if stringops are used in the function. */
10986 if (TARGET_CLD && ix86_current_function_needs_cld)
10987 emit_insn (gen_cld ());
10988
10989 /* SEH requires that the prologue end within 256 bytes of the start of
10990 the function. Prevent instruction schedules that would extend that.
10991 Further, prevent alloca modifications to the stack pointer from being
10992 combined with prologue modifications. */
10993 if (TARGET_SEH)
10994 emit_insn (gen_prologue_use (stack_pointer_rtx));
10995 }
10996
10997 /* Emit code to restore REG using a POP insn. */
10998
10999 static void
11000 ix86_emit_restore_reg_using_pop (rtx reg)
11001 {
11002 struct machine_function *m = cfun->machine;
11003 rtx insn = emit_insn (gen_pop (reg));
11004
11005 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
11006 m->fs.sp_offset -= UNITS_PER_WORD;
11007
11008 if (m->fs.cfa_reg == crtl->drap_reg
11009 && REGNO (reg) == REGNO (crtl->drap_reg))
11010 {
11011 /* Previously we'd represented the CFA as an expression
11012 like *(%ebp - 8). We've just popped that value from
11013 the stack, which means we need to reset the CFA to
11014 the drap register. This will remain until we restore
11015 the stack pointer. */
11016 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
11017 RTX_FRAME_RELATED_P (insn) = 1;
11018
11019 /* This means that the DRAP register is valid for addressing too. */
11020 m->fs.drap_valid = true;
11021 return;
11022 }
11023
11024 if (m->fs.cfa_reg == stack_pointer_rtx)
11025 {
11026 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
11027 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
11028 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
11029 RTX_FRAME_RELATED_P (insn) = 1;
11030
11031 m->fs.cfa_offset -= UNITS_PER_WORD;
11032 }
11033
11034 /* When the frame pointer is the CFA, and we pop it, we are
11035 swapping back to the stack pointer as the CFA. This happens
11036 for stack frames that don't allocate other data, so we assume
11037 the stack pointer is now pointing at the return address, i.e.
11038 the function entry state, which makes the offset one word. */
11039 if (reg == hard_frame_pointer_rtx)
11040 {
11041 m->fs.fp_valid = false;
11042 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
11043 {
11044 m->fs.cfa_reg = stack_pointer_rtx;
11045 m->fs.cfa_offset -= UNITS_PER_WORD;
11046
11047 add_reg_note (insn, REG_CFA_DEF_CFA,
11048 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11049 GEN_INT (m->fs.cfa_offset)));
11050 RTX_FRAME_RELATED_P (insn) = 1;
11051 }
11052 }
11053 }
11054
11055 /* Emit code to restore saved registers using POP insns. */
11056
11057 static void
11058 ix86_emit_restore_regs_using_pop (void)
11059 {
11060 unsigned int regno;
11061
11062 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11063 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
11064 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
11065 }
11066
11067 /* Emit code and notes for the LEAVE instruction. */
11068
11069 static void
11070 ix86_emit_leave (void)
11071 {
11072 struct machine_function *m = cfun->machine;
11073 rtx insn = emit_insn (ix86_gen_leave ());
11074
11075 ix86_add_queued_cfa_restore_notes (insn);
11076
11077 gcc_assert (m->fs.fp_valid);
11078 m->fs.sp_valid = true;
11079 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
11080 m->fs.fp_valid = false;
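  /* A leave insn is equivalent to "mov %ebp, %esp; pop %ebp", so once it
     has executed the stack pointer sits one word above the slot that held
     the saved frame pointer; the updates above reflect that.  */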
11081
11082 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
11083 {
11084 m->fs.cfa_reg = stack_pointer_rtx;
11085 m->fs.cfa_offset = m->fs.sp_offset;
11086
11087 add_reg_note (insn, REG_CFA_DEF_CFA,
11088 plus_constant (Pmode, stack_pointer_rtx,
11089 m->fs.sp_offset));
11090 RTX_FRAME_RELATED_P (insn) = 1;
11091 }
11092 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
11093 m->fs.fp_offset);
11094 }
11095
11096 /* Emit code to restore saved registers using MOV insns.
11097 First register is restored from CFA - CFA_OFFSET. */
11098 static void
11099 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
11100 bool maybe_eh_return)
11101 {
11102 struct machine_function *m = cfun->machine;
11103 unsigned int regno;
11104
11105 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11106 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11107 {
11108 rtx reg = gen_rtx_REG (word_mode, regno);
11109 rtx insn, mem;
11110
11111 mem = choose_baseaddr (cfa_offset);
11112 mem = gen_frame_mem (word_mode, mem);
11113 insn = emit_move_insn (reg, mem);
11114
11115 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
11116 {
11117 /* Previously we'd represented the CFA as an expression
11118 like *(%ebp - 8). We've just reloaded that value from
11119 the stack, which means we need to reset the CFA to
11120 the drap register. This will remain until we restore
11121 the stack pointer. */
11122 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
11123 RTX_FRAME_RELATED_P (insn) = 1;
11124
11125 /* This means that the DRAP register is valid for addressing. */
11126 m->fs.drap_valid = true;
11127 }
11128 else
11129 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11130
11131 cfa_offset -= UNITS_PER_WORD;
11132 }
11133 }
11134
11135 /* Emit code to restore saved SSE registers using MOV insns.
11136 First register is restored from CFA - CFA_OFFSET. */
11137 static void
11138 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
11139 bool maybe_eh_return)
11140 {
11141 unsigned int regno;
11142
11143 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11144 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11145 {
11146 rtx reg = gen_rtx_REG (V4SFmode, regno);
11147 rtx mem;
11148
11149 mem = choose_baseaddr (cfa_offset);
11150 mem = gen_rtx_MEM (V4SFmode, mem);
11151 set_mem_align (mem, 128);
11152 emit_move_insn (reg, mem);
11153
11154 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11155
11156 cfa_offset -= 16;
11157 }
11158 }
11159
11160 /* Restore function stack, frame, and registers. */
11161
11162 void
11163 ix86_expand_epilogue (int style)
11164 {
11165 struct machine_function *m = cfun->machine;
11166 struct machine_frame_state frame_state_save = m->fs;
11167 struct ix86_frame frame;
11168 bool restore_regs_via_mov;
11169 bool using_drap;
11170
11171 ix86_finalize_stack_realign_flags ();
11172 ix86_compute_frame_layout (&frame);
11173
11174 m->fs.sp_valid = (!frame_pointer_needed
11175 || (crtl->sp_is_unchanging
11176 && !stack_realign_fp));
11177 gcc_assert (!m->fs.sp_valid
11178 || m->fs.sp_offset == frame.stack_pointer_offset);
11179
11180 /* The FP must be valid if the frame pointer is present. */
11181 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
11182 gcc_assert (!m->fs.fp_valid
11183 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
11184
11185 /* We must have *some* valid pointer to the stack frame. */
11186 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
11187
11188 /* The DRAP is never valid at this point. */
11189 gcc_assert (!m->fs.drap_valid);
11190
11191 /* See the comment about red zone and frame
11192 pointer usage in ix86_expand_prologue. */
11193 if (frame_pointer_needed && frame.red_zone_size)
11194 emit_insn (gen_memory_blockage ());
11195
11196 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
11197 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
11198
11199 /* Determine the CFA offset of the end of the red-zone. */
11200 m->fs.red_zone_offset = 0;
11201 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
11202 {
11203 /* The red-zone begins below the return address. */
11204 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
11205
11206 /* When the register save area is in the aligned portion of
11207 the stack, determine the maximum runtime displacement that
11208 matches up with the aligned frame. */
11209 if (stack_realign_drap)
11210 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
11211 + UNITS_PER_WORD);
11212 }
11213
11214 /* Special care must be taken for the normal return case of a function
11215 using eh_return: the eax and edx registers are marked as saved, but
11216 not restored along this path. Adjust the save location to match. */
11217 if (crtl->calls_eh_return && style != 2)
11218 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
11219
11220 /* EH_RETURN requires the use of moves to function properly. */
11221 if (crtl->calls_eh_return)
11222 restore_regs_via_mov = true;
11223 /* SEH requires the use of pops to identify the epilogue. */
11224 else if (TARGET_SEH)
11225 restore_regs_via_mov = false;
11226 /* If we're only restoring one register and sp is not valid, then
11227 use a move instruction to restore the register, since it's
11228 less work than reloading sp and popping the register. */
11229 else if (!m->fs.sp_valid && frame.nregs <= 1)
11230 restore_regs_via_mov = true;
11231 else if (TARGET_EPILOGUE_USING_MOVE
11232 && cfun->machine->use_fast_prologue_epilogue
11233 && (frame.nregs > 1
11234 || m->fs.sp_offset != frame.reg_save_offset))
11235 restore_regs_via_mov = true;
11236 else if (frame_pointer_needed
11237 && !frame.nregs
11238 && m->fs.sp_offset != frame.reg_save_offset)
11239 restore_regs_via_mov = true;
11240 else if (frame_pointer_needed
11241 && TARGET_USE_LEAVE
11242 && cfun->machine->use_fast_prologue_epilogue
11243 && frame.nregs == 1)
11244 restore_regs_via_mov = true;
11245 else
11246 restore_regs_via_mov = false;
11247
11248 if (restore_regs_via_mov || frame.nsseregs)
11249 {
11250 /* Ensure that the entire register save area is addressable via
11251 the stack pointer, if we will restore via sp. */
11252 if (TARGET_64BIT
11253 && m->fs.sp_offset > 0x7fffffff
11254 && !(m->fs.fp_valid || m->fs.drap_valid)
11255 && (frame.nsseregs + frame.nregs) != 0)
11256 {
11257 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11258 GEN_INT (m->fs.sp_offset
11259 - frame.sse_reg_save_offset),
11260 style,
11261 m->fs.cfa_reg == stack_pointer_rtx);
11262 }
11263 }
11264
11265 /* If there are any SSE registers to restore, then we have to do it
11266 via moves, since there's obviously no pop for SSE regs. */
11267 if (frame.nsseregs)
11268 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11269 style == 2);
11270
11271 if (restore_regs_via_mov)
11272 {
11273 rtx t;
11274
11275 if (frame.nregs)
11276 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11277
11278 /* eh_return epilogues need %ecx added to the stack pointer. */
11279 if (style == 2)
11280 {
11281 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11282
11283 /* Stack align doesn't work with eh_return. */
11284 gcc_assert (!stack_realign_drap);
11285 /* Neither do regparm nested functions. */
11286 gcc_assert (!ix86_static_chain_on_stack);
11287
11288 if (frame_pointer_needed)
11289 {
11290 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11291 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11292 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11293
11294 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11295 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11296
11297 /* Note that we use SA as a temporary CFA, as the return
11298 address is at the proper place relative to it. We
11299 pretend this happens at the FP restore insn because
11300 prior to this insn the FP would be stored at the wrong
11301 offset relative to SA, and after this insn we have no
11302 other reasonable register to use for the CFA. We don't
11303 bother resetting the CFA to the SP for the duration of
11304 the return insn. */
11305 add_reg_note (insn, REG_CFA_DEF_CFA,
11306 plus_constant (Pmode, sa, UNITS_PER_WORD));
11307 ix86_add_queued_cfa_restore_notes (insn);
11308 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11309 RTX_FRAME_RELATED_P (insn) = 1;
11310
11311 m->fs.cfa_reg = sa;
11312 m->fs.cfa_offset = UNITS_PER_WORD;
11313 m->fs.fp_valid = false;
11314
11315 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11316 const0_rtx, style, false);
11317 }
11318 else
11319 {
11320 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11321 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11322 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11323 ix86_add_queued_cfa_restore_notes (insn);
11324
11325 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11326 if (m->fs.cfa_offset != UNITS_PER_WORD)
11327 {
11328 m->fs.cfa_offset = UNITS_PER_WORD;
11329 add_reg_note (insn, REG_CFA_DEF_CFA,
11330 plus_constant (Pmode, stack_pointer_rtx,
11331 UNITS_PER_WORD));
11332 RTX_FRAME_RELATED_P (insn) = 1;
11333 }
11334 }
11335 m->fs.sp_offset = UNITS_PER_WORD;
11336 m->fs.sp_valid = true;
11337 }
11338 }
11339 else
11340 {
11341 /* SEH requires that the function end with (1) a stack adjustment
11342 if necessary, (2) a sequence of pops, and (3) a return or
11343 jump instruction. Prevent insns from the function body from
11344 being scheduled into this sequence. */
11345 if (TARGET_SEH)
11346 {
11347 /* Prevent a catch region from being adjacent to the standard
11348 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11349 several other flags that would be interesting to test are
11350 set up yet. */
11351 if (flag_non_call_exceptions)
11352 emit_insn (gen_nops (const1_rtx));
11353 else
11354 emit_insn (gen_blockage ());
11355 }
11356
11357 /* First step is to deallocate the stack frame so that we can
11358 pop the registers. Also do it on SEH target for very large
11359 frames, as the emitted instructions aren't allowed by the ABI in
11360 epilogues. */
11361 if (!m->fs.sp_valid
11362 || (TARGET_SEH
11363 && (m->fs.sp_offset - frame.reg_save_offset
11364 >= SEH_MAX_FRAME_SIZE)))
11365 {
11366 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11367 GEN_INT (m->fs.fp_offset
11368 - frame.reg_save_offset),
11369 style, false);
11370 }
11371 else if (m->fs.sp_offset != frame.reg_save_offset)
11372 {
11373 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11374 GEN_INT (m->fs.sp_offset
11375 - frame.reg_save_offset),
11376 style,
11377 m->fs.cfa_reg == stack_pointer_rtx);
11378 }
11379
11380 ix86_emit_restore_regs_using_pop ();
11381 }
11382
11383 /* If we used a frame pointer and haven't already got rid of it,
11384 then do so now. */
11385 if (m->fs.fp_valid)
11386 {
11387 /* If the stack pointer is valid and pointing at the frame
11388 pointer store address, then we only need a pop. */
11389 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11390 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11391 /* Leave results in shorter dependency chains on CPUs that are
11392 able to grok it fast. */
11393 else if (TARGET_USE_LEAVE
11394 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
11395 || !cfun->machine->use_fast_prologue_epilogue)
11396 ix86_emit_leave ();
11397 else
11398 {
11399 pro_epilogue_adjust_stack (stack_pointer_rtx,
11400 hard_frame_pointer_rtx,
11401 const0_rtx, style, !using_drap);
11402 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11403 }
11404 }
11405
11406 if (using_drap)
11407 {
11408 int param_ptr_offset = UNITS_PER_WORD;
11409 rtx insn;
11410
11411 gcc_assert (stack_realign_drap);
11412
11413 if (ix86_static_chain_on_stack)
11414 param_ptr_offset += UNITS_PER_WORD;
11415 if (!call_used_regs[REGNO (crtl->drap_reg)])
11416 param_ptr_offset += UNITS_PER_WORD;
11417
11418 insn = emit_insn (gen_rtx_SET
11419 (VOIDmode, stack_pointer_rtx,
11420 gen_rtx_PLUS (Pmode,
11421 crtl->drap_reg,
11422 GEN_INT (-param_ptr_offset))));
11423 m->fs.cfa_reg = stack_pointer_rtx;
11424 m->fs.cfa_offset = param_ptr_offset;
11425 m->fs.sp_offset = param_ptr_offset;
11426 m->fs.realigned = false;
11427
11428 add_reg_note (insn, REG_CFA_DEF_CFA,
11429 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11430 GEN_INT (param_ptr_offset)));
11431 RTX_FRAME_RELATED_P (insn) = 1;
11432
11433 if (!call_used_regs[REGNO (crtl->drap_reg)])
11434 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11435 }
11436
11437 /* At this point the stack pointer must be valid, and we must have
11438 restored all of the registers. We may not have deallocated the
11439 entire stack frame. We've delayed this until now because it may
11440 be possible to merge the local stack deallocation with the
11441 deallocation forced by ix86_static_chain_on_stack. */
11442 gcc_assert (m->fs.sp_valid);
11443 gcc_assert (!m->fs.fp_valid);
11444 gcc_assert (!m->fs.realigned);
11445 if (m->fs.sp_offset != UNITS_PER_WORD)
11446 {
11447 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11448 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11449 style, true);
11450 }
11451 else
11452 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11453
11454 /* Sibcall epilogues don't want a return instruction. */
11455 if (style == 0)
11456 {
11457 m->fs = frame_state_save;
11458 return;
11459 }
11460
11461 if (crtl->args.pops_args && crtl->args.size)
11462 {
11463 rtx popc = GEN_INT (crtl->args.pops_args);
11464
11465 /* i386 can only pop 64K bytes. If asked to pop more, pop return
11466 address, do explicit add, and jump indirectly to the caller. */
11467
11468 if (crtl->args.pops_args >= 65536)
11469 {
11470 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11471 rtx insn;
11472
11473 /* There is no "pascal" calling convention in any 64bit ABI. */
11474 gcc_assert (!TARGET_64BIT);
11475
11476 insn = emit_insn (gen_pop (ecx));
11477 m->fs.cfa_offset -= UNITS_PER_WORD;
11478 m->fs.sp_offset -= UNITS_PER_WORD;
11479
11480 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11481 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11482 add_reg_note (insn, REG_CFA_REGISTER,
11483 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11484 RTX_FRAME_RELATED_P (insn) = 1;
11485
11486 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11487 popc, -1, true);
11488 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11489 }
11490 else
11491 emit_jump_insn (gen_simple_return_pop_internal (popc));
11492 }
11493 else
11494 emit_jump_insn (gen_simple_return_internal ());
11495
11496 /* Restore the state back to the state from the prologue,
11497 so that it's correct for the next epilogue. */
11498 m->fs = frame_state_save;
11499 }
11500
11501 /* Reset from the function's potential modifications. */
11502
11503 static void
11504 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11505 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11506 {
11507 if (pic_offset_table_rtx)
11508 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11509 #if TARGET_MACHO
11510 /* Mach-O doesn't support labels at the end of objects, so if
11511 it looks like we might want one, insert a NOP. */
11512 {
11513 rtx insn = get_last_insn ();
11514 rtx deleted_debug_label = NULL_RTX;
11515 while (insn
11516 && NOTE_P (insn)
11517 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11518 {
11519 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11520 notes only, instead set their CODE_LABEL_NUMBER to -1,
11521 otherwise there would be code generation differences
11522 in between -g and -g0. */
11523 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11524 deleted_debug_label = insn;
11525 insn = PREV_INSN (insn);
11526 }
11527 if (insn
11528 && (LABEL_P (insn)
11529 || (NOTE_P (insn)
11530 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11531 fputs ("\tnop\n", file);
11532 else if (deleted_debug_label)
11533 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11534 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11535 CODE_LABEL_NUMBER (insn) = -1;
11536 }
11537 #endif
11538
11539 }
11540
11541 /* Return a scratch register to use in the split stack prologue. The
11542 split stack prologue is used for -fsplit-stack. It is the first
11543 instructions in the function, even before the regular prologue.
11544 The scratch register can be any caller-saved register which is not
11545 used for parameters or for the static chain. */
11546
11547 static unsigned int
11548 split_stack_prologue_scratch_regno (void)
11549 {
11550 if (TARGET_64BIT)
11551 return R11_REG;
11552 else
11553 {
11554 bool is_fastcall, is_thiscall;
11555 int regparm;
11556
11557 is_fastcall = (lookup_attribute ("fastcall",
11558 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11559 != NULL);
11560 is_thiscall = (lookup_attribute ("thiscall",
11561 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11562 != NULL);
11563 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11564
11565 if (is_fastcall)
11566 {
11567 if (DECL_STATIC_CHAIN (cfun->decl))
11568 {
11569 sorry ("-fsplit-stack does not support fastcall with "
11570 "nested function");
11571 return INVALID_REGNUM;
11572 }
11573 return AX_REG;
11574 }
11575 else if (is_thiscall)
11576 {
11577 if (!DECL_STATIC_CHAIN (cfun->decl))
11578 return DX_REG;
11579 return AX_REG;
11580 }
11581 else if (regparm < 3)
11582 {
11583 if (!DECL_STATIC_CHAIN (cfun->decl))
11584 return CX_REG;
11585 else
11586 {
11587 if (regparm >= 2)
11588 {
11589 sorry ("-fsplit-stack does not support 2 register "
11590 " parameters for a nested function");
11591 return INVALID_REGNUM;
11592 }
11593 return DX_REG;
11594 }
11595 }
11596 else
11597 {
11598 /* FIXME: We could make this work by pushing a register
11599 around the addition and comparison. */
11600 sorry ("-fsplit-stack does not support 3 register parameters");
11601 return INVALID_REGNUM;
11602 }
11603 }
11604 }
11605
11606 /* A SYMBOL_REF for the function which allocates new stack space for
11607 -fsplit-stack. */
11608
11609 static GTY(()) rtx split_stack_fn;
11610
11611 /* A SYMBOL_REF for the more stack function when using the large
11612 model. */
11613
11614 static GTY(()) rtx split_stack_fn_large;
11615
11616 /* Handle -fsplit-stack. These are the first instructions in the
11617 function, even before the regular prologue. */
11618
11619 void
11620 ix86_expand_split_stack_prologue (void)
11621 {
11622 struct ix86_frame frame;
11623 HOST_WIDE_INT allocate;
11624 unsigned HOST_WIDE_INT args_size;
11625 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11626 rtx scratch_reg = NULL_RTX;
11627 rtx varargs_label = NULL_RTX;
11628 rtx fn;
11629
11630 gcc_assert (flag_split_stack && reload_completed);
11631
11632 ix86_finalize_stack_realign_flags ();
11633 ix86_compute_frame_layout (&frame);
11634 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11635
11636 /* This is the label we will branch to if we have enough stack
11637 space. We expect the basic block reordering pass to reverse this
11638 branch if optimizing, so that we branch in the unlikely case. */
11639 label = gen_label_rtx ();
11640
11641 /* We need to compare the stack pointer minus the frame size with
11642 the stack boundary in the TCB. The stack boundary always gives
11643 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11644 can compare directly. Otherwise we need to do an addition. */
11645
11646 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11647 UNSPEC_STACK_CHECK);
11648 limit = gen_rtx_CONST (Pmode, limit);
11649 limit = gen_rtx_MEM (Pmode, limit);
11650 if (allocate < SPLIT_STACK_AVAILABLE)
11651 current = stack_pointer_rtx;
11652 else
11653 {
11654 unsigned int scratch_regno;
11655 rtx offset;
11656
11657 /* We need a scratch register to hold the stack pointer minus
11658 the required frame size. Since this is the very start of the
11659 function, the scratch register can be any caller-saved
11660 register which is not used for parameters. */
11661 offset = GEN_INT (- allocate);
11662 scratch_regno = split_stack_prologue_scratch_regno ();
11663 if (scratch_regno == INVALID_REGNUM)
11664 return;
11665 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11666 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11667 {
11668 /* We don't use ix86_gen_add3 in this case because it will
11669 want to split to lea, but when not optimizing the insn
11670 will not be split after this point. */
11671 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11672 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11673 offset)));
11674 }
11675 else
11676 {
11677 emit_move_insn (scratch_reg, offset);
11678 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11679 stack_pointer_rtx));
11680 }
11681 current = scratch_reg;
11682 }
11683
11684 ix86_expand_branch (GEU, current, limit, label);
11685 jump_insn = get_last_insn ();
11686 JUMP_LABEL (jump_insn) = label;
11687
11688 /* Mark the jump as very likely to be taken. */
11689 add_int_reg_note (jump_insn, REG_BR_PROB,
11690 REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
11691
11692 if (split_stack_fn == NULL_RTX)
11693 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11694 fn = split_stack_fn;
11695
11696 /* Get more stack space. We pass in the desired stack space and the
11697 size of the arguments to copy to the new stack. In 32-bit mode
11698 we push the parameters; __morestack will return on a new stack
11699 anyhow. In 64-bit mode we pass the parameters in r10 and
11700 r11. */
11701 allocate_rtx = GEN_INT (allocate);
11702 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11703 call_fusage = NULL_RTX;
11704 if (TARGET_64BIT)
11705 {
11706 rtx reg10, reg11;
11707
11708 reg10 = gen_rtx_REG (Pmode, R10_REG);
11709 reg11 = gen_rtx_REG (Pmode, R11_REG);
11710
11711 /* If this function uses a static chain, it will be in %r10.
11712 Preserve it across the call to __morestack. */
11713 if (DECL_STATIC_CHAIN (cfun->decl))
11714 {
11715 rtx rax;
11716
11717 rax = gen_rtx_REG (word_mode, AX_REG);
11718 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11719 use_reg (&call_fusage, rax);
11720 }
11721
11722 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11723 && !TARGET_PECOFF)
11724 {
11725 HOST_WIDE_INT argval;
11726
11727 gcc_assert (Pmode == DImode);
11728 /* When using the large model we need to load the address
11729 into a register, and we've run out of registers. So we
11730 switch to a different calling convention, and we call a
11731 different function: __morestack_large. We pass the
11732 argument size in the upper 32 bits of r10 and pass the
11733 frame size in the lower 32 bits. */
11734 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11735 gcc_assert ((args_size & 0xffffffff) == args_size);
11736
11737 if (split_stack_fn_large == NULL_RTX)
11738 split_stack_fn_large =
11739 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11740
11741 if (ix86_cmodel == CM_LARGE_PIC)
11742 {
11743 rtx label, x;
11744
11745 label = gen_label_rtx ();
11746 emit_label (label);
11747 LABEL_PRESERVE_P (label) = 1;
11748 emit_insn (gen_set_rip_rex64 (reg10, label));
11749 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11750 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11751 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11752 UNSPEC_GOT);
11753 x = gen_rtx_CONST (Pmode, x);
11754 emit_move_insn (reg11, x);
11755 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11756 x = gen_const_mem (Pmode, x);
11757 emit_move_insn (reg11, x);
11758 }
11759 else
11760 emit_move_insn (reg11, split_stack_fn_large);
11761
11762 fn = reg11;
11763
11764 argval = ((args_size << 16) << 16) + allocate;
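	  /* The two shifts by 16 amount to a shift by 32, placing ARGS_SIZE
	     in the upper half of ARGVAL and leaving the frame size in the
	     lower half, as described above; the shift is presumably split
	     in two to sidestep shift-by-32 pitfalls on hosts where
	     HOST_WIDE_INT is only 32 bits wide.  */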
11765 emit_move_insn (reg10, GEN_INT (argval));
11766 }
11767 else
11768 {
11769 emit_move_insn (reg10, allocate_rtx);
11770 emit_move_insn (reg11, GEN_INT (args_size));
11771 use_reg (&call_fusage, reg11);
11772 }
11773
11774 use_reg (&call_fusage, reg10);
11775 }
11776 else
11777 {
11778 emit_insn (gen_push (GEN_INT (args_size)));
11779 emit_insn (gen_push (allocate_rtx));
11780 }
11781 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11782 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11783 NULL_RTX, false);
11784 add_function_usage_to (call_insn, call_fusage);
11785
11786 /* In order to make call/return prediction work right, we now need
11787 to execute a return instruction. See
11788 libgcc/config/i386/morestack.S for the details on how this works.
11789
11790 For flow purposes gcc must not see this as a return
11791 instruction--we need control flow to continue at the subsequent
11792 label. Therefore, we use an unspec. */
11793 gcc_assert (crtl->args.pops_args < 65536);
11794 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11795
11796 /* If we are in 64-bit mode and this function uses a static chain,
11797 we saved %r10 in %rax before calling __morestack. */
11798 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11799 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11800 gen_rtx_REG (word_mode, AX_REG));
11801
11802 /* If this function calls va_start, we need to store a pointer to
11803 the arguments on the old stack, because they may not have been
11804 all copied to the new stack. At this point the old stack can be
11805 found at the frame pointer value used by __morestack, because
11806 __morestack has set that up before calling back to us. Here we
11807 store that pointer in a scratch register, and in
11808 ix86_expand_prologue we store the scratch register in a stack
11809 slot. */
11810 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11811 {
11812 unsigned int scratch_regno;
11813 rtx frame_reg;
11814 int words;
11815
11816 scratch_regno = split_stack_prologue_scratch_regno ();
11817 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11818 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11819
11820 /* 64-bit:
11821 fp -> old fp value
11822 return address within this function
11823 return address of caller of this function
11824 stack arguments
11825 So we add three words to get to the stack arguments.
11826
11827 32-bit:
11828 fp -> old fp value
11829 return address within this function
11830 first argument to __morestack
11831 second argument to __morestack
11832 return address of caller of this function
11833 stack arguments
11834 So we add five words to get to the stack arguments.
11835 */
11836 words = TARGET_64BIT ? 3 : 5;
11837 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11838 gen_rtx_PLUS (Pmode, frame_reg,
11839 GEN_INT (words * UNITS_PER_WORD))));
11840
11841 varargs_label = gen_label_rtx ();
11842 emit_jump_insn (gen_jump (varargs_label));
11843 JUMP_LABEL (get_last_insn ()) = varargs_label;
11844
11845 emit_barrier ();
11846 }
11847
11848 emit_label (label);
11849 LABEL_NUSES (label) = 1;
11850
11851 /* If this function calls va_start, we now have to set the scratch
11852 register for the case where we do not call __morestack. In this
11853 case we need to set it based on the stack pointer. */
11854 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11855 {
11856 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11857 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11858 GEN_INT (UNITS_PER_WORD))));
11859
11860 emit_label (varargs_label);
11861 LABEL_NUSES (varargs_label) = 1;
11862 }
11863 }
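
/* Illustrative sketch, added for exposition and not part of the original
   sources: on 64-bit Linux the split-stack prologue expanded above is
   roughly of the following shape.  LIMIT_OFFSET stands for the
   target-dependent TCB slot holding the stack limit; FRAME_SIZE and
   ARGS_SIZE are the values passed in %r10 and %r11 above.

	cmpq	%fs:LIMIT_OFFSET, %rsp	# enough stack left?
	jae	.Lhave_stack		# marked as very likely taken
	movq	$FRAME_SIZE, %r10	# desired stack space
	movq	$ARGS_SIZE, %r11	# size of arguments to copy
	callq	__morestack
	retq				# __morestack resumes after this
   .Lhave_stack:
	...normal prologue and function body...  */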
11864
11865 /* We may have to tell the dataflow pass that the split stack prologue
11866 is initializing a scratch register. */
11867
11868 static void
11869 ix86_live_on_entry (bitmap regs)
11870 {
11871 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11872 {
11873 gcc_assert (flag_split_stack);
11874 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11875 }
11876 }
11877 \f
11878 /* Extract the parts of an RTL expression that is a valid memory address
11879 for an instruction. Return 0 if the structure of the address is
11880 grossly off. Return -1 if the address contains ASHIFT, so it is not
11881 strictly valid, but is still used for computing the length of an lea instruction. */
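
/* Added illustrative example (not from the original sources): a canonical
   address such as

     (plus (plus (mult (reg:SI %eax) (const_int 4)) (reg:SI %ebx))
	   (const_int 16))

   i.e. 16(%ebx,%eax,4), decomposes into out->base = %ebx,
   out->index = %eax, out->scale = 4 and out->disp = (const_int 16),
   while an lea-only form like (ashift (reg:SI %eax) (const_int 2))
   yields index = %eax, scale = 4 and a return value of -1.  */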
11882
11883 int
11884 ix86_decompose_address (rtx addr, struct ix86_address *out)
11885 {
11886 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11887 rtx base_reg, index_reg;
11888 HOST_WIDE_INT scale = 1;
11889 rtx scale_rtx = NULL_RTX;
11890 rtx tmp;
11891 int retval = 1;
11892 enum ix86_address_seg seg = SEG_DEFAULT;
11893
11894 /* Allow zero-extended SImode addresses; they will be
11895 emitted with an addr32 prefix. */
11896 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11897 {
11898 if (GET_CODE (addr) == ZERO_EXTEND
11899 && GET_MODE (XEXP (addr, 0)) == SImode)
11900 {
11901 addr = XEXP (addr, 0);
11902 if (CONST_INT_P (addr))
11903 return 0;
11904 }
11905 else if (GET_CODE (addr) == AND
11906 && const_32bit_mask (XEXP (addr, 1), DImode))
11907 {
11908 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11909 if (addr == NULL_RTX)
11910 return 0;
11911
11912 if (CONST_INT_P (addr))
11913 return 0;
11914 }
11915 }
11916
11917 /* Allow SImode subregs of DImode addresses; they will be
11918 emitted with an addr32 prefix. */
11919 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11920 {
11921 if (GET_CODE (addr) == SUBREG
11922 && GET_MODE (SUBREG_REG (addr)) == DImode)
11923 {
11924 addr = SUBREG_REG (addr);
11925 if (CONST_INT_P (addr))
11926 return 0;
11927 }
11928 }
11929
11930 if (REG_P (addr))
11931 base = addr;
11932 else if (GET_CODE (addr) == SUBREG)
11933 {
11934 if (REG_P (SUBREG_REG (addr)))
11935 base = addr;
11936 else
11937 return 0;
11938 }
11939 else if (GET_CODE (addr) == PLUS)
11940 {
11941 rtx addends[4], op;
11942 int n = 0, i;
11943
11944 op = addr;
11945 do
11946 {
11947 if (n >= 4)
11948 return 0;
11949 addends[n++] = XEXP (op, 1);
11950 op = XEXP (op, 0);
11951 }
11952 while (GET_CODE (op) == PLUS);
11953 if (n >= 4)
11954 return 0;
11955 addends[n] = op;
11956
11957 for (i = n; i >= 0; --i)
11958 {
11959 op = addends[i];
11960 switch (GET_CODE (op))
11961 {
11962 case MULT:
11963 if (index)
11964 return 0;
11965 index = XEXP (op, 0);
11966 scale_rtx = XEXP (op, 1);
11967 break;
11968
11969 case ASHIFT:
11970 if (index)
11971 return 0;
11972 index = XEXP (op, 0);
11973 tmp = XEXP (op, 1);
11974 if (!CONST_INT_P (tmp))
11975 return 0;
11976 scale = INTVAL (tmp);
11977 if ((unsigned HOST_WIDE_INT) scale > 3)
11978 return 0;
11979 scale = 1 << scale;
11980 break;
11981
11982 case ZERO_EXTEND:
11983 op = XEXP (op, 0);
11984 if (GET_CODE (op) != UNSPEC)
11985 return 0;
11986 /* FALLTHRU */
11987
11988 case UNSPEC:
11989 if (XINT (op, 1) == UNSPEC_TP
11990 && TARGET_TLS_DIRECT_SEG_REFS
11991 && seg == SEG_DEFAULT)
11992 seg = DEFAULT_TLS_SEG_REG;
11993 else
11994 return 0;
11995 break;
11996
11997 case SUBREG:
11998 if (!REG_P (SUBREG_REG (op)))
11999 return 0;
12000 /* FALLTHRU */
12001
12002 case REG:
12003 if (!base)
12004 base = op;
12005 else if (!index)
12006 index = op;
12007 else
12008 return 0;
12009 break;
12010
12011 case CONST:
12012 case CONST_INT:
12013 case SYMBOL_REF:
12014 case LABEL_REF:
12015 if (disp)
12016 return 0;
12017 disp = op;
12018 break;
12019
12020 default:
12021 return 0;
12022 }
12023 }
12024 }
12025 else if (GET_CODE (addr) == MULT)
12026 {
12027 index = XEXP (addr, 0); /* index*scale */
12028 scale_rtx = XEXP (addr, 1);
12029 }
12030 else if (GET_CODE (addr) == ASHIFT)
12031 {
12032 /* We're called for lea too, which implements ashift on occasion. */
12033 index = XEXP (addr, 0);
12034 tmp = XEXP (addr, 1);
12035 if (!CONST_INT_P (tmp))
12036 return 0;
12037 scale = INTVAL (tmp);
12038 if ((unsigned HOST_WIDE_INT) scale > 3)
12039 return 0;
12040 scale = 1 << scale;
12041 retval = -1;
12042 }
12043 else
12044 disp = addr; /* displacement */
12045
12046 if (index)
12047 {
12048 if (REG_P (index))
12049 ;
12050 else if (GET_CODE (index) == SUBREG
12051 && REG_P (SUBREG_REG (index)))
12052 ;
12053 else
12054 return 0;
12055 }
12056
12057 /* Extract the integral value of scale. */
12058 if (scale_rtx)
12059 {
12060 if (!CONST_INT_P (scale_rtx))
12061 return 0;
12062 scale = INTVAL (scale_rtx);
12063 }
12064
12065 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
12066 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
12067
12068 /* Avoid useless 0 displacement. */
12069 if (disp == const0_rtx && (base || index))
12070 disp = NULL_RTX;
12071
12072 /* Allow arg pointer and stack pointer as index if there is no scaling. */
12073 if (base_reg && index_reg && scale == 1
12074 && (index_reg == arg_pointer_rtx
12075 || index_reg == frame_pointer_rtx
12076 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
12077 {
12078 rtx tmp;
12079 tmp = base, base = index, index = tmp;
12080 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
12081 }
12082
12083 /* Special case: %ebp cannot be encoded as a base without a displacement.
12084 Similarly %r13. */
12085 if (!disp
12086 && base_reg
12087 && (base_reg == hard_frame_pointer_rtx
12088 || base_reg == frame_pointer_rtx
12089 || base_reg == arg_pointer_rtx
12090 || (REG_P (base_reg)
12091 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
12092 || REGNO (base_reg) == R13_REG))))
12093 disp = const0_rtx;
12094
12095 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
12096 Avoid this by transforming to [%esi+0].
12097 Reload calls address legitimization without cfun defined, so we need
12098 to test cfun for being non-NULL. */
12099 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
12100 && base_reg && !index_reg && !disp
12101 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
12102 disp = const0_rtx;
12103
12104 /* Special case: encode reg+reg instead of reg*2. */
12105 if (!base && index && scale == 2)
12106 base = index, base_reg = index_reg, scale = 1;
12107
12108 /* Special case: scaling cannot be encoded without base or displacement. */
12109 if (!base && !disp && index && scale != 1)
12110 disp = const0_rtx;
12111
12112 out->base = base;
12113 out->index = index;
12114 out->disp = disp;
12115 out->scale = scale;
12116 out->seg = seg;
12117
12118 return retval;
12119 }
12120 \f
12121 /* Return the cost of the memory address X.
12122 For i386, it is better to use a complex address than let gcc copy
12123 the address into a reg and make a new pseudo. But not if the address
12124 requires two regs - that would mean more pseudos with longer
12125 lifetimes. */
12126 static int
12127 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
12128 addr_space_t as ATTRIBUTE_UNUSED,
12129 bool speed ATTRIBUTE_UNUSED)
12130 {
12131 struct ix86_address parts;
12132 int cost = 1;
12133 int ok = ix86_decompose_address (x, &parts);
12134
12135 gcc_assert (ok);
12136
12137 if (parts.base && GET_CODE (parts.base) == SUBREG)
12138 parts.base = SUBREG_REG (parts.base);
12139 if (parts.index && GET_CODE (parts.index) == SUBREG)
12140 parts.index = SUBREG_REG (parts.index);
12141
12142 /* Attempt to minimize number of registers in the address. */
12143 if ((parts.base
12144 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
12145 || (parts.index
12146 && (!REG_P (parts.index)
12147 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
12148 cost++;
12149
12150 if (parts.base
12151 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
12152 && parts.index
12153 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
12154 && parts.base != parts.index)
12155 cost++;
12156
12157 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
12158 since its predecode logic can't detect the length of such instructions
12159 and they degenerate to vector decoding. Increase the cost of such
12160 addresses here. The penalty is at least 2 cycles. It may be worthwhile
12161 to split such addresses or even refuse them entirely.
12162 
12163 The following addressing modes are affected:
12164 [base+scale*index]
12165 [scale*index+disp]
12166 [base+index]
12167 
12168 The first and last cases may be avoidable by explicitly coding the zero
12169 into the memory address, but I don't have an AMD-K6 machine handy to
12170 check this theory. */
12171
12172 if (TARGET_K6
12173 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
12174 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
12175 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
12176 cost += 10;
12177
12178 return cost;
12179 }
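
/* Added worked example (for exposition only): an address built from two
   distinct pseudo registers, e.g. (plus (reg 100) (reg 101)), costs
   1 + 1 + 1 = 3, whereas (plus (reg %ebx) (const_int 8)) keeps the base
   cost of 1; on K6 the penalized forms described above add another 10.  */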
12180 \f
12181 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
12182 this is used to form addresses of local data when -fPIC is in
12183 use. */
12184
12185 static bool
12186 darwin_local_data_pic (rtx disp)
12187 {
12188 return (GET_CODE (disp) == UNSPEC
12189 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
12190 }
12191
12192 /* Determine if a given RTX is a valid constant. We already know this
12193 satisfies CONSTANT_P. */
12194
12195 static bool
12196 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
12197 {
12198 switch (GET_CODE (x))
12199 {
12200 case CONST:
12201 x = XEXP (x, 0);
12202
12203 if (GET_CODE (x) == PLUS)
12204 {
12205 if (!CONST_INT_P (XEXP (x, 1)))
12206 return false;
12207 x = XEXP (x, 0);
12208 }
12209
12210 if (TARGET_MACHO && darwin_local_data_pic (x))
12211 return true;
12212
12213 /* Only some unspecs are valid as "constants". */
12214 if (GET_CODE (x) == UNSPEC)
12215 switch (XINT (x, 1))
12216 {
12217 case UNSPEC_GOT:
12218 case UNSPEC_GOTOFF:
12219 case UNSPEC_PLTOFF:
12220 return TARGET_64BIT;
12221 case UNSPEC_TPOFF:
12222 case UNSPEC_NTPOFF:
12223 x = XVECEXP (x, 0, 0);
12224 return (GET_CODE (x) == SYMBOL_REF
12225 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12226 case UNSPEC_DTPOFF:
12227 x = XVECEXP (x, 0, 0);
12228 return (GET_CODE (x) == SYMBOL_REF
12229 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12230 default:
12231 return false;
12232 }
12233
12234 /* We must have drilled down to a symbol. */
12235 if (GET_CODE (x) == LABEL_REF)
12236 return true;
12237 if (GET_CODE (x) != SYMBOL_REF)
12238 return false;
12239 /* FALLTHRU */
12240
12241 case SYMBOL_REF:
12242 /* TLS symbols are never valid. */
12243 if (SYMBOL_REF_TLS_MODEL (x))
12244 return false;
12245
12246 /* DLLIMPORT symbols are never valid. */
12247 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12248 && SYMBOL_REF_DLLIMPORT_P (x))
12249 return false;
12250
12251 #if TARGET_MACHO
12252 /* mdynamic-no-pic */
12253 if (MACHO_DYNAMIC_NO_PIC_P)
12254 return machopic_symbol_defined_p (x);
12255 #endif
12256 break;
12257
12258 case CONST_DOUBLE:
12259 if (GET_MODE (x) == TImode
12260 && x != CONST0_RTX (TImode)
12261 && !TARGET_64BIT)
12262 return false;
12263 break;
12264
12265 case CONST_VECTOR:
12266 if (!standard_sse_constant_p (x))
12267 return false;
12268
12269 default:
12270 break;
12271 }
12272
12273 /* Otherwise we handle everything else in the move patterns. */
12274 return true;
12275 }
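
/* Added illustrative examples (for exposition only): a plain
   (symbol_ref "x") or (const_int 42) is a legitimate constant here,
   while a TLS symbol, a dllimport symbol, or a nonzero TImode
   CONST_DOUBLE in 32-bit mode is not and must first be legitimized
   (e.g. via legitimize_tls_address) before a move can use it.  */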
12276
12277 /* Determine if it's legal to put X into the constant pool. This
12278 is not possible for the address of thread-local symbols, which
12279 is checked above. */
12280
12281 static bool
12282 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12283 {
12284 /* We can always put integral constants and vectors in memory. */
12285 switch (GET_CODE (x))
12286 {
12287 case CONST_INT:
12288 case CONST_DOUBLE:
12289 case CONST_VECTOR:
12290 return false;
12291
12292 default:
12293 break;
12294 }
12295 return !ix86_legitimate_constant_p (mode, x);
12296 }
12297
12298 /* Return true if the symbol is marked as dllimport or as a stub
12299 variable, otherwise false. */
12300
12301 static bool
12302 is_imported_p (rtx x)
12303 {
12304 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
12305 || GET_CODE (x) != SYMBOL_REF)
12306 return false;
12307
12308 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
12309 }
12310
12311
12312 /* Nonzero if the constant value X is a legitimate general operand
12313 when generating PIC code. It is given that flag_pic is on and
12314 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12315
12316 bool
12317 legitimate_pic_operand_p (rtx x)
12318 {
12319 rtx inner;
12320
12321 switch (GET_CODE (x))
12322 {
12323 case CONST:
12324 inner = XEXP (x, 0);
12325 if (GET_CODE (inner) == PLUS
12326 && CONST_INT_P (XEXP (inner, 1)))
12327 inner = XEXP (inner, 0);
12328
12329 /* Only some unspecs are valid as "constants". */
12330 if (GET_CODE (inner) == UNSPEC)
12331 switch (XINT (inner, 1))
12332 {
12333 case UNSPEC_GOT:
12334 case UNSPEC_GOTOFF:
12335 case UNSPEC_PLTOFF:
12336 return TARGET_64BIT;
12337 case UNSPEC_TPOFF:
12338 x = XVECEXP (inner, 0, 0);
12339 return (GET_CODE (x) == SYMBOL_REF
12340 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12341 case UNSPEC_MACHOPIC_OFFSET:
12342 return legitimate_pic_address_disp_p (x);
12343 default:
12344 return false;
12345 }
12346 /* FALLTHRU */
12347
12348 case SYMBOL_REF:
12349 case LABEL_REF:
12350 return legitimate_pic_address_disp_p (x);
12351
12352 default:
12353 return true;
12354 }
12355 }
12356
12357 /* Determine if a given CONST RTX is a valid memory displacement
12358 in PIC mode. */
12359
12360 bool
12361 legitimate_pic_address_disp_p (rtx disp)
12362 {
12363 bool saw_plus;
12364
12365 /* In 64bit mode we can allow direct addresses of symbols and labels
12366 when they are not dynamic symbols. */
12367 if (TARGET_64BIT)
12368 {
12369 rtx op0 = disp, op1;
12370
12371 switch (GET_CODE (disp))
12372 {
12373 case LABEL_REF:
12374 return true;
12375
12376 case CONST:
12377 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12378 break;
12379 op0 = XEXP (XEXP (disp, 0), 0);
12380 op1 = XEXP (XEXP (disp, 0), 1);
12381 if (!CONST_INT_P (op1)
12382 || INTVAL (op1) >= 16*1024*1024
12383 || INTVAL (op1) < -16*1024*1024)
12384 break;
12385 if (GET_CODE (op0) == LABEL_REF)
12386 return true;
12387 if (GET_CODE (op0) == CONST
12388 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12389 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12390 return true;
12391 if (GET_CODE (op0) == UNSPEC
12392 && XINT (op0, 1) == UNSPEC_PCREL)
12393 return true;
12394 if (GET_CODE (op0) != SYMBOL_REF)
12395 break;
12396 /* FALLTHRU */
12397
12398 case SYMBOL_REF:
12399 /* TLS references should always be enclosed in UNSPEC.
12400 A dllimported symbol must always be resolved. */
12401 if (SYMBOL_REF_TLS_MODEL (op0)
12402 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
12403 return false;
12404
12405 if (TARGET_PECOFF)
12406 {
12407 if (is_imported_p (op0))
12408 return true;
12409
12410 if (SYMBOL_REF_FAR_ADDR_P (op0)
12411 || !SYMBOL_REF_LOCAL_P (op0))
12412 break;
12413
12414 /* Function symbols need to be resolved only for the
12415 large model.
12416 For the small model we don't need to resolve anything
12417 here. */
12418 if ((ix86_cmodel != CM_LARGE_PIC
12419 && SYMBOL_REF_FUNCTION_P (op0))
12420 || ix86_cmodel == CM_SMALL_PIC)
12421 return true;
12422 /* Non-external symbols don't need to be resolved for
12423 the large and medium models. */
12424 if ((ix86_cmodel == CM_LARGE_PIC
12425 || ix86_cmodel == CM_MEDIUM_PIC)
12426 && !SYMBOL_REF_EXTERNAL_P (op0))
12427 return true;
12428 }
12429 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
12430 && SYMBOL_REF_LOCAL_P (op0)
12431 && ix86_cmodel != CM_LARGE_PIC)
12432 return true;
12433 break;
12434
12435 default:
12436 break;
12437 }
12438 }
12439 if (GET_CODE (disp) != CONST)
12440 return false;
12441 disp = XEXP (disp, 0);
12442
12443 if (TARGET_64BIT)
12444 {
12445 /* It is unsafe to allow PLUS expressions; this limits the allowed
12446 distance of GOT table entries. We should not need these anyway. */
12447 if (GET_CODE (disp) != UNSPEC
12448 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12449 && XINT (disp, 1) != UNSPEC_GOTOFF
12450 && XINT (disp, 1) != UNSPEC_PCREL
12451 && XINT (disp, 1) != UNSPEC_PLTOFF))
12452 return false;
12453
12454 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12455 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12456 return false;
12457 return true;
12458 }
12459
12460 saw_plus = false;
12461 if (GET_CODE (disp) == PLUS)
12462 {
12463 if (!CONST_INT_P (XEXP (disp, 1)))
12464 return false;
12465 disp = XEXP (disp, 0);
12466 saw_plus = true;
12467 }
12468
12469 if (TARGET_MACHO && darwin_local_data_pic (disp))
12470 return true;
12471
12472 if (GET_CODE (disp) != UNSPEC)
12473 return false;
12474
12475 switch (XINT (disp, 1))
12476 {
12477 case UNSPEC_GOT:
12478 if (saw_plus)
12479 return false;
12480 /* We need to check for both symbols and labels because VxWorks loads
12481 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12482 details. */
12483 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12484 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12485 case UNSPEC_GOTOFF:
12486 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12487 While the ABI also specifies a 32bit relocation, we don't produce
12488 it in the small PIC model at all. */
12489 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12490 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12491 && !TARGET_64BIT)
12492 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12493 return false;
12494 case UNSPEC_GOTTPOFF:
12495 case UNSPEC_GOTNTPOFF:
12496 case UNSPEC_INDNTPOFF:
12497 if (saw_plus)
12498 return false;
12499 disp = XVECEXP (disp, 0, 0);
12500 return (GET_CODE (disp) == SYMBOL_REF
12501 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12502 case UNSPEC_NTPOFF:
12503 disp = XVECEXP (disp, 0, 0);
12504 return (GET_CODE (disp) == SYMBOL_REF
12505 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12506 case UNSPEC_DTPOFF:
12507 disp = XVECEXP (disp, 0, 0);
12508 return (GET_CODE (disp) == SYMBOL_REF
12509 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12510 }
12511
12512 return false;
12513 }
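
/* Added illustrative examples (for exposition only): in 32-bit PIC code a
   displacement of the form

     (const (unspec [(symbol_ref "x")] UNSPEC_GOTOFF))		   x@GOTOFF
     (const (plus (unspec [(symbol_ref "x")] UNSPEC_GOTOFF)
		  (const_int 4)))				   x@GOTOFF+4

   is typically accepted, while a bare (symbol_ref "x") is rejected and
   must first be wrapped by legitimize_pic_address.  */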
12514
12515 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12516 replace the input X, or the original X if no replacement is called for.
12517 The output parameter *WIN is 1 if the calling macro should goto WIN,
12518 0 if it should not. */
12519
12520 bool
12521 ix86_legitimize_reload_address (rtx x,
12522 enum machine_mode mode ATTRIBUTE_UNUSED,
12523 int opnum, int type,
12524 int ind_levels ATTRIBUTE_UNUSED)
12525 {
12526 /* Reload can generate:
12527
12528 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12529 (reg:DI 97))
12530 (reg:DI 2 cx))
12531
12532 This RTX is rejected by ix86_legitimate_address_p due to
12533 the non-strictness of base register 97. Following this rejection,
12534 reload pushes all three components into separate registers,
12535 creating an invalid memory address RTX.
12536 
12537 The following code reloads only the invalid part of the
12538 memory address RTX. */
12539
12540 if (GET_CODE (x) == PLUS
12541 && REG_P (XEXP (x, 1))
12542 && GET_CODE (XEXP (x, 0)) == PLUS
12543 && REG_P (XEXP (XEXP (x, 0), 1)))
12544 {
12545 rtx base, index;
12546 bool something_reloaded = false;
12547
12548 base = XEXP (XEXP (x, 0), 1);
12549 if (!REG_OK_FOR_BASE_STRICT_P (base))
12550 {
12551 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12552 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12553 opnum, (enum reload_type) type);
12554 something_reloaded = true;
12555 }
12556
12557 index = XEXP (x, 1);
12558 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12559 {
12560 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12561 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12562 opnum, (enum reload_type) type);
12563 something_reloaded = true;
12564 }
12565
12566 gcc_assert (something_reloaded);
12567 return true;
12568 }
12569
12570 return false;
12571 }
12572
12573 /* Determine if OP is a suitable RTX for an address register.
12574 Return the naked register if a register or a register subreg is
12575 found, otherwise return NULL_RTX. */
12576
12577 static rtx
12578 ix86_validate_address_register (rtx op)
12579 {
12580 enum machine_mode mode = GET_MODE (op);
12581
12582 /* Only SImode or DImode registers can form the address. */
12583 if (mode != SImode && mode != DImode)
12584 return NULL_RTX;
12585
12586 if (REG_P (op))
12587 return op;
12588 else if (GET_CODE (op) == SUBREG)
12589 {
12590 rtx reg = SUBREG_REG (op);
12591
12592 if (!REG_P (reg))
12593 return NULL_RTX;
12594
12595 mode = GET_MODE (reg);
12596
12597 /* Don't allow SUBREGs that span more than a word. It can
12598 lead to spill failures when the register is one word out
12599 of a two word structure. */
12600 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
12601 return NULL_RTX;
12602
12603 /* Allow only SUBREGs of non-eliminable hard registers. */
12604 if (register_no_elim_operand (reg, mode))
12605 return reg;
12606 }
12607
12608 /* Op is not a register. */
12609 return NULL_RTX;
12610 }
12611
12612 /* Recognizes RTL expressions that are valid memory addresses for an
12613 instruction. The MODE argument is the machine mode for the MEM
12614 expression that wants to use this address.
12615
12616 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12617 convert common non-canonical forms to canonical form so that they will
12618 be recognized. */
12619
12620 static bool
12621 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12622 rtx addr, bool strict)
12623 {
12624 struct ix86_address parts;
12625 rtx base, index, disp;
12626 HOST_WIDE_INT scale;
12627 enum ix86_address_seg seg;
12628
12629 if (ix86_decompose_address (addr, &parts) <= 0)
12630 /* Decomposition failed. */
12631 return false;
12632
12633 base = parts.base;
12634 index = parts.index;
12635 disp = parts.disp;
12636 scale = parts.scale;
12637 seg = parts.seg;
12638
12639 /* Validate base register. */
12640 if (base)
12641 {
12642 rtx reg = ix86_validate_address_register (base);
12643
12644 if (reg == NULL_RTX)
12645 return false;
12646
12647 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12648 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12649 /* Base is not valid. */
12650 return false;
12651 }
12652
12653 /* Validate index register. */
12654 if (index)
12655 {
12656 rtx reg = ix86_validate_address_register (index);
12657
12658 if (reg == NULL_RTX)
12659 return false;
12660
12661 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12662 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12663 /* Index is not valid. */
12664 return false;
12665 }
12666
12667 /* Index and base should have the same mode. */
12668 if (base && index
12669 && GET_MODE (base) != GET_MODE (index))
12670 return false;
12671
12672 /* Address override works only on the (%reg) part of %fs:(%reg). */
12673 if (seg != SEG_DEFAULT
12674 && ((base && GET_MODE (base) != word_mode)
12675 || (index && GET_MODE (index) != word_mode)))
12676 return false;
12677
12678 /* Validate scale factor. */
12679 if (scale != 1)
12680 {
12681 if (!index)
12682 /* Scale without index. */
12683 return false;
12684
12685 if (scale != 2 && scale != 4 && scale != 8)
12686 /* Scale is not a valid multiplier. */
12687 return false;
12688 }
12689
12690 /* Validate displacement. */
12691 if (disp)
12692 {
12693 if (GET_CODE (disp) == CONST
12694 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12695 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12696 switch (XINT (XEXP (disp, 0), 1))
12697 {
12698 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12699 used. While the ABI also specifies 32bit relocations, we don't
12700 produce them at all and use IP-relative addressing instead. */
12701 case UNSPEC_GOT:
12702 case UNSPEC_GOTOFF:
12703 gcc_assert (flag_pic);
12704 if (!TARGET_64BIT)
12705 goto is_legitimate_pic;
12706
12707 /* 64bit address unspec. */
12708 return false;
12709
12710 case UNSPEC_GOTPCREL:
12711 case UNSPEC_PCREL:
12712 gcc_assert (flag_pic);
12713 goto is_legitimate_pic;
12714
12715 case UNSPEC_GOTTPOFF:
12716 case UNSPEC_GOTNTPOFF:
12717 case UNSPEC_INDNTPOFF:
12718 case UNSPEC_NTPOFF:
12719 case UNSPEC_DTPOFF:
12720 break;
12721
12722 case UNSPEC_STACK_CHECK:
12723 gcc_assert (flag_split_stack);
12724 break;
12725
12726 default:
12727 /* Invalid address unspec. */
12728 return false;
12729 }
12730
12731 else if (SYMBOLIC_CONST (disp)
12732 && (flag_pic
12733 || (TARGET_MACHO
12734 #if TARGET_MACHO
12735 && MACHOPIC_INDIRECT
12736 && !machopic_operand_p (disp)
12737 #endif
12738 )))
12739 {
12740
12741 is_legitimate_pic:
12742 if (TARGET_64BIT && (index || base))
12743 {
12744 /* foo@dtpoff(%rX) is ok. */
12745 if (GET_CODE (disp) != CONST
12746 || GET_CODE (XEXP (disp, 0)) != PLUS
12747 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12748 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12749 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12750 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12751 /* Non-constant pic memory reference. */
12752 return false;
12753 }
12754 else if ((!TARGET_MACHO || flag_pic)
12755 && ! legitimate_pic_address_disp_p (disp))
12756 /* Displacement is an invalid pic construct. */
12757 return false;
12758 #if TARGET_MACHO
12759 else if (MACHO_DYNAMIC_NO_PIC_P
12760 && !ix86_legitimate_constant_p (Pmode, disp))
12761 /* The displacement must be referenced via a non_lazy_pointer. */
12762 return false;
12763 #endif
12764
12765 /* This code used to verify that a symbolic pic displacement
12766 includes the pic_offset_table_rtx register.
12767
12768 While this is a good idea, unfortunately these constructs may
12769 be created by "adds using lea" optimization for incorrect
12770 code like:
12771
12772 int a;
12773 int foo(int i)
12774 {
12775 return *(&a+i);
12776 }
12777
12778 This code is nonsensical, but results in addressing the
12779 GOT table with a pic_offset_table_rtx base. We can't
12780 easily refuse it, since it gets matched by the
12781 "addsi3" pattern, which later gets split into an lea when
12782 the output register differs from the input. While this
12783 could be handled by a separate addsi pattern for this case
12784 that never results in an lea, disabling this test seems to
12785 be the simpler and correct fix for the crash. */
12786 }
12787 else if (GET_CODE (disp) != LABEL_REF
12788 && !CONST_INT_P (disp)
12789 && (GET_CODE (disp) != CONST
12790 || !ix86_legitimate_constant_p (Pmode, disp))
12791 && (GET_CODE (disp) != SYMBOL_REF
12792 || !ix86_legitimate_constant_p (Pmode, disp)))
12793 /* Displacement is not constant. */
12794 return false;
12795 else if (TARGET_64BIT
12796 && !x86_64_immediate_operand (disp, VOIDmode))
12797 /* Displacement is out of range. */
12798 return false;
12799 /* In x32 mode, constant addresses are sign extended to 64bit, so
12800 we have to reject addresses from 0x80000000 to 0xffffffff. */
12801 else if (TARGET_X32 && !(index || base)
12802 && CONST_INT_P (disp)
12803 && val_signbit_known_set_p (SImode, INTVAL (disp)))
12804 return false;
12805 }
12806
12807 /* Everything looks valid. */
12808 return true;
12809 }
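
/* Added illustrative examples (for exposition only): the checks above
   accept (plus (reg:SI %ebx) (mult (reg:SI %ecx) (const_int 4))),
   i.e. (%ebx,%ecx,4), but reject a scale of 3, a scale without an index,
   or, in 64-bit mode, a symbolic displacement that does not satisfy
   x86_64_immediate_operand.  */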
12810
12811 /* Determine if a given RTX is a valid constant address. */
12812
12813 bool
12814 constant_address_p (rtx x)
12815 {
12816 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12817 }
12818 \f
12819 /* Return a unique alias set for the GOT. */
12820
12821 static alias_set_type
12822 ix86_GOT_alias_set (void)
12823 {
12824 static alias_set_type set = -1;
12825 if (set == -1)
12826 set = new_alias_set ();
12827 return set;
12828 }
12829
12830 /* Return a legitimate reference for ORIG (an address) using the
12831 register REG. If REG is 0, a new pseudo is generated.
12832
12833 There are two types of references that must be handled:
12834
12835 1. Global data references must load the address from the GOT, via
12836 the PIC reg. An insn is emitted to do this load, and the reg is
12837 returned.
12838
12839 2. Static data references, constant pool addresses, and code labels
12840 compute the address as an offset from the GOT, whose base is in
12841 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12842 differentiate them from global data objects. The returned
12843 address is the PIC reg + an unspec constant.
12844
12845 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12846 reg also appears in the address. */
12847
12848 static rtx
12849 legitimize_pic_address (rtx orig, rtx reg)
12850 {
12851 rtx addr = orig;
12852 rtx new_rtx = orig;
12853
12854 #if TARGET_MACHO
12855 if (TARGET_MACHO && !TARGET_64BIT)
12856 {
12857 if (reg == 0)
12858 reg = gen_reg_rtx (Pmode);
12859 /* Use the generic Mach-O PIC machinery. */
12860 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12861 }
12862 #endif
12863
12864 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12865 {
12866 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12867 if (tmp)
12868 return tmp;
12869 }
12870
12871 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12872 new_rtx = addr;
12873 else if (TARGET_64BIT && !TARGET_PECOFF
12874 && ix86_cmodel != CM_SMALL_PIC && gotoff_operand (addr, Pmode))
12875 {
12876 rtx tmpreg;
12877 /* This symbol may be referenced via a displacement from the PIC
12878 base address (@GOTOFF). */
12879
12880 if (reload_in_progress)
12881 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12882 if (GET_CODE (addr) == CONST)
12883 addr = XEXP (addr, 0);
12884 if (GET_CODE (addr) == PLUS)
12885 {
12886 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12887 UNSPEC_GOTOFF);
12888 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12889 }
12890 else
12891 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12892 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12893 if (!reg)
12894 tmpreg = gen_reg_rtx (Pmode);
12895 else
12896 tmpreg = reg;
12897 emit_move_insn (tmpreg, new_rtx);
12898
12899 if (reg != 0)
12900 {
12901 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12902 tmpreg, 1, OPTAB_DIRECT);
12903 new_rtx = reg;
12904 }
12905 else
12906 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12907 }
12908 else if (!TARGET_64BIT && !TARGET_PECOFF && gotoff_operand (addr, Pmode))
12909 {
12910 /* This symbol may be referenced via a displacement from the PIC
12911 base address (@GOTOFF). */
12912
12913 if (reload_in_progress)
12914 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12915 if (GET_CODE (addr) == CONST)
12916 addr = XEXP (addr, 0);
12917 if (GET_CODE (addr) == PLUS)
12918 {
12919 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12920 UNSPEC_GOTOFF);
12921 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12922 }
12923 else
12924 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12925 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12926 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12927
12928 if (reg != 0)
12929 {
12930 emit_move_insn (reg, new_rtx);
12931 new_rtx = reg;
12932 }
12933 }
12934 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12935 /* We can't use @GOTOFF for text labels on VxWorks;
12936 see gotoff_operand. */
12937 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12938 {
12939 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12940 if (tmp)
12941 return tmp;
12942
12943 /* For x64 PE-COFF there is no GOT table, so we use the address
12944 directly. */
12945 if (TARGET_64BIT && TARGET_PECOFF)
12946 {
12947 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12948 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12949
12950 if (reg == 0)
12951 reg = gen_reg_rtx (Pmode);
12952 emit_move_insn (reg, new_rtx);
12953 new_rtx = reg;
12954 }
12955 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12956 {
12957 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12958 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12959 new_rtx = gen_const_mem (Pmode, new_rtx);
12960 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12961
12962 if (reg == 0)
12963 reg = gen_reg_rtx (Pmode);
12964 /* Use gen_movsi directly, otherwise the address is loaded
12965 into a register for CSE. We don't want to CSE these addresses;
12966 instead we CSE addresses loaded from the GOT table, so skip this. */
12967 emit_insn (gen_movsi (reg, new_rtx));
12968 new_rtx = reg;
12969 }
12970 else
12971 {
12972 /* This symbol must be referenced via a load from the
12973 Global Offset Table (@GOT). */
12974
12975 if (reload_in_progress)
12976 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12977 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12978 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12979 if (TARGET_64BIT)
12980 new_rtx = force_reg (Pmode, new_rtx);
12981 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12982 new_rtx = gen_const_mem (Pmode, new_rtx);
12983 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12984
12985 if (reg == 0)
12986 reg = gen_reg_rtx (Pmode);
12987 emit_move_insn (reg, new_rtx);
12988 new_rtx = reg;
12989 }
12990 }
12991 else
12992 {
12993 if (CONST_INT_P (addr)
12994 && !x86_64_immediate_operand (addr, VOIDmode))
12995 {
12996 if (reg)
12997 {
12998 emit_move_insn (reg, addr);
12999 new_rtx = reg;
13000 }
13001 else
13002 new_rtx = force_reg (Pmode, addr);
13003 }
13004 else if (GET_CODE (addr) == CONST)
13005 {
13006 addr = XEXP (addr, 0);
13007
13008 /* We must match what we generated before. Assume the only
13009 unspecs that can get here are ours. Not that we could do
13010 anything with them anyway.... */
13011 if (GET_CODE (addr) == UNSPEC
13012 || (GET_CODE (addr) == PLUS
13013 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
13014 return orig;
13015 gcc_assert (GET_CODE (addr) == PLUS);
13016 }
13017 if (GET_CODE (addr) == PLUS)
13018 {
13019 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
13020
13021 /* Check first to see if this is a constant offset from a @GOTOFF
13022 symbol reference. */
13023 if (!TARGET_PECOFF && gotoff_operand (op0, Pmode)
13024 && CONST_INT_P (op1))
13025 {
13026 if (!TARGET_64BIT)
13027 {
13028 if (reload_in_progress)
13029 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13030 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
13031 UNSPEC_GOTOFF);
13032 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
13033 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
13034 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
13035
13036 if (reg != 0)
13037 {
13038 emit_move_insn (reg, new_rtx);
13039 new_rtx = reg;
13040 }
13041 }
13042 else
13043 {
13044 if (INTVAL (op1) < -16*1024*1024
13045 || INTVAL (op1) >= 16*1024*1024)
13046 {
13047 if (!x86_64_immediate_operand (op1, Pmode))
13048 op1 = force_reg (Pmode, op1);
13049 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
13050 }
13051 }
13052 }
13053 else
13054 {
13055 rtx base = legitimize_pic_address (op0, reg);
13056 enum machine_mode mode = GET_MODE (base);
13057 new_rtx
13058 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
13059
13060 if (CONST_INT_P (new_rtx))
13061 {
13062 if (INTVAL (new_rtx) < -16*1024*1024
13063 || INTVAL (new_rtx) >= 16*1024*1024)
13064 {
13065 if (!x86_64_immediate_operand (new_rtx, mode))
13066 new_rtx = force_reg (mode, new_rtx);
13067 new_rtx
13068 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
13069 }
13070 else
13071 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
13072 }
13073 else
13074 {
13075 if (GET_CODE (new_rtx) == PLUS
13076 && CONSTANT_P (XEXP (new_rtx, 1)))
13077 {
13078 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
13079 new_rtx = XEXP (new_rtx, 1);
13080 }
13081 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
13082 }
13083 }
13084 }
13085 }
13086 return new_rtx;
13087 }
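
/* Added sketch of typical results (for exposition; the exact RTL depends
   on the target and code model): for a preemptible global symbol "x" in
   32-bit -fPIC code the function above emits a GOT load roughly like

     (set (reg) (mem (plus (reg pic_offset_table)
			   (const (unspec [(symbol_ref "x")] UNSPEC_GOT)))))

   i.e. movl x@GOT(%ebx), %reg, and returns that register, while a local
   symbol is addressed directly as pic_offset_table + x@GOTOFF with no
   memory load.  */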
13088 \f
13089 /* Load the thread pointer. If TO_REG is true, force it into a register. */
13090
13091 static rtx
13092 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
13093 {
13094 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
13095
13096 if (GET_MODE (tp) != tp_mode)
13097 {
13098 gcc_assert (GET_MODE (tp) == SImode);
13099 gcc_assert (tp_mode == DImode);
13100
13101 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
13102 }
13103
13104 if (to_reg)
13105 tp = copy_to_mode_reg (tp_mode, tp);
13106
13107 return tp;
13108 }
13109
13110 /* Construct the SYMBOL_REF for the tls_get_addr function. */
13111
13112 static GTY(()) rtx ix86_tls_symbol;
13113
13114 static rtx
13115 ix86_tls_get_addr (void)
13116 {
13117 if (!ix86_tls_symbol)
13118 {
13119 const char *sym
13120 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
13121 ? "___tls_get_addr" : "__tls_get_addr");
13122
13123 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
13124 }
13125
13126 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
13127 {
13128 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
13129 UNSPEC_PLTOFF);
13130 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
13131 gen_rtx_CONST (Pmode, unspec));
13132 }
13133
13134 return ix86_tls_symbol;
13135 }
13136
13137 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
13138
13139 static GTY(()) rtx ix86_tls_module_base_symbol;
13140
13141 rtx
13142 ix86_tls_module_base (void)
13143 {
13144 if (!ix86_tls_module_base_symbol)
13145 {
13146 ix86_tls_module_base_symbol
13147 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
13148
13149 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
13150 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
13151 }
13152
13153 return ix86_tls_module_base_symbol;
13154 }
13155
13156 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
13157 false if we expect this to be used for a memory address and true if
13158 we expect to load the address into a register. */
13159
13160 static rtx
13161 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
13162 {
13163 rtx dest, base, off;
13164 rtx pic = NULL_RTX, tp = NULL_RTX;
13165 enum machine_mode tp_mode = Pmode;
13166 int type;
13167
13168 switch (model)
13169 {
13170 case TLS_MODEL_GLOBAL_DYNAMIC:
13171 dest = gen_reg_rtx (Pmode);
13172
13173 if (!TARGET_64BIT)
13174 {
13175 if (flag_pic && !TARGET_PECOFF)
13176 pic = pic_offset_table_rtx;
13177 else
13178 {
13179 pic = gen_reg_rtx (Pmode);
13180 emit_insn (gen_set_got (pic));
13181 }
13182 }
13183
13184 if (TARGET_GNU2_TLS)
13185 {
13186 if (TARGET_64BIT)
13187 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
13188 else
13189 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
13190
13191 tp = get_thread_pointer (Pmode, true);
13192 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
13193
13194 if (GET_MODE (x) != Pmode)
13195 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13196
13197 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13198 }
13199 else
13200 {
13201 rtx caddr = ix86_tls_get_addr ();
13202
13203 if (TARGET_64BIT)
13204 {
13205 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13206 rtx insns;
13207
13208 start_sequence ();
13209 emit_call_insn
13210 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
13211 insns = get_insns ();
13212 end_sequence ();
13213
13214 if (GET_MODE (x) != Pmode)
13215 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13216
13217 RTL_CONST_CALL_P (insns) = 1;
13218 emit_libcall_block (insns, dest, rax, x);
13219 }
13220 else
13221 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
13222 }
13223 break;
13224
13225 case TLS_MODEL_LOCAL_DYNAMIC:
13226 base = gen_reg_rtx (Pmode);
13227
13228 if (!TARGET_64BIT)
13229 {
13230 if (flag_pic)
13231 pic = pic_offset_table_rtx;
13232 else
13233 {
13234 pic = gen_reg_rtx (Pmode);
13235 emit_insn (gen_set_got (pic));
13236 }
13237 }
13238
13239 if (TARGET_GNU2_TLS)
13240 {
13241 rtx tmp = ix86_tls_module_base ();
13242
13243 if (TARGET_64BIT)
13244 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
13245 else
13246 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
13247
13248 tp = get_thread_pointer (Pmode, true);
13249 set_unique_reg_note (get_last_insn (), REG_EQUAL,
13250 gen_rtx_MINUS (Pmode, tmp, tp));
13251 }
13252 else
13253 {
13254 rtx caddr = ix86_tls_get_addr ();
13255
13256 if (TARGET_64BIT)
13257 {
13258 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13259 rtx insns, eqv;
13260
13261 start_sequence ();
13262 emit_call_insn
13263 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
13264 insns = get_insns ();
13265 end_sequence ();
13266
13267 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
13268 share the LD_BASE result with other LD model accesses. */
13269 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
13270 UNSPEC_TLS_LD_BASE);
13271
13272 RTL_CONST_CALL_P (insns) = 1;
13273 emit_libcall_block (insns, base, rax, eqv);
13274 }
13275 else
13276 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
13277 }
13278
13279 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
13280 off = gen_rtx_CONST (Pmode, off);
13281
13282 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
13283
13284 if (TARGET_GNU2_TLS)
13285 {
13286 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
13287
13288 if (GET_MODE (x) != Pmode)
13289 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13290
13291 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13292 }
13293 break;
13294
13295 case TLS_MODEL_INITIAL_EXEC:
13296 if (TARGET_64BIT)
13297 {
13298 if (TARGET_SUN_TLS && !TARGET_X32)
13299 {
13300 /* The Sun linker took the AMD64 TLS spec literally
13301 and can only handle %rax as the destination of the
13302 initial-exec code sequence. */
13303
13304 dest = gen_reg_rtx (DImode);
13305 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
13306 return dest;
13307 }
13308
13309 /* Generate DImode references to avoid %fs:(%reg32)
13310 problems and the linker IE->LE relaxation bug. */
13311 tp_mode = DImode;
13312 pic = NULL;
13313 type = UNSPEC_GOTNTPOFF;
13314 }
13315 else if (flag_pic)
13316 {
13317 if (reload_in_progress)
13318 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13319 pic = pic_offset_table_rtx;
13320 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13321 }
13322 else if (!TARGET_ANY_GNU_TLS)
13323 {
13324 pic = gen_reg_rtx (Pmode);
13325 emit_insn (gen_set_got (pic));
13326 type = UNSPEC_GOTTPOFF;
13327 }
13328 else
13329 {
13330 pic = NULL;
13331 type = UNSPEC_INDNTPOFF;
13332 }
13333
13334 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13335 off = gen_rtx_CONST (tp_mode, off);
13336 if (pic)
13337 off = gen_rtx_PLUS (tp_mode, pic, off);
13338 off = gen_const_mem (tp_mode, off);
13339 set_mem_alias_set (off, ix86_GOT_alias_set ());
13340
13341 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13342 {
13343 base = get_thread_pointer (tp_mode,
13344 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13345 off = force_reg (tp_mode, off);
13346 return gen_rtx_PLUS (tp_mode, base, off);
13347 }
13348 else
13349 {
13350 base = get_thread_pointer (Pmode, true);
13351 dest = gen_reg_rtx (Pmode);
13352 emit_insn (ix86_gen_sub3 (dest, base, off));
13353 }
13354 break;
13355
13356 case TLS_MODEL_LOCAL_EXEC:
13357 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13358 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13359 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13360 off = gen_rtx_CONST (Pmode, off);
13361
13362 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13363 {
13364 base = get_thread_pointer (Pmode,
13365 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13366 return gen_rtx_PLUS (Pmode, base, off);
13367 }
13368 else
13369 {
13370 base = get_thread_pointer (Pmode, true);
13371 dest = gen_reg_rtx (Pmode);
13372 emit_insn (ix86_gen_sub3 (dest, base, off));
13373 }
13374 break;
13375
13376 default:
13377 gcc_unreachable ();
13378 }
13379
13380 return dest;
13381 }
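
/* Added, hedged sketch of the typical output (for exposition; the exact
   sequence depends on -fpic, TARGET_GNU2_TLS and 32/64-bit mode).  On
   x86-64 the models handled above roughly correspond to:

     local-exec:	access through %fs:x@tpoff
     initial-exec:	movq x@gottpoff(%rip), %reg; then %fs:(%reg)
     global-dynamic:	a __tls_get_addr call via the @tlsgd relocation
     local-dynamic:	one __tls_get_addr call for the module base,
			then x@dtpoff offsets from that base.  */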
13382
13383 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13384 to symbol DECL if BEIMPORT is true. Otherwise create or return the
13385 unique refptr-DECL symbol corresponding to symbol DECL. */
13386
13387 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13388 htab_t dllimport_map;
13389
13390 static tree
13391 get_dllimport_decl (tree decl, bool beimport)
13392 {
13393 struct tree_map *h, in;
13394 void **loc;
13395 const char *name;
13396 const char *prefix;
13397 size_t namelen, prefixlen;
13398 char *imp_name;
13399 tree to;
13400 rtx rtl;
13401
13402 if (!dllimport_map)
13403 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13404
13405 in.hash = htab_hash_pointer (decl);
13406 in.base.from = decl;
13407 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13408 h = (struct tree_map *) *loc;
13409 if (h)
13410 return h->to;
13411
13412 *loc = h = ggc_alloc_tree_map ();
13413 h->hash = in.hash;
13414 h->base.from = decl;
13415 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13416 VAR_DECL, NULL, ptr_type_node);
13417 DECL_ARTIFICIAL (to) = 1;
13418 DECL_IGNORED_P (to) = 1;
13419 DECL_EXTERNAL (to) = 1;
13420 TREE_READONLY (to) = 1;
13421
13422 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13423 name = targetm.strip_name_encoding (name);
13424 if (beimport)
13425 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13426 ? "*__imp_" : "*__imp__";
13427 else
13428 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
13429 namelen = strlen (name);
13430 prefixlen = strlen (prefix);
13431 imp_name = (char *) alloca (namelen + prefixlen + 1);
13432 memcpy (imp_name, prefix, prefixlen);
13433 memcpy (imp_name + prefixlen, name, namelen + 1);
13434
13435 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13436 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13437 SET_SYMBOL_REF_DECL (rtl, to);
13438 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
13439 if (!beimport)
13440 {
13441 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
13442 #ifdef SUB_TARGET_RECORD_STUB
13443 SUB_TARGET_RECORD_STUB (name);
13444 #endif
13445 }
13446
13447 rtl = gen_const_mem (Pmode, rtl);
13448 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13449
13450 SET_DECL_RTL (to, rtl);
13451 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13452
13453 return to;
13454 }
13455
13456 /* Expand SYMBOL into its corresponding far-address symbol.
13457 WANT_REG is true if we require the result be a register. */
13458
13459 static rtx
13460 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
13461 {
13462 tree imp_decl;
13463 rtx x;
13464
13465 gcc_assert (SYMBOL_REF_DECL (symbol));
13466 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
13467
13468 x = DECL_RTL (imp_decl);
13469 if (want_reg)
13470 x = force_reg (Pmode, x);
13471 return x;
13472 }
13473
13474 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13475 true if we require the result be a register. */
13476
13477 static rtx
13478 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13479 {
13480 tree imp_decl;
13481 rtx x;
13482
13483 gcc_assert (SYMBOL_REF_DECL (symbol));
13484 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
13485
13486 x = DECL_RTL (imp_decl);
13487 if (want_reg)
13488 x = force_reg (Pmode, x);
13489 return x;
13490 }
13491
13492 /* Expand ADDR into its corresponding dllimport or refptr symbol. INREG
13493 is true if we require the result be a register. */
13494
13495 static rtx
13496 legitimize_pe_coff_symbol (rtx addr, bool inreg)
13497 {
13498 if (!TARGET_PECOFF)
13499 return NULL_RTX;
13500
13501 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13502 {
13503 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
13504 return legitimize_dllimport_symbol (addr, inreg);
13505 if (GET_CODE (addr) == CONST
13506 && GET_CODE (XEXP (addr, 0)) == PLUS
13507 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13508 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
13509 {
13510 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
13511 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13512 }
13513 }
13514
13515 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
13516 return NULL_RTX;
13517 if (GET_CODE (addr) == SYMBOL_REF
13518 && !is_imported_p (addr)
13519 && SYMBOL_REF_EXTERNAL_P (addr)
13520 && SYMBOL_REF_DECL (addr))
13521 return legitimize_pe_coff_extern_decl (addr, inreg);
13522
13523 if (GET_CODE (addr) == CONST
13524 && GET_CODE (XEXP (addr, 0)) == PLUS
13525 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13526 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
13527 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
13528 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
13529 {
13530 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
13531 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13532 }
13533 return NULL_RTX;
13534 }
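
/* Added illustrative example (for exposition; the exact prefix depends on
   user_label_prefix): for __declspec(dllimport) int foo; a reference to
   foo is rewritten by the helpers above into a load through the import
   table slot, conceptually

     foo  ->  (mem (symbol_ref "*__imp_foo"))

   so that the linker-provided pointer __imp_foo is dereferenced at run
   time instead of addressing foo directly.  */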
13535
13536 /* Try machine-dependent ways of modifying an illegitimate address
13537 to be legitimate. If we find one, return the new, valid address.
13538 This macro is used in only one place: `memory_address' in explow.c.
13539
13540 OLDX is the address as it was before break_out_memory_refs was called.
13541 In some cases it is useful to look at this to decide what needs to be done.
13542
13543 It is always safe for this macro to do nothing. It exists to recognize
13544 opportunities to optimize the output.
13545
13546 For the 80386, we handle X+REG by loading X into a register R and
13547 using R+REG. R will go in a general reg and indexing will be used.
13548 However, if REG is a broken-out memory address or multiplication,
13549 nothing needs to be done because REG can certainly go in a general reg.
13550
13551 When -fpic is used, special handling is needed for symbolic references.
13552 See comments by legitimize_pic_address in i386.c for details. */
13553
13554 static rtx
13555 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13556 enum machine_mode mode)
13557 {
13558 int changed = 0;
13559 unsigned log;
13560
13561 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13562 if (log)
13563 return legitimize_tls_address (x, (enum tls_model) log, false);
13564 if (GET_CODE (x) == CONST
13565 && GET_CODE (XEXP (x, 0)) == PLUS
13566 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13567 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13568 {
13569 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13570 (enum tls_model) log, false);
13571 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13572 }
13573
13574 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13575 {
13576 rtx tmp = legitimize_pe_coff_symbol (x, true);
13577 if (tmp)
13578 return tmp;
13579 }
13580
13581 if (flag_pic && SYMBOLIC_CONST (x))
13582 return legitimize_pic_address (x, 0);
13583
13584 #if TARGET_MACHO
13585 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13586 return machopic_indirect_data_reference (x, 0);
13587 #endif
13588
13589 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13590 if (GET_CODE (x) == ASHIFT
13591 && CONST_INT_P (XEXP (x, 1))
13592 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13593 {
13594 changed = 1;
13595 log = INTVAL (XEXP (x, 1));
13596 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13597 GEN_INT (1 << log));
13598 }
13599
13600 if (GET_CODE (x) == PLUS)
13601 {
13602 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13603
13604 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13605 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13606 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13607 {
13608 changed = 1;
13609 log = INTVAL (XEXP (XEXP (x, 0), 1));
13610 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13611 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13612 GEN_INT (1 << log));
13613 }
13614
13615 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13616 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13617 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13618 {
13619 changed = 1;
13620 log = INTVAL (XEXP (XEXP (x, 1), 1));
13621 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13622 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13623 GEN_INT (1 << log));
13624 }
13625
13626 /* Put multiply first if it isn't already. */
13627 if (GET_CODE (XEXP (x, 1)) == MULT)
13628 {
13629 rtx tmp = XEXP (x, 0);
13630 XEXP (x, 0) = XEXP (x, 1);
13631 XEXP (x, 1) = tmp;
13632 changed = 1;
13633 }
13634
13635 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13636 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13637 created by virtual register instantiation, register elimination, and
13638 similar optimizations. */
13639 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13640 {
13641 changed = 1;
13642 x = gen_rtx_PLUS (Pmode,
13643 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13644 XEXP (XEXP (x, 1), 0)),
13645 XEXP (XEXP (x, 1), 1));
13646 }
13647
13648 /* Canonicalize
13649 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13650 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13651 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13652 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13653 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13654 && CONSTANT_P (XEXP (x, 1)))
13655 {
13656 rtx constant;
13657 rtx other = NULL_RTX;
13658
13659 if (CONST_INT_P (XEXP (x, 1)))
13660 {
13661 constant = XEXP (x, 1);
13662 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13663 }
13664 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13665 {
13666 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13667 other = XEXP (x, 1);
13668 }
13669 else
13670 constant = 0;
13671
13672 if (constant)
13673 {
13674 changed = 1;
13675 x = gen_rtx_PLUS (Pmode,
13676 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13677 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13678 plus_constant (Pmode, other,
13679 INTVAL (constant)));
13680 }
13681 }
13682
13683 if (changed && ix86_legitimate_address_p (mode, x, false))
13684 return x;
13685
13686 if (GET_CODE (XEXP (x, 0)) == MULT)
13687 {
13688 changed = 1;
13689 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13690 }
13691
13692 if (GET_CODE (XEXP (x, 1)) == MULT)
13693 {
13694 changed = 1;
13695 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13696 }
13697
13698 if (changed
13699 && REG_P (XEXP (x, 1))
13700 && REG_P (XEXP (x, 0)))
13701 return x;
13702
13703 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13704 {
13705 changed = 1;
13706 x = legitimize_pic_address (x, 0);
13707 }
13708
13709 if (changed && ix86_legitimate_address_p (mode, x, false))
13710 return x;
13711
13712 if (REG_P (XEXP (x, 0)))
13713 {
13714 rtx temp = gen_reg_rtx (Pmode);
13715 rtx val = force_operand (XEXP (x, 1), temp);
13716 if (val != temp)
13717 {
13718 val = convert_to_mode (Pmode, val, 1);
13719 emit_move_insn (temp, val);
13720 }
13721
13722 XEXP (x, 1) = temp;
13723 return x;
13724 }
13725
13726 else if (REG_P (XEXP (x, 1)))
13727 {
13728 rtx temp = gen_reg_rtx (Pmode);
13729 rtx val = force_operand (XEXP (x, 0), temp);
13730 if (val != temp)
13731 {
13732 val = convert_to_mode (Pmode, val, 1);
13733 emit_move_insn (temp, val);
13734 }
13735
13736 XEXP (x, 0) = temp;
13737 return x;
13738 }
13739 }
13740
13741 return x;
13742 }
13743 \f
13744 /* Print an integer constant expression in assembler syntax. Addition
13745 and subtraction are the only arithmetic that may appear in these
13746 expressions. FILE is the stdio stream to write to, X is the rtx, and
13747 CODE is the operand print code from the output string. */
13748
13749 static void
13750 output_pic_addr_const (FILE *file, rtx x, int code)
13751 {
13752 char buf[256];
13753
13754 switch (GET_CODE (x))
13755 {
13756 case PC:
13757 gcc_assert (flag_pic);
13758 putc ('.', file);
13759 break;
13760
13761 case SYMBOL_REF:
13762 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13763 output_addr_const (file, x);
13764 else
13765 {
13766 const char *name = XSTR (x, 0);
13767
13768 /* Mark the decl as referenced so that cgraph will
13769 output the function. */
13770 if (SYMBOL_REF_DECL (x))
13771 mark_decl_referenced (SYMBOL_REF_DECL (x));
13772
13773 #if TARGET_MACHO
13774 if (MACHOPIC_INDIRECT
13775 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13776 name = machopic_indirection_name (x, /*stub_p=*/true);
13777 #endif
13778 assemble_name (file, name);
13779 }
13780 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
13781 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13782 fputs ("@PLT", file);
13783 break;
13784
13785 case LABEL_REF:
13786 x = XEXP (x, 0);
13787 /* FALLTHRU */
13788 case CODE_LABEL:
13789 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13790 assemble_name (asm_out_file, buf);
13791 break;
13792
13793 case CONST_INT:
13794 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13795 break;
13796
13797 case CONST:
13798 /* This used to output parentheses around the expression,
13799 but that does not work on the 386 (either ATT or BSD assembler). */
13800 output_pic_addr_const (file, XEXP (x, 0), code);
13801 break;
13802
13803 case CONST_DOUBLE:
13804 if (GET_MODE (x) == VOIDmode)
13805 {
13806 /* We can use %d if the number is <32 bits and positive. */
13807 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13808 fprintf (file, "0x%lx%08lx",
13809 (unsigned long) CONST_DOUBLE_HIGH (x),
13810 (unsigned long) CONST_DOUBLE_LOW (x));
13811 else
13812 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13813 }
13814 else
13815 /* We can't handle floating point constants;
13816 TARGET_PRINT_OPERAND must handle them. */
13817 output_operand_lossage ("floating constant misused");
13818 break;
13819
13820 case PLUS:
13821 /* Some assemblers need integer constants to appear first. */
13822 if (CONST_INT_P (XEXP (x, 0)))
13823 {
13824 output_pic_addr_const (file, XEXP (x, 0), code);
13825 putc ('+', file);
13826 output_pic_addr_const (file, XEXP (x, 1), code);
13827 }
13828 else
13829 {
13830 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13831 output_pic_addr_const (file, XEXP (x, 1), code);
13832 putc ('+', file);
13833 output_pic_addr_const (file, XEXP (x, 0), code);
13834 }
13835 break;
13836
13837 case MINUS:
13838 if (!TARGET_MACHO)
13839 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13840 output_pic_addr_const (file, XEXP (x, 0), code);
13841 putc ('-', file);
13842 output_pic_addr_const (file, XEXP (x, 1), code);
13843 if (!TARGET_MACHO)
13844 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13845 break;
13846
13847 case UNSPEC:
13848 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13849 {
13850 bool f = i386_asm_output_addr_const_extra (file, x);
13851 gcc_assert (f);
13852 break;
13853 }
13854
13855 gcc_assert (XVECLEN (x, 0) == 1);
13856 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13857 switch (XINT (x, 1))
13858 {
13859 case UNSPEC_GOT:
13860 fputs ("@GOT", file);
13861 break;
13862 case UNSPEC_GOTOFF:
13863 fputs ("@GOTOFF", file);
13864 break;
13865 case UNSPEC_PLTOFF:
13866 fputs ("@PLTOFF", file);
13867 break;
13868 case UNSPEC_PCREL:
13869 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13870 "(%rip)" : "[rip]", file);
13871 break;
13872 case UNSPEC_GOTPCREL:
13873 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13874 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13875 break;
13876 case UNSPEC_GOTTPOFF:
13877 /* FIXME: This might be @TPOFF in Sun ld too. */
13878 fputs ("@gottpoff", file);
13879 break;
13880 case UNSPEC_TPOFF:
13881 fputs ("@tpoff", file);
13882 break;
13883 case UNSPEC_NTPOFF:
13884 if (TARGET_64BIT)
13885 fputs ("@tpoff", file);
13886 else
13887 fputs ("@ntpoff", file);
13888 break;
13889 case UNSPEC_DTPOFF:
13890 fputs ("@dtpoff", file);
13891 break;
13892 case UNSPEC_GOTNTPOFF:
13893 if (TARGET_64BIT)
13894 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13895 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13896 else
13897 fputs ("@gotntpoff", file);
13898 break;
13899 case UNSPEC_INDNTPOFF:
13900 fputs ("@indntpoff", file);
13901 break;
13902 #if TARGET_MACHO
13903 case UNSPEC_MACHOPIC_OFFSET:
13904 putc ('-', file);
13905 machopic_output_function_base_name (file);
13906 break;
13907 #endif
13908 default:
13909 output_operand_lossage ("invalid UNSPEC as operand");
13910 break;
13911 }
13912 break;
13913
13914 default:
13915 output_operand_lossage ("invalid expression as operand");
13916 }
13917 }
13918
13919 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13920 We need to emit DTP-relative relocations. */
13921
13922 static void ATTRIBUTE_UNUSED
13923 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13924 {
13925 fputs (ASM_LONG, file);
13926 output_addr_const (file, x);
13927 fputs ("@dtpoff", file);
13928 switch (size)
13929 {
13930 case 4:
13931 break;
13932 case 8:
13933 fputs (", 0", file);
13934 break;
13935 default:
13936 gcc_unreachable ();
13937 }
13938 }
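/* Example of the output (assuming ASM_LONG expands to the ".long"
   directive and "sym" is a placeholder symbol name): a 4-byte request
   emits ".long sym@dtpoff", and an 8-byte request emits
   ".long sym@dtpoff, 0" so that the upper half is zero.  */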
13939
13940 /* Return true if X is a representation of the PIC register. This copes
13941 with calls from ix86_find_base_term, where the register might have
13942 been replaced by a cselib value. */
13943
13944 static bool
13945 ix86_pic_register_p (rtx x)
13946 {
13947 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13948 return (pic_offset_table_rtx
13949 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13950 else
13951 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13952 }
13953
13954 /* Helper function for ix86_delegitimize_address.
13955 Attempt to delegitimize TLS local-exec accesses. */
13956
13957 static rtx
13958 ix86_delegitimize_tls_address (rtx orig_x)
13959 {
13960 rtx x = orig_x, unspec;
13961 struct ix86_address addr;
13962
13963 if (!TARGET_TLS_DIRECT_SEG_REFS)
13964 return orig_x;
13965 if (MEM_P (x))
13966 x = XEXP (x, 0);
13967 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13968 return orig_x;
13969 if (ix86_decompose_address (x, &addr) == 0
13970 || addr.seg != DEFAULT_TLS_SEG_REG
13971 || addr.disp == NULL_RTX
13972 || GET_CODE (addr.disp) != CONST)
13973 return orig_x;
13974 unspec = XEXP (addr.disp, 0);
13975 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13976 unspec = XEXP (unspec, 0);
13977 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13978 return orig_x;
13979 x = XVECEXP (unspec, 0, 0);
13980 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13981 if (unspec != XEXP (addr.disp, 0))
13982 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13983 if (addr.index)
13984 {
13985 rtx idx = addr.index;
13986 if (addr.scale != 1)
13987 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13988 x = gen_rtx_PLUS (Pmode, idx, x);
13989 }
13990 if (addr.base)
13991 x = gen_rtx_PLUS (Pmode, addr.base, x);
13992 if (MEM_P (orig_x))
13993 x = replace_equiv_address_nv (orig_x, x);
13994 return x;
13995 }
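/* Rough example of what is undone here ("sym" and the registers are
   placeholders): a local-exec access through the TLS segment register,
   whose displacement wraps the symbol in an UNSPEC_NTPOFF, comes back as
   base + index*scale + sym with the UNSPEC stripped, so debug output can
   refer to the plain symbol again.  */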
13996
13997 /* In the name of slightly smaller debug output, and to cater to
13998 general assembler lossage, recognize PIC+GOTOFF and turn it back
13999 into a direct symbol reference.
14000
14001 On Darwin, this is necessary to avoid a crash, because Darwin
14002 has a different PIC label for each routine but the DWARF debugging
14003 information is not associated with any particular routine, so it's
14004 necessary to remove references to the PIC label from RTL stored by
14005 the DWARF output code. */
14006
14007 static rtx
14008 ix86_delegitimize_address (rtx x)
14009 {
14010 rtx orig_x = delegitimize_mem_from_attrs (x);
14011 /* addend is NULL or some rtx if x is something+GOTOFF where
14012 something doesn't include the PIC register. */
14013 rtx addend = NULL_RTX;
14014 /* reg_addend is NULL or a multiple of some register. */
14015 rtx reg_addend = NULL_RTX;
14016 /* const_addend is NULL or a const_int. */
14017 rtx const_addend = NULL_RTX;
14018 /* This is the result, or NULL. */
14019 rtx result = NULL_RTX;
14020
14021 x = orig_x;
14022
14023 if (MEM_P (x))
14024 x = XEXP (x, 0);
14025
14026 if (TARGET_64BIT)
14027 {
14028 if (GET_CODE (x) == CONST
14029 && GET_CODE (XEXP (x, 0)) == PLUS
14030 && GET_MODE (XEXP (x, 0)) == Pmode
14031 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
14032 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
14033 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
14034 {
14035 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
14036 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
14037 if (MEM_P (orig_x))
14038 x = replace_equiv_address_nv (orig_x, x);
14039 return x;
14040 }
14041
14042 if (GET_CODE (x) == CONST
14043 && GET_CODE (XEXP (x, 0)) == UNSPEC
14044 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
14045 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
14046 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
14047 {
14048 x = XVECEXP (XEXP (x, 0), 0, 0);
14049 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
14050 {
14051 x = simplify_gen_subreg (GET_MODE (orig_x), x,
14052 GET_MODE (x), 0);
14053 if (x == NULL_RTX)
14054 return orig_x;
14055 }
14056 return x;
14057 }
14058
14059 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
14060 return ix86_delegitimize_tls_address (orig_x);
14061
14062 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
14063 and -mcmodel=medium -fpic. */
14064 }
14065
14066 if (GET_CODE (x) != PLUS
14067 || GET_CODE (XEXP (x, 1)) != CONST)
14068 return ix86_delegitimize_tls_address (orig_x);
14069
14070 if (ix86_pic_register_p (XEXP (x, 0)))
14071 /* %ebx + GOT/GOTOFF */
14072 ;
14073 else if (GET_CODE (XEXP (x, 0)) == PLUS)
14074 {
14075 /* %ebx + %reg * scale + GOT/GOTOFF */
14076 reg_addend = XEXP (x, 0);
14077 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
14078 reg_addend = XEXP (reg_addend, 1);
14079 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
14080 reg_addend = XEXP (reg_addend, 0);
14081 else
14082 {
14083 reg_addend = NULL_RTX;
14084 addend = XEXP (x, 0);
14085 }
14086 }
14087 else
14088 addend = XEXP (x, 0);
14089
14090 x = XEXP (XEXP (x, 1), 0);
14091 if (GET_CODE (x) == PLUS
14092 && CONST_INT_P (XEXP (x, 1)))
14093 {
14094 const_addend = XEXP (x, 1);
14095 x = XEXP (x, 0);
14096 }
14097
14098 if (GET_CODE (x) == UNSPEC
14099 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
14100 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
14101 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
14102 && !MEM_P (orig_x) && !addend)))
14103 result = XVECEXP (x, 0, 0);
14104
14105 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
14106 && !MEM_P (orig_x))
14107 result = XVECEXP (x, 0, 0);
14108
14109 if (! result)
14110 return ix86_delegitimize_tls_address (orig_x);
14111
14112 if (const_addend)
14113 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
14114 if (reg_addend)
14115 result = gen_rtx_PLUS (Pmode, reg_addend, result);
14116 if (addend)
14117 {
14118 /* If the rest of original X doesn't involve the PIC register, add
14119 addend and subtract pic_offset_table_rtx. This can happen e.g.
14120 for code like:
14121 leal (%ebx, %ecx, 4), %ecx
14122 ...
14123 movl foo@GOTOFF(%ecx), %edx
14124 in which case we return (%ecx - %ebx) + foo. */
14125 if (pic_offset_table_rtx)
14126 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
14127 pic_offset_table_rtx),
14128 result);
14129 else
14130 return orig_x;
14131 }
14132 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
14133 {
14134 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
14135 if (result == NULL_RTX)
14136 return orig_x;
14137 }
14138 return result;
14139 }
14140
14141 /* If X is a machine specific address (i.e. a symbol or label being
14142 referenced as a displacement from the GOT implemented using an
14143 UNSPEC), then return the base term. Otherwise return X. */
14144
14145 rtx
14146 ix86_find_base_term (rtx x)
14147 {
14148 rtx term;
14149
14150 if (TARGET_64BIT)
14151 {
14152 if (GET_CODE (x) != CONST)
14153 return x;
14154 term = XEXP (x, 0);
14155 if (GET_CODE (term) == PLUS
14156 && (CONST_INT_P (XEXP (term, 1))
14157 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
14158 term = XEXP (term, 0);
14159 if (GET_CODE (term) != UNSPEC
14160 || (XINT (term, 1) != UNSPEC_GOTPCREL
14161 && XINT (term, 1) != UNSPEC_PCREL))
14162 return x;
14163
14164 return XVECEXP (term, 0, 0);
14165 }
14166
14167 return ix86_delegitimize_address (x);
14168 }
14169 \f
14170 static void
14171 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
14172 bool fp, FILE *file)
14173 {
14174 const char *suffix;
14175
14176 if (mode == CCFPmode || mode == CCFPUmode)
14177 {
14178 code = ix86_fp_compare_code_to_integer (code);
14179 mode = CCmode;
14180 }
14181 if (reverse)
14182 code = reverse_condition (code);
14183
14184 switch (code)
14185 {
14186 case EQ:
14187 switch (mode)
14188 {
14189 case CCAmode:
14190 suffix = "a";
14191 break;
14192
14193 case CCCmode:
14194 suffix = "c";
14195 break;
14196
14197 case CCOmode:
14198 suffix = "o";
14199 break;
14200
14201 case CCSmode:
14202 suffix = "s";
14203 break;
14204
14205 default:
14206 suffix = "e";
14207 }
14208 break;
14209 case NE:
14210 switch (mode)
14211 {
14212 case CCAmode:
14213 suffix = "na";
14214 break;
14215
14216 case CCCmode:
14217 suffix = "nc";
14218 break;
14219
14220 case CCOmode:
14221 suffix = "no";
14222 break;
14223
14224 case CCSmode:
14225 suffix = "ns";
14226 break;
14227
14228 default:
14229 suffix = "ne";
14230 }
14231 break;
14232 case GT:
14233 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
14234 suffix = "g";
14235 break;
14236 case GTU:
14237 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
14238 Those same assemblers have the same but opposite lossage on cmov. */
14239 if (mode == CCmode)
14240 suffix = fp ? "nbe" : "a";
14241 else
14242 gcc_unreachable ();
14243 break;
14244 case LT:
14245 switch (mode)
14246 {
14247 case CCNOmode:
14248 case CCGOCmode:
14249 suffix = "s";
14250 break;
14251
14252 case CCmode:
14253 case CCGCmode:
14254 suffix = "l";
14255 break;
14256
14257 default:
14258 gcc_unreachable ();
14259 }
14260 break;
14261 case LTU:
14262 if (mode == CCmode)
14263 suffix = "b";
14264 else if (mode == CCCmode)
14265 suffix = "c";
14266 else
14267 gcc_unreachable ();
14268 break;
14269 case GE:
14270 switch (mode)
14271 {
14272 case CCNOmode:
14273 case CCGOCmode:
14274 suffix = "ns";
14275 break;
14276
14277 case CCmode:
14278 case CCGCmode:
14279 suffix = "ge";
14280 break;
14281
14282 default:
14283 gcc_unreachable ();
14284 }
14285 break;
14286 case GEU:
14287 if (mode == CCmode)
14288 suffix = fp ? "nb" : "ae";
14289 else if (mode == CCCmode)
14290 suffix = "nc";
14291 else
14292 gcc_unreachable ();
14293 break;
14294 case LE:
14295 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
14296 suffix = "le";
14297 break;
14298 case LEU:
14299 if (mode == CCmode)
14300 suffix = "be";
14301 else
14302 gcc_unreachable ();
14303 break;
14304 case UNORDERED:
14305 suffix = fp ? "u" : "p";
14306 break;
14307 case ORDERED:
14308 suffix = fp ? "nu" : "np";
14309 break;
14310 default:
14311 gcc_unreachable ();
14312 }
14313 fputs (suffix, file);
14314 }
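/* A few examples of the mapping above (operand numbers are only for the
   example): (gt ...) in CCGCmode prints "g", so "set%C0" yields "setg";
   (gtu ...) in CCmode prints "a" (or "nbe" for fcmov); when REVERSE is
   true the opposite condition is printed, e.g. "le" for GT.  */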
14315
14316 /* Print the name of register X to FILE based on its machine mode and number.
14317 If CODE is 'w', pretend the mode is HImode.
14318 If CODE is 'b', pretend the mode is QImode.
14319 If CODE is 'k', pretend the mode is SImode.
14320 If CODE is 'q', pretend the mode is DImode.
14321 If CODE is 'x', pretend the mode is V4SFmode.
14322 If CODE is 't', pretend the mode is V8SFmode.
14323 If CODE is 'g', pretend the mode is V16SFmode.
14324 If CODE is 'h', pretend the reg is the 'high' byte register.
14325 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
14326 If CODE is 'd', duplicate the operand for AVX instruction.
14327 */
14328
14329 void
14330 print_reg (rtx x, int code, FILE *file)
14331 {
14332 const char *reg;
14333 unsigned int regno;
14334 bool duplicated = code == 'd' && TARGET_AVX;
14335
14336 if (ASSEMBLER_DIALECT == ASM_ATT)
14337 putc ('%', file);
14338
14339 if (x == pc_rtx)
14340 {
14341 gcc_assert (TARGET_64BIT);
14342 fputs ("rip", file);
14343 return;
14344 }
14345
14346 regno = true_regnum (x);
14347 gcc_assert (regno != ARG_POINTER_REGNUM
14348 && regno != FRAME_POINTER_REGNUM
14349 && regno != FLAGS_REG
14350 && regno != FPSR_REG
14351 && regno != FPCR_REG);
14352
14353 if (code == 'w' || MMX_REG_P (x))
14354 code = 2;
14355 else if (code == 'b')
14356 code = 1;
14357 else if (code == 'k')
14358 code = 4;
14359 else if (code == 'q')
14360 code = 8;
14361 else if (code == 'y')
14362 code = 3;
14363 else if (code == 'h')
14364 code = 0;
14365 else if (code == 'x')
14366 code = 16;
14367 else if (code == 't')
14368 code = 32;
14369 else if (code == 'g')
14370 code = 64;
14371 else
14372 code = GET_MODE_SIZE (GET_MODE (x));
14373
14374 /* Irritatingly, AMD extended registers use a different naming convention
14375 from the normal registers: "r%d[bwd]" */
14376 if (REX_INT_REGNO_P (regno))
14377 {
14378 gcc_assert (TARGET_64BIT);
14379 putc ('r', file);
14380 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
14381 switch (code)
14382 {
14383 case 0:
14384 error ("extended registers have no high halves");
14385 break;
14386 case 1:
14387 putc ('b', file);
14388 break;
14389 case 2:
14390 putc ('w', file);
14391 break;
14392 case 4:
14393 putc ('d', file);
14394 break;
14395 case 8:
14396 /* no suffix */
14397 break;
14398 default:
14399 error ("unsupported operand size for extended register");
14400 break;
14401 }
14402 return;
14403 }
14404
14405 reg = NULL;
14406 switch (code)
14407 {
14408 case 3:
14409 if (STACK_TOP_P (x))
14410 {
14411 reg = "st(0)";
14412 break;
14413 }
14414 /* FALLTHRU */
14415 case 8:
14416 case 4:
14417 case 12:
14418 if (! ANY_FP_REG_P (x) && ! ANY_BND_REG_P (x))
14419 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14420 /* FALLTHRU */
14421 case 16:
14422 case 2:
14423 normal:
14424 reg = hi_reg_name[regno];
14425 break;
14426 case 1:
14427 if (regno >= ARRAY_SIZE (qi_reg_name))
14428 goto normal;
14429 reg = qi_reg_name[regno];
14430 break;
14431 case 0:
14432 if (regno >= ARRAY_SIZE (qi_high_reg_name))
14433 goto normal;
14434 reg = qi_high_reg_name[regno];
14435 break;
14436 case 32:
14437 if (SSE_REG_P (x))
14438 {
14439 gcc_assert (!duplicated);
14440 putc ('y', file);
14441 fputs (hi_reg_name[regno] + 1, file);
14442 return;
14443 }
14444 case 64:
14445 if (SSE_REG_P (x))
14446 {
14447 gcc_assert (!duplicated);
14448 putc ('z', file);
14449 fputs (hi_reg_name[REGNO (x)] + 1, file);
14450 return;
14451 }
14452 break;
14453 default:
14454 gcc_unreachable ();
14455 }
14456
14457 fputs (reg, file);
14458 if (duplicated)
14459 {
14460 if (ASSEMBLER_DIALECT == ASM_ATT)
14461 fprintf (file, ", %%%s", reg);
14462 else
14463 fprintf (file, ", %s", reg);
14464 }
14465 }
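/* Examples of the selection above: register "ax" printed with code 'b'
   gives "%al" in AT&T syntax ("al" for Intel), with code 'h' gives "%ah",
   and with code 'k' gives "%eax"; the extended register r8 printed with
   code 4 ('k') gives "%r8d", while code 0 ('h') is diagnosed because the
   REX registers have no high-byte halves.  */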
14466
14467 /* Locate some local-dynamic symbol still in use by this function
14468 so that we can print its name in some tls_local_dynamic_base
14469 pattern. */
14470
14471 static int
14472 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14473 {
14474 rtx x = *px;
14475
14476 if (GET_CODE (x) == SYMBOL_REF
14477 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14478 {
14479 cfun->machine->some_ld_name = XSTR (x, 0);
14480 return 1;
14481 }
14482
14483 return 0;
14484 }
14485
14486 static const char *
14487 get_some_local_dynamic_name (void)
14488 {
14489 rtx insn;
14490
14491 if (cfun->machine->some_ld_name)
14492 return cfun->machine->some_ld_name;
14493
14494 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14495 if (NONDEBUG_INSN_P (insn)
14496 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14497 return cfun->machine->some_ld_name;
14498
14499 return NULL;
14500 }
14501
14502 /* Meaning of CODE:
14503 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14504 C -- print opcode suffix for set/cmov insn.
14505 c -- like C, but print reversed condition
14506 F,f -- likewise, but for floating-point.
14507 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14508 otherwise nothing
14509 R -- print the prefix for register names.
14510 z -- print the opcode suffix for the size of the current operand.
14511 Z -- likewise, with special suffixes for x87 instructions.
14512 * -- print a star (in certain assembler syntax)
14513 A -- print an absolute memory reference.
14514 E -- print address with DImode register names if TARGET_64BIT.
14515 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14516 s -- print a shift double count, followed by the assembler's argument
14517 delimiter.
14518 b -- print the QImode name of the register for the indicated operand.
14519 %b0 would print %al if operands[0] is reg 0.
14520 w -- likewise, print the HImode name of the register.
14521 k -- likewise, print the SImode name of the register.
14522 q -- likewise, print the DImode name of the register.
14523 x -- likewise, print the V4SFmode name of the register.
14524 t -- likewise, print the V8SFmode name of the register.
14525 g -- likewise, print the V16SFmode name of the register.
14526 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14527 y -- print "st(0)" instead of "st" as a register.
14528 d -- print duplicated register operand for AVX instruction.
14529 D -- print condition for SSE cmp instruction.
14530 P -- if PIC, print an @PLT suffix.
14531 p -- print raw symbol name.
14532 X -- don't print any sort of PIC '@' suffix for a symbol.
14533 & -- print some in-use local-dynamic symbol name.
14534 H -- print a memory address offset by 8; used for sse high-parts
14535 Y -- print condition for XOP pcom* instruction.
14536 + -- print a branch hint as 'cs' or 'ds' prefix
14537 ; -- print a semicolon (after prefixes due to bug in older gas).
14538 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14539 @ -- print a segment register of thread base pointer load
14540 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14541 ! -- print MPX prefix for jxx/call/ret instructions if required.
14542 */
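/* A short illustration of how these codes appear in insn templates (the
   operand numbers are only examples): "%z0" on an SImode operand prints
   the "l" suffix, so "add%z0\t{%1, %0|%0, %1}" becomes "addl" in AT&T
   syntax; "%k1" forces the SImode register name and "%b1" the QImode
   name for operand 1.  */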
14543
14544 void
14545 ix86_print_operand (FILE *file, rtx x, int code)
14546 {
14547 if (code)
14548 {
14549 switch (code)
14550 {
14551 case 'A':
14552 switch (ASSEMBLER_DIALECT)
14553 {
14554 case ASM_ATT:
14555 putc ('*', file);
14556 break;
14557
14558 case ASM_INTEL:
14559 /* Intel syntax. For absolute addresses, registers should not
14560 be surrounded by brackets. */
14561 if (!REG_P (x))
14562 {
14563 putc ('[', file);
14564 ix86_print_operand (file, x, 0);
14565 putc (']', file);
14566 return;
14567 }
14568 break;
14569
14570 default:
14571 gcc_unreachable ();
14572 }
14573
14574 ix86_print_operand (file, x, 0);
14575 return;
14576
14577 case 'E':
14578 /* Wrap address in an UNSPEC to declare special handling. */
14579 if (TARGET_64BIT)
14580 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14581
14582 output_address (x);
14583 return;
14584
14585 case 'L':
14586 if (ASSEMBLER_DIALECT == ASM_ATT)
14587 putc ('l', file);
14588 return;
14589
14590 case 'W':
14591 if (ASSEMBLER_DIALECT == ASM_ATT)
14592 putc ('w', file);
14593 return;
14594
14595 case 'B':
14596 if (ASSEMBLER_DIALECT == ASM_ATT)
14597 putc ('b', file);
14598 return;
14599
14600 case 'Q':
14601 if (ASSEMBLER_DIALECT == ASM_ATT)
14602 putc ('l', file);
14603 return;
14604
14605 case 'S':
14606 if (ASSEMBLER_DIALECT == ASM_ATT)
14607 putc ('s', file);
14608 return;
14609
14610 case 'T':
14611 if (ASSEMBLER_DIALECT == ASM_ATT)
14612 putc ('t', file);
14613 return;
14614
14615 case 'O':
14616 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14617 if (ASSEMBLER_DIALECT != ASM_ATT)
14618 return;
14619
14620 switch (GET_MODE_SIZE (GET_MODE (x)))
14621 {
14622 case 2:
14623 putc ('w', file);
14624 break;
14625
14626 case 4:
14627 putc ('l', file);
14628 break;
14629
14630 case 8:
14631 putc ('q', file);
14632 break;
14633
14634 default:
14635 output_operand_lossage
14636 ("invalid operand size for operand code 'O'");
14637 return;
14638 }
14639
14640 putc ('.', file);
14641 #endif
14642 return;
14643
14644 case 'z':
14645 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14646 {
14647 /* Opcodes don't get size suffixes if using Intel syntax. */
14648 if (ASSEMBLER_DIALECT == ASM_INTEL)
14649 return;
14650
14651 switch (GET_MODE_SIZE (GET_MODE (x)))
14652 {
14653 case 1:
14654 putc ('b', file);
14655 return;
14656
14657 case 2:
14658 putc ('w', file);
14659 return;
14660
14661 case 4:
14662 putc ('l', file);
14663 return;
14664
14665 case 8:
14666 putc ('q', file);
14667 return;
14668
14669 default:
14670 output_operand_lossage
14671 ("invalid operand size for operand code 'z'");
14672 return;
14673 }
14674 }
14675
14676 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14677 warning
14678 (0, "non-integer operand used with operand code 'z'");
14679 /* FALLTHRU */
14680
14681 case 'Z':
14682 /* 387 opcodes don't get size suffixes if using Intel syntax. */
14683 if (ASSEMBLER_DIALECT == ASM_INTEL)
14684 return;
14685
14686 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14687 {
14688 switch (GET_MODE_SIZE (GET_MODE (x)))
14689 {
14690 case 2:
14691 #ifdef HAVE_AS_IX86_FILDS
14692 putc ('s', file);
14693 #endif
14694 return;
14695
14696 case 4:
14697 putc ('l', file);
14698 return;
14699
14700 case 8:
14701 #ifdef HAVE_AS_IX86_FILDQ
14702 putc ('q', file);
14703 #else
14704 fputs ("ll", file);
14705 #endif
14706 return;
14707
14708 default:
14709 break;
14710 }
14711 }
14712 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14713 {
14714 /* 387 opcodes don't get size suffixes
14715 if the operands are registers. */
14716 if (STACK_REG_P (x))
14717 return;
14718
14719 switch (GET_MODE_SIZE (GET_MODE (x)))
14720 {
14721 case 4:
14722 putc ('s', file);
14723 return;
14724
14725 case 8:
14726 putc ('l', file);
14727 return;
14728
14729 case 12:
14730 case 16:
14731 putc ('t', file);
14732 return;
14733
14734 default:
14735 break;
14736 }
14737 }
14738 else
14739 {
14740 output_operand_lossage
14741 ("invalid operand type used with operand code 'Z'");
14742 return;
14743 }
14744
14745 output_operand_lossage
14746 ("invalid operand size for operand code 'Z'");
14747 return;
14748
14749 case 'd':
14750 case 'b':
14751 case 'w':
14752 case 'k':
14753 case 'q':
14754 case 'h':
14755 case 't':
14756 case 'g':
14757 case 'y':
14758 case 'x':
14759 case 'X':
14760 case 'P':
14761 case 'p':
14762 break;
14763
14764 case 's':
14765 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14766 {
14767 ix86_print_operand (file, x, 0);
14768 fputs (", ", file);
14769 }
14770 return;
14771
14772 case 'Y':
14773 switch (GET_CODE (x))
14774 {
14775 case NE:
14776 fputs ("neq", file);
14777 break;
14778 case EQ:
14779 fputs ("eq", file);
14780 break;
14781 case GE:
14782 case GEU:
14783 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14784 break;
14785 case GT:
14786 case GTU:
14787 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14788 break;
14789 case LE:
14790 case LEU:
14791 fputs ("le", file);
14792 break;
14793 case LT:
14794 case LTU:
14795 fputs ("lt", file);
14796 break;
14797 case UNORDERED:
14798 fputs ("unord", file);
14799 break;
14800 case ORDERED:
14801 fputs ("ord", file);
14802 break;
14803 case UNEQ:
14804 fputs ("ueq", file);
14805 break;
14806 case UNGE:
14807 fputs ("nlt", file);
14808 break;
14809 case UNGT:
14810 fputs ("nle", file);
14811 break;
14812 case UNLE:
14813 fputs ("ule", file);
14814 break;
14815 case UNLT:
14816 fputs ("ult", file);
14817 break;
14818 case LTGT:
14819 fputs ("une", file);
14820 break;
14821 default:
14822 output_operand_lossage ("operand is not a condition code, "
14823 "invalid operand code 'Y'");
14824 return;
14825 }
14826 return;
14827
14828 case 'D':
14829 /* A little bit of brain damage here. The SSE compare instructions
14830 use completely different names for the comparisons than the
14831 fp conditional moves do. */
14832 switch (GET_CODE (x))
14833 {
14834 case UNEQ:
14835 if (TARGET_AVX)
14836 {
14837 fputs ("eq_us", file);
14838 break;
14839 }
14840 case EQ:
14841 fputs ("eq", file);
14842 break;
14843 case UNLT:
14844 if (TARGET_AVX)
14845 {
14846 fputs ("nge", file);
14847 break;
14848 }
14849 case LT:
14850 fputs ("lt", file);
14851 break;
14852 case UNLE:
14853 if (TARGET_AVX)
14854 {
14855 fputs ("ngt", file);
14856 break;
14857 }
14858 case LE:
14859 fputs ("le", file);
14860 break;
14861 case UNORDERED:
14862 fputs ("unord", file);
14863 break;
14864 case LTGT:
14865 if (TARGET_AVX)
14866 {
14867 fputs ("neq_oq", file);
14868 break;
14869 }
14870 case NE:
14871 fputs ("neq", file);
14872 break;
14873 case GE:
14874 if (TARGET_AVX)
14875 {
14876 fputs ("ge", file);
14877 break;
14878 }
14879 case UNGE:
14880 fputs ("nlt", file);
14881 break;
14882 case GT:
14883 if (TARGET_AVX)
14884 {
14885 fputs ("gt", file);
14886 break;
14887 }
14888 case UNGT:
14889 fputs ("nle", file);
14890 break;
14891 case ORDERED:
14892 fputs ("ord", file);
14893 break;
14894 default:
14895 output_operand_lossage ("operand is not a condition code, "
14896 "invalid operand code 'D'");
14897 return;
14898 }
14899 return;
14900
14901 case 'F':
14902 case 'f':
14903 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14904 if (ASSEMBLER_DIALECT == ASM_ATT)
14905 putc ('.', file);
14906 #endif
14907
14908 case 'C':
14909 case 'c':
14910 if (!COMPARISON_P (x))
14911 {
14912 output_operand_lossage ("operand is not a condition code, "
14913 "invalid operand code '%c'", code);
14914 return;
14915 }
14916 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14917 code == 'c' || code == 'f',
14918 code == 'F' || code == 'f',
14919 file);
14920 return;
14921
14922 case 'H':
14923 if (!offsettable_memref_p (x))
14924 {
14925 output_operand_lossage ("operand is not an offsettable memory "
14926 "reference, invalid operand code 'H'");
14927 return;
14928 }
14929 /* It doesn't actually matter what mode we use here, as we're
14930 only going to use this for printing. */
14931 x = adjust_address_nv (x, DImode, 8);
14932 /* Output 'qword ptr' for intel assembler dialect. */
14933 if (ASSEMBLER_DIALECT == ASM_INTEL)
14934 code = 'q';
14935 break;
14936
14937 case 'K':
14938 gcc_assert (CONST_INT_P (x));
14939
14940 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14941 #ifdef HAVE_AS_IX86_HLE
14942 fputs ("xacquire ", file);
14943 #else
14944 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14945 #endif
14946 else if (INTVAL (x) & IX86_HLE_RELEASE)
14947 #ifdef HAVE_AS_IX86_HLE
14948 fputs ("xrelease ", file);
14949 #else
14950 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14951 #endif
14952 /* We do not want to print value of the operand. */
14953 return;
14954
14955 case 'N':
14956 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
14957 fputs ("{z}", file);
14958 return;
14959
14960 case '*':
14961 if (ASSEMBLER_DIALECT == ASM_ATT)
14962 putc ('*', file);
14963 return;
14964
14965 case '&':
14966 {
14967 const char *name = get_some_local_dynamic_name ();
14968 if (name == NULL)
14969 output_operand_lossage ("'%%&' used without any "
14970 "local dynamic TLS references");
14971 else
14972 assemble_name (file, name);
14973 return;
14974 }
14975
14976 case '+':
14977 {
14978 rtx x;
14979
14980 if (!optimize
14981 || optimize_function_for_size_p (cfun)
14982 || !TARGET_BRANCH_PREDICTION_HINTS)
14983 return;
14984
14985 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14986 if (x)
14987 {
14988 int pred_val = XINT (x, 0);
14989
14990 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14991 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14992 {
14993 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14994 bool cputaken
14995 = final_forward_branch_p (current_output_insn) == 0;
14996
14997 /* Emit hints only in the case default branch prediction
14998 heuristics would fail. */
14999 if (taken != cputaken)
15000 {
15001 /* We use 3e (DS) prefix for taken branches and
15002 2e (CS) prefix for not taken branches. */
15003 if (taken)
15004 fputs ("ds ; ", file);
15005 else
15006 fputs ("cs ; ", file);
15007 }
15008 }
15009 }
15010 return;
15011 }
15012
15013 case ';':
15014 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
15015 putc (';', file);
15016 #endif
15017 return;
15018
15019 case '@':
15020 if (ASSEMBLER_DIALECT == ASM_ATT)
15021 putc ('%', file);
15022
15023 /* The kernel uses a different segment register for performance
15024 reasons; a system call would not have to trash the userspace
15025 segment register, which would be expensive. */
15026 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
15027 fputs ("fs", file);
15028 else
15029 fputs ("gs", file);
15030 return;
15031
15032 case '~':
15033 putc (TARGET_AVX2 ? 'i' : 'f', file);
15034 return;
15035
15036 case '^':
15037 if (TARGET_64BIT && Pmode != word_mode)
15038 fputs ("addr32 ", file);
15039 return;
15040
15041 case '!':
15042 if (ix86_bnd_prefixed_insn_p (NULL_RTX))
15043 fputs ("bnd ", file);
15044 return;
15045
15046 default:
15047 output_operand_lossage ("invalid operand code '%c'", code);
15048 }
15049 }
15050
15051 if (REG_P (x))
15052 print_reg (x, code, file);
15053
15054 else if (MEM_P (x))
15055 {
15056 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
15057 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
15058 && GET_MODE (x) != BLKmode)
15059 {
15060 const char * size;
15061 switch (GET_MODE_SIZE (GET_MODE (x)))
15062 {
15063 case 1: size = "BYTE"; break;
15064 case 2: size = "WORD"; break;
15065 case 4: size = "DWORD"; break;
15066 case 8: size = "QWORD"; break;
15067 case 12: size = "TBYTE"; break;
15068 case 16:
15069 if (GET_MODE (x) == XFmode)
15070 size = "TBYTE";
15071 else
15072 size = "XMMWORD";
15073 break;
15074 case 32: size = "YMMWORD"; break;
15075 case 64: size = "ZMMWORD"; break;
15076 default:
15077 gcc_unreachable ();
15078 }
15079
15080 /* Check for explicit size override (codes 'b', 'w', 'k',
15081 'q' and 'x') */
15082 if (code == 'b')
15083 size = "BYTE";
15084 else if (code == 'w')
15085 size = "WORD";
15086 else if (code == 'k')
15087 size = "DWORD";
15088 else if (code == 'q')
15089 size = "QWORD";
15090 else if (code == 'x')
15091 size = "XMMWORD";
15092
15093 fputs (size, file);
15094 fputs (" PTR ", file);
15095 }
15096
15097 x = XEXP (x, 0);
15098 /* Avoid (%rip) for call operands. */
15099 if (CONSTANT_ADDRESS_P (x) && code == 'P'
15100 && !CONST_INT_P (x))
15101 output_addr_const (file, x);
15102 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
15103 output_operand_lossage ("invalid constraints for operand");
15104 else
15105 output_address (x);
15106 }
15107
15108 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
15109 {
15110 REAL_VALUE_TYPE r;
15111 long l;
15112
15113 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
15114 REAL_VALUE_TO_TARGET_SINGLE (r, l);
15115
15116 if (ASSEMBLER_DIALECT == ASM_ATT)
15117 putc ('$', file);
15118 /* Sign extend 32bit SFmode immediate to 8 bytes. */
15119 if (code == 'q')
15120 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
15121 (unsigned long long) (int) l);
15122 else
15123 fprintf (file, "0x%08x", (unsigned int) l);
15124 }
15125
15126 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
15127 {
15128 REAL_VALUE_TYPE r;
15129 long l[2];
15130
15131 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
15132 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
15133
15134 if (ASSEMBLER_DIALECT == ASM_ATT)
15135 putc ('$', file);
15136 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
15137 }
15138
15139 /* These float cases don't actually occur as immediate operands. */
15140 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
15141 {
15142 char dstr[30];
15143
15144 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
15145 fputs (dstr, file);
15146 }
15147
15148 else
15149 {
15150 /* We have patterns that allow zero sets of memory, for instance.
15151 In 64-bit mode, we should probably support all 8-byte vectors,
15152 since we can in fact encode that into an immediate. */
15153 if (GET_CODE (x) == CONST_VECTOR)
15154 {
15155 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
15156 x = const0_rtx;
15157 }
15158
15159 if (code != 'P' && code != 'p')
15160 {
15161 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
15162 {
15163 if (ASSEMBLER_DIALECT == ASM_ATT)
15164 putc ('$', file);
15165 }
15166 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
15167 || GET_CODE (x) == LABEL_REF)
15168 {
15169 if (ASSEMBLER_DIALECT == ASM_ATT)
15170 putc ('$', file);
15171 else
15172 fputs ("OFFSET FLAT:", file);
15173 }
15174 }
15175 if (CONST_INT_P (x))
15176 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
15177 else if (flag_pic || MACHOPIC_INDIRECT)
15178 output_pic_addr_const (file, x, code);
15179 else
15180 output_addr_const (file, x);
15181 }
15182 }
15183
15184 static bool
15185 ix86_print_operand_punct_valid_p (unsigned char code)
15186 {
15187 return (code == '@' || code == '*' || code == '+' || code == '&'
15188 || code == ';' || code == '~' || code == '^' || code == '!');
15189 }
15190 \f
15191 /* Print a memory operand whose address is ADDR. */
15192
15193 static void
15194 ix86_print_operand_address (FILE *file, rtx addr)
15195 {
15196 struct ix86_address parts;
15197 rtx base, index, disp;
15198 int scale;
15199 int ok;
15200 bool vsib = false;
15201 int code = 0;
15202
15203 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
15204 {
15205 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15206 gcc_assert (parts.index == NULL_RTX);
15207 parts.index = XVECEXP (addr, 0, 1);
15208 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
15209 addr = XVECEXP (addr, 0, 0);
15210 vsib = true;
15211 }
15212 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
15213 {
15214 gcc_assert (TARGET_64BIT);
15215 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15216 code = 'q';
15217 }
15218 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
15219 {
15220 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
15221 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
15222 if (parts.base != NULL_RTX)
15223 {
15224 parts.index = parts.base;
15225 parts.scale = 1;
15226 }
15227 parts.base = XVECEXP (addr, 0, 0);
15228 addr = XVECEXP (addr, 0, 0);
15229 }
15230 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
15231 {
15232 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15233 gcc_assert (parts.index == NULL_RTX);
15234 parts.index = XVECEXP (addr, 0, 1);
15235 addr = XVECEXP (addr, 0, 0);
15236 }
15237 else
15238 ok = ix86_decompose_address (addr, &parts);
15239
15240 gcc_assert (ok);
15241
15242 base = parts.base;
15243 index = parts.index;
15244 disp = parts.disp;
15245 scale = parts.scale;
15246
15247 switch (parts.seg)
15248 {
15249 case SEG_DEFAULT:
15250 break;
15251 case SEG_FS:
15252 case SEG_GS:
15253 if (ASSEMBLER_DIALECT == ASM_ATT)
15254 putc ('%', file);
15255 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
15256 break;
15257 default:
15258 gcc_unreachable ();
15259 }
15260
15261 /* Use one byte shorter RIP relative addressing for 64bit mode. */
15262 if (TARGET_64BIT && !base && !index)
15263 {
15264 rtx symbol = disp;
15265
15266 if (GET_CODE (disp) == CONST
15267 && GET_CODE (XEXP (disp, 0)) == PLUS
15268 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15269 symbol = XEXP (XEXP (disp, 0), 0);
15270
15271 if (GET_CODE (symbol) == LABEL_REF
15272 || (GET_CODE (symbol) == SYMBOL_REF
15273 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
15274 base = pc_rtx;
15275 }
15276 if (!base && !index)
15277 {
15278 /* Displacement only requires special attention. */
15279
15280 if (CONST_INT_P (disp))
15281 {
15282 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
15283 fputs ("ds:", file);
15284 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
15285 }
15286 else if (flag_pic)
15287 output_pic_addr_const (file, disp, 0);
15288 else
15289 output_addr_const (file, disp);
15290 }
15291 else
15292 {
15293 /* Print SImode register names to force addr32 prefix. */
15294 if (SImode_address_operand (addr, VOIDmode))
15295 {
15296 #ifdef ENABLE_CHECKING
15297 gcc_assert (TARGET_64BIT);
15298 switch (GET_CODE (addr))
15299 {
15300 case SUBREG:
15301 gcc_assert (GET_MODE (addr) == SImode);
15302 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
15303 break;
15304 case ZERO_EXTEND:
15305 case AND:
15306 gcc_assert (GET_MODE (addr) == DImode);
15307 break;
15308 default:
15309 gcc_unreachable ();
15310 }
15311 #endif
15312 gcc_assert (!code);
15313 code = 'k';
15314 }
15315 else if (code == 0
15316 && TARGET_X32
15317 && disp
15318 && CONST_INT_P (disp)
15319 && INTVAL (disp) < -16*1024*1024)
15320 {
15321 /* X32 runs in 64-bit mode, where displacement, DISP, in
15322 address DISP(%r64), is encoded as 32-bit immediate sign-
15323 extended from 32-bit to 64-bit. For -0x40000300(%r64),
15324 address is %r64 + 0xffffffffbffffd00. When %r64 <
15325 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
15326 which is invalid for x32. The correct address is %r64
15327 - 0x40000300 == 0xf7ffdd64. To properly encode
15328 -0x40000300(%r64) for x32, we zero-extend negative
15329 displacement by forcing addr32 prefix which truncates
15330 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
15331 zero-extend all negative displacements, including -1(%rsp).
15332 However, for small negative displacements, sign-extension
15333 won't cause overflow. We only zero-extend negative
15334 displacements if they are < -16*1024*1024, which is also used
15335 to check legitimate address displacements for PIC. */
15336 code = 'k';
15337 }
15338
15339 if (ASSEMBLER_DIALECT == ASM_ATT)
15340 {
15341 if (disp)
15342 {
15343 if (flag_pic)
15344 output_pic_addr_const (file, disp, 0);
15345 else if (GET_CODE (disp) == LABEL_REF)
15346 output_asm_label (disp);
15347 else
15348 output_addr_const (file, disp);
15349 }
15350
15351 putc ('(', file);
15352 if (base)
15353 print_reg (base, code, file);
15354 if (index)
15355 {
15356 putc (',', file);
15357 print_reg (index, vsib ? 0 : code, file);
15358 if (scale != 1 || vsib)
15359 fprintf (file, ",%d", scale);
15360 }
15361 putc (')', file);
15362 }
15363 else
15364 {
15365 rtx offset = NULL_RTX;
15366
15367 if (disp)
15368 {
15369 /* Pull out the offset of a symbol; print any symbol itself. */
15370 if (GET_CODE (disp) == CONST
15371 && GET_CODE (XEXP (disp, 0)) == PLUS
15372 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15373 {
15374 offset = XEXP (XEXP (disp, 0), 1);
15375 disp = gen_rtx_CONST (VOIDmode,
15376 XEXP (XEXP (disp, 0), 0));
15377 }
15378
15379 if (flag_pic)
15380 output_pic_addr_const (file, disp, 0);
15381 else if (GET_CODE (disp) == LABEL_REF)
15382 output_asm_label (disp);
15383 else if (CONST_INT_P (disp))
15384 offset = disp;
15385 else
15386 output_addr_const (file, disp);
15387 }
15388
15389 putc ('[', file);
15390 if (base)
15391 {
15392 print_reg (base, code, file);
15393 if (offset)
15394 {
15395 if (INTVAL (offset) >= 0)
15396 putc ('+', file);
15397 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15398 }
15399 }
15400 else if (offset)
15401 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15402 else
15403 putc ('0', file);
15404
15405 if (index)
15406 {
15407 putc ('+', file);
15408 print_reg (index, vsib ? 0 : code, file);
15409 if (scale != 1 || vsib)
15410 fprintf (file, "*%d", scale);
15411 }
15412 putc (']', file);
15413 }
15414 }
15415 }
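/* Illustrative output of this routine for an address with base, index,
   scale and displacement (register names are just examples): AT&T syntax
   prints "8(%ebx,%ecx,4)", while Intel syntax prints "[ebx+8+ecx*4]".  */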
15416
15417 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
15418
15419 static bool
15420 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15421 {
15422 rtx op;
15423
15424 if (GET_CODE (x) != UNSPEC)
15425 return false;
15426
15427 op = XVECEXP (x, 0, 0);
15428 switch (XINT (x, 1))
15429 {
15430 case UNSPEC_GOTTPOFF:
15431 output_addr_const (file, op);
15432 /* FIXME: This might be @TPOFF in Sun ld. */
15433 fputs ("@gottpoff", file);
15434 break;
15435 case UNSPEC_TPOFF:
15436 output_addr_const (file, op);
15437 fputs ("@tpoff", file);
15438 break;
15439 case UNSPEC_NTPOFF:
15440 output_addr_const (file, op);
15441 if (TARGET_64BIT)
15442 fputs ("@tpoff", file);
15443 else
15444 fputs ("@ntpoff", file);
15445 break;
15446 case UNSPEC_DTPOFF:
15447 output_addr_const (file, op);
15448 fputs ("@dtpoff", file);
15449 break;
15450 case UNSPEC_GOTNTPOFF:
15451 output_addr_const (file, op);
15452 if (TARGET_64BIT)
15453 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15454 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15455 else
15456 fputs ("@gotntpoff", file);
15457 break;
15458 case UNSPEC_INDNTPOFF:
15459 output_addr_const (file, op);
15460 fputs ("@indntpoff", file);
15461 break;
15462 #if TARGET_MACHO
15463 case UNSPEC_MACHOPIC_OFFSET:
15464 output_addr_const (file, op);
15465 putc ('-', file);
15466 machopic_output_function_base_name (file);
15467 break;
15468 #endif
15469
15470 case UNSPEC_STACK_CHECK:
15471 {
15472 int offset;
15473
15474 gcc_assert (flag_split_stack);
15475
15476 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15477 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15478 #else
15479 gcc_unreachable ();
15480 #endif
15481
15482 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15483 }
15484 break;
15485
15486 default:
15487 return false;
15488 }
15489
15490 return true;
15491 }
15492 \f
15493 /* Split one or more double-mode RTL references into pairs of half-mode
15494 references. The RTL can be REG, offsettable MEM, integer constant, or
15495 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15496 split and "num" is its length. lo_half and hi_half are output arrays
15497 that parallel "operands". */
15498
15499 void
15500 split_double_mode (enum machine_mode mode, rtx operands[],
15501 int num, rtx lo_half[], rtx hi_half[])
15502 {
15503 enum machine_mode half_mode;
15504 unsigned int byte;
15505
15506 switch (mode)
15507 {
15508 case TImode:
15509 half_mode = DImode;
15510 break;
15511 case DImode:
15512 half_mode = SImode;
15513 break;
15514 default:
15515 gcc_unreachable ();
15516 }
15517
15518 byte = GET_MODE_SIZE (half_mode);
15519
15520 while (num--)
15521 {
15522 rtx op = operands[num];
15523
15524 /* simplify_subreg refuses to split volatile memory addresses,
15525 but we still have to handle them. */
15526 if (MEM_P (op))
15527 {
15528 lo_half[num] = adjust_address (op, half_mode, 0);
15529 hi_half[num] = adjust_address (op, half_mode, byte);
15530 }
15531 else
15532 {
15533 lo_half[num] = simplify_gen_subreg (half_mode, op,
15534 GET_MODE (op) == VOIDmode
15535 ? mode : GET_MODE (op), 0);
15536 hi_half[num] = simplify_gen_subreg (half_mode, op,
15537 GET_MODE (op) == VOIDmode
15538 ? mode : GET_MODE (op), byte);
15539 }
15540 }
15541 }
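/* For instance, a DImode register operand is split into two SImode
   subregs at byte offsets 0 and 4, and a DImode memory operand into the
   original address and the same address offset by 4; for TImode the
   halves are DImode and the offset is 8.  */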
15542 \f
15543 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15544 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15545 is the expression of the binary operation. The output may either be
15546 emitted here, or returned to the caller, like all output_* functions.
15547
15548 There is no guarantee that the operands are the same mode, as they
15549 might be within FLOAT or FLOAT_EXTEND expressions. */
15550
15551 #ifndef SYSV386_COMPAT
15552 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15553 wants to fix the assemblers because that causes incompatibility
15554 with gcc. No-one wants to fix gcc because that causes
15555 incompatibility with assemblers... You can use the option of
15556 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15557 #define SYSV386_COMPAT 1
15558 #endif
15559
15560 const char *
15561 output_387_binary_op (rtx insn, rtx *operands)
15562 {
15563 static char buf[40];
15564 const char *p;
15565 const char *ssep;
15566 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15567
15568 #ifdef ENABLE_CHECKING
15569 /* Even if we do not want to check the inputs, this documents input
15570 constraints, which helps in understanding the following code. */
15571 if (STACK_REG_P (operands[0])
15572 && ((REG_P (operands[1])
15573 && REGNO (operands[0]) == REGNO (operands[1])
15574 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15575 || (REG_P (operands[2])
15576 && REGNO (operands[0]) == REGNO (operands[2])
15577 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15578 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15579 ; /* ok */
15580 else
15581 gcc_assert (is_sse);
15582 #endif
15583
15584 switch (GET_CODE (operands[3]))
15585 {
15586 case PLUS:
15587 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15588 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15589 p = "fiadd";
15590 else
15591 p = "fadd";
15592 ssep = "vadd";
15593 break;
15594
15595 case MINUS:
15596 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15597 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15598 p = "fisub";
15599 else
15600 p = "fsub";
15601 ssep = "vsub";
15602 break;
15603
15604 case MULT:
15605 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15606 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15607 p = "fimul";
15608 else
15609 p = "fmul";
15610 ssep = "vmul";
15611 break;
15612
15613 case DIV:
15614 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15615 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15616 p = "fidiv";
15617 else
15618 p = "fdiv";
15619 ssep = "vdiv";
15620 break;
15621
15622 default:
15623 gcc_unreachable ();
15624 }
15625
15626 if (is_sse)
15627 {
15628 if (TARGET_AVX)
15629 {
15630 strcpy (buf, ssep);
15631 if (GET_MODE (operands[0]) == SFmode)
15632 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15633 else
15634 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15635 }
15636 else
15637 {
15638 strcpy (buf, ssep + 1);
15639 if (GET_MODE (operands[0]) == SFmode)
15640 strcat (buf, "ss\t{%2, %0|%0, %2}");
15641 else
15642 strcat (buf, "sd\t{%2, %0|%0, %2}");
15643 }
15644 return buf;
15645 }
15646 strcpy (buf, p);
15647
15648 switch (GET_CODE (operands[3]))
15649 {
15650 case MULT:
15651 case PLUS:
15652 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15653 {
15654 rtx temp = operands[2];
15655 operands[2] = operands[1];
15656 operands[1] = temp;
15657 }
15658
15659 /* We know operands[0] == operands[1]. */
15660
15661 if (MEM_P (operands[2]))
15662 {
15663 p = "%Z2\t%2";
15664 break;
15665 }
15666
15667 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15668 {
15669 if (STACK_TOP_P (operands[0]))
15670 /* How is it that we are storing to a dead operand[2]?
15671 Well, presumably operands[1] is dead too. We can't
15672 store the result to st(0) as st(0) gets popped on this
15673 instruction. Instead store to operands[2] (which I
15674 think has to be st(1)). st(1) will be popped later.
15675 gcc <= 2.8.1 didn't have this check and generated
15676 assembly code that the Unixware assembler rejected. */
15677 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15678 else
15679 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15680 break;
15681 }
15682
15683 if (STACK_TOP_P (operands[0]))
15684 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15685 else
15686 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15687 break;
15688
15689 case MINUS:
15690 case DIV:
15691 if (MEM_P (operands[1]))
15692 {
15693 p = "r%Z1\t%1";
15694 break;
15695 }
15696
15697 if (MEM_P (operands[2]))
15698 {
15699 p = "%Z2\t%2";
15700 break;
15701 }
15702
15703 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15704 {
15705 #if SYSV386_COMPAT
15706 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15707 derived assemblers, confusingly reverse the direction of
15708 the operation for fsub{r} and fdiv{r} when the
15709 destination register is not st(0). The Intel assembler
15710 doesn't have this brain damage. Read !SYSV386_COMPAT to
15711 figure out what the hardware really does. */
15712 if (STACK_TOP_P (operands[0]))
15713 p = "{p\t%0, %2|rp\t%2, %0}";
15714 else
15715 p = "{rp\t%2, %0|p\t%0, %2}";
15716 #else
15717 if (STACK_TOP_P (operands[0]))
15718 /* As above for fmul/fadd, we can't store to st(0). */
15719 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15720 else
15721 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15722 #endif
15723 break;
15724 }
15725
15726 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15727 {
15728 #if SYSV386_COMPAT
15729 if (STACK_TOP_P (operands[0]))
15730 p = "{rp\t%0, %1|p\t%1, %0}";
15731 else
15732 p = "{p\t%1, %0|rp\t%0, %1}";
15733 #else
15734 if (STACK_TOP_P (operands[0]))
15735 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15736 else
15737 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15738 #endif
15739 break;
15740 }
15741
15742 if (STACK_TOP_P (operands[0]))
15743 {
15744 if (STACK_TOP_P (operands[1]))
15745 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15746 else
15747 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15748 break;
15749 }
15750 else if (STACK_TOP_P (operands[1]))
15751 {
15752 #if SYSV386_COMPAT
15753 p = "{\t%1, %0|r\t%0, %1}";
15754 #else
15755 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15756 #endif
15757 }
15758 else
15759 {
15760 #if SYSV386_COMPAT
15761 p = "{r\t%2, %0|\t%0, %2}";
15762 #else
15763 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15764 #endif
15765 }
15766 break;
15767
15768 default:
15769 gcc_unreachable ();
15770 }
15771
15772 strcat (buf, p);
15773 return buf;
15774 }
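/* Example of the SSE/AVX templates built above (register numbers are
   illustrative): for a DFmode add with AVX the function returns
   "vaddsd\t{%2, %1, %0|%0, %1, %2}", i.e. the three-operand
   "vaddsd %xmm2, %xmm1, %xmm0" in AT&T syntax, while without AVX it
   returns the two-operand "addsd\t{%2, %0|%0, %2}".  */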
15775
15776 /* Check if a 256bit AVX register is referenced inside of EXP. */
15777
15778 static int
15779 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15780 {
15781 rtx exp = *pexp;
15782
15783 if (GET_CODE (exp) == SUBREG)
15784 exp = SUBREG_REG (exp);
15785
15786 if (REG_P (exp)
15787 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15788 return 1;
15789
15790 return 0;
15791 }
15792
15793 /* Return needed mode for entity in optimize_mode_switching pass. */
15794
15795 static int
15796 ix86_avx_u128_mode_needed (rtx insn)
15797 {
15798 if (CALL_P (insn))
15799 {
15800 rtx link;
15801
15802 /* The needed mode is set to AVX_U128_CLEAN if there are
15803 no 256bit modes used in the function arguments. */
15804 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15805 link;
15806 link = XEXP (link, 1))
15807 {
15808 if (GET_CODE (XEXP (link, 0)) == USE)
15809 {
15810 rtx arg = XEXP (XEXP (link, 0), 0);
15811
15812 if (ix86_check_avx256_register (&arg, NULL))
15813 return AVX_U128_DIRTY;
15814 }
15815 }
15816
15817 return AVX_U128_CLEAN;
15818 }
15819
15820 /* Require DIRTY mode if a 256bit AVX register is referenced.  The hardware
15821 changes state only when a 256bit register is written to, but we need
15822 to prevent the compiler from moving the optimal insertion point above
15823 an eventual read from a 256bit register. */
15824 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15825 return AVX_U128_DIRTY;
15826
15827 return AVX_U128_ANY;
15828 }
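
/* For example (an illustrative reading of the logic above, not taken from
   a real dump): a call that only passes integers or pointers is classified
   AVX_U128_CLEAN, a call passing a __m256 value in a %ymm register is
   AVX_U128_DIRTY, and an ordinary insn that references no 256bit AVX
   register leaves the choice open with AVX_U128_ANY.  */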
15829
15830 /* Return mode that i387 must be switched into
15831 prior to the execution of insn. */
15832
15833 static int
15834 ix86_i387_mode_needed (int entity, rtx insn)
15835 {
15836 enum attr_i387_cw mode;
15837
15838 /* The mode UNINITIALIZED is used to store the control word after a
15839 function call or ASM pattern. The mode ANY specifies that the function
15840 has no requirements on the control word and makes no changes in the
15841 bits we are interested in. */
15842
15843 if (CALL_P (insn)
15844 || (NONJUMP_INSN_P (insn)
15845 && (asm_noperands (PATTERN (insn)) >= 0
15846 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15847 return I387_CW_UNINITIALIZED;
15848
15849 if (recog_memoized (insn) < 0)
15850 return I387_CW_ANY;
15851
15852 mode = get_attr_i387_cw (insn);
15853
15854 switch (entity)
15855 {
15856 case I387_TRUNC:
15857 if (mode == I387_CW_TRUNC)
15858 return mode;
15859 break;
15860
15861 case I387_FLOOR:
15862 if (mode == I387_CW_FLOOR)
15863 return mode;
15864 break;
15865
15866 case I387_CEIL:
15867 if (mode == I387_CW_CEIL)
15868 return mode;
15869 break;
15870
15871 case I387_MASK_PM:
15872 if (mode == I387_CW_MASK_PM)
15873 return mode;
15874 break;
15875
15876 default:
15877 gcc_unreachable ();
15878 }
15879
15880 return I387_CW_ANY;
15881 }
15882
15883 /* Return mode that entity must be switched into
15884 prior to the execution of insn. */
15885
15886 int
15887 ix86_mode_needed (int entity, rtx insn)
15888 {
15889 switch (entity)
15890 {
15891 case AVX_U128:
15892 return ix86_avx_u128_mode_needed (insn);
15893 case I387_TRUNC:
15894 case I387_FLOOR:
15895 case I387_CEIL:
15896 case I387_MASK_PM:
15897 return ix86_i387_mode_needed (entity, insn);
15898 default:
15899 gcc_unreachable ();
15900 }
15901 return 0;
15902 }
15903
15904 /* Check if a 256bit AVX register is referenced in stores. */
15905
15906 static void
15907 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15908 {
15909 if (ix86_check_avx256_register (&dest, NULL))
15910 {
15911 bool *used = (bool *) data;
15912 *used = true;
15913 }
15914 }
15915
15916 /* Calculate mode of upper 128bit AVX registers after the insn. */
15917
15918 static int
15919 ix86_avx_u128_mode_after (int mode, rtx insn)
15920 {
15921 rtx pat = PATTERN (insn);
15922
15923 if (vzeroupper_operation (pat, VOIDmode)
15924 || vzeroall_operation (pat, VOIDmode))
15925 return AVX_U128_CLEAN;
15926
15927 /* We know that the state is clean after a CALL insn if there is no
15928 256bit register used in the function return value. */
15929 if (CALL_P (insn))
15930 {
15931 bool avx_reg256_found = false;
15932 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15933
15934 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
15935 }
15936
15937 /* Otherwise, return current mode. Remember that if insn
15938 references AVX 256bit registers, the mode was already changed
15939 to DIRTY from MODE_NEEDED. */
15940 return mode;
15941 }
15942
15943 /* Return the mode that an insn results in. */
15944
15945 int
15946 ix86_mode_after (int entity, int mode, rtx insn)
15947 {
15948 switch (entity)
15949 {
15950 case AVX_U128:
15951 return ix86_avx_u128_mode_after (mode, insn);
15952 case I387_TRUNC:
15953 case I387_FLOOR:
15954 case I387_CEIL:
15955 case I387_MASK_PM:
15956 return mode;
15957 default:
15958 gcc_unreachable ();
15959 }
15960 }
15961
15962 static int
15963 ix86_avx_u128_mode_entry (void)
15964 {
15965 tree arg;
15966
15967 /* Entry mode is set to AVX_U128_DIRTY if there are
15968 256bit modes used in function arguments. */
15969 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15970 arg = TREE_CHAIN (arg))
15971 {
15972 rtx incoming = DECL_INCOMING_RTL (arg);
15973
15974 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15975 return AVX_U128_DIRTY;
15976 }
15977
15978 return AVX_U128_CLEAN;
15979 }
15980
15981 /* Return a mode that ENTITY is assumed to be
15982 switched to at function entry. */
15983
15984 int
15985 ix86_mode_entry (int entity)
15986 {
15987 switch (entity)
15988 {
15989 case AVX_U128:
15990 return ix86_avx_u128_mode_entry ();
15991 case I387_TRUNC:
15992 case I387_FLOOR:
15993 case I387_CEIL:
15994 case I387_MASK_PM:
15995 return I387_CW_ANY;
15996 default:
15997 gcc_unreachable ();
15998 }
15999 }
16000
16001 static int
16002 ix86_avx_u128_mode_exit (void)
16003 {
16004 rtx reg = crtl->return_rtx;
16005
16006 /* Exit mode is set to AVX_U128_DIRTY if there are
16007 256bit modes used in the function return register. */
16008 if (reg && ix86_check_avx256_register (&reg, NULL))
16009 return AVX_U128_DIRTY;
16010
16011 return AVX_U128_CLEAN;
16012 }
16013
16014 /* Return a mode that ENTITY is assumed to be
16015 switched to at function exit. */
16016
16017 int
16018 ix86_mode_exit (int entity)
16019 {
16020 switch (entity)
16021 {
16022 case AVX_U128:
16023 return ix86_avx_u128_mode_exit ();
16024 case I387_TRUNC:
16025 case I387_FLOOR:
16026 case I387_CEIL:
16027 case I387_MASK_PM:
16028 return I387_CW_ANY;
16029 default:
16030 gcc_unreachable ();
16031 }
16032 }
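
/* A minimal user-level sketch of the entry/exit classification above
   (illustrative only; it assumes the usual x86-64 ABI where 256bit vector
   arguments and return values are passed in %ymm registers):

     __m256 f (__m256 x);   entry and exit are both AVX_U128_DIRTY,
                            since a 256bit register is live at each point.
     long   g (long x);     entry and exit are both AVX_U128_CLEAN.  */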
16033
16034 /* Output code to initialize the control word copies used by trunc?f?i and
16035 rounding patterns. The current control word is saved first, and a copy
16036 with the bits adjusted for MODE is stored in the matching stack slot. */
16037
16038 static void
16039 emit_i387_cw_initialization (int mode)
16040 {
16041 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
16042 rtx new_mode;
16043
16044 enum ix86_stack_slot slot;
16045
16046 rtx reg = gen_reg_rtx (HImode);
16047
16048 emit_insn (gen_x86_fnstcw_1 (stored_mode));
16049 emit_move_insn (reg, copy_rtx (stored_mode));
16050
16051 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
16052 || optimize_insn_for_size_p ())
16053 {
16054 switch (mode)
16055 {
16056 case I387_CW_TRUNC:
16057 /* round toward zero (truncate) */
16058 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
16059 slot = SLOT_CW_TRUNC;
16060 break;
16061
16062 case I387_CW_FLOOR:
16063 /* round down toward -oo */
16064 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
16065 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
16066 slot = SLOT_CW_FLOOR;
16067 break;
16068
16069 case I387_CW_CEIL:
16070 /* round up toward +oo */
16071 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
16072 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
16073 slot = SLOT_CW_CEIL;
16074 break;
16075
16076 case I387_CW_MASK_PM:
16077 /* mask precision exception for nearbyint() */
16078 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
16079 slot = SLOT_CW_MASK_PM;
16080 break;
16081
16082 default:
16083 gcc_unreachable ();
16084 }
16085 }
16086 else
16087 {
16088 switch (mode)
16089 {
16090 case I387_CW_TRUNC:
16091 /* round toward zero (truncate) */
16092 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
16093 slot = SLOT_CW_TRUNC;
16094 break;
16095
16096 case I387_CW_FLOOR:
16097 /* round down toward -oo */
16098 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
16099 slot = SLOT_CW_FLOOR;
16100 break;
16101
16102 case I387_CW_CEIL:
16103 /* round up toward +oo */
16104 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
16105 slot = SLOT_CW_CEIL;
16106 break;
16107
16108 case I387_CW_MASK_PM:
16109 /* mask precision exception for nearbyint() */
16110 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
16111 slot = SLOT_CW_MASK_PM;
16112 break;
16113
16114 default:
16115 gcc_unreachable ();
16116 }
16117 }
16118
16119 gcc_assert (slot < MAX_386_STACK_LOCALS);
16120
16121 new_mode = assign_386_stack_local (HImode, slot);
16122 emit_move_insn (new_mode, reg);
16123 }
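
/* A standalone sketch of the control word bits manipulated above
   (illustrative only; the function below is not part of GCC and is guarded
   out with #if 0).  The x87 rounding-control field occupies bits 10-11 of
   the 16-bit control word: 00 = round to nearest, 01 = round down (floor),
   10 = round up (ceil), 11 = round toward zero (trunc), which is why the
   code above ors in 0x0c00, 0x0400 or 0x0800.  Bit 5 (0x0020) masks the
   precision exception for nearbyint.  */
#if 0
static unsigned short
example_x87_set_rounding (unsigned short cw, unsigned int rc)
{
  cw &= ~0x0c00;				/* Clear RC, bits 10-11.  */
  cw |= (unsigned short) ((rc & 3) << 10);	/* Install the new mode.  */
  return cw;
}
#endif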
16124
16125 /* Emit vzeroupper. */
16126
16127 void
16128 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
16129 {
16130 int i;
16131
16132 /* Cancel automatic vzeroupper insertion if there are
16133 live call-saved SSE registers at the insertion point. */
16134
16135 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
16136 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
16137 return;
16138
16139 if (TARGET_64BIT)
16140 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
16141 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
16142 return;
16143
16144 emit_insn (gen_avx_vzeroupper ());
16145 }
16146
16147 /* Generate one or more insns to set ENTITY to MODE. */
16148
16149 void
16150 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
16151 {
16152 switch (entity)
16153 {
16154 case AVX_U128:
16155 if (mode == AVX_U128_CLEAN)
16156 ix86_avx_emit_vzeroupper (regs_live);
16157 break;
16158 case I387_TRUNC:
16159 case I387_FLOOR:
16160 case I387_CEIL:
16161 case I387_MASK_PM:
16162 if (mode != I387_CW_ANY
16163 && mode != I387_CW_UNINITIALIZED)
16164 emit_i387_cw_initialization (mode);
16165 break;
16166 default:
16167 gcc_unreachable ();
16168 }
16169 }
16170
16171 /* Output code for INSN to convert a float to a signed int. OPERANDS
16172 are the insn operands. The output may be [HSD]Imode and the input
16173 operand may be [SDX]Fmode. */
16174
16175 const char *
16176 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
16177 {
16178 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16179 int dimode_p = GET_MODE (operands[0]) == DImode;
16180 int round_mode = get_attr_i387_cw (insn);
16181
16182 /* Jump through a hoop or two for DImode, since the hardware has no
16183 non-popping instruction. We used to do this a different way, but
16184 that was somewhat fragile and broke with post-reload splitters. */
16185 if ((dimode_p || fisttp) && !stack_top_dies)
16186 output_asm_insn ("fld\t%y1", operands);
16187
16188 gcc_assert (STACK_TOP_P (operands[1]));
16189 gcc_assert (MEM_P (operands[0]));
16190 gcc_assert (GET_MODE (operands[1]) != TFmode);
16191
16192 if (fisttp)
16193 output_asm_insn ("fisttp%Z0\t%0", operands);
16194 else
16195 {
16196 if (round_mode != I387_CW_ANY)
16197 output_asm_insn ("fldcw\t%3", operands);
16198 if (stack_top_dies || dimode_p)
16199 output_asm_insn ("fistp%Z0\t%0", operands);
16200 else
16201 output_asm_insn ("fist%Z0\t%0", operands);
16202 if (round_mode != I387_CW_ANY)
16203 output_asm_insn ("fldcw\t%2", operands);
16204 }
16205
16206 return "";
16207 }
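
/* An illustrative sketch of the sequences printed above (the operand names
   are assumptions, not taken from a real dump).  Without SSE3 fisttp the
   rounding mode must be switched to truncation around the store:

       fldcw   new_cw		; control word with RC = trunc (operand 3)
       fistpl  dest		; store the rounded-toward-zero integer, pop
       fldcw   saved_cw		; restore the caller's control word (operand 2)

   With fisttp a single "fisttp dest" truncates regardless of the current
   rounding mode, so no control word switching is needed.  */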
16208
16209 /* Output code for x87 ffreep insn. The OPNO argument, which may only
16210 have the values zero or one, indicates the ffreep insn's operand
16211 from the OPERANDS array. */
16212
16213 static const char *
16214 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
16215 {
16216 if (TARGET_USE_FFREEP)
16217 #ifdef HAVE_AS_IX86_FFREEP
16218 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
16219 #else
16220 {
16221 static char retval[32];
16222 int regno = REGNO (operands[opno]);
16223
16224 gcc_assert (STACK_REGNO_P (regno));
16225
16226 regno -= FIRST_STACK_REG;
16227
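/* Emit the two opcode bytes of "ffreep %st(regno)" by hand for assemblers
   that do not know the mnemonic: ASM_SHORT stores the 16-bit value
   0xc<regno>df little-endian, i.e. the bytes 0xdf, 0xc0+regno.  */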
16228 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
16229 return retval;
16230 }
16231 #endif
16232
16233 return opno ? "fstp\t%y1" : "fstp\t%y0";
16234 }
16235
16236
16237 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
16238 should be used. UNORDERED_P is true when fucom should be used. */
16239
16240 const char *
16241 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
16242 {
16243 int stack_top_dies;
16244 rtx cmp_op0, cmp_op1;
16245 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
16246
16247 if (eflags_p)
16248 {
16249 cmp_op0 = operands[0];
16250 cmp_op1 = operands[1];
16251 }
16252 else
16253 {
16254 cmp_op0 = operands[1];
16255 cmp_op1 = operands[2];
16256 }
16257
16258 if (is_sse)
16259 {
16260 if (GET_MODE (operands[0]) == SFmode)
16261 if (unordered_p)
16262 return "%vucomiss\t{%1, %0|%0, %1}";
16263 else
16264 return "%vcomiss\t{%1, %0|%0, %1}";
16265 else
16266 if (unordered_p)
16267 return "%vucomisd\t{%1, %0|%0, %1}";
16268 else
16269 return "%vcomisd\t{%1, %0|%0, %1}";
16270 }
16271
16272 gcc_assert (STACK_TOP_P (cmp_op0));
16273
16274 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16275
16276 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
16277 {
16278 if (stack_top_dies)
16279 {
16280 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
16281 return output_387_ffreep (operands, 1);
16282 }
16283 else
16284 return "ftst\n\tfnstsw\t%0";
16285 }
16286
16287 if (STACK_REG_P (cmp_op1)
16288 && stack_top_dies
16289 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
16290 && REGNO (cmp_op1) != FIRST_STACK_REG)
16291 {
16292 /* If the top of the 387 stack dies, and the other operand
16293 is also a stack register that dies, then this must be an
16294 `fcompp' float compare. */
16295
16296 if (eflags_p)
16297 {
16298 /* There is no double popping fcomi variant. Fortunately,
16299 eflags is immune from the fstp's cc clobbering. */
16300 if (unordered_p)
16301 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
16302 else
16303 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
16304 return output_387_ffreep (operands, 0);
16305 }
16306 else
16307 {
16308 if (unordered_p)
16309 return "fucompp\n\tfnstsw\t%0";
16310 else
16311 return "fcompp\n\tfnstsw\t%0";
16312 }
16313 }
16314 else
16315 {
16316 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
16317
16318 static const char * const alt[16] =
16319 {
16320 "fcom%Z2\t%y2\n\tfnstsw\t%0",
16321 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
16322 "fucom%Z2\t%y2\n\tfnstsw\t%0",
16323 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
16324
16325 "ficom%Z2\t%y2\n\tfnstsw\t%0",
16326 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
16327 NULL,
16328 NULL,
16329
16330 "fcomi\t{%y1, %0|%0, %y1}",
16331 "fcomip\t{%y1, %0|%0, %y1}",
16332 "fucomi\t{%y1, %0|%0, %y1}",
16333 "fucomip\t{%y1, %0|%0, %y1}",
16334
16335 NULL,
16336 NULL,
16337 NULL,
16338 NULL
16339 };
16340
16341 int mask;
16342 const char *ret;
16343
16344 mask = eflags_p << 3;
16345 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
16346 mask |= unordered_p << 1;
16347 mask |= stack_top_dies;
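/* For example, an eflags-setting unordered compare where the stack top
   dies gives mask = 8 + 2 + 1 = 11, selecting "fucomip" above.  */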
16348
16349 gcc_assert (mask < 16);
16350 ret = alt[mask];
16351 gcc_assert (ret);
16352
16353 return ret;
16354 }
16355 }
16356
16357 void
16358 ix86_output_addr_vec_elt (FILE *file, int value)
16359 {
16360 const char *directive = ASM_LONG;
16361
16362 #ifdef ASM_QUAD
16363 if (TARGET_LP64)
16364 directive = ASM_QUAD;
16365 #else
16366 gcc_assert (!TARGET_64BIT);
16367 #endif
16368
16369 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
16370 }
16371
16372 void
16373 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
16374 {
16375 const char *directive = ASM_LONG;
16376
16377 #ifdef ASM_QUAD
16378 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
16379 directive = ASM_QUAD;
16380 #else
16381 gcc_assert (!TARGET_64BIT);
16382 #endif
16383 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
16384 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
16385 fprintf (file, "%s%s%d-%s%d\n",
16386 directive, LPREFIX, value, LPREFIX, rel);
16387 else if (HAVE_AS_GOTOFF_IN_DATA)
16388 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
16389 #if TARGET_MACHO
16390 else if (TARGET_MACHO)
16391 {
16392 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
16393 machopic_output_function_base_name (file);
16394 putc ('\n', file);
16395 }
16396 #endif
16397 else
16398 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
16399 GOT_SYMBOL_NAME, LPREFIX, value);
16400 }
16401 \f
16402 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
16403 for the target. */
16404
16405 void
16406 ix86_expand_clear (rtx dest)
16407 {
16408 rtx tmp;
16409
16410 /* We play register width games, which are only valid after reload. */
16411 gcc_assert (reload_completed);
16412
16413 /* Avoid HImode and its attendant prefix byte. */
16414 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
16415 dest = gen_rtx_REG (SImode, REGNO (dest));
16416 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
16417
16418 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
16419 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
16420 {
16421 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16422 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
16423 }
16424
16425 emit_insn (tmp);
16426 }
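
/* For instance, clearing %eax prefers "xorl %eax, %eax" (2 bytes, and the
   zeroing idiom breaks register dependencies, but it clobbers the flags,
   hence the CLOBBER of FLAGS_REG above) over "movl $0, %eax" (5 bytes),
   which is only used when the target tuning prefers the mov form.  */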
16427
16428 /* X is an unchanging MEM. If it is a constant pool reference, return
16429 the constant pool rtx, else NULL. */
16430
16431 rtx
16432 maybe_get_pool_constant (rtx x)
16433 {
16434 x = ix86_delegitimize_address (XEXP (x, 0));
16435
16436 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
16437 return get_pool_constant (x);
16438
16439 return NULL_RTX;
16440 }
16441
16442 void
16443 ix86_expand_move (enum machine_mode mode, rtx operands[])
16444 {
16445 rtx op0, op1;
16446 enum tls_model model;
16447
16448 op0 = operands[0];
16449 op1 = operands[1];
16450
16451 if (GET_CODE (op1) == SYMBOL_REF)
16452 {
16453 rtx tmp;
16454
16455 model = SYMBOL_REF_TLS_MODEL (op1);
16456 if (model)
16457 {
16458 op1 = legitimize_tls_address (op1, model, true);
16459 op1 = force_operand (op1, op0);
16460 if (op1 == op0)
16461 return;
16462 op1 = convert_to_mode (mode, op1, 1);
16463 }
16464 else if ((tmp = legitimize_pe_coff_symbol (op1, false)) != NULL_RTX)
16465 op1 = tmp;
16466 }
16467 else if (GET_CODE (op1) == CONST
16468 && GET_CODE (XEXP (op1, 0)) == PLUS
16469 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
16470 {
16471 rtx addend = XEXP (XEXP (op1, 0), 1);
16472 rtx symbol = XEXP (XEXP (op1, 0), 0);
16473 rtx tmp;
16474
16475 model = SYMBOL_REF_TLS_MODEL (symbol);
16476 if (model)
16477 tmp = legitimize_tls_address (symbol, model, true);
16478 else
16479 tmp = legitimize_pe_coff_symbol (symbol, true);
16480
16481 if (tmp)
16482 {
16483 tmp = force_operand (tmp, NULL);
16484 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
16485 op0, 1, OPTAB_DIRECT);
16486 if (tmp == op0)
16487 return;
16488 op1 = convert_to_mode (mode, tmp, 1);
16489 }
16490 }
16491
16492 if ((flag_pic || MACHOPIC_INDIRECT)
16493 && symbolic_operand (op1, mode))
16494 {
16495 if (TARGET_MACHO && !TARGET_64BIT)
16496 {
16497 #if TARGET_MACHO
16498 /* dynamic-no-pic */
16499 if (MACHOPIC_INDIRECT)
16500 {
16501 rtx temp = ((reload_in_progress
16502 || ((op0 && REG_P (op0))
16503 && mode == Pmode))
16504 ? op0 : gen_reg_rtx (Pmode));
16505 op1 = machopic_indirect_data_reference (op1, temp);
16506 if (MACHOPIC_PURE)
16507 op1 = machopic_legitimize_pic_address (op1, mode,
16508 temp == op1 ? 0 : temp);
16509 }
16510 if (op0 != op1 && GET_CODE (op0) != MEM)
16511 {
16512 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16513 emit_insn (insn);
16514 return;
16515 }
16516 if (GET_CODE (op0) == MEM)
16517 op1 = force_reg (Pmode, op1);
16518 else
16519 {
16520 rtx temp = op0;
16521 if (GET_CODE (temp) != REG)
16522 temp = gen_reg_rtx (Pmode);
16523 temp = legitimize_pic_address (op1, temp);
16524 if (temp == op0)
16525 return;
16526 op1 = temp;
16527 }
16528 /* dynamic-no-pic */
16529 #endif
16530 }
16531 else
16532 {
16533 if (MEM_P (op0))
16534 op1 = force_reg (mode, op1);
16535 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16536 {
16537 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16538 op1 = legitimize_pic_address (op1, reg);
16539 if (op0 == op1)
16540 return;
16541 op1 = convert_to_mode (mode, op1, 1);
16542 }
16543 }
16544 }
16545 else
16546 {
16547 if (MEM_P (op0)
16548 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16549 || !push_operand (op0, mode))
16550 && MEM_P (op1))
16551 op1 = force_reg (mode, op1);
16552
16553 if (push_operand (op0, mode)
16554 && ! general_no_elim_operand (op1, mode))
16555 op1 = copy_to_mode_reg (mode, op1);
16556
16557 /* Force large constants in 64bit compilation into a register
16558 to get them CSEed. */
16559 if (can_create_pseudo_p ()
16560 && (mode == DImode) && TARGET_64BIT
16561 && immediate_operand (op1, mode)
16562 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16563 && !register_operand (op0, mode)
16564 && optimize)
16565 op1 = copy_to_mode_reg (mode, op1);
16566
16567 if (can_create_pseudo_p ()
16568 && FLOAT_MODE_P (mode)
16569 && GET_CODE (op1) == CONST_DOUBLE)
16570 {
16571 /* If we are loading a floating point constant to a register,
16572 force the value to memory now, since we'll get better code
16573 out of the back end. */
16574
16575 op1 = validize_mem (force_const_mem (mode, op1));
16576 if (!register_operand (op0, mode))
16577 {
16578 rtx temp = gen_reg_rtx (mode);
16579 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16580 emit_move_insn (op0, temp);
16581 return;
16582 }
16583 }
16584 }
16585
16586 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16587 }
16588
16589 void
16590 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16591 {
16592 rtx op0 = operands[0], op1 = operands[1];
16593 unsigned int align = GET_MODE_ALIGNMENT (mode);
16594
16595 /* Force constants other than zero into memory. We do not know how
16596 the instructions used to build constants modify the upper 64 bits
16597 of the register; once we have that information we may be able
16598 to handle some of them more efficiently. */
16599 if (can_create_pseudo_p ()
16600 && register_operand (op0, mode)
16601 && (CONSTANT_P (op1)
16602 || (GET_CODE (op1) == SUBREG
16603 && CONSTANT_P (SUBREG_REG (op1))))
16604 && !standard_sse_constant_p (op1))
16605 op1 = validize_mem (force_const_mem (mode, op1));
16606
16607 /* We need to check memory alignment for SSE modes since an attribute
16608 can make operands unaligned. */
16609 if (can_create_pseudo_p ()
16610 && SSE_REG_MODE_P (mode)
16611 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16612 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16613 {
16614 rtx tmp[2];
16615
16616 /* ix86_expand_vector_move_misalign() does not like constants ... */
16617 if (CONSTANT_P (op1)
16618 || (GET_CODE (op1) == SUBREG
16619 && CONSTANT_P (SUBREG_REG (op1))))
16620 op1 = validize_mem (force_const_mem (mode, op1));
16621
16622 /* ... nor both arguments in memory. */
16623 if (!register_operand (op0, mode)
16624 && !register_operand (op1, mode))
16625 op1 = force_reg (mode, op1);
16626
16627 tmp[0] = op0; tmp[1] = op1;
16628 ix86_expand_vector_move_misalign (mode, tmp);
16629 return;
16630 }
16631
16632 /* Make operand1 a register if neither operand is already a register. */
16633 if (can_create_pseudo_p ()
16634 && !register_operand (op0, mode)
16635 && !register_operand (op1, mode))
16636 {
16637 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16638 return;
16639 }
16640
16641 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16642 }
16643
16644 /* Split 32-byte AVX unaligned load and store if needed. */
16645
16646 static void
16647 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16648 {
16649 rtx m;
16650 rtx (*extract) (rtx, rtx, rtx);
16651 rtx (*load_unaligned) (rtx, rtx);
16652 rtx (*store_unaligned) (rtx, rtx);
16653 enum machine_mode mode;
16654
16655 switch (GET_MODE (op0))
16656 {
16657 default:
16658 gcc_unreachable ();
16659 case V32QImode:
16660 extract = gen_avx_vextractf128v32qi;
16661 load_unaligned = gen_avx_loaddquv32qi;
16662 store_unaligned = gen_avx_storedquv32qi;
16663 mode = V16QImode;
16664 break;
16665 case V8SFmode:
16666 extract = gen_avx_vextractf128v8sf;
16667 load_unaligned = gen_avx_loadups256;
16668 store_unaligned = gen_avx_storeups256;
16669 mode = V4SFmode;
16670 break;
16671 case V4DFmode:
16672 extract = gen_avx_vextractf128v4df;
16673 load_unaligned = gen_avx_loadupd256;
16674 store_unaligned = gen_avx_storeupd256;
16675 mode = V2DFmode;
16676 break;
16677 }
16678
16679 if (MEM_P (op1))
16680 {
16681 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16682 {
16683 rtx r = gen_reg_rtx (mode);
16684 m = adjust_address (op1, mode, 0);
16685 emit_move_insn (r, m);
16686 m = adjust_address (op1, mode, 16);
16687 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16688 emit_move_insn (op0, r);
16689 }
16690 /* Normal *mov<mode>_internal pattern will handle
16691 unaligned loads just fine if misaligned_operand
16692 is true, and without the UNSPEC it can be combined
16693 with arithmetic instructions. */
16694 else if (misaligned_operand (op1, GET_MODE (op1)))
16695 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16696 else
16697 emit_insn (load_unaligned (op0, op1));
16698 }
16699 else if (MEM_P (op0))
16700 {
16701 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16702 {
16703 m = adjust_address (op0, mode, 0);
16704 emit_insn (extract (m, op1, const0_rtx));
16705 m = adjust_address (op0, mode, 16);
16706 emit_insn (extract (m, op1, const1_rtx));
16707 }
16708 else
16709 emit_insn (store_unaligned (op0, op1));
16710 }
16711 else
16712 gcc_unreachable ();
16713 }
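
/* A rough sketch of the asm the split paths above typically expand to
   (register and address choices are illustrative, not from a real dump).
   A split 32-byte unaligned load becomes two 16-byte halves:

       vmovups      (%rax), %xmm0
       vinsertf128  $0x1, 16(%rax), %ymm0, %ymm0

   and a split 32-byte unaligned store becomes:

       vmovups      %xmm0, (%rax)
       vextractf128 $0x1, %ymm0, 16(%rax)

   This is preferred on targets where a single 32-byte unaligned access is
   slow (TARGET_AVX256_SPLIT_UNALIGNED_LOAD/STORE).  */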
16714
16715 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16716 straight to ix86_expand_vector_move. */
16717 /* Code generation for scalar reg-reg moves of single and double precision data:
16718 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16719 movaps reg, reg
16720 else
16721 movss reg, reg
16722 if (x86_sse_partial_reg_dependency == true)
16723 movapd reg, reg
16724 else
16725 movsd reg, reg
16726
16727 Code generation for scalar loads of double precision data:
16728 if (x86_sse_split_regs == true)
16729 movlpd mem, reg (gas syntax)
16730 else
16731 movsd mem, reg
16732
16733 Code generation for unaligned packed loads of single precision data
16734 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16735 if (x86_sse_unaligned_move_optimal)
16736 movups mem, reg
16737
16738 if (x86_sse_partial_reg_dependency == true)
16739 {
16740 xorps reg, reg
16741 movlps mem, reg
16742 movhps mem+8, reg
16743 }
16744 else
16745 {
16746 movlps mem, reg
16747 movhps mem+8, reg
16748 }
16749
16750 Code generation for unaligned packed loads of double precision data
16751 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16752 if (x86_sse_unaligned_move_optimal)
16753 movupd mem, reg
16754
16755 if (x86_sse_split_regs == true)
16756 {
16757 movlpd mem, reg
16758 movhpd mem+8, reg
16759 }
16760 else
16761 {
16762 movsd mem, reg
16763 movhpd mem+8, reg
16764 }
16765 */
16766
16767 void
16768 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16769 {
16770 rtx op0, op1, orig_op0 = NULL_RTX, m;
16771 rtx (*load_unaligned) (rtx, rtx);
16772 rtx (*store_unaligned) (rtx, rtx);
16773
16774 op0 = operands[0];
16775 op1 = operands[1];
16776
16777 if (GET_MODE_SIZE (mode) == 64)
16778 {
16779 switch (GET_MODE_CLASS (mode))
16780 {
16781 case MODE_VECTOR_INT:
16782 case MODE_INT:
16783 if (GET_MODE (op0) != V16SImode)
16784 {
16785 if (!MEM_P (op0))
16786 {
16787 orig_op0 = op0;
16788 op0 = gen_reg_rtx (V16SImode);
16789 }
16790 else
16791 op0 = gen_lowpart (V16SImode, op0);
16792 }
16793 op1 = gen_lowpart (V16SImode, op1);
16794 /* FALLTHRU */
16795
16796 case MODE_VECTOR_FLOAT:
16797 switch (GET_MODE (op0))
16798 {
16799 default:
16800 gcc_unreachable ();
16801 case V16SImode:
16802 load_unaligned = gen_avx512f_loaddquv16si;
16803 store_unaligned = gen_avx512f_storedquv16si;
16804 break;
16805 case V16SFmode:
16806 load_unaligned = gen_avx512f_loadups512;
16807 store_unaligned = gen_avx512f_storeups512;
16808 break;
16809 case V8DFmode:
16810 load_unaligned = gen_avx512f_loadupd512;
16811 store_unaligned = gen_avx512f_storeupd512;
16812 break;
16813 }
16814
16815 if (MEM_P (op1))
16816 emit_insn (load_unaligned (op0, op1));
16817 else if (MEM_P (op0))
16818 emit_insn (store_unaligned (op0, op1));
16819 else
16820 gcc_unreachable ();
16821 if (orig_op0)
16822 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16823 break;
16824
16825 default:
16826 gcc_unreachable ();
16827 }
16828
16829 return;
16830 }
16831
16832 if (TARGET_AVX
16833 && GET_MODE_SIZE (mode) == 32)
16834 {
16835 switch (GET_MODE_CLASS (mode))
16836 {
16837 case MODE_VECTOR_INT:
16838 case MODE_INT:
16839 if (GET_MODE (op0) != V32QImode)
16840 {
16841 if (!MEM_P (op0))
16842 {
16843 orig_op0 = op0;
16844 op0 = gen_reg_rtx (V32QImode);
16845 }
16846 else
16847 op0 = gen_lowpart (V32QImode, op0);
16848 }
16849 op1 = gen_lowpart (V32QImode, op1);
16850 /* FALLTHRU */
16851
16852 case MODE_VECTOR_FLOAT:
16853 ix86_avx256_split_vector_move_misalign (op0, op1);
16854 if (orig_op0)
16855 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16856 break;
16857
16858 default:
16859 gcc_unreachable ();
16860 }
16861
16862 return;
16863 }
16864
16865 if (MEM_P (op1))
16866 {
16867 /* Normal *mov<mode>_internal pattern will handle
16868 unaligned loads just fine if misaligned_operand
16869 is true, and without the UNSPEC it can be combined
16870 with arithmetic instructions. */
16871 if (TARGET_AVX
16872 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
16873 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
16874 && misaligned_operand (op1, GET_MODE (op1)))
16875 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16876 /* ??? If we have typed data, then it would appear that using
16877 movdqu is the only way to get unaligned data loaded with
16878 integer type. */
16879 else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16880 {
16881 if (GET_MODE (op0) != V16QImode)
16882 {
16883 orig_op0 = op0;
16884 op0 = gen_reg_rtx (V16QImode);
16885 }
16886 op1 = gen_lowpart (V16QImode, op1);
16887 /* We will eventually emit movups based on insn attributes. */
16888 emit_insn (gen_sse2_loaddquv16qi (op0, op1));
16889 if (orig_op0)
16890 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16891 }
16892 else if (TARGET_SSE2 && mode == V2DFmode)
16893 {
16894 rtx zero;
16895
16896 if (TARGET_AVX
16897 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16898 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16899 || optimize_insn_for_size_p ())
16900 {
16901 /* We will eventually emit movups based on insn attributes. */
16902 emit_insn (gen_sse2_loadupd (op0, op1));
16903 return;
16904 }
16905
16906 /* When SSE registers are split into halves, we can avoid
16907 writing to the top half twice. */
16908 if (TARGET_SSE_SPLIT_REGS)
16909 {
16910 emit_clobber (op0);
16911 zero = op0;
16912 }
16913 else
16914 {
16915 /* ??? Not sure about the best option for the Intel chips.
16916 The following would seem to satisfy; the register is
16917 entirely cleared, breaking the dependency chain. We
16918 then store to the upper half, with a dependency depth
16919 of one. A rumor has it that Intel recommends two movsd
16920 followed by an unpacklpd, but this is unconfirmed. And
16921 given that the dependency depth of the unpacklpd would
16922 still be one, I'm not sure why this would be better. */
16923 zero = CONST0_RTX (V2DFmode);
16924 }
16925
16926 m = adjust_address (op1, DFmode, 0);
16927 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16928 m = adjust_address (op1, DFmode, 8);
16929 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16930 }
16931 else
16932 {
16933 rtx t;
16934
16935 if (TARGET_AVX
16936 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16937 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16938 || optimize_insn_for_size_p ())
16939 {
16940 if (GET_MODE (op0) != V4SFmode)
16941 {
16942 orig_op0 = op0;
16943 op0 = gen_reg_rtx (V4SFmode);
16944 }
16945 op1 = gen_lowpart (V4SFmode, op1);
16946 emit_insn (gen_sse_loadups (op0, op1));
16947 if (orig_op0)
16948 emit_move_insn (orig_op0,
16949 gen_lowpart (GET_MODE (orig_op0), op0));
16950 return;
16951 }
16952
16953 if (mode != V4SFmode)
16954 t = gen_reg_rtx (V4SFmode);
16955 else
16956 t = op0;
16957
16958 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16959 emit_move_insn (t, CONST0_RTX (V4SFmode));
16960 else
16961 emit_clobber (t);
16962
16963 m = adjust_address (op1, V2SFmode, 0);
16964 emit_insn (gen_sse_loadlps (t, t, m));
16965 m = adjust_address (op1, V2SFmode, 8);
16966 emit_insn (gen_sse_loadhps (t, t, m));
16967 if (mode != V4SFmode)
16968 emit_move_insn (op0, gen_lowpart (mode, t));
16969 }
16970 }
16971 else if (MEM_P (op0))
16972 {
16973 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16974 {
16975 op0 = gen_lowpart (V16QImode, op0);
16976 op1 = gen_lowpart (V16QImode, op1);
16977 /* We will eventually emit movups based on insn attributes. */
16978 emit_insn (gen_sse2_storedquv16qi (op0, op1));
16979 }
16980 else if (TARGET_SSE2 && mode == V2DFmode)
16981 {
16982 if (TARGET_AVX
16983 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16984 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16985 || optimize_insn_for_size_p ())
16986 /* We will eventually emit movups based on insn attributes. */
16987 emit_insn (gen_sse2_storeupd (op0, op1));
16988 else
16989 {
16990 m = adjust_address (op0, DFmode, 0);
16991 emit_insn (gen_sse2_storelpd (m, op1));
16992 m = adjust_address (op0, DFmode, 8);
16993 emit_insn (gen_sse2_storehpd (m, op1));
16994 }
16995 }
16996 else
16997 {
16998 if (mode != V4SFmode)
16999 op1 = gen_lowpart (V4SFmode, op1);
17000
17001 if (TARGET_AVX
17002 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
17003 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
17004 || optimize_insn_for_size_p ())
17005 {
17006 op0 = gen_lowpart (V4SFmode, op0);
17007 emit_insn (gen_sse_storeups (op0, op1));
17008 }
17009 else
17010 {
17011 m = adjust_address (op0, V2SFmode, 0);
17012 emit_insn (gen_sse_storelps (m, op1));
17013 m = adjust_address (op0, V2SFmode, 8);
17014 emit_insn (gen_sse_storehps (m, op1));
17015 }
17016 }
17017 }
17018 else
17019 gcc_unreachable ();
17020 }
17021
17022 /* Expand a push in MODE. This is some mode for which we do not support
17023 proper push instructions, at least from the registers that we expect
17024 the value to live in. */
17025
17026 void
17027 ix86_expand_push (enum machine_mode mode, rtx x)
17028 {
17029 rtx tmp;
17030
17031 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
17032 GEN_INT (-GET_MODE_SIZE (mode)),
17033 stack_pointer_rtx, 1, OPTAB_DIRECT);
17034 if (tmp != stack_pointer_rtx)
17035 emit_move_insn (stack_pointer_rtx, tmp);
17036
17037 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
17038
17039 /* When we push an operand onto the stack, it has to be aligned at least
17040 at the function argument boundary. However, since we don't have
17041 the argument type, we can't determine the actual argument
17042 boundary. */
17043 emit_move_insn (tmp, x);
17044 }
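
/* Illustratively (the exact instructions depend on MODE and the target),
   the expansion above turns the push into an explicit stack adjustment
   followed by an ordinary store:

       sub	$N, %esp	; N = GET_MODE_SIZE (mode)
       mov	the value to (%esp)

   instead of a real push instruction, which does not exist for such
   operands.  */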
17045
17046 /* Helper function of ix86_fixup_binary_operands to canonicalize
17047 operand order. Returns true if the operands should be swapped. */
17048
17049 static bool
17050 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
17051 rtx operands[])
17052 {
17053 rtx dst = operands[0];
17054 rtx src1 = operands[1];
17055 rtx src2 = operands[2];
17056
17057 /* If the operation is not commutative, we can't do anything. */
17058 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
17059 return false;
17060
17061 /* Highest priority is that src1 should match dst. */
17062 if (rtx_equal_p (dst, src1))
17063 return false;
17064 if (rtx_equal_p (dst, src2))
17065 return true;
17066
17067 /* Next highest priority is that immediate constants come second. */
17068 if (immediate_operand (src2, mode))
17069 return false;
17070 if (immediate_operand (src1, mode))
17071 return true;
17072
17073 /* Lowest priority is that memory references should come second. */
17074 if (MEM_P (src2))
17075 return false;
17076 if (MEM_P (src1))
17077 return true;
17078
17079 return false;
17080 }
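
/* For example (illustrative operands): for a commutative PLUS with
   operands (const_int 8, reg B) and destination B, the function returns
   true; swapping makes src1 match the destination and moves the immediate
   into operands[2], the "dst = dst op reg/mem/imm" shape that the
   two-address x86 instructions want.  */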
17081
17082
17083 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
17084 destination to use for the operation. If different from the true
17085 destination in operands[0], a copy operation will be required. */
17086
17087 rtx
17088 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
17089 rtx operands[])
17090 {
17091 rtx dst = operands[0];
17092 rtx src1 = operands[1];
17093 rtx src2 = operands[2];
17094
17095 /* Canonicalize operand order. */
17096 if (ix86_swap_binary_operands_p (code, mode, operands))
17097 {
17098 rtx temp;
17099
17100 /* It is invalid to swap operands of different modes. */
17101 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
17102
17103 temp = src1;
17104 src1 = src2;
17105 src2 = temp;
17106 }
17107
17108 /* Both source operands cannot be in memory. */
17109 if (MEM_P (src1) && MEM_P (src2))
17110 {
17111 /* Optimization: Only read from memory once. */
17112 if (rtx_equal_p (src1, src2))
17113 {
17114 src2 = force_reg (mode, src2);
17115 src1 = src2;
17116 }
17117 else if (rtx_equal_p (dst, src1))
17118 src2 = force_reg (mode, src2);
17119 else
17120 src1 = force_reg (mode, src1);
17121 }
17122
17123 /* If the destination is memory, and we do not have matching source
17124 operands, do things in registers. */
17125 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17126 dst = gen_reg_rtx (mode);
17127
17128 /* Source 1 cannot be a constant. */
17129 if (CONSTANT_P (src1))
17130 src1 = force_reg (mode, src1);
17131
17132 /* Source 1 cannot be a non-matching memory. */
17133 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17134 src1 = force_reg (mode, src1);
17135
17136 /* Improve address combine. */
17137 if (code == PLUS
17138 && GET_MODE_CLASS (mode) == MODE_INT
17139 && MEM_P (src2))
17140 src2 = force_reg (mode, src2);
17141
17142 operands[1] = src1;
17143 operands[2] = src2;
17144 return dst;
17145 }
17146
17147 /* Similarly, but assume that the destination has already been
17148 set up properly. */
17149
17150 void
17151 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
17152 enum machine_mode mode, rtx operands[])
17153 {
17154 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
17155 gcc_assert (dst == operands[0]);
17156 }
17157
17158 /* Attempt to expand a binary operator. Make the expansion closer to the
17159 actual machine than just general_operand, which would allow 3 separate
17160 memory references (one output, two inputs) in a single insn. */
17161
17162 void
17163 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
17164 rtx operands[])
17165 {
17166 rtx src1, src2, dst, op, clob;
17167
17168 dst = ix86_fixup_binary_operands (code, mode, operands);
17169 src1 = operands[1];
17170 src2 = operands[2];
17171
17172 /* Emit the instruction. */
17173
17174 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
17175 if (reload_in_progress)
17176 {
17177 /* Reload doesn't know about the flags register, and doesn't know that
17178 it doesn't want to clobber it. We can only do this with PLUS. */
17179 gcc_assert (code == PLUS);
17180 emit_insn (op);
17181 }
17182 else if (reload_completed
17183 && code == PLUS
17184 && !rtx_equal_p (dst, src1))
17185 {
17186 /* This is going to be an LEA; avoid splitting it later. */
17187 emit_insn (op);
17188 }
17189 else
17190 {
17191 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17192 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17193 }
17194
17195 /* Fix up the destination if needed. */
17196 if (dst != operands[0])
17197 emit_move_insn (operands[0], dst);
17198 }
17199
17200 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
17201 the given OPERANDS. */
17202
17203 void
17204 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
17205 rtx operands[])
17206 {
17207 rtx op1 = NULL_RTX, op2 = NULL_RTX;
17208 if (GET_CODE (operands[1]) == SUBREG)
17209 {
17210 op1 = operands[1];
17211 op2 = operands[2];
17212 }
17213 else if (GET_CODE (operands[2]) == SUBREG)
17214 {
17215 op1 = operands[2];
17216 op2 = operands[1];
17217 }
17218 /* Optimize (__m128i) d | (__m128i) e and similar code
17219 when d and e are float vectors into a float vector logical
17220 insn. In C/C++, without using intrinsics there is no other way
17221 to express a vector logical operation on float vectors than
17222 to cast them temporarily to integer vectors. */
17223 if (op1
17224 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
17225 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
17226 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
17227 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
17228 && SUBREG_BYTE (op1) == 0
17229 && (GET_CODE (op2) == CONST_VECTOR
17230 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
17231 && SUBREG_BYTE (op2) == 0))
17232 && can_create_pseudo_p ())
17233 {
17234 rtx dst;
17235 switch (GET_MODE (SUBREG_REG (op1)))
17236 {
17237 case V4SFmode:
17238 case V8SFmode:
17239 case V2DFmode:
17240 case V4DFmode:
17241 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
17242 if (GET_CODE (op2) == CONST_VECTOR)
17243 {
17244 op2 = gen_lowpart (GET_MODE (dst), op2);
17245 op2 = force_reg (GET_MODE (dst), op2);
17246 }
17247 else
17248 {
17249 op1 = operands[1];
17250 op2 = SUBREG_REG (operands[2]);
17251 if (!nonimmediate_operand (op2, GET_MODE (dst)))
17252 op2 = force_reg (GET_MODE (dst), op2);
17253 }
17254 op1 = SUBREG_REG (op1);
17255 if (!nonimmediate_operand (op1, GET_MODE (dst)))
17256 op1 = force_reg (GET_MODE (dst), op1);
17257 emit_insn (gen_rtx_SET (VOIDmode, dst,
17258 gen_rtx_fmt_ee (code, GET_MODE (dst),
17259 op1, op2)));
17260 emit_move_insn (operands[0], gen_lowpart (mode, dst));
17261 return;
17262 default:
17263 break;
17264 }
17265 }
17266 if (!nonimmediate_operand (operands[1], mode))
17267 operands[1] = force_reg (mode, operands[1]);
17268 if (!nonimmediate_operand (operands[2], mode))
17269 operands[2] = force_reg (mode, operands[2]);
17270 ix86_fixup_binary_operands_no_copy (code, mode, operands);
17271 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
17272 gen_rtx_fmt_ee (code, mode, operands[1],
17273 operands[2])));
17274 }
17275
17276 /* Return TRUE or FALSE depending on whether the binary operator meets the
17277 appropriate constraints. */
17278
17279 bool
17280 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
17281 rtx operands[3])
17282 {
17283 rtx dst = operands[0];
17284 rtx src1 = operands[1];
17285 rtx src2 = operands[2];
17286
17287 /* Both source operands cannot be in memory. */
17288 if (MEM_P (src1) && MEM_P (src2))
17289 return false;
17290
17291 /* Canonicalize operand order for commutative operators. */
17292 if (ix86_swap_binary_operands_p (code, mode, operands))
17293 {
17294 rtx temp = src1;
17295 src1 = src2;
17296 src2 = temp;
17297 }
17298
17299 /* If the destination is memory, we must have a matching source operand. */
17300 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17301 return false;
17302
17303 /* Source 1 cannot be a constant. */
17304 if (CONSTANT_P (src1))
17305 return false;
17306
17307 /* Source 1 cannot be a non-matching memory. */
17308 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17309 /* Support "andhi/andsi/anddi" as a zero-extending move. */
17310 return (code == AND
17311 && (mode == HImode
17312 || mode == SImode
17313 || (TARGET_64BIT && mode == DImode))
17314 && satisfies_constraint_L (src2));
17315
17316 return true;
17317 }
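
/* The zero-extension special case above covers, e.g., "reg = mem & 0xff":
   even though the memory source does not match the destination, the insn
   can be implemented as a zero-extending byte load, so it is still
   accepted (satisfies_constraint_L admits the masks usable this way,
   such as 0xff and 0xffff).  */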
17318
17319 /* Attempt to expand a unary operator. Make the expansion closer to the
17320 actual machine than just general_operand, which would allow 2 separate
17321 memory references (one output, one input) in a single insn. */
17322
17323 void
17324 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
17325 rtx operands[])
17326 {
17327 int matching_memory;
17328 rtx src, dst, op, clob;
17329
17330 dst = operands[0];
17331 src = operands[1];
17332
17333 /* If the destination is memory, and we do not have matching source
17334 operands, do things in registers. */
17335 matching_memory = 0;
17336 if (MEM_P (dst))
17337 {
17338 if (rtx_equal_p (dst, src))
17339 matching_memory = 1;
17340 else
17341 dst = gen_reg_rtx (mode);
17342 }
17343
17344 /* When the source operand is memory, the destination must match. */
17345 if (MEM_P (src) && !matching_memory)
17346 src = force_reg (mode, src);
17347
17348 /* Emit the instruction. */
17349
17350 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
17351 if (reload_in_progress || code == NOT)
17352 {
17353 /* Reload doesn't know about the flags register, and doesn't know that
17354 it doesn't want to clobber it. */
17355 gcc_assert (code == NOT);
17356 emit_insn (op);
17357 }
17358 else
17359 {
17360 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17361 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17362 }
17363
17364 /* Fix up the destination if needed. */
17365 if (dst != operands[0])
17366 emit_move_insn (operands[0], dst);
17367 }
17368
17369 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
17370 divisor are within the range [0-255]. */
17371
17372 void
17373 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
17374 bool signed_p)
17375 {
17376 rtx end_label, qimode_label;
17377 rtx insn, div, mod;
17378 rtx scratch, tmp0, tmp1, tmp2;
17379 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
17380 rtx (*gen_zero_extend) (rtx, rtx);
17381 rtx (*gen_test_ccno_1) (rtx, rtx);
17382
17383 switch (mode)
17384 {
17385 case SImode:
17386 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
17387 gen_test_ccno_1 = gen_testsi_ccno_1;
17388 gen_zero_extend = gen_zero_extendqisi2;
17389 break;
17390 case DImode:
17391 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
17392 gen_test_ccno_1 = gen_testdi_ccno_1;
17393 gen_zero_extend = gen_zero_extendqidi2;
17394 break;
17395 default:
17396 gcc_unreachable ();
17397 }
17398
17399 end_label = gen_label_rtx ();
17400 qimode_label = gen_label_rtx ();
17401
17402 scratch = gen_reg_rtx (mode);
17403
17404 /* Use 8bit unsigned divmod if the dividend and divisor are within
17405 the range [0-255]. */
17406 emit_move_insn (scratch, operands[2]);
17407 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
17408 scratch, 1, OPTAB_DIRECT);
17409 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
17410 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
17411 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
17412 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
17413 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
17414 pc_rtx);
17415 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
17416 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17417 JUMP_LABEL (insn) = qimode_label;
17418
17419 /* Generate the original signed/unsigned divmod. */
17420 div = gen_divmod4_1 (operands[0], operands[1],
17421 operands[2], operands[3]);
17422 emit_insn (div);
17423
17424 /* Branch to the end. */
17425 emit_jump_insn (gen_jump (end_label));
17426 emit_barrier ();
17427
17428 /* Generate 8bit unsigned divide. */
17429 emit_label (qimode_label);
17430 /* Don't use operands[0] for result of 8bit divide since not all
17431 registers support QImode ZERO_EXTRACT. */
17432 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
17433 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
17434 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
17435 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
17436
17437 if (signed_p)
17438 {
17439 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
17440 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
17441 }
17442 else
17443 {
17444 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
17445 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
17446 }
17447
17448 /* Extract remainder from AH. */
17449 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
17450 if (REG_P (operands[1]))
17451 insn = emit_move_insn (operands[1], tmp1);
17452 else
17453 {
17454 /* Need a new scratch register since the old one has result
17455 of 8bit divide. */
17456 scratch = gen_reg_rtx (mode);
17457 emit_move_insn (scratch, tmp1);
17458 insn = emit_move_insn (operands[1], scratch);
17459 }
17460 set_unique_reg_note (insn, REG_EQUAL, mod);
17461
17462 /* Zero extend quotient from AL. */
17463 tmp1 = gen_lowpart (QImode, tmp0);
17464 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
17465 set_unique_reg_note (insn, REG_EQUAL, div);
17466
17467 emit_label (end_label);
17468 }
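
/* A rough shape of the code emitted above (illustrative mnemonics and
   labels; register allocation and the exact full-width divide sequence
   will differ):

       mov	dividend, %tmp
       or	divisor, %tmp
       test	$-0x100, %tmp		; both values fit in the low byte?
       je	.Lqimode		; yes -> cheap 8-bit path
       ...full-width signed/unsigned divmod of operands[2]/operands[3]...
       jmp	.Lend
   .Lqimode:
       divb	divisor			; 8-bit unsigned divide of %ax:
					; %al = quotient, %ah = remainder
   .Lend:

   which is why the remainder is then extracted from bits 8-15 of the
   scratch and the quotient zero-extended from its low byte.  */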
17469
17470 /* Whether it is OK to emit CFI directives when emitting asm code. */
17471
17472 bool
17473 ix86_emit_cfi ()
17474 {
17475 return dwarf2out_do_cfi_asm ();
17476 }
17477
17478 #define LEA_MAX_STALL (3)
17479 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
17480
17481 /* Increase the given DISTANCE in half-cycles according to
17482 dependencies between the PREV and NEXT instructions.
17483 Add 1 half-cycle if there is no dependency and
17484 go to the next cycle if there is some dependency. */
17485
17486 static unsigned int
17487 increase_distance (rtx prev, rtx next, unsigned int distance)
17488 {
17489 df_ref *use_rec;
17490 df_ref *def_rec;
17491
17492 if (!prev || !next)
17493 return distance + (distance & 1) + 2;
17494
17495 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
17496 return distance + 1;
17497
17498 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
17499 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
17500 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
17501 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
17502 return distance + (distance & 1) + 2;
17503
17504 return distance + 1;
17505 }
17506
17507 /* Return true if instruction INSN defines register number
17508 REGNO1 or REGNO2. */
17509
17510 static bool
17511 insn_defines_reg (unsigned int regno1, unsigned int regno2,
17512 rtx insn)
17513 {
17514 df_ref *def_rec;
17515
17516 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
17517 if (DF_REF_REG_DEF_P (*def_rec)
17518 && !DF_REF_IS_ARTIFICIAL (*def_rec)
17519 && (regno1 == DF_REF_REGNO (*def_rec)
17520 || regno2 == DF_REF_REGNO (*def_rec)))
17521 {
17522 return true;
17523 }
17524
17525 return false;
17526 }
17527
17528 /* Return true if instruction INSN uses register number
17529 REGNO as part of an address expression. */
17530
17531 static bool
17532 insn_uses_reg_mem (unsigned int regno, rtx insn)
17533 {
17534 df_ref *use_rec;
17535
17536 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
17537 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
17538 return true;
17539
17540 return false;
17541 }
17542
17543 /* Search backward for non-agu definition of register number REGNO1
17544 or register number REGNO2 in basic block starting from instruction
17545 START up to head of basic block or instruction INSN.
17546
17547 Set *FOUND to true if such a definition was found
17548 and to false otherwise.
17549
17550 Distance in half-cycles between START and found instruction or head
17551 of BB is added to DISTANCE and returned. */
17552
17553 static int
17554 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
17555 rtx insn, int distance,
17556 rtx start, bool *found)
17557 {
17558 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17559 rtx prev = start;
17560 rtx next = NULL;
17561
17562 *found = false;
17563
17564 while (prev
17565 && prev != insn
17566 && distance < LEA_SEARCH_THRESHOLD)
17567 {
17568 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
17569 {
17570 distance = increase_distance (prev, next, distance);
17571 if (insn_defines_reg (regno1, regno2, prev))
17572 {
17573 if (recog_memoized (prev) < 0
17574 || get_attr_type (prev) != TYPE_LEA)
17575 {
17576 *found = true;
17577 return distance;
17578 }
17579 }
17580
17581 next = prev;
17582 }
17583 if (prev == BB_HEAD (bb))
17584 break;
17585
17586 prev = PREV_INSN (prev);
17587 }
17588
17589 return distance;
17590 }
17591
17592 /* Search backward for non-agu definition of register number REGNO1
17593 or register number REGNO2 in INSN's basic block until
17594 1. Pass LEA_SEARCH_THRESHOLD instructions, or
17595 2. Reach the neighbouring BB boundary, or
17596 3. Reach an agu definition.
17597 Returns the distance between the non-agu definition point and INSN.
17598 If no definition point, returns -1. */
17599
17600 static int
17601 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
17602 rtx insn)
17603 {
17604 basic_block bb = BLOCK_FOR_INSN (insn);
17605 int distance = 0;
17606 bool found = false;
17607
17608 if (insn != BB_HEAD (bb))
17609 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
17610 distance, PREV_INSN (insn),
17611 &found);
17612
17613 if (!found && distance < LEA_SEARCH_THRESHOLD)
17614 {
17615 edge e;
17616 edge_iterator ei;
17617 bool simple_loop = false;
17618
17619 FOR_EACH_EDGE (e, ei, bb->preds)
17620 if (e->src == bb)
17621 {
17622 simple_loop = true;
17623 break;
17624 }
17625
17626 if (simple_loop)
17627 distance = distance_non_agu_define_in_bb (regno1, regno2,
17628 insn, distance,
17629 BB_END (bb), &found);
17630 else
17631 {
17632 int shortest_dist = -1;
17633 bool found_in_bb = false;
17634
17635 FOR_EACH_EDGE (e, ei, bb->preds)
17636 {
17637 int bb_dist
17638 = distance_non_agu_define_in_bb (regno1, regno2,
17639 insn, distance,
17640 BB_END (e->src),
17641 &found_in_bb);
17642 if (found_in_bb)
17643 {
17644 if (shortest_dist < 0)
17645 shortest_dist = bb_dist;
17646 else if (bb_dist > 0)
17647 shortest_dist = MIN (bb_dist, shortest_dist);
17648
17649 found = true;
17650 }
17651 }
17652
17653 distance = shortest_dist;
17654 }
17655 }
17656
17657 /* get_attr_type may modify recog data. We want to make sure
17658 that recog data is valid for instruction INSN, on which
17659 distance_non_agu_define is called. INSN is unchanged here. */
17660 extract_insn_cached (insn);
17661
17662 if (!found)
17663 return -1;
17664
17665 return distance >> 1;
17666 }
17667
17668 /* Return the distance in half-cycles between INSN and the next
17669 insn that uses register number REGNO in a memory address, added
17670 to DISTANCE. Return -1 if REGNO is set.
17671
17672 Set *FOUND to true if a register use was found and
17673 to false otherwise.
17674 Set *REDEFINED to true if a register redefinition was
17675 found and to false otherwise. */
17676
17677 static int
17678 distance_agu_use_in_bb (unsigned int regno,
17679 rtx insn, int distance, rtx start,
17680 bool *found, bool *redefined)
17681 {
17682 basic_block bb = NULL;
17683 rtx next = start;
17684 rtx prev = NULL;
17685
17686 *found = false;
17687 *redefined = false;
17688
17689 if (start != NULL_RTX)
17690 {
17691 bb = BLOCK_FOR_INSN (start);
17692 if (start != BB_HEAD (bb))
17693 /* If insn and start belong to the same bb, set prev to insn,
17694 so the call to increase_distance will increase the distance
17695 between insns by 1. */
17696 prev = insn;
17697 }
17698
17699 while (next
17700 && next != insn
17701 && distance < LEA_SEARCH_THRESHOLD)
17702 {
17703 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17704 {
17705 distance = increase_distance (prev, next, distance);
17706 if (insn_uses_reg_mem (regno, next))
17707 {
17708 /* Return DISTANCE if OP0 is used in memory
17709 address in NEXT. */
17710 *found = true;
17711 return distance;
17712 }
17713
17714 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17715 {
17716 /* Return -1 if OP0 is set in NEXT. */
17717 *redefined = true;
17718 return -1;
17719 }
17720
17721 prev = next;
17722 }
17723
17724 if (next == BB_END (bb))
17725 break;
17726
17727 next = NEXT_INSN (next);
17728 }
17729
17730 return distance;
17731 }
17732
17733 /* Return the distance between INSN and the next insn that uses
17734 register number REGNO0 in a memory address. Return -1 if no such
17735 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
17736
17737 static int
17738 distance_agu_use (unsigned int regno0, rtx insn)
17739 {
17740 basic_block bb = BLOCK_FOR_INSN (insn);
17741 int distance = 0;
17742 bool found = false;
17743 bool redefined = false;
17744
17745 if (insn != BB_END (bb))
17746 distance = distance_agu_use_in_bb (regno0, insn, distance,
17747 NEXT_INSN (insn),
17748 &found, &redefined);
17749
17750 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17751 {
17752 edge e;
17753 edge_iterator ei;
17754 bool simple_loop = false;
17755
17756 FOR_EACH_EDGE (e, ei, bb->succs)
17757 if (e->dest == bb)
17758 {
17759 simple_loop = true;
17760 break;
17761 }
17762
17763 if (simple_loop)
17764 distance = distance_agu_use_in_bb (regno0, insn,
17765 distance, BB_HEAD (bb),
17766 &found, &redefined);
17767 else
17768 {
17769 int shortest_dist = -1;
17770 bool found_in_bb = false;
17771 bool redefined_in_bb = false;
17772
17773 FOR_EACH_EDGE (e, ei, bb->succs)
17774 {
17775 int bb_dist
17776 = distance_agu_use_in_bb (regno0, insn,
17777 distance, BB_HEAD (e->dest),
17778 &found_in_bb, &redefined_in_bb);
17779 if (found_in_bb)
17780 {
17781 if (shortest_dist < 0)
17782 shortest_dist = bb_dist;
17783 else if (bb_dist > 0)
17784 shortest_dist = MIN (bb_dist, shortest_dist);
17785
17786 found = true;
17787 }
17788 }
17789
17790 distance = shortest_dist;
17791 }
17792 }
17793
17794 if (!found || redefined)
17795 return -1;
17796
17797 return distance >> 1;
17798 }
17799
17800 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17801 there is a dilemma of choosing between LEA and ADD.
17802 Negative value: ADD is preferred over LEA
17803 Zero: Neutral
17804 Positive value: LEA is preferred over ADD */
17805 #define IX86_LEA_PRIORITY 0
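/* Note: IX86_LEA_PRIORITY is simply added to the definition distance in
   ix86_lea_outperforms below, so a positive value biases the final
   dist_define >= dist_use test towards keeping the lea and a negative
   value biases it towards the split/ADD form. */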
17806
17807 /* Return true if using lea for INSN has a performance advantage
17808 over a sequence of instructions. The instruction sequence has
17809 SPLIT_COST cycles higher latency than the lea itself. */
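/* Illustrative example (hypothetical numbers): suppose the last non-AGU
   definition of an input register is dist_define = 2 away (assumed to be
   below LEA_MAX_STALL) and the lea result feeds a memory address
   dist_use = 4 away. With split_cost = 1 and IX86_LEA_PRIORITY = 0 the
   final test compares 2 + 1 + 0 >= 4, which fails, so the split
   sequence is preferred. */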
17810
17811 static bool
17812 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17813 unsigned int regno2, int split_cost, bool has_scale)
17814 {
17815 int dist_define, dist_use;
17816
17817 /* For Silvermont, the use of a 2-source or 3-source LEA is
17818 justified when it is needed for a non-destructive destination
17819 or for the ability to use a scaled index. */
17820 if (ix86_tune == PROCESSOR_SLM)
17821 {
17822 if (has_scale)
17823 return true;
17824 if (split_cost < 1)
17825 return false;
17826 if (regno0 == regno1 || regno0 == regno2)
17827 return false;
17828 return true;
17829 }
17830
17831 dist_define = distance_non_agu_define (regno1, regno2, insn);
17832 dist_use = distance_agu_use (regno0, insn);
17833
17834 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17835 {
17836 /* If there is no non-AGU operand definition, no AGU
17837 operand use and the split cost is 0, then both the lea
17838 and non-lea variants have the same priority. Currently
17839 we prefer lea for 64-bit code and non-lea for 32-bit
17840 code. */
17841 if (dist_use < 0 && split_cost == 0)
17842 return TARGET_64BIT || IX86_LEA_PRIORITY;
17843 else
17844 return true;
17845 }
17846
17847 /* With a longer definition distance, lea is preferable.
17848 Adjust the distance to take the splitting cost and the
17849 lea priority into account. */
17850 dist_define += split_cost + IX86_LEA_PRIORITY;
17851
17852 /* If there is no use in a memory address then we just check
17853 that the split cost exceeds the AGU stall. */
17854 if (dist_use < 0)
17855 return dist_define > LEA_MAX_STALL;
17856
17857 /* If this insn has both a backward non-AGU dependence and a forward
17858 AGU dependence, the one with the shorter distance takes effect. */
17859 return dist_define >= dist_use;
17860 }
17861
17862 /* Return true if it is legal to clobber flags by INSN and
17863 false otherwise. */
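/* The scan below walks forward from INSN to the end of its basic block:
   if a use of the flags register is seen first, clobbering is not OK;
   if a definition of the flags is seen first, it is OK; and if neither
   is found, it is OK only when the flags are not live out of the block. */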
17864
17865 static bool
17866 ix86_ok_to_clobber_flags (rtx insn)
17867 {
17868 basic_block bb = BLOCK_FOR_INSN (insn);
17869 df_ref *use;
17870 bitmap live;
17871
17872 while (insn)
17873 {
17874 if (NONDEBUG_INSN_P (insn))
17875 {
17876 for (use = DF_INSN_USES (insn); *use; use++)
17877 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17878 return false;
17879
17880 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17881 return true;
17882 }
17883
17884 if (insn == BB_END (bb))
17885 break;
17886
17887 insn = NEXT_INSN (insn);
17888 }
17889
17890 live = df_get_live_out(bb);
17891 return !REGNO_REG_SET_P (live, FLAGS_REG);
17892 }
17893
17894 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17895 move and add to avoid AGU stalls. */
17896
17897 bool
17898 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17899 {
17900 unsigned int regno0, regno1, regno2;
17901
17902 /* Check if we need to optimize. */
17903 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17904 return false;
17905
17906 /* Check it is correct to split here. */
17907 if (!ix86_ok_to_clobber_flags(insn))
17908 return false;
17909
17910 regno0 = true_regnum (operands[0]);
17911 regno1 = true_regnum (operands[1]);
17912 regno2 = true_regnum (operands[2]);
17913
17914 /* We only need to split adds with a non-destructive
17915 destination operand. */
17916 if (regno0 == regno1 || regno0 == regno2)
17917 return false;
17918 else
17919 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
17920 }
17921
17922 /* Return true if we should emit lea instruction instead of mov
17923 instruction. */
17924
17925 bool
17926 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17927 {
17928 unsigned int regno0, regno1;
17929
17930 /* Check if we need to optimize. */
17931 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17932 return false;
17933
17934 /* Use lea for reg to reg moves only. */
17935 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17936 return false;
17937
17938 regno0 = true_regnum (operands[0]);
17939 regno1 = true_regnum (operands[1]);
17940
17941 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
17942 }
17943
17944 /* Return true if we need to split lea into a sequence of
17945 instructions to avoid AGU stalls. */
17946
17947 bool
17948 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17949 {
17950 unsigned int regno0, regno1, regno2;
17951 int split_cost;
17952 struct ix86_address parts;
17953 int ok;
17954
17955 /* Check we need to optimize. */
17956 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17957 return false;
17958
17959 /* Check it is correct to split here. */
17960 if (!ix86_ok_to_clobber_flags(insn))
17961 return false;
17962
17963 ok = ix86_decompose_address (operands[1], &parts);
17964 gcc_assert (ok);
17965
17966 /* There should be at least two components in the address. */
17967 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17968 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17969 return false;
17970
17971 /* We should not split into an add if a non-legitimate PIC
17972 operand is used as the displacement. */
17973 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17974 return false;
17975
17976 regno0 = true_regnum (operands[0]);
17977 regno1 = INVALID_REGNUM;
17978 regno2 = INVALID_REGNUM;
17979
17980 if (parts.base)
17981 regno1 = true_regnum (parts.base);
17982 if (parts.index)
17983 regno2 = true_regnum (parts.index);
17984
17985 split_cost = 0;
17986
17987 /* Compute how many cycles we will add to the execution time
17988 if we split the lea into a sequence of instructions. */
17989 if (parts.base || parts.index)
17990 {
17991 /* Have to use a mov instruction if the non-destructive
17992 destination form is used. */
17993 if (regno1 != regno0 && regno2 != regno0)
17994 split_cost += 1;
17995
17996 /* Have to add index to base if both exist. */
17997 if (parts.base && parts.index)
17998 split_cost += 1;
17999
18000 /* Have to use shift and adds if scale is 2 or greater. */
18001 if (parts.scale > 1)
18002 {
18003 if (regno0 != regno1)
18004 split_cost += 1;
18005 else if (regno2 == regno0)
18006 split_cost += 4;
18007 else
18008 split_cost += parts.scale;
18009 }
18010
18011 /* Have to use an add instruction with an immediate if
18012 disp is nonzero. */
18013 if (parts.disp && parts.disp != const0_rtx)
18014 split_cost += 1;
18015
18016 /* Subtract the price of lea. */
18017 split_cost -= 1;
18018 }
18019
18020 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
18021 parts.scale > 1);
18022 }
18023
18024 /* Emit x86 binary operator CODE in mode MODE, where the first operand
18025 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
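/* Sketch of the emitted pattern (for illustration):
     (parallel [(set DST (CODE:MODE DST SRC))
                (clobber (reg:CC FLAGS_REG))])
   which is the shape the two-address arithmetic patterns in the machine
   description expect. */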
18026
18027 static void
18028 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
18029 rtx dst, rtx src)
18030 {
18031 rtx op, clob;
18032
18033 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
18034 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18035
18036 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
18037 }
18038
18039 /* Return true if, searching backwards from INSN within its basic block,
18040 a definition of regno1 is found before a definition of regno2. */
18040
18041 static bool
18042 find_nearest_reg_def (rtx insn, int regno1, int regno2)
18043 {
18044 rtx prev = insn;
18045 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
18046
18047 if (insn == start)
18048 return false;
18049 while (prev && prev != start)
18050 {
18051 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
18052 {
18053 prev = PREV_INSN (prev);
18054 continue;
18055 }
18056 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
18057 return true;
18058 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
18059 return false;
18060 prev = PREV_INSN (prev);
18061 }
18062
18063 /* None of the regs is defined in the bb. */
18064 return false;
18065 }
18066
18067 /* Split the lea instruction into a sequence of instructions
18068 which are executed on the ALU to avoid AGU stalls.
18069 It is assumed that it is allowed to clobber the flags register
18070 at the lea position. */
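/* Illustrative example (not from the original sources): an SImode

     lea 0x4(%ebx,%ecx,2), %eax

   may be replaced by a sequence along the lines of

     mov %ecx, %eax
     shl $1, %eax
     add %ebx, %eax
     add $0x4, %eax

   with the exact sequence depending on which operands already live in
   the destination register. */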
18071
18072 void
18073 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
18074 {
18075 unsigned int regno0, regno1, regno2;
18076 struct ix86_address parts;
18077 rtx target, tmp;
18078 int ok, adds;
18079
18080 ok = ix86_decompose_address (operands[1], &parts);
18081 gcc_assert (ok);
18082
18083 target = gen_lowpart (mode, operands[0]);
18084
18085 regno0 = true_regnum (target);
18086 regno1 = INVALID_REGNUM;
18087 regno2 = INVALID_REGNUM;
18088
18089 if (parts.base)
18090 {
18091 parts.base = gen_lowpart (mode, parts.base);
18092 regno1 = true_regnum (parts.base);
18093 }
18094
18095 if (parts.index)
18096 {
18097 parts.index = gen_lowpart (mode, parts.index);
18098 regno2 = true_regnum (parts.index);
18099 }
18100
18101 if (parts.disp)
18102 parts.disp = gen_lowpart (mode, parts.disp);
18103
18104 if (parts.scale > 1)
18105 {
18106 /* Case r1 = r1 + ... */
18107 if (regno1 == regno0)
18108 {
18109 /* If we have a case r1 = r1 + C * r1 then we
18110 would have to use multiplication, which is very
18111 expensive. Assume the cost model is wrong if such
18112 a case reaches here. */
18113 gcc_assert (regno2 != regno0);
18114
18115 for (adds = parts.scale; adds > 0; adds--)
18116 ix86_emit_binop (PLUS, mode, target, parts.index);
18117 }
18118 else
18119 {
18120 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
18121 if (regno0 != regno2)
18122 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
18123
18124 /* Use shift for scaling. */
18125 ix86_emit_binop (ASHIFT, mode, target,
18126 GEN_INT (exact_log2 (parts.scale)));
18127
18128 if (parts.base)
18129 ix86_emit_binop (PLUS, mode, target, parts.base);
18130
18131 if (parts.disp && parts.disp != const0_rtx)
18132 ix86_emit_binop (PLUS, mode, target, parts.disp);
18133 }
18134 }
18135 else if (!parts.base && !parts.index)
18136 {
18137 gcc_assert(parts.disp);
18138 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
18139 }
18140 else
18141 {
18142 if (!parts.base)
18143 {
18144 if (regno0 != regno2)
18145 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
18146 }
18147 else if (!parts.index)
18148 {
18149 if (regno0 != regno1)
18150 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
18151 }
18152 else
18153 {
18154 if (regno0 == regno1)
18155 tmp = parts.index;
18156 else if (regno0 == regno2)
18157 tmp = parts.base;
18158 else
18159 {
18160 rtx tmp1;
18161
18162 /* Find better operand for SET instruction, depending
18163 on which definition is farther from the insn. */
18164 if (find_nearest_reg_def (insn, regno1, regno2))
18165 tmp = parts.index, tmp1 = parts.base;
18166 else
18167 tmp = parts.base, tmp1 = parts.index;
18168
18169 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
18170
18171 if (parts.disp && parts.disp != const0_rtx)
18172 ix86_emit_binop (PLUS, mode, target, parts.disp);
18173
18174 ix86_emit_binop (PLUS, mode, target, tmp1);
18175 return;
18176 }
18177
18178 ix86_emit_binop (PLUS, mode, target, tmp);
18179 }
18180
18181 if (parts.disp && parts.disp != const0_rtx)
18182 ix86_emit_binop (PLUS, mode, target, parts.disp);
18183 }
18184 }
18185
18186 /* Return true if it is ok to optimize an ADD operation to an LEA
18187 operation to avoid flag register consumption. For most processors,
18188 ADD is faster than LEA. For processors like Atom, if the
18189 destination register of the LEA holds an actual address which will be
18190 used soon, LEA is better, and otherwise ADD is better. */
18191
18192 bool
18193 ix86_lea_for_add_ok (rtx insn, rtx operands[])
18194 {
18195 unsigned int regno0 = true_regnum (operands[0]);
18196 unsigned int regno1 = true_regnum (operands[1]);
18197 unsigned int regno2 = true_regnum (operands[2]);
18198
18199 /* If a = b + c with a != b and a != c, we must use the lea form. */
18200 if (regno0 != regno1 && regno0 != regno2)
18201 return true;
18202
18203 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
18204 return false;
18205
18206 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
18207 }
18208
18209 /* Return true if destination reg of SET_BODY is shift count of
18210 USE_BODY. */
18211
18212 static bool
18213 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
18214 {
18215 rtx set_dest;
18216 rtx shift_rtx;
18217 int i;
18218
18219 /* Retrieve destination of SET_BODY. */
18220 switch (GET_CODE (set_body))
18221 {
18222 case SET:
18223 set_dest = SET_DEST (set_body);
18224 if (!set_dest || !REG_P (set_dest))
18225 return false;
18226 break;
18227 case PARALLEL:
18228 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
18229 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
18230 use_body))
18231 return true;
18232 default:
18233 return false;
18234 break;
18235 }
18236
18237 /* Retrieve shift count of USE_BODY. */
18238 switch (GET_CODE (use_body))
18239 {
18240 case SET:
18241 shift_rtx = XEXP (use_body, 1);
18242 break;
18243 case PARALLEL:
18244 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
18245 if (ix86_dep_by_shift_count_body (set_body,
18246 XVECEXP (use_body, 0, i)))
18247 return true;
18248 default:
18249 return false;
18250 break;
18251 }
18252
18253 if (shift_rtx
18254 && (GET_CODE (shift_rtx) == ASHIFT
18255 || GET_CODE (shift_rtx) == LSHIFTRT
18256 || GET_CODE (shift_rtx) == ASHIFTRT
18257 || GET_CODE (shift_rtx) == ROTATE
18258 || GET_CODE (shift_rtx) == ROTATERT))
18259 {
18260 rtx shift_count = XEXP (shift_rtx, 1);
18261
18262 /* Return true if shift count is dest of SET_BODY. */
18263 if (REG_P (shift_count))
18264 {
18265 /* Add check since it can be invoked before register
18266 allocation in pre-reload schedule. */
18267 if (reload_completed
18268 && true_regnum (set_dest) == true_regnum (shift_count))
18269 return true;
18270 else if (REGNO(set_dest) == REGNO(shift_count))
18271 return true;
18272 }
18273 }
18274
18275 return false;
18276 }
18277
18278 /* Return true if destination reg of SET_INSN is shift count of
18279 USE_INSN. */
18280
18281 bool
18282 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
18283 {
18284 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
18285 PATTERN (use_insn));
18286 }
18287
18288 /* Return TRUE or FALSE depending on whether the unary operator meets the
18289 appropriate constraints. */
18290
18291 bool
18292 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
18293 enum machine_mode mode ATTRIBUTE_UNUSED,
18294 rtx operands[2])
18295 {
18296 /* If one of operands is memory, source and destination must match. */
18297 if ((MEM_P (operands[0])
18298 || MEM_P (operands[1]))
18299 && ! rtx_equal_p (operands[0], operands[1]))
18300 return false;
18301 return true;
18302 }
18303
18304 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
18305 are ok, keeping in mind the possible movddup alternative. */
18306
18307 bool
18308 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
18309 {
18310 if (MEM_P (operands[0]))
18311 return rtx_equal_p (operands[0], operands[1 + high]);
18312 if (MEM_P (operands[1]) && MEM_P (operands[2]))
18313 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
18314 return true;
18315 }
18316
18317 /* Post-reload splitter for converting an SF or DFmode value in an
18318 SSE register into an unsigned SImode. */
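/* Rough sketch of the expansion below: LARGE is turned into a per-lane
   mask of elements where 2**31 <= value, ZERO_OR_TWO31 then holds either
   0 or 2**31 per lane, subtracting it keeps every lane within signed
   range for the truncating conversion, and the final xor with LARGE
   shifted into the sign-bit position adds the 2**31 bit back into the
   integer result. */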
18319
18320 void
18321 ix86_split_convert_uns_si_sse (rtx operands[])
18322 {
18323 enum machine_mode vecmode;
18324 rtx value, large, zero_or_two31, input, two31, x;
18325
18326 large = operands[1];
18327 zero_or_two31 = operands[2];
18328 input = operands[3];
18329 two31 = operands[4];
18330 vecmode = GET_MODE (large);
18331 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
18332
18333 /* Load up the value into the low element. We must ensure that the other
18334 elements are valid floats -- zero is the easiest such value. */
18335 if (MEM_P (input))
18336 {
18337 if (vecmode == V4SFmode)
18338 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
18339 else
18340 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
18341 }
18342 else
18343 {
18344 input = gen_rtx_REG (vecmode, REGNO (input));
18345 emit_move_insn (value, CONST0_RTX (vecmode));
18346 if (vecmode == V4SFmode)
18347 emit_insn (gen_sse_movss (value, value, input));
18348 else
18349 emit_insn (gen_sse2_movsd (value, value, input));
18350 }
18351
18352 emit_move_insn (large, two31);
18353 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
18354
18355 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
18356 emit_insn (gen_rtx_SET (VOIDmode, large, x));
18357
18358 x = gen_rtx_AND (vecmode, zero_or_two31, large);
18359 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
18360
18361 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
18362 emit_insn (gen_rtx_SET (VOIDmode, value, x));
18363
18364 large = gen_rtx_REG (V4SImode, REGNO (large));
18365 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
18366
18367 x = gen_rtx_REG (V4SImode, REGNO (value));
18368 if (vecmode == V4SFmode)
18369 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
18370 else
18371 emit_insn (gen_sse2_cvttpd2dq (x, value));
18372 value = x;
18373
18374 emit_insn (gen_xorv4si3 (value, value, large));
18375 }
18376
18377 /* Convert an unsigned DImode value into a DFmode, using only SSE.
18378 Expects the 64-bit DImode to be supplied in a pair of integral
18379 registers. Requires SSE2; will use SSE3 if available. For x86_32,
18380 -mfpmath=sse, !optimize_size only. */
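/* Summary of the identity used below (informal), with lo and hi the two
   32-bit halves of the input:

     ((0x1.0p52 + (double) lo) - 0x1.0p52)
   + ((0x1.0p84 + (double) hi * 0x1.0p32) - 0x1.0p84)
   = (double) lo + (double) hi * 0x1.0p32

   so interleaving the halves with the 0x43300000/0x45300000 exponent
   words, subtracting the two biases and adding the halves yields the
   unsigned 64-bit value converted to double. */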
18381
18382 void
18383 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
18384 {
18385 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
18386 rtx int_xmm, fp_xmm;
18387 rtx biases, exponents;
18388 rtx x;
18389
18390 int_xmm = gen_reg_rtx (V4SImode);
18391 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
18392 emit_insn (gen_movdi_to_sse (int_xmm, input));
18393 else if (TARGET_SSE_SPLIT_REGS)
18394 {
18395 emit_clobber (int_xmm);
18396 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
18397 }
18398 else
18399 {
18400 x = gen_reg_rtx (V2DImode);
18401 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
18402 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
18403 }
18404
18405 x = gen_rtx_CONST_VECTOR (V4SImode,
18406 gen_rtvec (4, GEN_INT (0x43300000UL),
18407 GEN_INT (0x45300000UL),
18408 const0_rtx, const0_rtx));
18409 exponents = validize_mem (force_const_mem (V4SImode, x));
18410
18411 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
18412 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
18413
18414 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
18415 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
18416 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
18417 (0x1.0p84 + double(fp_value_hi_xmm)).
18418 Note these exponents differ by 32. */
18419
18420 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
18421
18422 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
18423 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
18424 real_ldexp (&bias_lo_rvt, &dconst1, 52);
18425 real_ldexp (&bias_hi_rvt, &dconst1, 84);
18426 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
18427 x = const_double_from_real_value (bias_hi_rvt, DFmode);
18428 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
18429 biases = validize_mem (force_const_mem (V2DFmode, biases));
18430 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
18431
18432 /* Add the upper and lower DFmode values together. */
18433 if (TARGET_SSE3)
18434 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
18435 else
18436 {
18437 x = copy_to_mode_reg (V2DFmode, fp_xmm);
18438 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
18439 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
18440 }
18441
18442 ix86_expand_vector_extract (false, target, fp_xmm, 0);
18443 }
18444
18445 /* Not used, but eases macroization of patterns. */
18446 void
18447 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
18448 rtx input ATTRIBUTE_UNUSED)
18449 {
18450 gcc_unreachable ();
18451 }
18452
18453 /* Convert an unsigned SImode value into a DFmode. Only currently used
18454 for SSE, but applicable anywhere. */
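/* Sketch of the approach used below: bias the unsigned input into the
   signed range by adding -2**31, convert with the ordinary signed
   SImode->DFmode conversion, and then add 2**31 back as a DFmode
   constant; the final addition is exact because DFmode has well over
   32 bits of precision. */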
18455
18456 void
18457 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
18458 {
18459 REAL_VALUE_TYPE TWO31r;
18460 rtx x, fp;
18461
18462 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
18463 NULL, 1, OPTAB_DIRECT);
18464
18465 fp = gen_reg_rtx (DFmode);
18466 emit_insn (gen_floatsidf2 (fp, x));
18467
18468 real_ldexp (&TWO31r, &dconst1, 31);
18469 x = const_double_from_real_value (TWO31r, DFmode);
18470
18471 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
18472 if (x != target)
18473 emit_move_insn (target, x);
18474 }
18475
18476 /* Convert a signed DImode value into a DFmode. Only used for SSE in
18477 32-bit mode; otherwise we have a direct convert instruction. */
18478
18479 void
18480 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
18481 {
18482 REAL_VALUE_TYPE TWO32r;
18483 rtx fp_lo, fp_hi, x;
18484
18485 fp_lo = gen_reg_rtx (DFmode);
18486 fp_hi = gen_reg_rtx (DFmode);
18487
18488 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
18489
18490 real_ldexp (&TWO32r, &dconst1, 32);
18491 x = const_double_from_real_value (TWO32r, DFmode);
18492 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
18493
18494 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
18495
18496 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
18497 0, OPTAB_DIRECT);
18498 if (x != target)
18499 emit_move_insn (target, x);
18500 }
18501
18502 /* Convert an unsigned SImode value into a SFmode, using only SSE.
18503 For x86_32, -mfpmath=sse, !optimize_size only. */
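/* Sketch: the input is split as input = (input >> 16) * 0x1p16
   + (input & 0xffff); each 16-bit half converts exactly with the signed
   converter, and the halves are recombined with one multiply and one
   add in SFmode. */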
18504 void
18505 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
18506 {
18507 REAL_VALUE_TYPE ONE16r;
18508 rtx fp_hi, fp_lo, int_hi, int_lo, x;
18509
18510 real_ldexp (&ONE16r, &dconst1, 16);
18511 x = const_double_from_real_value (ONE16r, SFmode);
18512 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
18513 NULL, 0, OPTAB_DIRECT);
18514 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
18515 NULL, 0, OPTAB_DIRECT);
18516 fp_hi = gen_reg_rtx (SFmode);
18517 fp_lo = gen_reg_rtx (SFmode);
18518 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
18519 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
18520 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
18521 0, OPTAB_DIRECT);
18522 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
18523 0, OPTAB_DIRECT);
18524 if (!rtx_equal_p (target, fp_hi))
18525 emit_move_insn (target, fp_hi);
18526 }
18527
18528 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
18529 a vector of unsigned ints VAL to vector of floats TARGET. */
18530
18531 void
18532 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
18533 {
18534 rtx tmp[8];
18535 REAL_VALUE_TYPE TWO16r;
18536 enum machine_mode intmode = GET_MODE (val);
18537 enum machine_mode fltmode = GET_MODE (target);
18538 rtx (*cvt) (rtx, rtx);
18539
18540 if (intmode == V4SImode)
18541 cvt = gen_floatv4siv4sf2;
18542 else
18543 cvt = gen_floatv8siv8sf2;
18544 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
18545 tmp[0] = force_reg (intmode, tmp[0]);
18546 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
18547 OPTAB_DIRECT);
18548 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
18549 NULL_RTX, 1, OPTAB_DIRECT);
18550 tmp[3] = gen_reg_rtx (fltmode);
18551 emit_insn (cvt (tmp[3], tmp[1]));
18552 tmp[4] = gen_reg_rtx (fltmode);
18553 emit_insn (cvt (tmp[4], tmp[2]));
18554 real_ldexp (&TWO16r, &dconst1, 16);
18555 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
18556 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
18557 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
18558 OPTAB_DIRECT);
18559 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
18560 OPTAB_DIRECT);
18561 if (tmp[7] != target)
18562 emit_move_insn (target, tmp[7]);
18563 }
18564
18565 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
18566 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
18567 This is done by doing just signed conversion if < 0x1p31, and otherwise by
18568 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
18569
18570 rtx
18571 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
18572 {
18573 REAL_VALUE_TYPE TWO31r;
18574 rtx two31r, tmp[4];
18575 enum machine_mode mode = GET_MODE (val);
18576 enum machine_mode scalarmode = GET_MODE_INNER (mode);
18577 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
18578 rtx (*cmp) (rtx, rtx, rtx, rtx);
18579 int i;
18580
18581 for (i = 0; i < 3; i++)
18582 tmp[i] = gen_reg_rtx (mode);
18583 real_ldexp (&TWO31r, &dconst1, 31);
18584 two31r = const_double_from_real_value (TWO31r, scalarmode);
18585 two31r = ix86_build_const_vector (mode, 1, two31r);
18586 two31r = force_reg (mode, two31r);
18587 switch (mode)
18588 {
18589 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
18590 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
18591 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
18592 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
18593 default: gcc_unreachable ();
18594 }
18595 tmp[3] = gen_rtx_LE (mode, two31r, val);
18596 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
18597 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
18598 0, OPTAB_DIRECT);
18599 if (intmode == V4SImode || TARGET_AVX2)
18600 *xorp = expand_simple_binop (intmode, ASHIFT,
18601 gen_lowpart (intmode, tmp[0]),
18602 GEN_INT (31), NULL_RTX, 0,
18603 OPTAB_DIRECT);
18604 else
18605 {
18606 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
18607 two31 = ix86_build_const_vector (intmode, 1, two31);
18608 *xorp = expand_simple_binop (intmode, AND,
18609 gen_lowpart (intmode, tmp[0]),
18610 two31, NULL_RTX, 0,
18611 OPTAB_DIRECT);
18612 }
18613 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
18614 0, OPTAB_DIRECT);
18615 }
18616
18617 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
18618 then replicate the value for all elements of the vector
18619 register. */
18620
18621 rtx
18622 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
18623 {
18624 int i, n_elt;
18625 rtvec v;
18626 enum machine_mode scalar_mode;
18627
18628 switch (mode)
18629 {
18630 case V32QImode:
18631 case V16QImode:
18632 case V16HImode:
18633 case V8HImode:
18634 case V8SImode:
18635 case V4SImode:
18636 case V4DImode:
18637 case V2DImode:
18638 gcc_assert (vect);
18639 case V8SFmode:
18640 case V4SFmode:
18641 case V4DFmode:
18642 case V2DFmode:
18643 n_elt = GET_MODE_NUNITS (mode);
18644 v = rtvec_alloc (n_elt);
18645 scalar_mode = GET_MODE_INNER (mode);
18646
18647 RTVEC_ELT (v, 0) = value;
18648
18649 for (i = 1; i < n_elt; ++i)
18650 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18651
18652 return gen_rtx_CONST_VECTOR (mode, v);
18653
18654 default:
18655 gcc_unreachable ();
18656 }
18657 }
18658
18659 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18660 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18661 for an SSE register. If VECT is true, then replicate the mask for
18662 all elements of the vector register. If INVERT is true, then create
18663 a mask excluding the sign bit. */
18664
18665 rtx
18666 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18667 {
18668 enum machine_mode vec_mode, imode;
18669 HOST_WIDE_INT hi, lo;
18670 int shift = 63;
18671 rtx v;
18672 rtx mask;
18673
18674 /* Find the sign bit, sign extended to 2*HWI. */
18675 switch (mode)
18676 {
18677 case V8SImode:
18678 case V4SImode:
18679 case V8SFmode:
18680 case V4SFmode:
18681 vec_mode = mode;
18682 mode = GET_MODE_INNER (mode);
18683 imode = SImode;
18684 lo = 0x80000000, hi = lo < 0;
18685 break;
18686
18687 case V4DImode:
18688 case V2DImode:
18689 case V4DFmode:
18690 case V2DFmode:
18691 vec_mode = mode;
18692 mode = GET_MODE_INNER (mode);
18693 imode = DImode;
18694 if (HOST_BITS_PER_WIDE_INT >= 64)
18695 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18696 else
18697 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18698 break;
18699
18700 case TImode:
18701 case TFmode:
18702 vec_mode = VOIDmode;
18703 if (HOST_BITS_PER_WIDE_INT >= 64)
18704 {
18705 imode = TImode;
18706 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18707 }
18708 else
18709 {
18710 rtvec vec;
18711
18712 imode = DImode;
18713 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18714
18715 if (invert)
18716 {
18717 lo = ~lo, hi = ~hi;
18718 v = constm1_rtx;
18719 }
18720 else
18721 v = const0_rtx;
18722
18723 mask = immed_double_const (lo, hi, imode);
18724
18725 vec = gen_rtvec (2, v, mask);
18726 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18727 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18728
18729 return v;
18730 }
18731 break;
18732
18733 default:
18734 gcc_unreachable ();
18735 }
18736
18737 if (invert)
18738 lo = ~lo, hi = ~hi;
18739
18740 /* Force this value into the low part of a fp vector constant. */
18741 mask = immed_double_const (lo, hi, imode);
18742 mask = gen_lowpart (mode, mask);
18743
18744 if (vec_mode == VOIDmode)
18745 return force_reg (mode, mask);
18746
18747 v = ix86_build_const_vector (vec_mode, vect, mask);
18748 return force_reg (vec_mode, v);
18749 }
18750
18751 /* Generate code for floating point ABS or NEG. */
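/* When SSE is used, the operation is carried out with a bitwise mask:
   NEG flips the sign bit and ABS clears it, so ix86_build_signbit_mask
   is asked for the inverted mask (all bits except the sign bit) when
   CODE is ABS. The mask is attached to the emitted insn as a USE; the
   actual and/xor is presumably produced when the pattern is split in
   the machine description. */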
18752
18753 void
18754 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18755 rtx operands[])
18756 {
18757 rtx mask, set, dst, src;
18758 bool use_sse = false;
18759 bool vector_mode = VECTOR_MODE_P (mode);
18760 enum machine_mode vmode = mode;
18761
18762 if (vector_mode)
18763 use_sse = true;
18764 else if (mode == TFmode)
18765 use_sse = true;
18766 else if (TARGET_SSE_MATH)
18767 {
18768 use_sse = SSE_FLOAT_MODE_P (mode);
18769 if (mode == SFmode)
18770 vmode = V4SFmode;
18771 else if (mode == DFmode)
18772 vmode = V2DFmode;
18773 }
18774
18775 /* NEG and ABS performed with SSE use bitwise mask operations.
18776 Create the appropriate mask now. */
18777 if (use_sse)
18778 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18779 else
18780 mask = NULL_RTX;
18781
18782 dst = operands[0];
18783 src = operands[1];
18784
18785 set = gen_rtx_fmt_e (code, mode, src);
18786 set = gen_rtx_SET (VOIDmode, dst, set);
18787
18788 if (mask)
18789 {
18790 rtx use, clob;
18791 rtvec par;
18792
18793 use = gen_rtx_USE (VOIDmode, mask);
18794 if (vector_mode)
18795 par = gen_rtvec (2, set, use);
18796 else
18797 {
18798 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18799 par = gen_rtvec (3, set, use, clob);
18800 }
18801 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18802 }
18803 else
18804 emit_insn (set);
18805 }
18806
18807 /* Expand a copysign operation. Special case operand 0 being a constant. */
18808
18809 void
18810 ix86_expand_copysign (rtx operands[])
18811 {
18812 enum machine_mode mode, vmode;
18813 rtx dest, op0, op1, mask, nmask;
18814
18815 dest = operands[0];
18816 op0 = operands[1];
18817 op1 = operands[2];
18818
18819 mode = GET_MODE (dest);
18820
18821 if (mode == SFmode)
18822 vmode = V4SFmode;
18823 else if (mode == DFmode)
18824 vmode = V2DFmode;
18825 else
18826 vmode = mode;
18827
18828 if (GET_CODE (op0) == CONST_DOUBLE)
18829 {
18830 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18831
18832 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18833 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18834
18835 if (mode == SFmode || mode == DFmode)
18836 {
18837 if (op0 == CONST0_RTX (mode))
18838 op0 = CONST0_RTX (vmode);
18839 else
18840 {
18841 rtx v = ix86_build_const_vector (vmode, false, op0);
18842
18843 op0 = force_reg (vmode, v);
18844 }
18845 }
18846 else if (op0 != CONST0_RTX (mode))
18847 op0 = force_reg (mode, op0);
18848
18849 mask = ix86_build_signbit_mask (vmode, 0, 0);
18850
18851 if (mode == SFmode)
18852 copysign_insn = gen_copysignsf3_const;
18853 else if (mode == DFmode)
18854 copysign_insn = gen_copysigndf3_const;
18855 else
18856 copysign_insn = gen_copysigntf3_const;
18857
18858 emit_insn (copysign_insn (dest, op0, op1, mask));
18859 }
18860 else
18861 {
18862 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18863
18864 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18865 mask = ix86_build_signbit_mask (vmode, 0, 0);
18866
18867 if (mode == SFmode)
18868 copysign_insn = gen_copysignsf3_var;
18869 else if (mode == DFmode)
18870 copysign_insn = gen_copysigndf3_var;
18871 else
18872 copysign_insn = gen_copysigntf3_var;
18873
18874 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18875 }
18876 }
18877
18878 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18879 be a constant, and so has already been expanded into a vector constant. */
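/* In effect the code below computes dest = (dest & mask) | op0: the AND
   keeps only the sign bit of DEST (MASK is the non-inverted sign-bit
   mask built in ix86_expand_copysign), and the IOR merges in the
   constant magnitude OP0, which is either zero or the absolute-value
   vector constant built above. */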
18880
18881 void
18882 ix86_split_copysign_const (rtx operands[])
18883 {
18884 enum machine_mode mode, vmode;
18885 rtx dest, op0, mask, x;
18886
18887 dest = operands[0];
18888 op0 = operands[1];
18889 mask = operands[3];
18890
18891 mode = GET_MODE (dest);
18892 vmode = GET_MODE (mask);
18893
18894 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18895 x = gen_rtx_AND (vmode, dest, mask);
18896 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18897
18898 if (op0 != CONST0_RTX (vmode))
18899 {
18900 x = gen_rtx_IOR (vmode, dest, op0);
18901 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18902 }
18903 }
18904
18905 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18906 so we have to do two masks. */
18907
18908 void
18909 ix86_split_copysign_var (rtx operands[])
18910 {
18911 enum machine_mode mode, vmode;
18912 rtx dest, scratch, op0, op1, mask, nmask, x;
18913
18914 dest = operands[0];
18915 scratch = operands[1];
18916 op0 = operands[2];
18917 op1 = operands[3];
18918 nmask = operands[4];
18919 mask = operands[5];
18920
18921 mode = GET_MODE (dest);
18922 vmode = GET_MODE (mask);
18923
18924 if (rtx_equal_p (op0, op1))
18925 {
18926 /* Shouldn't happen often (it's useless, obviously), but when it does
18927 we'd generate incorrect code if we continue below. */
18928 emit_move_insn (dest, op0);
18929 return;
18930 }
18931
18932 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18933 {
18934 gcc_assert (REGNO (op1) == REGNO (scratch));
18935
18936 x = gen_rtx_AND (vmode, scratch, mask);
18937 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18938
18939 dest = mask;
18940 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18941 x = gen_rtx_NOT (vmode, dest);
18942 x = gen_rtx_AND (vmode, x, op0);
18943 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18944 }
18945 else
18946 {
18947 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18948 {
18949 x = gen_rtx_AND (vmode, scratch, mask);
18950 }
18951 else /* alternative 2,4 */
18952 {
18953 gcc_assert (REGNO (mask) == REGNO (scratch));
18954 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18955 x = gen_rtx_AND (vmode, scratch, op1);
18956 }
18957 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18958
18959 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18960 {
18961 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18962 x = gen_rtx_AND (vmode, dest, nmask);
18963 }
18964 else /* alternative 3,4 */
18965 {
18966 gcc_assert (REGNO (nmask) == REGNO (dest));
18967 dest = nmask;
18968 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18969 x = gen_rtx_AND (vmode, dest, op0);
18970 }
18971 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18972 }
18973
18974 x = gen_rtx_IOR (vmode, dest, scratch);
18975 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18976 }
18977
18978 /* Return TRUE or FALSE depending on whether the first SET in INSN
18979 has source and destination with matching CC modes, and that the
18980 CC mode is at least as constrained as REQ_MODE. */
18981
18982 bool
18983 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18984 {
18985 rtx set;
18986 enum machine_mode set_mode;
18987
18988 set = PATTERN (insn);
18989 if (GET_CODE (set) == PARALLEL)
18990 set = XVECEXP (set, 0, 0);
18991 gcc_assert (GET_CODE (set) == SET);
18992 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18993
18994 set_mode = GET_MODE (SET_DEST (set));
18995 switch (set_mode)
18996 {
18997 case CCNOmode:
18998 if (req_mode != CCNOmode
18999 && (req_mode != CCmode
19000 || XEXP (SET_SRC (set), 1) != const0_rtx))
19001 return false;
19002 break;
19003 case CCmode:
19004 if (req_mode == CCGCmode)
19005 return false;
19006 /* FALLTHRU */
19007 case CCGCmode:
19008 if (req_mode == CCGOCmode || req_mode == CCNOmode)
19009 return false;
19010 /* FALLTHRU */
19011 case CCGOCmode:
19012 if (req_mode == CCZmode)
19013 return false;
19014 /* FALLTHRU */
19015 case CCZmode:
19016 break;
19017
19018 case CCAmode:
19019 case CCCmode:
19020 case CCOmode:
19021 case CCSmode:
19022 if (set_mode != req_mode)
19023 return false;
19024 break;
19025
19026 default:
19027 gcc_unreachable ();
19028 }
19029
19030 return GET_MODE (SET_SRC (set)) == set_mode;
19031 }
19032
19033 /* Generate insn patterns to do an integer compare of OPERANDS. */
19034
19035 static rtx
19036 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
19037 {
19038 enum machine_mode cmpmode;
19039 rtx tmp, flags;
19040
19041 cmpmode = SELECT_CC_MODE (code, op0, op1);
19042 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
19043
19044 /* This is very simple, but making the interface the same as in the
19045 FP case makes the rest of the code easier. */
19046 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
19047 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
19048
19049 /* Return the test that should be put into the flags user, i.e.
19050 the bcc, scc, or cmov instruction. */
19051 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
19052 }
19053
19054 /* Figure out whether to use ordered or unordered fp comparisons.
19055 Return the appropriate mode to use. */
19056
19057 enum machine_mode
19058 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
19059 {
19060 /* ??? In order to make all comparisons reversible, we do all comparisons
19061 non-trapping when compiling for IEEE. Once gcc is able to distinguish
19062 all forms of trapping and nontrapping comparisons, we can make inequality
19063 comparisons trapping again, since that results in better code when using
19064 FCOM based compares. */
19065 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
19066 }
19067
19068 enum machine_mode
19069 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
19070 {
19071 enum machine_mode mode = GET_MODE (op0);
19072
19073 if (SCALAR_FLOAT_MODE_P (mode))
19074 {
19075 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19076 return ix86_fp_compare_mode (code);
19077 }
19078
19079 switch (code)
19080 {
19081 /* Only zero flag is needed. */
19082 case EQ: /* ZF=0 */
19083 case NE: /* ZF!=0 */
19084 return CCZmode;
19085 /* Codes needing carry flag. */
19086 case GEU: /* CF=0 */
19087 case LTU: /* CF=1 */
19088 /* Detect overflow checks. They need just the carry flag. */
19089 if (GET_CODE (op0) == PLUS
19090 && rtx_equal_p (op1, XEXP (op0, 0)))
19091 return CCCmode;
19092 else
19093 return CCmode;
19094 case GTU: /* CF=0 & ZF=0 */
19095 case LEU: /* CF=1 | ZF=1 */
19096 return CCmode;
19097 /* Codes possibly doable only with sign flag when
19098 comparing against zero. */
19099 case GE: /* SF=OF or SF=0 */
19100 case LT: /* SF<>OF or SF=1 */
19101 if (op1 == const0_rtx)
19102 return CCGOCmode;
19103 else
19104 /* For other cases the carry flag is not required. */
19105 return CCGCmode;
19106 /* Codes doable only with the sign flag when comparing
19107 against zero, but we lack a jump instruction for them,
19108 so we need to use relational tests against overflow,
19109 which thus needs to be zero. */
19110 case GT: /* ZF=0 & SF=OF */
19111 case LE: /* ZF=1 | SF<>OF */
19112 if (op1 == const0_rtx)
19113 return CCNOmode;
19114 else
19115 return CCGCmode;
19116 /* The strcmp pattern does (use flags) and combine may ask us for the
19117 proper mode. */
19118 case USE:
19119 return CCmode;
19120 default:
19121 gcc_unreachable ();
19122 }
19123 }
19124
19125 /* Return the fixed registers used for condition codes. */
19126
19127 static bool
19128 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
19129 {
19130 *p1 = FLAGS_REG;
19131 *p2 = FPSR_REG;
19132 return true;
19133 }
19134
19135 /* If two condition code modes are compatible, return a condition code
19136 mode which is compatible with both. Otherwise, return
19137 VOIDmode. */
19138
19139 static enum machine_mode
19140 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
19141 {
19142 if (m1 == m2)
19143 return m1;
19144
19145 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
19146 return VOIDmode;
19147
19148 if ((m1 == CCGCmode && m2 == CCGOCmode)
19149 || (m1 == CCGOCmode && m2 == CCGCmode))
19150 return CCGCmode;
19151
19152 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
19153 return m2;
19154 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
19155 return m1;
19156
19157 switch (m1)
19158 {
19159 default:
19160 gcc_unreachable ();
19161
19162 case CCmode:
19163 case CCGCmode:
19164 case CCGOCmode:
19165 case CCNOmode:
19166 case CCAmode:
19167 case CCCmode:
19168 case CCOmode:
19169 case CCSmode:
19170 case CCZmode:
19171 switch (m2)
19172 {
19173 default:
19174 return VOIDmode;
19175
19176 case CCmode:
19177 case CCGCmode:
19178 case CCGOCmode:
19179 case CCNOmode:
19180 case CCAmode:
19181 case CCCmode:
19182 case CCOmode:
19183 case CCSmode:
19184 case CCZmode:
19185 return CCmode;
19186 }
19187
19188 case CCFPmode:
19189 case CCFPUmode:
19190 /* These are only compatible with themselves, which we already
19191 checked above. */
19192 return VOIDmode;
19193 }
19194 }
19195
19196
19197 /* Return a comparison we can do that is equivalent to
19198 swap_condition (code), apart possibly from orderedness.
19199 However, never change orderedness if TARGET_IEEE_FP, returning
19200 UNKNOWN in that case if necessary. */
19201
19202 static enum rtx_code
19203 ix86_fp_swap_condition (enum rtx_code code)
19204 {
19205 switch (code)
19206 {
19207 case GT: /* GTU - CF=0 & ZF=0 */
19208 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
19209 case GE: /* GEU - CF=0 */
19210 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
19211 case UNLT: /* LTU - CF=1 */
19212 return TARGET_IEEE_FP ? UNKNOWN : GT;
19213 case UNLE: /* LEU - CF=1 | ZF=1 */
19214 return TARGET_IEEE_FP ? UNKNOWN : GE;
19215 default:
19216 return swap_condition (code);
19217 }
19218 }
19219
19220 /* Return the cost of comparison CODE using the best strategy for performance.
19221 All of the following functions use the number of instructions as a cost metric.
19222 In the future this should be tweaked to compute bytes for optimize_size and
19223 take into account the performance of various instructions on various CPUs. */
19224
19225 static int
19226 ix86_fp_comparison_cost (enum rtx_code code)
19227 {
19228 int arith_cost;
19229
19230 /* The cost of code using bit-twiddling on %ah. */
19231 switch (code)
19232 {
19233 case UNLE:
19234 case UNLT:
19235 case LTGT:
19236 case GT:
19237 case GE:
19238 case UNORDERED:
19239 case ORDERED:
19240 case UNEQ:
19241 arith_cost = 4;
19242 break;
19243 case LT:
19244 case NE:
19245 case EQ:
19246 case UNGE:
19247 arith_cost = TARGET_IEEE_FP ? 5 : 4;
19248 break;
19249 case LE:
19250 case UNGT:
19251 arith_cost = TARGET_IEEE_FP ? 6 : 4;
19252 break;
19253 default:
19254 gcc_unreachable ();
19255 }
19256
19257 switch (ix86_fp_comparison_strategy (code))
19258 {
19259 case IX86_FPCMP_COMI:
19260 return arith_cost > 4 ? 3 : 2;
19261 case IX86_FPCMP_SAHF:
19262 return arith_cost > 4 ? 4 : 3;
19263 default:
19264 return arith_cost;
19265 }
19266 }
19267
19268 /* Return the strategy to use for floating-point comparisons. We assume that
19269 fcomi is always preferable where available, since that is also true when
19270 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
19271
19272 enum ix86_fpcmp_strategy
19273 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
19274 {
19275 /* Do fcomi/sahf based test when profitable. */
19276
19277 if (TARGET_CMOVE)
19278 return IX86_FPCMP_COMI;
19279
19280 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
19281 return IX86_FPCMP_SAHF;
19282
19283 return IX86_FPCMP_ARITH;
19284 }
19285
19286 /* Swap, force into registers, or otherwise massage the two operands
19287 to a fp comparison. The operands are updated in place; the new
19288 comparison code is returned. */
19289
19290 static enum rtx_code
19291 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
19292 {
19293 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
19294 rtx op0 = *pop0, op1 = *pop1;
19295 enum machine_mode op_mode = GET_MODE (op0);
19296 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
19297
19298 /* All of the unordered compare instructions only work on registers.
19299 The same is true of the fcomi compare instructions. The XFmode
19300 compare instructions require registers except when comparing
19301 against zero or when converting operand 1 from fixed point to
19302 floating point. */
19303
19304 if (!is_sse
19305 && (fpcmp_mode == CCFPUmode
19306 || (op_mode == XFmode
19307 && ! (standard_80387_constant_p (op0) == 1
19308 || standard_80387_constant_p (op1) == 1)
19309 && GET_CODE (op1) != FLOAT)
19310 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
19311 {
19312 op0 = force_reg (op_mode, op0);
19313 op1 = force_reg (op_mode, op1);
19314 }
19315 else
19316 {
19317 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
19318 things around if they appear profitable, otherwise force op0
19319 into a register. */
19320
19321 if (standard_80387_constant_p (op0) == 0
19322 || (MEM_P (op0)
19323 && ! (standard_80387_constant_p (op1) == 0
19324 || MEM_P (op1))))
19325 {
19326 enum rtx_code new_code = ix86_fp_swap_condition (code);
19327 if (new_code != UNKNOWN)
19328 {
19329 rtx tmp;
19330 tmp = op0, op0 = op1, op1 = tmp;
19331 code = new_code;
19332 }
19333 }
19334
19335 if (!REG_P (op0))
19336 op0 = force_reg (op_mode, op0);
19337
19338 if (CONSTANT_P (op1))
19339 {
19340 int tmp = standard_80387_constant_p (op1);
19341 if (tmp == 0)
19342 op1 = validize_mem (force_const_mem (op_mode, op1));
19343 else if (tmp == 1)
19344 {
19345 if (TARGET_CMOVE)
19346 op1 = force_reg (op_mode, op1);
19347 }
19348 else
19349 op1 = force_reg (op_mode, op1);
19350 }
19351 }
19352
19353 /* Try to rearrange the comparison to make it cheaper. */
19354 if (ix86_fp_comparison_cost (code)
19355 > ix86_fp_comparison_cost (swap_condition (code))
19356 && (REG_P (op1) || can_create_pseudo_p ()))
19357 {
19358 rtx tmp;
19359 tmp = op0, op0 = op1, op1 = tmp;
19360 code = swap_condition (code);
19361 if (!REG_P (op0))
19362 op0 = force_reg (op_mode, op0);
19363 }
19364
19365 *pop0 = op0;
19366 *pop1 = op1;
19367 return code;
19368 }
19369
19370 /* Convert comparison codes we use to represent FP comparison to integer
19371 code that will result in proper branch. Return UNKNOWN if no such code
19372 is available. */
19373
19374 enum rtx_code
19375 ix86_fp_compare_code_to_integer (enum rtx_code code)
19376 {
19377 switch (code)
19378 {
19379 case GT:
19380 return GTU;
19381 case GE:
19382 return GEU;
19383 case ORDERED:
19384 case UNORDERED:
19385 return code;
19386 break;
19387 case UNEQ:
19388 return EQ;
19389 break;
19390 case UNLT:
19391 return LTU;
19392 break;
19393 case UNLE:
19394 return LEU;
19395 break;
19396 case LTGT:
19397 return NE;
19398 break;
19399 default:
19400 return UNKNOWN;
19401 }
19402 }
19403
19404 /* Generate insn patterns to do a floating point compare of OPERANDS. */
19405
19406 static rtx
19407 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
19408 {
19409 enum machine_mode fpcmp_mode, intcmp_mode;
19410 rtx tmp, tmp2;
19411
19412 fpcmp_mode = ix86_fp_compare_mode (code);
19413 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
19414
19415 /* Do fcomi/sahf based test when profitable. */
19416 switch (ix86_fp_comparison_strategy (code))
19417 {
19418 case IX86_FPCMP_COMI:
19419 intcmp_mode = fpcmp_mode;
19420 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19421 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19422 tmp);
19423 emit_insn (tmp);
19424 break;
19425
19426 case IX86_FPCMP_SAHF:
19427 intcmp_mode = fpcmp_mode;
19428 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19429 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19430 tmp);
19431
19432 if (!scratch)
19433 scratch = gen_reg_rtx (HImode);
19434 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
19435 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
19436 break;
19437
19438 case IX86_FPCMP_ARITH:
19439 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
19440 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19441 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
19442 if (!scratch)
19443 scratch = gen_reg_rtx (HImode);
19444 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
19445
19446 /* In the unordered case, we have to check C2 for NaN's, which
19447 doesn't happen to work out to anything nice combination-wise.
19448 So do some bit twiddling on the value we've got in AH to come
19449 up with an appropriate set of condition codes. */
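/* For reference (not in the original comments): after fnstsw, AH holds
   the FPU status flags with C0 in bit 0 (0x01), C2 in bit 2 (0x04) and
   C3 in bit 6 (0x40); 0x45 therefore tests C3|C2|C0 at once, which is
   why those constants appear in the tests below. */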
19450
19451 intcmp_mode = CCNOmode;
19452 switch (code)
19453 {
19454 case GT:
19455 case UNGT:
19456 if (code == GT || !TARGET_IEEE_FP)
19457 {
19458 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19459 code = EQ;
19460 }
19461 else
19462 {
19463 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19464 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19465 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
19466 intcmp_mode = CCmode;
19467 code = GEU;
19468 }
19469 break;
19470 case LT:
19471 case UNLT:
19472 if (code == LT && TARGET_IEEE_FP)
19473 {
19474 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19475 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
19476 intcmp_mode = CCmode;
19477 code = EQ;
19478 }
19479 else
19480 {
19481 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
19482 code = NE;
19483 }
19484 break;
19485 case GE:
19486 case UNGE:
19487 if (code == GE || !TARGET_IEEE_FP)
19488 {
19489 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
19490 code = EQ;
19491 }
19492 else
19493 {
19494 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19495 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
19496 code = NE;
19497 }
19498 break;
19499 case LE:
19500 case UNLE:
19501 if (code == LE && TARGET_IEEE_FP)
19502 {
19503 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19504 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19505 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19506 intcmp_mode = CCmode;
19507 code = LTU;
19508 }
19509 else
19510 {
19511 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19512 code = NE;
19513 }
19514 break;
19515 case EQ:
19516 case UNEQ:
19517 if (code == EQ && TARGET_IEEE_FP)
19518 {
19519 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19520 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19521 intcmp_mode = CCmode;
19522 code = EQ;
19523 }
19524 else
19525 {
19526 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19527 code = NE;
19528 }
19529 break;
19530 case NE:
19531 case LTGT:
19532 if (code == NE && TARGET_IEEE_FP)
19533 {
19534 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19535 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
19536 GEN_INT (0x40)));
19537 code = NE;
19538 }
19539 else
19540 {
19541 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19542 code = EQ;
19543 }
19544 break;
19545
19546 case UNORDERED:
19547 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19548 code = NE;
19549 break;
19550 case ORDERED:
19551 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19552 code = EQ;
19553 break;
19554
19555 default:
19556 gcc_unreachable ();
19557 }
19558 break;
19559
19560 default:
19561 gcc_unreachable();
19562 }
19563
19564 /* Return the test that should be put into the flags user, i.e.
19565 the bcc, scc, or cmov instruction. */
19566 return gen_rtx_fmt_ee (code, VOIDmode,
19567 gen_rtx_REG (intcmp_mode, FLAGS_REG),
19568 const0_rtx);
19569 }
19570
19571 static rtx
19572 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
19573 {
19574 rtx ret;
19575
19576 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
19577 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
19578
19579 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
19580 {
19581 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
19582 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19583 }
19584 else
19585 ret = ix86_expand_int_compare (code, op0, op1);
19586
19587 return ret;
19588 }
19589
19590 void
19591 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
19592 {
19593 enum machine_mode mode = GET_MODE (op0);
19594 rtx tmp;
19595
19596 switch (mode)
19597 {
19598 case SFmode:
19599 case DFmode:
19600 case XFmode:
19601 case QImode:
19602 case HImode:
19603 case SImode:
19604 simple:
19605 tmp = ix86_expand_compare (code, op0, op1);
19606 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
19607 gen_rtx_LABEL_REF (VOIDmode, label),
19608 pc_rtx);
19609 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
19610 return;
19611
19612 case DImode:
19613 if (TARGET_64BIT)
19614 goto simple;
19615 case TImode:
19616 /* Expand DImode branch into multiple compare+branch. */
19617 {
19618 rtx lo[2], hi[2], label2;
19619 enum rtx_code code1, code2, code3;
19620 enum machine_mode submode;
19621
19622 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
19623 {
19624 tmp = op0, op0 = op1, op1 = tmp;
19625 code = swap_condition (code);
19626 }
19627
19628 split_double_mode (mode, &op0, 1, lo+0, hi+0);
19629 split_double_mode (mode, &op1, 1, lo+1, hi+1);
19630
19631 submode = mode == DImode ? SImode : DImode;
19632
19633 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
19634 avoid two branches. This costs one extra insn, so disable when
19635 optimizing for size. */
19636
19637 if ((code == EQ || code == NE)
19638 && (!optimize_insn_for_size_p ()
19639 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19640 {
19641 rtx xor0, xor1;
19642
19643 xor1 = hi[0];
19644 if (hi[1] != const0_rtx)
19645 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19646 NULL_RTX, 0, OPTAB_WIDEN);
19647
19648 xor0 = lo[0];
19649 if (lo[1] != const0_rtx)
19650 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19651 NULL_RTX, 0, OPTAB_WIDEN);
19652
19653 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19654 NULL_RTX, 0, OPTAB_WIDEN);
19655
19656 ix86_expand_branch (code, tmp, const0_rtx, label);
19657 return;
19658 }
19659
19660 /* Otherwise, if we are doing a less-than or greater-or-equal-than
19661 comparison, op1 is a constant and its low word is zero, then we can
19662 just examine the high word. Similarly when the low word is -1 and the
19663 comparison is less-or-equal-than or greater-than. */
19664
19665 if (CONST_INT_P (hi[1]))
19666 switch (code)
19667 {
19668 case LT: case LTU: case GE: case GEU:
19669 if (lo[1] == const0_rtx)
19670 {
19671 ix86_expand_branch (code, hi[0], hi[1], label);
19672 return;
19673 }
19674 break;
19675 case LE: case LEU: case GT: case GTU:
19676 if (lo[1] == constm1_rtx)
19677 {
19678 ix86_expand_branch (code, hi[0], hi[1], label);
19679 return;
19680 }
19681 break;
19682 default:
19683 break;
19684 }
19685
19686 /* Otherwise, we need two or three jumps. */
19687
19688 label2 = gen_label_rtx ();
19689
19690 code1 = code;
19691 code2 = swap_condition (code);
19692 code3 = unsigned_condition (code);
19693
19694 switch (code)
19695 {
19696 case LT: case GT: case LTU: case GTU:
19697 break;
19698
19699 case LE: code1 = LT; code2 = GT; break;
19700 case GE: code1 = GT; code2 = LT; break;
19701 case LEU: code1 = LTU; code2 = GTU; break;
19702 case GEU: code1 = GTU; code2 = LTU; break;
19703
19704 case EQ: code1 = UNKNOWN; code2 = NE; break;
19705 case NE: code2 = UNKNOWN; break;
19706
19707 default:
19708 gcc_unreachable ();
19709 }
19710
19711 /*
19712 * a < b =>
19713 * if (hi(a) < hi(b)) goto true;
19714 * if (hi(a) > hi(b)) goto false;
19715 * if (lo(a) < lo(b)) goto true;
19716 * false:
19717 */
19718
19719 if (code1 != UNKNOWN)
19720 ix86_expand_branch (code1, hi[0], hi[1], label);
19721 if (code2 != UNKNOWN)
19722 ix86_expand_branch (code2, hi[0], hi[1], label2);
19723
19724 ix86_expand_branch (code3, lo[0], lo[1], label);
19725
19726 if (code2 != UNKNOWN)
19727 emit_label (label2);
19728 return;
19729 }
19730
19731 default:
19732 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19733 goto simple;
19734 }
19735 }
19736
19737 /* Split branch based on floating point condition. */
19738 void
19739 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19740 rtx target1, rtx target2, rtx tmp, rtx pushed)
19741 {
19742 rtx condition;
19743 rtx i;
19744
19745 if (target2 != pc_rtx)
19746 {
19747 rtx tmp = target2;
19748 code = reverse_condition_maybe_unordered (code);
19749 target2 = target1;
19750 target1 = tmp;
19751 }
19752
19753 condition = ix86_expand_fp_compare (code, op1, op2,
19754 tmp);
19755
19756 /* Remove pushed operand from stack. */
19757 if (pushed)
19758 ix86_free_from_memory (GET_MODE (pushed));
19759
19760 i = emit_jump_insn (gen_rtx_SET
19761 (VOIDmode, pc_rtx,
19762 gen_rtx_IF_THEN_ELSE (VOIDmode,
19763 condition, target1, target2)));
19764 if (split_branch_probability >= 0)
19765 add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
19766 }
19767
19768 void
19769 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19770 {
19771 rtx ret;
19772
19773 gcc_assert (GET_MODE (dest) == QImode);
19774
19775 ret = ix86_expand_compare (code, op0, op1);
19776 PUT_MODE (ret, QImode);
19777 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19778 }
19779
19780 /* Expand a comparison setting or clearing the carry flag. Return true when
19781 successful, and set *POP to the resulting comparison. */
19782 static bool
19783 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19784 {
19785 enum machine_mode mode =
19786 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19787
19788 /* Do not handle double-mode compares that go through the special path. */
19789 if (mode == (TARGET_64BIT ? TImode : DImode))
19790 return false;
19791
19792 if (SCALAR_FLOAT_MODE_P (mode))
19793 {
19794 rtx compare_op, compare_seq;
19795
19796 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19797
19798 /* Shortcut: the following common codes never translate
19799 into carry-flag compares. */
19800 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19801 || code == ORDERED || code == UNORDERED)
19802 return false;
19803
19804 /* These comparisons require the zero flag; swap operands so they don't. */
19805 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19806 && !TARGET_IEEE_FP)
19807 {
19808 rtx tmp = op0;
19809 op0 = op1;
19810 op1 = tmp;
19811 code = swap_condition (code);
19812 }
19813
19814 /* Try to expand the comparison and verify that we end up with
19815 a carry-flag-based comparison. This fails only when we decide
19816 to expand the comparison using arithmetic, which is not a
19817 common scenario. */
19818 start_sequence ();
19819 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19820 compare_seq = get_insns ();
19821 end_sequence ();
19822
19823 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19824 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19825 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19826 else
19827 code = GET_CODE (compare_op);
19828
19829 if (code != LTU && code != GEU)
19830 return false;
19831
19832 emit_insn (compare_seq);
19833 *pop = compare_op;
19834 return true;
19835 }
19836
19837 if (!INTEGRAL_MODE_P (mode))
19838 return false;
19839
19840 switch (code)
19841 {
19842 case LTU:
19843 case GEU:
19844 break;
19845
19846 /* Convert a==0 into (unsigned)a<1. */
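/* E.g. after "cmp $1, a" the carry flag is set exactly when a == 0,
   so EQ/NE against zero become LTU/GEU carry-flag tests.  */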
19847 case EQ:
19848 case NE:
19849 if (op1 != const0_rtx)
19850 return false;
19851 op1 = const1_rtx;
19852 code = (code == EQ ? LTU : GEU);
19853 break;
19854
19855 /* Convert a>b into b<a or a>=b+1. */
19856 case GTU:
19857 case LEU:
19858 if (CONST_INT_P (op1))
19859 {
19860 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19861 /* Bail out on overflow. We could still swap the operands, but
19862 that would force loading the constant into a register. */
19863 if (op1 == const0_rtx
19864 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19865 return false;
19866 code = (code == GTU ? GEU : LTU);
19867 }
19868 else
19869 {
19870 rtx tmp = op1;
19871 op1 = op0;
19872 op0 = tmp;
19873 code = (code == GTU ? LTU : GEU);
19874 }
19875 break;
19876
19877 /* Convert a>=0 into (unsigned)a<0x80000000. */
19878 case LT:
19879 case GE:
19880 if (mode == DImode || op1 != const0_rtx)
19881 return false;
19882 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19883 code = (code == LT ? GEU : LTU);
19884 break;
19885 case LE:
19886 case GT:
19887 if (mode == DImode || op1 != constm1_rtx)
19888 return false;
19889 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19890 code = (code == LE ? GEU : LTU);
19891 break;
19892
19893 default:
19894 return false;
19895 }
19896 /* Swapping operands may cause a constant to appear as the first operand. */
19897 if (!nonimmediate_operand (op0, VOIDmode))
19898 {
19899 if (!can_create_pseudo_p ())
19900 return false;
19901 op0 = force_reg (mode, op0);
19902 }
19903 *pop = ix86_expand_compare (code, op0, op1);
19904 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19905 return true;
19906 }
19907
19908 bool
19909 ix86_expand_int_movcc (rtx operands[])
19910 {
19911 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19912 rtx compare_seq, compare_op;
19913 enum machine_mode mode = GET_MODE (operands[0]);
19914 bool sign_bit_compare_p = false;
19915 rtx op0 = XEXP (operands[1], 0);
19916 rtx op1 = XEXP (operands[1], 1);
19917
19918 if (GET_MODE (op0) == TImode
19919 || (GET_MODE (op0) == DImode
19920 && !TARGET_64BIT))
19921 return false;
19922
19923 start_sequence ();
19924 compare_op = ix86_expand_compare (code, op0, op1);
19925 compare_seq = get_insns ();
19926 end_sequence ();
19927
19928 compare_code = GET_CODE (compare_op);
19929
19930 if ((op1 == const0_rtx && (code == GE || code == LT))
19931 || (op1 == constm1_rtx && (code == GT || code == LE)))
19932 sign_bit_compare_p = true;
19933
19934 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19935 HImode insns, we'd be swallowed in word prefix ops. */
19936
19937 if ((mode != HImode || TARGET_FAST_PREFIX)
19938 && (mode != (TARGET_64BIT ? TImode : DImode))
19939 && CONST_INT_P (operands[2])
19940 && CONST_INT_P (operands[3]))
19941 {
19942 rtx out = operands[0];
19943 HOST_WIDE_INT ct = INTVAL (operands[2]);
19944 HOST_WIDE_INT cf = INTVAL (operands[3]);
19945 HOST_WIDE_INT diff;
19946
19947 diff = ct - cf;
19948 /* Sign-bit compares are better done using shifts than by using
19949 sbb. */
19950 if (sign_bit_compare_p
19951 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19952 {
19953 /* Detect overlap between destination and compare sources. */
19954 rtx tmp = out;
19955
19956 if (!sign_bit_compare_p)
19957 {
19958 rtx flags;
19959 bool fpcmp = false;
19960
19961 compare_code = GET_CODE (compare_op);
19962
19963 flags = XEXP (compare_op, 0);
19964
19965 if (GET_MODE (flags) == CCFPmode
19966 || GET_MODE (flags) == CCFPUmode)
19967 {
19968 fpcmp = true;
19969 compare_code
19970 = ix86_fp_compare_code_to_integer (compare_code);
19971 }
19972
19973 /* To simplify rest of code, restrict to the GEU case. */
19974 if (compare_code == LTU)
19975 {
19976 HOST_WIDE_INT tmp = ct;
19977 ct = cf;
19978 cf = tmp;
19979 compare_code = reverse_condition (compare_code);
19980 code = reverse_condition (code);
19981 }
19982 else
19983 {
19984 if (fpcmp)
19985 PUT_CODE (compare_op,
19986 reverse_condition_maybe_unordered
19987 (GET_CODE (compare_op)));
19988 else
19989 PUT_CODE (compare_op,
19990 reverse_condition (GET_CODE (compare_op)));
19991 }
19992 diff = ct - cf;
19993
19994 if (reg_overlap_mentioned_p (out, op0)
19995 || reg_overlap_mentioned_p (out, op1))
19996 tmp = gen_reg_rtx (mode);
19997
19998 if (mode == DImode)
19999 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
20000 else
20001 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
20002 flags, compare_op));
20003 }
20004 else
20005 {
20006 if (code == GT || code == GE)
20007 code = reverse_condition (code);
20008 else
20009 {
20010 HOST_WIDE_INT tmp = ct;
20011 ct = cf;
20012 cf = tmp;
20013 diff = ct - cf;
20014 }
20015 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
20016 }
20017
20018 if (diff == 1)
20019 {
20020 /*
20021 * cmpl op0,op1
20022 * sbbl dest,dest
20023 * [addl dest, ct]
20024 *
20025 * Size 5 - 8.
20026 */
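/* tmp is 0 or -1 from the sbb, so adding ct yields either ct or
   ct - 1 == cf; e.g. ct = 6, cf = 5 selects between 6 and 5 with no
   branch.  */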
20027 if (ct)
20028 tmp = expand_simple_binop (mode, PLUS,
20029 tmp, GEN_INT (ct),
20030 copy_rtx (tmp), 1, OPTAB_DIRECT);
20031 }
20032 else if (cf == -1)
20033 {
20034 /*
20035 * cmpl op0,op1
20036 * sbbl dest,dest
20037 * orl $ct, dest
20038 *
20039 * Size 8.
20040 */
20041 tmp = expand_simple_binop (mode, IOR,
20042 tmp, GEN_INT (ct),
20043 copy_rtx (tmp), 1, OPTAB_DIRECT);
20044 }
20045 else if (diff == -1 && ct)
20046 {
20047 /*
20048 * cmpl op0,op1
20049 * sbbl dest,dest
20050 * notl dest
20051 * [addl dest, cf]
20052 *
20053 * Size 8 - 11.
20054 */
20055 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
20056 if (cf)
20057 tmp = expand_simple_binop (mode, PLUS,
20058 copy_rtx (tmp), GEN_INT (cf),
20059 copy_rtx (tmp), 1, OPTAB_DIRECT);
20060 }
20061 else
20062 {
20063 /*
20064 * cmpl op0,op1
20065 * sbbl dest,dest
20066 * [notl dest]
20067 * andl cf - ct, dest
20068 * [addl dest, ct]
20069 *
20070 * Size 8 - 11.
20071 */
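/* The 0/-1 mask in tmp ANDed with (cf - ct) gives 0 or cf - ct, and
   adding ct then yields ct or cf; e.g. ct = 10, cf = 3:
   (mask & -7) + 10 is either 10 or 3.  */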
20072
20073 if (cf == 0)
20074 {
20075 cf = ct;
20076 ct = 0;
20077 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
20078 }
20079
20080 tmp = expand_simple_binop (mode, AND,
20081 copy_rtx (tmp),
20082 gen_int_mode (cf - ct, mode),
20083 copy_rtx (tmp), 1, OPTAB_DIRECT);
20084 if (ct)
20085 tmp = expand_simple_binop (mode, PLUS,
20086 copy_rtx (tmp), GEN_INT (ct),
20087 copy_rtx (tmp), 1, OPTAB_DIRECT);
20088 }
20089
20090 if (!rtx_equal_p (tmp, out))
20091 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
20092
20093 return true;
20094 }
20095
20096 if (diff < 0)
20097 {
20098 enum machine_mode cmp_mode = GET_MODE (op0);
20099
20100 HOST_WIDE_INT tmp;
20101 tmp = ct, ct = cf, cf = tmp;
20102 diff = -diff;
20103
20104 if (SCALAR_FLOAT_MODE_P (cmp_mode))
20105 {
20106 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
20107
20108 /* We may be reversing an unordered compare to a normal compare, which
20109 is not valid in general (we may convert a non-trapping condition
20110 into a trapping one); however, on i386 we currently emit all
20111 comparisons unordered. */
20112 compare_code = reverse_condition_maybe_unordered (compare_code);
20113 code = reverse_condition_maybe_unordered (code);
20114 }
20115 else
20116 {
20117 compare_code = reverse_condition (compare_code);
20118 code = reverse_condition (code);
20119 }
20120 }
20121
20122 compare_code = UNKNOWN;
20123 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
20124 && CONST_INT_P (op1))
20125 {
20126 if (op1 == const0_rtx
20127 && (code == LT || code == GE))
20128 compare_code = code;
20129 else if (op1 == constm1_rtx)
20130 {
20131 if (code == LE)
20132 compare_code = LT;
20133 else if (code == GT)
20134 compare_code = GE;
20135 }
20136 }
20137
20138 /* Optimize dest = (op0 < 0) ? -1 : cf. */
20139 if (compare_code != UNKNOWN
20140 && GET_MODE (op0) == GET_MODE (out)
20141 && (cf == -1 || ct == -1))
20142 {
20143 /* If the lea code below could be used, only optimize
20144 if it results in a 2-insn sequence. */
20145
20146 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
20147 || diff == 3 || diff == 5 || diff == 9)
20148 || (compare_code == LT && ct == -1)
20149 || (compare_code == GE && cf == -1))
20150 {
20151 /*
20152 * notl op1 (if necessary)
20153 * sarl $31, op1
20154 * orl cf, op1
20155 */
20156 if (ct != -1)
20157 {
20158 cf = ct;
20159 ct = -1;
20160 code = reverse_condition (code);
20161 }
20162
20163 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20164
20165 out = expand_simple_binop (mode, IOR,
20166 out, GEN_INT (cf),
20167 out, 1, OPTAB_DIRECT);
20168 if (out != operands[0])
20169 emit_move_insn (operands[0], out);
20170
20171 return true;
20172 }
20173 }
20174
20175
20176 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
20177 || diff == 3 || diff == 5 || diff == 9)
20178 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
20179 && (mode != DImode
20180 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
20181 {
20182 /*
20183 * xorl dest,dest
20184 * cmpl op1,op2
20185 * setcc dest
20186 * lea cf(dest*(ct-cf)),dest
20187 *
20188 * Size 14.
20189 *
20190 * This also catches the degenerate setcc-only case.
20191 */
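/* out holds the 0/1 setcc result, so the lea computes
   cf + out * (ct - cf); e.g. ct = 13, cf = 5 (diff = 8) becomes
   lea 5(,out,8),dest.  */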
20192
20193 rtx tmp;
20194 int nops;
20195
20196 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20197
20198 nops = 0;
20199 /* On x86_64 the lea instruction operates on Pmode, so we need
20200 the arithmetic done in the proper mode to match. */
20201 if (diff == 1)
20202 tmp = copy_rtx (out);
20203 else
20204 {
20205 rtx out1;
20206 out1 = copy_rtx (out);
20207 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
20208 nops++;
20209 if (diff & 1)
20210 {
20211 tmp = gen_rtx_PLUS (mode, tmp, out1);
20212 nops++;
20213 }
20214 }
20215 if (cf != 0)
20216 {
20217 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
20218 nops++;
20219 }
20220 if (!rtx_equal_p (tmp, out))
20221 {
20222 if (nops == 1)
20223 out = force_operand (tmp, copy_rtx (out));
20224 else
20225 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
20226 }
20227 if (!rtx_equal_p (out, operands[0]))
20228 emit_move_insn (operands[0], copy_rtx (out));
20229
20230 return true;
20231 }
20232
20233 /*
20234 * General case: Jumpful:
20235 * xorl dest,dest cmpl op1, op2
20236 * cmpl op1, op2 movl ct, dest
20237 * setcc dest jcc 1f
20238 * decl dest movl cf, dest
20239 * andl (cf-ct),dest 1:
20240 * addl ct,dest
20241 *
20242 * Size 20. Size 14.
20243 *
20244 * This is reasonably steep, but branch mispredict costs are
20245 * high on modern cpus, so consider failing only if optimizing
20246 * for space.
20247 */
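/* In the branchless form: setcc leaves 0/1 in dest, decl turns that
   into -1/0, andl selects 0 or cf - ct, and addl gives ct or cf;
   e.g. ct = 7, cf = 2: ((1 - 1) & -5) + 7 == 7 and
   ((0 - 1) & -5) + 7 == 2.  */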
20248
20249 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20250 && BRANCH_COST (optimize_insn_for_speed_p (),
20251 false) >= 2)
20252 {
20253 if (cf == 0)
20254 {
20255 enum machine_mode cmp_mode = GET_MODE (op0);
20256
20257 cf = ct;
20258 ct = 0;
20259
20260 if (SCALAR_FLOAT_MODE_P (cmp_mode))
20261 {
20262 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
20263
20264 /* We may be reversing an unordered compare to a normal compare,
20265 which is not valid in general (we may convert a non-trapping
20266 condition into a trapping one); however, on i386 we currently
20267 emit all comparisons unordered. */
20268 code = reverse_condition_maybe_unordered (code);
20269 }
20270 else
20271 {
20272 code = reverse_condition (code);
20273 if (compare_code != UNKNOWN)
20274 compare_code = reverse_condition (compare_code);
20275 }
20276 }
20277
20278 if (compare_code != UNKNOWN)
20279 {
20280 /* notl op1 (if needed)
20281 sarl $31, op1
20282 andl (cf-ct), op1
20283 addl ct, op1
20284
20285 For x < 0 (resp. x <= -1) there will be no notl,
20286 so if possible swap the constants to get rid of the
20287 complement.
20288 True/false will be -1/0 while code below (store flag
20289 followed by decrement) is 0/-1, so the constants need
20290 to be exchanged once more. */
20291
20292 if (compare_code == GE || !cf)
20293 {
20294 code = reverse_condition (code);
20295 compare_code = LT;
20296 }
20297 else
20298 {
20299 HOST_WIDE_INT tmp = cf;
20300 cf = ct;
20301 ct = tmp;
20302 }
20303
20304 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20305 }
20306 else
20307 {
20308 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20309
20310 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
20311 constm1_rtx,
20312 copy_rtx (out), 1, OPTAB_DIRECT);
20313 }
20314
20315 out = expand_simple_binop (mode, AND, copy_rtx (out),
20316 gen_int_mode (cf - ct, mode),
20317 copy_rtx (out), 1, OPTAB_DIRECT);
20318 if (ct)
20319 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
20320 copy_rtx (out), 1, OPTAB_DIRECT);
20321 if (!rtx_equal_p (out, operands[0]))
20322 emit_move_insn (operands[0], copy_rtx (out));
20323
20324 return true;
20325 }
20326 }
20327
20328 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20329 {
20330 /* Try a few things more with specific constants and a variable. */
20331
20332 optab op;
20333 rtx var, orig_out, out, tmp;
20334
20335 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
20336 return false;
20337
20338 /* If one of the two operands is an interesting constant, load a
20339 constant with the above and mask it in with a logical operation. */
20340
20341 if (CONST_INT_P (operands[2]))
20342 {
20343 var = operands[3];
20344 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
20345 operands[3] = constm1_rtx, op = and_optab;
20346 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
20347 operands[3] = const0_rtx, op = ior_optab;
20348 else
20349 return false;
20350 }
20351 else if (CONST_INT_P (operands[3]))
20352 {
20353 var = operands[2];
20354 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
20355 operands[2] = constm1_rtx, op = and_optab;
20356 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
20357 operands[2] = const0_rtx, op = ior_optab;
20358 else
20359 return false;
20360 }
20361 else
20362 return false;
20363
20364 orig_out = operands[0];
20365 tmp = gen_reg_rtx (mode);
20366 operands[0] = tmp;
20367
20368 /* Recurse to get the constant loaded. */
20369 if (ix86_expand_int_movcc (operands) == 0)
20370 return false;
20371
20372 /* Mask in the interesting variable. */
20373 out = expand_binop (mode, op, var, tmp, orig_out, 0,
20374 OPTAB_WIDEN);
20375 if (!rtx_equal_p (out, orig_out))
20376 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
20377
20378 return true;
20379 }
20380
20381 /*
20382 * For comparison with above,
20383 *
20384 * movl cf,dest
20385 * movl ct,tmp
20386 * cmpl op1,op2
20387 * cmovcc tmp,dest
20388 *
20389 * Size 15.
20390 */
20391
20392 if (! nonimmediate_operand (operands[2], mode))
20393 operands[2] = force_reg (mode, operands[2]);
20394 if (! nonimmediate_operand (operands[3], mode))
20395 operands[3] = force_reg (mode, operands[3]);
20396
20397 if (! register_operand (operands[2], VOIDmode)
20398 && (mode == QImode
20399 || ! register_operand (operands[3], VOIDmode)))
20400 operands[2] = force_reg (mode, operands[2]);
20401
20402 if (mode == QImode
20403 && ! register_operand (operands[3], VOIDmode))
20404 operands[3] = force_reg (mode, operands[3]);
20405
20406 emit_insn (compare_seq);
20407 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20408 gen_rtx_IF_THEN_ELSE (mode,
20409 compare_op, operands[2],
20410 operands[3])));
20411 return true;
20412 }
20413
20414 /* Swap, force into registers, or otherwise massage the two operands
20415 to an sse comparison with a mask result. Thus we differ a bit from
20416 ix86_prepare_fp_compare_args which expects to produce a flags result.
20417
20418 The DEST operand exists to help determine whether to commute commutative
20419 operators. The POP0/POP1 operands are updated in place. The new
20420 comparison code is returned, or UNKNOWN if not implementable. */
20421
20422 static enum rtx_code
20423 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
20424 rtx *pop0, rtx *pop1)
20425 {
20426 rtx tmp;
20427
20428 switch (code)
20429 {
20430 case LTGT:
20431 case UNEQ:
20432 /* AVX supports all the needed comparisons. */
20433 if (TARGET_AVX)
20434 break;
20435 /* We have no LTGT as an operator. We could implement it with
20436 NE & ORDERED, but this requires an extra temporary. It's
20437 not clear that it's worth it. */
20438 return UNKNOWN;
20439
20440 case LT:
20441 case LE:
20442 case UNGT:
20443 case UNGE:
20444 /* These are supported directly. */
20445 break;
20446
20447 case EQ:
20448 case NE:
20449 case UNORDERED:
20450 case ORDERED:
20451 /* AVX has 3 operand comparisons, no need to swap anything. */
20452 if (TARGET_AVX)
20453 break;
20454 /* For commutative operators, try to canonicalize the destination
20455 operand to be first in the comparison - this helps reload to
20456 avoid extra moves. */
20457 if (!dest || !rtx_equal_p (dest, *pop1))
20458 break;
20459 /* FALLTHRU */
20460
20461 case GE:
20462 case GT:
20463 case UNLE:
20464 case UNLT:
20465 /* These are not supported directly before AVX, and furthermore
20466 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
20467 comparison operands to transform into something that is
20468 supported. */
20469 tmp = *pop0;
20470 *pop0 = *pop1;
20471 *pop1 = tmp;
20472 code = swap_condition (code);
20473 break;
20474
20475 default:
20476 gcc_unreachable ();
20477 }
20478
20479 return code;
20480 }
20481
20482 /* Detect conditional moves that exactly match min/max operational
20483 semantics. Note that this is IEEE safe, as long as we don't
20484 interchange the operands.
20485
20486 Returns FALSE if this conditional move doesn't match a MIN/MAX,
20487 and TRUE if the operation is successful and instructions are emitted. */
20488
20489 static bool
20490 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
20491 rtx cmp_op1, rtx if_true, rtx if_false)
20492 {
20493 enum machine_mode mode;
20494 bool is_min;
20495 rtx tmp;
20496
20497 if (code == LT)
20498 ;
20499 else if (code == UNGE)
20500 {
20501 tmp = if_true;
20502 if_true = if_false;
20503 if_false = tmp;
20504 }
20505 else
20506 return false;
20507
20508 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
20509 is_min = true;
20510 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
20511 is_min = false;
20512 else
20513 return false;
20514
20515 mode = GET_MODE (dest);
20516
20517 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
20518 but MODE may be a vector mode and thus not appropriate. */
20519 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
20520 {
20521 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
20522 rtvec v;
20523
20524 if_true = force_reg (mode, if_true);
20525 v = gen_rtvec (2, if_true, if_false);
20526 tmp = gen_rtx_UNSPEC (mode, v, u);
20527 }
20528 else
20529 {
20530 code = is_min ? SMIN : SMAX;
20531 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
20532 }
20533
20534 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
20535 return true;
20536 }
20537
20538 /* Expand an sse vector comparison. Return the register with the result. */
20539
20540 static rtx
20541 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
20542 rtx op_true, rtx op_false)
20543 {
20544 enum machine_mode mode = GET_MODE (dest);
20545 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
20546 rtx x;
20547
20548 cmp_op0 = force_reg (cmp_mode, cmp_op0);
20549 if (!nonimmediate_operand (cmp_op1, cmp_mode))
20550 cmp_op1 = force_reg (cmp_mode, cmp_op1);
20551
20552 if (optimize
20553 || reg_overlap_mentioned_p (dest, op_true)
20554 || reg_overlap_mentioned_p (dest, op_false))
20555 dest = gen_reg_rtx (mode);
20556
20557 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
20558 if (cmp_mode != mode)
20559 {
20560 x = force_reg (cmp_mode, x);
20561 convert_move (dest, x, false);
20562 }
20563 else
20564 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20565
20566 return dest;
20567 }
20568
20569 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
20570 operations. This is used for both scalar and vector conditional moves. */
20571
20572 static void
20573 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
20574 {
20575 enum machine_mode mode = GET_MODE (dest);
20576 rtx t2, t3, x;
20577
20578 if (vector_all_ones_operand (op_true, mode)
20579 && rtx_equal_p (op_false, CONST0_RTX (mode)))
20580 {
20581 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
20582 }
20583 else if (op_false == CONST0_RTX (mode))
20584 {
20585 op_true = force_reg (mode, op_true);
20586 x = gen_rtx_AND (mode, cmp, op_true);
20587 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20588 }
20589 else if (op_true == CONST0_RTX (mode))
20590 {
20591 op_false = force_reg (mode, op_false);
20592 x = gen_rtx_NOT (mode, cmp);
20593 x = gen_rtx_AND (mode, x, op_false);
20594 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20595 }
20596 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
20597 {
20598 op_false = force_reg (mode, op_false);
20599 x = gen_rtx_IOR (mode, cmp, op_false);
20600 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20601 }
20602 else if (TARGET_XOP)
20603 {
20604 op_true = force_reg (mode, op_true);
20605
20606 if (!nonimmediate_operand (op_false, mode))
20607 op_false = force_reg (mode, op_false);
20608
20609 emit_insn (gen_rtx_SET (mode, dest,
20610 gen_rtx_IF_THEN_ELSE (mode, cmp,
20611 op_true,
20612 op_false)));
20613 }
20614 else
20615 {
20616 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
20617 rtx d = dest;
20618
20619 if (!nonimmediate_operand (op_true, mode))
20620 op_true = force_reg (mode, op_true);
20621
20622 op_false = force_reg (mode, op_false);
20623
20624 switch (mode)
20625 {
20626 case V4SFmode:
20627 if (TARGET_SSE4_1)
20628 gen = gen_sse4_1_blendvps;
20629 break;
20630 case V2DFmode:
20631 if (TARGET_SSE4_1)
20632 gen = gen_sse4_1_blendvpd;
20633 break;
20634 case V16QImode:
20635 case V8HImode:
20636 case V4SImode:
20637 case V2DImode:
20638 if (TARGET_SSE4_1)
20639 {
20640 gen = gen_sse4_1_pblendvb;
20641 if (mode != V16QImode)
20642 d = gen_reg_rtx (V16QImode);
20643 op_false = gen_lowpart (V16QImode, op_false);
20644 op_true = gen_lowpart (V16QImode, op_true);
20645 cmp = gen_lowpart (V16QImode, cmp);
20646 }
20647 break;
20648 case V8SFmode:
20649 if (TARGET_AVX)
20650 gen = gen_avx_blendvps256;
20651 break;
20652 case V4DFmode:
20653 if (TARGET_AVX)
20654 gen = gen_avx_blendvpd256;
20655 break;
20656 case V32QImode:
20657 case V16HImode:
20658 case V8SImode:
20659 case V4DImode:
20660 if (TARGET_AVX2)
20661 {
20662 gen = gen_avx2_pblendvb;
20663 if (mode != V32QImode)
20664 d = gen_reg_rtx (V32QImode);
20665 op_false = gen_lowpart (V32QImode, op_false);
20666 op_true = gen_lowpart (V32QImode, op_true);
20667 cmp = gen_lowpart (V32QImode, cmp);
20668 }
20669 break;
20670 default:
20671 break;
20672 }
20673
20674 if (gen != NULL)
20675 {
20676 emit_insn (gen (d, op_false, op_true, cmp));
20677 if (d != dest)
20678 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
20679 }
20680 else
20681 {
20682 op_true = force_reg (mode, op_true);
20683
20684 t2 = gen_reg_rtx (mode);
20685 if (optimize)
20686 t3 = gen_reg_rtx (mode);
20687 else
20688 t3 = dest;
20689
20690 x = gen_rtx_AND (mode, op_true, cmp);
20691 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20692
20693 x = gen_rtx_NOT (mode, cmp);
20694 x = gen_rtx_AND (mode, x, op_false);
20695 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20696
20697 x = gen_rtx_IOR (mode, t3, t2);
20698 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20699 }
20700 }
20701 }
20702
20703 /* Expand a floating-point conditional move. Return true if successful. */
20704
20705 bool
20706 ix86_expand_fp_movcc (rtx operands[])
20707 {
20708 enum machine_mode mode = GET_MODE (operands[0]);
20709 enum rtx_code code = GET_CODE (operands[1]);
20710 rtx tmp, compare_op;
20711 rtx op0 = XEXP (operands[1], 0);
20712 rtx op1 = XEXP (operands[1], 1);
20713
20714 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20715 {
20716 enum machine_mode cmode;
20717
20718 /* Since we've no cmove for sse registers, don't force bad register
20719 allocation just to gain access to it. Deny movcc when the
20720 comparison mode doesn't match the move mode. */
20721 cmode = GET_MODE (op0);
20722 if (cmode == VOIDmode)
20723 cmode = GET_MODE (op1);
20724 if (cmode != mode)
20725 return false;
20726
20727 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20728 if (code == UNKNOWN)
20729 return false;
20730
20731 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20732 operands[2], operands[3]))
20733 return true;
20734
20735 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20736 operands[2], operands[3]);
20737 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20738 return true;
20739 }
20740
20741 if (GET_MODE (op0) == TImode
20742 || (GET_MODE (op0) == DImode
20743 && !TARGET_64BIT))
20744 return false;
20745
20746 /* The floating point conditional move instructions don't directly
20747 support conditions resulting from a signed integer comparison. */
20748
20749 compare_op = ix86_expand_compare (code, op0, op1);
20750 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20751 {
20752 tmp = gen_reg_rtx (QImode);
20753 ix86_expand_setcc (tmp, code, op0, op1);
20754
20755 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20756 }
20757
20758 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20759 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20760 operands[2], operands[3])));
20761
20762 return true;
20763 }
20764
20765 /* Expand a floating-point vector conditional move; a vcond operation
20766 rather than a movcc operation. */
20767
20768 bool
20769 ix86_expand_fp_vcond (rtx operands[])
20770 {
20771 enum rtx_code code = GET_CODE (operands[3]);
20772 rtx cmp;
20773
20774 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20775 &operands[4], &operands[5]);
20776 if (code == UNKNOWN)
20777 {
20778 rtx temp;
20779 switch (GET_CODE (operands[3]))
20780 {
20781 case LTGT:
20782 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20783 operands[5], operands[0], operands[0]);
20784 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20785 operands[5], operands[1], operands[2]);
20786 code = AND;
20787 break;
20788 case UNEQ:
20789 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20790 operands[5], operands[0], operands[0]);
20791 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20792 operands[5], operands[1], operands[2]);
20793 code = IOR;
20794 break;
20795 default:
20796 gcc_unreachable ();
20797 }
20798 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20799 OPTAB_DIRECT);
20800 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20801 return true;
20802 }
20803
20804 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20805 operands[5], operands[1], operands[2]))
20806 return true;
20807
20808 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20809 operands[1], operands[2]);
20810 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20811 return true;
20812 }
20813
20814 /* Expand a signed/unsigned integral vector conditional move. */
20815
20816 bool
20817 ix86_expand_int_vcond (rtx operands[])
20818 {
20819 enum machine_mode data_mode = GET_MODE (operands[0]);
20820 enum machine_mode mode = GET_MODE (operands[4]);
20821 enum rtx_code code = GET_CODE (operands[3]);
20822 bool negate = false;
20823 rtx x, cop0, cop1;
20824
20825 cop0 = operands[4];
20826 cop1 = operands[5];
20827
20828 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20829 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
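/* E.g. for 32-bit elements, x = -5 gives (signed) x >> 31 == -1 and
   (unsigned) x >> 31 == 1, while any non-negative x gives 0 for both.  */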
20830 if ((code == LT || code == GE)
20831 && data_mode == mode
20832 && cop1 == CONST0_RTX (mode)
20833 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20834 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20835 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20836 && (GET_MODE_SIZE (data_mode) == 16
20837 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20838 {
20839 rtx negop = operands[2 - (code == LT)];
20840 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20841 if (negop == CONST1_RTX (data_mode))
20842 {
20843 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20844 operands[0], 1, OPTAB_DIRECT);
20845 if (res != operands[0])
20846 emit_move_insn (operands[0], res);
20847 return true;
20848 }
20849 else if (GET_MODE_INNER (data_mode) != DImode
20850 && vector_all_ones_operand (negop, data_mode))
20851 {
20852 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20853 operands[0], 0, OPTAB_DIRECT);
20854 if (res != operands[0])
20855 emit_move_insn (operands[0], res);
20856 return true;
20857 }
20858 }
20859
20860 if (!nonimmediate_operand (cop1, mode))
20861 cop1 = force_reg (mode, cop1);
20862 if (!general_operand (operands[1], data_mode))
20863 operands[1] = force_reg (data_mode, operands[1]);
20864 if (!general_operand (operands[2], data_mode))
20865 operands[2] = force_reg (data_mode, operands[2]);
20866
20867 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20868 if (TARGET_XOP
20869 && (mode == V16QImode || mode == V8HImode
20870 || mode == V4SImode || mode == V2DImode))
20871 ;
20872 else
20873 {
20874 /* Canonicalize the comparison to EQ, GT, GTU. */
20875 switch (code)
20876 {
20877 case EQ:
20878 case GT:
20879 case GTU:
20880 break;
20881
20882 case NE:
20883 case LE:
20884 case LEU:
20885 code = reverse_condition (code);
20886 negate = true;
20887 break;
20888
20889 case GE:
20890 case GEU:
20891 code = reverse_condition (code);
20892 negate = true;
20893 /* FALLTHRU */
20894
20895 case LT:
20896 case LTU:
20897 code = swap_condition (code);
20898 x = cop0, cop0 = cop1, cop1 = x;
20899 break;
20900
20901 default:
20902 gcc_unreachable ();
20903 }
20904
20905 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20906 if (mode == V2DImode)
20907 {
20908 switch (code)
20909 {
20910 case EQ:
20911 /* SSE4.1 supports EQ. */
20912 if (!TARGET_SSE4_1)
20913 return false;
20914 break;
20915
20916 case GT:
20917 case GTU:
20918 /* SSE4.2 supports GT/GTU. */
20919 if (!TARGET_SSE4_2)
20920 return false;
20921 break;
20922
20923 default:
20924 gcc_unreachable ();
20925 }
20926 }
20927
20928 /* Unsigned parallel compare is not supported by the hardware.
20929 Play some tricks to turn this into a signed comparison
20930 instead. */
20931 if (code == GTU)
20932 {
20933 cop0 = force_reg (mode, cop0);
20934
20935 switch (mode)
20936 {
20937 case V8SImode:
20938 case V4DImode:
20939 case V4SImode:
20940 case V2DImode:
20941 {
20942 rtx t1, t2, mask;
20943 rtx (*gen_sub3) (rtx, rtx, rtx);
20944
20945 switch (mode)
20946 {
20947 case V8SImode: gen_sub3 = gen_subv8si3; break;
20948 case V4DImode: gen_sub3 = gen_subv4di3; break;
20949 case V4SImode: gen_sub3 = gen_subv4si3; break;
20950 case V2DImode: gen_sub3 = gen_subv2di3; break;
20951 default:
20952 gcc_unreachable ();
20953 }
20954 /* Subtract (-(INT MAX) - 1) from both operands to make
20955 them signed. */
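/* Subtracting the sign-bit value maps unsigned order onto signed
   order; e.g. for 32-bit elements 0xffffffff and 0 become 0x7fffffff
   and 0x80000000, so the unsigned GTU turns into a signed GT.  */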
20956 mask = ix86_build_signbit_mask (mode, true, false);
20957 t1 = gen_reg_rtx (mode);
20958 emit_insn (gen_sub3 (t1, cop0, mask));
20959
20960 t2 = gen_reg_rtx (mode);
20961 emit_insn (gen_sub3 (t2, cop1, mask));
20962
20963 cop0 = t1;
20964 cop1 = t2;
20965 code = GT;
20966 }
20967 break;
20968
20969 case V32QImode:
20970 case V16HImode:
20971 case V16QImode:
20972 case V8HImode:
20973 /* Perform a parallel unsigned saturating subtraction. */
20974 x = gen_reg_rtx (mode);
20975 emit_insn (gen_rtx_SET (VOIDmode, x,
20976 gen_rtx_US_MINUS (mode, cop0, cop1)));
20977
20978 cop0 = x;
20979 cop1 = CONST0_RTX (mode);
20980 code = EQ;
20981 negate = !negate;
20982 break;
20983
20984 default:
20985 gcc_unreachable ();
20986 }
20987 }
20988 }
20989
20990 /* Allow the comparison to be done in one mode, but the movcc to
20991 happen in another mode. */
20992 if (data_mode == mode)
20993 {
20994 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20995 operands[1+negate], operands[2-negate]);
20996 }
20997 else
20998 {
20999 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
21000 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
21001 operands[1+negate], operands[2-negate]);
21002 x = gen_lowpart (data_mode, x);
21003 }
21004
21005 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
21006 operands[2-negate]);
21007 return true;
21008 }
21009
21010 /* Expand a variable vector permutation. */
21011
21012 void
21013 ix86_expand_vec_perm (rtx operands[])
21014 {
21015 rtx target = operands[0];
21016 rtx op0 = operands[1];
21017 rtx op1 = operands[2];
21018 rtx mask = operands[3];
21019 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
21020 enum machine_mode mode = GET_MODE (op0);
21021 enum machine_mode maskmode = GET_MODE (mask);
21022 int w, e, i;
21023 bool one_operand_shuffle = rtx_equal_p (op0, op1);
21024
21025 /* Number of elements in the vector. */
21026 w = GET_MODE_NUNITS (mode);
21027 e = GET_MODE_UNIT_SIZE (mode);
21028 gcc_assert (w <= 32);
21029
21030 if (TARGET_AVX2)
21031 {
21032 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
21033 {
21034 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
21035 a constant shuffle operand. With a tiny bit of effort we can
21036 use VPERMD instead. A re-interpretation stall for V4DFmode is
21037 unfortunate but there's no avoiding it.
21038 Similarly, for V16HImode we don't have instructions for variable
21039 shuffling, while for V32QImode we can, after preparing suitable
21040 masks, use vpshufb; vpshufb; vpermq; vpor. */
21041
21042 if (mode == V16HImode)
21043 {
21044 maskmode = mode = V32QImode;
21045 w = 32;
21046 e = 1;
21047 }
21048 else
21049 {
21050 maskmode = mode = V8SImode;
21051 w = 8;
21052 e = 4;
21053 }
21054 t1 = gen_reg_rtx (maskmode);
21055
21056 /* Replicate the low bits of the V4DImode mask into V8SImode:
21057 mask = { A B C D }
21058 t1 = { A A B B C C D D }. */
21059 for (i = 0; i < w / 2; ++i)
21060 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
21061 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21062 vt = force_reg (maskmode, vt);
21063 mask = gen_lowpart (maskmode, mask);
21064 if (maskmode == V8SImode)
21065 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
21066 else
21067 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
21068
21069 /* Multiply the shuffle indices by two. */
21070 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
21071 OPTAB_DIRECT);
21072
21073 /* Add one to the odd shuffle indices:
21074 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
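/* E.g. a V4DI mask of { 3 0 2 1 } becomes the V8SI mask
   { 6 7 0 1 4 5 2 3 }, which moves the two dwords of each selected
   qword as a pair.  */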
21075 for (i = 0; i < w / 2; ++i)
21076 {
21077 vec[i * 2] = const0_rtx;
21078 vec[i * 2 + 1] = const1_rtx;
21079 }
21080 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21081 vt = validize_mem (force_const_mem (maskmode, vt));
21082 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
21083 OPTAB_DIRECT);
21084
21085 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
21086 operands[3] = mask = t1;
21087 target = gen_reg_rtx (mode);
21088 op0 = gen_lowpart (mode, op0);
21089 op1 = gen_lowpart (mode, op1);
21090 }
21091
21092 switch (mode)
21093 {
21094 case V8SImode:
21095 /* The VPERMD and VPERMPS instructions already properly ignore
21096 the high bits of the shuffle elements. No need for us to
21097 perform an AND ourselves. */
21098 if (one_operand_shuffle)
21099 {
21100 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
21101 if (target != operands[0])
21102 emit_move_insn (operands[0],
21103 gen_lowpart (GET_MODE (operands[0]), target));
21104 }
21105 else
21106 {
21107 t1 = gen_reg_rtx (V8SImode);
21108 t2 = gen_reg_rtx (V8SImode);
21109 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
21110 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
21111 goto merge_two;
21112 }
21113 return;
21114
21115 case V8SFmode:
21116 mask = gen_lowpart (V8SFmode, mask);
21117 if (one_operand_shuffle)
21118 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
21119 else
21120 {
21121 t1 = gen_reg_rtx (V8SFmode);
21122 t2 = gen_reg_rtx (V8SFmode);
21123 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
21124 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
21125 goto merge_two;
21126 }
21127 return;
21128
21129 case V4SImode:
21130 /* By combining the two 128-bit input vectors into one 256-bit
21131 input vector, we can use VPERMD and VPERMPS for the full
21132 two-operand shuffle. */
21133 t1 = gen_reg_rtx (V8SImode);
21134 t2 = gen_reg_rtx (V8SImode);
21135 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
21136 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
21137 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
21138 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
21139 return;
21140
21141 case V4SFmode:
21142 t1 = gen_reg_rtx (V8SFmode);
21143 t2 = gen_reg_rtx (V8SImode);
21144 mask = gen_lowpart (V4SImode, mask);
21145 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
21146 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
21147 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
21148 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
21149 return;
21150
21151 case V32QImode:
21152 t1 = gen_reg_rtx (V32QImode);
21153 t2 = gen_reg_rtx (V32QImode);
21154 t3 = gen_reg_rtx (V32QImode);
21155 vt2 = GEN_INT (128);
21156 for (i = 0; i < 32; i++)
21157 vec[i] = vt2;
21158 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
21159 vt = force_reg (V32QImode, vt);
21160 for (i = 0; i < 32; i++)
21161 vec[i] = i < 16 ? vt2 : const0_rtx;
21162 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
21163 vt2 = force_reg (V32QImode, vt2);
21164 /* From mask create two adjusted masks, which contain the same
21165 bits as mask in the low 7 bits of each vector element.
21166 The first mask will have the most significant bit clear
21167 if it requests an element from the same 128-bit lane
21168 and MSB set if it requests an element from the other 128-bit lane.
21169 The second mask will have the opposite values of the MSB,
21170 and additionally will have its 128-bit lanes swapped.
21171 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
21172 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
21173 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
21174 stands for the other 12 bytes. */
21175 /* The bit that says whether an element is from the same lane or the
21176 other lane is bit 4, so shift it up by 3 to the MSB position. */
21177 t5 = gen_reg_rtx (V4DImode);
21178 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
21179 GEN_INT (3)));
21180 /* Clear MSB bits from the mask just in case it had them set. */
21181 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
21182 /* After this t1 will have MSB set for elements from other lane. */
21183 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
21184 /* Clear bits other than MSB. */
21185 emit_insn (gen_andv32qi3 (t1, t1, vt));
21186 /* Or in the lower bits from mask into t3. */
21187 emit_insn (gen_iorv32qi3 (t3, t1, t2));
21188 /* And invert MSB bits in t1, so MSB is set for elements from the same
21189 lane. */
21190 emit_insn (gen_xorv32qi3 (t1, t1, vt));
21191 /* Swap 128-bit lanes in t3. */
21192 t6 = gen_reg_rtx (V4DImode);
21193 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
21194 const2_rtx, GEN_INT (3),
21195 const0_rtx, const1_rtx));
21196 /* And or in the lower bits from mask into t1. */
21197 emit_insn (gen_iorv32qi3 (t1, t1, t2));
21198 if (one_operand_shuffle)
21199 {
21200 /* Each of these shuffles will put 0s in places where an
21201 element from the other 128-bit lane is needed; otherwise
21202 it will shuffle in the requested value. */
21203 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
21204 gen_lowpart (V32QImode, t6)));
21205 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
21206 /* For t3 the 128-bit lanes are swapped again. */
21207 t7 = gen_reg_rtx (V4DImode);
21208 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
21209 const2_rtx, GEN_INT (3),
21210 const0_rtx, const1_rtx));
21211 /* And oring both together leads to the result. */
21212 emit_insn (gen_iorv32qi3 (target, t1,
21213 gen_lowpart (V32QImode, t7)));
21214 if (target != operands[0])
21215 emit_move_insn (operands[0],
21216 gen_lowpart (GET_MODE (operands[0]), target));
21217 return;
21218 }
21219
21220 t4 = gen_reg_rtx (V32QImode);
21221 /* Similar to the one_operand_shuffle code above, just repeated
21222 for each of the two operands. The code at merge_two: will
21223 merge the two results together. */
21224 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
21225 gen_lowpart (V32QImode, t6)));
21226 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
21227 gen_lowpart (V32QImode, t6)));
21228 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
21229 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
21230 t7 = gen_reg_rtx (V4DImode);
21231 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
21232 const2_rtx, GEN_INT (3),
21233 const0_rtx, const1_rtx));
21234 t8 = gen_reg_rtx (V4DImode);
21235 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
21236 const2_rtx, GEN_INT (3),
21237 const0_rtx, const1_rtx));
21238 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
21239 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
21240 t1 = t4;
21241 t2 = t3;
21242 goto merge_two;
21243
21244 default:
21245 gcc_assert (GET_MODE_SIZE (mode) <= 16);
21246 break;
21247 }
21248 }
21249
21250 if (TARGET_XOP)
21251 {
21252 /* The XOP VPPERM insn supports three inputs. By ignoring the
21253 one_operand_shuffle special case, we avoid creating another
21254 set of constant vectors in memory. */
21255 one_operand_shuffle = false;
21256
21257 /* mask = mask & {2*w-1, ...} */
21258 vt = GEN_INT (2*w - 1);
21259 }
21260 else
21261 {
21262 /* mask = mask & {w-1, ...} */
21263 vt = GEN_INT (w - 1);
21264 }
21265
21266 for (i = 0; i < w; i++)
21267 vec[i] = vt;
21268 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21269 mask = expand_simple_binop (maskmode, AND, mask, vt,
21270 NULL_RTX, 0, OPTAB_DIRECT);
21271
21272 /* For non-QImode operations, convert the word permutation control
21273 into a byte permutation control. */
21274 if (mode != V16QImode)
21275 {
21276 mask = expand_simple_binop (maskmode, ASHIFT, mask,
21277 GEN_INT (exact_log2 (e)),
21278 NULL_RTX, 0, OPTAB_DIRECT);
21279
21280 /* Convert mask to vector of chars. */
21281 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
21282
21283 /* Replicate each of the input bytes into byte positions:
21284 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
21285 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
21286 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
21287 for (i = 0; i < 16; ++i)
21288 vec[i] = GEN_INT (i/e * e);
21289 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21290 vt = validize_mem (force_const_mem (V16QImode, vt));
21291 if (TARGET_XOP)
21292 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
21293 else
21294 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
21295
21296 /* Convert it into the byte positions by doing
21297 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
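/* E.g. for V4SImode (e == 4) a word index of 2 becomes the byte
   indices { 8 9 10 11 }, ready for pshufb.  */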
21298 for (i = 0; i < 16; ++i)
21299 vec[i] = GEN_INT (i % e);
21300 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21301 vt = validize_mem (force_const_mem (V16QImode, vt));
21302 emit_insn (gen_addv16qi3 (mask, mask, vt));
21303 }
21304
21305 /* The actual shuffle operations all operate on V16QImode. */
21306 op0 = gen_lowpart (V16QImode, op0);
21307 op1 = gen_lowpart (V16QImode, op1);
21308
21309 if (TARGET_XOP)
21310 {
21311 if (GET_MODE (target) != V16QImode)
21312 target = gen_reg_rtx (V16QImode);
21313 emit_insn (gen_xop_pperm (target, op0, op1, mask));
21314 if (target != operands[0])
21315 emit_move_insn (operands[0],
21316 gen_lowpart (GET_MODE (operands[0]), target));
21317 }
21318 else if (one_operand_shuffle)
21319 {
21320 if (GET_MODE (target) != V16QImode)
21321 target = gen_reg_rtx (V16QImode);
21322 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
21323 if (target != operands[0])
21324 emit_move_insn (operands[0],
21325 gen_lowpart (GET_MODE (operands[0]), target));
21326 }
21327 else
21328 {
21329 rtx xops[6];
21330 bool ok;
21331
21332 /* Shuffle the two input vectors independently. */
21333 t1 = gen_reg_rtx (V16QImode);
21334 t2 = gen_reg_rtx (V16QImode);
21335 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
21336 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
21337
21338 merge_two:
21339 /* Then merge them together. The key is whether any given control
21340 element contained a bit set that indicates the second word. */
21341 mask = operands[3];
21342 vt = GEN_INT (w);
21343 if (maskmode == V2DImode && !TARGET_SSE4_1)
21344 {
21345 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
21346 more shuffle to convert the V2DI input mask into a V4SI
21347 input mask. At that point the masking done by
21348 ix86_expand_int_vcond will work as desired. */
21349 rtx t3 = gen_reg_rtx (V4SImode);
21350 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
21351 const0_rtx, const0_rtx,
21352 const2_rtx, const2_rtx));
21353 mask = t3;
21354 maskmode = V4SImode;
21355 e = w = 4;
21356 }
21357
21358 for (i = 0; i < w; i++)
21359 vec[i] = vt;
21360 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21361 vt = force_reg (maskmode, vt);
21362 mask = expand_simple_binop (maskmode, AND, mask, vt,
21363 NULL_RTX, 0, OPTAB_DIRECT);
21364
21365 if (GET_MODE (target) != mode)
21366 target = gen_reg_rtx (mode);
21367 xops[0] = target;
21368 xops[1] = gen_lowpart (mode, t2);
21369 xops[2] = gen_lowpart (mode, t1);
21370 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
21371 xops[4] = mask;
21372 xops[5] = vt;
21373 ok = ix86_expand_int_vcond (xops);
21374 gcc_assert (ok);
21375 if (target != operands[0])
21376 emit_move_insn (operands[0],
21377 gen_lowpart (GET_MODE (operands[0]), target));
21378 }
21379 }
21380
21381 /* Unpack SRC into DEST as the next wider integer vector type. UNSIGNED_P
21382 is true if we should do zero extension, else sign extension. HIGH_P is
21383 true if we want the N/2 high elements, else the low elements. */
21384
21385 void
21386 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
21387 {
21388 enum machine_mode imode = GET_MODE (src);
21389 rtx tmp;
21390
21391 if (TARGET_SSE4_1)
21392 {
21393 rtx (*unpack)(rtx, rtx);
21394 rtx (*extract)(rtx, rtx) = NULL;
21395 enum machine_mode halfmode = BLKmode;
21396
21397 switch (imode)
21398 {
21399 case V32QImode:
21400 if (unsigned_p)
21401 unpack = gen_avx2_zero_extendv16qiv16hi2;
21402 else
21403 unpack = gen_avx2_sign_extendv16qiv16hi2;
21404 halfmode = V16QImode;
21405 extract
21406 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
21407 break;
21408 case V16HImode:
21409 if (unsigned_p)
21410 unpack = gen_avx2_zero_extendv8hiv8si2;
21411 else
21412 unpack = gen_avx2_sign_extendv8hiv8si2;
21413 halfmode = V8HImode;
21414 extract
21415 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
21416 break;
21417 case V8SImode:
21418 if (unsigned_p)
21419 unpack = gen_avx2_zero_extendv4siv4di2;
21420 else
21421 unpack = gen_avx2_sign_extendv4siv4di2;
21422 halfmode = V4SImode;
21423 extract
21424 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
21425 break;
21426 case V16QImode:
21427 if (unsigned_p)
21428 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
21429 else
21430 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
21431 break;
21432 case V8HImode:
21433 if (unsigned_p)
21434 unpack = gen_sse4_1_zero_extendv4hiv4si2;
21435 else
21436 unpack = gen_sse4_1_sign_extendv4hiv4si2;
21437 break;
21438 case V4SImode:
21439 if (unsigned_p)
21440 unpack = gen_sse4_1_zero_extendv2siv2di2;
21441 else
21442 unpack = gen_sse4_1_sign_extendv2siv2di2;
21443 break;
21444 default:
21445 gcc_unreachable ();
21446 }
21447
21448 if (GET_MODE_SIZE (imode) == 32)
21449 {
21450 tmp = gen_reg_rtx (halfmode);
21451 emit_insn (extract (tmp, src));
21452 }
21453 else if (high_p)
21454 {
21455 /* Shift higher 8 bytes to lower 8 bytes. */
21456 tmp = gen_reg_rtx (V1TImode);
21457 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
21458 GEN_INT (64)));
21459 tmp = gen_lowpart (imode, tmp);
21460 }
21461 else
21462 tmp = src;
21463
21464 emit_insn (unpack (dest, tmp));
21465 }
21466 else
21467 {
21468 rtx (*unpack)(rtx, rtx, rtx);
21469
21470 switch (imode)
21471 {
21472 case V16QImode:
21473 if (high_p)
21474 unpack = gen_vec_interleave_highv16qi;
21475 else
21476 unpack = gen_vec_interleave_lowv16qi;
21477 break;
21478 case V8HImode:
21479 if (high_p)
21480 unpack = gen_vec_interleave_highv8hi;
21481 else
21482 unpack = gen_vec_interleave_lowv8hi;
21483 break;
21484 case V4SImode:
21485 if (high_p)
21486 unpack = gen_vec_interleave_highv4si;
21487 else
21488 unpack = gen_vec_interleave_lowv4si;
21489 break;
21490 default:
21491 gcc_unreachable ();
21492 }
21493
21494 if (unsigned_p)
21495 tmp = force_reg (imode, CONST0_RTX (imode));
21496 else
21497 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
21498 src, pc_rtx, pc_rtx);
21499
21500 rtx tmp2 = gen_reg_rtx (imode);
21501 emit_insn (unpack (tmp2, src, tmp));
21502 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
21503 }
21504 }
21505
21506 /* Expand conditional increment or decrement using adc/sbb instructions.
21507 The default case using setcc followed by the conditional move can be
21508 done by generic code. */
21509 bool
21510 ix86_expand_int_addcc (rtx operands[])
21511 {
21512 enum rtx_code code = GET_CODE (operands[1]);
21513 rtx flags;
21514 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
21515 rtx compare_op;
21516 rtx val = const0_rtx;
21517 bool fpcmp = false;
21518 enum machine_mode mode;
21519 rtx op0 = XEXP (operands[1], 0);
21520 rtx op1 = XEXP (operands[1], 1);
21521
21522 if (operands[3] != const1_rtx
21523 && operands[3] != constm1_rtx)
21524 return false;
21525 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
21526 return false;
21527 code = GET_CODE (compare_op);
21528
21529 flags = XEXP (compare_op, 0);
21530
21531 if (GET_MODE (flags) == CCFPmode
21532 || GET_MODE (flags) == CCFPUmode)
21533 {
21534 fpcmp = true;
21535 code = ix86_fp_compare_code_to_integer (code);
21536 }
21537
21538 if (code != LTU)
21539 {
21540 val = constm1_rtx;
21541 if (fpcmp)
21542 PUT_CODE (compare_op,
21543 reverse_condition_maybe_unordered
21544 (GET_CODE (compare_op)));
21545 else
21546 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
21547 }
21548
21549 mode = GET_MODE (operands[0]);
21550
21551 /* Construct either adc or sbb insn. */
21552 if ((code == LTU) == (operands[3] == constm1_rtx))
21553 {
21554 switch (mode)
21555 {
21556 case QImode:
21557 insn = gen_subqi3_carry;
21558 break;
21559 case HImode:
21560 insn = gen_subhi3_carry;
21561 break;
21562 case SImode:
21563 insn = gen_subsi3_carry;
21564 break;
21565 case DImode:
21566 insn = gen_subdi3_carry;
21567 break;
21568 default:
21569 gcc_unreachable ();
21570 }
21571 }
21572 else
21573 {
21574 switch (mode)
21575 {
21576 case QImode:
21577 insn = gen_addqi3_carry;
21578 break;
21579 case HImode:
21580 insn = gen_addhi3_carry;
21581 break;
21582 case SImode:
21583 insn = gen_addsi3_carry;
21584 break;
21585 case DImode:
21586 insn = gen_adddi3_carry;
21587 break;
21588 default:
21589 gcc_unreachable ();
21590 }
21591 }
21592 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
21593
21594 return true;
21595 }
21596
21597
21598 /* Split OPERAND into half-mode PARTS. Similar to split_double_mode,
21599 but works for floating point parameters and non-offsettable memories.
21600 For pushes, it returns just stack offsets; the values will be saved
21601 in the right order. At most four parts are generated. */
21602
21603 static int
21604 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
21605 {
21606 int size;
21607
21608 if (!TARGET_64BIT)
21609 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
21610 else
21611 size = (GET_MODE_SIZE (mode) + 4) / 8;
21612
21613 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
21614 gcc_assert (size >= 2 && size <= 4);
21615
21616 /* Optimize constant pool references to immediates. This is used by fp
21617 moves, which force all constants to memory to allow combining. */
21618 if (MEM_P (operand) && MEM_READONLY_P (operand))
21619 {
21620 rtx tmp = maybe_get_pool_constant (operand);
21621 if (tmp)
21622 operand = tmp;
21623 }
21624
21625 if (MEM_P (operand) && !offsettable_memref_p (operand))
21626 {
21627 /* The only non-offsettable memories we handle are pushes. */
21628 int ok = push_operand (operand, VOIDmode);
21629
21630 gcc_assert (ok);
21631
21632 operand = copy_rtx (operand);
21633 PUT_MODE (operand, word_mode);
21634 parts[0] = parts[1] = parts[2] = parts[3] = operand;
21635 return size;
21636 }
21637
21638 if (GET_CODE (operand) == CONST_VECTOR)
21639 {
21640 enum machine_mode imode = int_mode_for_mode (mode);
21641 /* Caution: if we looked through a constant pool memory above,
21642 the operand may actually have a different mode now. That's
21643 ok, since we want to pun this all the way back to an integer. */
21644 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
21645 gcc_assert (operand != NULL);
21646 mode = imode;
21647 }
21648
21649 if (!TARGET_64BIT)
21650 {
21651 if (mode == DImode)
21652 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21653 else
21654 {
21655 int i;
21656
21657 if (REG_P (operand))
21658 {
21659 gcc_assert (reload_completed);
21660 for (i = 0; i < size; i++)
21661 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
21662 }
21663 else if (offsettable_memref_p (operand))
21664 {
21665 operand = adjust_address (operand, SImode, 0);
21666 parts[0] = operand;
21667 for (i = 1; i < size; i++)
21668 parts[i] = adjust_address (operand, SImode, 4 * i);
21669 }
21670 else if (GET_CODE (operand) == CONST_DOUBLE)
21671 {
21672 REAL_VALUE_TYPE r;
21673 long l[4];
21674
21675 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21676 switch (mode)
21677 {
21678 case TFmode:
21679 real_to_target (l, &r, mode);
21680 parts[3] = gen_int_mode (l[3], SImode);
21681 parts[2] = gen_int_mode (l[2], SImode);
21682 break;
21683 case XFmode:
21684 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21685 long double may not be 80-bit. */
21686 real_to_target (l, &r, mode);
21687 parts[2] = gen_int_mode (l[2], SImode);
21688 break;
21689 case DFmode:
21690 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21691 break;
21692 default:
21693 gcc_unreachable ();
21694 }
21695 parts[1] = gen_int_mode (l[1], SImode);
21696 parts[0] = gen_int_mode (l[0], SImode);
21697 }
21698 else
21699 gcc_unreachable ();
21700 }
21701 }
21702 else
21703 {
21704 if (mode == TImode)
21705 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21706 if (mode == XFmode || mode == TFmode)
21707 {
21708 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21709 if (REG_P (operand))
21710 {
21711 gcc_assert (reload_completed);
21712 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21713 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21714 }
21715 else if (offsettable_memref_p (operand))
21716 {
21717 operand = adjust_address (operand, DImode, 0);
21718 parts[0] = operand;
21719 parts[1] = adjust_address (operand, upper_mode, 8);
21720 }
21721 else if (GET_CODE (operand) == CONST_DOUBLE)
21722 {
21723 REAL_VALUE_TYPE r;
21724 long l[4];
21725
21726 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21727 real_to_target (l, &r, mode);
21728
21729 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21730 if (HOST_BITS_PER_WIDE_INT >= 64)
21731 parts[0]
21732 = gen_int_mode
21733 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21734 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21735 DImode);
21736 else
21737 parts[0] = immed_double_const (l[0], l[1], DImode);
21738
21739 if (upper_mode == SImode)
21740 parts[1] = gen_int_mode (l[2], SImode);
21741 else if (HOST_BITS_PER_WIDE_INT >= 64)
21742 parts[1]
21743 = gen_int_mode
21744 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21745 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21746 DImode);
21747 else
21748 parts[1] = immed_double_const (l[2], l[3], DImode);
21749 }
21750 else
21751 gcc_unreachable ();
21752 }
21753 }
21754
21755 return size;
21756 }
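
/* Example of the splitting performed above: on a 32-bit target a DFmode
   operand yields two SImode parts, XFmode yields three and TFmode four;
   on a 64-bit target XFmode and TFmode are split into a DImode low part
   plus an SImode or DImode upper part, respectively.  */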
21757
21758 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21759 All required insns are emitted directly. Operands 2-5 receive the
21760 destination parts; operands 6-9 contain the input values in the
21761 correct order. */
21762
21763 void
21764 ix86_split_long_move (rtx operands[])
21765 {
21766 rtx part[2][4];
21767 int nparts, i, j;
21768 int push = 0;
21769 int collisions = 0;
21770 enum machine_mode mode = GET_MODE (operands[0]);
21771 bool collisionparts[4];
21772
21773 /* The DFmode expanders may ask us to move a double.
21774 For a 64-bit target this is a single move. By hiding that fact
21775 here we simplify the i386.md splitters. */
21776 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21777 {
21778 /* Optimize constant pool references to immediates. This is used by
21779 fp moves, which force all constants to memory to allow combining. */
21780
21781 if (MEM_P (operands[1])
21782 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21783 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21784 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21785 if (push_operand (operands[0], VOIDmode))
21786 {
21787 operands[0] = copy_rtx (operands[0]);
21788 PUT_MODE (operands[0], word_mode);
21789 }
21790 else
21791 operands[0] = gen_lowpart (DImode, operands[0]);
21792 operands[1] = gen_lowpart (DImode, operands[1]);
21793 emit_move_insn (operands[0], operands[1]);
21794 return;
21795 }
21796
21797 /* The only non-offsettable memory we handle is push. */
21798 if (push_operand (operands[0], VOIDmode))
21799 push = 1;
21800 else
21801 gcc_assert (!MEM_P (operands[0])
21802 || offsettable_memref_p (operands[0]));
21803
21804 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21805 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21806
21807 /* When emitting a push, take care with source operands located on the stack. */
21808 if (push && MEM_P (operands[1])
21809 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21810 {
21811 rtx src_base = XEXP (part[1][nparts - 1], 0);
21812
21813 /* Compensate for the stack decrement by 4. */
21814 if (!TARGET_64BIT && nparts == 3
21815 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21816 src_base = plus_constant (Pmode, src_base, 4);
21817
21818 /* src_base refers to the stack pointer and is
21819 automatically decreased by emitted push. */
21820 for (i = 0; i < nparts; i++)
21821 part[1][i] = change_address (part[1][i],
21822 GET_MODE (part[1][i]), src_base);
21823 }
21824
21825 /* We need to do the copy in the right order in case an address register
21826 of the source overlaps the destination. */
21827 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21828 {
21829 rtx tmp;
21830
21831 for (i = 0; i < nparts; i++)
21832 {
21833 collisionparts[i]
21834 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21835 if (collisionparts[i])
21836 collisions++;
21837 }
21838
21839 /* Collision in the middle part can be handled by reordering. */
21840 if (collisions == 1 && nparts == 3 && collisionparts [1])
21841 {
21842 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21843 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21844 }
21845 else if (collisions == 1
21846 && nparts == 4
21847 && (collisionparts [1] || collisionparts [2]))
21848 {
21849 if (collisionparts [1])
21850 {
21851 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21852 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21853 }
21854 else
21855 {
21856 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21857 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21858 }
21859 }
21860
21861 /* If there are more collisions, we can't handle it by reordering.
21862 Do an lea to the last part and use only one colliding move. */
21863 else if (collisions > 1)
21864 {
21865 rtx base;
21866
21867 collisions = 1;
21868
21869 base = part[0][nparts - 1];
21870
21871 /* Handle the case when the last part isn't valid for lea.
21872 Happens in 64-bit mode storing the 12-byte XFmode. */
21873 if (GET_MODE (base) != Pmode)
21874 base = gen_rtx_REG (Pmode, REGNO (base));
21875
21876 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21877 part[1][0] = replace_equiv_address (part[1][0], base);
21878 for (i = 1; i < nparts; i++)
21879 {
21880 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21881 part[1][i] = replace_equiv_address (part[1][i], tmp);
21882 }
21883 }
21884 }
21885
21886 if (push)
21887 {
21888 if (!TARGET_64BIT)
21889 {
21890 if (nparts == 3)
21891 {
21892 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21893 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21894 stack_pointer_rtx, GEN_INT (-4)));
21895 emit_move_insn (part[0][2], part[1][2]);
21896 }
21897 else if (nparts == 4)
21898 {
21899 emit_move_insn (part[0][3], part[1][3]);
21900 emit_move_insn (part[0][2], part[1][2]);
21901 }
21902 }
21903 else
21904 {
21905 /* In 64-bit mode we don't have a 32-bit push available. In case this is
21906 a register, it is OK - we will just use the larger counterpart. We also
21907 retype memory - this comes from an attempt to avoid the REX prefix on
21908 moving the second half of a TFmode value. */
21909 if (GET_MODE (part[1][1]) == SImode)
21910 {
21911 switch (GET_CODE (part[1][1]))
21912 {
21913 case MEM:
21914 part[1][1] = adjust_address (part[1][1], DImode, 0);
21915 break;
21916
21917 case REG:
21918 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21919 break;
21920
21921 default:
21922 gcc_unreachable ();
21923 }
21924
21925 if (GET_MODE (part[1][0]) == SImode)
21926 part[1][0] = part[1][1];
21927 }
21928 }
21929 emit_move_insn (part[0][1], part[1][1]);
21930 emit_move_insn (part[0][0], part[1][0]);
21931 return;
21932 }
21933
21934 /* Choose correct order to not overwrite the source before it is copied. */
21935 if ((REG_P (part[0][0])
21936 && REG_P (part[1][1])
21937 && (REGNO (part[0][0]) == REGNO (part[1][1])
21938 || (nparts == 3
21939 && REGNO (part[0][0]) == REGNO (part[1][2]))
21940 || (nparts == 4
21941 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21942 || (collisions > 0
21943 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21944 {
21945 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21946 {
21947 operands[2 + i] = part[0][j];
21948 operands[6 + i] = part[1][j];
21949 }
21950 }
21951 else
21952 {
21953 for (i = 0; i < nparts; i++)
21954 {
21955 operands[2 + i] = part[0][i];
21956 operands[6 + i] = part[1][i];
21957 }
21958 }
21959
21960 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21961 if (optimize_insn_for_size_p ())
21962 {
21963 for (j = 0; j < nparts - 1; j++)
21964 if (CONST_INT_P (operands[6 + j])
21965 && operands[6 + j] != const0_rtx
21966 && REG_P (operands[2 + j]))
21967 for (i = j; i < nparts - 1; i++)
21968 if (CONST_INT_P (operands[7 + i])
21969 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21970 operands[7 + i] = operands[2 + j];
21971 }
21972
21973 for (i = 0; i < nparts; i++)
21974 emit_move_insn (operands[2 + i], operands[6 + i]);
21975
21976 return;
21977 }
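
/* A worked example of the ordering logic above: when loading a multi-word
   value from memory into a register group on a 32-bit target, and the low
   destination register is also the register addressing the source, the
   parts are emitted in reverse order (high word first) so the address is
   not clobbered before the later loads; with more than one such collision
   an lea first materializes the address into the last destination
   register.  This is a sketch of the cases handled above, not an
   exhaustive description.  */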
21978
21979 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21980 left shift by a constant, either using a single shift or
21981 a sequence of add instructions. */
21982
21983 static void
21984 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21985 {
21986 rtx (*insn)(rtx, rtx, rtx);
21987
21988 if (count == 1
21989 || (count * ix86_cost->add <= ix86_cost->shift_const
21990 && !optimize_insn_for_size_p ()))
21991 {
21992 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21993 while (count-- > 0)
21994 emit_insn (insn (operand, operand, operand));
21995 }
21996 else
21997 {
21998 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21999 emit_insn (insn (operand, operand, GEN_INT (count)));
22000 }
22001 }
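
/* For example, with typical speed-tuned cost tables a left shift by 1 or 2
   may be emitted as one or two "add reg, reg" instructions (each doubling
   the value), while larger counts, or -Os, fall back to a single shl with
   an immediate count.  The actual choice follows the ix86_cost comparison
   above; this is only an illustration.  */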
22002
22003 void
22004 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
22005 {
22006 rtx (*gen_ashl3)(rtx, rtx, rtx);
22007 rtx (*gen_shld)(rtx, rtx, rtx);
22008 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22009
22010 rtx low[2], high[2];
22011 int count;
22012
22013 if (CONST_INT_P (operands[2]))
22014 {
22015 split_double_mode (mode, operands, 2, low, high);
22016 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22017
22018 if (count >= half_width)
22019 {
22020 emit_move_insn (high[0], low[1]);
22021 emit_move_insn (low[0], const0_rtx);
22022
22023 if (count > half_width)
22024 ix86_expand_ashl_const (high[0], count - half_width, mode);
22025 }
22026 else
22027 {
22028 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
22029
22030 if (!rtx_equal_p (operands[0], operands[1]))
22031 emit_move_insn (operands[0], operands[1]);
22032
22033 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
22034 ix86_expand_ashl_const (low[0], count, mode);
22035 }
22036 return;
22037 }
22038
22039 split_double_mode (mode, operands, 1, low, high);
22040
22041 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
22042
22043 if (operands[1] == const1_rtx)
22044 {
22045 /* Assuming we've chosen QImode-capable registers, 1 << N
22046 can be done with two 32/64-bit shifts, no branches, no cmoves. */
22047 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
22048 {
22049 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
22050
22051 ix86_expand_clear (low[0]);
22052 ix86_expand_clear (high[0]);
22053 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
22054
22055 d = gen_lowpart (QImode, low[0]);
22056 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
22057 s = gen_rtx_EQ (QImode, flags, const0_rtx);
22058 emit_insn (gen_rtx_SET (VOIDmode, d, s));
22059
22060 d = gen_lowpart (QImode, high[0]);
22061 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
22062 s = gen_rtx_NE (QImode, flags, const0_rtx);
22063 emit_insn (gen_rtx_SET (VOIDmode, d, s));
22064 }
22065
22066 /* Otherwise, we can get the same results by manually performing
22067 a bit extract operation on bit 5/6, and then performing the two
22068 shifts. The two methods of getting 0/1 into low/high are exactly
22069 the same size. Avoiding the shift in the bit extract case helps
22070 pentium4 a bit; no one else seems to care much either way. */
22071 else
22072 {
22073 enum machine_mode half_mode;
22074 rtx (*gen_lshr3)(rtx, rtx, rtx);
22075 rtx (*gen_and3)(rtx, rtx, rtx);
22076 rtx (*gen_xor3)(rtx, rtx, rtx);
22077 HOST_WIDE_INT bits;
22078 rtx x;
22079
22080 if (mode == DImode)
22081 {
22082 half_mode = SImode;
22083 gen_lshr3 = gen_lshrsi3;
22084 gen_and3 = gen_andsi3;
22085 gen_xor3 = gen_xorsi3;
22086 bits = 5;
22087 }
22088 else
22089 {
22090 half_mode = DImode;
22091 gen_lshr3 = gen_lshrdi3;
22092 gen_and3 = gen_anddi3;
22093 gen_xor3 = gen_xordi3;
22094 bits = 6;
22095 }
22096
22097 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
22098 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
22099 else
22100 x = gen_lowpart (half_mode, operands[2]);
22101 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
22102
22103 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
22104 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
22105 emit_move_insn (low[0], high[0]);
22106 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
22107 }
22108
22109 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
22110 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
22111 return;
22112 }
22113
22114 if (operands[1] == constm1_rtx)
22115 {
22116 /* For -1 << N, we can avoid the shld instruction, because we
22117 know that we're shifting 0...31/63 ones into a -1. */
22118 emit_move_insn (low[0], constm1_rtx);
22119 if (optimize_insn_for_size_p ())
22120 emit_move_insn (high[0], low[0]);
22121 else
22122 emit_move_insn (high[0], constm1_rtx);
22123 }
22124 else
22125 {
22126 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
22127
22128 if (!rtx_equal_p (operands[0], operands[1]))
22129 emit_move_insn (operands[0], operands[1]);
22130
22131 split_double_mode (mode, operands, 1, low, high);
22132 emit_insn (gen_shld (high[0], low[0], operands[2]));
22133 }
22134
22135 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
22136
22137 if (TARGET_CMOVE && scratch)
22138 {
22139 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22140 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22141
22142 ix86_expand_clear (scratch);
22143 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
22144 }
22145 else
22146 {
22147 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22148 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22149
22150 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
22151 }
22152 }
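
/* Sketch of the code produced above for a variable DImode shift left on a
   32-bit target (halves live in SImode registers):

       shld  %cl, low, high     # shift bits from low into high
       sal   %cl, low
       # adjust for count >= 32: move low into high and clear low,
       # using cmov when a scratch register and TARGET_CMOVE are
       # available, or a short branch sequence otherwise.

   Constant counts are handled separately above and need no adjustment.  */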
22153
22154 void
22155 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
22156 {
22157 rtx (*gen_ashr3)(rtx, rtx, rtx)
22158 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
22159 rtx (*gen_shrd)(rtx, rtx, rtx);
22160 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22161
22162 rtx low[2], high[2];
22163 int count;
22164
22165 if (CONST_INT_P (operands[2]))
22166 {
22167 split_double_mode (mode, operands, 2, low, high);
22168 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22169
22170 if (count == GET_MODE_BITSIZE (mode) - 1)
22171 {
22172 emit_move_insn (high[0], high[1]);
22173 emit_insn (gen_ashr3 (high[0], high[0],
22174 GEN_INT (half_width - 1)));
22175 emit_move_insn (low[0], high[0]);
22176
22177 }
22178 else if (count >= half_width)
22179 {
22180 emit_move_insn (low[0], high[1]);
22181 emit_move_insn (high[0], low[0]);
22182 emit_insn (gen_ashr3 (high[0], high[0],
22183 GEN_INT (half_width - 1)));
22184
22185 if (count > half_width)
22186 emit_insn (gen_ashr3 (low[0], low[0],
22187 GEN_INT (count - half_width)));
22188 }
22189 else
22190 {
22191 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22192
22193 if (!rtx_equal_p (operands[0], operands[1]))
22194 emit_move_insn (operands[0], operands[1]);
22195
22196 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
22197 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
22198 }
22199 }
22200 else
22201 {
22202 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22203
22204 if (!rtx_equal_p (operands[0], operands[1]))
22205 emit_move_insn (operands[0], operands[1]);
22206
22207 split_double_mode (mode, operands, 1, low, high);
22208
22209 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22210 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
22211
22212 if (TARGET_CMOVE && scratch)
22213 {
22214 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22215 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22216
22217 emit_move_insn (scratch, high[0]);
22218 emit_insn (gen_ashr3 (scratch, scratch,
22219 GEN_INT (half_width - 1)));
22220 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22221 scratch));
22222 }
22223 else
22224 {
22225 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
22226 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
22227
22228 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
22229 }
22230 }
22231 }
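
/* Example of the constant special case above: an arithmetic right shift of
   a DImode value by 63 on a 32-bit target needs no shrd at all; the high
   word is shifted right by 31, leaving just the sign, and then copied into
   the low word, so both halves end up holding the sign mask.  */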
22232
22233 void
22234 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
22235 {
22236 rtx (*gen_lshr3)(rtx, rtx, rtx)
22237 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
22238 rtx (*gen_shrd)(rtx, rtx, rtx);
22239 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22240
22241 rtx low[2], high[2];
22242 int count;
22243
22244 if (CONST_INT_P (operands[2]))
22245 {
22246 split_double_mode (mode, operands, 2, low, high);
22247 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22248
22249 if (count >= half_width)
22250 {
22251 emit_move_insn (low[0], high[1]);
22252 ix86_expand_clear (high[0]);
22253
22254 if (count > half_width)
22255 emit_insn (gen_lshr3 (low[0], low[0],
22256 GEN_INT (count - half_width)));
22257 }
22258 else
22259 {
22260 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22261
22262 if (!rtx_equal_p (operands[0], operands[1]))
22263 emit_move_insn (operands[0], operands[1]);
22264
22265 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
22266 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
22267 }
22268 }
22269 else
22270 {
22271 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22272
22273 if (!rtx_equal_p (operands[0], operands[1]))
22274 emit_move_insn (operands[0], operands[1]);
22275
22276 split_double_mode (mode, operands, 1, low, high);
22277
22278 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22279 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
22280
22281 if (TARGET_CMOVE && scratch)
22282 {
22283 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22284 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22285
22286 ix86_expand_clear (scratch);
22287 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22288 scratch));
22289 }
22290 else
22291 {
22292 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22293 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22294
22295 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
22296 }
22297 }
22298 }
22299
22300 /* Predict just emitted jump instruction to be taken with probability PROB. */
22301 static void
22302 predict_jump (int prob)
22303 {
22304 rtx insn = get_last_insn ();
22305 gcc_assert (JUMP_P (insn));
22306 add_int_reg_note (insn, REG_BR_PROB, prob);
22307 }
22308
22309 /* Helper function for the string operations below. Test VARIABLE whether
22310 it is aligned to VALUE bytes. If true, jump to the label. */
22311 static rtx
22312 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
22313 {
22314 rtx label = gen_label_rtx ();
22315 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
22316 if (GET_MODE (variable) == DImode)
22317 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
22318 else
22319 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
22320 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
22321 1, label);
22322 if (epilogue)
22323 predict_jump (REG_BR_PROB_BASE * 50 / 100);
22324 else
22325 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22326 return label;
22327 }
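
/* For instance, ix86_expand_aligntest (count, 4, true) emits roughly

       tmp = count & 4;  if (tmp == 0) goto label;

   so the 4-byte move or store the caller emits between this call and
   emit_label (label) runs only when that bit of COUNT is set.  The branch
   probability note differs between prologue and epilogue uses.  */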
22328
22329 /* Adjust COUNTER by the VALUE. */
22330 static void
22331 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
22332 {
22333 rtx (*gen_add)(rtx, rtx, rtx)
22334 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
22335
22336 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
22337 }
22338
22339 /* Zero extend EXP, which is possibly in SImode, to a Pmode register. */
22340 rtx
22341 ix86_zero_extend_to_Pmode (rtx exp)
22342 {
22343 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
22344 }
22345
22346 /* Divide COUNTREG by SCALE. */
22347 static rtx
22348 scale_counter (rtx countreg, int scale)
22349 {
22350 rtx sc;
22351
22352 if (scale == 1)
22353 return countreg;
22354 if (CONST_INT_P (countreg))
22355 return GEN_INT (INTVAL (countreg) / scale);
22356 gcc_assert (REG_P (countreg));
22357
22358 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
22359 GEN_INT (exact_log2 (scale)),
22360 NULL, 1, OPTAB_DIRECT);
22361 return sc;
22362 }
22363
22364 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
22365 DImode for constant loop counts. */
22366
22367 static enum machine_mode
22368 counter_mode (rtx count_exp)
22369 {
22370 if (GET_MODE (count_exp) != VOIDmode)
22371 return GET_MODE (count_exp);
22372 if (!CONST_INT_P (count_exp))
22373 return Pmode;
22374 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
22375 return DImode;
22376 return SImode;
22377 }
22378
22379 /* Copy the address to a Pmode register. This is used for x32 to
22380 truncate DImode TLS address to a SImode register. */
22381
22382 static rtx
22383 ix86_copy_addr_to_reg (rtx addr)
22384 {
22385 if (GET_MODE (addr) == Pmode)
22386 return copy_addr_to_reg (addr);
22387 else
22388 {
22389 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
22390 return gen_rtx_SUBREG (SImode, copy_to_mode_reg (DImode, addr), 0);
22391 }
22392 }
22393
22394 /* When ISSETMEM is FALSE, output a simple loop to copy the memory pointed to
22395 by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall
22396 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
22397 equivalent loop to fill the memory with VALUE (assumed to be in MODE).
22398
22399 The size is rounded down to a whole number of chunks moved at once.
22400 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
22401
22402
22403 static void
22404 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
22405 rtx destptr, rtx srcptr, rtx value,
22406 rtx count, enum machine_mode mode, int unroll,
22407 int expected_size, bool issetmem)
22408 {
22409 rtx out_label, top_label, iter, tmp;
22410 enum machine_mode iter_mode = counter_mode (count);
22411 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
22412 rtx piece_size = GEN_INT (piece_size_n);
22413 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
22414 rtx size;
22415 int i;
22416
22417 top_label = gen_label_rtx ();
22418 out_label = gen_label_rtx ();
22419 iter = gen_reg_rtx (iter_mode);
22420
22421 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
22422 NULL, 1, OPTAB_DIRECT);
22423 /* Those two should combine. */
22424 if (piece_size == const1_rtx)
22425 {
22426 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
22427 true, out_label);
22428 predict_jump (REG_BR_PROB_BASE * 10 / 100);
22429 }
22430 emit_move_insn (iter, const0_rtx);
22431
22432 emit_label (top_label);
22433
22434 tmp = convert_modes (Pmode, iter_mode, iter, true);
22435
22436 /* This assert could be relaxed - in that case we'll need to compute
22437 the smallest power of two containing PIECE_SIZE_N and pass it to
22438 offset_address. */
22439 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
22440 destmem = offset_address (destmem, tmp, piece_size_n);
22441 destmem = adjust_address (destmem, mode, 0);
22442
22443 if (!issetmem)
22444 {
22445 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
22446 srcmem = adjust_address (srcmem, mode, 0);
22447
22448 /* When unrolling for chips that reorder memory reads and writes,
22449 we can save registers by using a single temporary.
22450 Also, using 4 temporaries is overkill in 32-bit mode. */
22451 if (!TARGET_64BIT && 0)
22452 {
22453 for (i = 0; i < unroll; i++)
22454 {
22455 if (i)
22456 {
22457 destmem =
22458 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22459 srcmem =
22460 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22461 }
22462 emit_move_insn (destmem, srcmem);
22463 }
22464 }
22465 else
22466 {
22467 rtx tmpreg[4];
22468 gcc_assert (unroll <= 4);
22469 for (i = 0; i < unroll; i++)
22470 {
22471 tmpreg[i] = gen_reg_rtx (mode);
22472 if (i)
22473 {
22474 srcmem =
22475 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22476 }
22477 emit_move_insn (tmpreg[i], srcmem);
22478 }
22479 for (i = 0; i < unroll; i++)
22480 {
22481 if (i)
22482 {
22483 destmem =
22484 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22485 }
22486 emit_move_insn (destmem, tmpreg[i]);
22487 }
22488 }
22489 }
22490 else
22491 for (i = 0; i < unroll; i++)
22492 {
22493 if (i)
22494 destmem =
22495 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22496 emit_move_insn (destmem, value);
22497 }
22498
22499 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
22500 true, OPTAB_LIB_WIDEN);
22501 if (tmp != iter)
22502 emit_move_insn (iter, tmp);
22503
22504 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
22505 true, top_label);
22506 if (expected_size != -1)
22507 {
22508 expected_size /= GET_MODE_SIZE (mode) * unroll;
22509 if (expected_size == 0)
22510 predict_jump (0);
22511 else if (expected_size > REG_BR_PROB_BASE)
22512 predict_jump (REG_BR_PROB_BASE - 1);
22513 else
22514 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
22515 }
22516 else
22517 predict_jump (REG_BR_PROB_BASE * 80 / 100);
22518 iter = ix86_zero_extend_to_Pmode (iter);
22519 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
22520 true, OPTAB_LIB_WIDEN);
22521 if (tmp != destptr)
22522 emit_move_insn (destptr, tmp);
22523 if (!issetmem)
22524 {
22525 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
22526 true, OPTAB_LIB_WIDEN);
22527 if (tmp != srcptr)
22528 emit_move_insn (srcptr, tmp);
22529 }
22530 emit_label (out_label);
22531 }
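
/* The loop emitted above has roughly this shape (memcpy case, before
   register allocation):

       size = count & -(chunk * unroll);  iter = 0;
     top:
       <unrolled moves of MODE-sized chunks at dest+iter / src+iter>
       iter += chunk * unroll;
       if (iter < size) goto top;   /- branch weighted from expected_size -/
       dest += iter;  src += iter;
     out:

   For setmem the loads are replaced by stores of VALUE.  This is only a
   sketch of the structure, not the exact RTL.  */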
22532
22533 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
22534 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
22535 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
22536 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
22537 ORIG_VALUE is the original value passed to memset to fill the memory with.
22538 Other arguments have the same meaning as for the previous function. */
22539
22540 static void
22541 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
22542 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
22543 rtx count,
22544 enum machine_mode mode, bool issetmem)
22545 {
22546 rtx destexp;
22547 rtx srcexp;
22548 rtx countreg;
22549 HOST_WIDE_INT rounded_count;
22550
22551 /* If possible, it is shorter to use rep movs.
22552 TODO: Maybe it is better to move this logic to decide_alg. */
22553 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
22554 && (!issetmem || orig_value == const0_rtx))
22555 mode = SImode;
22556
22557 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22558 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22559
22560 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
22561 GET_MODE_SIZE (mode)));
22562 if (mode != QImode)
22563 {
22564 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22565 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22566 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22567 }
22568 else
22569 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22570 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
22571 {
22572 rounded_count = (INTVAL (count)
22573 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22574 destmem = shallow_copy_rtx (destmem);
22575 set_mem_size (destmem, rounded_count);
22576 }
22577 else if (MEM_SIZE_KNOWN_P (destmem))
22578 clear_mem_size (destmem);
22579
22580 if (issetmem)
22581 {
22582 value = force_reg (mode, gen_lowpart (mode, value));
22583 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
22584 }
22585 else
22586 {
22587 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
22588 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
22589 if (mode != QImode)
22590 {
22591 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
22592 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22593 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
22594 }
22595 else
22596 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
22597 if (CONST_INT_P (count))
22598 {
22599 rounded_count = (INTVAL (count)
22600 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22601 srcmem = shallow_copy_rtx (srcmem);
22602 set_mem_size (srcmem, rounded_count);
22603 }
22604 else
22605 {
22606 if (MEM_SIZE_KNOWN_P (srcmem))
22607 clear_mem_size (srcmem);
22608 }
22609 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
22610 destexp, srcexp));
22611 }
22612 }
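
/* As a concrete illustration, a memcpy whose count is a multiple of 4,
   expanded here with mode == SImode, boils down to loading the byte count
   shifted right by 2 into %ecx and issuing "rep movsl" (destination in
   %edi, source in %esi); the memset case similarly issues "rep stosl"
   with the promoted value in %eax.  The destexp/srcexp expressions merely
   describe the final pointer values for the benefit of later passes.  */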
22613
22614 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
22615 DESTMEM.
22616 SRCMEM is passed by pointer and is updated on return.
22617 The return value is the updated DESTMEM. */
22618 static rtx
22619 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
22620 HOST_WIDE_INT size_to_move)
22621 {
22622 rtx dst = destmem, src = *srcmem, adjust, tempreg;
22623 enum insn_code code;
22624 enum machine_mode move_mode;
22625 int piece_size, i;
22626
22627 /* Find the widest mode in which we could perform moves.
22628 Start with the biggest power of 2 not greater than SIZE_TO_MOVE and
22629 halve it until a move of that size is supported. */
22630 piece_size = 1 << floor_log2 (size_to_move);
22631 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22632 code = optab_handler (mov_optab, move_mode);
22633 while (code == CODE_FOR_nothing && piece_size > 1)
22634 {
22635 piece_size >>= 1;
22636 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22637 code = optab_handler (mov_optab, move_mode);
22638 }
22639
22640 /* Find the corresponding vector mode with the same size as MOVE_MODE.
22641 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
22642 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
22643 {
22644 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
22645 move_mode = mode_for_vector (word_mode, nunits);
22646 code = optab_handler (mov_optab, move_mode);
22647 if (code == CODE_FOR_nothing)
22648 {
22649 move_mode = word_mode;
22650 piece_size = GET_MODE_SIZE (move_mode);
22651 code = optab_handler (mov_optab, move_mode);
22652 }
22653 }
22654 gcc_assert (code != CODE_FOR_nothing);
22655
22656 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22657 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
22658
22659 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
22660 gcc_assert (size_to_move % piece_size == 0);
22661 adjust = GEN_INT (piece_size);
22662 for (i = 0; i < size_to_move; i += piece_size)
22663 {
22664 /* We move from memory to memory, so we'll need to do it via
22665 a temporary register. */
22666 tempreg = gen_reg_rtx (move_mode);
22667 emit_insn (GEN_FCN (code) (tempreg, src));
22668 emit_insn (GEN_FCN (code) (dst, tempreg));
22669
22670 emit_move_insn (destptr,
22671 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22672 emit_move_insn (srcptr,
22673 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
22674
22675 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22676 piece_size);
22677 src = adjust_automodify_address_nv (src, move_mode, srcptr,
22678 piece_size);
22679 }
22680
22681 /* Update DST and SRC rtx. */
22682 *srcmem = src;
22683 return dst;
22684 }
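
/* Example: emit_memmov (dst, &src, destptr, srcptr, 16) with SSE enabled
   typically performs the copy as a single 16-byte vector load and store
   through a temporary register (e.g. V2DImode or V4SImode, depending on
   word size), advancing DESTPTR and SRCPTR by 16; without a suitable
   vector move it falls back to word_mode pieces.  This is a sketch of the
   mode search above, not a guarantee of the exact mode chosen.  */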
22685
22686 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
22687 static void
22688 expand_movmem_epilogue (rtx destmem, rtx srcmem,
22689 rtx destptr, rtx srcptr, rtx count, int max_size)
22690 {
22691 rtx src, dest;
22692 if (CONST_INT_P (count))
22693 {
22694 HOST_WIDE_INT countval = INTVAL (count);
22695 HOST_WIDE_INT epilogue_size = countval % max_size;
22696 int i;
22697
22698 /* For now MAX_SIZE should be a power of 2. This assert could be
22699 relaxed, but it'll require a bit more complicated epilogue
22700 expanding. */
22701 gcc_assert ((max_size & (max_size - 1)) == 0);
22702 for (i = max_size; i >= 1; i >>= 1)
22703 {
22704 if (epilogue_size & i)
22705 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22706 }
22707 return;
22708 }
22709 if (max_size > 8)
22710 {
22711 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
22712 count, 1, OPTAB_DIRECT);
22713 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22714 count, QImode, 1, 4, false);
22715 return;
22716 }
22717
22718 /* When single stringop instructions are available, they cheaply advance
22719 the dest and src pointers for us. Otherwise we save code size by
22720 maintaining an offset (zero is readily available from the preceding rep
22721 operation) and using x86 addressing modes. */
22722 if (TARGET_SINGLE_STRINGOP)
22723 {
22724 if (max_size > 4)
22725 {
22726 rtx label = ix86_expand_aligntest (count, 4, true);
22727 src = change_address (srcmem, SImode, srcptr);
22728 dest = change_address (destmem, SImode, destptr);
22729 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22730 emit_label (label);
22731 LABEL_NUSES (label) = 1;
22732 }
22733 if (max_size > 2)
22734 {
22735 rtx label = ix86_expand_aligntest (count, 2, true);
22736 src = change_address (srcmem, HImode, srcptr);
22737 dest = change_address (destmem, HImode, destptr);
22738 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22739 emit_label (label);
22740 LABEL_NUSES (label) = 1;
22741 }
22742 if (max_size > 1)
22743 {
22744 rtx label = ix86_expand_aligntest (count, 1, true);
22745 src = change_address (srcmem, QImode, srcptr);
22746 dest = change_address (destmem, QImode, destptr);
22747 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22748 emit_label (label);
22749 LABEL_NUSES (label) = 1;
22750 }
22751 }
22752 else
22753 {
22754 rtx offset = force_reg (Pmode, const0_rtx);
22755 rtx tmp;
22756
22757 if (max_size > 4)
22758 {
22759 rtx label = ix86_expand_aligntest (count, 4, true);
22760 src = change_address (srcmem, SImode, srcptr);
22761 dest = change_address (destmem, SImode, destptr);
22762 emit_move_insn (dest, src);
22763 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22764 true, OPTAB_LIB_WIDEN);
22765 if (tmp != offset)
22766 emit_move_insn (offset, tmp);
22767 emit_label (label);
22768 LABEL_NUSES (label) = 1;
22769 }
22770 if (max_size > 2)
22771 {
22772 rtx label = ix86_expand_aligntest (count, 2, true);
22773 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22774 src = change_address (srcmem, HImode, tmp);
22775 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22776 dest = change_address (destmem, HImode, tmp);
22777 emit_move_insn (dest, src);
22778 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22779 true, OPTAB_LIB_WIDEN);
22780 if (tmp != offset)
22781 emit_move_insn (offset, tmp);
22782 emit_label (label);
22783 LABEL_NUSES (label) = 1;
22784 }
22785 if (max_size > 1)
22786 {
22787 rtx label = ix86_expand_aligntest (count, 1, true);
22788 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22789 src = change_address (srcmem, QImode, tmp);
22790 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22791 dest = change_address (destmem, QImode, tmp);
22792 emit_move_insn (dest, src);
22793 emit_label (label);
22794 LABEL_NUSES (label) = 1;
22795 }
22796 }
22797 }
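
/* Example of the constant-count branch above: with max_size == 8 and a
   residual count of 7 bytes, the epilogue is emitted as one 4-byte, one
   2-byte and one 1-byte move, with no branches.  The non-constant branch
   instead tests the relevant count bits at run time, or falls back to a
   byte loop when max_size > 8.  */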
22798
22799 /* This function emits stores to fill SIZE_TO_MOVE bytes starting at DESTMEM
22800 with value PROMOTED_VAL.
22801 DESTPTR is advanced as the stores are emitted.
22802 The return value is the updated DESTMEM. */
22803 static rtx
22804 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
22805 HOST_WIDE_INT size_to_move)
22806 {
22807 rtx dst = destmem, adjust;
22808 enum insn_code code;
22809 enum machine_mode move_mode;
22810 int piece_size, i;
22811
22812 /* Choose the mode in which to perform the stores: normally the mode of
22813 PROMOTED_VAL, reduced to a narrower integer mode when SIZE_TO_MOVE is
22814 smaller than that mode. */
22815 move_mode = GET_MODE (promoted_val);
22816 if (move_mode == VOIDmode)
22817 move_mode = QImode;
22818 if (size_to_move < GET_MODE_SIZE (move_mode))
22819 {
22820 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
22821 promoted_val = gen_lowpart (move_mode, promoted_val);
22822 }
22823 piece_size = GET_MODE_SIZE (move_mode);
22824 code = optab_handler (mov_optab, move_mode);
22825 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
22826
22827 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22828
22829 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
22830 gcc_assert (size_to_move % piece_size == 0);
22831 adjust = GEN_INT (piece_size);
22832 for (i = 0; i < size_to_move; i += piece_size)
22833 {
22834 if (piece_size <= GET_MODE_SIZE (word_mode))
22835 {
22836 emit_insn (gen_strset (destptr, dst, promoted_val));
22837 continue;
22838 }
22839
22840 emit_insn (GEN_FCN (code) (dst, promoted_val));
22841
22842 emit_move_insn (destptr,
22843 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22844
22845 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22846 piece_size);
22847 }
22848
22849 /* Update DST rtx. */
22850 return dst;
22851 }
22852 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22853 static void
22854 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22855 rtx count, int max_size)
22856 {
22857 count =
22858 expand_simple_binop (counter_mode (count), AND, count,
22859 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22860 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22861 gen_lowpart (QImode, value), count, QImode,
22862 1, max_size / 2, true);
22863 }
22864
22865 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22866 static void
22867 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
22868 rtx count, int max_size)
22869 {
22870 rtx dest;
22871
22872 if (CONST_INT_P (count))
22873 {
22874 HOST_WIDE_INT countval = INTVAL (count);
22875 HOST_WIDE_INT epilogue_size = countval % max_size;
22876 int i;
22877
22878 /* For now MAX_SIZE should be a power of 2. This assert could be
22879 relaxed, but it'll require a bit more complicated epilogue
22880 expanding. */
22881 gcc_assert ((max_size & (max_size - 1)) == 0);
22882 for (i = max_size; i >= 1; i >>= 1)
22883 {
22884 if (epilogue_size & i)
22885 {
22886 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
22887 destmem = emit_memset (destmem, destptr, vec_value, i);
22888 else
22889 destmem = emit_memset (destmem, destptr, value, i);
22890 }
22891 }
22892 return;
22893 }
22894 if (max_size > 32)
22895 {
22896 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22897 return;
22898 }
22899 if (max_size > 16)
22900 {
22901 rtx label = ix86_expand_aligntest (count, 16, true);
22902 if (TARGET_64BIT)
22903 {
22904 dest = change_address (destmem, DImode, destptr);
22905 emit_insn (gen_strset (destptr, dest, value));
22906 emit_insn (gen_strset (destptr, dest, value));
22907 }
22908 else
22909 {
22910 dest = change_address (destmem, SImode, destptr);
22911 emit_insn (gen_strset (destptr, dest, value));
22912 emit_insn (gen_strset (destptr, dest, value));
22913 emit_insn (gen_strset (destptr, dest, value));
22914 emit_insn (gen_strset (destptr, dest, value));
22915 }
22916 emit_label (label);
22917 LABEL_NUSES (label) = 1;
22918 }
22919 if (max_size > 8)
22920 {
22921 rtx label = ix86_expand_aligntest (count, 8, true);
22922 if (TARGET_64BIT)
22923 {
22924 dest = change_address (destmem, DImode, destptr);
22925 emit_insn (gen_strset (destptr, dest, value));
22926 }
22927 else
22928 {
22929 dest = change_address (destmem, SImode, destptr);
22930 emit_insn (gen_strset (destptr, dest, value));
22931 emit_insn (gen_strset (destptr, dest, value));
22932 }
22933 emit_label (label);
22934 LABEL_NUSES (label) = 1;
22935 }
22936 if (max_size > 4)
22937 {
22938 rtx label = ix86_expand_aligntest (count, 4, true);
22939 dest = change_address (destmem, SImode, destptr);
22940 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22941 emit_label (label);
22942 LABEL_NUSES (label) = 1;
22943 }
22944 if (max_size > 2)
22945 {
22946 rtx label = ix86_expand_aligntest (count, 2, true);
22947 dest = change_address (destmem, HImode, destptr);
22948 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22949 emit_label (label);
22950 LABEL_NUSES (label) = 1;
22951 }
22952 if (max_size > 1)
22953 {
22954 rtx label = ix86_expand_aligntest (count, 1, true);
22955 dest = change_address (destmem, QImode, destptr);
22956 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22957 emit_label (label);
22958 LABEL_NUSES (label) = 1;
22959 }
22960 }
22961
22962 /* Depending on ISSETMEM, either copy enough bytes from SRCMEM to DESTMEM, or
22963 store enough bytes into DESTMEM, to align it to DESIRED_ALIGNMENT. The
22964 original alignment is ALIGN. Depending on ISSETMEM, either the arguments
22965 SRCMEM/SRCPTR or VALUE/VEC_VALUE are ignored.
22966 Return value is the updated DESTMEM. */
22967 static rtx
22968 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
22969 rtx destptr, rtx srcptr, rtx value,
22970 rtx vec_value, rtx count, int align,
22971 int desired_alignment, bool issetmem)
22972 {
22973 int i;
22974 for (i = 1; i < desired_alignment; i <<= 1)
22975 {
22976 if (align <= i)
22977 {
22978 rtx label = ix86_expand_aligntest (destptr, i, false);
22979 if (issetmem)
22980 {
22981 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
22982 destmem = emit_memset (destmem, destptr, vec_value, i);
22983 else
22984 destmem = emit_memset (destmem, destptr, value, i);
22985 }
22986 else
22987 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22988 ix86_adjust_counter (count, i);
22989 emit_label (label);
22990 LABEL_NUSES (label) = 1;
22991 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
22992 }
22993 }
22994 return destmem;
22995 }
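
/* For instance, with ALIGN == 1 and DESIRED_ALIGNMENT == 16 the prologue
   emitted above tests bits 0, 1, 2 and 3 of DESTPTR in turn; each set bit
   guards a 1-, 2-, 4- or 8-byte copy (or store, for setmem) and a matching
   decrement of COUNT, after which the destination is known to be 16-byte
   aligned.  */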
22996
22997 /* Test if COUNT&SIZE is nonzero and if so, expand a movmem
22998 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
22999 and jump to DONE_LABEL. */
23000 static void
23001 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
23002 rtx destptr, rtx srcptr,
23003 rtx value, rtx vec_value,
23004 rtx count, int size,
23005 rtx done_label, bool issetmem)
23006 {
23007 rtx label = ix86_expand_aligntest (count, size, false);
23008 enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
23009 rtx modesize;
23010 int n;
23011
23012 /* If we do not have a vector value to copy, we may have to reduce the size. */
23013 if (issetmem)
23014 {
23015 if (!vec_value)
23016 {
23017 if (GET_MODE (value) == VOIDmode && size > 8)
23018 mode = Pmode;
23019 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
23020 mode = GET_MODE (value);
23021 }
23022 else
23023 mode = GET_MODE (vec_value), value = vec_value;
23024 }
23025 else
23026 {
23027 /* Choose appropriate vector mode. */
23028 if (size >= 32)
23029 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
23030 else if (size >= 16)
23031 mode = TARGET_SSE ? V16QImode : DImode;
23032 srcmem = change_address (srcmem, mode, srcptr);
23033 }
23034 destmem = change_address (destmem, mode, destptr);
23035 modesize = GEN_INT (GET_MODE_SIZE (mode));
23036 gcc_assert (GET_MODE_SIZE (mode) <= size);
23037 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
23038 {
23039 if (issetmem)
23040 emit_move_insn (destmem, gen_lowpart (mode, value));
23041 else
23042 {
23043 emit_move_insn (destmem, srcmem);
23044 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
23045 }
23046 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
23047 }
23048
23049 destmem = offset_address (destmem, count, 1);
23050 destmem = offset_address (destmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
23051 GET_MODE_SIZE (mode));
23052 if (issetmem)
23053 emit_move_insn (destmem, gen_lowpart (mode, value));
23054 else
23055 {
23056 srcmem = offset_address (srcmem, count, 1);
23057 srcmem = offset_address (srcmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
23058 GET_MODE_SIZE (mode));
23059 emit_move_insn (destmem, srcmem);
23060 }
23061 emit_jump_insn (gen_jump (done_label));
23062 emit_barrier ();
23063
23064 emit_label (label);
23065 LABEL_NUSES (label) = 1;
23066 }
23067
23068 /* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
23069 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
23070 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that lets us
23071 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
23072 DONE_LABEL is a label after the whole copying sequence. The label is created
23073 on demand if *DONE_LABEL is NULL.
23074 MIN_SIZE is the minimal size of the copied block. This value gets adjusted for
23075 the new bounds after the initial copies.
23076
23077 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
23078 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
23079 we will dispatch to a library call for large blocks.
23080
23081 In pseudocode we do:
23082
23083 if (COUNT < SIZE)
23084 {
23085 Assume that SIZE is 4. Bigger sizes are handled analogously
23086 if (COUNT & 4)
23087 {
23088 copy 4 bytes from SRCPTR to DESTPTR
23089 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
23090 goto done_label
23091 }
23092 if (!COUNT)
23093 goto done_label;
23094 copy 1 byte from SRCPTR to DESTPTR
23095 if (COUNT & 2)
23096 {
23097 copy 2 bytes from SRCPTR to DESTPTR
23098 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
23099 }
23100 }
23101 else
23102 {
23103 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
23104 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
23105
23106 OLD_DESTPTR = DESTPTR;
23107 Align DESTPTR up to DESIRED_ALIGN
23108 SRCPTR += DESTPTR - OLD_DESTPTR
23109 COUNT -= DESTPTR - OLD_DESTPTR
23110 if (DYNAMIC_CHECK)
23111 Round COUNT down to multiple of SIZE
23112 << optional caller supplied zero size guard is here >>
23113 << optional caller supplied dynamic check is here >>
23114 << caller supplied main copy loop is here >>
23115 }
23116 done_label:
23117 */
23118 static void
23119 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
23120 rtx *destptr, rtx *srcptr,
23121 enum machine_mode mode,
23122 rtx value, rtx vec_value,
23123 rtx *count,
23124 rtx *done_label,
23125 int size,
23126 int desired_align,
23127 int align,
23128 unsigned HOST_WIDE_INT *min_size,
23129 bool dynamic_check,
23130 bool issetmem)
23131 {
23132 rtx loop_label = NULL, label;
23133 int n;
23134 rtx modesize;
23135 int prolog_size = 0;
23136 rtx mode_value;
23137
23138 /* Choose the proper value to copy. */
23139 if (issetmem && VECTOR_MODE_P (mode))
23140 mode_value = vec_value;
23141 else
23142 mode_value = value;
23143 gcc_assert (GET_MODE_SIZE (mode) <= size);
23144
23145 /* See if block is big or small, handle small blocks. */
23146 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
23147 {
23148 int size2 = size;
23149 loop_label = gen_label_rtx ();
23150
23151 if (!*done_label)
23152 *done_label = gen_label_rtx ();
23153
23154 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
23155 1, loop_label);
23156 size2 >>= 1;
23157
23158 /* Handle sizes > 3. */
23159 for (;size2 > 2; size2 >>= 1)
23160 expand_small_movmem_or_setmem (destmem, srcmem,
23161 *destptr, *srcptr,
23162 value, vec_value,
23163 *count,
23164 size2, *done_label, issetmem);
23165 /* Nothing to copy? Jump to DONE_LABEL if so */
23166 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
23167 1, *done_label);
23168
23169 /* Do a byte copy. */
23170 destmem = change_address (destmem, QImode, *destptr);
23171 if (issetmem)
23172 emit_move_insn (destmem, gen_lowpart (QImode, value));
23173 else
23174 {
23175 srcmem = change_address (srcmem, QImode, *srcptr);
23176 emit_move_insn (destmem, srcmem);
23177 }
23178
23179 /* Handle sizes 2 and 3. */
23180 label = ix86_expand_aligntest (*count, 2, false);
23181 destmem = change_address (destmem, HImode, *destptr);
23182 destmem = offset_address (destmem, *count, 1);
23183 destmem = offset_address (destmem, GEN_INT (-2), 2);
23184 if (issetmem)
23185 emit_move_insn (destmem, gen_lowpart (HImode, value));
23186 else
23187 {
23188 srcmem = change_address (srcmem, HImode, *srcptr);
23189 srcmem = offset_address (srcmem, *count, 1);
23190 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
23191 emit_move_insn (destmem, srcmem);
23192 }
23193
23194 emit_label (label);
23195 LABEL_NUSES (label) = 1;
23196 emit_jump_insn (gen_jump (*done_label));
23197 emit_barrier ();
23198 }
23199 else
23200 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
23201 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
23202
23203 /* Start memcpy for COUNT >= SIZE. */
23204 if (loop_label)
23205 {
23206 emit_label (loop_label);
23207 LABEL_NUSES (loop_label) = 1;
23208 }
23209
23210 /* Copy first desired_align bytes. */
23211 if (!issetmem)
23212 srcmem = change_address (srcmem, mode, *srcptr);
23213 destmem = change_address (destmem, mode, *destptr);
23214 modesize = GEN_INT (GET_MODE_SIZE (mode));
23215 for (n = 0; prolog_size < desired_align - align; n++)
23216 {
23217 if (issetmem)
23218 emit_move_insn (destmem, mode_value);
23219 else
23220 {
23221 emit_move_insn (destmem, srcmem);
23222 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
23223 }
23224 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
23225 prolog_size += GET_MODE_SIZE (mode);
23226 }
23227
23228
23229 /* Copy last SIZE bytes. */
23230 destmem = offset_address (destmem, *count, 1);
23231 destmem = offset_address (destmem,
23232 GEN_INT (-size - prolog_size),
23233 1);
23234 if (issetmem)
23235 emit_move_insn (destmem, mode_value);
23236 else
23237 {
23238 srcmem = offset_address (srcmem, *count, 1);
23239 srcmem = offset_address (srcmem,
23240 GEN_INT (-size - prolog_size),
23241 1);
23242 emit_move_insn (destmem, srcmem);
23243 }
23244 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
23245 {
23246 destmem = offset_address (destmem, modesize, 1);
23247 if (issetmem)
23248 emit_move_insn (destmem, mode_value);
23249 else
23250 {
23251 srcmem = offset_address (srcmem, modesize, 1);
23252 emit_move_insn (destmem, srcmem);
23253 }
23254 }
23255
23256 /* Align destination. */
23257 if (desired_align > 1 && desired_align > align)
23258 {
23259 rtx saveddest = *destptr;
23260
23261 gcc_assert (desired_align <= size);
23262 /* Align destptr up and place it in a new register. */
23263 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
23264 GEN_INT (prolog_size),
23265 NULL_RTX, 1, OPTAB_DIRECT);
23266 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
23267 GEN_INT (-desired_align),
23268 *destptr, 1, OPTAB_DIRECT);
23269 /* See how many bytes we skipped. */
23270 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
23271 *destptr,
23272 saveddest, 1, OPTAB_DIRECT);
23273 /* Adjust srcptr and count. */
23274 if (!issetmem)
23275 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, saveddest,
23276 *srcptr, 1, OPTAB_DIRECT);
23277 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
23278 saveddest, *count, 1, OPTAB_DIRECT);
23279 /* We copied at most size + prolog_size. */
23280 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
23281 *min_size = (*min_size - size) & ~(unsigned HOST_WIDE_INT)(size - 1);
23282 else
23283 *min_size = 0;
23284
23285 /* Our loops always round down the block size, but for dispatch to a library
23286 call we need the precise value. */
23287 if (dynamic_check)
23288 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
23289 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
23290 }
23291 else
23292 {
23293 gcc_assert (prolog_size == 0);
23294 /* Decrease count, so we won't end up copying last word twice. */
23295 if (!CONST_INT_P (*count))
23296 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
23297 constm1_rtx, *count, 1, OPTAB_DIRECT);
23298 else
23299 *count = GEN_INT ((UINTVAL (*count) - 1) & ~(unsigned HOST_WIDE_INT)(size - 1));
23300 if (*min_size)
23301 *min_size = (*min_size - 1) & ~(unsigned HOST_WIDE_INT)(size - 1);
23302 }
23303 }
23304
23305
23306 /* This function is like the previous one, except here we know how many bytes
23307 need to be copied. That allows us to update alignment not only of DST, which
23308 is returned, but also of SRC, which is passed as a pointer for that
23309 reason. */
23310 static rtx
23311 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
23312 rtx srcreg, rtx value, rtx vec_value,
23313 int desired_align, int align_bytes,
23314 bool issetmem)
23315 {
23316 rtx src = NULL;
23317 rtx orig_dst = dst;
23318 rtx orig_src = NULL;
23319 int piece_size = 1;
23320 int copied_bytes = 0;
23321
23322 if (!issetmem)
23323 {
23324 gcc_assert (srcp != NULL);
23325 src = *srcp;
23326 orig_src = src;
23327 }
23328
23329 for (piece_size = 1;
23330 piece_size <= desired_align && copied_bytes < align_bytes;
23331 piece_size <<= 1)
23332 {
23333 if (align_bytes & piece_size)
23334 {
23335 if (issetmem)
23336 {
23337 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
23338 dst = emit_memset (dst, destreg, vec_value, piece_size);
23339 else
23340 dst = emit_memset (dst, destreg, value, piece_size);
23341 }
23342 else
23343 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
23344 copied_bytes += piece_size;
23345 }
23346 }
23347 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
23348 set_mem_align (dst, desired_align * BITS_PER_UNIT);
23349 if (MEM_SIZE_KNOWN_P (orig_dst))
23350 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
23351
23352 if (!issetmem)
23353 {
23354 int src_align_bytes = get_mem_align_offset (src, desired_align
23355 * BITS_PER_UNIT);
23356 if (src_align_bytes >= 0)
23357 src_align_bytes = desired_align - src_align_bytes;
23358 if (src_align_bytes >= 0)
23359 {
23360 unsigned int src_align;
23361 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
23362 {
23363 if ((src_align_bytes & (src_align - 1))
23364 == (align_bytes & (src_align - 1)))
23365 break;
23366 }
23367 if (src_align > (unsigned int) desired_align)
23368 src_align = desired_align;
23369 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
23370 set_mem_align (src, src_align * BITS_PER_UNIT);
23371 }
23372 if (MEM_SIZE_KNOWN_P (orig_src))
23373 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
23374 *srcp = src;
23375 }
23376
23377 return dst;
23378 }
23379
23380 /* Return true if ALG can be used in current context.
23381 Assume we expand memset if MEMSET is true. */
23382 static bool
23383 alg_usable_p (enum stringop_alg alg, bool memset)
23384 {
23385 if (alg == no_stringop)
23386 return false;
23387 if (alg == vector_loop)
23388 return TARGET_SSE || TARGET_AVX;
23389 /* Algorithms using the rep prefix want at least edi and ecx;
23390 additionally, memset wants eax and memcpy wants esi. Don't
23391 consider such algorithms if the user has appropriated those
23392 registers for their own purposes. */
23393 if (alg == rep_prefix_1_byte
23394 || alg == rep_prefix_4_byte
23395 || alg == rep_prefix_8_byte)
23396 return !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
23397 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
23398 return true;
23399 }
23400
23401 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
23402 static enum stringop_alg
23403 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
23404 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
23405 bool memset, bool zero_memset, int *dynamic_check, bool *noalign)
23406 {
23407 const struct stringop_algs * algs;
23408 bool optimize_for_speed;
23409 int max = -1;
23410 const struct processor_costs *cost;
23411 int i;
23412 bool any_alg_usable_p = false;
23413
23414 *noalign = false;
23415 *dynamic_check = -1;
23416
23417 /* Even if the string operation call is cold, we still might spend a lot
23418 of time processing large blocks. */
23419 if (optimize_function_for_size_p (cfun)
23420 || (optimize_insn_for_size_p ()
23421 && (max_size < 256
23422 || (expected_size != -1 && expected_size < 256))))
23423 optimize_for_speed = false;
23424 else
23425 optimize_for_speed = true;
23426
23427 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
23428 if (memset)
23429 algs = &cost->memset[TARGET_64BIT != 0];
23430 else
23431 algs = &cost->memcpy[TARGET_64BIT != 0];
23432
23433 /* Find the maximal size covered by a usable non-libcall algorithm in the table. */
23434 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
23435 {
23436 enum stringop_alg candidate = algs->size[i].alg;
23437 bool usable = alg_usable_p (candidate, memset);
23438 any_alg_usable_p |= usable;
23439
23440 if (candidate != libcall && candidate && usable)
23441 max = algs->size[i].max;
23442 }
23443
23444 /* If the expected size is not known but the max size is small enough
23445 that an inline version is a win, set the expected size into
23446 that range. */
23447 if (max > 1 && (unsigned HOST_WIDE_INT)max >= max_size && expected_size == -1)
23448 expected_size = min_size / 2 + max_size / 2;
23449
23450 /* If the user specified the algorithm, honor it if possible. */
23451 if (ix86_stringop_alg != no_stringop
23452 && alg_usable_p (ix86_stringop_alg, memset))
23453 return ix86_stringop_alg;
23454 /* rep; movq or rep; movl is the smallest variant. */
23455 else if (!optimize_for_speed)
23456 {
23457 *noalign = true;
23458 if (!count || (count & 3) || (memset && !zero_memset))
23459 return alg_usable_p (rep_prefix_1_byte, memset)
23460 ? rep_prefix_1_byte : loop_1_byte;
23461 else
23462 return alg_usable_p (rep_prefix_4_byte, memset)
23463 ? rep_prefix_4_byte : loop;
23464 }
23465 /* Very tiny blocks are best handled via the loop; REP is expensive to
23466 set up. */
23467 else if (expected_size != -1 && expected_size < 4)
23468 return loop_1_byte;
23469 else if (expected_size != -1)
23470 {
23471 enum stringop_alg alg = libcall;
23472 bool alg_noalign = false;
23473 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
23474 {
23475 /* We get here if the algorithms that were not libcall-based
23476 were rep-prefix based and we are unable to use rep prefixes
23477 based on global register usage. Break out of the loop and
23478 use the heuristic below. */
23479 if (algs->size[i].max == 0)
23480 break;
23481 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
23482 {
23483 enum stringop_alg candidate = algs->size[i].alg;
23484
23485 if (candidate != libcall && alg_usable_p (candidate, memset))
23486 {
23487 alg = candidate;
23488 alg_noalign = algs->size[i].noalign;
23489 }
23490 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
23491 last non-libcall inline algorithm. */
23492 if (TARGET_INLINE_ALL_STRINGOPS)
23493 {
23494 /* When the current size is best copied by a libcall,
23495 but we are still forced to inline, run the heuristic below
23496 that picks code for medium-sized blocks. */
23497 if (alg != libcall)
23498 {
23499 *noalign = alg_noalign;
23500 return alg;
23501 }
23502 break;
23503 }
23504 else if (alg_usable_p (candidate, memset))
23505 {
23506 *noalign = algs->size[i].noalign;
23507 return candidate;
23508 }
23509 }
23510 }
23511 }
23512 /* When asked to inline the call anyway, try to pick a meaningful choice.
23513 We look for the maximal size of block that is faster to copy by hand and
23514 take blocks of at most that size, guessing that the average size will
23515 be roughly half of the block.
23516
23517 If this turns out to be bad, we might simply specify the preferred
23518 choice in ix86_costs. */
23519 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23520 && (algs->unknown_size == libcall
23521 || !alg_usable_p (algs->unknown_size, memset)))
23522 {
23523 enum stringop_alg alg;
23524
23525 /* If there aren't any usable algorithms, then recursing on
23526 smaller sizes isn't going to find anything. Just return the
23527 simple byte-at-a-time copy loop. */
23528 if (!any_alg_usable_p)
23529 {
23530 /* Pick something reasonable. */
23531 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23532 *dynamic_check = 128;
23533 return loop_1_byte;
23534 }
23535 if (max == -1)
23536 max = 4096;
23537 alg = decide_alg (count, max / 2, min_size, max_size, memset,
23538 zero_memset, dynamic_check, noalign);
23539 gcc_assert (*dynamic_check == -1);
23540 gcc_assert (alg != libcall);
23541 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23542 *dynamic_check = max;
23543 return alg;
23544 }
23545 return (alg_usable_p (algs->unknown_size, memset)
23546 ? algs->unknown_size : libcall);
23547 }
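
/* As a purely hypothetical illustration of the selection above: if the
active cost table had size entries {256, unrolled_loop, false} and
{-1, libcall, false} (made-up values, not any real tuning), a copy with
expected_size == 200 would return unrolled_loop, while a copy of unknown
size would fall back to unknown_size (here libcall) unless
-minline-all-stringops or -minline-stringops-dynamically forces the
recursive heuristic above. */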
23548
23549 /* Decide on alignment. We know that the operand is already aligned to ALIGN
23550 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
23551 static int
23552 decide_alignment (int align,
23553 enum stringop_alg alg,
23554 int expected_size,
23555 enum machine_mode move_mode)
23556 {
23557 int desired_align = 0;
23558
23559 gcc_assert (alg != no_stringop);
23560
23561 if (alg == libcall)
23562 return 0;
23563 if (move_mode == VOIDmode)
23564 return 0;
23565
23566 desired_align = GET_MODE_SIZE (move_mode);
23567 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
23568 copying a whole cacheline at once. */
23569 if (TARGET_PENTIUMPRO
23570 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
23571 desired_align = 8;
23572
23573 if (optimize_size)
23574 desired_align = 1;
23575 if (desired_align < align)
23576 desired_align = align;
23577 if (expected_size != -1 && expected_size < 4)
23578 desired_align = align;
23579
23580 return desired_align;
23581 }
23582
23583
23584 /* Helper function for memset. For QImode value 0xXY produce
23585 0xXYXYXYXY of the width specified by MODE. This is essentially
23586 a * 0x01010101, but we can do slightly better than
23587 synth_mult by unwinding the sequence by hand on CPUs with
23588 slow multiply. */
23589 static rtx
23590 promote_duplicated_reg (enum machine_mode mode, rtx val)
23591 {
23592 enum machine_mode valmode = GET_MODE (val);
23593 rtx tmp;
23594 int nops = mode == DImode ? 3 : 2;
23595
23596 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
23597 if (val == const0_rtx)
23598 return copy_to_mode_reg (mode, CONST0_RTX (mode));
23599 if (CONST_INT_P (val))
23600 {
23601 HOST_WIDE_INT v = INTVAL (val) & 255;
23602
23603 v |= v << 8;
23604 v |= v << 16;
23605 if (mode == DImode)
23606 v |= (v << 16) << 16;
23607 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
23608 }
23609
23610 if (valmode == VOIDmode)
23611 valmode = QImode;
23612 if (valmode != QImode)
23613 val = gen_lowpart (QImode, val);
23614 if (mode == QImode)
23615 return val;
23616 if (!TARGET_PARTIAL_REG_STALL)
23617 nops--;
23618 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
23619 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
23620 <= (ix86_cost->shift_const + ix86_cost->add) * nops
23621 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
23622 {
23623 rtx reg = convert_modes (mode, QImode, val, true);
23624 tmp = promote_duplicated_reg (mode, const1_rtx);
23625 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
23626 OPTAB_DIRECT);
23627 }
23628 else
23629 {
23630 rtx reg = convert_modes (mode, QImode, val, true);
23631
23632 if (!TARGET_PARTIAL_REG_STALL)
23633 if (mode == SImode)
23634 emit_insn (gen_movsi_insv_1 (reg, reg));
23635 else
23636 emit_insn (gen_movdi_insv_1 (reg, reg));
23637 else
23638 {
23639 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23640 NULL, 1, OPTAB_DIRECT);
23641 reg =
23642 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23643 }
23644 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23645 NULL, 1, OPTAB_DIRECT);
23646 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23647 if (mode == SImode)
23648 return reg;
23649 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23650 NULL, 1, OPTAB_DIRECT);
23651 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23652 return reg;
23653 }
23654 }
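
/* For example, promoting the QImode constant 0x5A to SImode yields the
constant 0x5a5a5a5a (v = 0x5a; v |= v << 8; v |= v << 16;), and to DImode
0x5a5a5a5a5a5a5a5a. For a non-constant value on CPUs where the multiply
path above is not profitable, roughly the same shift-and-IOR sequence is
emitted on the register instead. */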
23655
23656 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
23657 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
23658 alignment from ALIGN to DESIRED_ALIGN. */
23659 static rtx
23660 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
23661 int align)
23662 {
23663 rtx promoted_val;
23664
23665 if (TARGET_64BIT
23666 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23667 promoted_val = promote_duplicated_reg (DImode, val);
23668 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23669 promoted_val = promote_duplicated_reg (SImode, val);
23670 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23671 promoted_val = promote_duplicated_reg (HImode, val);
23672 else
23673 promoted_val = val;
23674
23675 return promoted_val;
23676 }
23677
23678 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
23679 operations when profitable. The code depends upon architecture, block size
23680 and alignment, but always has one of the following overall structures:
23681
23682 Aligned move sequence:
23683
23684 1) Prologue guard: Conditional that jumps up to epilogues for small
23685 blocks that can be handled by epilogue alone. This is faster
23686 but also needed for correctness, since the prologue assumes the block
23687 is larger than the desired alignment.
23688
23689 Optional dynamic check for size and libcall for large
23690 blocks is emitted here too, with -minline-stringops-dynamically.
23691
23692 2) Prologue: copy first few bytes in order to get destination
23693 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
23694 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
23695 copied. We emit either a jump tree on power of two sized
23696 blocks, or a byte loop.
23697
23698 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
23699 with specified algorithm.
23700
23701 4) Epilogue: code copying tail of the block that is too small to be
23702 handled by main body (or up to size guarded by prologue guard).
23703
23704 Misaligned move sequence
23705
23706 1) misaligned move prologue/epilogue containing:
23707 a) Prologue handling small memory blocks and jumping to done_label
23708 (skipped if blocks are known to be large enough)
23709 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
23710 needed, by a single possibly misaligned move
23711 (skipped if alignment is not needed)
23712 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
23713
23714 2) Zero size guard dispatching to done_label, if needed
23715
23716 3) dispatch to library call, if needed,
23717
23718 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
23719 with specified algorithm. */
23720 bool
23721 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
23722 rtx align_exp, rtx expected_align_exp,
23723 rtx expected_size_exp, rtx min_size_exp,
23724 rtx max_size_exp, rtx probable_max_size_exp,
23725 bool issetmem)
23726 {
23727 rtx destreg;
23728 rtx srcreg = NULL;
23729 rtx label = NULL;
23730 rtx tmp;
23731 rtx jump_around_label = NULL;
23732 HOST_WIDE_INT align = 1;
23733 unsigned HOST_WIDE_INT count = 0;
23734 HOST_WIDE_INT expected_size = -1;
23735 int size_needed = 0, epilogue_size_needed;
23736 int desired_align = 0, align_bytes = 0;
23737 enum stringop_alg alg;
23738 rtx promoted_val = NULL;
23739 rtx vec_promoted_val = NULL;
23740 bool force_loopy_epilogue = false;
23741 int dynamic_check;
23742 bool need_zero_guard = false;
23743 bool noalign;
23744 enum machine_mode move_mode = VOIDmode;
23745 int unroll_factor = 1;
23746 /* TODO: Once value ranges are available, fill in proper data. */
23747 unsigned HOST_WIDE_INT min_size = 0;
23748 unsigned HOST_WIDE_INT max_size = -1;
23749 unsigned HOST_WIDE_INT probable_max_size = -1;
23750 bool misaligned_prologue_used = false;
23751
23752 if (CONST_INT_P (align_exp))
23753 align = INTVAL (align_exp);
23754 /* i386 can do misaligned access at a reasonably increased cost. */
23755 if (CONST_INT_P (expected_align_exp)
23756 && INTVAL (expected_align_exp) > align)
23757 align = INTVAL (expected_align_exp);
23758 /* ALIGN is the minimum of destination and source alignment, but we care here
23759 just about destination alignment. */
23760 else if (!issetmem
23761 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
23762 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
23763
23764 if (CONST_INT_P (count_exp))
23765 min_size = max_size = probable_max_size = count = expected_size
23766 = INTVAL (count_exp);
23767 else
23768 {
23769 if (min_size_exp)
23770 min_size = INTVAL (min_size_exp);
23771 if (max_size_exp)
23772 max_size = INTVAL (max_size_exp);
23773 if (probable_max_size_exp)
23774 probable_max_size = INTVAL (probable_max_size_exp);
23775 if (CONST_INT_P (expected_size_exp) && count == 0)
23776 expected_size = INTVAL (expected_size_exp);
23777 }
23778
23779 /* Make sure we don't need to care about overflow later on. */
23780 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23781 return false;
23782
23783 /* Step 0: Decide on preferred algorithm, desired alignment and
23784 size of chunks to be copied by main loop. */
23785 alg = decide_alg (count, expected_size, min_size, probable_max_size,
23786 issetmem,
23787 issetmem && val_exp == const0_rtx,
23788 &dynamic_check, &noalign);
23789 if (alg == libcall)
23790 return false;
23791 gcc_assert (alg != no_stringop);
23792
23793 /* For now the vector version of memset is generated only for memory zeroing, as
23794 creating the promoted vector value is very cheap in this case. */
23795 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
23796 alg = unrolled_loop;
23797
23798 if (!count)
23799 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
23800 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
23801 if (!issetmem)
23802 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
23803
23804 unroll_factor = 1;
23805 move_mode = word_mode;
23806 switch (alg)
23807 {
23808 case libcall:
23809 case no_stringop:
23810 case last_alg:
23811 gcc_unreachable ();
23812 case loop_1_byte:
23813 need_zero_guard = true;
23814 move_mode = QImode;
23815 break;
23816 case loop:
23817 need_zero_guard = true;
23818 break;
23819 case unrolled_loop:
23820 need_zero_guard = true;
23821 unroll_factor = (TARGET_64BIT ? 4 : 2);
23822 break;
23823 case vector_loop:
23824 need_zero_guard = true;
23825 unroll_factor = 4;
23826 /* Find the widest supported mode. */
23827 move_mode = word_mode;
23828 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
23829 != CODE_FOR_nothing)
23830 move_mode = GET_MODE_WIDER_MODE (move_mode);
23831
23832 /* Find the corresponding vector mode with the same size as MOVE_MODE.
23833 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
23834 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
23835 {
23836 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
23837 move_mode = mode_for_vector (word_mode, nunits);
23838 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
23839 move_mode = word_mode;
23840 }
23841 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
23842 break;
23843 case rep_prefix_8_byte:
23844 move_mode = DImode;
23845 break;
23846 case rep_prefix_4_byte:
23847 move_mode = SImode;
23848 break;
23849 case rep_prefix_1_byte:
23850 move_mode = QImode;
23851 break;
23852 }
23853 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
23854 epilogue_size_needed = size_needed;
23855
23856 desired_align = decide_alignment (align, alg, expected_size, move_mode);
23857 if (!TARGET_ALIGN_STRINGOPS || noalign)
23858 align = desired_align;
23859
23860 /* Step 1: Prologue guard. */
23861
23862 /* Alignment code needs count to be in register. */
23863 if (CONST_INT_P (count_exp) && desired_align > align)
23864 {
23865 if (INTVAL (count_exp) > desired_align
23866 && INTVAL (count_exp) > size_needed)
23867 {
23868 align_bytes
23869 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23870 if (align_bytes <= 0)
23871 align_bytes = 0;
23872 else
23873 align_bytes = desired_align - align_bytes;
23874 }
23875 if (align_bytes == 0)
23876 count_exp = force_reg (counter_mode (count_exp), count_exp);
23877 }
23878 gcc_assert (desired_align >= 1 && align >= 1);
23879
23880 /* Misaligned move sequences handle both prologue and epilogue at once.
23881 Default code generation results in smaller code for large alignments
23882 and also avoids redundant work when sizes are known precisely. */
23883 misaligned_prologue_used
23884 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
23885 && MAX (desired_align, epilogue_size_needed) <= 32
23886 && desired_align <= epilogue_size_needed
23887 && ((desired_align > align && !align_bytes)
23888 || (!count && epilogue_size_needed > 1)));
23889
23890 /* Do the cheap promotion to allow better CSE across the
23891 main loop and epilogue (i.e. one load of the big constant in
23892 front of all the code).
23893 For now the misaligned move sequences do not have a fast path
23894 without broadcasting. */
23895 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
23896 {
23897 if (alg == vector_loop)
23898 {
23899 gcc_assert (val_exp == const0_rtx);
23900 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
23901 promoted_val = promote_duplicated_reg_to_size (val_exp,
23902 GET_MODE_SIZE (word_mode),
23903 desired_align, align);
23904 }
23905 else
23906 {
23907 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23908 desired_align, align);
23909 }
23910 }
23911 /* Misaligned move sequences handle both prologues and epilogues at once.
23912 Default code generation results in smaller code for large alignments and
23913 also avoids redundant work when sizes are known precisely. */
23914 if (misaligned_prologue_used)
23915 {
23916 /* The misaligned move prologue handles small blocks by itself. */
23917 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
23918 (dst, src, &destreg, &srcreg,
23919 move_mode, promoted_val, vec_promoted_val,
23920 &count_exp,
23921 &jump_around_label,
23922 desired_align < align
23923 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
23924 desired_align, align, &min_size, dynamic_check, issetmem);
23925 if (!issetmem)
23926 src = change_address (src, BLKmode, srcreg);
23927 dst = change_address (dst, BLKmode, destreg);
23928 set_mem_align (dst, desired_align * BITS_PER_UNIT);
23929 epilogue_size_needed = 0;
23930 if (need_zero_guard && !min_size)
23931 {
23932 /* It is possible that we copied enough so the main loop will not
23933 execute. */
23934 gcc_assert (size_needed > 1);
23935 if (jump_around_label == NULL_RTX)
23936 jump_around_label = gen_label_rtx ();
23937 emit_cmp_and_jump_insns (count_exp,
23938 GEN_INT (size_needed),
23939 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
23940 if (expected_size == -1
23941 || expected_size < (desired_align - align) / 2 + size_needed)
23942 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23943 else
23944 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23945 }
23946 }
23947 /* Ensure that alignment prologue won't copy past end of block. */
23948 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23949 {
23950 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23951 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
23952 Make sure it is power of 2. */
23953 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
23954
23955 /* To improve performance of small blocks, we jump around the VAL
23956 promoting mode. This means that if the promoted VAL is not constant,
23957 we might not use it in the epilogue and have to use the byte
23958 loop variant. */
23959 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
23960 force_loopy_epilogue = true;
23961 if (count)
23962 {
23963 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23964 {
23965 /* If main algorithm works on QImode, no epilogue is needed.
23966 For small sizes just don't align anything. */
23967 if (size_needed == 1)
23968 desired_align = align;
23969 else
23970 goto epilogue;
23971 }
23972 }
23973 else if (min_size < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23974 {
23975 gcc_assert (max_size >= (unsigned HOST_WIDE_INT)epilogue_size_needed);
23976 label = gen_label_rtx ();
23977 emit_cmp_and_jump_insns (count_exp,
23978 GEN_INT (epilogue_size_needed),
23979 LTU, 0, counter_mode (count_exp), 1, label);
23980 if (expected_size == -1 || expected_size < epilogue_size_needed)
23981 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23982 else
23983 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23984 }
23985 }
23986
23987 /* Emit code to decide on runtime whether library call or inline should be
23988 used. */
23989 if (dynamic_check != -1)
23990 {
23991 if (!issetmem && CONST_INT_P (count_exp))
23992 {
23993 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
23994 {
23995 emit_block_move_via_libcall (dst, src, count_exp, false);
23996 count_exp = const0_rtx;
23997 goto epilogue;
23998 }
23999 }
24000 else
24001 {
24002 rtx hot_label = gen_label_rtx ();
24003 jump_around_label = gen_label_rtx ();
24004 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
24005 LEU, 0, GET_MODE (count_exp), 1, hot_label);
24006 predict_jump (REG_BR_PROB_BASE * 90 / 100);
24007 if (issetmem)
24008 set_storage_via_libcall (dst, count_exp, val_exp, false);
24009 else
24010 emit_block_move_via_libcall (dst, src, count_exp, false);
24011 emit_jump (jump_around_label);
24012 emit_label (hot_label);
24013 }
24014 }
24015
24016 /* Step 2: Alignment prologue. */
24017 /* Do the expensive promotion once we branched off the small blocks. */
24018 if (issetmem && !promoted_val)
24019 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
24020 desired_align, align);
24021
24022 if (desired_align > align && !misaligned_prologue_used)
24023 {
24024 if (align_bytes == 0)
24025 {
24026 /* Except for the first move in the prologue, we no longer know
24027 the constant offset in aliasing info. It doesn't seem worth
24028 the pain to maintain it for the first move, so throw away
24029 the info early. */
24030 dst = change_address (dst, BLKmode, destreg);
24031 if (!issetmem)
24032 src = change_address (src, BLKmode, srcreg);
24033 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
24034 promoted_val, vec_promoted_val,
24035 count_exp, align, desired_align,
24036 issetmem);
24037 /* At most desired_align - align bytes are copied. */
24038 if (min_size < (unsigned)(desired_align - align))
24039 min_size = 0;
24040 else
24041 min_size -= desired_align - align;
24042 }
24043 else
24044 {
24045 /* If we know how many bytes need to be stored before dst is
24046 sufficiently aligned, maintain aliasing info accurately. */
24047 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
24048 srcreg,
24049 promoted_val,
24050 vec_promoted_val,
24051 desired_align,
24052 align_bytes,
24053 issetmem);
24054
24055 count_exp = plus_constant (counter_mode (count_exp),
24056 count_exp, -align_bytes);
24057 count -= align_bytes;
24058 min_size -= align_bytes;
24059 max_size -= align_bytes;
24060 }
24061 if (need_zero_guard
24062 && !min_size
24063 && (count < (unsigned HOST_WIDE_INT) size_needed
24064 || (align_bytes == 0
24065 && count < ((unsigned HOST_WIDE_INT) size_needed
24066 + desired_align - align))))
24067 {
24068 /* It is possible that we copied enough so the main loop will not
24069 execute. */
24070 gcc_assert (size_needed > 1);
24071 if (label == NULL_RTX)
24072 label = gen_label_rtx ();
24073 emit_cmp_and_jump_insns (count_exp,
24074 GEN_INT (size_needed),
24075 LTU, 0, counter_mode (count_exp), 1, label);
24076 if (expected_size == -1
24077 || expected_size < (desired_align - align) / 2 + size_needed)
24078 predict_jump (REG_BR_PROB_BASE * 20 / 100);
24079 else
24080 predict_jump (REG_BR_PROB_BASE * 60 / 100);
24081 }
24082 }
24083 if (label && size_needed == 1)
24084 {
24085 emit_label (label);
24086 LABEL_NUSES (label) = 1;
24087 label = NULL;
24088 epilogue_size_needed = 1;
24089 if (issetmem)
24090 promoted_val = val_exp;
24091 }
24092 else if (label == NULL_RTX && !misaligned_prologue_used)
24093 epilogue_size_needed = size_needed;
24094
24095 /* Step 3: Main loop. */
24096
24097 switch (alg)
24098 {
24099 case libcall:
24100 case no_stringop:
24101 case last_alg:
24102 gcc_unreachable ();
24103 case loop_1_byte:
24104 case loop:
24105 case unrolled_loop:
24106 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
24107 count_exp, move_mode, unroll_factor,
24108 expected_size, issetmem);
24109 break;
24110 case vector_loop:
24111 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
24112 vec_promoted_val, count_exp, move_mode,
24113 unroll_factor, expected_size, issetmem);
24114 break;
24115 case rep_prefix_8_byte:
24116 case rep_prefix_4_byte:
24117 case rep_prefix_1_byte:
24118 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
24119 val_exp, count_exp, move_mode, issetmem);
24120 break;
24121 }
24122 /* Adjust properly the offset of src and dest memory for aliasing. */
24123 if (CONST_INT_P (count_exp))
24124 {
24125 if (!issetmem)
24126 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
24127 (count / size_needed) * size_needed);
24128 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
24129 (count / size_needed) * size_needed);
24130 }
24131 else
24132 {
24133 if (!issetmem)
24134 src = change_address (src, BLKmode, srcreg);
24135 dst = change_address (dst, BLKmode, destreg);
24136 }
24137
24138 /* Step 4: Epilogue to copy the remaining bytes. */
24139 epilogue:
24140 if (label)
24141 {
24142 /* When the main loop is done, COUNT_EXP might hold the original count,
24143 while we want to copy only COUNT_EXP % SIZE_NEEDED bytes.
24144 Epilogue code will actually copy COUNT_EXP % EPILOGUE_SIZE_NEEDED
24145 bytes. Compensate if needed. */
24146
24147 if (size_needed < epilogue_size_needed)
24148 {
24149 tmp =
24150 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
24151 GEN_INT (size_needed - 1), count_exp, 1,
24152 OPTAB_DIRECT);
24153 if (tmp != count_exp)
24154 emit_move_insn (count_exp, tmp);
24155 }
24156 emit_label (label);
24157 LABEL_NUSES (label) = 1;
24158 }
24159
24160 if (count_exp != const0_rtx && epilogue_size_needed > 1)
24161 {
24162 if (force_loopy_epilogue)
24163 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
24164 epilogue_size_needed);
24165 else
24166 {
24167 if (issetmem)
24168 expand_setmem_epilogue (dst, destreg, promoted_val,
24169 vec_promoted_val, count_exp,
24170 epilogue_size_needed);
24171 else
24172 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
24173 epilogue_size_needed);
24174 }
24175 }
24176 if (jump_around_label)
24177 emit_label (jump_around_label);
24178 return true;
24179 }
24180
24181
24182 /* Expand the appropriate insns for doing strlen if not just doing
24183 repnz; scasb
24184
24185 out = result, initialized with the start address
24186 align_rtx = alignment of the address.
24187 scratch = scratch register, initialized with the start address when
24188 not aligned, otherwise undefined
24189
24190 This is just the body. It needs the initializations mentioned above and
24191 some address computing at the end. These things are done in i386.md. */
24192
24193 static void
24194 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
24195 {
24196 int align;
24197 rtx tmp;
24198 rtx align_2_label = NULL_RTX;
24199 rtx align_3_label = NULL_RTX;
24200 rtx align_4_label = gen_label_rtx ();
24201 rtx end_0_label = gen_label_rtx ();
24202 rtx mem;
24203 rtx tmpreg = gen_reg_rtx (SImode);
24204 rtx scratch = gen_reg_rtx (SImode);
24205 rtx cmp;
24206
24207 align = 0;
24208 if (CONST_INT_P (align_rtx))
24209 align = INTVAL (align_rtx);
24210
24211 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
24212
24213 /* Is there a known alignment and is it less than 4? */
24214 if (align < 4)
24215 {
24216 rtx scratch1 = gen_reg_rtx (Pmode);
24217 emit_move_insn (scratch1, out);
24218 /* Is there a known alignment and is it not 2? */
24219 if (align != 2)
24220 {
24221 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
24222 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
24223
24224 /* Leave just the 3 lower bits. */
24225 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
24226 NULL_RTX, 0, OPTAB_WIDEN);
24227
24228 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
24229 Pmode, 1, align_4_label);
24230 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
24231 Pmode, 1, align_2_label);
24232 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
24233 Pmode, 1, align_3_label);
24234 }
24235 else
24236 {
24237 /* Since the alignment is 2, we have to check 2 or 0 bytes;
24238 check whether it is aligned to 4 bytes. */
24239
24240 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
24241 NULL_RTX, 0, OPTAB_WIDEN);
24242
24243 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
24244 Pmode, 1, align_4_label);
24245 }
24246
24247 mem = change_address (src, QImode, out);
24248
24249 /* Now compare the bytes. */
24250
24251 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
24252 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
24253 QImode, 1, end_0_label);
24254
24255 /* Increment the address. */
24256 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24257
24258 /* Not needed with an alignment of 2 */
24259 if (align != 2)
24260 {
24261 emit_label (align_2_label);
24262
24263 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
24264 end_0_label);
24265
24266 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24267
24268 emit_label (align_3_label);
24269 }
24270
24271 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
24272 end_0_label);
24273
24274 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24275 }
24276
24277 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
24278 align this loop; that only makes programs bigger and does not
24279 speed them up. */
24280 emit_label (align_4_label);
24281
24282 mem = change_address (src, SImode, out);
24283 emit_move_insn (scratch, mem);
24284 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
24285
24286 /* This formula yields a nonzero result iff one of the bytes is zero.
24287 This saves three branches inside the loop and many cycles. */
24288
24289 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
24290 emit_insn (gen_one_cmplsi2 (scratch, scratch));
24291 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
24292 emit_insn (gen_andsi3 (tmpreg, tmpreg,
24293 gen_int_mode (0x80808080, SImode)));
24294 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
24295 align_4_label);
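
/* Worked example of the zero-byte test (x - 0x01010101) & ~x & 0x80808080
computed by the four insns above: for x = 0x41420043 (a zero in byte 1)
we get 0x4040ff42 & 0xbebdffbc & 0x80808080 == 0x00008000, nonzero and
marking the zero byte; for x = 0x41424344 (no zero byte) the result is 0. */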
24296
24297 if (TARGET_CMOVE)
24298 {
24299 rtx reg = gen_reg_rtx (SImode);
24300 rtx reg2 = gen_reg_rtx (Pmode);
24301 emit_move_insn (reg, tmpreg);
24302 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
24303
24304 /* If zero is not in the first two bytes, move two bytes forward. */
24305 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
24306 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24307 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
24308 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
24309 gen_rtx_IF_THEN_ELSE (SImode, tmp,
24310 reg,
24311 tmpreg)));
24312 /* Emit lea manually to avoid clobbering of flags. */
24313 emit_insn (gen_rtx_SET (SImode, reg2,
24314 gen_rtx_PLUS (Pmode, out, const2_rtx)));
24315
24316 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24317 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
24318 emit_insn (gen_rtx_SET (VOIDmode, out,
24319 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
24320 reg2,
24321 out)));
24322 }
24323 else
24324 {
24325 rtx end_2_label = gen_label_rtx ();
24326 /* Is zero in the first two bytes? */
24327
24328 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
24329 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24330 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
24331 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
24332 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
24333 pc_rtx);
24334 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
24335 JUMP_LABEL (tmp) = end_2_label;
24336
24337 /* Not in the first two. Move two bytes forward. */
24338 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
24339 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
24340
24341 emit_label (end_2_label);
24342
24343 }
24344
24345 /* Avoid branch in fixing the byte. */
24346 tmpreg = gen_lowpart (QImode, tmpreg);
24347 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
24348 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
24349 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
24350 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
24351
24352 emit_label (end_0_label);
24353 }
24354
24355 /* Expand strlen. */
24356
24357 bool
24358 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
24359 {
24360 rtx addr, scratch1, scratch2, scratch3, scratch4;
24361
24362 /* The generic case of the strlen expander is long. Avoid
24363 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
24364
24365 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
24366 && !TARGET_INLINE_ALL_STRINGOPS
24367 && !optimize_insn_for_size_p ()
24368 && (!CONST_INT_P (align) || INTVAL (align) < 4))
24369 return false;
24370
24371 addr = force_reg (Pmode, XEXP (src, 0));
24372 scratch1 = gen_reg_rtx (Pmode);
24373
24374 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
24375 && !optimize_insn_for_size_p ())
24376 {
24377 /* Well it seems that some optimizer does not combine a call like
24378 foo(strlen(bar), strlen(bar));
24379 when the move and the subtraction are done here. It does calculate
24380 the length just once when these instructions are done inside of
24381 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
24382 often used and I use one fewer register for the lifetime of
24383 output_strlen_unroll() this is better. */
24384
24385 emit_move_insn (out, addr);
24386
24387 ix86_expand_strlensi_unroll_1 (out, src, align);
24388
24389 /* strlensi_unroll_1 returns the address of the zero at the end of
24390 the string, like memchr(), so compute the length by subtracting
24391 the start address. */
24392 emit_insn (ix86_gen_sub3 (out, out, addr));
24393 }
24394 else
24395 {
24396 rtx unspec;
24397
24398 /* Can't use this if the user has appropriated eax, ecx, or edi. */
24399 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
24400 return false;
24401
24402 scratch2 = gen_reg_rtx (Pmode);
24403 scratch3 = gen_reg_rtx (Pmode);
24404 scratch4 = force_reg (Pmode, constm1_rtx);
24405
24406 emit_move_insn (scratch3, addr);
24407 eoschar = force_reg (QImode, eoschar);
24408
24409 src = replace_equiv_address_nv (src, scratch3);
24410
24411 /* If .md starts supporting :P, this can be done in .md. */
24412 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
24413 scratch4), UNSPEC_SCAS);
24414 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
24415 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
24416 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
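
/* The arithmetic above recovers the length from the scas counter: ecx
starts at -1 and repnz scasb decrements it once per byte scanned,
including the terminating zero, leaving -(len + 2); complementing gives
len + 1, and adding -1 gives len. */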
24417 }
24418 return true;
24419 }
24420
24421 /* For a given symbol (function), construct code to compute the address of its
24422 PLT entry in the large x86-64 PIC model. */
24423 static rtx
24424 construct_plt_address (rtx symbol)
24425 {
24426 rtx tmp, unspec;
24427
24428 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
24429 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
24430 gcc_assert (Pmode == DImode);
24431
24432 tmp = gen_reg_rtx (Pmode);
24433 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
24434
24435 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
24436 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
24437 return tmp;
24438 }
24439
24440 rtx
24441 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
24442 rtx callarg2,
24443 rtx pop, bool sibcall)
24444 {
24445 unsigned int const cregs_size
24446 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
24447 rtx vec[3 + cregs_size];
24448 rtx use = NULL, call;
24449 unsigned int vec_len = 0;
24450
24451 if (pop == const0_rtx)
24452 pop = NULL;
24453 gcc_assert (!TARGET_64BIT || !pop);
24454
24455 if (TARGET_MACHO && !TARGET_64BIT)
24456 {
24457 #if TARGET_MACHO
24458 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
24459 fnaddr = machopic_indirect_call_target (fnaddr);
24460 #endif
24461 }
24462 else
24463 {
24464 /* Static functions and indirect calls don't need the pic register. */
24465 if (flag_pic
24466 && (!TARGET_64BIT
24467 || (ix86_cmodel == CM_LARGE_PIC
24468 && DEFAULT_ABI != MS_ABI))
24469 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24470 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
24471 use_reg (&use, pic_offset_table_rtx);
24472 }
24473
24474 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
24475 {
24476 rtx al = gen_rtx_REG (QImode, AX_REG);
24477 emit_move_insn (al, callarg2);
24478 use_reg (&use, al);
24479 }
24480
24481 if (ix86_cmodel == CM_LARGE_PIC
24482 && !TARGET_PECOFF
24483 && MEM_P (fnaddr)
24484 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24485 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
24486 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
24487 else if (sibcall
24488 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
24489 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
24490 {
24491 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
24492 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
24493 }
24494
24495 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
24496 if (retval)
24497 call = gen_rtx_SET (VOIDmode, retval, call);
24498 vec[vec_len++] = call;
24499
24500 if (pop)
24501 {
24502 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
24503 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
24504 vec[vec_len++] = pop;
24505 }
24506
24507 if (TARGET_64BIT_MS_ABI
24508 && (!callarg2 || INTVAL (callarg2) != -2))
24509 {
24510 unsigned i;
24511
24512 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
24513 UNSPEC_MS_TO_SYSV_CALL);
24514
24515 for (i = 0; i < cregs_size; i++)
24516 {
24517 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
24518 enum machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
24519
24520 vec[vec_len++]
24521 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (mode, regno));
24522 }
24523 }
24524
24525 if (vec_len > 1)
24526 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
24527 call = emit_call_insn (call);
24528 if (use)
24529 CALL_INSN_FUNCTION_USAGE (call) = use;
24530
24531 return call;
24532 }
24533
24534 /* Output the assembly for a call instruction. */
24535
24536 const char *
24537 ix86_output_call_insn (rtx insn, rtx call_op)
24538 {
24539 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
24540 bool seh_nop_p = false;
24541 const char *xasm;
24542
24543 if (SIBLING_CALL_P (insn))
24544 {
24545 if (direct_p)
24546 xasm = "%!jmp\t%P0";
24547 /* SEH epilogue detection requires the indirect branch case
24548 to include REX.W. */
24549 else if (TARGET_SEH)
24550 xasm = "%!rex.W jmp %A0";
24551 else
24552 xasm = "%!jmp\t%A0";
24553
24554 output_asm_insn (xasm, &call_op);
24555 return "";
24556 }
24557
24558 /* SEH unwinding can require an extra nop to be emitted in several
24559 circumstances. Determine if we have one of those. */
24560 if (TARGET_SEH)
24561 {
24562 rtx i;
24563
24564 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
24565 {
24566 /* If we get to another real insn, we don't need the nop. */
24567 if (INSN_P (i))
24568 break;
24569
24570 /* If we get to the epilogue note, prevent a catch region from
24571 being adjacent to the standard epilogue sequence. If non-call
24572 exceptions are enabled, we'll have done this during epilogue emission. */
24573 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
24574 && !flag_non_call_exceptions
24575 && !can_throw_internal (insn))
24576 {
24577 seh_nop_p = true;
24578 break;
24579 }
24580 }
24581
24582 /* If we didn't find a real insn following the call, prevent the
24583 unwinder from looking into the next function. */
24584 if (i == NULL)
24585 seh_nop_p = true;
24586 }
24587
24588 if (direct_p)
24589 xasm = "%!call\t%P0";
24590 else
24591 xasm = "%!call\t%A0";
24592
24593 output_asm_insn (xasm, &call_op);
24594
24595 if (seh_nop_p)
24596 return "nop";
24597
24598 return "";
24599 }
24600 \f
24601 /* Clear stack slot assignments remembered from previous functions.
24602 This is called from INIT_EXPANDERS once before RTL is emitted for each
24603 function. */
24604
24605 static struct machine_function *
24606 ix86_init_machine_status (void)
24607 {
24608 struct machine_function *f;
24609
24610 f = ggc_alloc_cleared_machine_function ();
24611 f->use_fast_prologue_epilogue_nregs = -1;
24612 f->call_abi = ix86_abi;
24613
24614 return f;
24615 }
24616
24617 /* Return a MEM corresponding to a stack slot with mode MODE.
24618 Allocate a new slot if necessary.
24619
24620 The RTL for a function can have several slots available: N is
24621 which slot to use. */
24622
24623 rtx
24624 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
24625 {
24626 struct stack_local_entry *s;
24627
24628 gcc_assert (n < MAX_386_STACK_LOCALS);
24629
24630 for (s = ix86_stack_locals; s; s = s->next)
24631 if (s->mode == mode && s->n == n)
24632 return validize_mem (copy_rtx (s->rtl));
24633
24634 s = ggc_alloc_stack_local_entry ();
24635 s->n = n;
24636 s->mode = mode;
24637 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
24638
24639 s->next = ix86_stack_locals;
24640 ix86_stack_locals = s;
24641 return validize_mem (s->rtl);
24642 }
24643
24644 static void
24645 ix86_instantiate_decls (void)
24646 {
24647 struct stack_local_entry *s;
24648
24649 for (s = ix86_stack_locals; s; s = s->next)
24650 if (s->rtl != NULL_RTX)
24651 instantiate_decl_rtl (s->rtl);
24652 }
24653 \f
24654 /* Check whether x86 address PARTS is a pc-relative address. */
24655
24656 static bool
24657 rip_relative_addr_p (struct ix86_address *parts)
24658 {
24659 rtx base, index, disp;
24660
24661 base = parts->base;
24662 index = parts->index;
24663 disp = parts->disp;
24664
24665 if (disp && !base && !index)
24666 {
24667 if (TARGET_64BIT)
24668 {
24669 rtx symbol = disp;
24670
24671 if (GET_CODE (disp) == CONST)
24672 symbol = XEXP (disp, 0);
24673 if (GET_CODE (symbol) == PLUS
24674 && CONST_INT_P (XEXP (symbol, 1)))
24675 symbol = XEXP (symbol, 0);
24676
24677 if (GET_CODE (symbol) == LABEL_REF
24678 || (GET_CODE (symbol) == SYMBOL_REF
24679 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
24680 || (GET_CODE (symbol) == UNSPEC
24681 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
24682 || XINT (symbol, 1) == UNSPEC_PCREL
24683 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
24684 return true;
24685 }
24686 }
24687 return false;
24688 }
24689
24690 /* Calculate the length of the memory address in the instruction encoding.
24691 Includes addr32 prefix, does not include the one-byte modrm, opcode,
24692 or other prefixes. We never generate addr32 prefix for LEA insn. */
24693
24694 int
24695 memory_address_length (rtx addr, bool lea)
24696 {
24697 struct ix86_address parts;
24698 rtx base, index, disp;
24699 int len;
24700 int ok;
24701
24702 if (GET_CODE (addr) == PRE_DEC
24703 || GET_CODE (addr) == POST_INC
24704 || GET_CODE (addr) == PRE_MODIFY
24705 || GET_CODE (addr) == POST_MODIFY)
24706 return 0;
24707
24708 ok = ix86_decompose_address (addr, &parts);
24709 gcc_assert (ok);
24710
24711 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
24712
24713 /* If this is not LEA instruction, add the length of addr32 prefix. */
24714 if (TARGET_64BIT && !lea
24715 && (SImode_address_operand (addr, VOIDmode)
24716 || (parts.base && GET_MODE (parts.base) == SImode)
24717 || (parts.index && GET_MODE (parts.index) == SImode)))
24718 len++;
24719
24720 base = parts.base;
24721 index = parts.index;
24722 disp = parts.disp;
24723
24724 if (base && GET_CODE (base) == SUBREG)
24725 base = SUBREG_REG (base);
24726 if (index && GET_CODE (index) == SUBREG)
24727 index = SUBREG_REG (index);
24728
24729 gcc_assert (base == NULL_RTX || REG_P (base));
24730 gcc_assert (index == NULL_RTX || REG_P (index));
24731
24732 /* Rule of thumb:
24733 - esp as the base always wants an index,
24734 - ebp as the base always wants a displacement,
24735 - r12 as the base always wants an index,
24736 - r13 as the base always wants a displacement. */
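
/* In encoding terms: r/m == 4 always selects a SIB byte, so %esp or %r12
as the base cannot use the short register-indirect form, and mod == 0
with r/m == 5 means disp32 (or %rip-relative in 64-bit mode), so a bare
%ebp or %r13 base has to be emitted as 0(%ebp) / 0(%r13) with a one-byte
displacement. */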
24737
24738 /* Register Indirect. */
24739 if (base && !index && !disp)
24740 {
24741 /* esp (for its index) and ebp (for its displacement) need
24742 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
24743 code. */
24744 if (base == arg_pointer_rtx
24745 || base == frame_pointer_rtx
24746 || REGNO (base) == SP_REG
24747 || REGNO (base) == BP_REG
24748 || REGNO (base) == R12_REG
24749 || REGNO (base) == R13_REG)
24750 len++;
24751 }
24752
24753 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
24754 is not disp32, but disp32(%rip), so for disp32
24755 SIB byte is needed, unless print_operand_address
24756 optimizes it into disp32(%rip) or (%rip) is implied
24757 by UNSPEC. */
24758 else if (disp && !base && !index)
24759 {
24760 len += 4;
24761 if (rip_relative_addr_p (&parts))
24762 len++;
24763 }
24764 else
24765 {
24766 /* Find the length of the displacement constant. */
24767 if (disp)
24768 {
24769 if (base && satisfies_constraint_K (disp))
24770 len += 1;
24771 else
24772 len += 4;
24773 }
24774 /* ebp always wants a displacement. Similarly r13. */
24775 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
24776 len++;
24777
24778 /* An index requires the two-byte modrm form.... */
24779 if (index
24780 /* ...like esp (or r12), which always wants an index. */
24781 || base == arg_pointer_rtx
24782 || base == frame_pointer_rtx
24783 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
24784 len++;
24785 }
24786
24787 return len;
24788 }
24789
24790 /* Compute default value for "length_immediate" attribute. When SHORTFORM
24791 is set, expect that the insn has an 8-bit immediate alternative. */
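/* For instance, with SHORTFORM set, "addl $100, %eax" gets length 1 (the
immediate fits in a sign-extended byte), while "addl $1000, %eax" gets
length 4. */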
24792 int
24793 ix86_attr_length_immediate_default (rtx insn, bool shortform)
24794 {
24795 int len = 0;
24796 int i;
24797 extract_insn_cached (insn);
24798 for (i = recog_data.n_operands - 1; i >= 0; --i)
24799 if (CONSTANT_P (recog_data.operand[i]))
24800 {
24801 enum attr_mode mode = get_attr_mode (insn);
24802
24803 gcc_assert (!len);
24804 if (shortform && CONST_INT_P (recog_data.operand[i]))
24805 {
24806 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
24807 switch (mode)
24808 {
24809 case MODE_QI:
24810 len = 1;
24811 continue;
24812 case MODE_HI:
24813 ival = trunc_int_for_mode (ival, HImode);
24814 break;
24815 case MODE_SI:
24816 ival = trunc_int_for_mode (ival, SImode);
24817 break;
24818 default:
24819 break;
24820 }
24821 if (IN_RANGE (ival, -128, 127))
24822 {
24823 len = 1;
24824 continue;
24825 }
24826 }
24827 switch (mode)
24828 {
24829 case MODE_QI:
24830 len = 1;
24831 break;
24832 case MODE_HI:
24833 len = 2;
24834 break;
24835 case MODE_SI:
24836 len = 4;
24837 break;
24838 /* Immediates for DImode instructions are encoded
24839 as 32bit sign extended values. */
24840 case MODE_DI:
24841 len = 4;
24842 break;
24843 default:
24844 fatal_insn ("unknown insn mode", insn);
24845 }
24846 }
24847 return len;
24848 }
24849
24850 /* Compute default value for "length_address" attribute. */
24851 int
24852 ix86_attr_length_address_default (rtx insn)
24853 {
24854 int i;
24855
24856 if (get_attr_type (insn) == TYPE_LEA)
24857 {
24858 rtx set = PATTERN (insn), addr;
24859
24860 if (GET_CODE (set) == PARALLEL)
24861 set = XVECEXP (set, 0, 0);
24862
24863 gcc_assert (GET_CODE (set) == SET);
24864
24865 addr = SET_SRC (set);
24866
24867 return memory_address_length (addr, true);
24868 }
24869
24870 extract_insn_cached (insn);
24871 for (i = recog_data.n_operands - 1; i >= 0; --i)
24872 if (MEM_P (recog_data.operand[i]))
24873 {
24874 constrain_operands_cached (reload_completed);
24875 if (which_alternative != -1)
24876 {
24877 const char *constraints = recog_data.constraints[i];
24878 int alt = which_alternative;
24879
24880 while (*constraints == '=' || *constraints == '+')
24881 constraints++;
24882 while (alt-- > 0)
24883 while (*constraints++ != ',')
24884 ;
24885 /* Skip ignored operands. */
24886 if (*constraints == 'X')
24887 continue;
24888 }
24889 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24890 }
24891 return 0;
24892 }
24893
24894 /* Compute default value for "length_vex" attribute. It includes
24895 2 or 3 byte VEX prefix and 1 opcode byte. */
24896
24897 int
24898 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24899 {
24900 int i;
24901
24902 /* Only the 0f opcode map can use the 2-byte VEX prefix; the VEX.W bit
24903 requires the 3-byte VEX prefix. */
24904 if (!has_0f_opcode || has_vex_w)
24905 return 3 + 1;
24906
24907 /* We can always use 2 byte VEX prefix in 32bit. */
24908 if (!TARGET_64BIT)
24909 return 2 + 1;
24910
24911 extract_insn_cached (insn);
24912
24913 for (i = recog_data.n_operands - 1; i >= 0; --i)
24914 if (REG_P (recog_data.operand[i]))
24915 {
24916 /* REX.W bit uses 3 byte VEX prefix. */
24917 if (GET_MODE (recog_data.operand[i]) == DImode
24918 && GENERAL_REG_P (recog_data.operand[i]))
24919 return 3 + 1;
24920 }
24921 else
24922 {
24923 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24924 if (MEM_P (recog_data.operand[i])
24925 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24926 return 3 + 1;
24927 }
24928
24929 return 2 + 1;
24930 }
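
/* For example, in 64-bit code a plain "vaddps %xmm2, %xmm1, %xmm0" gets
2 (VEX) + 1 (opcode) == 3, while an insn with a DImode general register
operand, or with a memory operand mentioning %r8-%r15, needs the 3-byte
VEX prefix and gets 4. */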
24931 \f
24932 /* Return the maximum number of instructions a cpu can issue. */
24933
24934 static int
24935 ix86_issue_rate (void)
24936 {
24937 switch (ix86_tune)
24938 {
24939 case PROCESSOR_PENTIUM:
24940 case PROCESSOR_ATOM:
24941 case PROCESSOR_SLM:
24942 case PROCESSOR_K6:
24943 case PROCESSOR_BTVER2:
24944 case PROCESSOR_PENTIUM4:
24945 case PROCESSOR_NOCONA:
24946 return 2;
24947
24948 case PROCESSOR_PENTIUMPRO:
24949 case PROCESSOR_ATHLON:
24950 case PROCESSOR_K8:
24951 case PROCESSOR_AMDFAM10:
24952 case PROCESSOR_GENERIC:
24953 case PROCESSOR_BTVER1:
24954 return 3;
24955
24956 case PROCESSOR_BDVER1:
24957 case PROCESSOR_BDVER2:
24958 case PROCESSOR_BDVER3:
24959 case PROCESSOR_BDVER4:
24960 case PROCESSOR_CORE2:
24961 case PROCESSOR_COREI7:
24962 case PROCESSOR_COREI7_AVX:
24963 case PROCESSOR_HASWELL:
24964 return 4;
24965
24966 default:
24967 return 1;
24968 }
24969 }
24970
24971 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
24972 set by DEP_INSN and nothing else set by DEP_INSN. */
24973
24974 static bool
24975 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24976 {
24977 rtx set, set2;
24978
24979 /* Simplify the test for uninteresting insns. */
24980 if (insn_type != TYPE_SETCC
24981 && insn_type != TYPE_ICMOV
24982 && insn_type != TYPE_FCMOV
24983 && insn_type != TYPE_IBR)
24984 return false;
24985
24986 if ((set = single_set (dep_insn)) != 0)
24987 {
24988 set = SET_DEST (set);
24989 set2 = NULL_RTX;
24990 }
24991 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24992 && XVECLEN (PATTERN (dep_insn), 0) == 2
24993 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24994 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24995 {
24996 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24997 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24998 }
24999 else
25000 return false;
25001
25002 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
25003 return false;
25004
25005 /* This test is true if the dependent insn reads the flags but
25006 not any other potentially set register. */
25007 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
25008 return false;
25009
25010 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
25011 return false;
25012
25013 return true;
25014 }
25015
25016 /* Return true iff USE_INSN has a memory address with operands set by
25017 SET_INSN. */
25018
25019 bool
25020 ix86_agi_dependent (rtx set_insn, rtx use_insn)
25021 {
25022 int i;
25023 extract_insn_cached (use_insn);
25024 for (i = recog_data.n_operands - 1; i >= 0; --i)
25025 if (MEM_P (recog_data.operand[i]))
25026 {
25027 rtx addr = XEXP (recog_data.operand[i], 0);
25028 return modified_in_p (addr, set_insn) != 0;
25029 }
25030 return false;
25031 }
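
/* For example, "addl $4, %esi" followed by "movl (%esi), %eax" is such a
pair: the load's address uses a register modified by the first insn,
which is what the Pentium AGI penalty below is about. */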
25032
25033 /* Helper function for exact_store_load_dependency.
25034 Return true if addr is found in insn. */
25035 static bool
25036 exact_dependency_1 (rtx addr, rtx insn)
25037 {
25038 enum rtx_code code;
25039 const char *format_ptr;
25040 int i, j;
25041
25042 code = GET_CODE (insn);
25043 switch (code)
25044 {
25045 case MEM:
25046 if (rtx_equal_p (addr, insn))
25047 return true;
25048 break;
25049 case REG:
25050 CASE_CONST_ANY:
25051 case SYMBOL_REF:
25052 case CODE_LABEL:
25053 case PC:
25054 case CC0:
25055 case EXPR_LIST:
25056 return false;
25057 default:
25058 break;
25059 }
25060
25061 format_ptr = GET_RTX_FORMAT (code);
25062 for (i = 0; i < GET_RTX_LENGTH (code); i++)
25063 {
25064 switch (*format_ptr++)
25065 {
25066 case 'e':
25067 if (exact_dependency_1 (addr, XEXP (insn, i)))
25068 return true;
25069 break;
25070 case 'E':
25071 for (j = 0; j < XVECLEN (insn, i); j++)
25072 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
25073 return true;
25074 break;
25075 }
25076 }
25077 return false;
25078 }
25079
25080 /* Return true if there exists exact dependency for store & load, i.e.
25081 the same memory address is used in them. */
25082 static bool
25083 exact_store_load_dependency (rtx store, rtx load)
25084 {
25085 rtx set1, set2;
25086
25087 set1 = single_set (store);
25088 if (!set1)
25089 return false;
25090 if (!MEM_P (SET_DEST (set1)))
25091 return false;
25092 set2 = single_set (load);
25093 if (!set2)
25094 return false;
25095 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
25096 return true;
25097 return false;
25098 }
25099
25100 static int
25101 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
25102 {
25103 enum attr_type insn_type, dep_insn_type;
25104 enum attr_memory memory;
25105 rtx set, set2;
25106 int dep_insn_code_number;
25107
25108 /* Anti and output dependencies have zero cost on all CPUs. */
25109 if (REG_NOTE_KIND (link) != 0)
25110 return 0;
25111
25112 dep_insn_code_number = recog_memoized (dep_insn);
25113
25114 /* If we can't recognize the insns, we can't really do anything. */
25115 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
25116 return cost;
25117
25118 insn_type = get_attr_type (insn);
25119 dep_insn_type = get_attr_type (dep_insn);
25120
25121 switch (ix86_tune)
25122 {
25123 case PROCESSOR_PENTIUM:
25124 /* Address Generation Interlock adds a cycle of latency. */
25125 if (insn_type == TYPE_LEA)
25126 {
25127 rtx addr = PATTERN (insn);
25128
25129 if (GET_CODE (addr) == PARALLEL)
25130 addr = XVECEXP (addr, 0, 0);
25131
25132 gcc_assert (GET_CODE (addr) == SET);
25133
25134 addr = SET_SRC (addr);
25135 if (modified_in_p (addr, dep_insn))
25136 cost += 1;
25137 }
25138 else if (ix86_agi_dependent (dep_insn, insn))
25139 cost += 1;
25140
25141 /* ??? Compares pair with jump/setcc. */
25142 if (ix86_flags_dependent (insn, dep_insn, insn_type))
25143 cost = 0;
25144
25145 /* Floating point stores require value to be ready one cycle earlier. */
25146 if (insn_type == TYPE_FMOV
25147 && get_attr_memory (insn) == MEMORY_STORE
25148 && !ix86_agi_dependent (dep_insn, insn))
25149 cost += 1;
25150 break;
25151
25152 case PROCESSOR_PENTIUMPRO:
25153 memory = get_attr_memory (insn);
25154
25155 /* INT->FP conversion is expensive. */
25156 if (get_attr_fp_int_src (dep_insn))
25157 cost += 5;
25158
25159 /* There is one cycle extra latency between an FP op and a store. */
25160 if (insn_type == TYPE_FMOV
25161 && (set = single_set (dep_insn)) != NULL_RTX
25162 && (set2 = single_set (insn)) != NULL_RTX
25163 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
25164 && MEM_P (SET_DEST (set2)))
25165 cost += 1;
25166
25167 /* Show the ability of the reorder buffer to hide the latency of a load by
25168 executing it in parallel with the previous instruction, in case the
25169 previous instruction is not needed to compute the address. */
25170 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25171 && !ix86_agi_dependent (dep_insn, insn))
25172 {
25173 /* Claim moves to take one cycle, as the core can issue one load
25174 at a time and the next load can start a cycle later. */
25175 if (dep_insn_type == TYPE_IMOV
25176 || dep_insn_type == TYPE_FMOV)
25177 cost = 1;
25178 else if (cost > 1)
25179 cost--;
25180 }
25181 break;
25182
25183 case PROCESSOR_K6:
25184 memory = get_attr_memory (insn);
25185
25186 /* The esp dependency is resolved before the instruction is really
25187 finished. */
25188 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25189 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25190 return 1;
25191
25192 /* INT->FP conversion is expensive. */
25193 if (get_attr_fp_int_src (dep_insn))
25194 cost += 5;
25195
25196 /* Show the ability of the reorder buffer to hide the latency of a load by
25197 executing it in parallel with the previous instruction, in case the
25198 previous instruction is not needed to compute the address. */
25199 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25200 && !ix86_agi_dependent (dep_insn, insn))
25201 {
25202 /* Claim moves to take one cycle, as the core can issue one load
25203 at a time and the next load can start a cycle later. */
25204 if (dep_insn_type == TYPE_IMOV
25205 || dep_insn_type == TYPE_FMOV)
25206 cost = 1;
25207 else if (cost > 2)
25208 cost -= 2;
25209 else
25210 cost = 1;
25211 }
25212 break;
25213
25214 case PROCESSOR_ATHLON:
25215 case PROCESSOR_K8:
25216 case PROCESSOR_AMDFAM10:
25217 case PROCESSOR_BDVER1:
25218 case PROCESSOR_BDVER2:
25219 case PROCESSOR_BDVER3:
25220 case PROCESSOR_BDVER4:
25221 case PROCESSOR_BTVER1:
25222 case PROCESSOR_BTVER2:
25223 case PROCESSOR_GENERIC:
25224 memory = get_attr_memory (insn);
25225
25226 /* The stack engine allows push&pop instructions to execute in parallel. */
25227 if (((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25228 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25229 && (ix86_tune != PROCESSOR_ATHLON && ix86_tune != PROCESSOR_K8))
25230 return 0;
25231
25232 /* Show the ability of the reorder buffer to hide the latency of a load by
25233 executing it in parallel with the previous instruction, in case the
25234 previous instruction is not needed to compute the address. */
25235 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25236 && !ix86_agi_dependent (dep_insn, insn))
25237 {
25238 enum attr_unit unit = get_attr_unit (insn);
25239 int loadcost = 3;
25240
25241 /* Because of the difference between the length of integer and
25242 floating unit pipeline preparation stages, the memory operands
25243 for floating point are cheaper.
25244
25245 ??? For Athlon the difference is most probably 2. */
25246 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
25247 loadcost = 3;
25248 else
25249 loadcost = TARGET_ATHLON ? 2 : 0;
25250
25251 if (cost >= loadcost)
25252 cost -= loadcost;
25253 else
25254 cost = 0;
25255 }
25256 break;
25257
25258 case PROCESSOR_CORE2:
25259 case PROCESSOR_COREI7:
25260 case PROCESSOR_COREI7_AVX:
25261 case PROCESSOR_HASWELL:
25262 memory = get_attr_memory (insn);
25263
25264 /* The stack engine allows push&pop instructions to execute in parallel. */
25265 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25266 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25267 return 0;
25268
25269 /* Show the ability of the reorder buffer to hide the latency of a load by
25270 executing it in parallel with the previous instruction, in case the
25271 previous instruction is not needed to compute the address. */
25272 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25273 && !ix86_agi_dependent (dep_insn, insn))
25274 {
25275 if (cost >= 4)
25276 cost -= 4;
25277 else
25278 cost = 0;
25279 }
25280 break;
25281
25282 case PROCESSOR_SLM:
25283 if (!reload_completed)
25284 return cost;
25285
25286 /* Increase cost of integer loads. */
25287 memory = get_attr_memory (dep_insn);
25288 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25289 {
25290 enum attr_unit unit = get_attr_unit (dep_insn);
25291 if (unit == UNIT_INTEGER && cost == 1)
25292 {
25293 if (memory == MEMORY_LOAD)
25294 cost = 3;
25295 else
25296 {
25297 /* Increase cost of ld/st for short int types only
25298 because of store forwarding issue. */
25299 rtx set = single_set (dep_insn);
25300 if (set && (GET_MODE (SET_DEST (set)) == QImode
25301 || GET_MODE (SET_DEST (set)) == HImode))
25302 {
25303 /* Increase cost of store/load insn if exact
25304 dependence exists and it is load insn. */
25305 enum attr_memory insn_memory = get_attr_memory (insn);
25306 if (insn_memory == MEMORY_LOAD
25307 && exact_store_load_dependency (dep_insn, insn))
25308 cost = 3;
25309 }
25310 }
25311 }
25312 }
25313
25314 default:
25315 break;
25316 }
25317
25318 return cost;
25319 }
25320
25321 /* How many alternative schedules to try. This should be as wide as the
25322 scheduling freedom in the DFA, but no wider. Making this value too
25323 large results in extra work for the scheduler. */
25324
25325 static int
25326 ia32_multipass_dfa_lookahead (void)
25327 {
25328 switch (ix86_tune)
25329 {
25330 case PROCESSOR_PENTIUM:
25331 return 2;
25332
25333 case PROCESSOR_PENTIUMPRO:
25334 case PROCESSOR_K6:
25335 return 1;
25336
25337 case PROCESSOR_BDVER1:
25338 case PROCESSOR_BDVER2:
25339 case PROCESSOR_BDVER3:
25340 case PROCESSOR_BDVER4:
25341 /* We use lookahead value 4 for BD both before and after reload
25342 schedules. The plan is to have value 8 included for -O3. */
25343 return 4;
25344
25345 case PROCESSOR_CORE2:
25346 case PROCESSOR_COREI7:
25347 case PROCESSOR_COREI7_AVX:
25348 case PROCESSOR_HASWELL:
25349 case PROCESSOR_ATOM:
25350 case PROCESSOR_SLM:
25351 /* Generally, we want haifa-sched:max_issue() to look ahead as far as
25352 the number of instructions that can be executed in a cycle, i.e.,
25353 issue_rate. I wonder why tuning for many CPUs does not do this. */
25354 if (reload_completed)
25355 return ix86_issue_rate ();
25356 /* Don't use lookahead for pre-reload schedule to save compile time. */
25357 return 0;
25358
25359 default:
25360 return 0;
25361 }
25362 }
25363
25364 /* Return true if target platform supports macro-fusion. */
25365
25366 static bool
25367 ix86_macro_fusion_p ()
25368 {
25369 return TARGET_FUSE_CMP_AND_BRANCH;
25370 }
25371
25372 /* Check whether the current microarchitecture supports macro fusion
25373 for the insn pair "CONDGEN + CONDJMP". Refer to the
25374 "Intel Architectures Optimization Reference Manual". */
25375
25376 static bool
25377 ix86_macro_fusion_pair_p (rtx condgen, rtx condjmp)
25378 {
25379 rtx src, dest;
25380 rtx single_set = single_set (condgen);
25381 enum rtx_code ccode;
25382 rtx compare_set = NULL_RTX, test_if, cond;
25383 rtx alu_set = NULL_RTX, addr = NULL_RTX;
25384
25385 if (get_attr_type (condgen) != TYPE_TEST
25386 && get_attr_type (condgen) != TYPE_ICMP
25387 && get_attr_type (condgen) != TYPE_INCDEC
25388 && get_attr_type (condgen) != TYPE_ALU)
25389 return false;
25390
25391 if (single_set == NULL_RTX
25392 && !TARGET_FUSE_ALU_AND_BRANCH)
25393 return false;
25394
25395 if (single_set != NULL_RTX)
25396 compare_set = single_set;
25397 else
25398 {
25399 int i;
25400 rtx pat = PATTERN (condgen);
25401 for (i = 0; i < XVECLEN (pat, 0); i++)
25402 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
25403 {
25404 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
25405 if (GET_CODE (set_src) == COMPARE)
25406 compare_set = XVECEXP (pat, 0, i);
25407 else
25408 alu_set = XVECEXP (pat, 0, i);
25409 }
25410 }
25411 if (compare_set == NULL_RTX)
25412 return false;
25413 src = SET_SRC (compare_set);
25414 if (GET_CODE (src) != COMPARE)
25415 return false;
25416
25417 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
25418 supported. */
25419 if ((MEM_P (XEXP (src, 0))
25420 && CONST_INT_P (XEXP (src, 1)))
25421 || (MEM_P (XEXP (src, 1))
25422 && CONST_INT_P (XEXP (src, 0))))
25423 return false;
25424
25425 /* No fusion for RIP-relative address. */
25426 if (MEM_P (XEXP (src, 0)))
25427 addr = XEXP (XEXP (src, 0), 0);
25428 else if (MEM_P (XEXP (src, 1)))
25429 addr = XEXP (XEXP (src, 1), 0);
25430
25431 if (addr)
25432 {
25433 ix86_address parts;
25434 int ok = ix86_decompose_address (addr, &parts);
25435 gcc_assert (ok);
25436 if (rip_relative_addr_p (&parts))
25437 return false;
25438 }
25439
25440 test_if = SET_SRC (pc_set (condjmp));
25441 cond = XEXP (test_if, 0);
25442 ccode = GET_CODE (cond);
25443 /* Check whether the conditional jump uses the Sign or Overflow flags. */
25444 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
25445 && (ccode == GE
25446 || ccode == GT
25447 || ccode == LE
25448 || ccode == LT))
25449 return false;
25450
25451 /* Return true for TYPE_TEST and TYPE_ICMP. */
25452 if (get_attr_type (condgen) == TYPE_TEST
25453 || get_attr_type (condgen) == TYPE_ICMP)
25454 return true;
25455
25456 /* The following handles the case of macro-fusion for alu + jmp. */
25457 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
25458 return false;
25459
25460 /* No fusion for alu op with memory destination operand. */
25461 dest = SET_DEST (alu_set);
25462 if (MEM_P (dest))
25463 return false;
25464
25465 /* Macro-fusion for inc/dec + unsigned conditional jump is not
25466 supported. */
25467 if (get_attr_type (condgen) == TYPE_INCDEC
25468 && (ccode == GEU
25469 || ccode == GTU
25470 || ccode == LEU
25471 || ccode == LTU))
25472 return false;
25473
25474 return true;
25475 }
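
/* A few illustrative CONDGEN + CONDJMP pairs, assuming the relevant
   TARGET_FUSE_* flags are enabled for the CPU being tuned for:

       cmpl  $0, %eax   ; je  .L1      fusible (reg-imm compare + jcc)
       cmpl  $1, (%esp) ; je  .L1      not fusible (cmp MEM-IMM + jcc)
       decl  %eax       ; jae .L1      not fusible (inc/dec + unsigned jcc)  */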
25476
25477 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
25478 execution. It is applied if
25479 (1) an IMUL instruction is on the top of the list;
25480 (2) there is exactly one producer of an independent IMUL instruction in
25481 the ready list.
25482 Return the index of the IMUL producer if it was found and -1 otherwise. */
25483 static int
25484 do_reorder_for_imul (rtx *ready, int n_ready)
25485 {
25486 rtx insn, set, insn1, insn2;
25487 sd_iterator_def sd_it;
25488 dep_t dep;
25489 int index = -1;
25490 int i;
25491
25492 if (ix86_tune != PROCESSOR_ATOM)
25493 return index;
25494
25495 /* Check that IMUL instruction is on the top of ready list. */
25496 insn = ready[n_ready - 1];
25497 set = single_set (insn);
25498 if (!set)
25499 return index;
25500 if (!(GET_CODE (SET_SRC (set)) == MULT
25501 && GET_MODE (SET_SRC (set)) == SImode))
25502 return index;
25503
25504 /* Search for producer of independent IMUL instruction. */
25505 for (i = n_ready - 2; i >= 0; i--)
25506 {
25507 insn = ready[i];
25508 if (!NONDEBUG_INSN_P (insn))
25509 continue;
25510 /* Skip IMUL instruction. */
25511 insn2 = PATTERN (insn);
25512 if (GET_CODE (insn2) == PARALLEL)
25513 insn2 = XVECEXP (insn2, 0, 0);
25514 if (GET_CODE (insn2) == SET
25515 && GET_CODE (SET_SRC (insn2)) == MULT
25516 && GET_MODE (SET_SRC (insn2)) == SImode)
25517 continue;
25518
25519 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
25520 {
25521 rtx con;
25522 con = DEP_CON (dep);
25523 if (!NONDEBUG_INSN_P (con))
25524 continue;
25525 insn1 = PATTERN (con);
25526 if (GET_CODE (insn1) == PARALLEL)
25527 insn1 = XVECEXP (insn1, 0, 0);
25528
25529 if (GET_CODE (insn1) == SET
25530 && GET_CODE (SET_SRC (insn1)) == MULT
25531 && GET_MODE (SET_SRC (insn1)) == SImode)
25532 {
25533 sd_iterator_def sd_it1;
25534 dep_t dep1;
25535 /* Check that there is no other producer for the IMUL. */
25536 index = i;
25537 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
25538 {
25539 rtx pro;
25540 pro = DEP_PRO (dep1);
25541 if (!NONDEBUG_INSN_P (pro))
25542 continue;
25543 if (pro != insn)
25544 index = -1;
25545 }
25546 if (index >= 0)
25547 break;
25548 }
25549 }
25550 if (index >= 0)
25551 break;
25552 }
25553 return index;
25554 }
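
/* For example, if an SImode multiply sits on top of the ready list and some
   other ready insn is the sole producer of a second, independent SImode
   multiply, that producer is moved to the top.  Issuing it first lets the
   second multiply become ready sooner, so the multiplies can be fed back to
   back into Atom's pipelined IMUL unit.  */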
25555
25556 /* Try to find the best candidate for the top of the ready list if two insns
25557 have the same priority - the candidate is best if its producers were
25558 scheduled earlier. Applied for Silvermont only.
25559 Return true if the top 2 insns must be interchanged. */
25560 static bool
25561 swap_top_of_ready_list (rtx *ready, int n_ready)
25562 {
25563 rtx top = ready[n_ready - 1];
25564 rtx next = ready[n_ready - 2];
25565 rtx set;
25566 sd_iterator_def sd_it;
25567 dep_t dep;
25568 int clock1 = -1;
25569 int clock2 = -1;
25570 #define INSN_TICK(INSN) (HID (INSN)->tick)
25571
25572 if (ix86_tune != PROCESSOR_SLM)
25573 return false;
25574
25575 if (!NONDEBUG_INSN_P (top))
25576 return false;
25577 if (!NONJUMP_INSN_P (top))
25578 return false;
25579 if (!NONDEBUG_INSN_P (next))
25580 return false;
25581 if (!NONJUMP_INSN_P (next))
25582 return false;
25583 set = single_set (top);
25584 if (!set)
25585 return false;
25586 set = single_set (next);
25587 if (!set)
25588 return false;
25589
25590 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
25591 {
25592 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
25593 return false;
25594 /* Determine the winner more precisely. */
25595 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
25596 {
25597 rtx pro;
25598 pro = DEP_PRO (dep);
25599 if (!NONDEBUG_INSN_P (pro))
25600 continue;
25601 if (INSN_TICK (pro) > clock1)
25602 clock1 = INSN_TICK (pro);
25603 }
25604 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
25605 {
25606 rtx pro;
25607 pro = DEP_PRO (dep);
25608 if (!NONDEBUG_INSN_P (pro))
25609 continue;
25610 if (INSN_TICK (pro) > clock2)
25611 clock2 = INSN_TICK (pro);
25612 }
25613
25614 if (clock1 == clock2)
25615 {
25616 /* Determine winner - load must win. */
25617 enum attr_memory memory1, memory2;
25618 memory1 = get_attr_memory (top);
25619 memory2 = get_attr_memory (next);
25620 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
25621 return true;
25622 }
25623 return (bool) (clock2 < clock1);
25624 }
25625 return false;
25626 #undef INSN_TICK
25627 }
25628
25629 /* Perform possible reordering of the ready list for Atom/Silvermont only.
25630 Return issue rate. */
25631 static int
25632 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
25633 int clock_var)
25634 {
25635 int issue_rate = -1;
25636 int n_ready = *pn_ready;
25637 int i;
25638 rtx insn;
25639 int index = -1;
25640
25641 /* Set up issue rate. */
25642 issue_rate = ix86_issue_rate ();
25643
25644 /* Do reordering for Atom/SLM only. */
25645 if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM)
25646 return issue_rate;
25647
25648 /* Nothing to do if ready list contains only 1 instruction. */
25649 if (n_ready <= 1)
25650 return issue_rate;
25651
25652 /* Do reordering for the post-reload scheduler only. */
25653 if (!reload_completed)
25654 return issue_rate;
25655
25656 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
25657 {
25658 if (sched_verbose > 1)
25659 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
25660 INSN_UID (ready[index]));
25661
25662 /* Put IMUL producer (ready[index]) at the top of ready list. */
25663 insn = ready[index];
25664 for (i = index; i < n_ready - 1; i++)
25665 ready[i] = ready[i + 1];
25666 ready[n_ready - 1] = insn;
25667 return issue_rate;
25668 }
25669 if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready))
25670 {
25671 if (sched_verbose > 1)
25672 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
25673 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
25674 /* Swap 2 top elements of ready list. */
25675 insn = ready[n_ready - 1];
25676 ready[n_ready - 1] = ready[n_ready - 2];
25677 ready[n_ready - 2] = insn;
25678 }
25679 return issue_rate;
25680 }
25681
25682 static bool
25683 ix86_class_likely_spilled_p (reg_class_t);
25684
25685 /* Return true if the lhs of INSN is a HW function argument register; set
25686 *IS_SPILLED to true if it is a likely spilled HW register. */
25687 static bool
25688 insn_is_function_arg (rtx insn, bool* is_spilled)
25689 {
25690 rtx dst;
25691
25692 if (!NONDEBUG_INSN_P (insn))
25693 return false;
25694 /* Call instructions are not movable, ignore them. */
25695 if (CALL_P (insn))
25696 return false;
25697 insn = PATTERN (insn);
25698 if (GET_CODE (insn) == PARALLEL)
25699 insn = XVECEXP (insn, 0, 0);
25700 if (GET_CODE (insn) != SET)
25701 return false;
25702 dst = SET_DEST (insn);
25703 if (REG_P (dst) && HARD_REGISTER_P (dst)
25704 && ix86_function_arg_regno_p (REGNO (dst)))
25705 {
25706 /* Is it likely spilled HW register? */
25707 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
25708 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
25709 *is_spilled = true;
25710 return true;
25711 }
25712 return false;
25713 }
25714
25715 /* Add output dependencies for a chain of adjacent function arguments, but
25716 only if there is a move to a likely spilled HW register. Return the first
25717 argument if at least one dependence was added, or NULL otherwise. */
25718 static rtx
25719 add_parameter_dependencies (rtx call, rtx head)
25720 {
25721 rtx insn;
25722 rtx last = call;
25723 rtx first_arg = NULL;
25724 bool is_spilled = false;
25725
25726 head = PREV_INSN (head);
25727
25728 /* Find the argument-passing instruction nearest to the call. */
25729 while (true)
25730 {
25731 last = PREV_INSN (last);
25732 if (last == head)
25733 return NULL;
25734 if (!NONDEBUG_INSN_P (last))
25735 continue;
25736 if (insn_is_function_arg (last, &is_spilled))
25737 break;
25738 return NULL;
25739 }
25740
25741 first_arg = last;
25742 while (true)
25743 {
25744 insn = PREV_INSN (last);
25745 if (!INSN_P (insn))
25746 break;
25747 if (insn == head)
25748 break;
25749 if (!NONDEBUG_INSN_P (insn))
25750 {
25751 last = insn;
25752 continue;
25753 }
25754 if (insn_is_function_arg (insn, &is_spilled))
25755 {
25756 /* Add an output dependence between two function arguments if the chain
25757 of output arguments contains likely spilled HW registers. */
25758 if (is_spilled)
25759 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25760 first_arg = last = insn;
25761 }
25762 else
25763 break;
25764 }
25765 if (!is_spilled)
25766 return NULL;
25767 return first_arg;
25768 }
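
/* For example, the argument moves feeding a call such as

       movl ..., %edi
       movl ..., %esi
       call foo

   form such a chain (64-bit argument registers shown for illustration).
   Chaining them with output dependencies keeps the pre-reload scheduler
   from pulling unrelated work in between and stretching the lifetimes of
   these likely spilled hard registers.  */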
25769
25770 /* Add output or anti dependency from insn to first_arg to restrict its code
25771 motion. */
25772 static void
25773 avoid_func_arg_motion (rtx first_arg, rtx insn)
25774 {
25775 rtx set;
25776 rtx tmp;
25777
25778 set = single_set (insn);
25779 if (!set)
25780 return;
25781 tmp = SET_DEST (set);
25782 if (REG_P (tmp))
25783 {
25784 /* Add output dependency to the first function argument. */
25785 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25786 return;
25787 }
25788 /* Add anti dependency. */
25789 add_dependence (first_arg, insn, REG_DEP_ANTI);
25790 }
25791
25792 /* Avoid cross-block motion of a function argument by adding a dependency
25793 from the first non-jump instruction in bb. */
25794 static void
25795 add_dependee_for_func_arg (rtx arg, basic_block bb)
25796 {
25797 rtx insn = BB_END (bb);
25798
25799 while (insn)
25800 {
25801 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
25802 {
25803 rtx set = single_set (insn);
25804 if (set)
25805 {
25806 avoid_func_arg_motion (arg, insn);
25807 return;
25808 }
25809 }
25810 if (insn == BB_HEAD (bb))
25811 return;
25812 insn = PREV_INSN (insn);
25813 }
25814 }
25815
25816 /* Hook for pre-reload schedule - avoid motion of function arguments
25817 passed in likely spilled HW registers. */
25818 static void
25819 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
25820 {
25821 rtx insn;
25822 rtx first_arg = NULL;
25823 if (reload_completed)
25824 return;
25825 while (head != tail && DEBUG_INSN_P (head))
25826 head = NEXT_INSN (head);
25827 for (insn = tail; insn != head; insn = PREV_INSN (insn))
25828 if (INSN_P (insn) && CALL_P (insn))
25829 {
25830 first_arg = add_parameter_dependencies (insn, head);
25831 if (first_arg)
25832 {
25833 /* Add a dependee for the first argument to predecessors, but only
25834 if the region contains more than one block. */
25835 basic_block bb = BLOCK_FOR_INSN (insn);
25836 int rgn = CONTAINING_RGN (bb->index);
25837 int nr_blks = RGN_NR_BLOCKS (rgn);
25838 /* Skip trivial regions and region head blocks that can have
25839 predecessors outside of region. */
25840 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
25841 {
25842 edge e;
25843 edge_iterator ei;
25844 /* Assume that region is SCC, i.e. all immediate predecessors
25845 of non-head block are in the same region. */
25846 FOR_EACH_EDGE (e, ei, bb->preds)
25847 {
25848 /* Avoid creating loop-carried dependencies by using the
25849 topological ordering in the region. */
25850 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
25851 add_dependee_for_func_arg (first_arg, e->src);
25852 }
25853 }
25854 insn = first_arg;
25855 if (insn == head)
25856 break;
25857 }
25858 }
25859 else if (first_arg)
25860 avoid_func_arg_motion (first_arg, insn);
25861 }
25862
25863 /* Hook for pre-reload schedule - set the priority of moves from likely
25864 spilled HW registers to maximum, to schedule them as soon as possible.
25865 These are moves from function argument registers at the top of the function
25866 entry and moves from function return value registers after a call. */
25867 static int
25868 ix86_adjust_priority (rtx insn, int priority)
25869 {
25870 rtx set;
25871
25872 if (reload_completed)
25873 return priority;
25874
25875 if (!NONDEBUG_INSN_P (insn))
25876 return priority;
25877
25878 set = single_set (insn);
25879 if (set)
25880 {
25881 rtx tmp = SET_SRC (set);
25882 if (REG_P (tmp)
25883 && HARD_REGISTER_P (tmp)
25884 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
25885 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
25886 return current_sched_info->sched_max_insns_priority;
25887 }
25888
25889 return priority;
25890 }
25891
25892 /* Model the decoder of Core 2/i7.
25893 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
25894 track the instruction fetch block boundaries and make sure that long
25895 (9+ bytes) instructions are assigned to D0. */
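
/* For example, with the parameters set below in ix86_sched_init_global (a
   16-byte fetch block, at most 6 insns per cycle, and an 8-byte limit for
   the secondary decoders), a 10-byte insn is only issued as the first insn
   of a cycle (decoder D0); once 16 bytes or 6 insns have been consumed, the
   remaining ready insns are masked out until the next cycle.  */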
25896
25897 /* Maximum length of an insn that can be handled by
25898 a secondary decoder unit. '8' for Core 2/i7. */
25899 static int core2i7_secondary_decoder_max_insn_size;
25900
25901 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
25902 '16' for Core 2/i7. */
25903 static int core2i7_ifetch_block_size;
25904
25905 /* Maximum number of instructions decoder can handle per cycle.
25906 '6' for Core 2/i7. */
25907 static int core2i7_ifetch_block_max_insns;
25908
25909 typedef struct ix86_first_cycle_multipass_data_ *
25910 ix86_first_cycle_multipass_data_t;
25911 typedef const struct ix86_first_cycle_multipass_data_ *
25912 const_ix86_first_cycle_multipass_data_t;
25913
25914 /* A variable to store target state across calls to max_issue within
25915 one cycle. */
25916 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
25917 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
25918
25919 /* Initialize DATA. */
25920 static void
25921 core2i7_first_cycle_multipass_init (void *_data)
25922 {
25923 ix86_first_cycle_multipass_data_t data
25924 = (ix86_first_cycle_multipass_data_t) _data;
25925
25926 data->ifetch_block_len = 0;
25927 data->ifetch_block_n_insns = 0;
25928 data->ready_try_change = NULL;
25929 data->ready_try_change_size = 0;
25930 }
25931
25932 /* Advancing the cycle; reset ifetch block counts. */
25933 static void
25934 core2i7_dfa_post_advance_cycle (void)
25935 {
25936 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
25937
25938 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25939
25940 data->ifetch_block_len = 0;
25941 data->ifetch_block_n_insns = 0;
25942 }
25943
25944 static int min_insn_size (rtx);
25945
25946 /* Filter out insns from ready_try that the core will not be able to issue
25947 on the current cycle due to decoder restrictions. */
25948 static void
25949 core2i7_first_cycle_multipass_filter_ready_try
25950 (const_ix86_first_cycle_multipass_data_t data,
25951 char *ready_try, int n_ready, bool first_cycle_insn_p)
25952 {
25953 while (n_ready--)
25954 {
25955 rtx insn;
25956 int insn_size;
25957
25958 if (ready_try[n_ready])
25959 continue;
25960
25961 insn = get_ready_element (n_ready);
25962 insn_size = min_insn_size (insn);
25963
25964 if (/* If this is too long an insn for a secondary decoder ... */
25965 (!first_cycle_insn_p
25966 && insn_size > core2i7_secondary_decoder_max_insn_size)
25967 /* ... or it would not fit into the ifetch block ... */
25968 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
25969 /* ... or the decoder is full already ... */
25970 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
25971 /* ... mask the insn out. */
25972 {
25973 ready_try[n_ready] = 1;
25974
25975 if (data->ready_try_change)
25976 bitmap_set_bit (data->ready_try_change, n_ready);
25977 }
25978 }
25979 }
25980
25981 /* Prepare for a new round of multipass lookahead scheduling. */
25982 static void
25983 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
25984 bool first_cycle_insn_p)
25985 {
25986 ix86_first_cycle_multipass_data_t data
25987 = (ix86_first_cycle_multipass_data_t) _data;
25988 const_ix86_first_cycle_multipass_data_t prev_data
25989 = ix86_first_cycle_multipass_data;
25990
25991 /* Restore the state from the end of the previous round. */
25992 data->ifetch_block_len = prev_data->ifetch_block_len;
25993 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
25994
25995 /* Filter instructions that cannot be issued on current cycle due to
25996 decoder restrictions. */
25997 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25998 first_cycle_insn_p);
25999 }
26000
26001 /* INSN is being issued in current solution. Account for its impact on
26002 the decoder model. */
26003 static void
26004 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
26005 rtx insn, const void *_prev_data)
26006 {
26007 ix86_first_cycle_multipass_data_t data
26008 = (ix86_first_cycle_multipass_data_t) _data;
26009 const_ix86_first_cycle_multipass_data_t prev_data
26010 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
26011
26012 int insn_size = min_insn_size (insn);
26013
26014 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
26015 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
26016 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
26017 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
26018
26019 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
26020 if (!data->ready_try_change)
26021 {
26022 data->ready_try_change = sbitmap_alloc (n_ready);
26023 data->ready_try_change_size = n_ready;
26024 }
26025 else if (data->ready_try_change_size < n_ready)
26026 {
26027 data->ready_try_change = sbitmap_resize (data->ready_try_change,
26028 n_ready, 0);
26029 data->ready_try_change_size = n_ready;
26030 }
26031 bitmap_clear (data->ready_try_change);
26032
26033 /* Filter out insns from ready_try that the core will not be able to issue
26034 on the current cycle due to decoder restrictions. */
26035 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
26036 false);
26037 }
26038
26039 /* Revert the effect on ready_try. */
26040 static void
26041 core2i7_first_cycle_multipass_backtrack (const void *_data,
26042 char *ready_try,
26043 int n_ready ATTRIBUTE_UNUSED)
26044 {
26045 const_ix86_first_cycle_multipass_data_t data
26046 = (const_ix86_first_cycle_multipass_data_t) _data;
26047 unsigned int i = 0;
26048 sbitmap_iterator sbi;
26049
26050 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
26051 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
26052 {
26053 ready_try[i] = 0;
26054 }
26055 }
26056
26057 /* Save the result of multipass lookahead scheduling for the next round. */
26058 static void
26059 core2i7_first_cycle_multipass_end (const void *_data)
26060 {
26061 const_ix86_first_cycle_multipass_data_t data
26062 = (const_ix86_first_cycle_multipass_data_t) _data;
26063 ix86_first_cycle_multipass_data_t next_data
26064 = ix86_first_cycle_multipass_data;
26065
26066 if (data != NULL)
26067 {
26068 next_data->ifetch_block_len = data->ifetch_block_len;
26069 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
26070 }
26071 }
26072
26073 /* Deallocate target data. */
26074 static void
26075 core2i7_first_cycle_multipass_fini (void *_data)
26076 {
26077 ix86_first_cycle_multipass_data_t data
26078 = (ix86_first_cycle_multipass_data_t) _data;
26079
26080 if (data->ready_try_change)
26081 {
26082 sbitmap_free (data->ready_try_change);
26083 data->ready_try_change = NULL;
26084 data->ready_try_change_size = 0;
26085 }
26086 }
26087
26088 /* Prepare for scheduling pass. */
26089 static void
26090 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
26091 int verbose ATTRIBUTE_UNUSED,
26092 int max_uid ATTRIBUTE_UNUSED)
26093 {
26094 /* Install scheduling hooks for current CPU. Some of these hooks are used
26095 in time-critical parts of the scheduler, so we only set them up when
26096 they are actually used. */
26097 switch (ix86_tune)
26098 {
26099 case PROCESSOR_CORE2:
26100 case PROCESSOR_COREI7:
26101 case PROCESSOR_COREI7_AVX:
26102 case PROCESSOR_HASWELL:
26103 /* Do not perform multipass scheduling for pre-reload schedule
26104 to save compile time. */
26105 if (reload_completed)
26106 {
26107 targetm.sched.dfa_post_advance_cycle
26108 = core2i7_dfa_post_advance_cycle;
26109 targetm.sched.first_cycle_multipass_init
26110 = core2i7_first_cycle_multipass_init;
26111 targetm.sched.first_cycle_multipass_begin
26112 = core2i7_first_cycle_multipass_begin;
26113 targetm.sched.first_cycle_multipass_issue
26114 = core2i7_first_cycle_multipass_issue;
26115 targetm.sched.first_cycle_multipass_backtrack
26116 = core2i7_first_cycle_multipass_backtrack;
26117 targetm.sched.first_cycle_multipass_end
26118 = core2i7_first_cycle_multipass_end;
26119 targetm.sched.first_cycle_multipass_fini
26120 = core2i7_first_cycle_multipass_fini;
26121
26122 /* Set decoder parameters. */
26123 core2i7_secondary_decoder_max_insn_size = 8;
26124 core2i7_ifetch_block_size = 16;
26125 core2i7_ifetch_block_max_insns = 6;
26126 break;
26127 }
26128 /* ... Fall through ... */
26129 default:
26130 targetm.sched.dfa_post_advance_cycle = NULL;
26131 targetm.sched.first_cycle_multipass_init = NULL;
26132 targetm.sched.first_cycle_multipass_begin = NULL;
26133 targetm.sched.first_cycle_multipass_issue = NULL;
26134 targetm.sched.first_cycle_multipass_backtrack = NULL;
26135 targetm.sched.first_cycle_multipass_end = NULL;
26136 targetm.sched.first_cycle_multipass_fini = NULL;
26137 break;
26138 }
26139 }
26140
26141 \f
26142 /* Compute the alignment given to a constant that is being placed in memory.
26143 EXP is the constant and ALIGN is the alignment that the object would
26144 ordinarily have.
26145 The value of this function is used instead of that alignment to align
26146 the object. */
26147
26148 int
26149 ix86_constant_alignment (tree exp, int align)
26150 {
26151 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
26152 || TREE_CODE (exp) == INTEGER_CST)
26153 {
26154 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
26155 return 64;
26156 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
26157 return 128;
26158 }
26159 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
26160 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
26161 return BITS_PER_WORD;
26162
26163 return align;
26164 }
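
/* For example, a DFmode constant pool entry is given at least 64-bit
   alignment, and a 128-bit vector constant at least 128-bit alignment, so
   that aligned SSE loads can be used on it.  */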
26165
26166 /* Compute the alignment for a static variable.
26167 TYPE is the data type, and ALIGN is the alignment that
26168 the object would ordinarily have. The value of this function is used
26169 instead of that alignment to align the object. */
26170
26171 int
26172 ix86_data_alignment (tree type, int align, bool opt)
26173 {
26174 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
26175
26176 if (opt
26177 && AGGREGATE_TYPE_P (type)
26178 && TYPE_SIZE (type)
26179 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26180 && wi::geu_p (TYPE_SIZE (type), max_align)
26181 && align < max_align)
26182 align = max_align;
26183
26184 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
26185 to a 16-byte boundary. */
26186 if (TARGET_64BIT)
26187 {
26188 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
26189 && TYPE_SIZE (type)
26190 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26191 && wi::geu_p (TYPE_SIZE (type), 128)
26192 && align < 128)
26193 return 128;
26194 }
26195
26196 if (!opt)
26197 return align;
26198
26199 if (TREE_CODE (type) == ARRAY_TYPE)
26200 {
26201 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
26202 return 64;
26203 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
26204 return 128;
26205 }
26206 else if (TREE_CODE (type) == COMPLEX_TYPE)
26207 {
26208
26209 if (TYPE_MODE (type) == DCmode && align < 64)
26210 return 64;
26211 if ((TYPE_MODE (type) == XCmode
26212 || TYPE_MODE (type) == TCmode) && align < 128)
26213 return 128;
26214 }
26215 else if ((TREE_CODE (type) == RECORD_TYPE
26216 || TREE_CODE (type) == UNION_TYPE
26217 || TREE_CODE (type) == QUAL_UNION_TYPE)
26218 && TYPE_FIELDS (type))
26219 {
26220 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
26221 return 64;
26222 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
26223 return 128;
26224 }
26225 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
26226 || TREE_CODE (type) == INTEGER_TYPE)
26227 {
26228 if (TYPE_MODE (type) == DFmode && align < 64)
26229 return 64;
26230 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
26231 return 128;
26232 }
26233
26234 return align;
26235 }
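
/* For example, "static double d[4];" on x86-64 is raised to at least
   128-bit alignment by the ABI rule above (the array exceeds 16 bytes),
   while a single "static double d;" keeps the 64-bit alignment of its
   mode.  */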
26236
26237 /* Compute the alignment for a local variable or a stack slot. EXP is
26238 the data type or decl itself, MODE is the widest mode available and
26239 ALIGN is the alignment that the object would ordinarily have. The
26240 value of this macro is used instead of that alignment to align the
26241 object. */
26242
26243 unsigned int
26244 ix86_local_alignment (tree exp, enum machine_mode mode,
26245 unsigned int align)
26246 {
26247 tree type, decl;
26248
26249 if (exp && DECL_P (exp))
26250 {
26251 type = TREE_TYPE (exp);
26252 decl = exp;
26253 }
26254 else
26255 {
26256 type = exp;
26257 decl = NULL;
26258 }
26259
26260 /* Don't do dynamic stack realignment for long long objects with
26261 -mpreferred-stack-boundary=2. */
26262 if (!TARGET_64BIT
26263 && align == 64
26264 && ix86_preferred_stack_boundary < 64
26265 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
26266 && (!type || !TYPE_USER_ALIGN (type))
26267 && (!decl || !DECL_USER_ALIGN (decl)))
26268 align = 32;
26269
26270 /* If TYPE is NULL, we are allocating a stack slot for caller-save
26271 register in MODE. We will return the largest alignment of XF
26272 and DF. */
26273 if (!type)
26274 {
26275 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
26276 align = GET_MODE_ALIGNMENT (DFmode);
26277 return align;
26278 }
26279
26280 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
26281 to a 16-byte boundary. The exact wording is:
26282
26283 An array uses the same alignment as its elements, except that a local or
26284 global array variable of length at least 16 bytes or
26285 a C99 variable-length array variable always has alignment of at least 16 bytes.
26286
26287 This was added to allow use of aligned SSE instructions on arrays. This
26288 rule is meant for static storage (where the compiler cannot do the
26289 analysis by itself). We follow it for automatic variables only when
26290 convenient. We fully control everything in the compiled function, and
26291 functions from other units cannot rely on the alignment.
26292 
26293 Exclude the va_list type. It is the common case of a local array where
26294 we cannot benefit from the alignment.
26295
26296 TODO: Probably one should optimize for size only when var is not escaping. */
26297 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
26298 && TARGET_SSE)
26299 {
26300 if (AGGREGATE_TYPE_P (type)
26301 && (va_list_type_node == NULL_TREE
26302 || (TYPE_MAIN_VARIANT (type)
26303 != TYPE_MAIN_VARIANT (va_list_type_node)))
26304 && TYPE_SIZE (type)
26305 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26306 && wi::geu_p (TYPE_SIZE (type), 16)
26307 && align < 128)
26308 return 128;
26309 }
26310 if (TREE_CODE (type) == ARRAY_TYPE)
26311 {
26312 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
26313 return 64;
26314 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
26315 return 128;
26316 }
26317 else if (TREE_CODE (type) == COMPLEX_TYPE)
26318 {
26319 if (TYPE_MODE (type) == DCmode && align < 64)
26320 return 64;
26321 if ((TYPE_MODE (type) == XCmode
26322 || TYPE_MODE (type) == TCmode) && align < 128)
26323 return 128;
26324 }
26325 else if ((TREE_CODE (type) == RECORD_TYPE
26326 || TREE_CODE (type) == UNION_TYPE
26327 || TREE_CODE (type) == QUAL_UNION_TYPE)
26328 && TYPE_FIELDS (type))
26329 {
26330 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
26331 return 64;
26332 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
26333 return 128;
26334 }
26335 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
26336 || TREE_CODE (type) == INTEGER_TYPE)
26337 {
26338
26339 if (TYPE_MODE (type) == DFmode && align < 64)
26340 return 64;
26341 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
26342 return 128;
26343 }
26344 return align;
26345 }
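
/* For example, a local "char buf[32];" in a function compiled for x86-64
   with SSE enabled and optimized for speed gets 128-bit stack alignment
   from the rule above, whereas a va_list object is deliberately left
   alone.  */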
26346
26347 /* Compute the minimum required alignment for dynamic stack realignment
26348 purposes for a local variable, parameter or a stack slot. EXP is
26349 the data type or decl itself, MODE is its mode and ALIGN is the
26350 alignment that the object would ordinarily have. */
26351
26352 unsigned int
26353 ix86_minimum_alignment (tree exp, enum machine_mode mode,
26354 unsigned int align)
26355 {
26356 tree type, decl;
26357
26358 if (exp && DECL_P (exp))
26359 {
26360 type = TREE_TYPE (exp);
26361 decl = exp;
26362 }
26363 else
26364 {
26365 type = exp;
26366 decl = NULL;
26367 }
26368
26369 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
26370 return align;
26371
26372 /* Don't do dynamic stack realignment for long long objects with
26373 -mpreferred-stack-boundary=2. */
26374 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
26375 && (!type || !TYPE_USER_ALIGN (type))
26376 && (!decl || !DECL_USER_ALIGN (decl)))
26377 return 32;
26378
26379 return align;
26380 }
26381 \f
26382 /* Find a location for the static chain incoming to a nested function.
26383 This is a register, unless all free registers are used by arguments. */
26384
26385 static rtx
26386 ix86_static_chain (const_tree fndecl, bool incoming_p)
26387 {
26388 unsigned regno;
26389
26390 if (!DECL_STATIC_CHAIN (fndecl))
26391 return NULL;
26392
26393 if (TARGET_64BIT)
26394 {
26395 /* We always use R10 in 64-bit mode. */
26396 regno = R10_REG;
26397 }
26398 else
26399 {
26400 tree fntype;
26401 unsigned int ccvt;
26402
26403 /* By default in 32-bit mode we use ECX to pass the static chain. */
26404 regno = CX_REG;
26405
26406 fntype = TREE_TYPE (fndecl);
26407 ccvt = ix86_get_callcvt (fntype);
26408 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
26409 {
26410 /* Fastcall functions use ecx/edx for arguments, which leaves
26411 us with EAX for the static chain.
26412 Thiscall functions use ecx for arguments, which also
26413 leaves us with EAX for the static chain. */
26414 regno = AX_REG;
26415 }
26416 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
26417 {
26418 /* Thiscall functions use ecx for arguments, which leaves
26419 us with EAX and EDX for the static chain.
26420 For ABI compatibility we use EAX. */
26421 regno = AX_REG;
26422 }
26423 else if (ix86_function_regparm (fntype, fndecl) == 3)
26424 {
26425 /* For regparm 3, we have no free call-clobbered registers in
26426 which to store the static chain. In order to implement this,
26427 we have the trampoline push the static chain to the stack.
26428 However, we can't push a value below the return address when
26429 we call the nested function directly, so we have to use an
26430 alternate entry point. For this we use ESI, and have the
26431 alternate entry point push ESI, so that things appear the
26432 same once we're executing the nested function. */
26433 if (incoming_p)
26434 {
26435 if (fndecl == current_function_decl)
26436 ix86_static_chain_on_stack = true;
26437 return gen_frame_mem (SImode,
26438 plus_constant (Pmode,
26439 arg_pointer_rtx, -8));
26440 }
26441 regno = SI_REG;
26442 }
26443 }
26444
26445 return gen_rtx_REG (Pmode, regno);
26446 }
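
/* In summary: R10 in 64-bit mode; ECX by default in 32-bit mode; EAX for
   fastcall/thiscall; and for regparm(3) there is no free register, so the
   chain ends up on the stack (ESI at the alternate entry point, as
   described above).  */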
26447
26448 /* Emit RTL insns to initialize the variable parts of a trampoline.
26449 FNDECL is the decl of the target address; M_TRAMP is a MEM for
26450 the trampoline, and CHAIN_VALUE is an RTX for the static chain
26451 to be passed to the target function. */
26452
26453 static void
26454 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
26455 {
26456 rtx mem, fnaddr;
26457 int opcode;
26458 int offset = 0;
26459
26460 fnaddr = XEXP (DECL_RTL (fndecl), 0);
26461
26462 if (TARGET_64BIT)
26463 {
26464 int size;
26465
26466 /* Load the function address to r11. Try to load address using
26467 the shorter movl instead of movabs. We may want to support
26468 movq for kernel mode, but kernel does not use trampolines at
26469 the moment. FNADDR is a 32bit address and may not be in
26470 DImode when ptr_mode == SImode. Always use movl in this
26471 case. */
26472 if (ptr_mode == SImode
26473 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
26474 {
26475 fnaddr = copy_addr_to_reg (fnaddr);
26476
26477 mem = adjust_address (m_tramp, HImode, offset);
26478 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
26479
26480 mem = adjust_address (m_tramp, SImode, offset + 2);
26481 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
26482 offset += 6;
26483 }
26484 else
26485 {
26486 mem = adjust_address (m_tramp, HImode, offset);
26487 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
26488
26489 mem = adjust_address (m_tramp, DImode, offset + 2);
26490 emit_move_insn (mem, fnaddr);
26491 offset += 10;
26492 }
26493
26494 /* Load static chain using movabs to r10. Use the shorter movl
26495 instead of movabs when ptr_mode == SImode. */
26496 if (ptr_mode == SImode)
26497 {
26498 opcode = 0xba41;
26499 size = 6;
26500 }
26501 else
26502 {
26503 opcode = 0xba49;
26504 size = 10;
26505 }
26506
26507 mem = adjust_address (m_tramp, HImode, offset);
26508 emit_move_insn (mem, gen_int_mode (opcode, HImode));
26509
26510 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
26511 emit_move_insn (mem, chain_value);
26512 offset += size;
26513
26514 /* Jump to r11; the last (unused) byte is a nop, only there to
26515 pad the write out to a single 32-bit store. */
26516 mem = adjust_address (m_tramp, SImode, offset);
26517 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
26518 offset += 4;
26519 }
26520 else
26521 {
26522 rtx disp, chain;
26523
26524 /* Depending on the static chain location, either load a register
26525 with a constant, or push the constant to the stack. All of the
26526 instructions are the same size. */
26527 chain = ix86_static_chain (fndecl, true);
26528 if (REG_P (chain))
26529 {
26530 switch (REGNO (chain))
26531 {
26532 case AX_REG:
26533 opcode = 0xb8; break;
26534 case CX_REG:
26535 opcode = 0xb9; break;
26536 default:
26537 gcc_unreachable ();
26538 }
26539 }
26540 else
26541 opcode = 0x68;
26542
26543 mem = adjust_address (m_tramp, QImode, offset);
26544 emit_move_insn (mem, gen_int_mode (opcode, QImode));
26545
26546 mem = adjust_address (m_tramp, SImode, offset + 1);
26547 emit_move_insn (mem, chain_value);
26548 offset += 5;
26549
26550 mem = adjust_address (m_tramp, QImode, offset);
26551 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
26552
26553 mem = adjust_address (m_tramp, SImode, offset + 1);
26554
26555 /* Compute offset from the end of the jmp to the target function.
26556 In the case in which the trampoline stores the static chain on
26557 the stack, we need to skip the first insn which pushes the
26558 (call-saved) register static chain; this push is 1 byte. */
26559 offset += 5;
26560 disp = expand_binop (SImode, sub_optab, fnaddr,
26561 plus_constant (Pmode, XEXP (m_tramp, 0),
26562 offset - (MEM_P (chain) ? 1 : 0)),
26563 NULL_RTX, 1, OPTAB_DIRECT);
26564 emit_move_insn (mem, disp);
26565 }
26566
26567 gcc_assert (offset <= TRAMPOLINE_SIZE);
26568
26569 #ifdef HAVE_ENABLE_EXECUTE_STACK
26570 #ifdef CHECK_EXECUTE_STACK_ENABLED
26571 if (CHECK_EXECUTE_STACK_ENABLED)
26572 #endif
26573 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
26574 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
26575 #endif
26576 }
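
/* For reference, the bytes emitted above decode as follows.  32-bit, with
   the static chain in ECX (0xb8 selects EAX, 0x68 a push instead):

       b9 <imm32>          movl  $chain, %ecx
       e9 <rel32>          jmp   <target>

   64-bit (the 0xbb41/0xba41 forms are the shorter movl variants used when
   a 32-bit immediate suffices):

       49 bb <imm64>       movabs $target, %r11
       49 ba <imm64>       movabs $chain,  %r10
       49 ff e3 90         jmp *%r11 ; nop (pads the store to 32 bits)  */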
26577 \f
26578 /* The following file contains several enumerations and data structures
26579 built from the definitions in i386-builtin-types.def. */
26580
26581 #include "i386-builtin-types.inc"
26582
26583 /* Table for the ix86 builtin non-function types. */
26584 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
26585
26586 /* Retrieve an element from the above table, building some of
26587 the types lazily. */
26588
26589 static tree
26590 ix86_get_builtin_type (enum ix86_builtin_type tcode)
26591 {
26592 unsigned int index;
26593 tree type, itype;
26594
26595 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
26596
26597 type = ix86_builtin_type_tab[(int) tcode];
26598 if (type != NULL)
26599 return type;
26600
26601 gcc_assert (tcode > IX86_BT_LAST_PRIM);
26602 if (tcode <= IX86_BT_LAST_VECT)
26603 {
26604 enum machine_mode mode;
26605
26606 index = tcode - IX86_BT_LAST_PRIM - 1;
26607 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
26608 mode = ix86_builtin_type_vect_mode[index];
26609
26610 type = build_vector_type_for_mode (itype, mode);
26611 }
26612 else
26613 {
26614 int quals;
26615
26616 index = tcode - IX86_BT_LAST_VECT - 1;
26617 if (tcode <= IX86_BT_LAST_PTR)
26618 quals = TYPE_UNQUALIFIED;
26619 else
26620 quals = TYPE_QUAL_CONST;
26621
26622 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
26623 if (quals != TYPE_UNQUALIFIED)
26624 itype = build_qualified_type (itype, quals);
26625
26626 type = build_pointer_type (itype);
26627 }
26628
26629 ix86_builtin_type_tab[(int) tcode] = type;
26630 return type;
26631 }
26632
26633 /* Table for the ix86 builtin function types. */
26634 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
26635
26636 /* Retrieve an element from the above table, building some of
26637 the types lazily. */
26638
26639 static tree
26640 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
26641 {
26642 tree type;
26643
26644 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
26645
26646 type = ix86_builtin_func_type_tab[(int) tcode];
26647 if (type != NULL)
26648 return type;
26649
26650 if (tcode <= IX86_BT_LAST_FUNC)
26651 {
26652 unsigned start = ix86_builtin_func_start[(int) tcode];
26653 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
26654 tree rtype, atype, args = void_list_node;
26655 unsigned i;
26656
26657 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
26658 for (i = after - 1; i > start; --i)
26659 {
26660 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
26661 args = tree_cons (NULL, atype, args);
26662 }
26663
26664 type = build_function_type (rtype, args);
26665 }
26666 else
26667 {
26668 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
26669 enum ix86_builtin_func_type icode;
26670
26671 icode = ix86_builtin_func_alias_base[index];
26672 type = ix86_get_builtin_func_type (icode);
26673 }
26674
26675 ix86_builtin_func_type_tab[(int) tcode] = type;
26676 return type;
26677 }
26678
26679
26680 /* Codes for all the SSE/MMX builtins. */
26681 enum ix86_builtins
26682 {
26683 IX86_BUILTIN_ADDPS,
26684 IX86_BUILTIN_ADDSS,
26685 IX86_BUILTIN_DIVPS,
26686 IX86_BUILTIN_DIVSS,
26687 IX86_BUILTIN_MULPS,
26688 IX86_BUILTIN_MULSS,
26689 IX86_BUILTIN_SUBPS,
26690 IX86_BUILTIN_SUBSS,
26691
26692 IX86_BUILTIN_CMPEQPS,
26693 IX86_BUILTIN_CMPLTPS,
26694 IX86_BUILTIN_CMPLEPS,
26695 IX86_BUILTIN_CMPGTPS,
26696 IX86_BUILTIN_CMPGEPS,
26697 IX86_BUILTIN_CMPNEQPS,
26698 IX86_BUILTIN_CMPNLTPS,
26699 IX86_BUILTIN_CMPNLEPS,
26700 IX86_BUILTIN_CMPNGTPS,
26701 IX86_BUILTIN_CMPNGEPS,
26702 IX86_BUILTIN_CMPORDPS,
26703 IX86_BUILTIN_CMPUNORDPS,
26704 IX86_BUILTIN_CMPEQSS,
26705 IX86_BUILTIN_CMPLTSS,
26706 IX86_BUILTIN_CMPLESS,
26707 IX86_BUILTIN_CMPNEQSS,
26708 IX86_BUILTIN_CMPNLTSS,
26709 IX86_BUILTIN_CMPNLESS,
26710 IX86_BUILTIN_CMPORDSS,
26711 IX86_BUILTIN_CMPUNORDSS,
26712
26713 IX86_BUILTIN_COMIEQSS,
26714 IX86_BUILTIN_COMILTSS,
26715 IX86_BUILTIN_COMILESS,
26716 IX86_BUILTIN_COMIGTSS,
26717 IX86_BUILTIN_COMIGESS,
26718 IX86_BUILTIN_COMINEQSS,
26719 IX86_BUILTIN_UCOMIEQSS,
26720 IX86_BUILTIN_UCOMILTSS,
26721 IX86_BUILTIN_UCOMILESS,
26722 IX86_BUILTIN_UCOMIGTSS,
26723 IX86_BUILTIN_UCOMIGESS,
26724 IX86_BUILTIN_UCOMINEQSS,
26725
26726 IX86_BUILTIN_CVTPI2PS,
26727 IX86_BUILTIN_CVTPS2PI,
26728 IX86_BUILTIN_CVTSI2SS,
26729 IX86_BUILTIN_CVTSI642SS,
26730 IX86_BUILTIN_CVTSS2SI,
26731 IX86_BUILTIN_CVTSS2SI64,
26732 IX86_BUILTIN_CVTTPS2PI,
26733 IX86_BUILTIN_CVTTSS2SI,
26734 IX86_BUILTIN_CVTTSS2SI64,
26735
26736 IX86_BUILTIN_MAXPS,
26737 IX86_BUILTIN_MAXSS,
26738 IX86_BUILTIN_MINPS,
26739 IX86_BUILTIN_MINSS,
26740
26741 IX86_BUILTIN_LOADUPS,
26742 IX86_BUILTIN_STOREUPS,
26743 IX86_BUILTIN_MOVSS,
26744
26745 IX86_BUILTIN_MOVHLPS,
26746 IX86_BUILTIN_MOVLHPS,
26747 IX86_BUILTIN_LOADHPS,
26748 IX86_BUILTIN_LOADLPS,
26749 IX86_BUILTIN_STOREHPS,
26750 IX86_BUILTIN_STORELPS,
26751
26752 IX86_BUILTIN_MASKMOVQ,
26753 IX86_BUILTIN_MOVMSKPS,
26754 IX86_BUILTIN_PMOVMSKB,
26755
26756 IX86_BUILTIN_MOVNTPS,
26757 IX86_BUILTIN_MOVNTQ,
26758
26759 IX86_BUILTIN_LOADDQU,
26760 IX86_BUILTIN_STOREDQU,
26761
26762 IX86_BUILTIN_PACKSSWB,
26763 IX86_BUILTIN_PACKSSDW,
26764 IX86_BUILTIN_PACKUSWB,
26765
26766 IX86_BUILTIN_PADDB,
26767 IX86_BUILTIN_PADDW,
26768 IX86_BUILTIN_PADDD,
26769 IX86_BUILTIN_PADDQ,
26770 IX86_BUILTIN_PADDSB,
26771 IX86_BUILTIN_PADDSW,
26772 IX86_BUILTIN_PADDUSB,
26773 IX86_BUILTIN_PADDUSW,
26774 IX86_BUILTIN_PSUBB,
26775 IX86_BUILTIN_PSUBW,
26776 IX86_BUILTIN_PSUBD,
26777 IX86_BUILTIN_PSUBQ,
26778 IX86_BUILTIN_PSUBSB,
26779 IX86_BUILTIN_PSUBSW,
26780 IX86_BUILTIN_PSUBUSB,
26781 IX86_BUILTIN_PSUBUSW,
26782
26783 IX86_BUILTIN_PAND,
26784 IX86_BUILTIN_PANDN,
26785 IX86_BUILTIN_POR,
26786 IX86_BUILTIN_PXOR,
26787
26788 IX86_BUILTIN_PAVGB,
26789 IX86_BUILTIN_PAVGW,
26790
26791 IX86_BUILTIN_PCMPEQB,
26792 IX86_BUILTIN_PCMPEQW,
26793 IX86_BUILTIN_PCMPEQD,
26794 IX86_BUILTIN_PCMPGTB,
26795 IX86_BUILTIN_PCMPGTW,
26796 IX86_BUILTIN_PCMPGTD,
26797
26798 IX86_BUILTIN_PMADDWD,
26799
26800 IX86_BUILTIN_PMAXSW,
26801 IX86_BUILTIN_PMAXUB,
26802 IX86_BUILTIN_PMINSW,
26803 IX86_BUILTIN_PMINUB,
26804
26805 IX86_BUILTIN_PMULHUW,
26806 IX86_BUILTIN_PMULHW,
26807 IX86_BUILTIN_PMULLW,
26808
26809 IX86_BUILTIN_PSADBW,
26810 IX86_BUILTIN_PSHUFW,
26811
26812 IX86_BUILTIN_PSLLW,
26813 IX86_BUILTIN_PSLLD,
26814 IX86_BUILTIN_PSLLQ,
26815 IX86_BUILTIN_PSRAW,
26816 IX86_BUILTIN_PSRAD,
26817 IX86_BUILTIN_PSRLW,
26818 IX86_BUILTIN_PSRLD,
26819 IX86_BUILTIN_PSRLQ,
26820 IX86_BUILTIN_PSLLWI,
26821 IX86_BUILTIN_PSLLDI,
26822 IX86_BUILTIN_PSLLQI,
26823 IX86_BUILTIN_PSRAWI,
26824 IX86_BUILTIN_PSRADI,
26825 IX86_BUILTIN_PSRLWI,
26826 IX86_BUILTIN_PSRLDI,
26827 IX86_BUILTIN_PSRLQI,
26828
26829 IX86_BUILTIN_PUNPCKHBW,
26830 IX86_BUILTIN_PUNPCKHWD,
26831 IX86_BUILTIN_PUNPCKHDQ,
26832 IX86_BUILTIN_PUNPCKLBW,
26833 IX86_BUILTIN_PUNPCKLWD,
26834 IX86_BUILTIN_PUNPCKLDQ,
26835
26836 IX86_BUILTIN_SHUFPS,
26837
26838 IX86_BUILTIN_RCPPS,
26839 IX86_BUILTIN_RCPSS,
26840 IX86_BUILTIN_RSQRTPS,
26841 IX86_BUILTIN_RSQRTPS_NR,
26842 IX86_BUILTIN_RSQRTSS,
26843 IX86_BUILTIN_RSQRTF,
26844 IX86_BUILTIN_SQRTPS,
26845 IX86_BUILTIN_SQRTPS_NR,
26846 IX86_BUILTIN_SQRTSS,
26847
26848 IX86_BUILTIN_UNPCKHPS,
26849 IX86_BUILTIN_UNPCKLPS,
26850
26851 IX86_BUILTIN_ANDPS,
26852 IX86_BUILTIN_ANDNPS,
26853 IX86_BUILTIN_ORPS,
26854 IX86_BUILTIN_XORPS,
26855
26856 IX86_BUILTIN_EMMS,
26857 IX86_BUILTIN_LDMXCSR,
26858 IX86_BUILTIN_STMXCSR,
26859 IX86_BUILTIN_SFENCE,
26860
26861 IX86_BUILTIN_FXSAVE,
26862 IX86_BUILTIN_FXRSTOR,
26863 IX86_BUILTIN_FXSAVE64,
26864 IX86_BUILTIN_FXRSTOR64,
26865
26866 IX86_BUILTIN_XSAVE,
26867 IX86_BUILTIN_XRSTOR,
26868 IX86_BUILTIN_XSAVE64,
26869 IX86_BUILTIN_XRSTOR64,
26870
26871 IX86_BUILTIN_XSAVEOPT,
26872 IX86_BUILTIN_XSAVEOPT64,
26873
26874 /* 3DNow! Original */
26875 IX86_BUILTIN_FEMMS,
26876 IX86_BUILTIN_PAVGUSB,
26877 IX86_BUILTIN_PF2ID,
26878 IX86_BUILTIN_PFACC,
26879 IX86_BUILTIN_PFADD,
26880 IX86_BUILTIN_PFCMPEQ,
26881 IX86_BUILTIN_PFCMPGE,
26882 IX86_BUILTIN_PFCMPGT,
26883 IX86_BUILTIN_PFMAX,
26884 IX86_BUILTIN_PFMIN,
26885 IX86_BUILTIN_PFMUL,
26886 IX86_BUILTIN_PFRCP,
26887 IX86_BUILTIN_PFRCPIT1,
26888 IX86_BUILTIN_PFRCPIT2,
26889 IX86_BUILTIN_PFRSQIT1,
26890 IX86_BUILTIN_PFRSQRT,
26891 IX86_BUILTIN_PFSUB,
26892 IX86_BUILTIN_PFSUBR,
26893 IX86_BUILTIN_PI2FD,
26894 IX86_BUILTIN_PMULHRW,
26895
26896 /* 3DNow! Athlon Extensions */
26897 IX86_BUILTIN_PF2IW,
26898 IX86_BUILTIN_PFNACC,
26899 IX86_BUILTIN_PFPNACC,
26900 IX86_BUILTIN_PI2FW,
26901 IX86_BUILTIN_PSWAPDSI,
26902 IX86_BUILTIN_PSWAPDSF,
26903
26904 /* SSE2 */
26905 IX86_BUILTIN_ADDPD,
26906 IX86_BUILTIN_ADDSD,
26907 IX86_BUILTIN_DIVPD,
26908 IX86_BUILTIN_DIVSD,
26909 IX86_BUILTIN_MULPD,
26910 IX86_BUILTIN_MULSD,
26911 IX86_BUILTIN_SUBPD,
26912 IX86_BUILTIN_SUBSD,
26913
26914 IX86_BUILTIN_CMPEQPD,
26915 IX86_BUILTIN_CMPLTPD,
26916 IX86_BUILTIN_CMPLEPD,
26917 IX86_BUILTIN_CMPGTPD,
26918 IX86_BUILTIN_CMPGEPD,
26919 IX86_BUILTIN_CMPNEQPD,
26920 IX86_BUILTIN_CMPNLTPD,
26921 IX86_BUILTIN_CMPNLEPD,
26922 IX86_BUILTIN_CMPNGTPD,
26923 IX86_BUILTIN_CMPNGEPD,
26924 IX86_BUILTIN_CMPORDPD,
26925 IX86_BUILTIN_CMPUNORDPD,
26926 IX86_BUILTIN_CMPEQSD,
26927 IX86_BUILTIN_CMPLTSD,
26928 IX86_BUILTIN_CMPLESD,
26929 IX86_BUILTIN_CMPNEQSD,
26930 IX86_BUILTIN_CMPNLTSD,
26931 IX86_BUILTIN_CMPNLESD,
26932 IX86_BUILTIN_CMPORDSD,
26933 IX86_BUILTIN_CMPUNORDSD,
26934
26935 IX86_BUILTIN_COMIEQSD,
26936 IX86_BUILTIN_COMILTSD,
26937 IX86_BUILTIN_COMILESD,
26938 IX86_BUILTIN_COMIGTSD,
26939 IX86_BUILTIN_COMIGESD,
26940 IX86_BUILTIN_COMINEQSD,
26941 IX86_BUILTIN_UCOMIEQSD,
26942 IX86_BUILTIN_UCOMILTSD,
26943 IX86_BUILTIN_UCOMILESD,
26944 IX86_BUILTIN_UCOMIGTSD,
26945 IX86_BUILTIN_UCOMIGESD,
26946 IX86_BUILTIN_UCOMINEQSD,
26947
26948 IX86_BUILTIN_MAXPD,
26949 IX86_BUILTIN_MAXSD,
26950 IX86_BUILTIN_MINPD,
26951 IX86_BUILTIN_MINSD,
26952
26953 IX86_BUILTIN_ANDPD,
26954 IX86_BUILTIN_ANDNPD,
26955 IX86_BUILTIN_ORPD,
26956 IX86_BUILTIN_XORPD,
26957
26958 IX86_BUILTIN_SQRTPD,
26959 IX86_BUILTIN_SQRTSD,
26960
26961 IX86_BUILTIN_UNPCKHPD,
26962 IX86_BUILTIN_UNPCKLPD,
26963
26964 IX86_BUILTIN_SHUFPD,
26965
26966 IX86_BUILTIN_LOADUPD,
26967 IX86_BUILTIN_STOREUPD,
26968 IX86_BUILTIN_MOVSD,
26969
26970 IX86_BUILTIN_LOADHPD,
26971 IX86_BUILTIN_LOADLPD,
26972
26973 IX86_BUILTIN_CVTDQ2PD,
26974 IX86_BUILTIN_CVTDQ2PS,
26975
26976 IX86_BUILTIN_CVTPD2DQ,
26977 IX86_BUILTIN_CVTPD2PI,
26978 IX86_BUILTIN_CVTPD2PS,
26979 IX86_BUILTIN_CVTTPD2DQ,
26980 IX86_BUILTIN_CVTTPD2PI,
26981
26982 IX86_BUILTIN_CVTPI2PD,
26983 IX86_BUILTIN_CVTSI2SD,
26984 IX86_BUILTIN_CVTSI642SD,
26985
26986 IX86_BUILTIN_CVTSD2SI,
26987 IX86_BUILTIN_CVTSD2SI64,
26988 IX86_BUILTIN_CVTSD2SS,
26989 IX86_BUILTIN_CVTSS2SD,
26990 IX86_BUILTIN_CVTTSD2SI,
26991 IX86_BUILTIN_CVTTSD2SI64,
26992
26993 IX86_BUILTIN_CVTPS2DQ,
26994 IX86_BUILTIN_CVTPS2PD,
26995 IX86_BUILTIN_CVTTPS2DQ,
26996
26997 IX86_BUILTIN_MOVNTI,
26998 IX86_BUILTIN_MOVNTI64,
26999 IX86_BUILTIN_MOVNTPD,
27000 IX86_BUILTIN_MOVNTDQ,
27001
27002 IX86_BUILTIN_MOVQ128,
27003
27004 /* SSE2 MMX */
27005 IX86_BUILTIN_MASKMOVDQU,
27006 IX86_BUILTIN_MOVMSKPD,
27007 IX86_BUILTIN_PMOVMSKB128,
27008
27009 IX86_BUILTIN_PACKSSWB128,
27010 IX86_BUILTIN_PACKSSDW128,
27011 IX86_BUILTIN_PACKUSWB128,
27012
27013 IX86_BUILTIN_PADDB128,
27014 IX86_BUILTIN_PADDW128,
27015 IX86_BUILTIN_PADDD128,
27016 IX86_BUILTIN_PADDQ128,
27017 IX86_BUILTIN_PADDSB128,
27018 IX86_BUILTIN_PADDSW128,
27019 IX86_BUILTIN_PADDUSB128,
27020 IX86_BUILTIN_PADDUSW128,
27021 IX86_BUILTIN_PSUBB128,
27022 IX86_BUILTIN_PSUBW128,
27023 IX86_BUILTIN_PSUBD128,
27024 IX86_BUILTIN_PSUBQ128,
27025 IX86_BUILTIN_PSUBSB128,
27026 IX86_BUILTIN_PSUBSW128,
27027 IX86_BUILTIN_PSUBUSB128,
27028 IX86_BUILTIN_PSUBUSW128,
27029
27030 IX86_BUILTIN_PAND128,
27031 IX86_BUILTIN_PANDN128,
27032 IX86_BUILTIN_POR128,
27033 IX86_BUILTIN_PXOR128,
27034
27035 IX86_BUILTIN_PAVGB128,
27036 IX86_BUILTIN_PAVGW128,
27037
27038 IX86_BUILTIN_PCMPEQB128,
27039 IX86_BUILTIN_PCMPEQW128,
27040 IX86_BUILTIN_PCMPEQD128,
27041 IX86_BUILTIN_PCMPGTB128,
27042 IX86_BUILTIN_PCMPGTW128,
27043 IX86_BUILTIN_PCMPGTD128,
27044
27045 IX86_BUILTIN_PMADDWD128,
27046
27047 IX86_BUILTIN_PMAXSW128,
27048 IX86_BUILTIN_PMAXUB128,
27049 IX86_BUILTIN_PMINSW128,
27050 IX86_BUILTIN_PMINUB128,
27051
27052 IX86_BUILTIN_PMULUDQ,
27053 IX86_BUILTIN_PMULUDQ128,
27054 IX86_BUILTIN_PMULHUW128,
27055 IX86_BUILTIN_PMULHW128,
27056 IX86_BUILTIN_PMULLW128,
27057
27058 IX86_BUILTIN_PSADBW128,
27059 IX86_BUILTIN_PSHUFHW,
27060 IX86_BUILTIN_PSHUFLW,
27061 IX86_BUILTIN_PSHUFD,
27062
27063 IX86_BUILTIN_PSLLDQI128,
27064 IX86_BUILTIN_PSLLWI128,
27065 IX86_BUILTIN_PSLLDI128,
27066 IX86_BUILTIN_PSLLQI128,
27067 IX86_BUILTIN_PSRAWI128,
27068 IX86_BUILTIN_PSRADI128,
27069 IX86_BUILTIN_PSRLDQI128,
27070 IX86_BUILTIN_PSRLWI128,
27071 IX86_BUILTIN_PSRLDI128,
27072 IX86_BUILTIN_PSRLQI128,
27073
27074 IX86_BUILTIN_PSLLDQ128,
27075 IX86_BUILTIN_PSLLW128,
27076 IX86_BUILTIN_PSLLD128,
27077 IX86_BUILTIN_PSLLQ128,
27078 IX86_BUILTIN_PSRAW128,
27079 IX86_BUILTIN_PSRAD128,
27080 IX86_BUILTIN_PSRLW128,
27081 IX86_BUILTIN_PSRLD128,
27082 IX86_BUILTIN_PSRLQ128,
27083
27084 IX86_BUILTIN_PUNPCKHBW128,
27085 IX86_BUILTIN_PUNPCKHWD128,
27086 IX86_BUILTIN_PUNPCKHDQ128,
27087 IX86_BUILTIN_PUNPCKHQDQ128,
27088 IX86_BUILTIN_PUNPCKLBW128,
27089 IX86_BUILTIN_PUNPCKLWD128,
27090 IX86_BUILTIN_PUNPCKLDQ128,
27091 IX86_BUILTIN_PUNPCKLQDQ128,
27092
27093 IX86_BUILTIN_CLFLUSH,
27094 IX86_BUILTIN_MFENCE,
27095 IX86_BUILTIN_LFENCE,
27096 IX86_BUILTIN_PAUSE,
27097
27098 IX86_BUILTIN_FNSTENV,
27099 IX86_BUILTIN_FLDENV,
27100 IX86_BUILTIN_FNSTSW,
27101 IX86_BUILTIN_FNCLEX,
27102
27103 IX86_BUILTIN_BSRSI,
27104 IX86_BUILTIN_BSRDI,
27105 IX86_BUILTIN_RDPMC,
27106 IX86_BUILTIN_RDTSC,
27107 IX86_BUILTIN_RDTSCP,
27108 IX86_BUILTIN_ROLQI,
27109 IX86_BUILTIN_ROLHI,
27110 IX86_BUILTIN_RORQI,
27111 IX86_BUILTIN_RORHI,
27112
27113 /* SSE3. */
27114 IX86_BUILTIN_ADDSUBPS,
27115 IX86_BUILTIN_HADDPS,
27116 IX86_BUILTIN_HSUBPS,
27117 IX86_BUILTIN_MOVSHDUP,
27118 IX86_BUILTIN_MOVSLDUP,
27119 IX86_BUILTIN_ADDSUBPD,
27120 IX86_BUILTIN_HADDPD,
27121 IX86_BUILTIN_HSUBPD,
27122 IX86_BUILTIN_LDDQU,
27123
27124 IX86_BUILTIN_MONITOR,
27125 IX86_BUILTIN_MWAIT,
27126
27127 /* SSSE3. */
27128 IX86_BUILTIN_PHADDW,
27129 IX86_BUILTIN_PHADDD,
27130 IX86_BUILTIN_PHADDSW,
27131 IX86_BUILTIN_PHSUBW,
27132 IX86_BUILTIN_PHSUBD,
27133 IX86_BUILTIN_PHSUBSW,
27134 IX86_BUILTIN_PMADDUBSW,
27135 IX86_BUILTIN_PMULHRSW,
27136 IX86_BUILTIN_PSHUFB,
27137 IX86_BUILTIN_PSIGNB,
27138 IX86_BUILTIN_PSIGNW,
27139 IX86_BUILTIN_PSIGND,
27140 IX86_BUILTIN_PALIGNR,
27141 IX86_BUILTIN_PABSB,
27142 IX86_BUILTIN_PABSW,
27143 IX86_BUILTIN_PABSD,
27144
27145 IX86_BUILTIN_PHADDW128,
27146 IX86_BUILTIN_PHADDD128,
27147 IX86_BUILTIN_PHADDSW128,
27148 IX86_BUILTIN_PHSUBW128,
27149 IX86_BUILTIN_PHSUBD128,
27150 IX86_BUILTIN_PHSUBSW128,
27151 IX86_BUILTIN_PMADDUBSW128,
27152 IX86_BUILTIN_PMULHRSW128,
27153 IX86_BUILTIN_PSHUFB128,
27154 IX86_BUILTIN_PSIGNB128,
27155 IX86_BUILTIN_PSIGNW128,
27156 IX86_BUILTIN_PSIGND128,
27157 IX86_BUILTIN_PALIGNR128,
27158 IX86_BUILTIN_PABSB128,
27159 IX86_BUILTIN_PABSW128,
27160 IX86_BUILTIN_PABSD128,
27161
27162 /* AMDFAM10 - SSE4A New Instructions. */
27163 IX86_BUILTIN_MOVNTSD,
27164 IX86_BUILTIN_MOVNTSS,
27165 IX86_BUILTIN_EXTRQI,
27166 IX86_BUILTIN_EXTRQ,
27167 IX86_BUILTIN_INSERTQI,
27168 IX86_BUILTIN_INSERTQ,
27169
27170 /* SSE4.1. */
27171 IX86_BUILTIN_BLENDPD,
27172 IX86_BUILTIN_BLENDPS,
27173 IX86_BUILTIN_BLENDVPD,
27174 IX86_BUILTIN_BLENDVPS,
27175 IX86_BUILTIN_PBLENDVB128,
27176 IX86_BUILTIN_PBLENDW128,
27177
27178 IX86_BUILTIN_DPPD,
27179 IX86_BUILTIN_DPPS,
27180
27181 IX86_BUILTIN_INSERTPS128,
27182
27183 IX86_BUILTIN_MOVNTDQA,
27184 IX86_BUILTIN_MPSADBW128,
27185 IX86_BUILTIN_PACKUSDW128,
27186 IX86_BUILTIN_PCMPEQQ,
27187 IX86_BUILTIN_PHMINPOSUW128,
27188
27189 IX86_BUILTIN_PMAXSB128,
27190 IX86_BUILTIN_PMAXSD128,
27191 IX86_BUILTIN_PMAXUD128,
27192 IX86_BUILTIN_PMAXUW128,
27193
27194 IX86_BUILTIN_PMINSB128,
27195 IX86_BUILTIN_PMINSD128,
27196 IX86_BUILTIN_PMINUD128,
27197 IX86_BUILTIN_PMINUW128,
27198
27199 IX86_BUILTIN_PMOVSXBW128,
27200 IX86_BUILTIN_PMOVSXBD128,
27201 IX86_BUILTIN_PMOVSXBQ128,
27202 IX86_BUILTIN_PMOVSXWD128,
27203 IX86_BUILTIN_PMOVSXWQ128,
27204 IX86_BUILTIN_PMOVSXDQ128,
27205
27206 IX86_BUILTIN_PMOVZXBW128,
27207 IX86_BUILTIN_PMOVZXBD128,
27208 IX86_BUILTIN_PMOVZXBQ128,
27209 IX86_BUILTIN_PMOVZXWD128,
27210 IX86_BUILTIN_PMOVZXWQ128,
27211 IX86_BUILTIN_PMOVZXDQ128,
27212
27213 IX86_BUILTIN_PMULDQ128,
27214 IX86_BUILTIN_PMULLD128,
27215
27216 IX86_BUILTIN_ROUNDSD,
27217 IX86_BUILTIN_ROUNDSS,
27218
27219 IX86_BUILTIN_ROUNDPD,
27220 IX86_BUILTIN_ROUNDPS,
27221
27222 IX86_BUILTIN_FLOORPD,
27223 IX86_BUILTIN_CEILPD,
27224 IX86_BUILTIN_TRUNCPD,
27225 IX86_BUILTIN_RINTPD,
27226 IX86_BUILTIN_ROUNDPD_AZ,
27227
27228 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
27229 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
27230 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
27231
27232 IX86_BUILTIN_FLOORPS,
27233 IX86_BUILTIN_CEILPS,
27234 IX86_BUILTIN_TRUNCPS,
27235 IX86_BUILTIN_RINTPS,
27236 IX86_BUILTIN_ROUNDPS_AZ,
27237
27238 IX86_BUILTIN_FLOORPS_SFIX,
27239 IX86_BUILTIN_CEILPS_SFIX,
27240 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
27241
27242 IX86_BUILTIN_PTESTZ,
27243 IX86_BUILTIN_PTESTC,
27244 IX86_BUILTIN_PTESTNZC,
27245
27246 IX86_BUILTIN_VEC_INIT_V2SI,
27247 IX86_BUILTIN_VEC_INIT_V4HI,
27248 IX86_BUILTIN_VEC_INIT_V8QI,
27249 IX86_BUILTIN_VEC_EXT_V2DF,
27250 IX86_BUILTIN_VEC_EXT_V2DI,
27251 IX86_BUILTIN_VEC_EXT_V4SF,
27252 IX86_BUILTIN_VEC_EXT_V4SI,
27253 IX86_BUILTIN_VEC_EXT_V8HI,
27254 IX86_BUILTIN_VEC_EXT_V2SI,
27255 IX86_BUILTIN_VEC_EXT_V4HI,
27256 IX86_BUILTIN_VEC_EXT_V16QI,
27257 IX86_BUILTIN_VEC_SET_V2DI,
27258 IX86_BUILTIN_VEC_SET_V4SF,
27259 IX86_BUILTIN_VEC_SET_V4SI,
27260 IX86_BUILTIN_VEC_SET_V8HI,
27261 IX86_BUILTIN_VEC_SET_V4HI,
27262 IX86_BUILTIN_VEC_SET_V16QI,
27263
27264 IX86_BUILTIN_VEC_PACK_SFIX,
27265 IX86_BUILTIN_VEC_PACK_SFIX256,
27266
27267 /* SSE4.2. */
27268 IX86_BUILTIN_CRC32QI,
27269 IX86_BUILTIN_CRC32HI,
27270 IX86_BUILTIN_CRC32SI,
27271 IX86_BUILTIN_CRC32DI,
27272
27273 IX86_BUILTIN_PCMPESTRI128,
27274 IX86_BUILTIN_PCMPESTRM128,
27275 IX86_BUILTIN_PCMPESTRA128,
27276 IX86_BUILTIN_PCMPESTRC128,
27277 IX86_BUILTIN_PCMPESTRO128,
27278 IX86_BUILTIN_PCMPESTRS128,
27279 IX86_BUILTIN_PCMPESTRZ128,
27280 IX86_BUILTIN_PCMPISTRI128,
27281 IX86_BUILTIN_PCMPISTRM128,
27282 IX86_BUILTIN_PCMPISTRA128,
27283 IX86_BUILTIN_PCMPISTRC128,
27284 IX86_BUILTIN_PCMPISTRO128,
27285 IX86_BUILTIN_PCMPISTRS128,
27286 IX86_BUILTIN_PCMPISTRZ128,
27287
27288 IX86_BUILTIN_PCMPGTQ,
27289
27290 /* AES instructions */
27291 IX86_BUILTIN_AESENC128,
27292 IX86_BUILTIN_AESENCLAST128,
27293 IX86_BUILTIN_AESDEC128,
27294 IX86_BUILTIN_AESDECLAST128,
27295 IX86_BUILTIN_AESIMC128,
27296 IX86_BUILTIN_AESKEYGENASSIST128,
27297
27298 /* PCLMUL instruction */
27299 IX86_BUILTIN_PCLMULQDQ128,
27300
27301 /* AVX */
27302 IX86_BUILTIN_ADDPD256,
27303 IX86_BUILTIN_ADDPS256,
27304 IX86_BUILTIN_ADDSUBPD256,
27305 IX86_BUILTIN_ADDSUBPS256,
27306 IX86_BUILTIN_ANDPD256,
27307 IX86_BUILTIN_ANDPS256,
27308 IX86_BUILTIN_ANDNPD256,
27309 IX86_BUILTIN_ANDNPS256,
27310 IX86_BUILTIN_BLENDPD256,
27311 IX86_BUILTIN_BLENDPS256,
27312 IX86_BUILTIN_BLENDVPD256,
27313 IX86_BUILTIN_BLENDVPS256,
27314 IX86_BUILTIN_DIVPD256,
27315 IX86_BUILTIN_DIVPS256,
27316 IX86_BUILTIN_DPPS256,
27317 IX86_BUILTIN_HADDPD256,
27318 IX86_BUILTIN_HADDPS256,
27319 IX86_BUILTIN_HSUBPD256,
27320 IX86_BUILTIN_HSUBPS256,
27321 IX86_BUILTIN_MAXPD256,
27322 IX86_BUILTIN_MAXPS256,
27323 IX86_BUILTIN_MINPD256,
27324 IX86_BUILTIN_MINPS256,
27325 IX86_BUILTIN_MULPD256,
27326 IX86_BUILTIN_MULPS256,
27327 IX86_BUILTIN_ORPD256,
27328 IX86_BUILTIN_ORPS256,
27329 IX86_BUILTIN_SHUFPD256,
27330 IX86_BUILTIN_SHUFPS256,
27331 IX86_BUILTIN_SUBPD256,
27332 IX86_BUILTIN_SUBPS256,
27333 IX86_BUILTIN_XORPD256,
27334 IX86_BUILTIN_XORPS256,
27335 IX86_BUILTIN_CMPSD,
27336 IX86_BUILTIN_CMPSS,
27337 IX86_BUILTIN_CMPPD,
27338 IX86_BUILTIN_CMPPS,
27339 IX86_BUILTIN_CMPPD256,
27340 IX86_BUILTIN_CMPPS256,
27341 IX86_BUILTIN_CVTDQ2PD256,
27342 IX86_BUILTIN_CVTDQ2PS256,
27343 IX86_BUILTIN_CVTPD2PS256,
27344 IX86_BUILTIN_CVTPS2DQ256,
27345 IX86_BUILTIN_CVTPS2PD256,
27346 IX86_BUILTIN_CVTTPD2DQ256,
27347 IX86_BUILTIN_CVTPD2DQ256,
27348 IX86_BUILTIN_CVTTPS2DQ256,
27349 IX86_BUILTIN_EXTRACTF128PD256,
27350 IX86_BUILTIN_EXTRACTF128PS256,
27351 IX86_BUILTIN_EXTRACTF128SI256,
27352 IX86_BUILTIN_VZEROALL,
27353 IX86_BUILTIN_VZEROUPPER,
27354 IX86_BUILTIN_VPERMILVARPD,
27355 IX86_BUILTIN_VPERMILVARPS,
27356 IX86_BUILTIN_VPERMILVARPD256,
27357 IX86_BUILTIN_VPERMILVARPS256,
27358 IX86_BUILTIN_VPERMILPD,
27359 IX86_BUILTIN_VPERMILPS,
27360 IX86_BUILTIN_VPERMILPD256,
27361 IX86_BUILTIN_VPERMILPS256,
27362 IX86_BUILTIN_VPERMIL2PD,
27363 IX86_BUILTIN_VPERMIL2PS,
27364 IX86_BUILTIN_VPERMIL2PD256,
27365 IX86_BUILTIN_VPERMIL2PS256,
27366 IX86_BUILTIN_VPERM2F128PD256,
27367 IX86_BUILTIN_VPERM2F128PS256,
27368 IX86_BUILTIN_VPERM2F128SI256,
27369 IX86_BUILTIN_VBROADCASTSS,
27370 IX86_BUILTIN_VBROADCASTSD256,
27371 IX86_BUILTIN_VBROADCASTSS256,
27372 IX86_BUILTIN_VBROADCASTPD256,
27373 IX86_BUILTIN_VBROADCASTPS256,
27374 IX86_BUILTIN_VINSERTF128PD256,
27375 IX86_BUILTIN_VINSERTF128PS256,
27376 IX86_BUILTIN_VINSERTF128SI256,
27377 IX86_BUILTIN_LOADUPD256,
27378 IX86_BUILTIN_LOADUPS256,
27379 IX86_BUILTIN_STOREUPD256,
27380 IX86_BUILTIN_STOREUPS256,
27381 IX86_BUILTIN_LDDQU256,
27382 IX86_BUILTIN_MOVNTDQ256,
27383 IX86_BUILTIN_MOVNTPD256,
27384 IX86_BUILTIN_MOVNTPS256,
27385 IX86_BUILTIN_LOADDQU256,
27386 IX86_BUILTIN_STOREDQU256,
27387 IX86_BUILTIN_MASKLOADPD,
27388 IX86_BUILTIN_MASKLOADPS,
27389 IX86_BUILTIN_MASKSTOREPD,
27390 IX86_BUILTIN_MASKSTOREPS,
27391 IX86_BUILTIN_MASKLOADPD256,
27392 IX86_BUILTIN_MASKLOADPS256,
27393 IX86_BUILTIN_MASKSTOREPD256,
27394 IX86_BUILTIN_MASKSTOREPS256,
27395 IX86_BUILTIN_MOVSHDUP256,
27396 IX86_BUILTIN_MOVSLDUP256,
27397 IX86_BUILTIN_MOVDDUP256,
27398
27399 IX86_BUILTIN_SQRTPD256,
27400 IX86_BUILTIN_SQRTPS256,
27401 IX86_BUILTIN_SQRTPS_NR256,
27402 IX86_BUILTIN_RSQRTPS256,
27403 IX86_BUILTIN_RSQRTPS_NR256,
27404
27405 IX86_BUILTIN_RCPPS256,
27406
27407 IX86_BUILTIN_ROUNDPD256,
27408 IX86_BUILTIN_ROUNDPS256,
27409
27410 IX86_BUILTIN_FLOORPD256,
27411 IX86_BUILTIN_CEILPD256,
27412 IX86_BUILTIN_TRUNCPD256,
27413 IX86_BUILTIN_RINTPD256,
27414 IX86_BUILTIN_ROUNDPD_AZ256,
27415
27416 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
27417 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
27418 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
27419
27420 IX86_BUILTIN_FLOORPS256,
27421 IX86_BUILTIN_CEILPS256,
27422 IX86_BUILTIN_TRUNCPS256,
27423 IX86_BUILTIN_RINTPS256,
27424 IX86_BUILTIN_ROUNDPS_AZ256,
27425
27426 IX86_BUILTIN_FLOORPS_SFIX256,
27427 IX86_BUILTIN_CEILPS_SFIX256,
27428 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
27429
27430 IX86_BUILTIN_UNPCKHPD256,
27431 IX86_BUILTIN_UNPCKLPD256,
27432 IX86_BUILTIN_UNPCKHPS256,
27433 IX86_BUILTIN_UNPCKLPS256,
27434
27435 IX86_BUILTIN_SI256_SI,
27436 IX86_BUILTIN_PS256_PS,
27437 IX86_BUILTIN_PD256_PD,
27438 IX86_BUILTIN_SI_SI256,
27439 IX86_BUILTIN_PS_PS256,
27440 IX86_BUILTIN_PD_PD256,
27441
27442 IX86_BUILTIN_VTESTZPD,
27443 IX86_BUILTIN_VTESTCPD,
27444 IX86_BUILTIN_VTESTNZCPD,
27445 IX86_BUILTIN_VTESTZPS,
27446 IX86_BUILTIN_VTESTCPS,
27447 IX86_BUILTIN_VTESTNZCPS,
27448 IX86_BUILTIN_VTESTZPD256,
27449 IX86_BUILTIN_VTESTCPD256,
27450 IX86_BUILTIN_VTESTNZCPD256,
27451 IX86_BUILTIN_VTESTZPS256,
27452 IX86_BUILTIN_VTESTCPS256,
27453 IX86_BUILTIN_VTESTNZCPS256,
27454 IX86_BUILTIN_PTESTZ256,
27455 IX86_BUILTIN_PTESTC256,
27456 IX86_BUILTIN_PTESTNZC256,
27457
27458 IX86_BUILTIN_MOVMSKPD256,
27459 IX86_BUILTIN_MOVMSKPS256,
27460
27461 /* AVX2 */
27462 IX86_BUILTIN_MPSADBW256,
27463 IX86_BUILTIN_PABSB256,
27464 IX86_BUILTIN_PABSW256,
27465 IX86_BUILTIN_PABSD256,
27466 IX86_BUILTIN_PACKSSDW256,
27467 IX86_BUILTIN_PACKSSWB256,
27468 IX86_BUILTIN_PACKUSDW256,
27469 IX86_BUILTIN_PACKUSWB256,
27470 IX86_BUILTIN_PADDB256,
27471 IX86_BUILTIN_PADDW256,
27472 IX86_BUILTIN_PADDD256,
27473 IX86_BUILTIN_PADDQ256,
27474 IX86_BUILTIN_PADDSB256,
27475 IX86_BUILTIN_PADDSW256,
27476 IX86_BUILTIN_PADDUSB256,
27477 IX86_BUILTIN_PADDUSW256,
27478 IX86_BUILTIN_PALIGNR256,
27479 IX86_BUILTIN_AND256I,
27480 IX86_BUILTIN_ANDNOT256I,
27481 IX86_BUILTIN_PAVGB256,
27482 IX86_BUILTIN_PAVGW256,
27483 IX86_BUILTIN_PBLENDVB256,
27484 IX86_BUILTIN_PBLENDVW256,
27485 IX86_BUILTIN_PCMPEQB256,
27486 IX86_BUILTIN_PCMPEQW256,
27487 IX86_BUILTIN_PCMPEQD256,
27488 IX86_BUILTIN_PCMPEQQ256,
27489 IX86_BUILTIN_PCMPGTB256,
27490 IX86_BUILTIN_PCMPGTW256,
27491 IX86_BUILTIN_PCMPGTD256,
27492 IX86_BUILTIN_PCMPGTQ256,
27493 IX86_BUILTIN_PHADDW256,
27494 IX86_BUILTIN_PHADDD256,
27495 IX86_BUILTIN_PHADDSW256,
27496 IX86_BUILTIN_PHSUBW256,
27497 IX86_BUILTIN_PHSUBD256,
27498 IX86_BUILTIN_PHSUBSW256,
27499 IX86_BUILTIN_PMADDUBSW256,
27500 IX86_BUILTIN_PMADDWD256,
27501 IX86_BUILTIN_PMAXSB256,
27502 IX86_BUILTIN_PMAXSW256,
27503 IX86_BUILTIN_PMAXSD256,
27504 IX86_BUILTIN_PMAXUB256,
27505 IX86_BUILTIN_PMAXUW256,
27506 IX86_BUILTIN_PMAXUD256,
27507 IX86_BUILTIN_PMINSB256,
27508 IX86_BUILTIN_PMINSW256,
27509 IX86_BUILTIN_PMINSD256,
27510 IX86_BUILTIN_PMINUB256,
27511 IX86_BUILTIN_PMINUW256,
27512 IX86_BUILTIN_PMINUD256,
27513 IX86_BUILTIN_PMOVMSKB256,
27514 IX86_BUILTIN_PMOVSXBW256,
27515 IX86_BUILTIN_PMOVSXBD256,
27516 IX86_BUILTIN_PMOVSXBQ256,
27517 IX86_BUILTIN_PMOVSXWD256,
27518 IX86_BUILTIN_PMOVSXWQ256,
27519 IX86_BUILTIN_PMOVSXDQ256,
27520 IX86_BUILTIN_PMOVZXBW256,
27521 IX86_BUILTIN_PMOVZXBD256,
27522 IX86_BUILTIN_PMOVZXBQ256,
27523 IX86_BUILTIN_PMOVZXWD256,
27524 IX86_BUILTIN_PMOVZXWQ256,
27525 IX86_BUILTIN_PMOVZXDQ256,
27526 IX86_BUILTIN_PMULDQ256,
27527 IX86_BUILTIN_PMULHRSW256,
27528 IX86_BUILTIN_PMULHUW256,
27529 IX86_BUILTIN_PMULHW256,
27530 IX86_BUILTIN_PMULLW256,
27531 IX86_BUILTIN_PMULLD256,
27532 IX86_BUILTIN_PMULUDQ256,
27533 IX86_BUILTIN_POR256,
27534 IX86_BUILTIN_PSADBW256,
27535 IX86_BUILTIN_PSHUFB256,
27536 IX86_BUILTIN_PSHUFD256,
27537 IX86_BUILTIN_PSHUFHW256,
27538 IX86_BUILTIN_PSHUFLW256,
27539 IX86_BUILTIN_PSIGNB256,
27540 IX86_BUILTIN_PSIGNW256,
27541 IX86_BUILTIN_PSIGND256,
27542 IX86_BUILTIN_PSLLDQI256,
27543 IX86_BUILTIN_PSLLWI256,
27544 IX86_BUILTIN_PSLLW256,
27545 IX86_BUILTIN_PSLLDI256,
27546 IX86_BUILTIN_PSLLD256,
27547 IX86_BUILTIN_PSLLQI256,
27548 IX86_BUILTIN_PSLLQ256,
27549 IX86_BUILTIN_PSRAWI256,
27550 IX86_BUILTIN_PSRAW256,
27551 IX86_BUILTIN_PSRADI256,
27552 IX86_BUILTIN_PSRAD256,
27553 IX86_BUILTIN_PSRLDQI256,
27554 IX86_BUILTIN_PSRLWI256,
27555 IX86_BUILTIN_PSRLW256,
27556 IX86_BUILTIN_PSRLDI256,
27557 IX86_BUILTIN_PSRLD256,
27558 IX86_BUILTIN_PSRLQI256,
27559 IX86_BUILTIN_PSRLQ256,
27560 IX86_BUILTIN_PSUBB256,
27561 IX86_BUILTIN_PSUBW256,
27562 IX86_BUILTIN_PSUBD256,
27563 IX86_BUILTIN_PSUBQ256,
27564 IX86_BUILTIN_PSUBSB256,
27565 IX86_BUILTIN_PSUBSW256,
27566 IX86_BUILTIN_PSUBUSB256,
27567 IX86_BUILTIN_PSUBUSW256,
27568 IX86_BUILTIN_PUNPCKHBW256,
27569 IX86_BUILTIN_PUNPCKHWD256,
27570 IX86_BUILTIN_PUNPCKHDQ256,
27571 IX86_BUILTIN_PUNPCKHQDQ256,
27572 IX86_BUILTIN_PUNPCKLBW256,
27573 IX86_BUILTIN_PUNPCKLWD256,
27574 IX86_BUILTIN_PUNPCKLDQ256,
27575 IX86_BUILTIN_PUNPCKLQDQ256,
27576 IX86_BUILTIN_PXOR256,
27577 IX86_BUILTIN_MOVNTDQA256,
27578 IX86_BUILTIN_VBROADCASTSS_PS,
27579 IX86_BUILTIN_VBROADCASTSS_PS256,
27580 IX86_BUILTIN_VBROADCASTSD_PD256,
27581 IX86_BUILTIN_VBROADCASTSI256,
27582 IX86_BUILTIN_PBLENDD256,
27583 IX86_BUILTIN_PBLENDD128,
27584 IX86_BUILTIN_PBROADCASTB256,
27585 IX86_BUILTIN_PBROADCASTW256,
27586 IX86_BUILTIN_PBROADCASTD256,
27587 IX86_BUILTIN_PBROADCASTQ256,
27588 IX86_BUILTIN_PBROADCASTB128,
27589 IX86_BUILTIN_PBROADCASTW128,
27590 IX86_BUILTIN_PBROADCASTD128,
27591 IX86_BUILTIN_PBROADCASTQ128,
27592 IX86_BUILTIN_VPERMVARSI256,
27593 IX86_BUILTIN_VPERMDF256,
27594 IX86_BUILTIN_VPERMVARSF256,
27595 IX86_BUILTIN_VPERMDI256,
27596 IX86_BUILTIN_VPERMTI256,
27597 IX86_BUILTIN_VEXTRACT128I256,
27598 IX86_BUILTIN_VINSERT128I256,
27599 IX86_BUILTIN_MASKLOADD,
27600 IX86_BUILTIN_MASKLOADQ,
27601 IX86_BUILTIN_MASKLOADD256,
27602 IX86_BUILTIN_MASKLOADQ256,
27603 IX86_BUILTIN_MASKSTORED,
27604 IX86_BUILTIN_MASKSTOREQ,
27605 IX86_BUILTIN_MASKSTORED256,
27606 IX86_BUILTIN_MASKSTOREQ256,
27607 IX86_BUILTIN_PSLLVV4DI,
27608 IX86_BUILTIN_PSLLVV2DI,
27609 IX86_BUILTIN_PSLLVV8SI,
27610 IX86_BUILTIN_PSLLVV4SI,
27611 IX86_BUILTIN_PSRAVV8SI,
27612 IX86_BUILTIN_PSRAVV4SI,
27613 IX86_BUILTIN_PSRLVV4DI,
27614 IX86_BUILTIN_PSRLVV2DI,
27615 IX86_BUILTIN_PSRLVV8SI,
27616 IX86_BUILTIN_PSRLVV4SI,
27617
27618 IX86_BUILTIN_GATHERSIV2DF,
27619 IX86_BUILTIN_GATHERSIV4DF,
27620 IX86_BUILTIN_GATHERDIV2DF,
27621 IX86_BUILTIN_GATHERDIV4DF,
27622 IX86_BUILTIN_GATHERSIV4SF,
27623 IX86_BUILTIN_GATHERSIV8SF,
27624 IX86_BUILTIN_GATHERDIV4SF,
27625 IX86_BUILTIN_GATHERDIV8SF,
27626 IX86_BUILTIN_GATHERSIV2DI,
27627 IX86_BUILTIN_GATHERSIV4DI,
27628 IX86_BUILTIN_GATHERDIV2DI,
27629 IX86_BUILTIN_GATHERDIV4DI,
27630 IX86_BUILTIN_GATHERSIV4SI,
27631 IX86_BUILTIN_GATHERSIV8SI,
27632 IX86_BUILTIN_GATHERDIV4SI,
27633 IX86_BUILTIN_GATHERDIV8SI,
27634
27635 /* Alternate 4 element gather for the vectorizer where
27636 all operands are 32-byte wide. */
27637 IX86_BUILTIN_GATHERALTSIV4DF,
27638 IX86_BUILTIN_GATHERALTDIV8SF,
27639 IX86_BUILTIN_GATHERALTSIV4DI,
27640 IX86_BUILTIN_GATHERALTDIV8SI,
27641
27642 /* TFmode support builtins. */
27643 IX86_BUILTIN_INFQ,
27644 IX86_BUILTIN_HUGE_VALQ,
27645 IX86_BUILTIN_FABSQ,
27646 IX86_BUILTIN_COPYSIGNQ,
27647
27648 /* Vectorizer support builtins. */
27649 IX86_BUILTIN_CPYSGNPS,
27650 IX86_BUILTIN_CPYSGNPD,
27651 IX86_BUILTIN_CPYSGNPS256,
27652 IX86_BUILTIN_CPYSGNPD256,
27653
27654 /* FMA4 instructions. */
27655 IX86_BUILTIN_VFMADDSS,
27656 IX86_BUILTIN_VFMADDSD,
27657 IX86_BUILTIN_VFMADDPS,
27658 IX86_BUILTIN_VFMADDPD,
27659 IX86_BUILTIN_VFMADDPS256,
27660 IX86_BUILTIN_VFMADDPD256,
27661 IX86_BUILTIN_VFMADDSUBPS,
27662 IX86_BUILTIN_VFMADDSUBPD,
27663 IX86_BUILTIN_VFMADDSUBPS256,
27664 IX86_BUILTIN_VFMADDSUBPD256,
27665
27666 /* FMA3 instructions. */
27667 IX86_BUILTIN_VFMADDSS3,
27668 IX86_BUILTIN_VFMADDSD3,
27669
27670 /* XOP instructions. */
27671 IX86_BUILTIN_VPCMOV,
27672 IX86_BUILTIN_VPCMOV_V2DI,
27673 IX86_BUILTIN_VPCMOV_V4SI,
27674 IX86_BUILTIN_VPCMOV_V8HI,
27675 IX86_BUILTIN_VPCMOV_V16QI,
27676 IX86_BUILTIN_VPCMOV_V4SF,
27677 IX86_BUILTIN_VPCMOV_V2DF,
27678 IX86_BUILTIN_VPCMOV256,
27679 IX86_BUILTIN_VPCMOV_V4DI256,
27680 IX86_BUILTIN_VPCMOV_V8SI256,
27681 IX86_BUILTIN_VPCMOV_V16HI256,
27682 IX86_BUILTIN_VPCMOV_V32QI256,
27683 IX86_BUILTIN_VPCMOV_V8SF256,
27684 IX86_BUILTIN_VPCMOV_V4DF256,
27685
27686 IX86_BUILTIN_VPPERM,
27687
27688 IX86_BUILTIN_VPMACSSWW,
27689 IX86_BUILTIN_VPMACSWW,
27690 IX86_BUILTIN_VPMACSSWD,
27691 IX86_BUILTIN_VPMACSWD,
27692 IX86_BUILTIN_VPMACSSDD,
27693 IX86_BUILTIN_VPMACSDD,
27694 IX86_BUILTIN_VPMACSSDQL,
27695 IX86_BUILTIN_VPMACSSDQH,
27696 IX86_BUILTIN_VPMACSDQL,
27697 IX86_BUILTIN_VPMACSDQH,
27698 IX86_BUILTIN_VPMADCSSWD,
27699 IX86_BUILTIN_VPMADCSWD,
27700
27701 IX86_BUILTIN_VPHADDBW,
27702 IX86_BUILTIN_VPHADDBD,
27703 IX86_BUILTIN_VPHADDBQ,
27704 IX86_BUILTIN_VPHADDWD,
27705 IX86_BUILTIN_VPHADDWQ,
27706 IX86_BUILTIN_VPHADDDQ,
27707 IX86_BUILTIN_VPHADDUBW,
27708 IX86_BUILTIN_VPHADDUBD,
27709 IX86_BUILTIN_VPHADDUBQ,
27710 IX86_BUILTIN_VPHADDUWD,
27711 IX86_BUILTIN_VPHADDUWQ,
27712 IX86_BUILTIN_VPHADDUDQ,
27713 IX86_BUILTIN_VPHSUBBW,
27714 IX86_BUILTIN_VPHSUBWD,
27715 IX86_BUILTIN_VPHSUBDQ,
27716
27717 IX86_BUILTIN_VPROTB,
27718 IX86_BUILTIN_VPROTW,
27719 IX86_BUILTIN_VPROTD,
27720 IX86_BUILTIN_VPROTQ,
27721 IX86_BUILTIN_VPROTB_IMM,
27722 IX86_BUILTIN_VPROTW_IMM,
27723 IX86_BUILTIN_VPROTD_IMM,
27724 IX86_BUILTIN_VPROTQ_IMM,
27725
27726 IX86_BUILTIN_VPSHLB,
27727 IX86_BUILTIN_VPSHLW,
27728 IX86_BUILTIN_VPSHLD,
27729 IX86_BUILTIN_VPSHLQ,
27730 IX86_BUILTIN_VPSHAB,
27731 IX86_BUILTIN_VPSHAW,
27732 IX86_BUILTIN_VPSHAD,
27733 IX86_BUILTIN_VPSHAQ,
27734
27735 IX86_BUILTIN_VFRCZSS,
27736 IX86_BUILTIN_VFRCZSD,
27737 IX86_BUILTIN_VFRCZPS,
27738 IX86_BUILTIN_VFRCZPD,
27739 IX86_BUILTIN_VFRCZPS256,
27740 IX86_BUILTIN_VFRCZPD256,
27741
27742 IX86_BUILTIN_VPCOMEQUB,
27743 IX86_BUILTIN_VPCOMNEUB,
27744 IX86_BUILTIN_VPCOMLTUB,
27745 IX86_BUILTIN_VPCOMLEUB,
27746 IX86_BUILTIN_VPCOMGTUB,
27747 IX86_BUILTIN_VPCOMGEUB,
27748 IX86_BUILTIN_VPCOMFALSEUB,
27749 IX86_BUILTIN_VPCOMTRUEUB,
27750
27751 IX86_BUILTIN_VPCOMEQUW,
27752 IX86_BUILTIN_VPCOMNEUW,
27753 IX86_BUILTIN_VPCOMLTUW,
27754 IX86_BUILTIN_VPCOMLEUW,
27755 IX86_BUILTIN_VPCOMGTUW,
27756 IX86_BUILTIN_VPCOMGEUW,
27757 IX86_BUILTIN_VPCOMFALSEUW,
27758 IX86_BUILTIN_VPCOMTRUEUW,
27759
27760 IX86_BUILTIN_VPCOMEQUD,
27761 IX86_BUILTIN_VPCOMNEUD,
27762 IX86_BUILTIN_VPCOMLTUD,
27763 IX86_BUILTIN_VPCOMLEUD,
27764 IX86_BUILTIN_VPCOMGTUD,
27765 IX86_BUILTIN_VPCOMGEUD,
27766 IX86_BUILTIN_VPCOMFALSEUD,
27767 IX86_BUILTIN_VPCOMTRUEUD,
27768
27769 IX86_BUILTIN_VPCOMEQUQ,
27770 IX86_BUILTIN_VPCOMNEUQ,
27771 IX86_BUILTIN_VPCOMLTUQ,
27772 IX86_BUILTIN_VPCOMLEUQ,
27773 IX86_BUILTIN_VPCOMGTUQ,
27774 IX86_BUILTIN_VPCOMGEUQ,
27775 IX86_BUILTIN_VPCOMFALSEUQ,
27776 IX86_BUILTIN_VPCOMTRUEUQ,
27777
27778 IX86_BUILTIN_VPCOMEQB,
27779 IX86_BUILTIN_VPCOMNEB,
27780 IX86_BUILTIN_VPCOMLTB,
27781 IX86_BUILTIN_VPCOMLEB,
27782 IX86_BUILTIN_VPCOMGTB,
27783 IX86_BUILTIN_VPCOMGEB,
27784 IX86_BUILTIN_VPCOMFALSEB,
27785 IX86_BUILTIN_VPCOMTRUEB,
27786
27787 IX86_BUILTIN_VPCOMEQW,
27788 IX86_BUILTIN_VPCOMNEW,
27789 IX86_BUILTIN_VPCOMLTW,
27790 IX86_BUILTIN_VPCOMLEW,
27791 IX86_BUILTIN_VPCOMGTW,
27792 IX86_BUILTIN_VPCOMGEW,
27793 IX86_BUILTIN_VPCOMFALSEW,
27794 IX86_BUILTIN_VPCOMTRUEW,
27795
27796 IX86_BUILTIN_VPCOMEQD,
27797 IX86_BUILTIN_VPCOMNED,
27798 IX86_BUILTIN_VPCOMLTD,
27799 IX86_BUILTIN_VPCOMLED,
27800 IX86_BUILTIN_VPCOMGTD,
27801 IX86_BUILTIN_VPCOMGED,
27802 IX86_BUILTIN_VPCOMFALSED,
27803 IX86_BUILTIN_VPCOMTRUED,
27804
27805 IX86_BUILTIN_VPCOMEQQ,
27806 IX86_BUILTIN_VPCOMNEQ,
27807 IX86_BUILTIN_VPCOMLTQ,
27808 IX86_BUILTIN_VPCOMLEQ,
27809 IX86_BUILTIN_VPCOMGTQ,
27810 IX86_BUILTIN_VPCOMGEQ,
27811 IX86_BUILTIN_VPCOMFALSEQ,
27812 IX86_BUILTIN_VPCOMTRUEQ,
27813
27814 /* LWP instructions. */
27815 IX86_BUILTIN_LLWPCB,
27816 IX86_BUILTIN_SLWPCB,
27817 IX86_BUILTIN_LWPVAL32,
27818 IX86_BUILTIN_LWPVAL64,
27819 IX86_BUILTIN_LWPINS32,
27820 IX86_BUILTIN_LWPINS64,
27821
27822 IX86_BUILTIN_CLZS,
27823
27824 /* RTM */
27825 IX86_BUILTIN_XBEGIN,
27826 IX86_BUILTIN_XEND,
27827 IX86_BUILTIN_XABORT,
27828 IX86_BUILTIN_XTEST,
27829
27830 /* BMI instructions. */
27831 IX86_BUILTIN_BEXTR32,
27832 IX86_BUILTIN_BEXTR64,
27833 IX86_BUILTIN_CTZS,
27834
27835 /* TBM instructions. */
27836 IX86_BUILTIN_BEXTRI32,
27837 IX86_BUILTIN_BEXTRI64,
27838
27839 /* BMI2 instructions. */
27840 IX86_BUILTIN_BZHI32,
27841 IX86_BUILTIN_BZHI64,
27842 IX86_BUILTIN_PDEP32,
27843 IX86_BUILTIN_PDEP64,
27844 IX86_BUILTIN_PEXT32,
27845 IX86_BUILTIN_PEXT64,
27846
27847 /* ADX instructions. */
27848 IX86_BUILTIN_ADDCARRYX32,
27849 IX86_BUILTIN_ADDCARRYX64,
27850
27851 /* FSGSBASE instructions. */
27852 IX86_BUILTIN_RDFSBASE32,
27853 IX86_BUILTIN_RDFSBASE64,
27854 IX86_BUILTIN_RDGSBASE32,
27855 IX86_BUILTIN_RDGSBASE64,
27856 IX86_BUILTIN_WRFSBASE32,
27857 IX86_BUILTIN_WRFSBASE64,
27858 IX86_BUILTIN_WRGSBASE32,
27859 IX86_BUILTIN_WRGSBASE64,
27860
27861 /* RDRND instructions. */
27862 IX86_BUILTIN_RDRAND16_STEP,
27863 IX86_BUILTIN_RDRAND32_STEP,
27864 IX86_BUILTIN_RDRAND64_STEP,
27865
27866 /* RDSEED instructions. */
27867 IX86_BUILTIN_RDSEED16_STEP,
27868 IX86_BUILTIN_RDSEED32_STEP,
27869 IX86_BUILTIN_RDSEED64_STEP,
27870
27871 /* F16C instructions. */
27872 IX86_BUILTIN_CVTPH2PS,
27873 IX86_BUILTIN_CVTPH2PS256,
27874 IX86_BUILTIN_CVTPS2PH,
27875 IX86_BUILTIN_CVTPS2PH256,
27876
27877 /* CFString built-in for darwin */
27878 IX86_BUILTIN_CFSTRING,
27879
27880 /* Builtins to get CPU type and supported features. */
27881 IX86_BUILTIN_CPU_INIT,
27882 IX86_BUILTIN_CPU_IS,
27883 IX86_BUILTIN_CPU_SUPPORTS,
27884
27885 IX86_BUILTIN_MAX
27886 };
27887
27888 /* Table for the ix86 builtin decls. */
27889 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
27890
27891 /* Table of all of the builtin functions that are possible with different ISAs
27892 but are waiting to be built until a function is declared to use that
27893 ISA. */
27894 struct builtin_isa {
27895 const char *name; /* function name */
27896 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
27897 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
27898 bool const_p; /* true if the declaration is constant */
27899 bool set_and_not_built_p; /* true if recorded here but the decl is not built yet */
27900 };
27901
27902 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
27903
27904
27905 /* Add an ix86 target builtin function with CODE, NAME and TCODE.  Save MASK,
27906 the isa_flags this builtin requires, in the ix86_builtins_isa array.  Store
27907 the function decl in the ix86_builtins array.  Return the function decl,
27908 or NULL_TREE if the builtin was not added.
27909
27910 If the front end has a special hook for builtin functions, delay adding
27911 builtin functions that aren't in the current ISA until the ISA is changed
27912 with function specific optimization.  Doing so can save about 300K for the
27913 default compiler.  When the builtin is expanded, check at that time whether
27914 it is valid.
27915
27916 If the front end doesn't have a special hook, record all builtins, even
27917 those that aren't in the current ISA, in case the user uses function
27918 specific options for a different ISA, so that we don't get scope errors
27919 if a builtin is added in the middle of a function scope. */
27920
27921 static inline tree
27922 def_builtin (HOST_WIDE_INT mask, const char *name,
27923 enum ix86_builtin_func_type tcode,
27924 enum ix86_builtins code)
27925 {
27926 tree decl = NULL_TREE;
27927
27928 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
27929 {
27930 ix86_builtins_isa[(int) code].isa = mask;
27931
27932 mask &= ~OPTION_MASK_ISA_64BIT;
27933 if (mask == 0
27934 || (mask & ix86_isa_flags) != 0
27935 || (lang_hooks.builtin_function
27936 == lang_hooks.builtin_function_ext_scope))
27937
27938 {
27939 tree type = ix86_get_builtin_func_type (tcode);
27940 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
27941 NULL, NULL_TREE);
27942 ix86_builtins[(int) code] = decl;
27943 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
27944 }
27945 else
27946 {
27947 ix86_builtins[(int) code] = NULL_TREE;
27948 ix86_builtins_isa[(int) code].tcode = tcode;
27949 ix86_builtins_isa[(int) code].name = name;
27950 ix86_builtins_isa[(int) code].const_p = false;
27951 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
27952 }
27953 }
27954
27955 return decl;
27956 }
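/* A minimal sketch of how def_builtin is typically invoked.  The name,
   function-type code and enumerator below are taken from the descriptor
   tables later in this section; the call itself only illustrates the calling
   convention, the actual registration happens elsewhere in this file.

     def_builtin (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_movntdqa",
                  V2DI_FTYPE_PV2DI, IX86_BUILTIN_MOVNTDQA);

   With -msse4.1 in effect this builds the decl immediately; otherwise (for a
   front end without the ext_scope hook) the builtin is merely recorded in
   ix86_builtins_isa and built later by ix86_add_new_builtins below.  */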
27957
27958 /* Like def_builtin, but also marks the function decl "const". */
27959
27960 static inline tree
27961 def_builtin_const (HOST_WIDE_INT mask, const char *name,
27962 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
27963 {
27964 tree decl = def_builtin (mask, name, tcode, code);
27965 if (decl)
27966 TREE_READONLY (decl) = 1;
27967 else
27968 ix86_builtins_isa[(int) code].const_p = true;
27969
27970 return decl;
27971 }
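/* A minimal sketch of the const variant; def_builtin_const is meant for
   builtins whose result depends only on their operands.  The function-type
   code shown here is assumed for illustration; the name and enumerator
   appear in this file.

     def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_addpd",
                        V2DF_FTYPE_V2DF_V2DF, IX86_BUILTIN_ADDPD);

   This registers the builtin exactly as def_builtin would and, in addition,
   marks the decl TREE_READONLY, or records const_p when the decl is deferred
   so that it can be marked once it is finally built.  */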
27972
27973 /* Add any new builtin functions for a given ISA that may not have been
27974 declared yet.  This saves a bit of space compared to always adding every
27975 declaration to the tree, whether or not it ends up being used. */
27976
27977 static void
27978 ix86_add_new_builtins (HOST_WIDE_INT isa)
27979 {
27980 int i;
27981
27982 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
27983 {
27984 if ((ix86_builtins_isa[i].isa & isa) != 0
27985 && ix86_builtins_isa[i].set_and_not_built_p)
27986 {
27987 tree decl, type;
27988
27989 /* Don't define the builtin again. */
27990 ix86_builtins_isa[i].set_and_not_built_p = false;
27991
27992 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
27993 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
27994 type, i, BUILT_IN_MD, NULL,
27995 NULL_TREE);
27996
27997 ix86_builtins[i] = decl;
27998 if (ix86_builtins_isa[i].const_p)
27999 TREE_READONLY (decl) = 1;
28000 }
28001 }
28002 }
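/* A worked sketch of how the deferral above fits together, assuming AVX2 is
   not enabled on the command line and the front end does not use the
   ext_scope hook (the function-type code is illustrative):

     def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_pmulld256",
                  V8SI_FTYPE_V8SI_V8SI, IX86_BUILTIN_PMULLD256);

   does not build a decl; it only records the name, tcode and isa mask and
   sets set_and_not_built_p.  If the ISA in effect is later extended, e.g. by
   a target attribute or pragma that turns on AVX2, ix86_add_new_builtins is
   called with the new isa flags, finds the deferred entry, builds the decl
   via add_builtin_function_ext_scope and clears set_and_not_built_p.  */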
28003
28004 /* Bits for builtin_description.flag. */
28005
28006 /* Set when we don't support the comparison natively, and should
28007 swap_comparison in order to support it. */
28008 #define BUILTIN_DESC_SWAP_OPERANDS 1
28009
28010 struct builtin_description
28011 {
28012 const HOST_WIDE_INT mask;
28013 const enum insn_code icode;
28014 const char *const name;
28015 const enum ix86_builtins code;
28016 const enum rtx_code comparison;
28017 const int flag;
28018 };
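/* How to read an entry of the descriptor tables that follow, using the first
   bdesc_comi entry below as the example:

     { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq",
       IX86_BUILTIN_COMIEQSS, UNEQ, 0 },

   mask selects the required ISA, icode names the insn pattern used for
   expansion, name is the user-visible builtin, code is the IX86_BUILTIN_*
   enumerator, comparison is the rtx comparison code, and flag carries
   table-specific data: bits such as BUILTIN_DESC_SWAP_OPERANDS for the
   comparison tables, a CCmode value cast to int in the pcmpestr/pcmpistr
   tables, or a function-type code cast to int in bdesc_special_args and
   bdesc_args.  */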
28019
28020 static const struct builtin_description bdesc_comi[] =
28021 {
28022 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
28023 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
28024 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
28025 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
28026 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
28027 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
28028 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
28029 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
28030 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
28031 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
28032 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
28033 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
28034 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
28035 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
28036 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
28037 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
28038 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
28039 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
28040 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
28041 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
28042 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
28043 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
28044 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
28045 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
28046 };
28047
28048 static const struct builtin_description bdesc_pcmpestr[] =
28049 {
28050 /* SSE4.2 */
28051 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
28052 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
28053 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
28054 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
28055 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
28056 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
28057 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
28058 };
28059
28060 static const struct builtin_description bdesc_pcmpistr[] =
28061 {
28062 /* SSE4.2 */
28063 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
28064 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
28065 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
28066 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
28067 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
28068 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
28069 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
28070 };
28071
28072 /* Special builtins with variable number of arguments. */
28073 static const struct builtin_description bdesc_special_args[] =
28074 {
28075 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
28076 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
28077 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
28078
28079 /* 80387 (used internally for atomic compound assignment). */
28080 { 0, CODE_FOR_fnstenv, "__builtin_ia32_fnstenv", IX86_BUILTIN_FNSTENV, UNKNOWN, (int) VOID_FTYPE_PVOID },
28081 { 0, CODE_FOR_fldenv, "__builtin_ia32_fldenv", IX86_BUILTIN_FLDENV, UNKNOWN, (int) VOID_FTYPE_PCVOID },
28082 { 0, CODE_FOR_fnstsw, "__builtin_ia32_fnstsw", IX86_BUILTIN_FNSTSW, UNKNOWN, (int) VOID_FTYPE_PUSHORT },
28083 { 0, CODE_FOR_fnclex, "__builtin_ia32_fnclex", IX86_BUILTIN_FNCLEX, UNKNOWN, (int) VOID_FTYPE_VOID },
28084
28085 /* MMX */
28086 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
28087
28088 /* 3DNow! */
28089 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
28090
28091 /* FXSR, XSAVE and XSAVEOPT */
28092 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
28093 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
28094 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28095 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28096 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28097
28098 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
28099 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
28100 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28101 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28102 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28103
28104 /* SSE */
28105 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
28106 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
28107 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
28108
28109 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
28110 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
28111 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
28112 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
28113
28114 /* SSE or 3DNow!A */
28115 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
28116 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
28117
28118 /* SSE2 */
28119 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
28120 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
28121 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
28122 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedquv16qi, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
28123 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
28124 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
28125 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
28126 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
28127 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
28128 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddquv16qi, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
28129
28130 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
28131 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
28132
28133 /* SSE3 */
28134 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
28135
28136 /* SSE4.1 */
28137 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
28138
28139 /* SSE4A */
28140 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
28141 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
28142
28143 /* AVX */
28144 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
28145 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
28146
28147 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
28148 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
28149 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
28150 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
28151 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
28152
28153 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
28154 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
28155 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
28156 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
28157 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddquv32qi, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
28158 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedquv32qi, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
28159 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
28160
28161 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
28162 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
28163 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
28164
28165 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
28166 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
28167 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
28168 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
28169 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
28170 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
28171 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
28172 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
28173
28174 /* AVX2 */
28175 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
28176 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
28177 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
28178 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
28179 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
28180 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
28181 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
28182 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
28183 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
28184
28185 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
28186 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
28187 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
28188 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
28189 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
28190 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
28191
28192 /* FSGSBASE */
28193 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28194 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
28195 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28196 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
28197 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
28198 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
28199 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
28200 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
28201
28202 /* RTM */
28203 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28204 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
28205 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
28206 };
28207
28208 /* Builtins with variable number of arguments. */
28209 static const struct builtin_description bdesc_args[] =
28210 {
28211 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
28212 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
28213 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
28214 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
28215 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
28216 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
28217 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
28218
28219 /* MMX */
28220 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28221 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28222 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28223 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28224 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28225 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28226
28227 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28228 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28229 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28230 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28231 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28232 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28233 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28234 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28235
28236 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28237 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28238
28239 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28240 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28241 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28242 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28243
28244 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28245 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28246 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28247 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28248 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28249 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28250
28251 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28252 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28253 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28254 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28255 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
28256 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
28257
28258 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
28259 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
28260 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
28261
28262 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
28263
28264 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28265 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28266 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
28267 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28268 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28269 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
28270
28271 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28272 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28273 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
28274 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28275 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28276 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
28277
28278 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28279 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28280 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28281 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28282
28283 /* 3DNow! */
28284 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
28285 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
28286 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28287 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28288
28289 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28290 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28291 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28292 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28293 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28294 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28295 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28296 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28297 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28298 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28299 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28300 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28301 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28302 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28303 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28304
28305 /* 3DNow!A */
28306 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
28307 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
28308 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
28309 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28310 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28311 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28312
28313 /* SSE */
28314 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
28315 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28316 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28317 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28318 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28319 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28320 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
28321 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
28322 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
28323 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
28324 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
28325 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
28326
28327 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28328
28329 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28330 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28331 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28332 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28333 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28334 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28335 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28336 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28337
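/* In the compare entries below, the cmpgt/cmpge forms reuse the LT/LE codes
   with the operands swapped (the *_SWAP prototypes), and the "not" forms map
   to the unordered codes UNGE/UNGT.  */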
28338 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
28339 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
28340 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
28341 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28342 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28343 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28344 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
28345 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
28346 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
28347 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28348 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
28349 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28350 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
28351 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
28352 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
28353 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28354 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
28355 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
28356 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
28357 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28358
28359 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28360 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28361 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28362 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28363
28364 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28365 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28366 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28367 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28368
28369 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28370
28371 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28372 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28373 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28374 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28375 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28376
28377 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
28378 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
28379 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
28380
28381 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
28382
28383 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28384 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28385 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28386
28387 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
28388 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
28389
28390 /* SSE MMX or 3DNow!A */
28391 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28392 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28393 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28394
28395 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28396 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28397 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28398 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28399
28400 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
28401 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
28402
28403 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
28404
28405 /* SSE2 */
28406 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28407
28408 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
28409 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
28410 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
28411 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
28412 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
28413
28414 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
28415 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
28416 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
28417 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
28418 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
28419
28420 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
28421
28422 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
28423 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
28424 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
28425 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
28426
28427 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_fix_notruncv4sfv4si, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28428 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
28429 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28430
28431 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28432 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28433 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28434 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28435 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28436 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28437 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28438 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28439
28440 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
28441 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
28442 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
28443 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28444 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
28445 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28446 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
28447 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
28448 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
28449 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28450 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28451 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28452 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
28453 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
28454 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
28455 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28456 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
28457 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
28458 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
28459 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28460
28461 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28462 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28463 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28464 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28465
28466 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28467 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28468 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28469 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28470
28471 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28472
28473 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28474 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28475 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28476
28477 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
28478
28479 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28480 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28481 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28482 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28483 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28484 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28485 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28486 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28487
28488 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28489 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28490 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28491 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28492 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28493 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28494 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28495 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28496
28497 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28498 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28499
28500 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28501 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28502 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28503 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28504
28505 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28506 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28507
28508 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28509 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28510 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28511 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28512 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28513 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28514
28515 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28516 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28517 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28518 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28519
28520 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28521 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28522 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28523 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28524 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28525 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28526 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28527 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28528
28529 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
28530 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28531 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
28532
28533 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28534 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
28535
28536 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
28537 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28538
28539 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
28540
28541 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
28542 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
28543 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
28544 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
28545
28546 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
28547 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28548 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28549 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
28550 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28551 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28552 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
28553
28554 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
28555 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28556 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28557 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
28558 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28559 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28560 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
28561
28562 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28563 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28564 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28565 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28566
28567 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
28568 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
28569 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
28570
28571 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
28572
28573 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28574
28575 /* SSE2 MMX */
28576 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
28577 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
28578
28579 /* SSE3 */
28580 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28581 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28582
28583 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28584 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28585 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28586 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28587 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28588 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28589
28590 /* SSSE3 */
28591 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28592 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
28593 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28594 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
28595 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28596 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
28597
28598 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28599 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28600 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28601 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28602 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28603 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28604 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28605 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28606 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28607 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28608 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28609 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28610 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
28611 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
28612 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28613 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28614 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28615 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28616 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28617 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28618 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28619 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28620 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28621 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28622
28623 /* SSSE3. */
28624 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
28625 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
28626
28627 /* SSE4.1 */
28628 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28629 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28630 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
28631 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
28632 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28633 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28634 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28635 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
28636 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
28637 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
28638
28639 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28640 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28641 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28642 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28643 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28644 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28645 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28646 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28647 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28648 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28649 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28650 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28651 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28652
28653 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28654 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28655 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28656 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28657 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28658 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28659 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28660 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28661 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28662 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28663 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28664 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28665
28666 /* SSE4.1 */
28667 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28668 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28669 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28670 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28671
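/* For the floor/ceil/trunc/rint entries below, the comparison field is
   reused to carry a fixed rounding-mode selector (ROUND_FLOOR, ROUND_CEIL,
   ROUND_TRUNC, ROUND_MXCSR) into the sse4_1_round* patterns, hence the
   (enum rtx_code) casts.  */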
28672 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
28673 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
28674 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
28675 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
28676
28677 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28678 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28679
28680 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
28681 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
28682
28683 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
28684 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
28685 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
28686 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
28687
28688 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
28689 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
28690
28691 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28692 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28693
28694 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28695 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28696 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28697
28698 /* SSE4.2 */
28699 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28700 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
28701 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
28702 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28703 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28704
28705 /* SSE4A */
28706 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
28707 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
28708 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
28709 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28710
28711 /* AES */
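/* Note (assumption, not stated in this table): the AES and PCLMUL rows use a
   zero name field, which suggests the user-visible builtins are registered
   elsewhere and only expanded through these entries.  */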
28712 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
28713 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28714
28715 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28716 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28717 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28718 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28719
28720 /* PCLMUL */
28721 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
28722
28723 /* AVX */
28724 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28725 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28726 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28727 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28728 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28729 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28730 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28731 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28732 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28733 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28734 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28735 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28736 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28737 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28738 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28739 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28740 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28741 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28742 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28743 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28744 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28745 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28746 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28747 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28748 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28749 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28750
28751 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
28752 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
28753 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
28754 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28755
28756 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28757 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28758 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
28759 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
28760 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28761 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28762 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28763 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28764 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28765 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28766 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28767 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28768 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28769 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
28770 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
28771 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
28772 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
28773 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
28774 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
28775 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_fix_notruncv8sfv8si, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28776 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
28777 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28778 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28779 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28780 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28781 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28782 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28783 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28784 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28785 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28786 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28787 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
28788 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
28789 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
28790
28791 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28792 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28793 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28794
28795 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28796 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28797 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28798 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28799 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28800
28801 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28802
28803 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28804 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28805
28806 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
28807 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
28808 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
28809 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
28810
28811 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28812 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28813
28814 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28815 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28816
28817 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
28818 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
28819 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
28820 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
28821
28822 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
28823 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
28824
28825 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28826 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28827
28828 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28829 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28830 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28831 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28832
28833 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28834 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28835 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28836 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
28837 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
28838 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
28839
28840 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28841 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28842 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28843 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28844 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28845 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28846 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28847 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28848 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28849 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28850 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28851 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28852 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28853 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28854 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28855
28856 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
28857 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
28858
28859 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28860 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28861
28862 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28863
28864 /* AVX2 */
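/* All entries in this block require AVX2 (-mavx2); most operate on
   256-bit integer vectors, with a few 128-bit broadcast and blend
   forms at the end.  */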
28865 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
28866 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
28867 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
28868 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
28869 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28870 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28871 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28872 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28873 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28874 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28875 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28876 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28877 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28878 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28879 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28880 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28881 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
28882 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28883 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28884 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28885 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28886 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
28887 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
28888 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28889 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28890 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28891 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28892 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28893 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28894 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28895 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28896 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28897 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28898 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28899 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28900 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28901 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28902 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28903 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
28904 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28905 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28906 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28907 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28908 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28909 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28910 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28911 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28912 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28913 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28914 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28915 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28916 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
28917 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28918 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28919 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28920 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28921 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28922 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28923 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28924 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28925 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28926 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28927 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28928 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28929 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28930 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28931 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28932 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28933 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28934 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28935 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28936 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28937 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28938 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28939 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
28940 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28941 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28942 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28943 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28944 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28945 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28946 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28947 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28948 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28949 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28950 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28951 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28952 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28953 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28954 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28955 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28956 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28957 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28958 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28959 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28960 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28961 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28962 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28963 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28964 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28965 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28966 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28967 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28968 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28969 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28970 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28971 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28972 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28973 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28974 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28975 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28976 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28977 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28978 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28979 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28980 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28981 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28982 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28983 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28984 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
28985 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28986 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
28987 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
28988 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28989 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28990 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28991 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28992 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28993 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28994 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28995 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28996 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28997 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
28998 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
28999 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
29000 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
29001 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
29002 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
29003 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
29004 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
29005 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
29006 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
29007 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
29008 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
29009 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
29010 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
29011
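/* LZCNT */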
29012 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
29013
29014 /* BMI */
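/* Note: bextr expects the bit-field start (bits 0-7) and length
   (bits 8-15) packed into its second operand.  */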
29015 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29016 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29017 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
29018
29019 /* TBM */
29020 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29021 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29022
29023 /* F16C */
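/* Half-precision conversions; the INT operand of vcvtps2ph is the
   rounding-control immediate.  */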
29024 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
29025 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
29026 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
29027 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
29028
29029 /* BMI2 */
29030 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29031 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29032 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29033 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29034 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29035 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29036 };
29037
29038 /* FMA4 and XOP. */
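/* Each MULTI_ARG_* macro below is a shorter alias for the corresponding
   V..._FTYPE_... prototype enumerator.  The _CMP, _TF and _IMM suffixes
   mark variants whose final operand is a comparison code or an immediate
   and is handled specially by the multi-arg expander.  */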
29039 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
29040 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
29041 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
29042 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
29043 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
29044 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
29045 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
29046 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
29047 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
29048 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
29049 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
29050 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
29051 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
29052 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
29053 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
29054 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
29055 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
29056 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
29057 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
29058 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
29059 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
29060 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
29061 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
29062 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
29063 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
29064 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
29065 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
29066 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
29067 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
29068 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
29069 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
29070 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
29071 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
29072 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
29073 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
29074 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
29075 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
29076 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
29077 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
29078 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
29079 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
29080 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
29081 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
29082 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
29083 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
29084 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
29085 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
29086 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
29087 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
29088 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
29089 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
29090 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
29091
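/* Multi-argument (FMA, FMA4 and XOP) builtins.  Each entry lists the ISA
   mask required to enable it, the insn code used for expansion, the
   builtin's name and enum value, an optional comparison code, and its
   prototype expressed with the MULTI_ARG_* aliases above.  */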
29092 static const struct builtin_description bdesc_multi_arg[] =
29093 {
29094 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
29095 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
29096 UNKNOWN, (int)MULTI_ARG_3_SF },
29097 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
29098 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
29099 UNKNOWN, (int)MULTI_ARG_3_DF },
29100
29101 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
29102 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
29103 UNKNOWN, (int)MULTI_ARG_3_SF },
29104 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
29105 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
29106 UNKNOWN, (int)MULTI_ARG_3_DF },
29107
29108 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
29109 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
29110 UNKNOWN, (int)MULTI_ARG_3_SF },
29111 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
29112 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
29113 UNKNOWN, (int)MULTI_ARG_3_DF },
29114 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
29115 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
29116 UNKNOWN, (int)MULTI_ARG_3_SF2 },
29117 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
29118 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
29119 UNKNOWN, (int)MULTI_ARG_3_DF2 },
29120
29121 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
29122 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
29123 UNKNOWN, (int)MULTI_ARG_3_SF },
29124 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
29125 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
29126 UNKNOWN, (int)MULTI_ARG_3_DF },
29127 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
29128 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
29129 UNKNOWN, (int)MULTI_ARG_3_SF2 },
29130 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
29131 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
29132 UNKNOWN, (int)MULTI_ARG_3_DF2 },
29133
29134 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
29135 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
29136 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
29137 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
29138 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
29139 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
29140 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
29141
29142 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
29143 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
29144 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
29145 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
29146 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
29147 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
29148 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
29149
29150 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
29151
29152 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
29153 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
29154 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29155 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29156 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
29157 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
29158 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29159 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29160 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29161 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29162 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29163 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29164
29165 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29166 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
29167 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
29168 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
29169 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
29170 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
29171 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
29172 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
29173 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29174 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
29175 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
29176 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
29177 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29178 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
29179 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
29180 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
29181
29182 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
29183 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
29184 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
29185 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
29186 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
29187 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
29188
29189 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29190 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
29191 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
29192 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29193 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
29194 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29195 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29196 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
29197 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
29198 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29199 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
29200 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29201 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29202 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29203 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29204
29205 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
29206 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
29207 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
29208 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
29209 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
29210 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
29211 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
29212
29213 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
29214 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
29215 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
29216 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
29217 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
29218 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
29219 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
29220
29221 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
29222 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
29223 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
29224 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
29225 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
29226 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
29227 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
29228
29229 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
29230 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
29231 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
29232 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
29233 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
29234 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
29235 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
29236
29237 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
29238 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
29239 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
29240 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
29241 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
29242 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
29243 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
29244
29245 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
29246 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
29247 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
29248 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
29249 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
29250 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
29251 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
29252
29253 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
29254 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
29255 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
29256 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
29257 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
29258 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
29259 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
29260
29261 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
29262 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
29263 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
29264 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
29265 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
29266 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
29267 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
29268
29269 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
29270 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
29271 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
29272 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
29273 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
29274 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
29275 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
29276 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
29277
29278 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
29279 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
29280 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
29281 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
29282 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
29283 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
29284 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
29285 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
29286
29287 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
29288 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
29289 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
29290 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
29291
29292 };
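/* For illustration only (not part of the table above): each XOP vpcom
   builtin compares two vectors element-wise under the rtx_code listed in
   its table entry and yields an all-ones / all-zeros mask per element.
   A minimal sketch, assuming -mxop; user code normally reaches these
   through <x86intrin.h> (e.g. _mm_comlt_epi8):

     typedef char v16qi __attribute__ ((vector_size (16)));

     v16qi
     bytes_less_than (v16qi a, v16qi b)
     {
       return __builtin_ia32_vpcomltb (a, b);
     }
*/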
29293 \f
29294 /* TM vector builtins. */
29295
29296 /* Reuse the existing x86-specific `struct builtin_description' because
29297 we're lazy. Add casts to make them fit. */
29298 static const struct builtin_description bdesc_tm[] =
29299 {
29300 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29301 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29302 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29303 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29304 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29305 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29306 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29307
29308 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29309 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29310 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29311 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29312 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29313 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29314 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29315
29316 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29317 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29318 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29319 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29320 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29321 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29322 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29323
29324 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
29325 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
29326 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
29327 };
29328
29329 /* TM callbacks. */
29330
29331 /* Return the builtin decl needed to load a vector of TYPE. */
29332
29333 static tree
29334 ix86_builtin_tm_load (tree type)
29335 {
29336 if (TREE_CODE (type) == VECTOR_TYPE)
29337 {
29338 switch (tree_to_uhwi (TYPE_SIZE (type)))
29339 {
29340 case 64:
29341 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
29342 case 128:
29343 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
29344 case 256:
29345 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
29346 }
29347 }
29348 return NULL_TREE;
29349 }
29350
29351 /* Return the builtin decl needed to store a vector of TYPE. */
29352
29353 static tree
29354 ix86_builtin_tm_store (tree type)
29355 {
29356 if (TREE_CODE (type) == VECTOR_TYPE)
29357 {
29358 switch (tree_to_uhwi (TYPE_SIZE (type)))
29359 {
29360 case 64:
29361 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
29362 case 128:
29363 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
29364 case 256:
29365 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
29366 }
29367 }
29368 return NULL_TREE;
29369 }
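/* For illustration only: TYPE_SIZE is measured in bits, so a 16-byte
   vector type such as

     typedef float v4sf __attribute__ ((vector_size (16)));

   has TYPE_SIZE 128, and its transactional loads and stores map to
   BUILT_IN_TM_LOAD_M128 / BUILT_IN_TM_STORE_M128 above.  */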
29370 \f
29371 /* Initialize the transactional memory vector load/store builtins. */
29372
29373 static void
29374 ix86_init_tm_builtins (void)
29375 {
29376 enum ix86_builtin_func_type ftype;
29377 const struct builtin_description *d;
29378 size_t i;
29379 tree decl;
29380 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
29381 tree attrs_log, attrs_type_log;
29382
29383 if (!flag_tm)
29384 return;
29385
29386 /* If there are no builtins defined, we must be compiling in a
29387 language without trans-mem support. */
29388 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
29389 return;
29390
29391 /* Use whatever attributes a normal TM load has. */
29392 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
29393 attrs_load = DECL_ATTRIBUTES (decl);
29394 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29395 /* Use whatever attributes a normal TM store has. */
29396 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
29397 attrs_store = DECL_ATTRIBUTES (decl);
29398 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29399 /* Use whatever attributes a normal TM log has. */
29400 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
29401 attrs_log = DECL_ATTRIBUTES (decl);
29402 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29403
29404 for (i = 0, d = bdesc_tm;
29405 i < ARRAY_SIZE (bdesc_tm);
29406 i++, d++)
29407 {
29408 if ((d->mask & ix86_isa_flags) != 0
29409 || (lang_hooks.builtin_function
29410 == lang_hooks.builtin_function_ext_scope))
29411 {
29412 tree type, attrs, attrs_type;
29413 enum built_in_function code = (enum built_in_function) d->code;
29414
29415 ftype = (enum ix86_builtin_func_type) d->flag;
29416 type = ix86_get_builtin_func_type (ftype);
29417
29418 if (BUILTIN_TM_LOAD_P (code))
29419 {
29420 attrs = attrs_load;
29421 attrs_type = attrs_type_load;
29422 }
29423 else if (BUILTIN_TM_STORE_P (code))
29424 {
29425 attrs = attrs_store;
29426 attrs_type = attrs_type_store;
29427 }
29428 else
29429 {
29430 attrs = attrs_log;
29431 attrs_type = attrs_type_log;
29432 }
29433 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
29434 /* The builtin's name without the "__builtin_" prefix,
29435 for calling it directly. */
29436 d->name + strlen ("__builtin_"),
29437 attrs);
29438 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
29439 set the TYPE_ATTRIBUTES. */
29440 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
29441
29442 set_builtin_decl (code, decl, false);
29443 }
29444 }
29445 }
29446
29447 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
29448 in the current target ISA, to allow the user to compile particular modules
29449 with target-specific options that differ from the command-line
29450 options. */
29451 static void
29452 ix86_init_mmx_sse_builtins (void)
29453 {
29454 const struct builtin_description * d;
29455 enum ix86_builtin_func_type ftype;
29456 size_t i;
29457
29458 /* Add all special builtins with variable number of operands. */
29459 for (i = 0, d = bdesc_special_args;
29460 i < ARRAY_SIZE (bdesc_special_args);
29461 i++, d++)
29462 {
29463 if (d->name == 0)
29464 continue;
29465
29466 ftype = (enum ix86_builtin_func_type) d->flag;
29467 def_builtin (d->mask, d->name, ftype, d->code);
29468 }
29469
29470 /* Add all builtins with variable number of operands. */
29471 for (i = 0, d = bdesc_args;
29472 i < ARRAY_SIZE (bdesc_args);
29473 i++, d++)
29474 {
29475 if (d->name == 0)
29476 continue;
29477
29478 ftype = (enum ix86_builtin_func_type) d->flag;
29479 def_builtin_const (d->mask, d->name, ftype, d->code);
29480 }
29481
29482 /* pcmpestr[im] insns. */
29483 for (i = 0, d = bdesc_pcmpestr;
29484 i < ARRAY_SIZE (bdesc_pcmpestr);
29485 i++, d++)
29486 {
29487 if (d->code == IX86_BUILTIN_PCMPESTRM128)
29488 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
29489 else
29490 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
29491 def_builtin_const (d->mask, d->name, ftype, d->code);
29492 }
29493
29494 /* pcmpistr[im] insns. */
29495 for (i = 0, d = bdesc_pcmpistr;
29496 i < ARRAY_SIZE (bdesc_pcmpistr);
29497 i++, d++)
29498 {
29499 if (d->code == IX86_BUILTIN_PCMPISTRM128)
29500 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
29501 else
29502 ftype = INT_FTYPE_V16QI_V16QI_INT;
29503 def_builtin_const (d->mask, d->name, ftype, d->code);
29504 }
29505
29506 /* comi/ucomi insns. */
29507 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29508 {
29509 if (d->mask == OPTION_MASK_ISA_SSE2)
29510 ftype = INT_FTYPE_V2DF_V2DF;
29511 else
29512 ftype = INT_FTYPE_V4SF_V4SF;
29513 def_builtin_const (d->mask, d->name, ftype, d->code);
29514 }
29515
29516 /* SSE */
29517 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
29518 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
29519 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
29520 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
29521
29522 /* SSE or 3DNow!A */
29523 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29524 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
29525 IX86_BUILTIN_MASKMOVQ);
29526
29527 /* SSE2 */
29528 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
29529 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
29530
29531 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
29532 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
29533 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
29534 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
29535
29536 /* SSE3. */
29537 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
29538 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
29539 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
29540 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
29541
29542 /* AES */
29543 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
29544 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
29545 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
29546 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
29547 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
29548 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
29549 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
29550 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
29551 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
29552 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
29553 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
29554 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
29555
29556 /* PCLMUL */
29557 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
29558 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
29559
29560 /* RDRND */
29561 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
29562 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
29563 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
29564 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
29565 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
29566 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
29567 IX86_BUILTIN_RDRAND64_STEP);
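/* For illustration only: the rdrand*_step builtins return nonzero on
   success and store the random value through their pointer argument
   (INT_FTYPE_PUNSIGNED above).  A minimal sketch, assuming -mrdrnd;
   user code normally uses _rdrand32_step from <immintrin.h>:

     int
     get_random_u32 (unsigned int *value)
     {
       return __builtin_ia32_rdrand32_step (value);
     }
*/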
29568
29569 /* AVX2 */
29570 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
29571 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
29572 IX86_BUILTIN_GATHERSIV2DF);
29573
29574 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
29575 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
29576 IX86_BUILTIN_GATHERSIV4DF);
29577
29578 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
29579 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
29580 IX86_BUILTIN_GATHERDIV2DF);
29581
29582 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
29583 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
29584 IX86_BUILTIN_GATHERDIV4DF);
29585
29586 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
29587 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
29588 IX86_BUILTIN_GATHERSIV4SF);
29589
29590 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
29591 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
29592 IX86_BUILTIN_GATHERSIV8SF);
29593
29594 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
29595 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
29596 IX86_BUILTIN_GATHERDIV4SF);
29597
29598 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
29599 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
29600 IX86_BUILTIN_GATHERDIV8SF);
29601
29602 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
29603 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
29604 IX86_BUILTIN_GATHERSIV2DI);
29605
29606 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
29607 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
29608 IX86_BUILTIN_GATHERSIV4DI);
29609
29610 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
29611 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
29612 IX86_BUILTIN_GATHERDIV2DI);
29613
29614 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
29615 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
29616 IX86_BUILTIN_GATHERDIV4DI);
29617
29618 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
29619 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
29620 IX86_BUILTIN_GATHERSIV4SI);
29621
29622 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
29623 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
29624 IX86_BUILTIN_GATHERSIV8SI);
29625
29626 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
29627 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
29628 IX86_BUILTIN_GATHERDIV4SI);
29629
29630 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
29631 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
29632 IX86_BUILTIN_GATHERDIV8SI);
29633
29634 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
29635 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
29636 IX86_BUILTIN_GATHERALTSIV4DF);
29637
29638 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
29639 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
29640 IX86_BUILTIN_GATHERALTDIV8SF);
29641
29642 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
29643 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
29644 IX86_BUILTIN_GATHERALTSIV4DI);
29645
29646 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
29647 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
29648 IX86_BUILTIN_GATHERALTDIV8SI);
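/* For illustration only: the gather builtins above take the pass-through
   source vector, the base pointer, the index vector, a mask whose element
   sign bits select which lanes are loaded, and a literal scale of 1, 2, 4
   or 8.  A minimal sketch, assuming -mavx2; user code normally uses the
   _mm_i32gather_ps family from <immintrin.h>:

     typedef float v4sf __attribute__ ((vector_size (16)));
     typedef int v4si __attribute__ ((vector_size (16)));

     v4sf
     gather_four_floats (const float *base, v4si idx)
     {
       v4sf src = { 0, 0, 0, 0 };
       v4sf mask = { -1, -1, -1, -1 };
       return __builtin_ia32_gathersiv4sf (src, base, idx, mask, 4);
     }
*/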
29649
29650 /* RTM. */
29651 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
29652 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
29653
29654 /* MMX access to the vec_init patterns. */
29655 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
29656 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
29657
29658 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
29659 V4HI_FTYPE_HI_HI_HI_HI,
29660 IX86_BUILTIN_VEC_INIT_V4HI);
29661
29662 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
29663 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
29664 IX86_BUILTIN_VEC_INIT_V8QI);
29665
29666 /* Access to the vec_extract patterns. */
29667 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
29668 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
29669 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
29670 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
29671 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
29672 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
29673 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
29674 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
29675 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
29676 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
29677
29678 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29679 "__builtin_ia32_vec_ext_v4hi",
29680 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
29681
29682 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
29683 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
29684
29685 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
29686 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
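/* For illustration only: the vec_ext builtins extract a single element by
   constant index.  A minimal sketch, assuming SSE is enabled; user code
   normally reaches these through the <xmmintrin.h>-style intrinsics:

     typedef float v4sf __attribute__ ((vector_size (16)));

     float
     third_element (v4sf v)
     {
       return __builtin_ia32_vec_ext_v4sf (v, 2);
     }
*/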
29687
29688 /* Access to the vec_set patterns. */
29689 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
29690 "__builtin_ia32_vec_set_v2di",
29691 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
29692
29693 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
29694 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
29695
29696 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
29697 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
29698
29699 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
29700 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
29701
29702 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29703 "__builtin_ia32_vec_set_v4hi",
29704 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
29705
29706 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
29707 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
29708
29709 /* RDSEED */
29710 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
29711 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
29712 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
29713 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
29714 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
29715 "__builtin_ia32_rdseed_di_step",
29716 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
29717
29718 /* ADCX */
29719 def_builtin (0, "__builtin_ia32_addcarryx_u32",
29720 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
29721 def_builtin (OPTION_MASK_ISA_64BIT,
29722 "__builtin_ia32_addcarryx_u64",
29723 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
29724 IX86_BUILTIN_ADDCARRYX64);
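/* For illustration only: the addcarryx builtins perform one
   add-with-carry step, returning the carry-out and storing the sum
   through the pointer argument; the 32-bit variant is registered above
   without an ISA mask, so it is always available.  A minimal sketch of
   a two-limb addition (the limb layout is a made-up example):

     void
     add_two_limbs (unsigned a[2], const unsigned b[2])
     {
       unsigned char carry
         = __builtin_ia32_addcarryx_u32 (0, a[0], b[0], &a[0]);
       __builtin_ia32_addcarryx_u32 (carry, a[1], b[1], &a[1]);
     }
*/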
29725
29726 /* Add FMA4 and XOP multi-arg instruction builtins. */
29727 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29728 {
29729 if (d->name == 0)
29730 continue;
29731
29732 ftype = (enum ix86_builtin_func_type) d->flag;
29733 def_builtin_const (d->mask, d->name, ftype, d->code);
29734 }
29735 }
29736
29737 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
29738 to return a pointer to VERSION_DECL if the outcome of the expression
29739 formed by PREDICATE_CHAIN is true. This function will be called during
29740 version dispatch to decide which function version to execute. It returns
29741 the basic block at the end, to which more conditions can be added. */
29742
29743 static basic_block
29744 add_condition_to_bb (tree function_decl, tree version_decl,
29745 tree predicate_chain, basic_block new_bb)
29746 {
29747 gimple return_stmt;
29748 tree convert_expr, result_var;
29749 gimple convert_stmt;
29750 gimple call_cond_stmt;
29751 gimple if_else_stmt;
29752
29753 basic_block bb1, bb2, bb3;
29754 edge e12, e23;
29755
29756 tree cond_var, and_expr_var = NULL_TREE;
29757 gimple_seq gseq;
29758
29759 tree predicate_decl, predicate_arg;
29760
29761 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
29762
29763 gcc_assert (new_bb != NULL);
29764 gseq = bb_seq (new_bb);
29765
29766
29767 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
29768 build_fold_addr_expr (version_decl));
29769 result_var = create_tmp_var (ptr_type_node, NULL);
29770 convert_stmt = gimple_build_assign (result_var, convert_expr);
29771 return_stmt = gimple_build_return (result_var);
29772
29773 if (predicate_chain == NULL_TREE)
29774 {
29775 gimple_seq_add_stmt (&gseq, convert_stmt);
29776 gimple_seq_add_stmt (&gseq, return_stmt);
29777 set_bb_seq (new_bb, gseq);
29778 gimple_set_bb (convert_stmt, new_bb);
29779 gimple_set_bb (return_stmt, new_bb);
29780 pop_cfun ();
29781 return new_bb;
29782 }
29783
29784 while (predicate_chain != NULL)
29785 {
29786 cond_var = create_tmp_var (integer_type_node, NULL);
29787 predicate_decl = TREE_PURPOSE (predicate_chain);
29788 predicate_arg = TREE_VALUE (predicate_chain);
29789 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
29790 gimple_call_set_lhs (call_cond_stmt, cond_var);
29791
29792 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
29793 gimple_set_bb (call_cond_stmt, new_bb);
29794 gimple_seq_add_stmt (&gseq, call_cond_stmt);
29795
29796 predicate_chain = TREE_CHAIN (predicate_chain);
29797
29798 if (and_expr_var == NULL)
29799 and_expr_var = cond_var;
29800 else
29801 {
29802 gimple assign_stmt;
29803 /* Use MIN_EXPR to check whether any predicate result so far is zero:
29804 and_expr_var = min_expr <cond_var, and_expr_var>. */
29805 assign_stmt = gimple_build_assign (and_expr_var,
29806 build2 (MIN_EXPR, integer_type_node,
29807 cond_var, and_expr_var));
29808
29809 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
29810 gimple_set_bb (assign_stmt, new_bb);
29811 gimple_seq_add_stmt (&gseq, assign_stmt);
29812 }
29813 }
29814
29815 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
29816 integer_zero_node,
29817 NULL_TREE, NULL_TREE);
29818 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
29819 gimple_set_bb (if_else_stmt, new_bb);
29820 gimple_seq_add_stmt (&gseq, if_else_stmt);
29821
29822 gimple_seq_add_stmt (&gseq, convert_stmt);
29823 gimple_seq_add_stmt (&gseq, return_stmt);
29824 set_bb_seq (new_bb, gseq);
29825
29826 bb1 = new_bb;
29827 e12 = split_block (bb1, if_else_stmt);
29828 bb2 = e12->dest;
29829 e12->flags &= ~EDGE_FALLTHRU;
29830 e12->flags |= EDGE_TRUE_VALUE;
29831
29832 e23 = split_block (bb2, return_stmt);
29833
29834 gimple_set_bb (convert_stmt, bb2);
29835 gimple_set_bb (return_stmt, bb2);
29836
29837 bb3 = e23->dest;
29838 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
29839
29840 remove_edge (e23);
29841 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
29842
29843 pop_cfun ();
29844
29845 return bb3;
29846 }
29847
29848 /* This parses the attribute arguments to target in DECL and determines
29849 the right builtin to use to match the platform specification.
29850 It returns the priority value for this version decl. If PREDICATE_LIST
29851 is not NULL, it stores the list of cpu features that need to be checked
29852 before dispatching this function. */
29853
29854 static unsigned int
29855 get_builtin_code_for_version (tree decl, tree *predicate_list)
29856 {
29857 tree attrs;
29858 struct cl_target_option cur_target;
29859 tree target_node;
29860 struct cl_target_option *new_target;
29861 const char *arg_str = NULL;
29862 const char *attrs_str = NULL;
29863 char *tok_str = NULL;
29864 char *token;
29865
29866 /* Priority of i386 features, greater value is higher priority. This is
29867 used to decide the order in which function dispatch must happen. For
29868 instance, a version specialized for SSE4.2 should be checked for dispatch
29869 before a version for SSE3, as SSE4.2 implies SSE3. */
29870 enum feature_priority
29871 {
29872 P_ZERO = 0,
29873 P_MMX,
29874 P_SSE,
29875 P_SSE2,
29876 P_SSE3,
29877 P_SSSE3,
29878 P_PROC_SSSE3,
29879 P_SSE4_a,
29880 P_PROC_SSE4_a,
29881 P_SSE4_1,
29882 P_SSE4_2,
29883 P_PROC_SSE4_2,
29884 P_POPCNT,
29885 P_AVX,
29886 P_AVX2,
29887 P_FMA,
29888 P_PROC_FMA
29889 };
29890
29891 enum feature_priority priority = P_ZERO;
29892
29893 /* These are the target attribute strings for which a dispatcher is
29894 available, from fold_builtin_cpu. */
29895
29896 static struct _feature_list
29897 {
29898 const char *const name;
29899 const enum feature_priority priority;
29900 }
29901 const feature_list[] =
29902 {
29903 {"mmx", P_MMX},
29904 {"sse", P_SSE},
29905 {"sse2", P_SSE2},
29906 {"sse3", P_SSE3},
29907 {"ssse3", P_SSSE3},
29908 {"sse4.1", P_SSE4_1},
29909 {"sse4.2", P_SSE4_2},
29910 {"popcnt", P_POPCNT},
29911 {"avx", P_AVX},
29912 {"avx2", P_AVX2}
29913 };
29914
29915
29916 static unsigned int NUM_FEATURES
29917 = sizeof (feature_list) / sizeof (struct _feature_list);
29918
29919 unsigned int i;
29920
29921 tree predicate_chain = NULL_TREE;
29922 tree predicate_decl, predicate_arg;
29923
29924 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29925 gcc_assert (attrs != NULL);
29926
29927 attrs = TREE_VALUE (TREE_VALUE (attrs));
29928
29929 gcc_assert (TREE_CODE (attrs) == STRING_CST);
29930 attrs_str = TREE_STRING_POINTER (attrs);
29931
29932 /* Return priority zero for default function. */
29933 if (strcmp (attrs_str, "default") == 0)
29934 return 0;
29935
29936 /* Handle arch= if specified. For priority, set it to be 1 more than
29937 the best instruction set the processor can handle. For instance, if
29938 there is a version for atom and a version for ssse3 (the highest ISA
29939 priority for atom), the atom version must be checked for dispatch
29940 before the ssse3 version. */
29941 if (strstr (attrs_str, "arch=") != NULL)
29942 {
29943 cl_target_option_save (&cur_target, &global_options);
29944 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
29945 &global_options_set);
29946
29947 gcc_assert (target_node);
29948 new_target = TREE_TARGET_OPTION (target_node);
29949 gcc_assert (new_target);
29950
29951 if (new_target->arch_specified && new_target->arch > 0)
29952 {
29953 switch (new_target->arch)
29954 {
29955 case PROCESSOR_CORE2:
29956 arg_str = "core2";
29957 priority = P_PROC_SSSE3;
29958 break;
29959 case PROCESSOR_COREI7:
29960 arg_str = "corei7";
29961 priority = P_PROC_SSE4_2;
29962 break;
29963 case PROCESSOR_COREI7_AVX:
29964 arg_str = "corei7-avx";
29965 priority = P_PROC_SSE4_2;
29966 break;
29967 case PROCESSOR_ATOM:
29968 arg_str = "atom";
29969 priority = P_PROC_SSSE3;
29970 break;
29971 case PROCESSOR_AMDFAM10:
29972 arg_str = "amdfam10h";
29973 priority = P_PROC_SSE4_a;
29974 break;
29975 case PROCESSOR_BDVER1:
29976 arg_str = "bdver1";
29977 priority = P_PROC_FMA;
29978 break;
29979 case PROCESSOR_BDVER2:
29980 arg_str = "bdver2";
29981 priority = P_PROC_FMA;
29982 break;
29983 }
29984 }
29985
29986 cl_target_option_restore (&global_options, &cur_target);
29987
29988 if (predicate_list && arg_str == NULL)
29989 {
29990 error_at (DECL_SOURCE_LOCATION (decl),
29991 "No dispatcher found for the versioning attributes");
29992 return 0;
29993 }
29994
29995 if (predicate_list)
29996 {
29997 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
29998 /* For a C string literal the length includes the trailing NULL. */
29999 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
30000 predicate_chain = tree_cons (predicate_decl, predicate_arg,
30001 predicate_chain);
30002 }
30003 }
30004
30005 /* Process feature name. */
30006 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
30007 strcpy (tok_str, attrs_str);
30008 token = strtok (tok_str, ",");
30009 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
30010
30011 while (token != NULL)
30012 {
30013 /* Do not process "arch=" */
30014 if (strncmp (token, "arch=", 5) == 0)
30015 {
30016 token = strtok (NULL, ",");
30017 continue;
30018 }
30019 for (i = 0; i < NUM_FEATURES; ++i)
30020 {
30021 if (strcmp (token, feature_list[i].name) == 0)
30022 {
30023 if (predicate_list)
30024 {
30025 predicate_arg = build_string_literal (
30026 strlen (feature_list[i].name) + 1,
30027 feature_list[i].name);
30028 predicate_chain = tree_cons (predicate_decl, predicate_arg,
30029 predicate_chain);
30030 }
30031 /* Find the maximum priority feature. */
30032 if (feature_list[i].priority > priority)
30033 priority = feature_list[i].priority;
30034
30035 break;
30036 }
30037 }
30038 if (predicate_list && i == NUM_FEATURES)
30039 {
30040 error_at (DECL_SOURCE_LOCATION (decl),
30041 "No dispatcher found for %s", token);
30042 return 0;
30043 }
30044 token = strtok (NULL, ",");
30045 }
30046 free (tok_str);
30047
30048 if (predicate_list && predicate_chain == NULL_TREE)
30049 {
30050 error_at (DECL_SOURCE_LOCATION (decl),
30051 "No dispatcher found for the versioning attributes : %s",
30052 attrs_str);
30053 return 0;
30054 }
30055 else if (predicate_list)
30056 {
30057 predicate_chain = nreverse (predicate_chain);
30058 *predicate_list = predicate_chain;
30059 }
30060
30061 return priority;
30062 }
30063
30064 /* This compares the priority of target features in function DECL1
30065 and DECL2. It returns positive value if DECL1 is higher priority,
30066 negative value if DECL2 is higher priority and 0 if they are the
30067 same. */
30068
30069 static int
30070 ix86_compare_version_priority (tree decl1, tree decl2)
30071 {
30072 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
30073 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
30074
30075 return (int)priority1 - (int)priority2;
30076 }
30077
30078 /* V1 and V2 point to function versions with different priorities
30079 based on the target ISA. This function compares their priorities. */
30080
30081 static int
30082 feature_compare (const void *v1, const void *v2)
30083 {
30084 typedef struct _function_version_info
30085 {
30086 tree version_decl;
30087 tree predicate_chain;
30088 unsigned int dispatch_priority;
30089 } function_version_info;
30090
30091 const function_version_info c1 = *(const function_version_info *)v1;
30092 const function_version_info c2 = *(const function_version_info *)v2;
30093 return (c2.dispatch_priority - c1.dispatch_priority);
30094 }
30095
30096 /* This function generates the dispatch function for
30097 multi-versioned functions. DISPATCH_DECL is the function which will
30098 contain the dispatch logic. FNDECLS are the function choices for
30099 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
30100 in DISPATCH_DECL in which the dispatch code is generated. */
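/* For illustration only: this machinery implements function
   multi-versioning, which the C++ front end exposes through the target
   attribute.  A sketch of user code that would reach this dispatcher
   generation (names are hypothetical):

     __attribute__ ((target ("default")))
     int foo (void) { return 0; }

     __attribute__ ((target ("sse4.2")))
     int foo (void) { return 1; }

     __attribute__ ((target ("arch=corei7")))
     int foo (void) { return 2; }

     int use_foo (void) { return foo (); }

   The resolver body built below tests __builtin_cpu_is and
   __builtin_cpu_supports in descending priority order and returns the
   address of the most specific version that matches the running CPU.  */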
30101
30102 static int
30103 dispatch_function_versions (tree dispatch_decl,
30104 void *fndecls_p,
30105 basic_block *empty_bb)
30106 {
30107 tree default_decl;
30108 gimple ifunc_cpu_init_stmt;
30109 gimple_seq gseq;
30110 int ix;
30111 tree ele;
30112 vec<tree> *fndecls;
30113 unsigned int num_versions = 0;
30114 unsigned int actual_versions = 0;
30115 unsigned int i;
30116
30117 struct _function_version_info
30118 {
30119 tree version_decl;
30120 tree predicate_chain;
30121 unsigned int dispatch_priority;
30122 }*function_version_info;
30123
30124 gcc_assert (dispatch_decl != NULL
30125 && fndecls_p != NULL
30126 && empty_bb != NULL);
30127
30128 /* fndecls_p is actually a vector. */
30129 fndecls = static_cast<vec<tree> *> (fndecls_p);
30130
30131 /* At least one more version other than the default. */
30132 num_versions = fndecls->length ();
30133 gcc_assert (num_versions >= 2);
30134
30135 function_version_info = (struct _function_version_info *)
30136 XNEWVEC (struct _function_version_info, (num_versions - 1));
30137
30138 /* The first version in the vector is the default decl. */
30139 default_decl = (*fndecls)[0];
30140
30141 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
30142
30143 gseq = bb_seq (*empty_bb);
30144 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
30145 constructors, so explicitly call __builtin_cpu_init here. */
30146 ifunc_cpu_init_stmt = gimple_build_call_vec (
30147 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
30148 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
30149 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
30150 set_bb_seq (*empty_bb, gseq);
30151
30152 pop_cfun ();
30153
30154
30155 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
30156 {
30157 tree version_decl = ele;
30158 tree predicate_chain = NULL_TREE;
30159 unsigned int priority;
30160 /* Get attribute string, parse it and find the right predicate decl.
30161 The predicate function could be a lengthy combination of many
30162 features, like arch-type and various isa-variants. */
30163 priority = get_builtin_code_for_version (version_decl,
30164 &predicate_chain);
30165
30166 if (predicate_chain == NULL_TREE)
30167 continue;
30168
30169 function_version_info [actual_versions].version_decl = version_decl;
30170 function_version_info [actual_versions].predicate_chain
30171 = predicate_chain;
30172 function_version_info [actual_versions].dispatch_priority = priority;
30173 actual_versions++;
30174 }
30175
30176 /* Sort the versions according to descending order of dispatch priority. The
30177 priority is based on the ISA. This is not a perfect solution. There
30178 could still be ambiguity. If more than one function version is suitable
30179 to execute, which one should be dispatched? In future, allow the user
30180 to specify a dispatch priority next to the version. */
30181 qsort (function_version_info, actual_versions,
30182 sizeof (struct _function_version_info), feature_compare);
30183
30184 for (i = 0; i < actual_versions; ++i)
30185 *empty_bb = add_condition_to_bb (dispatch_decl,
30186 function_version_info[i].version_decl,
30187 function_version_info[i].predicate_chain,
30188 *empty_bb);
30189
30190 /* Dispatch the default version at the end. */
30191 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
30192 NULL, *empty_bb);
30193
30194 free (function_version_info);
30195 return 0;
30196 }
30197
30198 /* Comparator function to be used in qsort routine to sort attribute
30199 specification strings to "target". */
30200
30201 static int
30202 attr_strcmp (const void *v1, const void *v2)
30203 {
30204 const char *c1 = *(char *const*)v1;
30205 const char *c2 = *(char *const*)v2;
30206 return strcmp (c1, c2);
30207 }
30208
30209 /* ARGLIST is the argument to target attribute. This function tokenizes
30210 the comma separated arguments, sorts them and returns a string which
30211 is a unique identifier for the comma separated arguments. It also
30212 replaces non-identifier characters "=,-" with "_". */
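/* For example (illustrative only): for the attribute
   target ("sse4.2,arch=core2") the argument tokens are "sse4.2" and
   "arch=core2"; after '=' and '-' are rewritten to '_' and the tokens
   are sorted, the returned identifier is "arch_core2_sse4.2".  */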
30213
30214 static char *
30215 sorted_attr_string (tree arglist)
30216 {
30217 tree arg;
30218 size_t str_len_sum = 0;
30219 char **args = NULL;
30220 char *attr_str, *ret_str;
30221 char *attr = NULL;
30222 unsigned int argnum = 1;
30223 unsigned int i;
30224
30225 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
30226 {
30227 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
30228 size_t len = strlen (str);
30229 str_len_sum += len + 1;
30230 if (arg != arglist)
30231 argnum++;
30232 for (i = 0; i < strlen (str); i++)
30233 if (str[i] == ',')
30234 argnum++;
30235 }
30236
30237 attr_str = XNEWVEC (char, str_len_sum);
30238 str_len_sum = 0;
30239 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
30240 {
30241 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
30242 size_t len = strlen (str);
30243 memcpy (attr_str + str_len_sum, str, len);
30244 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
30245 str_len_sum += len + 1;
30246 }
30247
30248 /* Replace "=,-" with "_". */
30249 for (i = 0; i < strlen (attr_str); i++)
30250 if (attr_str[i] == '=' || attr_str[i]== '-')
30251 attr_str[i] = '_';
30252
30253 if (argnum == 1)
30254 return attr_str;
30255
30256 args = XNEWVEC (char *, argnum);
30257
30258 i = 0;
30259 attr = strtok (attr_str, ",");
30260 while (attr != NULL)
30261 {
30262 args[i] = attr;
30263 i++;
30264 attr = strtok (NULL, ",");
30265 }
30266
30267 qsort (args, argnum, sizeof (char *), attr_strcmp);
30268
30269 ret_str = XNEWVEC (char, str_len_sum);
30270 str_len_sum = 0;
30271 for (i = 0; i < argnum; i++)
30272 {
30273 size_t len = strlen (args[i]);
30274 memcpy (ret_str + str_len_sum, args[i], len);
30275 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
30276 str_len_sum += len + 1;
30277 }
30278
30279 XDELETEVEC (args);
30280 XDELETEVEC (attr_str);
30281 return ret_str;
30282 }
30283
30284 /* This function changes the assembler name for functions that are
30285 versions. If DECL is a function version and has a "target"
30286 attribute, it appends the attribute string to its assembler name. */
30287
30288 static tree
30289 ix86_mangle_function_version_assembler_name (tree decl, tree id)
30290 {
30291 tree version_attr;
30292 const char *orig_name, *version_string;
30293 char *attr_str, *assembler_name;
30294
30295 if (DECL_DECLARED_INLINE_P (decl)
30296 && lookup_attribute ("gnu_inline",
30297 DECL_ATTRIBUTES (decl)))
30298 error_at (DECL_SOURCE_LOCATION (decl),
30299 "Function versions cannot be marked as gnu_inline,"
30300 " bodies have to be generated");
30301
30302 if (DECL_VIRTUAL_P (decl)
30303 || DECL_VINDEX (decl))
30304 sorry ("Virtual function multiversioning not supported");
30305
30306 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
30307
30308 /* target attribute string cannot be NULL. */
30309 gcc_assert (version_attr != NULL_TREE);
30310
30311 orig_name = IDENTIFIER_POINTER (id);
30312 version_string
30313 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
30314
30315 if (strcmp (version_string, "default") == 0)
30316 return id;
30317
30318 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
30319 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
30320
30321 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
30322
30323 /* Allow assembler name to be modified if already set. */
30324 if (DECL_ASSEMBLER_NAME_SET_P (decl))
30325 SET_DECL_RTL (decl, NULL);
30326
30327 tree ret = get_identifier (assembler_name);
30328 XDELETEVEC (attr_str);
30329 XDELETEVEC (assembler_name);
30330 return ret;
30331 }
30332
30333 /* This function returns true if FN1 and FN2 are versions of the same function,
30334 that is, the target strings of the function decls are different. This assumes
30335 that FN1 and FN2 have the same signature. */
30336
30337 static bool
30338 ix86_function_versions (tree fn1, tree fn2)
30339 {
30340 tree attr1, attr2;
30341 char *target1, *target2;
30342 bool result;
30343
30344 if (TREE_CODE (fn1) != FUNCTION_DECL
30345 || TREE_CODE (fn2) != FUNCTION_DECL)
30346 return false;
30347
30348 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
30349 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
30350
30351 /* At least one function decl should have the target attribute specified. */
30352 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
30353 return false;
30354
30355 /* Diagnose missing target attribute if one of the decls is already
30356 multi-versioned. */
30357 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
30358 {
30359 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
30360 {
30361 if (attr2 != NULL_TREE)
30362 {
30363 tree tem = fn1;
30364 fn1 = fn2;
30365 fn2 = tem;
30366 attr1 = attr2;
30367 }
30368 error_at (DECL_SOURCE_LOCATION (fn2),
30369 "missing %<target%> attribute for multi-versioned %D",
30370 fn2);
30371 inform (DECL_SOURCE_LOCATION (fn1),
30372 "previous declaration of %D", fn1);
30373 /* Prevent diagnosing of the same error multiple times. */
30374 DECL_ATTRIBUTES (fn2)
30375 = tree_cons (get_identifier ("target"),
30376 copy_node (TREE_VALUE (attr1)),
30377 DECL_ATTRIBUTES (fn2));
30378 }
30379 return false;
30380 }
30381
30382 target1 = sorted_attr_string (TREE_VALUE (attr1));
30383 target2 = sorted_attr_string (TREE_VALUE (attr2));
30384
30385 /* The sorted target strings must be different for fn1 and fn2
30386 to be versions. */
30387 if (strcmp (target1, target2) == 0)
30388 result = false;
30389 else
30390 result = true;
30391
30392 XDELETEVEC (target1);
30393 XDELETEVEC (target2);
30394
30395 return result;
30396 }
30397
30398 static tree
30399 ix86_mangle_decl_assembler_name (tree decl, tree id)
30400 {
30401 /* For function version, add the target suffix to the assembler name. */
30402 if (TREE_CODE (decl) == FUNCTION_DECL
30403 && DECL_FUNCTION_VERSIONED (decl))
30404 id = ix86_mangle_function_version_assembler_name (decl, id);
30405 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
30406 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
30407 #endif
30408
30409 return id;
30410 }
30411
30412 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
30413 is true, append the full path name of the source file. */
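/* For example (illustrative only): with SUFFIX "resolver" and a public
   DECL whose assembler name is "foo", the result is "foo.resolver";
   when MAKE_UNIQUE is true, the file-scope string produced by
   get_file_function_name is inserted between the two, giving
   "foo.<unique>.resolver".  */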
30414
30415 static char *
30416 make_name (tree decl, const char *suffix, bool make_unique)
30417 {
30418 char *global_var_name;
30419 int name_len;
30420 const char *name;
30421 const char *unique_name = NULL;
30422
30423 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
30424
30425 /* Get a unique name that can be used globally without any chances
30426 of collision at link time. */
30427 if (make_unique)
30428 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
30429
30430 name_len = strlen (name) + strlen (suffix) + 2;
30431
30432 if (make_unique)
30433 name_len += strlen (unique_name) + 1;
30434 global_var_name = XNEWVEC (char, name_len);
30435
30436 /* Use '.' to concatenate names as it is demangler friendly. */
30437 if (make_unique)
30438 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
30439 suffix);
30440 else
30441 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
30442
30443 return global_var_name;
30444 }
30445
30446 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
30447
30448 /* Make a dispatcher declaration for the multi-versioned function DECL.
30449 Calls to DECL function will be replaced with calls to the dispatcher
30450 by the front-end. Return the decl created. */
30451
30452 static tree
30453 make_dispatcher_decl (const tree decl)
30454 {
30455 tree func_decl;
30456 char *func_name;
30457 tree fn_type, func_type;
30458 bool is_uniq = false;
30459
30460 if (TREE_PUBLIC (decl) == 0)
30461 is_uniq = true;
30462
30463 func_name = make_name (decl, "ifunc", is_uniq);
30464
30465 fn_type = TREE_TYPE (decl);
30466 func_type = build_function_type (TREE_TYPE (fn_type),
30467 TYPE_ARG_TYPES (fn_type));
30468
30469 func_decl = build_fn_decl (func_name, func_type);
30470 XDELETEVEC (func_name);
30471 TREE_USED (func_decl) = 1;
30472 DECL_CONTEXT (func_decl) = NULL_TREE;
30473 DECL_INITIAL (func_decl) = error_mark_node;
30474 DECL_ARTIFICIAL (func_decl) = 1;
30475 /* Mark this func as external; the resolver will flip it again if
30476 it gets generated. */
30477 DECL_EXTERNAL (func_decl) = 1;
30478 /* IFUNCs have to be externally visible. */
30479 TREE_PUBLIC (func_decl) = 1;
30480
30481 return func_decl;
30482 }
30483
30484 #endif
30485
30486 /* Returns true if DECL is multi-versioned and is the default function,
30487 that is, it is not tagged with a target-specific optimization. */
30488
30489 static bool
30490 is_function_default_version (const tree decl)
30491 {
30492 if (TREE_CODE (decl) != FUNCTION_DECL
30493 || !DECL_FUNCTION_VERSIONED (decl))
30494 return false;
30495 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
30496 gcc_assert (attr);
30497 attr = TREE_VALUE (TREE_VALUE (attr));
30498 return (TREE_CODE (attr) == STRING_CST
30499 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
30500 }
30501
30502 /* Make a dispatcher declaration for the multi-versioned function DECL.
30503 Calls to DECL function will be replaced with calls to the dispatcher
30504 by the front-end. Returns the decl of the dispatcher function. */
30505
30506 static tree
30507 ix86_get_function_versions_dispatcher (void *decl)
30508 {
30509 tree fn = (tree) decl;
30510 struct cgraph_node *node = NULL;
30511 struct cgraph_node *default_node = NULL;
30512 struct cgraph_function_version_info *node_v = NULL;
30513 struct cgraph_function_version_info *first_v = NULL;
30514
30515 tree dispatch_decl = NULL;
30516
30517 struct cgraph_function_version_info *default_version_info = NULL;
30518
30519 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
30520
30521 node = cgraph_get_node (fn);
30522 gcc_assert (node != NULL);
30523
30524 node_v = get_cgraph_node_version (node);
30525 gcc_assert (node_v != NULL);
30526
30527 if (node_v->dispatcher_resolver != NULL)
30528 return node_v->dispatcher_resolver;
30529
30530 /* Find the default version and make it the first node. */
30531 first_v = node_v;
30532 /* Go to the beginning of the chain. */
30533 while (first_v->prev != NULL)
30534 first_v = first_v->prev;
30535 default_version_info = first_v;
30536 while (default_version_info != NULL)
30537 {
30538 if (is_function_default_version
30539 (default_version_info->this_node->decl))
30540 break;
30541 default_version_info = default_version_info->next;
30542 }
30543
30544 /* If there is no default node, just return NULL. */
30545 if (default_version_info == NULL)
30546 return NULL;
30547
30548 /* Make default info the first node. */
30549 if (first_v != default_version_info)
30550 {
30551 default_version_info->prev->next = default_version_info->next;
30552 if (default_version_info->next)
30553 default_version_info->next->prev = default_version_info->prev;
30554 first_v->prev = default_version_info;
30555 default_version_info->next = first_v;
30556 default_version_info->prev = NULL;
30557 }
30558
30559 default_node = default_version_info->this_node;
30560
30561 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
30562 if (targetm.has_ifunc_p ())
30563 {
30564 struct cgraph_function_version_info *it_v = NULL;
30565 struct cgraph_node *dispatcher_node = NULL;
30566 struct cgraph_function_version_info *dispatcher_version_info = NULL;
30567
30568 /* Right now, the dispatching is done via ifunc. */
30569 dispatch_decl = make_dispatcher_decl (default_node->decl);
30570
30571 dispatcher_node = cgraph_get_create_node (dispatch_decl);
30572 gcc_assert (dispatcher_node != NULL);
30573 dispatcher_node->dispatcher_function = 1;
30574 dispatcher_version_info
30575 = insert_new_cgraph_node_version (dispatcher_node);
30576 dispatcher_version_info->next = default_version_info;
30577 dispatcher_node->definition = 1;
30578
30579 /* Set the dispatcher for all the versions. */
30580 it_v = default_version_info;
30581 while (it_v != NULL)
30582 {
30583 it_v->dispatcher_resolver = dispatch_decl;
30584 it_v = it_v->next;
30585 }
30586 }
30587 else
30588 #endif
30589 {
30590 error_at (DECL_SOURCE_LOCATION (default_node->decl),
30591 "multiversioning needs ifunc which is not supported "
30592 "on this target");
30593 }
30594
30595 return dispatch_decl;
30596 }
30597
30598 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
30599 it to CHAIN. */
30600
30601 static tree
30602 make_attribute (const char *name, const char *arg_name, tree chain)
30603 {
30604 tree attr_name;
30605 tree attr_arg_name;
30606 tree attr_args;
30607 tree attr;
30608
30609 attr_name = get_identifier (name);
30610 attr_arg_name = build_string (strlen (arg_name), arg_name);
30611 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
30612 attr = tree_cons (attr_name, attr_args, chain);
30613 return attr;
30614 }
30615
30616 /* Make the resolver function decl to dispatch the versions of
30617 a multi-versioned function, DEFAULT_DECL. Create an
30618 empty basic block in the resolver and store the pointer in
30619 EMPTY_BB. Return the decl of the resolver function. */
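/* For illustration only: the resolver plus the "ifunc" attribute added
   below correspond roughly to the hand-written GNU indirect-function
   pattern (names are hypothetical):

     void *foo_resolver (void);
     int foo (void) __attribute__ ((ifunc ("foo_resolver")));

   The dynamic loader calls foo_resolver once and binds foo to the
   address it returns.  */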
30620
30621 static tree
30622 make_resolver_func (const tree default_decl,
30623 const tree dispatch_decl,
30624 basic_block *empty_bb)
30625 {
30626 char *resolver_name;
30627 tree decl, type, decl_name, t;
30628 bool is_uniq = false;
30629
30630 /* IFUNCs have to be globally visible. So, if the default_decl is
30631 not, then the name of the IFUNC should be made unique. */
30632 if (TREE_PUBLIC (default_decl) == 0)
30633 is_uniq = true;
30634
30635 /* Append the filename to the resolver function if the versions are
30636 not externally visible. This is because the resolver function has
30637 to be externally visible for the loader to find it. So, appending
30638 the filename will prevent conflicts with a resolver function from
30639 another module which is based on the same version name. */
30640 resolver_name = make_name (default_decl, "resolver", is_uniq);
30641
30642 /* The resolver function should return a (void *). */
30643 type = build_function_type_list (ptr_type_node, NULL_TREE);
30644
30645 decl = build_fn_decl (resolver_name, type);
30646 decl_name = get_identifier (resolver_name);
30647 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
30648
30649 DECL_NAME (decl) = decl_name;
30650 TREE_USED (decl) = 1;
30651 DECL_ARTIFICIAL (decl) = 1;
30652 DECL_IGNORED_P (decl) = 0;
30653 /* IFUNC resolvers have to be externally visible. */
30654 TREE_PUBLIC (decl) = 1;
30655 DECL_UNINLINABLE (decl) = 1;
30656
30657 /* Resolver is not external, body is generated. */
30658 DECL_EXTERNAL (decl) = 0;
30659 DECL_EXTERNAL (dispatch_decl) = 0;
30660
30661 DECL_CONTEXT (decl) = NULL_TREE;
30662 DECL_INITIAL (decl) = make_node (BLOCK);
30663 DECL_STATIC_CONSTRUCTOR (decl) = 0;
30664
30665 if (DECL_COMDAT_GROUP (default_decl)
30666 || TREE_PUBLIC (default_decl))
30667 {
30668 /* In this case, each translation unit with a call to this
30669 versioned function will put out a resolver. Ensure it
30670 is comdat to keep just one copy. */
30671 DECL_COMDAT (decl) = 1;
30672 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
30673 }
30674 /* Build result decl and add to function_decl. */
30675 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
30676 DECL_ARTIFICIAL (t) = 1;
30677 DECL_IGNORED_P (t) = 1;
30678 DECL_RESULT (decl) = t;
30679
30680 gimplify_function_tree (decl);
30681 push_cfun (DECL_STRUCT_FUNCTION (decl));
30682 *empty_bb = init_lowered_empty_function (decl, false);
30683
30684 cgraph_add_new_function (decl, true);
30685 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
30686
30687 pop_cfun ();
30688
30689 gcc_assert (dispatch_decl != NULL);
30690 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
30691 DECL_ATTRIBUTES (dispatch_decl)
30692 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
30693
30694 /* Create the alias for dispatch to resolver here. */
30695 /*cgraph_create_function_alias (dispatch_decl, decl);*/
30696 cgraph_same_body_alias (NULL, dispatch_decl, decl);
30697 XDELETEVEC (resolver_name);
30698 return decl;
30699 }
30700
30701 /* Generate the dispatching code body to dispatch multi-versioned function
30702 DECL. The target hook is called to process the "target" attributes and
30703 provide the code to dispatch the right function at run-time. NODE points
30704 to the dispatcher decl whose body will be created. */
30705
30706 static tree
30707 ix86_generate_version_dispatcher_body (void *node_p)
30708 {
30709 tree resolver_decl;
30710 basic_block empty_bb;
30711 vec<tree> fn_ver_vec = vNULL;
30712 tree default_ver_decl;
30713 struct cgraph_node *versn;
30714 struct cgraph_node *node;
30715
30716 struct cgraph_function_version_info *node_version_info = NULL;
30717 struct cgraph_function_version_info *versn_info = NULL;
30718
30719 node = (cgraph_node *)node_p;
30720
30721 node_version_info = get_cgraph_node_version (node);
30722 gcc_assert (node->dispatcher_function
30723 && node_version_info != NULL);
30724
30725 if (node_version_info->dispatcher_resolver)
30726 return node_version_info->dispatcher_resolver;
30727
30728 /* The first version in the chain corresponds to the default version. */
30729 default_ver_decl = node_version_info->next->this_node->decl;
30730
30731 /* node is going to be an alias, so remove the finalized bit. */
30732 node->definition = false;
30733
30734 resolver_decl = make_resolver_func (default_ver_decl,
30735 node->decl, &empty_bb);
30736
30737 node_version_info->dispatcher_resolver = resolver_decl;
30738
30739 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
30740
30741 fn_ver_vec.create (2);
30742
30743 for (versn_info = node_version_info->next; versn_info;
30744 versn_info = versn_info->next)
30745 {
30746 versn = versn_info->this_node;
30747 /* Check for virtual functions here again, as by this time it should
30748 have been determined if this function needs a vtable index or
30749 not. This happens for methods in derived classes that override
30750 virtual methods in base classes but are not explicitly marked as
30751 virtual. */
30752 if (DECL_VINDEX (versn->decl))
30753 sorry ("Virtual function multiversioning not supported");
30754
30755 fn_ver_vec.safe_push (versn->decl);
30756 }
30757
30758 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
30759 fn_ver_vec.release ();
30760 rebuild_cgraph_edges ();
30761 pop_cfun ();
30762 return resolver_decl;
30763 }
30764 /* This builds the processor_model struct type defined in
30765 libgcc/config/i386/cpuinfo.c */
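/* For reference, the layout built below mirrors the definition in
   libgcc/config/i386/cpuinfo.c:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */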
30766
30767 static tree
30768 build_processor_model_struct (void)
30769 {
30770 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
30771 "__cpu_features"};
30772 tree field = NULL_TREE, field_chain = NULL_TREE;
30773 int i;
30774 tree type = make_node (RECORD_TYPE);
30775
30776 /* The first 3 fields are unsigned int. */
30777 for (i = 0; i < 3; ++i)
30778 {
30779 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30780 get_identifier (field_name[i]), unsigned_type_node);
30781 if (field_chain != NULL_TREE)
30782 DECL_CHAIN (field) = field_chain;
30783 field_chain = field;
30784 }
30785
30786 /* The last field is an array of unsigned integers of size one. */
30787 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30788 get_identifier (field_name[3]),
30789 build_array_type (unsigned_type_node,
30790 build_index_type (size_one_node)));
30791 if (field_chain != NULL_TREE)
30792 DECL_CHAIN (field) = field_chain;
30793 field_chain = field;
30794
30795 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
30796 return type;
30797 }
30798
30799 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
30800
30801 static tree
30802 make_var_decl (tree type, const char *name)
30803 {
30804 tree new_decl;
30805
30806 new_decl = build_decl (UNKNOWN_LOCATION,
30807 VAR_DECL,
30808 get_identifier(name),
30809 type);
30810
30811 DECL_EXTERNAL (new_decl) = 1;
30812 TREE_STATIC (new_decl) = 1;
30813 TREE_PUBLIC (new_decl) = 1;
30814 DECL_INITIAL (new_decl) = 0;
30815 DECL_ARTIFICIAL (new_decl) = 0;
30816 DECL_PRESERVE_P (new_decl) = 1;
30817
30818 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
30819 assemble_variable (new_decl, 0, 0, 0);
30820
30821 return new_decl;
30822 }
30823
30824 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is
30825 folded into an integer check against the __cpu_model data defined in libgcc/config/i386/cpuinfo.c. */
30826
30827 static tree
30828 fold_builtin_cpu (tree fndecl, tree *args)
30829 {
30830 unsigned int i;
30831 enum ix86_builtins fn_code = (enum ix86_builtins)
30832 DECL_FUNCTION_CODE (fndecl);
30833 tree param_string_cst = NULL;
30834
30835 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
30836 enum processor_features
30837 {
30838 F_CMOV = 0,
30839 F_MMX,
30840 F_POPCNT,
30841 F_SSE,
30842 F_SSE2,
30843 F_SSE3,
30844 F_SSSE3,
30845 F_SSE4_1,
30846 F_SSE4_2,
30847 F_AVX,
30848 F_AVX2,
30849 F_MAX
30850 };
30851
30852 /* These are the values for vendor types and CPU types and subtypes
30853 in cpuinfo.c. CPU types and subtypes must have the corresponding start
30854 value subtracted from them; see the worked example after the enum below. */
30855 enum processor_model
30856 {
30857 M_INTEL = 1,
30858 M_AMD,
30859 M_CPU_TYPE_START,
30860 M_INTEL_ATOM,
30861 M_INTEL_CORE2,
30862 M_INTEL_COREI7,
30863 M_AMDFAM10H,
30864 M_AMDFAM15H,
30865 M_INTEL_SLM,
30866 M_CPU_SUBTYPE_START,
30867 M_INTEL_COREI7_NEHALEM,
30868 M_INTEL_COREI7_WESTMERE,
30869 M_INTEL_COREI7_SANDYBRIDGE,
30870 M_AMDFAM10H_BARCELONA,
30871 M_AMDFAM10H_SHANGHAI,
30872 M_AMDFAM10H_ISTANBUL,
30873 M_AMDFAM15H_BDVER1,
30874 M_AMDFAM15H_BDVER2,
30875 M_AMDFAM15H_BDVER3,
30876 M_AMDFAM15H_BDVER4
30877 };
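/* Worked example (an illustrative sketch): __builtin_cpu_is ("corei7")
   picks field_val = M_INTEL_COREI7, which lies between M_CPU_TYPE_START
   and M_CPU_SUBTYPE_START, so it is checked against the __cpu_type field
   and folds to

     __cpu_model.__cpu_type == (M_INTEL_COREI7 - M_CPU_TYPE_START)

   Likewise __builtin_cpu_supports ("avx") folds to

     __cpu_model.__cpu_features[0] & (1 << F_AVX)  */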
30878
30879 static struct _arch_names_table
30880 {
30881 const char *const name;
30882 const enum processor_model model;
30883 }
30884 const arch_names_table[] =
30885 {
30886 {"amd", M_AMD},
30887 {"intel", M_INTEL},
30888 {"atom", M_INTEL_ATOM},
30889 {"slm", M_INTEL_SLM},
30890 {"core2", M_INTEL_CORE2},
30891 {"corei7", M_INTEL_COREI7},
30892 {"nehalem", M_INTEL_COREI7_NEHALEM},
30893 {"westmere", M_INTEL_COREI7_WESTMERE},
30894 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
30895 {"amdfam10h", M_AMDFAM10H},
30896 {"barcelona", M_AMDFAM10H_BARCELONA},
30897 {"shanghai", M_AMDFAM10H_SHANGHAI},
30898 {"istanbul", M_AMDFAM10H_ISTANBUL},
30899 {"amdfam15h", M_AMDFAM15H},
30900 {"bdver1", M_AMDFAM15H_BDVER1},
30901 {"bdver2", M_AMDFAM15H_BDVER2},
30902 {"bdver3", M_AMDFAM15H_BDVER3},
30903 {"bdver4", M_AMDFAM15H_BDVER4},
30904 };
30905
30906 static struct _isa_names_table
30907 {
30908 const char *const name;
30909 const enum processor_features feature;
30910 }
30911 const isa_names_table[] =
30912 {
30913 {"cmov", F_CMOV},
30914 {"mmx", F_MMX},
30915 {"popcnt", F_POPCNT},
30916 {"sse", F_SSE},
30917 {"sse2", F_SSE2},
30918 {"sse3", F_SSE3},
30919 {"ssse3", F_SSSE3},
30920 {"sse4.1", F_SSE4_1},
30921 {"sse4.2", F_SSE4_2},
30922 {"avx", F_AVX},
30923 {"avx2", F_AVX2}
30924 };
30925
30926 tree __processor_model_type = build_processor_model_struct ();
30927 tree __cpu_model_var = make_var_decl (__processor_model_type,
30928 "__cpu_model");
30929
30930
30931 varpool_add_new_variable (__cpu_model_var);
30932
30933 gcc_assert ((args != NULL) && (*args != NULL));
30934
30935 param_string_cst = *args;
30936 while (param_string_cst
30937 && TREE_CODE (param_string_cst) != STRING_CST)
30938 {
30939 /* *args must be an expr that can contain other EXPRs leading to a
30940 STRING_CST. */
30941 if (!EXPR_P (param_string_cst))
30942 {
30943 error ("Parameter to builtin must be a string constant or literal");
30944 return integer_zero_node;
30945 }
30946 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
30947 }
30948
30949 gcc_assert (param_string_cst);
30950
30951 if (fn_code == IX86_BUILTIN_CPU_IS)
30952 {
30953 tree ref;
30954 tree field;
30955 tree final;
30956
30957 unsigned int field_val = 0;
30958 unsigned int NUM_ARCH_NAMES
30959 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
30960
30961 for (i = 0; i < NUM_ARCH_NAMES; i++)
30962 if (strcmp (arch_names_table[i].name,
30963 TREE_STRING_POINTER (param_string_cst)) == 0)
30964 break;
30965
30966 if (i == NUM_ARCH_NAMES)
30967 {
30968 error ("Parameter to builtin not valid: %s",
30969 TREE_STRING_POINTER (param_string_cst));
30970 return integer_zero_node;
30971 }
30972
30973 field = TYPE_FIELDS (__processor_model_type);
30974 field_val = arch_names_table[i].model;
30975
30976 /* CPU types are stored in the next field. */
30977 if (field_val > M_CPU_TYPE_START
30978 && field_val < M_CPU_SUBTYPE_START)
30979 {
30980 field = DECL_CHAIN (field);
30981 field_val -= M_CPU_TYPE_START;
30982 }
30983
30984 /* CPU subtypes are stored in the next field. */
30985 if (field_val > M_CPU_SUBTYPE_START)
30986 {
30987 field = DECL_CHAIN (DECL_CHAIN (field));
30988 field_val -= M_CPU_SUBTYPE_START;
30989 }
30990
30991 /* Get the appropriate field in __cpu_model. */
30992 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30993 field, NULL_TREE);
30994
30995 /* Check the value. */
30996 final = build2 (EQ_EXPR, unsigned_type_node, ref,
30997 build_int_cstu (unsigned_type_node, field_val));
30998 return build1 (CONVERT_EXPR, integer_type_node, final);
30999 }
31000 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
31001 {
31002 tree ref;
31003 tree array_elt;
31004 tree field;
31005 tree final;
31006
31007 unsigned int field_val = 0;
31008 unsigned int NUM_ISA_NAMES
31009 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
31010
31011 for (i = 0; i < NUM_ISA_NAMES; i++)
31012 if (strcmp (isa_names_table[i].name,
31013 TREE_STRING_POINTER (param_string_cst)) == 0)
31014 break;
31015
31016 if (i == NUM_ISA_NAMES)
31017 {
31018 error ("Parameter to builtin not valid: %s",
31019 TREE_STRING_POINTER (param_string_cst));
31020 return integer_zero_node;
31021 }
31022
31023 field = TYPE_FIELDS (__processor_model_type);
31024 /* Get the last field, which is __cpu_features. */
31025 while (DECL_CHAIN (field))
31026 field = DECL_CHAIN (field);
31027
31028 /* Get the appropriate field: __cpu_model.__cpu_features */
31029 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
31030 field, NULL_TREE);
31031
31032 /* Access the 0th element of __cpu_features array. */
31033 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
31034 integer_zero_node, NULL_TREE, NULL_TREE);
31035
31036 field_val = (1 << isa_names_table[i].feature);
31037 /* Return __cpu_model.__cpu_features[0] & field_val */
31038 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
31039 build_int_cstu (unsigned_type_node, field_val));
31040 return build1 (CONVERT_EXPR, integer_type_node, final);
31041 }
31042 gcc_unreachable ();
31043 }
31044
31045 static tree
31046 ix86_fold_builtin (tree fndecl, int n_args,
31047 tree *args, bool ignore ATTRIBUTE_UNUSED)
31048 {
31049 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
31050 {
31051 enum ix86_builtins fn_code = (enum ix86_builtins)
31052 DECL_FUNCTION_CODE (fndecl);
31053 if (fn_code == IX86_BUILTIN_CPU_IS
31054 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
31055 {
31056 gcc_assert (n_args == 1);
31057 return fold_builtin_cpu (fndecl, args);
31058 }
31059 }
31060
31061 #ifdef SUBTARGET_FOLD_BUILTIN
31062 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
31063 #endif
31064
31065 return NULL_TREE;
31066 }
31067
31068 /* Make builtins to detect cpu type and features supported. NAME is
31069 the builtin name, CODE is the builtin code, and FTYPE is the function
31070 type of the builtin. */
31071
31072 static void
31073 make_cpu_type_builtin (const char* name, int code,
31074 enum ix86_builtin_func_type ftype, bool is_const)
31075 {
31076 tree decl;
31077 tree type;
31078
31079 type = ix86_get_builtin_func_type (ftype);
31080 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
31081 NULL, NULL_TREE);
31082 gcc_assert (decl != NULL_TREE);
31083 ix86_builtins[(int) code] = decl;
31084 TREE_READONLY (decl) = is_const;
31085 }
31086
31087 /* Make builtins to get CPU type and features supported. The created
31088 builtins are:
31089
31090 __builtin_cpu_init (), to detect cpu type and features,
31091 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
31092 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
31093 */
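/* A minimal usage sketch in user code (use_avx2, use_corei7 and
   use_generic are hypothetical functions):

     if (__builtin_cpu_supports ("avx2"))
       use_avx2 ();
     else if (__builtin_cpu_is ("corei7"))
       use_corei7 ();
     else
       use_generic ();

   The accepted strings are the entries of arch_names_table and
   isa_names_table in fold_builtin_cpu above.  */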
31094
31095 static void
31096 ix86_init_platform_type_builtins (void)
31097 {
31098 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
31099 INT_FTYPE_VOID, false);
31100 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
31101 INT_FTYPE_PCCHAR, true);
31102 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
31103 INT_FTYPE_PCCHAR, true);
31104 }
31105
31106 /* Internal method for ix86_init_builtins. */
31107
31108 static void
31109 ix86_init_builtins_va_builtins_abi (void)
31110 {
31111 tree ms_va_ref, sysv_va_ref;
31112 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
31113 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
31114 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
31115 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
31116
31117 if (!TARGET_64BIT)
31118 return;
31119 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
31120 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
31121 ms_va_ref = build_reference_type (ms_va_list_type_node);
31122 sysv_va_ref =
31123 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
31124
31125 fnvoid_va_end_ms =
31126 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
31127 fnvoid_va_start_ms =
31128 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
31129 fnvoid_va_end_sysv =
31130 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
31131 fnvoid_va_start_sysv =
31132 build_varargs_function_type_list (void_type_node, sysv_va_ref,
31133 NULL_TREE);
31134 fnvoid_va_copy_ms =
31135 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
31136 NULL_TREE);
31137 fnvoid_va_copy_sysv =
31138 build_function_type_list (void_type_node, sysv_va_ref,
31139 sysv_va_ref, NULL_TREE);
31140
31141 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
31142 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
31143 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
31144 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
31145 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
31146 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
31147 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
31148 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
31149 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
31150 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
31151 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
31152 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
31153 }
31154
31155 static void
31156 ix86_init_builtin_types (void)
31157 {
31158 tree float128_type_node, float80_type_node;
31159
31160 /* The __float80 type. */
31161 float80_type_node = long_double_type_node;
31162 if (TYPE_MODE (float80_type_node) != XFmode)
31163 {
31164 /* The __float80 type. */
31165 float80_type_node = make_node (REAL_TYPE);
31166
31167 TYPE_PRECISION (float80_type_node) = 80;
31168 layout_type (float80_type_node);
31169 }
31170 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
31171
31172 /* The __float128 type. */
31173 float128_type_node = make_node (REAL_TYPE);
31174 TYPE_PRECISION (float128_type_node) = 128;
31175 layout_type (float128_type_node);
31176 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
31177
31178 /* This macro is built by i386-builtin-types.awk. */
31179 DEFINE_BUILTIN_PRIMITIVE_TYPES;
31180 }
31181
31182 static void
31183 ix86_init_builtins (void)
31184 {
31185 tree t;
31186
31187 ix86_init_builtin_types ();
31188
31189 /* Builtins to get CPU type and features. */
31190 ix86_init_platform_type_builtins ();
31191
31192 /* TFmode support builtins. */
31193 def_builtin_const (0, "__builtin_infq",
31194 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
31195 def_builtin_const (0, "__builtin_huge_valq",
31196 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
31197
31198 /* We will expand them to a normal call if SSE isn't available, since
31199 they are used by libgcc. */
31200 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
31201 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
31202 BUILT_IN_MD, "__fabstf2", NULL_TREE);
31203 TREE_READONLY (t) = 1;
31204 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
31205
31206 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
31207 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
31208 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
31209 TREE_READONLY (t) = 1;
31210 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
31211
31212 ix86_init_tm_builtins ();
31213 ix86_init_mmx_sse_builtins ();
31214
31215 if (TARGET_LP64)
31216 ix86_init_builtins_va_builtins_abi ();
31217
31218 #ifdef SUBTARGET_INIT_BUILTINS
31219 SUBTARGET_INIT_BUILTINS;
31220 #endif
31221 }
31222
31223 /* Return the ix86 builtin for CODE. */
31224
31225 static tree
31226 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
31227 {
31228 if (code >= IX86_BUILTIN_MAX)
31229 return error_mark_node;
31230
31231 return ix86_builtins[code];
31232 }
31233
31234 /* Errors in the source file can cause expand_expr to return const0_rtx
31235 where we expect a vector. To avoid crashing, use one of the vector
31236 clear instructions. */
31237 static rtx
31238 safe_vector_operand (rtx x, enum machine_mode mode)
31239 {
31240 if (x == const0_rtx)
31241 x = CONST0_RTX (mode);
31242 return x;
31243 }
31244
31245 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
31246
31247 static rtx
31248 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
31249 {
31250 rtx pat;
31251 tree arg0 = CALL_EXPR_ARG (exp, 0);
31252 tree arg1 = CALL_EXPR_ARG (exp, 1);
31253 rtx op0 = expand_normal (arg0);
31254 rtx op1 = expand_normal (arg1);
31255 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31256 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
31257 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
31258
31259 if (VECTOR_MODE_P (mode0))
31260 op0 = safe_vector_operand (op0, mode0);
31261 if (VECTOR_MODE_P (mode1))
31262 op1 = safe_vector_operand (op1, mode1);
31263
31264 if (optimize || !target
31265 || GET_MODE (target) != tmode
31266 || !insn_data[icode].operand[0].predicate (target, tmode))
31267 target = gen_reg_rtx (tmode);
31268
31269 if (GET_MODE (op1) == SImode && mode1 == TImode)
31270 {
31271 rtx x = gen_reg_rtx (V4SImode);
31272 emit_insn (gen_sse2_loadd (x, op1));
31273 op1 = gen_lowpart (TImode, x);
31274 }
31275
31276 if (!insn_data[icode].operand[1].predicate (op0, mode0))
31277 op0 = copy_to_mode_reg (mode0, op0);
31278 if (!insn_data[icode].operand[2].predicate (op1, mode1))
31279 op1 = copy_to_mode_reg (mode1, op1);
31280
31281 pat = GEN_FCN (icode) (target, op0, op1);
31282 if (! pat)
31283 return 0;
31284
31285 emit_insn (pat);
31286
31287 return target;
31288 }
31289
31290 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
31291
31292 static rtx
31293 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
31294 enum ix86_builtin_func_type m_type,
31295 enum rtx_code sub_code)
31296 {
31297 rtx pat;
31298 int i;
31299 int nargs;
31300 bool comparison_p = false;
31301 bool tf_p = false;
31302 bool last_arg_constant = false;
31303 int num_memory = 0;
31304 struct {
31305 rtx op;
31306 enum machine_mode mode;
31307 } args[4];
31308
31309 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31310
31311 switch (m_type)
31312 {
31313 case MULTI_ARG_4_DF2_DI_I:
31314 case MULTI_ARG_4_DF2_DI_I1:
31315 case MULTI_ARG_4_SF2_SI_I:
31316 case MULTI_ARG_4_SF2_SI_I1:
31317 nargs = 4;
31318 last_arg_constant = true;
31319 break;
31320
31321 case MULTI_ARG_3_SF:
31322 case MULTI_ARG_3_DF:
31323 case MULTI_ARG_3_SF2:
31324 case MULTI_ARG_3_DF2:
31325 case MULTI_ARG_3_DI:
31326 case MULTI_ARG_3_SI:
31327 case MULTI_ARG_3_SI_DI:
31328 case MULTI_ARG_3_HI:
31329 case MULTI_ARG_3_HI_SI:
31330 case MULTI_ARG_3_QI:
31331 case MULTI_ARG_3_DI2:
31332 case MULTI_ARG_3_SI2:
31333 case MULTI_ARG_3_HI2:
31334 case MULTI_ARG_3_QI2:
31335 nargs = 3;
31336 break;
31337
31338 case MULTI_ARG_2_SF:
31339 case MULTI_ARG_2_DF:
31340 case MULTI_ARG_2_DI:
31341 case MULTI_ARG_2_SI:
31342 case MULTI_ARG_2_HI:
31343 case MULTI_ARG_2_QI:
31344 nargs = 2;
31345 break;
31346
31347 case MULTI_ARG_2_DI_IMM:
31348 case MULTI_ARG_2_SI_IMM:
31349 case MULTI_ARG_2_HI_IMM:
31350 case MULTI_ARG_2_QI_IMM:
31351 nargs = 2;
31352 last_arg_constant = true;
31353 break;
31354
31355 case MULTI_ARG_1_SF:
31356 case MULTI_ARG_1_DF:
31357 case MULTI_ARG_1_SF2:
31358 case MULTI_ARG_1_DF2:
31359 case MULTI_ARG_1_DI:
31360 case MULTI_ARG_1_SI:
31361 case MULTI_ARG_1_HI:
31362 case MULTI_ARG_1_QI:
31363 case MULTI_ARG_1_SI_DI:
31364 case MULTI_ARG_1_HI_DI:
31365 case MULTI_ARG_1_HI_SI:
31366 case MULTI_ARG_1_QI_DI:
31367 case MULTI_ARG_1_QI_SI:
31368 case MULTI_ARG_1_QI_HI:
31369 nargs = 1;
31370 break;
31371
31372 case MULTI_ARG_2_DI_CMP:
31373 case MULTI_ARG_2_SI_CMP:
31374 case MULTI_ARG_2_HI_CMP:
31375 case MULTI_ARG_2_QI_CMP:
31376 nargs = 2;
31377 comparison_p = true;
31378 break;
31379
31380 case MULTI_ARG_2_SF_TF:
31381 case MULTI_ARG_2_DF_TF:
31382 case MULTI_ARG_2_DI_TF:
31383 case MULTI_ARG_2_SI_TF:
31384 case MULTI_ARG_2_HI_TF:
31385 case MULTI_ARG_2_QI_TF:
31386 nargs = 2;
31387 tf_p = true;
31388 break;
31389
31390 default:
31391 gcc_unreachable ();
31392 }
31393
31394 if (optimize || !target
31395 || GET_MODE (target) != tmode
31396 || !insn_data[icode].operand[0].predicate (target, tmode))
31397 target = gen_reg_rtx (tmode);
31398
31399 gcc_assert (nargs <= 4);
31400
31401 for (i = 0; i < nargs; i++)
31402 {
31403 tree arg = CALL_EXPR_ARG (exp, i);
31404 rtx op = expand_normal (arg);
31405 int adjust = (comparison_p) ? 1 : 0;
31406 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
31407
31408 if (last_arg_constant && i == nargs - 1)
31409 {
31410 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
31411 {
31412 enum insn_code new_icode = icode;
31413 switch (icode)
31414 {
31415 case CODE_FOR_xop_vpermil2v2df3:
31416 case CODE_FOR_xop_vpermil2v4sf3:
31417 case CODE_FOR_xop_vpermil2v4df3:
31418 case CODE_FOR_xop_vpermil2v8sf3:
31419 error ("the last argument must be a 2-bit immediate");
31420 return gen_reg_rtx (tmode);
31421 case CODE_FOR_xop_rotlv2di3:
31422 new_icode = CODE_FOR_rotlv2di3;
31423 goto xop_rotl;
31424 case CODE_FOR_xop_rotlv4si3:
31425 new_icode = CODE_FOR_rotlv4si3;
31426 goto xop_rotl;
31427 case CODE_FOR_xop_rotlv8hi3:
31428 new_icode = CODE_FOR_rotlv8hi3;
31429 goto xop_rotl;
31430 case CODE_FOR_xop_rotlv16qi3:
31431 new_icode = CODE_FOR_rotlv16qi3;
31432 xop_rotl:
31433 if (CONST_INT_P (op))
31434 {
31435 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
31436 op = GEN_INT (INTVAL (op) & mask);
31437 gcc_checking_assert
31438 (insn_data[icode].operand[i + 1].predicate (op, mode));
31439 }
31440 else
31441 {
31442 gcc_checking_assert
31443 (nargs == 2
31444 && insn_data[new_icode].operand[0].mode == tmode
31445 && insn_data[new_icode].operand[1].mode == tmode
31446 && insn_data[new_icode].operand[2].mode == mode
31447 && insn_data[new_icode].operand[0].predicate
31448 == insn_data[icode].operand[0].predicate
31449 && insn_data[new_icode].operand[1].predicate
31450 == insn_data[icode].operand[1].predicate);
31451 icode = new_icode;
31452 goto non_constant;
31453 }
31454 break;
31455 default:
31456 gcc_unreachable ();
31457 }
31458 }
31459 }
31460 else
31461 {
31462 non_constant:
31463 if (VECTOR_MODE_P (mode))
31464 op = safe_vector_operand (op, mode);
31465
31466 /* If we aren't optimizing, only allow one memory operand to be
31467 generated. */
31468 if (memory_operand (op, mode))
31469 num_memory++;
31470
31471 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
31472
31473 if (optimize
31474 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
31475 || num_memory > 1)
31476 op = force_reg (mode, op);
31477 }
31478
31479 args[i].op = op;
31480 args[i].mode = mode;
31481 }
31482
31483 switch (nargs)
31484 {
31485 case 1:
31486 pat = GEN_FCN (icode) (target, args[0].op);
31487 break;
31488
31489 case 2:
31490 if (tf_p)
31491 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
31492 GEN_INT ((int)sub_code));
31493 else if (! comparison_p)
31494 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31495 else
31496 {
31497 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
31498 args[0].op,
31499 args[1].op);
31500
31501 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
31502 }
31503 break;
31504
31505 case 3:
31506 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31507 break;
31508
31509 case 4:
31510 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
31511 break;
31512
31513 default:
31514 gcc_unreachable ();
31515 }
31516
31517 if (! pat)
31518 return 0;
31519
31520 emit_insn (pat);
31521 return target;
31522 }
31523
31524 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
31525 insns with vec_merge. */
31526
31527 static rtx
31528 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
31529 rtx target)
31530 {
31531 rtx pat;
31532 tree arg0 = CALL_EXPR_ARG (exp, 0);
31533 rtx op1, op0 = expand_normal (arg0);
31534 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31535 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
31536
31537 if (optimize || !target
31538 || GET_MODE (target) != tmode
31539 || !insn_data[icode].operand[0].predicate (target, tmode))
31540 target = gen_reg_rtx (tmode);
31541
31542 if (VECTOR_MODE_P (mode0))
31543 op0 = safe_vector_operand (op0, mode0);
31544
31545 if ((optimize && !register_operand (op0, mode0))
31546 || !insn_data[icode].operand[1].predicate (op0, mode0))
31547 op0 = copy_to_mode_reg (mode0, op0);
31548
31549 op1 = op0;
31550 if (!insn_data[icode].operand[2].predicate (op1, mode0))
31551 op1 = copy_to_mode_reg (mode0, op1);
31552
31553 pat = GEN_FCN (icode) (target, op0, op1);
31554 if (! pat)
31555 return 0;
31556 emit_insn (pat);
31557 return target;
31558 }
31559
31560 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
31561
31562 static rtx
31563 ix86_expand_sse_compare (const struct builtin_description *d,
31564 tree exp, rtx target, bool swap)
31565 {
31566 rtx pat;
31567 tree arg0 = CALL_EXPR_ARG (exp, 0);
31568 tree arg1 = CALL_EXPR_ARG (exp, 1);
31569 rtx op0 = expand_normal (arg0);
31570 rtx op1 = expand_normal (arg1);
31571 rtx op2;
31572 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31573 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31574 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31575 enum rtx_code comparison = d->comparison;
31576
31577 if (VECTOR_MODE_P (mode0))
31578 op0 = safe_vector_operand (op0, mode0);
31579 if (VECTOR_MODE_P (mode1))
31580 op1 = safe_vector_operand (op1, mode1);
31581
31582 /* Swap operands if we have a comparison that isn't available in
31583 hardware. */
31584 if (swap)
31585 {
31586 rtx tmp = gen_reg_rtx (mode1);
31587 emit_move_insn (tmp, op1);
31588 op1 = op0;
31589 op0 = tmp;
31590 }
31591
31592 if (optimize || !target
31593 || GET_MODE (target) != tmode
31594 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31595 target = gen_reg_rtx (tmode);
31596
31597 if ((optimize && !register_operand (op0, mode0))
31598 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
31599 op0 = copy_to_mode_reg (mode0, op0);
31600 if ((optimize && !register_operand (op1, mode1))
31601 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
31602 op1 = copy_to_mode_reg (mode1, op1);
31603
31604 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
31605 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31606 if (! pat)
31607 return 0;
31608 emit_insn (pat);
31609 return target;
31610 }
31611
31612 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
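/* An illustrative sketch: for a comi builtin such as __builtin_ia32_comieq
   (used by _mm_comieq_ss), this emits the comiss comparison of the two
   operands and then extracts the appropriate flag condition into the low
   byte of a zeroed SImode register, which is what gets returned.  */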
31613
31614 static rtx
31615 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
31616 rtx target)
31617 {
31618 rtx pat;
31619 tree arg0 = CALL_EXPR_ARG (exp, 0);
31620 tree arg1 = CALL_EXPR_ARG (exp, 1);
31621 rtx op0 = expand_normal (arg0);
31622 rtx op1 = expand_normal (arg1);
31623 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31624 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31625 enum rtx_code comparison = d->comparison;
31626
31627 if (VECTOR_MODE_P (mode0))
31628 op0 = safe_vector_operand (op0, mode0);
31629 if (VECTOR_MODE_P (mode1))
31630 op1 = safe_vector_operand (op1, mode1);
31631
31632 /* Swap operands if we have a comparison that isn't available in
31633 hardware. */
31634 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
31635 {
31636 rtx tmp = op1;
31637 op1 = op0;
31638 op0 = tmp;
31639 }
31640
31641 target = gen_reg_rtx (SImode);
31642 emit_move_insn (target, const0_rtx);
31643 target = gen_rtx_SUBREG (QImode, target, 0);
31644
31645 if ((optimize && !register_operand (op0, mode0))
31646 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31647 op0 = copy_to_mode_reg (mode0, op0);
31648 if ((optimize && !register_operand (op1, mode1))
31649 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31650 op1 = copy_to_mode_reg (mode1, op1);
31651
31652 pat = GEN_FCN (d->icode) (op0, op1);
31653 if (! pat)
31654 return 0;
31655 emit_insn (pat);
31656 emit_insn (gen_rtx_SET (VOIDmode,
31657 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31658 gen_rtx_fmt_ee (comparison, QImode,
31659 SET_DEST (pat),
31660 const0_rtx)));
31661
31662 return SUBREG_REG (target);
31663 }
31664
31665 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
31666
31667 static rtx
31668 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
31669 rtx target)
31670 {
31671 rtx pat;
31672 tree arg0 = CALL_EXPR_ARG (exp, 0);
31673 rtx op1, op0 = expand_normal (arg0);
31674 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31675 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31676
31677 if (optimize || target == 0
31678 || GET_MODE (target) != tmode
31679 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31680 target = gen_reg_rtx (tmode);
31681
31682 if (VECTOR_MODE_P (mode0))
31683 op0 = safe_vector_operand (op0, mode0);
31684
31685 if ((optimize && !register_operand (op0, mode0))
31686 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31687 op0 = copy_to_mode_reg (mode0, op0);
31688
31689 op1 = GEN_INT (d->comparison);
31690
31691 pat = GEN_FCN (d->icode) (target, op0, op1);
31692 if (! pat)
31693 return 0;
31694 emit_insn (pat);
31695 return target;
31696 }
31697
31698 static rtx
31699 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
31700 tree exp, rtx target)
31701 {
31702 rtx pat;
31703 tree arg0 = CALL_EXPR_ARG (exp, 0);
31704 tree arg1 = CALL_EXPR_ARG (exp, 1);
31705 rtx op0 = expand_normal (arg0);
31706 rtx op1 = expand_normal (arg1);
31707 rtx op2;
31708 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31709 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31710 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31711
31712 if (optimize || target == 0
31713 || GET_MODE (target) != tmode
31714 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31715 target = gen_reg_rtx (tmode);
31716
31717 op0 = safe_vector_operand (op0, mode0);
31718 op1 = safe_vector_operand (op1, mode1);
31719
31720 if ((optimize && !register_operand (op0, mode0))
31721 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31722 op0 = copy_to_mode_reg (mode0, op0);
31723 if ((optimize && !register_operand (op1, mode1))
31724 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31725 op1 = copy_to_mode_reg (mode1, op1);
31726
31727 op2 = GEN_INT (d->comparison);
31728
31729 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31730 if (! pat)
31731 return 0;
31732 emit_insn (pat);
31733 return target;
31734 }
31735
31736 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
31737
31738 static rtx
31739 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
31740 rtx target)
31741 {
31742 rtx pat;
31743 tree arg0 = CALL_EXPR_ARG (exp, 0);
31744 tree arg1 = CALL_EXPR_ARG (exp, 1);
31745 rtx op0 = expand_normal (arg0);
31746 rtx op1 = expand_normal (arg1);
31747 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31748 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31749 enum rtx_code comparison = d->comparison;
31750
31751 if (VECTOR_MODE_P (mode0))
31752 op0 = safe_vector_operand (op0, mode0);
31753 if (VECTOR_MODE_P (mode1))
31754 op1 = safe_vector_operand (op1, mode1);
31755
31756 target = gen_reg_rtx (SImode);
31757 emit_move_insn (target, const0_rtx);
31758 target = gen_rtx_SUBREG (QImode, target, 0);
31759
31760 if ((optimize && !register_operand (op0, mode0))
31761 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31762 op0 = copy_to_mode_reg (mode0, op0);
31763 if ((optimize && !register_operand (op1, mode1))
31764 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31765 op1 = copy_to_mode_reg (mode1, op1);
31766
31767 pat = GEN_FCN (d->icode) (op0, op1);
31768 if (! pat)
31769 return 0;
31770 emit_insn (pat);
31771 emit_insn (gen_rtx_SET (VOIDmode,
31772 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31773 gen_rtx_fmt_ee (comparison, QImode,
31774 SET_DEST (pat),
31775 const0_rtx)));
31776
31777 return SUBREG_REG (target);
31778 }
31779
31780 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
31781
31782 static rtx
31783 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
31784 tree exp, rtx target)
31785 {
31786 rtx pat;
31787 tree arg0 = CALL_EXPR_ARG (exp, 0);
31788 tree arg1 = CALL_EXPR_ARG (exp, 1);
31789 tree arg2 = CALL_EXPR_ARG (exp, 2);
31790 tree arg3 = CALL_EXPR_ARG (exp, 3);
31791 tree arg4 = CALL_EXPR_ARG (exp, 4);
31792 rtx scratch0, scratch1;
31793 rtx op0 = expand_normal (arg0);
31794 rtx op1 = expand_normal (arg1);
31795 rtx op2 = expand_normal (arg2);
31796 rtx op3 = expand_normal (arg3);
31797 rtx op4 = expand_normal (arg4);
31798 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
31799
31800 tmode0 = insn_data[d->icode].operand[0].mode;
31801 tmode1 = insn_data[d->icode].operand[1].mode;
31802 modev2 = insn_data[d->icode].operand[2].mode;
31803 modei3 = insn_data[d->icode].operand[3].mode;
31804 modev4 = insn_data[d->icode].operand[4].mode;
31805 modei5 = insn_data[d->icode].operand[5].mode;
31806 modeimm = insn_data[d->icode].operand[6].mode;
31807
31808 if (VECTOR_MODE_P (modev2))
31809 op0 = safe_vector_operand (op0, modev2);
31810 if (VECTOR_MODE_P (modev4))
31811 op2 = safe_vector_operand (op2, modev4);
31812
31813 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31814 op0 = copy_to_mode_reg (modev2, op0);
31815 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
31816 op1 = copy_to_mode_reg (modei3, op1);
31817 if ((optimize && !register_operand (op2, modev4))
31818 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
31819 op2 = copy_to_mode_reg (modev4, op2);
31820 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
31821 op3 = copy_to_mode_reg (modei5, op3);
31822
31823 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
31824 {
31825 error ("the fifth argument must be an 8-bit immediate");
31826 return const0_rtx;
31827 }
31828
31829 if (d->code == IX86_BUILTIN_PCMPESTRI128)
31830 {
31831 if (optimize || !target
31832 || GET_MODE (target) != tmode0
31833 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31834 target = gen_reg_rtx (tmode0);
31835
31836 scratch1 = gen_reg_rtx (tmode1);
31837
31838 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
31839 }
31840 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
31841 {
31842 if (optimize || !target
31843 || GET_MODE (target) != tmode1
31844 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31845 target = gen_reg_rtx (tmode1);
31846
31847 scratch0 = gen_reg_rtx (tmode0);
31848
31849 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
31850 }
31851 else
31852 {
31853 gcc_assert (d->flag);
31854
31855 scratch0 = gen_reg_rtx (tmode0);
31856 scratch1 = gen_reg_rtx (tmode1);
31857
31858 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
31859 }
31860
31861 if (! pat)
31862 return 0;
31863
31864 emit_insn (pat);
31865
31866 if (d->flag)
31867 {
31868 target = gen_reg_rtx (SImode);
31869 emit_move_insn (target, const0_rtx);
31870 target = gen_rtx_SUBREG (QImode, target, 0);
31871
31872 emit_insn
31873 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31874 gen_rtx_fmt_ee (EQ, QImode,
31875 gen_rtx_REG ((enum machine_mode) d->flag,
31876 FLAGS_REG),
31877 const0_rtx)));
31878 return SUBREG_REG (target);
31879 }
31880 else
31881 return target;
31882 }
31883
31884
31885 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
31886
31887 static rtx
31888 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
31889 tree exp, rtx target)
31890 {
31891 rtx pat;
31892 tree arg0 = CALL_EXPR_ARG (exp, 0);
31893 tree arg1 = CALL_EXPR_ARG (exp, 1);
31894 tree arg2 = CALL_EXPR_ARG (exp, 2);
31895 rtx scratch0, scratch1;
31896 rtx op0 = expand_normal (arg0);
31897 rtx op1 = expand_normal (arg1);
31898 rtx op2 = expand_normal (arg2);
31899 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
31900
31901 tmode0 = insn_data[d->icode].operand[0].mode;
31902 tmode1 = insn_data[d->icode].operand[1].mode;
31903 modev2 = insn_data[d->icode].operand[2].mode;
31904 modev3 = insn_data[d->icode].operand[3].mode;
31905 modeimm = insn_data[d->icode].operand[4].mode;
31906
31907 if (VECTOR_MODE_P (modev2))
31908 op0 = safe_vector_operand (op0, modev2);
31909 if (VECTOR_MODE_P (modev3))
31910 op1 = safe_vector_operand (op1, modev3);
31911
31912 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31913 op0 = copy_to_mode_reg (modev2, op0);
31914 if ((optimize && !register_operand (op1, modev3))
31915 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
31916 op1 = copy_to_mode_reg (modev3, op1);
31917
31918 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
31919 {
31920 error ("the third argument must be an 8-bit immediate");
31921 return const0_rtx;
31922 }
31923
31924 if (d->code == IX86_BUILTIN_PCMPISTRI128)
31925 {
31926 if (optimize || !target
31927 || GET_MODE (target) != tmode0
31928 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31929 target = gen_reg_rtx (tmode0);
31930
31931 scratch1 = gen_reg_rtx (tmode1);
31932
31933 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
31934 }
31935 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
31936 {
31937 if (optimize || !target
31938 || GET_MODE (target) != tmode1
31939 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31940 target = gen_reg_rtx (tmode1);
31941
31942 scratch0 = gen_reg_rtx (tmode0);
31943
31944 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
31945 }
31946 else
31947 {
31948 gcc_assert (d->flag);
31949
31950 scratch0 = gen_reg_rtx (tmode0);
31951 scratch1 = gen_reg_rtx (tmode1);
31952
31953 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
31954 }
31955
31956 if (! pat)
31957 return 0;
31958
31959 emit_insn (pat);
31960
31961 if (d->flag)
31962 {
31963 target = gen_reg_rtx (SImode);
31964 emit_move_insn (target, const0_rtx);
31965 target = gen_rtx_SUBREG (QImode, target, 0);
31966
31967 emit_insn
31968 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31969 gen_rtx_fmt_ee (EQ, QImode,
31970 gen_rtx_REG ((enum machine_mode) d->flag,
31971 FLAGS_REG),
31972 const0_rtx)));
31973 return SUBREG_REG (target);
31974 }
31975 else
31976 return target;
31977 }
31978
31979 /* Subroutine of ix86_expand_builtin to take care of insns with
31980 variable number of operands. */
31981
31982 static rtx
31983 ix86_expand_args_builtin (const struct builtin_description *d,
31984 tree exp, rtx target)
31985 {
31986 rtx pat, real_target;
31987 unsigned int i, nargs;
31988 unsigned int nargs_constant = 0;
31989 int num_memory = 0;
31990 struct
31991 {
31992 rtx op;
31993 enum machine_mode mode;
31994 } args[4];
31995 bool last_arg_count = false;
31996 enum insn_code icode = d->icode;
31997 const struct insn_data_d *insn_p = &insn_data[icode];
31998 enum machine_mode tmode = insn_p->operand[0].mode;
31999 enum machine_mode rmode = VOIDmode;
32000 bool swap = false;
32001 enum rtx_code comparison = d->comparison;
32002
32003 switch ((enum ix86_builtin_func_type) d->flag)
32004 {
32005 case V2DF_FTYPE_V2DF_ROUND:
32006 case V4DF_FTYPE_V4DF_ROUND:
32007 case V4SF_FTYPE_V4SF_ROUND:
32008 case V8SF_FTYPE_V8SF_ROUND:
32009 case V4SI_FTYPE_V4SF_ROUND:
32010 case V8SI_FTYPE_V8SF_ROUND:
32011 return ix86_expand_sse_round (d, exp, target);
32012 case V4SI_FTYPE_V2DF_V2DF_ROUND:
32013 case V8SI_FTYPE_V4DF_V4DF_ROUND:
32014 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
32015 case INT_FTYPE_V8SF_V8SF_PTEST:
32016 case INT_FTYPE_V4DI_V4DI_PTEST:
32017 case INT_FTYPE_V4DF_V4DF_PTEST:
32018 case INT_FTYPE_V4SF_V4SF_PTEST:
32019 case INT_FTYPE_V2DI_V2DI_PTEST:
32020 case INT_FTYPE_V2DF_V2DF_PTEST:
32021 return ix86_expand_sse_ptest (d, exp, target);
32022 case FLOAT128_FTYPE_FLOAT128:
32023 case FLOAT_FTYPE_FLOAT:
32024 case INT_FTYPE_INT:
32025 case UINT64_FTYPE_INT:
32026 case UINT16_FTYPE_UINT16:
32027 case INT64_FTYPE_INT64:
32028 case INT64_FTYPE_V4SF:
32029 case INT64_FTYPE_V2DF:
32030 case INT_FTYPE_V16QI:
32031 case INT_FTYPE_V8QI:
32032 case INT_FTYPE_V8SF:
32033 case INT_FTYPE_V4DF:
32034 case INT_FTYPE_V4SF:
32035 case INT_FTYPE_V2DF:
32036 case INT_FTYPE_V32QI:
32037 case V16QI_FTYPE_V16QI:
32038 case V8SI_FTYPE_V8SF:
32039 case V8SI_FTYPE_V4SI:
32040 case V8HI_FTYPE_V8HI:
32041 case V8HI_FTYPE_V16QI:
32042 case V8QI_FTYPE_V8QI:
32043 case V8SF_FTYPE_V8SF:
32044 case V8SF_FTYPE_V8SI:
32045 case V8SF_FTYPE_V4SF:
32046 case V8SF_FTYPE_V8HI:
32047 case V4SI_FTYPE_V4SI:
32048 case V4SI_FTYPE_V16QI:
32049 case V4SI_FTYPE_V4SF:
32050 case V4SI_FTYPE_V8SI:
32051 case V4SI_FTYPE_V8HI:
32052 case V4SI_FTYPE_V4DF:
32053 case V4SI_FTYPE_V2DF:
32054 case V4HI_FTYPE_V4HI:
32055 case V4DF_FTYPE_V4DF:
32056 case V4DF_FTYPE_V4SI:
32057 case V4DF_FTYPE_V4SF:
32058 case V4DF_FTYPE_V2DF:
32059 case V4SF_FTYPE_V4SF:
32060 case V4SF_FTYPE_V4SI:
32061 case V4SF_FTYPE_V8SF:
32062 case V4SF_FTYPE_V4DF:
32063 case V4SF_FTYPE_V8HI:
32064 case V4SF_FTYPE_V2DF:
32065 case V2DI_FTYPE_V2DI:
32066 case V2DI_FTYPE_V16QI:
32067 case V2DI_FTYPE_V8HI:
32068 case V2DI_FTYPE_V4SI:
32069 case V2DF_FTYPE_V2DF:
32070 case V2DF_FTYPE_V4SI:
32071 case V2DF_FTYPE_V4DF:
32072 case V2DF_FTYPE_V4SF:
32073 case V2DF_FTYPE_V2SI:
32074 case V2SI_FTYPE_V2SI:
32075 case V2SI_FTYPE_V4SF:
32076 case V2SI_FTYPE_V2SF:
32077 case V2SI_FTYPE_V2DF:
32078 case V2SF_FTYPE_V2SF:
32079 case V2SF_FTYPE_V2SI:
32080 case V32QI_FTYPE_V32QI:
32081 case V32QI_FTYPE_V16QI:
32082 case V16HI_FTYPE_V16HI:
32083 case V16HI_FTYPE_V8HI:
32084 case V8SI_FTYPE_V8SI:
32085 case V16HI_FTYPE_V16QI:
32086 case V8SI_FTYPE_V16QI:
32087 case V4DI_FTYPE_V16QI:
32088 case V8SI_FTYPE_V8HI:
32089 case V4DI_FTYPE_V8HI:
32090 case V4DI_FTYPE_V4SI:
32091 case V4DI_FTYPE_V2DI:
32092 nargs = 1;
32093 break;
32094 case V4SF_FTYPE_V4SF_VEC_MERGE:
32095 case V2DF_FTYPE_V2DF_VEC_MERGE:
32096 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
32097 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
32098 case V16QI_FTYPE_V16QI_V16QI:
32099 case V16QI_FTYPE_V8HI_V8HI:
32100 case V8QI_FTYPE_V8QI_V8QI:
32101 case V8QI_FTYPE_V4HI_V4HI:
32102 case V8HI_FTYPE_V8HI_V8HI:
32103 case V8HI_FTYPE_V16QI_V16QI:
32104 case V8HI_FTYPE_V4SI_V4SI:
32105 case V8SF_FTYPE_V8SF_V8SF:
32106 case V8SF_FTYPE_V8SF_V8SI:
32107 case V4SI_FTYPE_V4SI_V4SI:
32108 case V4SI_FTYPE_V8HI_V8HI:
32109 case V4SI_FTYPE_V4SF_V4SF:
32110 case V4SI_FTYPE_V2DF_V2DF:
32111 case V4HI_FTYPE_V4HI_V4HI:
32112 case V4HI_FTYPE_V8QI_V8QI:
32113 case V4HI_FTYPE_V2SI_V2SI:
32114 case V4DF_FTYPE_V4DF_V4DF:
32115 case V4DF_FTYPE_V4DF_V4DI:
32116 case V4SF_FTYPE_V4SF_V4SF:
32117 case V4SF_FTYPE_V4SF_V4SI:
32118 case V4SF_FTYPE_V4SF_V2SI:
32119 case V4SF_FTYPE_V4SF_V2DF:
32120 case V4SF_FTYPE_V4SF_DI:
32121 case V4SF_FTYPE_V4SF_SI:
32122 case V2DI_FTYPE_V2DI_V2DI:
32123 case V2DI_FTYPE_V16QI_V16QI:
32124 case V2DI_FTYPE_V4SI_V4SI:
32125 case V2UDI_FTYPE_V4USI_V4USI:
32126 case V2DI_FTYPE_V2DI_V16QI:
32127 case V2DI_FTYPE_V2DF_V2DF:
32128 case V2SI_FTYPE_V2SI_V2SI:
32129 case V2SI_FTYPE_V4HI_V4HI:
32130 case V2SI_FTYPE_V2SF_V2SF:
32131 case V2DF_FTYPE_V2DF_V2DF:
32132 case V2DF_FTYPE_V2DF_V4SF:
32133 case V2DF_FTYPE_V2DF_V2DI:
32134 case V2DF_FTYPE_V2DF_DI:
32135 case V2DF_FTYPE_V2DF_SI:
32136 case V2SF_FTYPE_V2SF_V2SF:
32137 case V1DI_FTYPE_V1DI_V1DI:
32138 case V1DI_FTYPE_V8QI_V8QI:
32139 case V1DI_FTYPE_V2SI_V2SI:
32140 case V32QI_FTYPE_V16HI_V16HI:
32141 case V16HI_FTYPE_V8SI_V8SI:
32142 case V32QI_FTYPE_V32QI_V32QI:
32143 case V16HI_FTYPE_V32QI_V32QI:
32144 case V16HI_FTYPE_V16HI_V16HI:
32145 case V8SI_FTYPE_V4DF_V4DF:
32146 case V8SI_FTYPE_V8SI_V8SI:
32147 case V8SI_FTYPE_V16HI_V16HI:
32148 case V4DI_FTYPE_V4DI_V4DI:
32149 case V4DI_FTYPE_V8SI_V8SI:
32150 case V4UDI_FTYPE_V8USI_V8USI:
32151 if (comparison == UNKNOWN)
32152 return ix86_expand_binop_builtin (icode, exp, target);
32153 nargs = 2;
32154 break;
32155 case V4SF_FTYPE_V4SF_V4SF_SWAP:
32156 case V2DF_FTYPE_V2DF_V2DF_SWAP:
32157 gcc_assert (comparison != UNKNOWN);
32158 nargs = 2;
32159 swap = true;
32160 break;
32161 case V16HI_FTYPE_V16HI_V8HI_COUNT:
32162 case V16HI_FTYPE_V16HI_SI_COUNT:
32163 case V8SI_FTYPE_V8SI_V4SI_COUNT:
32164 case V8SI_FTYPE_V8SI_SI_COUNT:
32165 case V4DI_FTYPE_V4DI_V2DI_COUNT:
32166 case V4DI_FTYPE_V4DI_INT_COUNT:
32167 case V8HI_FTYPE_V8HI_V8HI_COUNT:
32168 case V8HI_FTYPE_V8HI_SI_COUNT:
32169 case V4SI_FTYPE_V4SI_V4SI_COUNT:
32170 case V4SI_FTYPE_V4SI_SI_COUNT:
32171 case V4HI_FTYPE_V4HI_V4HI_COUNT:
32172 case V4HI_FTYPE_V4HI_SI_COUNT:
32173 case V2DI_FTYPE_V2DI_V2DI_COUNT:
32174 case V2DI_FTYPE_V2DI_SI_COUNT:
32175 case V2SI_FTYPE_V2SI_V2SI_COUNT:
32176 case V2SI_FTYPE_V2SI_SI_COUNT:
32177 case V1DI_FTYPE_V1DI_V1DI_COUNT:
32178 case V1DI_FTYPE_V1DI_SI_COUNT:
32179 nargs = 2;
32180 last_arg_count = true;
32181 break;
32182 case UINT64_FTYPE_UINT64_UINT64:
32183 case UINT_FTYPE_UINT_UINT:
32184 case UINT_FTYPE_UINT_USHORT:
32185 case UINT_FTYPE_UINT_UCHAR:
32186 case UINT16_FTYPE_UINT16_INT:
32187 case UINT8_FTYPE_UINT8_INT:
32188 nargs = 2;
32189 break;
32190 case V2DI_FTYPE_V2DI_INT_CONVERT:
32191 nargs = 2;
32192 rmode = V1TImode;
32193 nargs_constant = 1;
32194 break;
32195 case V4DI_FTYPE_V4DI_INT_CONVERT:
32196 nargs = 2;
32197 rmode = V2TImode;
32198 nargs_constant = 1;
32199 break;
32200 case V8HI_FTYPE_V8HI_INT:
32201 case V8HI_FTYPE_V8SF_INT:
32202 case V8HI_FTYPE_V4SF_INT:
32203 case V8SF_FTYPE_V8SF_INT:
32204 case V4SI_FTYPE_V4SI_INT:
32205 case V4SI_FTYPE_V8SI_INT:
32206 case V4HI_FTYPE_V4HI_INT:
32207 case V4DF_FTYPE_V4DF_INT:
32208 case V4SF_FTYPE_V4SF_INT:
32209 case V4SF_FTYPE_V8SF_INT:
32210 case V2DI_FTYPE_V2DI_INT:
32211 case V2DF_FTYPE_V2DF_INT:
32212 case V2DF_FTYPE_V4DF_INT:
32213 case V16HI_FTYPE_V16HI_INT:
32214 case V8SI_FTYPE_V8SI_INT:
32215 case V4DI_FTYPE_V4DI_INT:
32216 case V2DI_FTYPE_V4DI_INT:
32217 nargs = 2;
32218 nargs_constant = 1;
32219 break;
32220 case V16QI_FTYPE_V16QI_V16QI_V16QI:
32221 case V8SF_FTYPE_V8SF_V8SF_V8SF:
32222 case V4DF_FTYPE_V4DF_V4DF_V4DF:
32223 case V4SF_FTYPE_V4SF_V4SF_V4SF:
32224 case V2DF_FTYPE_V2DF_V2DF_V2DF:
32225 case V32QI_FTYPE_V32QI_V32QI_V32QI:
32226 nargs = 3;
32227 break;
32228 case V32QI_FTYPE_V32QI_V32QI_INT:
32229 case V16HI_FTYPE_V16HI_V16HI_INT:
32230 case V16QI_FTYPE_V16QI_V16QI_INT:
32231 case V4DI_FTYPE_V4DI_V4DI_INT:
32232 case V8HI_FTYPE_V8HI_V8HI_INT:
32233 case V8SI_FTYPE_V8SI_V8SI_INT:
32234 case V8SI_FTYPE_V8SI_V4SI_INT:
32235 case V8SF_FTYPE_V8SF_V8SF_INT:
32236 case V8SF_FTYPE_V8SF_V4SF_INT:
32237 case V4SI_FTYPE_V4SI_V4SI_INT:
32238 case V4DF_FTYPE_V4DF_V4DF_INT:
32239 case V4DF_FTYPE_V4DF_V2DF_INT:
32240 case V4SF_FTYPE_V4SF_V4SF_INT:
32241 case V2DI_FTYPE_V2DI_V2DI_INT:
32242 case V4DI_FTYPE_V4DI_V2DI_INT:
32243 case V2DF_FTYPE_V2DF_V2DF_INT:
32244 nargs = 3;
32245 nargs_constant = 1;
32246 break;
32247 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
32248 nargs = 3;
32249 rmode = V4DImode;
32250 nargs_constant = 1;
32251 break;
32252 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
32253 nargs = 3;
32254 rmode = V2DImode;
32255 nargs_constant = 1;
32256 break;
32257 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
32258 nargs = 3;
32259 rmode = DImode;
32260 nargs_constant = 1;
32261 break;
32262 case V2DI_FTYPE_V2DI_UINT_UINT:
32263 nargs = 3;
32264 nargs_constant = 2;
32265 break;
32266 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
32267 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
32268 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
32269 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
32270 nargs = 4;
32271 nargs_constant = 1;
32272 break;
32273 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
32274 nargs = 4;
32275 nargs_constant = 2;
32276 break;
32277 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
32278 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
32279 nargs = 4;
32280 break;
32281 default:
32282 gcc_unreachable ();
32283 }
32284
32285 gcc_assert (nargs <= ARRAY_SIZE (args));
32286
32287 if (comparison != UNKNOWN)
32288 {
32289 gcc_assert (nargs == 2);
32290 return ix86_expand_sse_compare (d, exp, target, swap);
32291 }
32292
32293 if (rmode == VOIDmode || rmode == tmode)
32294 {
32295 if (optimize
32296 || target == 0
32297 || GET_MODE (target) != tmode
32298 || !insn_p->operand[0].predicate (target, tmode))
32299 target = gen_reg_rtx (tmode);
32300 real_target = target;
32301 }
32302 else
32303 {
32304 real_target = gen_reg_rtx (tmode);
32305 target = simplify_gen_subreg (rmode, real_target, tmode, 0);
32306 }
32307
32308 for (i = 0; i < nargs; i++)
32309 {
32310 tree arg = CALL_EXPR_ARG (exp, i);
32311 rtx op = expand_normal (arg);
32312 enum machine_mode mode = insn_p->operand[i + 1].mode;
32313 bool match = insn_p->operand[i + 1].predicate (op, mode);
32314
32315 if (last_arg_count && (i + 1) == nargs)
32316 {
32317 /* SIMD shift insns take either an 8-bit immediate or a register as
32318 the count, but the builtin functions take an int. If the count
32319 doesn't match the operand predicate, put it in a register. */
32320 if (!match)
32321 {
32322 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
32323 if (!insn_p->operand[i + 1].predicate (op, mode))
32324 op = copy_to_reg (op);
32325 }
32326 }
32327 else if ((nargs - i) <= nargs_constant)
32328 {
32329 if (!match)
32330 switch (icode)
32331 {
32332 case CODE_FOR_avx2_inserti128:
32333 case CODE_FOR_avx2_extracti128:
32334 error ("the last argument must be an 1-bit immediate");
32335 return const0_rtx;
32336
32337 case CODE_FOR_sse4_1_roundsd:
32338 case CODE_FOR_sse4_1_roundss:
32339
32340 case CODE_FOR_sse4_1_roundpd:
32341 case CODE_FOR_sse4_1_roundps:
32342 case CODE_FOR_avx_roundpd256:
32343 case CODE_FOR_avx_roundps256:
32344
32345 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
32346 case CODE_FOR_sse4_1_roundps_sfix:
32347 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
32348 case CODE_FOR_avx_roundps_sfix256:
32349
32350 case CODE_FOR_sse4_1_blendps:
32351 case CODE_FOR_avx_blendpd256:
32352 case CODE_FOR_avx_vpermilv4df:
32353 error ("the last argument must be a 4-bit immediate");
32354 return const0_rtx;
32355
32356 case CODE_FOR_sse4_1_blendpd:
32357 case CODE_FOR_avx_vpermilv2df:
32358 case CODE_FOR_xop_vpermil2v2df3:
32359 case CODE_FOR_xop_vpermil2v4sf3:
32360 case CODE_FOR_xop_vpermil2v4df3:
32361 case CODE_FOR_xop_vpermil2v8sf3:
32362 error ("the last argument must be a 2-bit immediate");
32363 return const0_rtx;
32364
32365 case CODE_FOR_avx_vextractf128v4df:
32366 case CODE_FOR_avx_vextractf128v8sf:
32367 case CODE_FOR_avx_vextractf128v8si:
32368 case CODE_FOR_avx_vinsertf128v4df:
32369 case CODE_FOR_avx_vinsertf128v8sf:
32370 case CODE_FOR_avx_vinsertf128v8si:
32371 error ("the last argument must be a 1-bit immediate");
32372 return const0_rtx;
32373
32374 case CODE_FOR_avx_vmcmpv2df3:
32375 case CODE_FOR_avx_vmcmpv4sf3:
32376 case CODE_FOR_avx_cmpv2df3:
32377 case CODE_FOR_avx_cmpv4sf3:
32378 case CODE_FOR_avx_cmpv4df3:
32379 case CODE_FOR_avx_cmpv8sf3:
32380 error ("the last argument must be a 5-bit immediate");
32381 return const0_rtx;
32382
32383 default:
32384 switch (nargs_constant)
32385 {
32386 case 2:
32387 if ((nargs - i) == nargs_constant)
32388 {
32389 error ("the next to last argument must be an 8-bit immediate");
32390 break;
32391 }
32392 case 1:
32393 error ("the last argument must be an 8-bit immediate");
32394 break;
32395 default:
32396 gcc_unreachable ();
32397 }
32398 return const0_rtx;
32399 }
32400 }
32401 else
32402 {
32403 if (VECTOR_MODE_P (mode))
32404 op = safe_vector_operand (op, mode);
32405
32406 /* If we aren't optimizing, only allow one memory operand to
32407 be generated. */
32408 if (memory_operand (op, mode))
32409 num_memory++;
32410
32411 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
32412 {
32413 if (optimize || !match || num_memory > 1)
32414 op = copy_to_mode_reg (mode, op);
32415 }
32416 else
32417 {
32418 op = copy_to_reg (op);
32419 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
32420 }
32421 }
32422
32423 args[i].op = op;
32424 args[i].mode = mode;
32425 }
32426
32427 switch (nargs)
32428 {
32429 case 1:
32430 pat = GEN_FCN (icode) (real_target, args[0].op);
32431 break;
32432 case 2:
32433 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
32434 break;
32435 case 3:
32436 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
32437 args[2].op);
32438 break;
32439 case 4:
32440 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
32441 args[2].op, args[3].op);
32442 break;
32443 default:
32444 gcc_unreachable ();
32445 }
32446
32447 if (! pat)
32448 return 0;
32449
32450 emit_insn (pat);
32451 return target;
32452 }
32453
32454 /* Subroutine of ix86_expand_builtin to take care of special insns
32455 with variable number of operands. */
32456
32457 static rtx
32458 ix86_expand_special_args_builtin (const struct builtin_description *d,
32459 tree exp, rtx target)
32460 {
32461 tree arg;
32462 rtx pat, op;
32463 unsigned int i, nargs, arg_adjust, memory;
32464 struct
32465 {
32466 rtx op;
32467 enum machine_mode mode;
32468 } args[3];
32469 enum insn_code icode = d->icode;
32470 bool last_arg_constant = false;
32471 const struct insn_data_d *insn_p = &insn_data[icode];
32472 enum machine_mode tmode = insn_p->operand[0].mode;
32473 enum { load, store } klass;
32474
32475 switch ((enum ix86_builtin_func_type) d->flag)
32476 {
32477 case VOID_FTYPE_VOID:
32478 emit_insn (GEN_FCN (icode) (target));
32479 return 0;
32480 case VOID_FTYPE_UINT64:
32481 case VOID_FTYPE_UNSIGNED:
32482 nargs = 0;
32483 klass = store;
32484 memory = 0;
32485 break;
32486
32487 case INT_FTYPE_VOID:
32488 case UINT64_FTYPE_VOID:
32489 case UNSIGNED_FTYPE_VOID:
32490 nargs = 0;
32491 klass = load;
32492 memory = 0;
32493 break;
32494 case UINT64_FTYPE_PUNSIGNED:
32495 case V2DI_FTYPE_PV2DI:
32496 case V4DI_FTYPE_PV4DI:
32497 case V32QI_FTYPE_PCCHAR:
32498 case V16QI_FTYPE_PCCHAR:
32499 case V8SF_FTYPE_PCV4SF:
32500 case V8SF_FTYPE_PCFLOAT:
32501 case V4SF_FTYPE_PCFLOAT:
32502 case V4DF_FTYPE_PCV2DF:
32503 case V4DF_FTYPE_PCDOUBLE:
32504 case V2DF_FTYPE_PCDOUBLE:
32505 case VOID_FTYPE_PVOID:
32506 nargs = 1;
32507 klass = load;
32508 memory = 0;
32509 break;
32510 case VOID_FTYPE_PV2SF_V4SF:
32511 case VOID_FTYPE_PV4DI_V4DI:
32512 case VOID_FTYPE_PV2DI_V2DI:
32513 case VOID_FTYPE_PCHAR_V32QI:
32514 case VOID_FTYPE_PCHAR_V16QI:
32515 case VOID_FTYPE_PFLOAT_V8SF:
32516 case VOID_FTYPE_PFLOAT_V4SF:
32517 case VOID_FTYPE_PDOUBLE_V4DF:
32518 case VOID_FTYPE_PDOUBLE_V2DF:
32519 case VOID_FTYPE_PLONGLONG_LONGLONG:
32520 case VOID_FTYPE_PULONGLONG_ULONGLONG:
32521 case VOID_FTYPE_PINT_INT:
32522 nargs = 1;
32523 klass = store;
32524 /* Reserve memory operand for target. */
32525 memory = ARRAY_SIZE (args);
32526 break;
32527 case V4SF_FTYPE_V4SF_PCV2SF:
32528 case V2DF_FTYPE_V2DF_PCDOUBLE:
32529 nargs = 2;
32530 klass = load;
32531 memory = 1;
32532 break;
32533 case V8SF_FTYPE_PCV8SF_V8SI:
32534 case V4DF_FTYPE_PCV4DF_V4DI:
32535 case V4SF_FTYPE_PCV4SF_V4SI:
32536 case V2DF_FTYPE_PCV2DF_V2DI:
32537 case V8SI_FTYPE_PCV8SI_V8SI:
32538 case V4DI_FTYPE_PCV4DI_V4DI:
32539 case V4SI_FTYPE_PCV4SI_V4SI:
32540 case V2DI_FTYPE_PCV2DI_V2DI:
32541 nargs = 2;
32542 klass = load;
32543 memory = 0;
32544 break;
32545 case VOID_FTYPE_PV8SF_V8SI_V8SF:
32546 case VOID_FTYPE_PV4DF_V4DI_V4DF:
32547 case VOID_FTYPE_PV4SF_V4SI_V4SF:
32548 case VOID_FTYPE_PV2DF_V2DI_V2DF:
32549 case VOID_FTYPE_PV8SI_V8SI_V8SI:
32550 case VOID_FTYPE_PV4DI_V4DI_V4DI:
32551 case VOID_FTYPE_PV4SI_V4SI_V4SI:
32552 case VOID_FTYPE_PV2DI_V2DI_V2DI:
32553 nargs = 2;
32554 klass = store;
32555 /* Reserve memory operand for target. */
32556 memory = ARRAY_SIZE (args);
32557 break;
32558 case VOID_FTYPE_UINT_UINT_UINT:
32559 case VOID_FTYPE_UINT64_UINT_UINT:
32560 case UCHAR_FTYPE_UINT_UINT_UINT:
32561 case UCHAR_FTYPE_UINT64_UINT_UINT:
32562 nargs = 3;
32563 klass = load;
32564 memory = ARRAY_SIZE (args);
32565 last_arg_constant = true;
32566 break;
32567 default:
32568 gcc_unreachable ();
32569 }
32570
32571 gcc_assert (nargs <= ARRAY_SIZE (args));
32572
32573 if (klass == store)
32574 {
32575 arg = CALL_EXPR_ARG (exp, 0);
32576 op = expand_normal (arg);
32577 gcc_assert (target == 0);
32578 if (memory)
32579 {
32580 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
32581 target = gen_rtx_MEM (tmode, op);
32582 }
32583 else
32584 target = force_reg (tmode, op);
32585 arg_adjust = 1;
32586 }
32587 else
32588 {
32589 arg_adjust = 0;
32590 if (optimize
32591 || target == 0
32592 || !register_operand (target, tmode)
32593 || GET_MODE (target) != tmode)
32594 target = gen_reg_rtx (tmode);
32595 }
32596
32597 for (i = 0; i < nargs; i++)
32598 {
32599 enum machine_mode mode = insn_p->operand[i + 1].mode;
32600 bool match;
32601
32602 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
32603 op = expand_normal (arg);
32604 match = insn_p->operand[i + 1].predicate (op, mode);
32605
32606 if (last_arg_constant && (i + 1) == nargs)
32607 {
32608 if (!match)
32609 {
32610 if (icode == CODE_FOR_lwp_lwpvalsi3
32611 || icode == CODE_FOR_lwp_lwpinssi3
32612 || icode == CODE_FOR_lwp_lwpvaldi3
32613 || icode == CODE_FOR_lwp_lwpinsdi3)
32614 error ("the last argument must be a 32-bit immediate");
32615 else
32616 error ("the last argument must be an 8-bit immediate");
32617 return const0_rtx;
32618 }
32619 }
32620 else
32621 {
32622 if (i == memory)
32623 {
32624 /* This must be the memory operand. */
32625 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
32626 op = gen_rtx_MEM (mode, op);
32627 gcc_assert (GET_MODE (op) == mode
32628 || GET_MODE (op) == VOIDmode);
32629 }
32630 else
32631 {
32632 /* This must be a register. */
32633 if (VECTOR_MODE_P (mode))
32634 op = safe_vector_operand (op, mode);
32635
32636 gcc_assert (GET_MODE (op) == mode
32637 || GET_MODE (op) == VOIDmode);
32638 op = copy_to_mode_reg (mode, op);
32639 }
32640 }
32641
32642 args[i].op = op;
32643 args[i].mode = mode;
32644 }
32645
32646 switch (nargs)
32647 {
32648 case 0:
32649 pat = GEN_FCN (icode) (target);
32650 break;
32651 case 1:
32652 pat = GEN_FCN (icode) (target, args[0].op);
32653 break;
32654 case 2:
32655 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
32656 break;
32657 case 3:
32658 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
32659 break;
32660 default:
32661 gcc_unreachable ();
32662 }
32663
32664 if (! pat)
32665 return 0;
32666 emit_insn (pat);
32667 return klass == store ? 0 : target;
32668 }
32669
32670 /* Return the integer constant in ARG. Constrain it to be in the range
32671 of the subparts of VEC_TYPE; issue an error if not. */
32672
32673 static int
32674 get_element_number (tree vec_type, tree arg)
32675 {
32676 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
32677
32678 if (!tree_fits_uhwi_p (arg)
32679 || (elt = tree_to_uhwi (arg), elt > max))
32680 {
32681 error ("selector must be an integer constant in the range 0..%wi", max);
32682 return 0;
32683 }
32684
32685 return elt;
32686 }
32687
32688 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32689 ix86_expand_vector_init. We DO have language-level syntax for this, in
32690 the form of (type){ init-list }. Except that since we can't place emms
32691 instructions from inside the compiler, we can't allow the use of MMX
32692 registers unless the user explicitly asks for it. So we do *not* define
32693 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
32694 we have builtins invoked by mmintrin.h that give us license to emit
32695 these sorts of instructions. */
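/* For illustration only (a sketch of the user-level mapping; see mmintrin.h
   for the authoritative definitions): _mm_set_pi32 (__i1, __i0) is defined
   roughly as (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1), and that
   builtin reaches this function through IX86_BUILTIN_VEC_INIT_V2SI in
   ix86_expand_builtin below.  */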
32696
32697 static rtx
32698 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
32699 {
32700 enum machine_mode tmode = TYPE_MODE (type);
32701 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
32702 int i, n_elt = GET_MODE_NUNITS (tmode);
32703 rtvec v = rtvec_alloc (n_elt);
32704
32705 gcc_assert (VECTOR_MODE_P (tmode));
32706 gcc_assert (call_expr_nargs (exp) == n_elt);
32707
32708 for (i = 0; i < n_elt; ++i)
32709 {
32710 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
32711 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
32712 }
32713
32714 if (!target || !register_operand (target, tmode))
32715 target = gen_reg_rtx (tmode);
32716
32717 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
32718 return target;
32719 }
32720
32721 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32722 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
32723 had a language-level syntax for referencing vector elements. */
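/* For illustration only (a sketch; see emmintrin.h for the authoritative
   definition): _mm_extract_epi16 (__A, __N) expands roughly to
   (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi) __A, __N), which
   is routed here via IX86_BUILTIN_VEC_EXT_V8HI.  */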
32724
32725 static rtx
32726 ix86_expand_vec_ext_builtin (tree exp, rtx target)
32727 {
32728 enum machine_mode tmode, mode0;
32729 tree arg0, arg1;
32730 int elt;
32731 rtx op0;
32732
32733 arg0 = CALL_EXPR_ARG (exp, 0);
32734 arg1 = CALL_EXPR_ARG (exp, 1);
32735
32736 op0 = expand_normal (arg0);
32737 elt = get_element_number (TREE_TYPE (arg0), arg1);
32738
32739 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32740 mode0 = TYPE_MODE (TREE_TYPE (arg0));
32741 gcc_assert (VECTOR_MODE_P (mode0));
32742
32743 op0 = force_reg (mode0, op0);
32744
32745 if (optimize || !target || !register_operand (target, tmode))
32746 target = gen_reg_rtx (tmode);
32747
32748 ix86_expand_vector_extract (true, target, op0, elt);
32749
32750 return target;
32751 }
32752
32753 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32754 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
32755 a language-level syntax for referencing vector elements. */
32756
32757 static rtx
32758 ix86_expand_vec_set_builtin (tree exp)
32759 {
32760 enum machine_mode tmode, mode1;
32761 tree arg0, arg1, arg2;
32762 int elt;
32763 rtx op0, op1, target;
32764
32765 arg0 = CALL_EXPR_ARG (exp, 0);
32766 arg1 = CALL_EXPR_ARG (exp, 1);
32767 arg2 = CALL_EXPR_ARG (exp, 2);
32768
32769 tmode = TYPE_MODE (TREE_TYPE (arg0));
32770 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32771 gcc_assert (VECTOR_MODE_P (tmode));
32772
32773 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
32774 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
32775 elt = get_element_number (TREE_TYPE (arg0), arg2);
32776
32777 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
32778 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
32779
32780 op0 = force_reg (tmode, op0);
32781 op1 = force_reg (mode1, op1);
32782
32783 /* OP0 is the source of these builtin functions and shouldn't be
32784 modified. Create a copy, use it and return it as target. */
32785 target = gen_reg_rtx (tmode);
32786 emit_move_insn (target, op0);
32787 ix86_expand_vector_set (true, target, op1, elt);
32788
32789 return target;
32790 }
32791
32792 /* Expand an expression EXP that calls a built-in function,
32793 with result going to TARGET if that's convenient
32794 (and in mode MODE if that's convenient).
32795 SUBTARGET may be used as the target for computing one of EXP's operands.
32796 IGNORE is nonzero if the value is to be ignored. */
32797
32798 static rtx
32799 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
32800 enum machine_mode mode, int ignore)
32801 {
32802 const struct builtin_description *d;
32803 size_t i;
32804 enum insn_code icode;
32805 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
32806 tree arg0, arg1, arg2, arg3, arg4;
32807 rtx op0, op1, op2, op3, op4, pat, insn;
32808 enum machine_mode mode0, mode1, mode2, mode3, mode4;
32809 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
32810
32811 /* For CPU builtins that can be folded, fold first and expand the fold. */
32812 switch (fcode)
32813 {
32814 case IX86_BUILTIN_CPU_INIT:
32815 {
32816 /* Make it call __cpu_indicator_init in libgcc. */
32817 tree call_expr, fndecl, type;
32818 type = build_function_type_list (integer_type_node, NULL_TREE);
32819 fndecl = build_fn_decl ("__cpu_indicator_init", type);
32820 call_expr = build_call_expr (fndecl, 0);
32821 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
32822 }
32823 case IX86_BUILTIN_CPU_IS:
32824 case IX86_BUILTIN_CPU_SUPPORTS:
32825 {
32826 tree arg0 = CALL_EXPR_ARG (exp, 0);
32827 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
32828 gcc_assert (fold_expr != NULL_TREE);
32829 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
32830 }
32831 }
32832
32833 /* Determine whether the builtin function is available under the current ISA.
32834 Originally the builtin was not created if it wasn't applicable to the
32835 current ISA based on the command-line switches. With function-specific
32836 options, we need to check in the context of the function making the call
32837 whether it is supported. */
32838 if (ix86_builtins_isa[fcode].isa
32839 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
32840 {
32841 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
32842 NULL, (enum fpmath_unit) 0, false);
32843
32844 if (!opts)
32845 error ("%qE needs unknown isa option", fndecl);
32846 else
32847 {
32848 gcc_assert (opts != NULL);
32849 error ("%qE needs isa option %s", fndecl, opts);
32850 free (opts);
32851 }
32852 return const0_rtx;
32853 }
32854
32855 switch (fcode)
32856 {
32857 case IX86_BUILTIN_MASKMOVQ:
32858 case IX86_BUILTIN_MASKMOVDQU:
32859 icode = (fcode == IX86_BUILTIN_MASKMOVQ
32860 ? CODE_FOR_mmx_maskmovq
32861 : CODE_FOR_sse2_maskmovdqu);
32862 /* Note the arg order is different from the operand order. */
32863 arg1 = CALL_EXPR_ARG (exp, 0);
32864 arg2 = CALL_EXPR_ARG (exp, 1);
32865 arg0 = CALL_EXPR_ARG (exp, 2);
32866 op0 = expand_normal (arg0);
32867 op1 = expand_normal (arg1);
32868 op2 = expand_normal (arg2);
32869 mode0 = insn_data[icode].operand[0].mode;
32870 mode1 = insn_data[icode].operand[1].mode;
32871 mode2 = insn_data[icode].operand[2].mode;
32872
32873 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32874 op0 = gen_rtx_MEM (mode1, op0);
32875
32876 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32877 op0 = copy_to_mode_reg (mode0, op0);
32878 if (!insn_data[icode].operand[1].predicate (op1, mode1))
32879 op1 = copy_to_mode_reg (mode1, op1);
32880 if (!insn_data[icode].operand[2].predicate (op2, mode2))
32881 op2 = copy_to_mode_reg (mode2, op2);
32882 pat = GEN_FCN (icode) (op0, op1, op2);
32883 if (! pat)
32884 return 0;
32885 emit_insn (pat);
32886 return 0;
32887
32888 case IX86_BUILTIN_LDMXCSR:
32889 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
32890 target = assign_386_stack_local (SImode, SLOT_TEMP);
32891 emit_move_insn (target, op0);
32892 emit_insn (gen_sse_ldmxcsr (target));
32893 return 0;
32894
32895 case IX86_BUILTIN_STMXCSR:
32896 target = assign_386_stack_local (SImode, SLOT_TEMP);
32897 emit_insn (gen_sse_stmxcsr (target));
32898 return copy_to_mode_reg (SImode, target);
32899
32900 case IX86_BUILTIN_CLFLUSH:
32901 arg0 = CALL_EXPR_ARG (exp, 0);
32902 op0 = expand_normal (arg0);
32903 icode = CODE_FOR_sse2_clflush;
32904 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32905 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32906
32907 emit_insn (gen_sse2_clflush (op0));
32908 return 0;
32909
32910 case IX86_BUILTIN_MONITOR:
32911 arg0 = CALL_EXPR_ARG (exp, 0);
32912 arg1 = CALL_EXPR_ARG (exp, 1);
32913 arg2 = CALL_EXPR_ARG (exp, 2);
32914 op0 = expand_normal (arg0);
32915 op1 = expand_normal (arg1);
32916 op2 = expand_normal (arg2);
32917 if (!REG_P (op0))
32918 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32919 if (!REG_P (op1))
32920 op1 = copy_to_mode_reg (SImode, op1);
32921 if (!REG_P (op2))
32922 op2 = copy_to_mode_reg (SImode, op2);
32923 emit_insn (ix86_gen_monitor (op0, op1, op2));
32924 return 0;
32925
32926 case IX86_BUILTIN_MWAIT:
32927 arg0 = CALL_EXPR_ARG (exp, 0);
32928 arg1 = CALL_EXPR_ARG (exp, 1);
32929 op0 = expand_normal (arg0);
32930 op1 = expand_normal (arg1);
32931 if (!REG_P (op0))
32932 op0 = copy_to_mode_reg (SImode, op0);
32933 if (!REG_P (op1))
32934 op1 = copy_to_mode_reg (SImode, op1);
32935 emit_insn (gen_sse3_mwait (op0, op1));
32936 return 0;
32937
32938 case IX86_BUILTIN_VEC_INIT_V2SI:
32939 case IX86_BUILTIN_VEC_INIT_V4HI:
32940 case IX86_BUILTIN_VEC_INIT_V8QI:
32941 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
32942
32943 case IX86_BUILTIN_VEC_EXT_V2DF:
32944 case IX86_BUILTIN_VEC_EXT_V2DI:
32945 case IX86_BUILTIN_VEC_EXT_V4SF:
32946 case IX86_BUILTIN_VEC_EXT_V4SI:
32947 case IX86_BUILTIN_VEC_EXT_V8HI:
32948 case IX86_BUILTIN_VEC_EXT_V2SI:
32949 case IX86_BUILTIN_VEC_EXT_V4HI:
32950 case IX86_BUILTIN_VEC_EXT_V16QI:
32951 return ix86_expand_vec_ext_builtin (exp, target);
32952
32953 case IX86_BUILTIN_VEC_SET_V2DI:
32954 case IX86_BUILTIN_VEC_SET_V4SF:
32955 case IX86_BUILTIN_VEC_SET_V4SI:
32956 case IX86_BUILTIN_VEC_SET_V8HI:
32957 case IX86_BUILTIN_VEC_SET_V4HI:
32958 case IX86_BUILTIN_VEC_SET_V16QI:
32959 return ix86_expand_vec_set_builtin (exp);
32960
32961 case IX86_BUILTIN_INFQ:
32962 case IX86_BUILTIN_HUGE_VALQ:
32963 {
32964 REAL_VALUE_TYPE inf;
32965 rtx tmp;
32966
32967 real_inf (&inf);
32968 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
32969
32970 tmp = validize_mem (force_const_mem (mode, tmp));
32971
32972 if (target == 0)
32973 target = gen_reg_rtx (mode);
32974
32975 emit_move_insn (target, tmp);
32976 return target;
32977 }
32978
32979 case IX86_BUILTIN_RDPMC:
32980 case IX86_BUILTIN_RDTSC:
32981 case IX86_BUILTIN_RDTSCP:
32982
32983 op0 = gen_reg_rtx (DImode);
32984 op1 = gen_reg_rtx (DImode);
32985
32986 if (fcode == IX86_BUILTIN_RDPMC)
32987 {
32988 arg0 = CALL_EXPR_ARG (exp, 0);
32989 op2 = expand_normal (arg0);
32990 if (!register_operand (op2, SImode))
32991 op2 = copy_to_mode_reg (SImode, op2);
32992
32993 insn = (TARGET_64BIT
32994 ? gen_rdpmc_rex64 (op0, op1, op2)
32995 : gen_rdpmc (op0, op2));
32996 emit_insn (insn);
32997 }
32998 else if (fcode == IX86_BUILTIN_RDTSC)
32999 {
33000 insn = (TARGET_64BIT
33001 ? gen_rdtsc_rex64 (op0, op1)
33002 : gen_rdtsc (op0));
33003 emit_insn (insn);
33004 }
33005 else
33006 {
33007 op2 = gen_reg_rtx (SImode);
33008
33009 insn = (TARGET_64BIT
33010 ? gen_rdtscp_rex64 (op0, op1, op2)
33011 : gen_rdtscp (op0, op2));
33012 emit_insn (insn);
33013
33014 arg0 = CALL_EXPR_ARG (exp, 0);
33015 op4 = expand_normal (arg0);
33016 if (!address_operand (op4, VOIDmode))
33017 {
33018 op4 = convert_memory_address (Pmode, op4);
33019 op4 = copy_addr_to_reg (op4);
33020 }
33021 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
33022 }
33023
33024 if (target == 0)
33025 {
33026 /* mode is VOIDmode if __builtin_rd* has been called
33027 without lhs. */
33028 if (mode == VOIDmode)
33029 return target;
33030 target = gen_reg_rtx (mode);
33031 }
33032
33033 if (TARGET_64BIT)
33034 {
33035 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
33036 op1, 1, OPTAB_DIRECT);
33037 op0 = expand_simple_binop (DImode, IOR, op0, op1,
33038 op0, 1, OPTAB_DIRECT);
33039 }
33040
33041 emit_move_insn (target, op0);
33042 return target;
33043
33044 case IX86_BUILTIN_FXSAVE:
33045 case IX86_BUILTIN_FXRSTOR:
33046 case IX86_BUILTIN_FXSAVE64:
33047 case IX86_BUILTIN_FXRSTOR64:
33048 case IX86_BUILTIN_FNSTENV:
33049 case IX86_BUILTIN_FLDENV:
33050 case IX86_BUILTIN_FNSTSW:
33051 mode0 = BLKmode;
33052 switch (fcode)
33053 {
33054 case IX86_BUILTIN_FXSAVE:
33055 icode = CODE_FOR_fxsave;
33056 break;
33057 case IX86_BUILTIN_FXRSTOR:
33058 icode = CODE_FOR_fxrstor;
33059 break;
33060 case IX86_BUILTIN_FXSAVE64:
33061 icode = CODE_FOR_fxsave64;
33062 break;
33063 case IX86_BUILTIN_FXRSTOR64:
33064 icode = CODE_FOR_fxrstor64;
33065 break;
33066 case IX86_BUILTIN_FNSTENV:
33067 icode = CODE_FOR_fnstenv;
33068 break;
33069 case IX86_BUILTIN_FLDENV:
33070 icode = CODE_FOR_fldenv;
33071 break;
33072 case IX86_BUILTIN_FNSTSW:
33073 icode = CODE_FOR_fnstsw;
33074 mode0 = HImode;
33075 break;
33076 default:
33077 gcc_unreachable ();
33078 }
33079
33080 arg0 = CALL_EXPR_ARG (exp, 0);
33081 op0 = expand_normal (arg0);
33082
33083 if (!address_operand (op0, VOIDmode))
33084 {
33085 op0 = convert_memory_address (Pmode, op0);
33086 op0 = copy_addr_to_reg (op0);
33087 }
33088 op0 = gen_rtx_MEM (mode0, op0);
33089
33090 pat = GEN_FCN (icode) (op0);
33091 if (pat)
33092 emit_insn (pat);
33093 return 0;
33094
33095 case IX86_BUILTIN_XSAVE:
33096 case IX86_BUILTIN_XRSTOR:
33097 case IX86_BUILTIN_XSAVE64:
33098 case IX86_BUILTIN_XRSTOR64:
33099 case IX86_BUILTIN_XSAVEOPT:
33100 case IX86_BUILTIN_XSAVEOPT64:
33101 arg0 = CALL_EXPR_ARG (exp, 0);
33102 arg1 = CALL_EXPR_ARG (exp, 1);
33103 op0 = expand_normal (arg0);
33104 op1 = expand_normal (arg1);
33105
33106 if (!address_operand (op0, VOIDmode))
33107 {
33108 op0 = convert_memory_address (Pmode, op0);
33109 op0 = copy_addr_to_reg (op0);
33110 }
33111 op0 = gen_rtx_MEM (BLKmode, op0);
33112
33113 op1 = force_reg (DImode, op1);
33114
33115 if (TARGET_64BIT)
33116 {
33117 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
33118 NULL, 1, OPTAB_DIRECT);
33119 switch (fcode)
33120 {
33121 case IX86_BUILTIN_XSAVE:
33122 icode = CODE_FOR_xsave_rex64;
33123 break;
33124 case IX86_BUILTIN_XRSTOR:
33125 icode = CODE_FOR_xrstor_rex64;
33126 break;
33127 case IX86_BUILTIN_XSAVE64:
33128 icode = CODE_FOR_xsave64;
33129 break;
33130 case IX86_BUILTIN_XRSTOR64:
33131 icode = CODE_FOR_xrstor64;
33132 break;
33133 case IX86_BUILTIN_XSAVEOPT:
33134 icode = CODE_FOR_xsaveopt_rex64;
33135 break;
33136 case IX86_BUILTIN_XSAVEOPT64:
33137 icode = CODE_FOR_xsaveopt64;
33138 break;
33139 default:
33140 gcc_unreachable ();
33141 }
33142
33143 op2 = gen_lowpart (SImode, op2);
33144 op1 = gen_lowpart (SImode, op1);
33145 pat = GEN_FCN (icode) (op0, op1, op2);
33146 }
33147 else
33148 {
33149 switch (fcode)
33150 {
33151 case IX86_BUILTIN_XSAVE:
33152 icode = CODE_FOR_xsave;
33153 break;
33154 case IX86_BUILTIN_XRSTOR:
33155 icode = CODE_FOR_xrstor;
33156 break;
33157 case IX86_BUILTIN_XSAVEOPT:
33158 icode = CODE_FOR_xsaveopt;
33159 break;
33160 default:
33161 gcc_unreachable ();
33162 }
33163 pat = GEN_FCN (icode) (op0, op1);
33164 }
33165
33166 if (pat)
33167 emit_insn (pat);
33168 return 0;
33169
33170 case IX86_BUILTIN_LLWPCB:
33171 arg0 = CALL_EXPR_ARG (exp, 0);
33172 op0 = expand_normal (arg0);
33173 icode = CODE_FOR_lwp_llwpcb;
33174 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
33175 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
33176 emit_insn (gen_lwp_llwpcb (op0));
33177 return 0;
33178
33179 case IX86_BUILTIN_SLWPCB:
33180 icode = CODE_FOR_lwp_slwpcb;
33181 if (!target
33182 || !insn_data[icode].operand[0].predicate (target, Pmode))
33183 target = gen_reg_rtx (Pmode);
33184 emit_insn (gen_lwp_slwpcb (target));
33185 return target;
33186
33187 case IX86_BUILTIN_BEXTRI32:
33188 case IX86_BUILTIN_BEXTRI64:
33189 arg0 = CALL_EXPR_ARG (exp, 0);
33190 arg1 = CALL_EXPR_ARG (exp, 1);
33191 op0 = expand_normal (arg0);
33192 op1 = expand_normal (arg1);
33193 icode = (fcode == IX86_BUILTIN_BEXTRI32
33194 ? CODE_FOR_tbm_bextri_si
33195 : CODE_FOR_tbm_bextri_di);
33196 if (!CONST_INT_P (op1))
33197 {
33198 error ("last argument must be an immediate");
33199 return const0_rtx;
33200 }
33201 else
33202 {
33203 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
33204 unsigned char lsb_index = INTVAL (op1) & 0xFF;
33205 op1 = GEN_INT (length);
33206 op2 = GEN_INT (lsb_index);
33207 pat = GEN_FCN (icode) (target, op0, op1, op2);
33208 if (pat)
33209 emit_insn (pat);
33210 return target;
33211 }
33212
33213 case IX86_BUILTIN_RDRAND16_STEP:
33214 icode = CODE_FOR_rdrandhi_1;
33215 mode0 = HImode;
33216 goto rdrand_step;
33217
33218 case IX86_BUILTIN_RDRAND32_STEP:
33219 icode = CODE_FOR_rdrandsi_1;
33220 mode0 = SImode;
33221 goto rdrand_step;
33222
33223 case IX86_BUILTIN_RDRAND64_STEP:
33224 icode = CODE_FOR_rdranddi_1;
33225 mode0 = DImode;
33226
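/* A sketch of the user-level contract (see immintrin.h for the exact
   definitions): for example _rdrand32_step (&val) stores a random value
   through the pointer and returns 1 if the hardware reported success
   (CF set), 0 otherwise.  The code below emits the rdrand, stores the
   result through the pointer argument, and then materializes CF as the
   SImode return value via a conditional move.  */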
33227 rdrand_step:
33228 op0 = gen_reg_rtx (mode0);
33229 emit_insn (GEN_FCN (icode) (op0));
33230
33231 arg0 = CALL_EXPR_ARG (exp, 0);
33232 op1 = expand_normal (arg0);
33233 if (!address_operand (op1, VOIDmode))
33234 {
33235 op1 = convert_memory_address (Pmode, op1);
33236 op1 = copy_addr_to_reg (op1);
33237 }
33238 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
33239
33240 op1 = gen_reg_rtx (SImode);
33241 emit_move_insn (op1, CONST1_RTX (SImode));
33242
33243 /* Emit SImode conditional move. */
33244 if (mode0 == HImode)
33245 {
33246 op2 = gen_reg_rtx (SImode);
33247 emit_insn (gen_zero_extendhisi2 (op2, op0));
33248 }
33249 else if (mode0 == SImode)
33250 op2 = op0;
33251 else
33252 op2 = gen_rtx_SUBREG (SImode, op0, 0);
33253
33254 if (target == 0)
33255 target = gen_reg_rtx (SImode);
33256
33257 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
33258 const0_rtx);
33259 emit_insn (gen_rtx_SET (VOIDmode, target,
33260 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
33261 return target;
33262
33263 case IX86_BUILTIN_RDSEED16_STEP:
33264 icode = CODE_FOR_rdseedhi_1;
33265 mode0 = HImode;
33266 goto rdseed_step;
33267
33268 case IX86_BUILTIN_RDSEED32_STEP:
33269 icode = CODE_FOR_rdseedsi_1;
33270 mode0 = SImode;
33271 goto rdseed_step;
33272
33273 case IX86_BUILTIN_RDSEED64_STEP:
33274 icode = CODE_FOR_rdseeddi_1;
33275 mode0 = DImode;
33276
33277 rdseed_step:
33278 op0 = gen_reg_rtx (mode0);
33279 emit_insn (GEN_FCN (icode) (op0));
33280
33281 arg0 = CALL_EXPR_ARG (exp, 0);
33282 op1 = expand_normal (arg0);
33283 if (!address_operand (op1, VOIDmode))
33284 {
33285 op1 = convert_memory_address (Pmode, op1);
33286 op1 = copy_addr_to_reg (op1);
33287 }
33288 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
33289
33290 op2 = gen_reg_rtx (QImode);
33291
33292 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
33293 const0_rtx);
33294 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
33295
33296 if (target == 0)
33297 target = gen_reg_rtx (SImode);
33298
33299 emit_insn (gen_zero_extendqisi2 (target, op2));
33300 return target;
33301
33302 case IX86_BUILTIN_ADDCARRYX32:
33303 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
33304 mode0 = SImode;
33305 goto addcarryx;
33306
33307 case IX86_BUILTIN_ADDCARRYX64:
33308 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
33309 mode0 = DImode;
33310
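/* A sketch of the user-level contract (see the ADX intrinsics header for
   the exact definition): _addcarryx_u32 (c_in, x, y, &sum) stores
   x + y + c_in through the pointer and returns the carry out of that
   addition.  The code below first regenerates CF from the incoming c_in
   byte, then emits the adcx (or plain adc) pattern and finally returns
   CF.  */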
33311 addcarryx:
33312 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
33313 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
33314 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
33315 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
33316
33317 op0 = gen_reg_rtx (QImode);
33318
33319 /* Generate CF from input operand. */
33320 op1 = expand_normal (arg0);
33321 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
33322 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
33323
33324 /* Generate the ADCX instruction to compute X+Y+CF. */
33325 op2 = expand_normal (arg1);
33326 op3 = expand_normal (arg2);
33327
33328 if (!REG_P (op2))
33329 op2 = copy_to_mode_reg (mode0, op2);
33330 if (!REG_P (op3))
33331 op3 = copy_to_mode_reg (mode0, op3);
33332
33333 op0 = gen_reg_rtx (mode0);
33334
33335 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
33336 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
33337 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
33338
33339 /* Store the result. */
33340 op4 = expand_normal (arg3);
33341 if (!address_operand (op4, VOIDmode))
33342 {
33343 op4 = convert_memory_address (Pmode, op4);
33344 op4 = copy_addr_to_reg (op4);
33345 }
33346 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
33347
33348 /* Return current CF value. */
33349 if (target == 0)
33350 target = gen_reg_rtx (QImode);
33351
33352 PUT_MODE (pat, QImode);
33353 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
33354 return target;
33355
33356 case IX86_BUILTIN_GATHERSIV2DF:
33357 icode = CODE_FOR_avx2_gathersiv2df;
33358 goto gather_gen;
33359 case IX86_BUILTIN_GATHERSIV4DF:
33360 icode = CODE_FOR_avx2_gathersiv4df;
33361 goto gather_gen;
33362 case IX86_BUILTIN_GATHERDIV2DF:
33363 icode = CODE_FOR_avx2_gatherdiv2df;
33364 goto gather_gen;
33365 case IX86_BUILTIN_GATHERDIV4DF:
33366 icode = CODE_FOR_avx2_gatherdiv4df;
33367 goto gather_gen;
33368 case IX86_BUILTIN_GATHERSIV4SF:
33369 icode = CODE_FOR_avx2_gathersiv4sf;
33370 goto gather_gen;
33371 case IX86_BUILTIN_GATHERSIV8SF:
33372 icode = CODE_FOR_avx2_gathersiv8sf;
33373 goto gather_gen;
33374 case IX86_BUILTIN_GATHERDIV4SF:
33375 icode = CODE_FOR_avx2_gatherdiv4sf;
33376 goto gather_gen;
33377 case IX86_BUILTIN_GATHERDIV8SF:
33378 icode = CODE_FOR_avx2_gatherdiv8sf;
33379 goto gather_gen;
33380 case IX86_BUILTIN_GATHERSIV2DI:
33381 icode = CODE_FOR_avx2_gathersiv2di;
33382 goto gather_gen;
33383 case IX86_BUILTIN_GATHERSIV4DI:
33384 icode = CODE_FOR_avx2_gathersiv4di;
33385 goto gather_gen;
33386 case IX86_BUILTIN_GATHERDIV2DI:
33387 icode = CODE_FOR_avx2_gatherdiv2di;
33388 goto gather_gen;
33389 case IX86_BUILTIN_GATHERDIV4DI:
33390 icode = CODE_FOR_avx2_gatherdiv4di;
33391 goto gather_gen;
33392 case IX86_BUILTIN_GATHERSIV4SI:
33393 icode = CODE_FOR_avx2_gathersiv4si;
33394 goto gather_gen;
33395 case IX86_BUILTIN_GATHERSIV8SI:
33396 icode = CODE_FOR_avx2_gathersiv8si;
33397 goto gather_gen;
33398 case IX86_BUILTIN_GATHERDIV4SI:
33399 icode = CODE_FOR_avx2_gatherdiv4si;
33400 goto gather_gen;
33401 case IX86_BUILTIN_GATHERDIV8SI:
33402 icode = CODE_FOR_avx2_gatherdiv8si;
33403 goto gather_gen;
33404 case IX86_BUILTIN_GATHERALTSIV4DF:
33405 icode = CODE_FOR_avx2_gathersiv4df;
33406 goto gather_gen;
33407 case IX86_BUILTIN_GATHERALTDIV8SF:
33408 icode = CODE_FOR_avx2_gatherdiv8sf;
33409 goto gather_gen;
33410 case IX86_BUILTIN_GATHERALTSIV4DI:
33411 icode = CODE_FOR_avx2_gathersiv4di;
33412 goto gather_gen;
33413 case IX86_BUILTIN_GATHERALTDIV8SI:
33414 icode = CODE_FOR_avx2_gatherdiv8si;
33415 goto gather_gen;
33416
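/* A sketch of how this code is reached (assuming AVX2): user-level
   intrinsics such as _mm256_i32gather_pd in avx2intrin.h map onto the
   masked gather builtins, while the GATHERALT* variants are created by
   ix86_vectorize_builtin_gather below for the vectorizer when the index
   and data vectors have different element counts; the code that follows
   extracts the low half of the wider operand in that case.  */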
33417 gather_gen:
33418 arg0 = CALL_EXPR_ARG (exp, 0);
33419 arg1 = CALL_EXPR_ARG (exp, 1);
33420 arg2 = CALL_EXPR_ARG (exp, 2);
33421 arg3 = CALL_EXPR_ARG (exp, 3);
33422 arg4 = CALL_EXPR_ARG (exp, 4);
33423 op0 = expand_normal (arg0);
33424 op1 = expand_normal (arg1);
33425 op2 = expand_normal (arg2);
33426 op3 = expand_normal (arg3);
33427 op4 = expand_normal (arg4);
33428 /* Note the arg order is different from the operand order. */
33429 mode0 = insn_data[icode].operand[1].mode;
33430 mode2 = insn_data[icode].operand[3].mode;
33431 mode3 = insn_data[icode].operand[4].mode;
33432 mode4 = insn_data[icode].operand[5].mode;
33433
33434 if (target == NULL_RTX
33435 || GET_MODE (target) != insn_data[icode].operand[0].mode)
33436 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
33437 else
33438 subtarget = target;
33439
33440 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
33441 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
33442 {
33443 rtx half = gen_reg_rtx (V4SImode);
33444 if (!nonimmediate_operand (op2, V8SImode))
33445 op2 = copy_to_mode_reg (V8SImode, op2);
33446 emit_insn (gen_vec_extract_lo_v8si (half, op2));
33447 op2 = half;
33448 }
33449 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
33450 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
33451 {
33452 rtx (*gen) (rtx, rtx);
33453 rtx half = gen_reg_rtx (mode0);
33454 if (mode0 == V4SFmode)
33455 gen = gen_vec_extract_lo_v8sf;
33456 else
33457 gen = gen_vec_extract_lo_v8si;
33458 if (!nonimmediate_operand (op0, GET_MODE (op0)))
33459 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
33460 emit_insn (gen (half, op0));
33461 op0 = half;
33462 if (!nonimmediate_operand (op3, GET_MODE (op3)))
33463 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
33464 emit_insn (gen (half, op3));
33465 op3 = half;
33466 }
33467
33468 /* Force the memory operand to be addressed through a base register
33469 here, but don't do this for the memory operands of other builtin
33470 functions. */
33471 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
33472
33473 if (!insn_data[icode].operand[1].predicate (op0, mode0))
33474 op0 = copy_to_mode_reg (mode0, op0);
33475 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
33476 op1 = copy_to_mode_reg (Pmode, op1);
33477 if (!insn_data[icode].operand[3].predicate (op2, mode2))
33478 op2 = copy_to_mode_reg (mode2, op2);
33479 if (!insn_data[icode].operand[4].predicate (op3, mode3))
33480 op3 = copy_to_mode_reg (mode3, op3);
33481 if (!insn_data[icode].operand[5].predicate (op4, mode4))
33482 {
33483 error ("last argument must be scale 1, 2, 4, 8");
33484 return const0_rtx;
33485 }
33486
33487 /* Optimize. If mask is known to have all high bits set,
33488 replace op0 with pc_rtx to signal that the instruction
33489 overwrites the whole destination and doesn't use its
33490 previous contents. */
33491 if (optimize)
33492 {
33493 if (TREE_CODE (arg3) == VECTOR_CST)
33494 {
33495 unsigned int negative = 0;
33496 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
33497 {
33498 tree cst = VECTOR_CST_ELT (arg3, i);
33499 if (TREE_CODE (cst) == INTEGER_CST
33500 && tree_int_cst_sign_bit (cst))
33501 negative++;
33502 else if (TREE_CODE (cst) == REAL_CST
33503 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
33504 negative++;
33505 }
33506 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
33507 op0 = pc_rtx;
33508 }
33509 else if (TREE_CODE (arg3) == SSA_NAME)
33510 {
33511 /* Recognize also when mask is like:
33512 __v2df src = _mm_setzero_pd ();
33513 __v2df mask = _mm_cmpeq_pd (src, src);
33514 or
33515 __v8sf src = _mm256_setzero_ps ();
33516 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
33517 as that is a cheaper way to load all ones into
33518 a register than having to load a constant from
33519 memory. */
33520 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
33521 if (is_gimple_call (def_stmt))
33522 {
33523 tree fndecl = gimple_call_fndecl (def_stmt);
33524 if (fndecl
33525 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33526 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
33527 {
33528 case IX86_BUILTIN_CMPPD:
33529 case IX86_BUILTIN_CMPPS:
33530 case IX86_BUILTIN_CMPPD256:
33531 case IX86_BUILTIN_CMPPS256:
33532 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
33533 break;
33534 /* FALLTHRU */
33535 case IX86_BUILTIN_CMPEQPD:
33536 case IX86_BUILTIN_CMPEQPS:
33537 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
33538 && initializer_zerop (gimple_call_arg (def_stmt,
33539 1)))
33540 op0 = pc_rtx;
33541 break;
33542 default:
33543 break;
33544 }
33545 }
33546 }
33547 }
33548
33549 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
33550 if (! pat)
33551 return const0_rtx;
33552 emit_insn (pat);
33553
33554 if (fcode == IX86_BUILTIN_GATHERDIV8SF
33555 || fcode == IX86_BUILTIN_GATHERDIV8SI)
33556 {
33557 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
33558 ? V4SFmode : V4SImode;
33559 if (target == NULL_RTX)
33560 target = gen_reg_rtx (tmode);
33561 if (tmode == V4SFmode)
33562 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
33563 else
33564 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
33565 }
33566 else
33567 target = subtarget;
33568
33569 return target;
33570
33571 case IX86_BUILTIN_XABORT:
33572 icode = CODE_FOR_xabort;
33573 arg0 = CALL_EXPR_ARG (exp, 0);
33574 op0 = expand_normal (arg0);
33575 mode0 = insn_data[icode].operand[0].mode;
33576 if (!insn_data[icode].operand[0].predicate (op0, mode0))
33577 {
33578 error ("the xabort's argument must be an 8-bit immediate");
33579 return const0_rtx;
33580 }
33581 emit_insn (gen_xabort (op0));
33582 return 0;
33583
33584 default:
33585 break;
33586 }
33587
33588 for (i = 0, d = bdesc_special_args;
33589 i < ARRAY_SIZE (bdesc_special_args);
33590 i++, d++)
33591 if (d->code == fcode)
33592 return ix86_expand_special_args_builtin (d, exp, target);
33593
33594 for (i = 0, d = bdesc_args;
33595 i < ARRAY_SIZE (bdesc_args);
33596 i++, d++)
33597 if (d->code == fcode)
33598 switch (fcode)
33599 {
33600 case IX86_BUILTIN_FABSQ:
33601 case IX86_BUILTIN_COPYSIGNQ:
33602 if (!TARGET_SSE)
33603 /* Emit a normal call if SSE isn't available. */
33604 return expand_call (exp, target, ignore);
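/* FALLTHRU */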
33605 default:
33606 return ix86_expand_args_builtin (d, exp, target);
33607 }
33608
33609 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
33610 if (d->code == fcode)
33611 return ix86_expand_sse_comi (d, exp, target);
33612
33613 for (i = 0, d = bdesc_pcmpestr;
33614 i < ARRAY_SIZE (bdesc_pcmpestr);
33615 i++, d++)
33616 if (d->code == fcode)
33617 return ix86_expand_sse_pcmpestr (d, exp, target);
33618
33619 for (i = 0, d = bdesc_pcmpistr;
33620 i < ARRAY_SIZE (bdesc_pcmpistr);
33621 i++, d++)
33622 if (d->code == fcode)
33623 return ix86_expand_sse_pcmpistr (d, exp, target);
33624
33625 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33626 if (d->code == fcode)
33627 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
33628 (enum ix86_builtin_func_type)
33629 d->flag, d->comparison);
33630
33631 gcc_unreachable ();
33632 }
33633
33634 /* Returns a function decl for a vectorized version of the builtin function
33635 FNDECL, with result vector type TYPE_OUT and argument vector type TYPE_IN,
33636 or NULL_TREE if it is not available. */
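/* An illustrative sketch (assuming 256-bit vectorization of double is
   enabled): a loop calling sqrt () on doubles, vectorized four elements
   at a time, is handled by the BUILT_IN_SQRT case below and replaced by
   the decl for IX86_BUILTIN_SQRTPD256.  */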
33637
33638 static tree
33639 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
33640 tree type_in)
33641 {
33642 enum machine_mode in_mode, out_mode;
33643 int in_n, out_n;
33644 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
33645
33646 if (TREE_CODE (type_out) != VECTOR_TYPE
33647 || TREE_CODE (type_in) != VECTOR_TYPE
33648 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
33649 return NULL_TREE;
33650
33651 out_mode = TYPE_MODE (TREE_TYPE (type_out));
33652 out_n = TYPE_VECTOR_SUBPARTS (type_out);
33653 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33654 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33655
33656 switch (fn)
33657 {
33658 case BUILT_IN_SQRT:
33659 if (out_mode == DFmode && in_mode == DFmode)
33660 {
33661 if (out_n == 2 && in_n == 2)
33662 return ix86_builtins[IX86_BUILTIN_SQRTPD];
33663 else if (out_n == 4 && in_n == 4)
33664 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
33665 }
33666 break;
33667
33668 case BUILT_IN_SQRTF:
33669 if (out_mode == SFmode && in_mode == SFmode)
33670 {
33671 if (out_n == 4 && in_n == 4)
33672 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
33673 else if (out_n == 8 && in_n == 8)
33674 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
33675 }
33676 break;
33677
33678 case BUILT_IN_IFLOOR:
33679 case BUILT_IN_LFLOOR:
33680 case BUILT_IN_LLFLOOR:
33681 /* The round insn does not trap on denormals. */
33682 if (flag_trapping_math || !TARGET_ROUND)
33683 break;
33684
33685 if (out_mode == SImode && in_mode == DFmode)
33686 {
33687 if (out_n == 4 && in_n == 2)
33688 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
33689 else if (out_n == 8 && in_n == 4)
33690 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
33691 }
33692 break;
33693
33694 case BUILT_IN_IFLOORF:
33695 case BUILT_IN_LFLOORF:
33696 case BUILT_IN_LLFLOORF:
33697 /* The round insn does not trap on denormals. */
33698 if (flag_trapping_math || !TARGET_ROUND)
33699 break;
33700
33701 if (out_mode == SImode && in_mode == SFmode)
33702 {
33703 if (out_n == 4 && in_n == 4)
33704 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
33705 else if (out_n == 8 && in_n == 8)
33706 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
33707 }
33708 break;
33709
33710 case BUILT_IN_ICEIL:
33711 case BUILT_IN_LCEIL:
33712 case BUILT_IN_LLCEIL:
33713 /* The round insn does not trap on denormals. */
33714 if (flag_trapping_math || !TARGET_ROUND)
33715 break;
33716
33717 if (out_mode == SImode && in_mode == DFmode)
33718 {
33719 if (out_n == 4 && in_n == 2)
33720 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
33721 else if (out_n == 8 && in_n == 4)
33722 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
33723 }
33724 break;
33725
33726 case BUILT_IN_ICEILF:
33727 case BUILT_IN_LCEILF:
33728 case BUILT_IN_LLCEILF:
33729 /* The round insn does not trap on denormals. */
33730 if (flag_trapping_math || !TARGET_ROUND)
33731 break;
33732
33733 if (out_mode == SImode && in_mode == SFmode)
33734 {
33735 if (out_n == 4 && in_n == 4)
33736 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
33737 else if (out_n == 8 && in_n == 8)
33738 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
33739 }
33740 break;
33741
33742 case BUILT_IN_IRINT:
33743 case BUILT_IN_LRINT:
33744 case BUILT_IN_LLRINT:
33745 if (out_mode == SImode && in_mode == DFmode)
33746 {
33747 if (out_n == 4 && in_n == 2)
33748 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
33749 else if (out_n == 8 && in_n == 4)
33750 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
33751 }
33752 break;
33753
33754 case BUILT_IN_IRINTF:
33755 case BUILT_IN_LRINTF:
33756 case BUILT_IN_LLRINTF:
33757 if (out_mode == SImode && in_mode == SFmode)
33758 {
33759 if (out_n == 4 && in_n == 4)
33760 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
33761 else if (out_n == 8 && in_n == 8)
33762 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
33763 }
33764 break;
33765
33766 case BUILT_IN_IROUND:
33767 case BUILT_IN_LROUND:
33768 case BUILT_IN_LLROUND:
33769 /* The round insn does not trap on denormals. */
33770 if (flag_trapping_math || !TARGET_ROUND)
33771 break;
33772
33773 if (out_mode == SImode && in_mode == DFmode)
33774 {
33775 if (out_n == 4 && in_n == 2)
33776 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
33777 else if (out_n == 8 && in_n == 4)
33778 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
33779 }
33780 break;
33781
33782 case BUILT_IN_IROUNDF:
33783 case BUILT_IN_LROUNDF:
33784 case BUILT_IN_LLROUNDF:
33785 /* The round insn does not trap on denormals. */
33786 if (flag_trapping_math || !TARGET_ROUND)
33787 break;
33788
33789 if (out_mode == SImode && in_mode == SFmode)
33790 {
33791 if (out_n == 4 && in_n == 4)
33792 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
33793 else if (out_n == 8 && in_n == 8)
33794 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
33795 }
33796 break;
33797
33798 case BUILT_IN_COPYSIGN:
33799 if (out_mode == DFmode && in_mode == DFmode)
33800 {
33801 if (out_n == 2 && in_n == 2)
33802 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
33803 else if (out_n == 4 && in_n == 4)
33804 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
33805 }
33806 break;
33807
33808 case BUILT_IN_COPYSIGNF:
33809 if (out_mode == SFmode && in_mode == SFmode)
33810 {
33811 if (out_n == 4 && in_n == 4)
33812 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
33813 else if (out_n == 8 && in_n == 8)
33814 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
33815 }
33816 break;
33817
33818 case BUILT_IN_FLOOR:
33819 /* The round insn does not trap on denormals. */
33820 if (flag_trapping_math || !TARGET_ROUND)
33821 break;
33822
33823 if (out_mode == DFmode && in_mode == DFmode)
33824 {
33825 if (out_n == 2 && in_n == 2)
33826 return ix86_builtins[IX86_BUILTIN_FLOORPD];
33827 else if (out_n == 4 && in_n == 4)
33828 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
33829 }
33830 break;
33831
33832 case BUILT_IN_FLOORF:
33833 /* The round insn does not trap on denormals. */
33834 if (flag_trapping_math || !TARGET_ROUND)
33835 break;
33836
33837 if (out_mode == SFmode && in_mode == SFmode)
33838 {
33839 if (out_n == 4 && in_n == 4)
33840 return ix86_builtins[IX86_BUILTIN_FLOORPS];
33841 else if (out_n == 8 && in_n == 8)
33842 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
33843 }
33844 break;
33845
33846 case BUILT_IN_CEIL:
33847 /* The round insn does not trap on denormals. */
33848 if (flag_trapping_math || !TARGET_ROUND)
33849 break;
33850
33851 if (out_mode == DFmode && in_mode == DFmode)
33852 {
33853 if (out_n == 2 && in_n == 2)
33854 return ix86_builtins[IX86_BUILTIN_CEILPD];
33855 else if (out_n == 4 && in_n == 4)
33856 return ix86_builtins[IX86_BUILTIN_CEILPD256];
33857 }
33858 break;
33859
33860 case BUILT_IN_CEILF:
33861 /* The round insn does not trap on denormals. */
33862 if (flag_trapping_math || !TARGET_ROUND)
33863 break;
33864
33865 if (out_mode == SFmode && in_mode == SFmode)
33866 {
33867 if (out_n == 4 && in_n == 4)
33868 return ix86_builtins[IX86_BUILTIN_CEILPS];
33869 else if (out_n == 8 && in_n == 8)
33870 return ix86_builtins[IX86_BUILTIN_CEILPS256];
33871 }
33872 break;
33873
33874 case BUILT_IN_TRUNC:
33875 /* The round insn does not trap on denormals. */
33876 if (flag_trapping_math || !TARGET_ROUND)
33877 break;
33878
33879 if (out_mode == DFmode && in_mode == DFmode)
33880 {
33881 if (out_n == 2 && in_n == 2)
33882 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
33883 else if (out_n == 4 && in_n == 4)
33884 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
33885 }
33886 break;
33887
33888 case BUILT_IN_TRUNCF:
33889 /* The round insn does not trap on denormals. */
33890 if (flag_trapping_math || !TARGET_ROUND)
33891 break;
33892
33893 if (out_mode == SFmode && in_mode == SFmode)
33894 {
33895 if (out_n == 4 && in_n == 4)
33896 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
33897 else if (out_n == 8 && in_n == 8)
33898 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
33899 }
33900 break;
33901
33902 case BUILT_IN_RINT:
33903 /* The round insn does not trap on denormals. */
33904 if (flag_trapping_math || !TARGET_ROUND)
33905 break;
33906
33907 if (out_mode == DFmode && in_mode == DFmode)
33908 {
33909 if (out_n == 2 && in_n == 2)
33910 return ix86_builtins[IX86_BUILTIN_RINTPD];
33911 else if (out_n == 4 && in_n == 4)
33912 return ix86_builtins[IX86_BUILTIN_RINTPD256];
33913 }
33914 break;
33915
33916 case BUILT_IN_RINTF:
33917 /* The round insn does not trap on denormals. */
33918 if (flag_trapping_math || !TARGET_ROUND)
33919 break;
33920
33921 if (out_mode == SFmode && in_mode == SFmode)
33922 {
33923 if (out_n == 4 && in_n == 4)
33924 return ix86_builtins[IX86_BUILTIN_RINTPS];
33925 else if (out_n == 8 && in_n == 8)
33926 return ix86_builtins[IX86_BUILTIN_RINTPS256];
33927 }
33928 break;
33929
33930 case BUILT_IN_ROUND:
33931 /* The round insn does not trap on denormals. */
33932 if (flag_trapping_math || !TARGET_ROUND)
33933 break;
33934
33935 if (out_mode == DFmode && in_mode == DFmode)
33936 {
33937 if (out_n == 2 && in_n == 2)
33938 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
33939 else if (out_n == 4 && in_n == 4)
33940 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
33941 }
33942 break;
33943
33944 case BUILT_IN_ROUNDF:
33945 /* The round insn does not trap on denormals. */
33946 if (flag_trapping_math || !TARGET_ROUND)
33947 break;
33948
33949 if (out_mode == SFmode && in_mode == SFmode)
33950 {
33951 if (out_n == 4 && in_n == 4)
33952 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
33953 else if (out_n == 8 && in_n == 8)
33954 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
33955 }
33956 break;
33957
33958 case BUILT_IN_FMA:
33959 if (out_mode == DFmode && in_mode == DFmode)
33960 {
33961 if (out_n == 2 && in_n == 2)
33962 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
33963 if (out_n == 4 && in_n == 4)
33964 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
33965 }
33966 break;
33967
33968 case BUILT_IN_FMAF:
33969 if (out_mode == SFmode && in_mode == SFmode)
33970 {
33971 if (out_n == 4 && in_n == 4)
33972 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
33973 if (out_n == 8 && in_n == 8)
33974 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
33975 }
33976 break;
33977
33978 default:
33979 break;
33980 }
33981
33982 /* Dispatch to a handler for a vectorization library. */
33983 if (ix86_veclib_handler)
33984 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
33985 type_in);
33986
33987 return NULL_TREE;
33988 }
33989
33990 /* Handler for an SVML-style interface to
33991 a library with vectorized intrinsics. */
33992
33993 static tree
33994 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
33995 {
33996 char name[20];
33997 tree fntype, new_fndecl, args;
33998 unsigned arity;
33999 const char *bname;
34000 enum machine_mode el_mode, in_mode;
34001 int n, in_n;
34002
34003 /* The SVML library is suitable for unsafe math only. */
34004 if (!flag_unsafe_math_optimizations)
34005 return NULL_TREE;
34006
34007 el_mode = TYPE_MODE (TREE_TYPE (type_out));
34008 n = TYPE_VECTOR_SUBPARTS (type_out);
34009 in_mode = TYPE_MODE (TREE_TYPE (type_in));
34010 in_n = TYPE_VECTOR_SUBPARTS (type_in);
34011 if (el_mode != in_mode
34012 || n != in_n)
34013 return NULL_TREE;
34014
34015 switch (fn)
34016 {
34017 case BUILT_IN_EXP:
34018 case BUILT_IN_LOG:
34019 case BUILT_IN_LOG10:
34020 case BUILT_IN_POW:
34021 case BUILT_IN_TANH:
34022 case BUILT_IN_TAN:
34023 case BUILT_IN_ATAN:
34024 case BUILT_IN_ATAN2:
34025 case BUILT_IN_ATANH:
34026 case BUILT_IN_CBRT:
34027 case BUILT_IN_SINH:
34028 case BUILT_IN_SIN:
34029 case BUILT_IN_ASINH:
34030 case BUILT_IN_ASIN:
34031 case BUILT_IN_COSH:
34032 case BUILT_IN_COS:
34033 case BUILT_IN_ACOSH:
34034 case BUILT_IN_ACOS:
34035 if (el_mode != DFmode || n != 2)
34036 return NULL_TREE;
34037 break;
34038
34039 case BUILT_IN_EXPF:
34040 case BUILT_IN_LOGF:
34041 case BUILT_IN_LOG10F:
34042 case BUILT_IN_POWF:
34043 case BUILT_IN_TANHF:
34044 case BUILT_IN_TANF:
34045 case BUILT_IN_ATANF:
34046 case BUILT_IN_ATAN2F:
34047 case BUILT_IN_ATANHF:
34048 case BUILT_IN_CBRTF:
34049 case BUILT_IN_SINHF:
34050 case BUILT_IN_SINF:
34051 case BUILT_IN_ASINHF:
34052 case BUILT_IN_ASINF:
34053 case BUILT_IN_COSHF:
34054 case BUILT_IN_COSF:
34055 case BUILT_IN_ACOSHF:
34056 case BUILT_IN_ACOSF:
34057 if (el_mode != SFmode || n != 4)
34058 return NULL_TREE;
34059 break;
34060
34061 default:
34062 return NULL_TREE;
34063 }
34064
34065 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
34066
34067 if (fn == BUILT_IN_LOGF)
34068 strcpy (name, "vmlsLn4");
34069 else if (fn == BUILT_IN_LOG)
34070 strcpy (name, "vmldLn2");
34071 else if (n == 4)
34072 {
34073 sprintf (name, "vmls%s", bname+10);
34074 name[strlen (name)-1] = '4';
34075 }
34076 else
34077 sprintf (name, "vmld%s2", bname+10);
34078
34079 /* Convert the first letter of the math function name to uppercase. */
34080 name[4] &= ~0x20;
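/* For example (derived from the mangling above, not from SVML
   documentation): BUILT_IN_SINF ("__builtin_sinf") becomes "vmlsSin4"
   and BUILT_IN_SIN ("__builtin_sin") becomes "vmldSin2".  */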
34081
34082 arity = 0;
34083 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
34084 args;
34085 args = TREE_CHAIN (args))
34086 arity++;
34087
34088 if (arity == 1)
34089 fntype = build_function_type_list (type_out, type_in, NULL);
34090 else
34091 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
34092
34093 /* Build a function declaration for the vectorized function. */
34094 new_fndecl = build_decl (BUILTINS_LOCATION,
34095 FUNCTION_DECL, get_identifier (name), fntype);
34096 TREE_PUBLIC (new_fndecl) = 1;
34097 DECL_EXTERNAL (new_fndecl) = 1;
34098 DECL_IS_NOVOPS (new_fndecl) = 1;
34099 TREE_READONLY (new_fndecl) = 1;
34100
34101 return new_fndecl;
34102 }
34103
34104 /* Handler for an ACML-style interface to
34105 a library with vectorized intrinsics. */
34106
34107 static tree
34108 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
34109 {
34110 char name[20] = "__vr.._";
34111 tree fntype, new_fndecl, args;
34112 unsigned arity;
34113 const char *bname;
34114 enum machine_mode el_mode, in_mode;
34115 int n, in_n;
34116
34117 /* ACML is 64-bit only and suitable for unsafe math only, as it does
34118 not correctly support parts of IEEE arithmetic with the required
34119 precision, such as denormals. */
34120 if (!TARGET_64BIT
34121 || !flag_unsafe_math_optimizations)
34122 return NULL_TREE;
34123
34124 el_mode = TYPE_MODE (TREE_TYPE (type_out));
34125 n = TYPE_VECTOR_SUBPARTS (type_out);
34126 in_mode = TYPE_MODE (TREE_TYPE (type_in));
34127 in_n = TYPE_VECTOR_SUBPARTS (type_in);
34128 if (el_mode != in_mode
34129 || n != in_n)
34130 return NULL_TREE;
34131
34132 switch (fn)
34133 {
34134 case BUILT_IN_SIN:
34135 case BUILT_IN_COS:
34136 case BUILT_IN_EXP:
34137 case BUILT_IN_LOG:
34138 case BUILT_IN_LOG2:
34139 case BUILT_IN_LOG10:
34140 name[4] = 'd';
34141 name[5] = '2';
34142 if (el_mode != DFmode
34143 || n != 2)
34144 return NULL_TREE;
34145 break;
34146
34147 case BUILT_IN_SINF:
34148 case BUILT_IN_COSF:
34149 case BUILT_IN_EXPF:
34150 case BUILT_IN_POWF:
34151 case BUILT_IN_LOGF:
34152 case BUILT_IN_LOG2F:
34153 case BUILT_IN_LOG10F:
34154 name[4] = 's';
34155 name[5] = '4';
34156 if (el_mode != SFmode
34157 || n != 4)
34158 return NULL_TREE;
34159 break;
34160
34161 default:
34162 return NULL_TREE;
34163 }
34164
34165 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
34166 sprintf (name + 7, "%s", bname+10);
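/* For example (derived from the mangling above): BUILT_IN_SIN becomes
   "__vrd2_sin" and BUILT_IN_SINF becomes "__vrs4_sinf".  */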
34167
34168 arity = 0;
34169 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
34170 args;
34171 args = TREE_CHAIN (args))
34172 arity++;
34173
34174 if (arity == 1)
34175 fntype = build_function_type_list (type_out, type_in, NULL);
34176 else
34177 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
34178
34179 /* Build a function declaration for the vectorized function. */
34180 new_fndecl = build_decl (BUILTINS_LOCATION,
34181 FUNCTION_DECL, get_identifier (name), fntype);
34182 TREE_PUBLIC (new_fndecl) = 1;
34183 DECL_EXTERNAL (new_fndecl) = 1;
34184 DECL_IS_NOVOPS (new_fndecl) = 1;
34185 TREE_READONLY (new_fndecl) = 1;
34186
34187 return new_fndecl;
34188 }
34189
34190 /* Returns the decl of a function that implements a gather load with
34191 memory vector type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE,
34192 or NULL_TREE if it is not available. */
34193
34194 static tree
34195 ix86_vectorize_builtin_gather (const_tree mem_vectype,
34196 const_tree index_type, int scale)
34197 {
34198 bool si;
34199 enum ix86_builtins code;
34200
34201 if (! TARGET_AVX2)
34202 return NULL_TREE;
34203
34204 if ((TREE_CODE (index_type) != INTEGER_TYPE
34205 && !POINTER_TYPE_P (index_type))
34206 || (TYPE_MODE (index_type) != SImode
34207 && TYPE_MODE (index_type) != DImode))
34208 return NULL_TREE;
34209
34210 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
34211 return NULL_TREE;
34212
34213 /* The v*gather* insns sign-extend the index to pointer mode. */
34214 if (TYPE_PRECISION (index_type) < POINTER_SIZE
34215 && TYPE_UNSIGNED (index_type))
34216 return NULL_TREE;
34217
34218 if (scale <= 0
34219 || scale > 8
34220 || (scale & (scale - 1)) != 0)
34221 return NULL_TREE;
34222
34223 si = TYPE_MODE (index_type) == SImode;
34224 switch (TYPE_MODE (mem_vectype))
34225 {
34226 case V2DFmode:
34227 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
34228 break;
34229 case V4DFmode:
34230 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
34231 break;
34232 case V2DImode:
34233 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
34234 break;
34235 case V4DImode:
34236 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
34237 break;
34238 case V4SFmode:
34239 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
34240 break;
34241 case V8SFmode:
34242 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
34243 break;
34244 case V4SImode:
34245 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
34246 break;
34247 case V8SImode:
34248 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
34249 break;
34250 default:
34251 return NULL_TREE;
34252 }
34253
34254 return ix86_builtins[code];
34255 }
34256
34257 /* Returns the decl of a target-specific builtin that implements the
34258 reciprocal of the function FN, or NULL_TREE if not available. */
34259
34260 static tree
34261 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
34262 bool sqrt ATTRIBUTE_UNUSED)
34263 {
34264 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
34265 && flag_finite_math_only && !flag_trapping_math
34266 && flag_unsafe_math_optimizations))
34267 return NULL_TREE;
34268
34269 if (md_fn)
34270 /* Machine dependent builtins. */
34271 switch (fn)
34272 {
34273 /* Vectorized version of sqrt to rsqrt conversion. */
34274 case IX86_BUILTIN_SQRTPS_NR:
34275 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
34276
34277 case IX86_BUILTIN_SQRTPS_NR256:
34278 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
34279
34280 default:
34281 return NULL_TREE;
34282 }
34283 else
34284 /* Normal builtins. */
34285 switch (fn)
34286 {
34287 /* Sqrt to rsqrt conversion. */
34288 case BUILT_IN_SQRTF:
34289 return ix86_builtins[IX86_BUILTIN_RSQRTF];
34290
34291 default:
34292 return NULL_TREE;
34293 }
34294 }
34295 \f
34296 /* Helper for avx_vpermilps256_operand et al. This is also used by
34297 the expansion functions to turn the parallel back into a mask.
34298 The return value is 0 for no match and the imm8+1 for a match. */
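/* A worked example of the encoding below: for V4SFmode, a PARALLEL of
   (const_int 1) (const_int 0) (const_int 3) (const_int 2) uses two bits
   per element, giving mask = 1 | (0 << 2) | (3 << 4) | (2 << 6) = 0xb1,
   so the return value is 0xb2.  */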
34299
34300 int
34301 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
34302 {
34303 unsigned i, nelt = GET_MODE_NUNITS (mode);
34304 unsigned mask = 0;
34305 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
34306
34307 if (XVECLEN (par, 0) != (int) nelt)
34308 return 0;
34309
34310 /* Validate that all of the elements are constants, and not totally
34311 out of range. Copy the data into an integral array to make the
34312 subsequent checks easier. */
34313 for (i = 0; i < nelt; ++i)
34314 {
34315 rtx er = XVECEXP (par, 0, i);
34316 unsigned HOST_WIDE_INT ei;
34317
34318 if (!CONST_INT_P (er))
34319 return 0;
34320 ei = INTVAL (er);
34321 if (ei >= nelt)
34322 return 0;
34323 ipar[i] = ei;
34324 }
34325
34326 switch (mode)
34327 {
34328 case V4DFmode:
34329 /* In the 256-bit DFmode case, we can only move elements within
34330 a 128-bit lane. */
34331 for (i = 0; i < 2; ++i)
34332 {
34333 if (ipar[i] >= 2)
34334 return 0;
34335 mask |= ipar[i] << i;
34336 }
34337 for (i = 2; i < 4; ++i)
34338 {
34339 if (ipar[i] < 2)
34340 return 0;
34341 mask |= (ipar[i] - 2) << i;
34342 }
34343 break;
34344
34345 case V8SFmode:
34346 /* In the 256-bit SFmode case, we have full freedom of movement
34347 within the low 128-bit lane, but the high 128-bit lane must
34348 mirror the exact same pattern. */
34349 for (i = 0; i < 4; ++i)
34350 if (ipar[i] + 4 != ipar[i + 4])
34351 return 0;
34352 nelt = 4;
34353 /* FALLTHRU */
34354
34355 case V2DFmode:
34356 case V4SFmode:
34357 /* In the 128-bit case, we have full freedom in the placement of
34358 the elements from the source operand. */
34359 for (i = 0; i < nelt; ++i)
34360 mask |= ipar[i] << (i * (nelt / 2));
34361 break;
34362
34363 default:
34364 gcc_unreachable ();
34365 }
34366
34367 /* Make sure success has a non-zero value by adding one. */
34368 return mask + 1;
34369 }
34370
34371 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
34372 the expansion functions to turn the parallel back into a mask.
34373 The return value is 0 for no match and the imm8+1 for a match. */
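/* A worked example: for V8SFmode, a PARALLEL selecting elements
   8 9 10 11 0 1 2 3 of the two concatenated operands (the low lane of
   the second operand followed by the low lane of the first) gives
   mask = 2 | (0 << 4) = 0x02, so the return value is 0x03.  */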
34374
34375 int
34376 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
34377 {
34378 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
34379 unsigned mask = 0;
34380 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
34381
34382 if (XVECLEN (par, 0) != (int) nelt)
34383 return 0;
34384
34385 /* Validate that all of the elements are constants, and not totally
34386 out of range. Copy the data into an integral array to make the
34387 subsequent checks easier. */
34388 for (i = 0; i < nelt; ++i)
34389 {
34390 rtx er = XVECEXP (par, 0, i);
34391 unsigned HOST_WIDE_INT ei;
34392
34393 if (!CONST_INT_P (er))
34394 return 0;
34395 ei = INTVAL (er);
34396 if (ei >= 2 * nelt)
34397 return 0;
34398 ipar[i] = ei;
34399 }
34400
34401 /* Validate that each half of the permute is a run of consecutive indices. */
34402 for (i = 0; i < nelt2 - 1; ++i)
34403 if (ipar[i] + 1 != ipar[i + 1])
34404 return 0;
34405 for (i = nelt2; i < nelt - 1; ++i)
34406 if (ipar[i] + 1 != ipar[i + 1])
34407 return 0;
34408
34409 /* Reconstruct the mask. */
34410 for (i = 0; i < 2; ++i)
34411 {
34412 unsigned e = ipar[i * nelt2];
34413 if (e % nelt2)
34414 return 0;
34415 e /= nelt2;
34416 mask |= e << (i * 4);
34417 }
34418
34419 /* Make sure success has a non-zero value by adding one. */
34420 return mask + 1;
34421 }
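/* Worked example: for V8SFmode (nelt = 8, nelt2 = 4) the parallel
   [4 5 6 7 8 9 10 11] selects the high 128-bit half of the first source
   for the low half of the result and the low half of the second source
   for the high half.  Both halves are runs of consecutive indices, and
   the reconstruction gives e = 4/4 = 1 and e = 8/4 = 2, so
       mask = 1 | (2 << 4) = 0x21,
   and the function returns 0x22, i.e. the vperm2f128 imm8 plus one.  */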
34422 \f
34423 /* Store OPERAND to memory after reload is completed. This means
34424 that we can't easily use assign_stack_local. */
34425 rtx
34426 ix86_force_to_memory (enum machine_mode mode, rtx operand)
34427 {
34428 rtx result;
34429
34430 gcc_assert (reload_completed);
34431 if (ix86_using_red_zone ())
34432 {
34433 result = gen_rtx_MEM (mode,
34434 gen_rtx_PLUS (Pmode,
34435 stack_pointer_rtx,
34436 GEN_INT (-RED_ZONE_SIZE)));
34437 emit_move_insn (result, operand);
34438 }
34439 else if (TARGET_64BIT)
34440 {
34441 switch (mode)
34442 {
34443 case HImode:
34444 case SImode:
34445 operand = gen_lowpart (DImode, operand);
34446 /* FALLTHRU */
34447 case DImode:
34448 emit_insn (
34449 gen_rtx_SET (VOIDmode,
34450 gen_rtx_MEM (DImode,
34451 gen_rtx_PRE_DEC (DImode,
34452 stack_pointer_rtx)),
34453 operand));
34454 break;
34455 default:
34456 gcc_unreachable ();
34457 }
34458 result = gen_rtx_MEM (mode, stack_pointer_rtx);
34459 }
34460 else
34461 {
34462 switch (mode)
34463 {
34464 case DImode:
34465 {
34466 rtx operands[2];
34467 split_double_mode (mode, &operand, 1, operands, operands + 1);
34468 emit_insn (
34469 gen_rtx_SET (VOIDmode,
34470 gen_rtx_MEM (SImode,
34471 gen_rtx_PRE_DEC (Pmode,
34472 stack_pointer_rtx)),
34473 operands[1]));
34474 emit_insn (
34475 gen_rtx_SET (VOIDmode,
34476 gen_rtx_MEM (SImode,
34477 gen_rtx_PRE_DEC (Pmode,
34478 stack_pointer_rtx)),
34479 operands[0]));
34480 }
34481 break;
34482 case HImode:
34483 /* Store HImodes as SImodes. */
34484 operand = gen_lowpart (SImode, operand);
34485 /* FALLTHRU */
34486 case SImode:
34487 emit_insn (
34488 gen_rtx_SET (VOIDmode,
34489 gen_rtx_MEM (GET_MODE (operand),
34490 gen_rtx_PRE_DEC (SImode,
34491 stack_pointer_rtx)),
34492 operand));
34493 break;
34494 default:
34495 gcc_unreachable ();
34496 }
34497 result = gen_rtx_MEM (mode, stack_pointer_rtx);
34498 }
34499 return result;
34500 }
34501
34502 /* Free the operand from memory, undoing ix86_force_to_memory. */
34503 void
34504 ix86_free_from_memory (enum machine_mode mode)
34505 {
34506 if (!ix86_using_red_zone ())
34507 {
34508 int size;
34509
34510 if (mode == DImode || TARGET_64BIT)
34511 size = 8;
34512 else
34513 size = 4;
34514 /* Use LEA to deallocate stack space. In peephole2 it will be converted
34515 to a pop or add instruction if registers are available. */
34516 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
34517 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
34518 GEN_INT (size))));
34519 }
34520 }
34521
34522 /* Return a register priority for hard reg REGNO. */
34523 static int
34524 ix86_register_priority (int hard_regno)
34525 {
34526 /* ebp and r13 as base registers always want a displacement, and r12 as
34527 a base always wants an index. So discourage their use in an
34528 address. */
34529 if (hard_regno == R12_REG || hard_regno == R13_REG)
34530 return 0;
34531 if (hard_regno == BP_REG)
34532 return 1;
34533 /* New x86-64 int registers result in bigger code size. Discourage
34534 them. */
34535 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
34536 return 2;
34537 /* New x86-64 SSE registers result in bigger code size. Discourage
34538 them. */
34539 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
34540 return 2;
34541 /* Usage of AX register results in smaller code. Prefer it. */
34542 if (hard_regno == 0)
34543 return 4;
34544 return 3;
34545 }
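/* Taken together, the priorities above make the allocator prefer %eax
   (priority 4), then the remaining legacy registers (3), then the REX
   integer and SSE registers r8-r15 and xmm8-xmm15 (2), then %ebp (1),
   with %r12 and %r13 (0) least preferred, since they are the most
   expensive registers to use in an address.  */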
34546
34547 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
34548
34549 Put float CONST_DOUBLE in the constant pool instead of fp regs.
34550 QImode must go into class Q_REGS.
34551 Narrow ALL_REGS to GENERAL_REGS. This allows movsf and
34552 movdf to do mem-to-mem moves through integer regs. */
34553
34554 static reg_class_t
34555 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
34556 {
34557 enum machine_mode mode = GET_MODE (x);
34558
34559 /* We're only allowed to return a subclass of CLASS. Many of the
34560 following checks fail for NO_REGS, so eliminate that early. */
34561 if (regclass == NO_REGS)
34562 return NO_REGS;
34563
34564 /* All classes can load zeros. */
34565 if (x == CONST0_RTX (mode))
34566 return regclass;
34567
34568 /* Force constants into memory if we are loading a (nonzero) constant into
34569 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
34570 instructions to load from a constant. */
34571 if (CONSTANT_P (x)
34572 && (MAYBE_MMX_CLASS_P (regclass)
34573 || MAYBE_SSE_CLASS_P (regclass)
34574 || MAYBE_MASK_CLASS_P (regclass)))
34575 return NO_REGS;
34576
34577 /* Prefer SSE regs only, if we can use them for math. */
34578 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
34579 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
34580
34581 /* Floating-point constants need more complex checks. */
34582 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
34583 {
34584 /* General regs can load everything. */
34585 if (reg_class_subset_p (regclass, GENERAL_REGS))
34586 return regclass;
34587
34588 /* Floats can load 0 and 1 plus some others. Note that we eliminated
34589 zero above. We only want to wind up preferring 80387 registers if
34590 we plan on doing computation with them. */
34591 if (TARGET_80387
34592 && standard_80387_constant_p (x) > 0)
34593 {
34594 /* Limit class to non-sse. */
34595 if (regclass == FLOAT_SSE_REGS)
34596 return FLOAT_REGS;
34597 if (regclass == FP_TOP_SSE_REGS)
34598 return FP_TOP_REG;
34599 if (regclass == FP_SECOND_SSE_REGS)
34600 return FP_SECOND_REG;
34601 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
34602 return regclass;
34603 }
34604
34605 return NO_REGS;
34606 }
34607
34608 /* Generally when we see PLUS here, it's the function invariant
34609 (plus soft-fp const_int), which can only be computed into general
34610 regs. */
34611 if (GET_CODE (x) == PLUS)
34612 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
34613
34614 /* QImode constants are easy to load, but non-constant QImode data
34615 must go into Q_REGS. */
34616 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
34617 {
34618 if (reg_class_subset_p (regclass, Q_REGS))
34619 return regclass;
34620 if (reg_class_subset_p (Q_REGS, regclass))
34621 return Q_REGS;
34622 return NO_REGS;
34623 }
34624
34625 return regclass;
34626 }
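/* Two concrete consequences of the above: a nonzero constant destined for
   an SSE (or MMX or mask) class comes back as NO_REGS, so the constant is
   spilled to the constant pool and loaded from memory; and a non-constant
   QImode value requested in GENERAL_REGS is narrowed to Q_REGS, the
   registers whose low byte is directly addressable.  */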
34627
34628 /* Discourage putting floating-point values in SSE registers unless
34629 SSE math is being used, and likewise for the 387 registers. */
34630 static reg_class_t
34631 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
34632 {
34633 enum machine_mode mode = GET_MODE (x);
34634
34635 /* Restrict the output reload class to the register bank that we are doing
34636 math on. If we would like not to return a subset of CLASS, reject this
34637 alternative: if reload cannot do this, it will still use its choice. */
34639 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
34640 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
34641
34642 if (X87_FLOAT_MODE_P (mode))
34643 {
34644 if (regclass == FP_TOP_SSE_REGS)
34645 return FP_TOP_REG;
34646 else if (regclass == FP_SECOND_SSE_REGS)
34647 return FP_SECOND_REG;
34648 else
34649 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
34650 }
34651
34652 return regclass;
34653 }
34654
34655 static reg_class_t
34656 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
34657 enum machine_mode mode, secondary_reload_info *sri)
34658 {
34659 /* Double-word spills from general registers to non-offsettable memory
34660 references (zero-extended addresses) require special handling. */
34661 if (TARGET_64BIT
34662 && MEM_P (x)
34663 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
34664 && INTEGER_CLASS_P (rclass)
34665 && !offsettable_memref_p (x))
34666 {
34667 sri->icode = (in_p
34668 ? CODE_FOR_reload_noff_load
34669 : CODE_FOR_reload_noff_store);
34670 /* Add the cost of moving address to a temporary. */
34671 sri->extra_cost = 1;
34672
34673 return NO_REGS;
34674 }
34675
34676 /* QImode spills from non-QI registers require an
34677 intermediate register on 32-bit targets. */
34678 if (mode == QImode
34679 && (MAYBE_MASK_CLASS_P (rclass)
34680 || (!TARGET_64BIT && !in_p
34681 && INTEGER_CLASS_P (rclass)
34682 && MAYBE_NON_Q_CLASS_P (rclass))))
34683 {
34684 int regno;
34685
34686 if (REG_P (x))
34687 regno = REGNO (x);
34688 else
34689 regno = -1;
34690
34691 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
34692 regno = true_regnum (x);
34693
34694 /* Return Q_REGS if the operand is in memory. */
34695 if (regno == -1)
34696 return Q_REGS;
34697 }
34698
34699 /* This condition handles the corner case where an expression involving
34700 pointers gets vectorized. We're trying to use the address of a
34701 stack slot as a vector initializer.
34702
34703 (set (reg:V2DI 74 [ vect_cst_.2 ])
34704 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
34705
34706 Eventually frame gets turned into sp+offset like this:
34707
34708 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34709 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34710 (const_int 392 [0x188]))))
34711
34712 That later gets turned into:
34713
34714 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34715 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34716 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
34717
34718 We'll have the following reload recorded:
34719
34720 Reload 0: reload_in (DI) =
34721 (plus:DI (reg/f:DI 7 sp)
34722 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
34723 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34724 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
34725 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
34726 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34727 reload_reg_rtx: (reg:V2DI 22 xmm1)
34728
34729 That isn't going to work, since SSE instructions can't handle scalar
34730 additions. Returning GENERAL_REGS forces the addition into an integer
34731 register, and reload can handle subsequent reloads without problems. */
34732
34733 if (in_p && GET_CODE (x) == PLUS
34734 && SSE_CLASS_P (rclass)
34735 && SCALAR_INT_MODE_P (mode))
34736 return GENERAL_REGS;
34737
34738 return NO_REGS;
34739 }
34740
34741 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
34742
34743 static bool
34744 ix86_class_likely_spilled_p (reg_class_t rclass)
34745 {
34746 switch (rclass)
34747 {
34748 case AREG:
34749 case DREG:
34750 case CREG:
34751 case BREG:
34752 case AD_REGS:
34753 case SIREG:
34754 case DIREG:
34755 case SSE_FIRST_REG:
34756 case FP_TOP_REG:
34757 case FP_SECOND_REG:
34758 case BND_REGS:
34759 return true;
34760
34761 default:
34762 break;
34763 }
34764
34765 return false;
34766 }
34767
34768 /* If we are copying between general and FP registers, we need a memory
34769 location. The same is true for SSE and MMX registers.
34770
34771 To optimize register_move_cost performance, an inline variant is provided.
34772
34773 The macro can't work reliably when one of the CLASSES is a class containing
34774 registers from multiple units (SSE, MMX, integer). We avoid this by never
34775 combining those units in a single alternative in the machine description.
34776 Ensure that this constraint holds to avoid unexpected surprises.
34777
34778 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
34779 enforce these sanity checks. */
34780
34781 static inline bool
34782 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34783 enum machine_mode mode, int strict)
34784 {
34785 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
34786 return false;
34787 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
34788 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
34789 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
34790 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
34791 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
34792 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
34793 {
34794 gcc_assert (!strict || lra_in_progress);
34795 return true;
34796 }
34797
34798 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
34799 return true;
34800
34801 /* ??? This is a lie. We do have moves between mmx/general, and for
34802 mmx/sse2. But by saying we need secondary memory we discourage the
34803 register allocator from using the mmx registers unless needed. */
34804 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
34805 return true;
34806
34807 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34808 {
34809 /* SSE1 doesn't have any direct moves from other classes. */
34810 if (!TARGET_SSE2)
34811 return true;
34812
34813 /* If the target says that inter-unit moves are more expensive
34814 than moving through memory, then don't generate them. */
34815 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
34816 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
34817 return true;
34818
34819 /* Between SSE and general, we have moves no larger than word size. */
34820 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34821 return true;
34822 }
34823
34824 return false;
34825 }
34826
34827 bool
34828 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34829 enum machine_mode mode, int strict)
34830 {
34831 return inline_secondary_memory_needed (class1, class2, mode, strict);
34832 }
34833
34834 /* Implement the TARGET_CLASS_MAX_NREGS hook.
34835
34836 On the 80386, this is the size of MODE in words,
34837 except in the FP regs, where a single reg is always enough. */
34838
34839 static unsigned char
34840 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
34841 {
34842 if (MAYBE_INTEGER_CLASS_P (rclass))
34843 {
34844 if (mode == XFmode)
34845 return (TARGET_64BIT ? 2 : 3);
34846 else if (mode == XCmode)
34847 return (TARGET_64BIT ? 4 : 6);
34848 else
34849 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
34850 }
34851 else
34852 {
34853 if (COMPLEX_MODE_P (mode))
34854 return 2;
34855 else
34856 return 1;
34857 }
34858 }
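/* Some concrete values produced by the above: an XFmode value needs three
   general registers on ia32 but only two on x86-64 (the explicit constants
   above), a DImode value needs two general registers on ia32 and one on
   x86-64 via the generic size computation, and anything held in an x87,
   SSE or MMX class occupies a single register, or two for a complex
   mode.  */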
34859
34860 /* Return true if the registers in CLASS cannot represent the change from
34861 modes FROM to TO. */
34862
34863 bool
34864 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
34865 enum reg_class regclass)
34866 {
34867 if (from == to)
34868 return false;
34869
34870 /* x87 registers can't do subreg at all, as all values are reformatted
34871 to extended precision. */
34872 if (MAYBE_FLOAT_CLASS_P (regclass))
34873 return true;
34874
34875 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
34876 {
34877 /* Vector registers do not support QI or HImode loads. If we don't
34878 disallow a change to these modes, reload will assume it's ok to
34879 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
34880 the vec_dupv4hi pattern. */
34881 if (GET_MODE_SIZE (from) < 4)
34882 return true;
34883
34884 /* Vector registers do not support subreg with nonzero offsets, which
34885 are otherwise valid for integer registers. Since we can't see
34886 whether we have a nonzero offset from here, prohibit all
34887 nonparadoxical subregs changing size. */
34888 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
34889 return true;
34890 }
34891
34892 return false;
34893 }
34894
34895 /* Return the cost of moving data of mode M between a
34896 register and memory. A value of 2 is the default; this cost is
34897 relative to those in `REGISTER_MOVE_COST'.
34898
34899 This function is used extensively by register_move_cost, which is used to
34900 build tables at startup. Make it inline in this case.
34901 When IN is 2, return the maximum of the in and out move costs.
34902
34903 If moving between registers and memory is more expensive than
34904 between two registers, this hook should express the
34905 relative cost.
34906
34907 Also model the increased cost of moving QImode values in
34908 non-Q_REGS classes.
34909 */
34910 static inline int
34911 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
34912 int in)
34913 {
34914 int cost;
34915 if (FLOAT_CLASS_P (regclass))
34916 {
34917 int index;
34918 switch (mode)
34919 {
34920 case SFmode:
34921 index = 0;
34922 break;
34923 case DFmode:
34924 index = 1;
34925 break;
34926 case XFmode:
34927 index = 2;
34928 break;
34929 default:
34930 return 100;
34931 }
34932 if (in == 2)
34933 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
34934 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
34935 }
34936 if (SSE_CLASS_P (regclass))
34937 {
34938 int index;
34939 switch (GET_MODE_SIZE (mode))
34940 {
34941 case 4:
34942 index = 0;
34943 break;
34944 case 8:
34945 index = 1;
34946 break;
34947 case 16:
34948 index = 2;
34949 break;
34950 default:
34951 return 100;
34952 }
34953 if (in == 2)
34954 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
34955 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
34956 }
34957 if (MMX_CLASS_P (regclass))
34958 {
34959 int index;
34960 switch (GET_MODE_SIZE (mode))
34961 {
34962 case 4:
34963 index = 0;
34964 break;
34965 case 8:
34966 index = 1;
34967 break;
34968 default:
34969 return 100;
34970 }
34971 if (in == 2)
34972 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
34973 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
34974 }
34975 switch (GET_MODE_SIZE (mode))
34976 {
34977 case 1:
34978 if (Q_CLASS_P (regclass) || TARGET_64BIT)
34979 {
34980 if (!in)
34981 return ix86_cost->int_store[0];
34982 if (TARGET_PARTIAL_REG_DEPENDENCY
34983 && optimize_function_for_speed_p (cfun))
34984 cost = ix86_cost->movzbl_load;
34985 else
34986 cost = ix86_cost->int_load[0];
34987 if (in == 2)
34988 return MAX (cost, ix86_cost->int_store[0]);
34989 return cost;
34990 }
34991 else
34992 {
34993 if (in == 2)
34994 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
34995 if (in)
34996 return ix86_cost->movzbl_load;
34997 else
34998 return ix86_cost->int_store[0] + 4;
34999 }
35000 break;
35001 case 2:
35002 if (in == 2)
35003 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
35004 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
35005 default:
35006 /* Compute the number of word-size moves needed. TFmode is moved as XFmode. */
35007 if (mode == TFmode)
35008 mode = XFmode;
35009 if (in == 2)
35010 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
35011 else if (in)
35012 cost = ix86_cost->int_load[2];
35013 else
35014 cost = ix86_cost->int_store[2];
35015 return (cost * (((int) GET_MODE_SIZE (mode)
35016 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
35017 }
35018 }
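/* Worked example: a DFmode value in an SSE class falls into the 8-byte
   bucket (index 1), so loading it costs ix86_cost->sse_load[1], storing it
   costs ix86_cost->sse_store[1], and with IN == 2 the maximum of the two is
   returned, which is what the register-move code below uses when pricing a
   spill/reload round trip.  */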
35019
35020 static int
35021 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
35022 bool in)
35023 {
35024 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
35025 }
35026
35027
35028 /* Return the cost of moving data from a register in class CLASS1 to
35029 one in class CLASS2.
35030
35031 It is not required that the cost always equal 2 when FROM is the same as TO;
35032 on some machines it is expensive to move between registers if they are not
35033 general registers. */
35034
35035 static int
35036 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
35037 reg_class_t class2_i)
35038 {
35039 enum reg_class class1 = (enum reg_class) class1_i;
35040 enum reg_class class2 = (enum reg_class) class2_i;
35041
35042 /* In case we require secondary memory, compute the cost of the store followed
35043 by the load. In order to avoid bad register allocation choices, we need
35044 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
35045
35046 if (inline_secondary_memory_needed (class1, class2, mode, 0))
35047 {
35048 int cost = 1;
35049
35050 cost += inline_memory_move_cost (mode, class1, 2);
35051 cost += inline_memory_move_cost (mode, class2, 2);
35052
35053 /* When copying from a general purpose register we may emit multiple
35054 stores followed by a single load, causing a memory size mismatch stall.
35055 Count this as an arbitrarily high cost of 20. */
35056 if (targetm.class_max_nregs (class1, mode)
35057 > targetm.class_max_nregs (class2, mode))
35058 cost += 20;
35059
35060 /* In the case of FP/MMX moves, the registers actually overlap, and we
35061 have to switch modes in order to treat them differently. */
35062 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
35063 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
35064 cost += 20;
35065
35066 return cost;
35067 }
35068
35069 /* Moves between SSE/MMX and integer unit are expensive. */
35070 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
35071 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
35072
35073 /* ??? By keeping the returned value relatively high, we limit the number
35074 of moves between integer and MMX/SSE registers for all targets.
35075 Additionally, a high value prevents a problem with ix86_modes_tieable_p (),
35076 where integer modes in MMX/SSE registers are not tieable
35077 because of missing QImode and HImode moves to, from or between
35078 MMX/SSE registers. */
35079 return MAX (8, ix86_cost->mmxsse_to_integer);
35080
35081 if (MAYBE_FLOAT_CLASS_P (class1))
35082 return ix86_cost->fp_move;
35083 if (MAYBE_SSE_CLASS_P (class1))
35084 return ix86_cost->sse_move;
35085 if (MAYBE_MMX_CLASS_P (class1))
35086 return ix86_cost->mmx_move;
35087 return 2;
35088 }
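/* Example of the resulting costs: a DImode move between GENERAL_REGS and
   SSE_REGS on a 64-bit target with direct inter-unit moves enabled needs
   no secondary memory and is priced at MAX (8, ix86_cost->mmxsse_to_integer);
   with inter-unit moves disabled it takes the secondary-memory path and
   costs 1 plus the memory move cost of each class (each taken as the
   maximum of its load and store cost), so the allocator strongly prefers
   to keep such values within one register unit.  */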
35089
35090 /* Return TRUE if hard register REGNO can hold a value of machine-mode
35091 MODE. */
35092
35093 bool
35094 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
35095 {
35096 /* Flags, and only flags, can hold CCmode values. */
35097 if (CC_REGNO_P (regno))
35098 return GET_MODE_CLASS (mode) == MODE_CC;
35099 if (GET_MODE_CLASS (mode) == MODE_CC
35100 || GET_MODE_CLASS (mode) == MODE_RANDOM
35101 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
35102 return false;
35103 if (STACK_REGNO_P (regno))
35104 return VALID_FP_MODE_P (mode);
35105 if (MASK_REGNO_P (regno))
35106 return VALID_MASK_REG_MODE (mode);
35107 if (BND_REGNO_P (regno))
35108 return VALID_BND_REG_MODE (mode);
35109 if (SSE_REGNO_P (regno))
35110 {
35111 /* We implement the move patterns for all vector modes into and
35112 out of SSE registers, even when no operation instructions
35113 are available. */
35114
35115 /* For AVX-512 we allow, regardless of regno:
35116 - XI mode
35117 - any 512-bit wide vector mode
35118 - any scalar mode. */
35119 if (TARGET_AVX512F
35120 && (mode == XImode
35121 || VALID_AVX512F_REG_MODE (mode)
35122 || VALID_AVX512F_SCALAR_MODE (mode)))
35123 return true;
35124
35125 /* xmm16-xmm31 are only available for AVX-512. */
35126 if (EXT_REX_SSE_REGNO_P (regno))
35127 return false;
35128
35129 /* OImode move is available only when AVX is enabled. */
35130 return ((TARGET_AVX && mode == OImode)
35131 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
35132 || VALID_SSE_REG_MODE (mode)
35133 || VALID_SSE2_REG_MODE (mode)
35134 || VALID_MMX_REG_MODE (mode)
35135 || VALID_MMX_REG_MODE_3DNOW (mode));
35136 }
35137 if (MMX_REGNO_P (regno))
35138 {
35139 /* We implement the move patterns for 3DNOW modes even in MMX mode,
35140 so if the register is available at all, then we can move data of
35141 the given mode into or out of it. */
35142 return (VALID_MMX_REG_MODE (mode)
35143 || VALID_MMX_REG_MODE_3DNOW (mode));
35144 }
35145
35146 if (mode == QImode)
35147 {
35148 /* Take care with QImode values - they can be in non-QI regs,
35149 but then they can cause partial register stalls. */
35150 if (ANY_QI_REGNO_P (regno))
35151 return true;
35152 if (!TARGET_PARTIAL_REG_STALL)
35153 return true;
35154 /* LRA checks if the hard register is OK for the given mode.
35155 QImode values can live in non-QI regs, so we allow all
35156 registers here. */
35157 if (lra_in_progress)
35158 return true;
35159 return !can_create_pseudo_p ();
35160 }
35161 /* We handle both integers and floats in the general purpose registers. */
35162 else if (VALID_INT_MODE_P (mode))
35163 return true;
35164 else if (VALID_FP_MODE_P (mode))
35165 return true;
35166 else if (VALID_DFP_MODE_P (mode))
35167 return true;
35168 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
35169 on to use that value in smaller contexts, this can easily force a
35170 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
35171 supporting DImode, allow it. */
35172 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
35173 return true;
35174
35175 return false;
35176 }
35177
35178 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
35179 tieable integer mode. */
35180
35181 static bool
35182 ix86_tieable_integer_mode_p (enum machine_mode mode)
35183 {
35184 switch (mode)
35185 {
35186 case HImode:
35187 case SImode:
35188 return true;
35189
35190 case QImode:
35191 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
35192
35193 case DImode:
35194 return TARGET_64BIT;
35195
35196 default:
35197 return false;
35198 }
35199 }
35200
35201 /* Return true if MODE1 is accessible in a register that can hold MODE2
35202 without copying. That is, all register classes that can hold MODE2
35203 can also hold MODE1. */
35204
35205 bool
35206 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
35207 {
35208 if (mode1 == mode2)
35209 return true;
35210
35211 if (ix86_tieable_integer_mode_p (mode1)
35212 && ix86_tieable_integer_mode_p (mode2))
35213 return true;
35214
35215 /* MODE2 being XFmode implies fp stack or general regs, which means we
35216 can tie any smaller floating point modes to it. Note that we do not
35217 tie this with TFmode. */
35218 if (mode2 == XFmode)
35219 return mode1 == SFmode || mode1 == DFmode;
35220
35221 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
35222 that we can tie it with SFmode. */
35223 if (mode2 == DFmode)
35224 return mode1 == SFmode;
35225
35226 /* If MODE2 is only appropriate for an SSE register, then tie with
35227 any other mode acceptable to SSE registers. */
35228 if (GET_MODE_SIZE (mode2) == 32
35229 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
35230 return (GET_MODE_SIZE (mode1) == 32
35231 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
35232 if (GET_MODE_SIZE (mode2) == 16
35233 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
35234 return (GET_MODE_SIZE (mode1) == 16
35235 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
35236
35237 /* If MODE2 is appropriate for an MMX register, then tie
35238 with any other mode acceptable to MMX registers. */
35239 if (GET_MODE_SIZE (mode2) == 8
35240 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
35241 return (GET_MODE_SIZE (mode1) == 8
35242 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
35243
35244 return false;
35245 }
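/* For example: SFmode ties with DFmode (any register class that can hold
   DFmode can also hold SFmode), V2DFmode and V4SFmode tie with each other
   because both are 16 bytes wide and acceptable only to SSE registers,
   while XFmode, beyond itself, ties only with SFmode and DFmode and
   deliberately not with TFmode.  */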
35246
35247 /* Return the cost of moving between two registers of mode MODE. */
35248
35249 static int
35250 ix86_set_reg_reg_cost (enum machine_mode mode)
35251 {
35252 unsigned int units = UNITS_PER_WORD;
35253
35254 switch (GET_MODE_CLASS (mode))
35255 {
35256 default:
35257 break;
35258
35259 case MODE_CC:
35260 units = GET_MODE_SIZE (CCmode);
35261 break;
35262
35263 case MODE_FLOAT:
35264 if ((TARGET_SSE && mode == TFmode)
35265 || (TARGET_80387 && mode == XFmode)
35266 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
35267 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
35268 units = GET_MODE_SIZE (mode);
35269 break;
35270
35271 case MODE_COMPLEX_FLOAT:
35272 if ((TARGET_SSE && mode == TCmode)
35273 || (TARGET_80387 && mode == XCmode)
35274 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
35275 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
35276 units = GET_MODE_SIZE (mode);
35277 break;
35278
35279 case MODE_VECTOR_INT:
35280 case MODE_VECTOR_FLOAT:
35281 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
35282 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
35283 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
35284 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
35285 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
35286 units = GET_MODE_SIZE (mode);
35287 }
35288
35289 /* Return the cost of moving between two registers of mode MODE,
35290 assuming that the move will be in pieces of at most UNITS bytes. */
35291 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
35292 }
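/* A couple of data points for the formula above: a SET copying a V4SF
   register on an SSE target has units == 16 == the mode size, so it costs
   COSTS_N_INSNS (1); a TImode register copy on a 32-bit target falls into
   the default branch with units == UNITS_PER_WORD == 4 and therefore costs
   COSTS_N_INSNS (4), reflecting the four word-sized moves needed.  */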
35293
35294 /* Compute a (partial) cost for rtx X. Return true if the complete
35295 cost has been computed, and false if subexpressions should be
35296 scanned. In either case, *TOTAL contains the cost result. */
35297
35298 static bool
35299 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
35300 bool speed)
35301 {
35302 enum rtx_code code = (enum rtx_code) code_i;
35303 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
35304 enum machine_mode mode = GET_MODE (x);
35305 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
35306
35307 switch (code)
35308 {
35309 case SET:
35310 if (register_operand (SET_DEST (x), VOIDmode)
35311 && reg_or_0_operand (SET_SRC (x), VOIDmode))
35312 {
35313 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
35314 return true;
35315 }
35316 return false;
35317
35318 case CONST_INT:
35319 case CONST:
35320 case LABEL_REF:
35321 case SYMBOL_REF:
35322 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
35323 *total = 3;
35324 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
35325 *total = 2;
35326 else if (flag_pic && SYMBOLIC_CONST (x)
35327 && (!TARGET_64BIT
35328 || (GET_CODE (x) != LABEL_REF
35329 && (GET_CODE (x) != SYMBOL_REF
35330 || !SYMBOL_REF_LOCAL_P (x)))))
35331 *total = 1;
35332 else
35333 *total = 0;
35334 return true;
35335
35336 case CONST_DOUBLE:
35337 if (mode == VOIDmode)
35338 {
35339 *total = 0;
35340 return true;
35341 }
35342 switch (standard_80387_constant_p (x))
35343 {
35344 case 1: /* 0.0 */
35345 *total = 1;
35346 return true;
35347 default: /* Other constants */
35348 *total = 2;
35349 return true;
35350 case 0:
35351 case -1:
35352 break;
35353 }
35354 if (SSE_FLOAT_MODE_P (mode))
35355 {
35356 case CONST_VECTOR:
35357 switch (standard_sse_constant_p (x))
35358 {
35359 case 0:
35360 break;
35361 case 1: /* 0: xor eliminates false dependency */
35362 *total = 0;
35363 return true;
35364 default: /* -1: cmp contains false dependency */
35365 *total = 1;
35366 return true;
35367 }
35368 }
35369 /* Fall back to (MEM (SYMBOL_REF)), since that's where
35370 it'll probably end up. Add a penalty for size. */
35371 *total = (COSTS_N_INSNS (1)
35372 + (flag_pic != 0 && !TARGET_64BIT)
35373 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
35374 return true;
35375
35376 case ZERO_EXTEND:
35377 /* The zero extension is often completely free on x86_64, so make
35378 it as cheap as possible. */
35379 if (TARGET_64BIT && mode == DImode
35380 && GET_MODE (XEXP (x, 0)) == SImode)
35381 *total = 1;
35382 else if (TARGET_ZERO_EXTEND_WITH_AND)
35383 *total = cost->add;
35384 else
35385 *total = cost->movzx;
35386 return false;
35387
35388 case SIGN_EXTEND:
35389 *total = cost->movsx;
35390 return false;
35391
35392 case ASHIFT:
35393 if (SCALAR_INT_MODE_P (mode)
35394 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
35395 && CONST_INT_P (XEXP (x, 1)))
35396 {
35397 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
35398 if (value == 1)
35399 {
35400 *total = cost->add;
35401 return false;
35402 }
35403 if ((value == 2 || value == 3)
35404 && cost->lea <= cost->shift_const)
35405 {
35406 *total = cost->lea;
35407 return false;
35408 }
35409 }
35410 /* FALLTHRU */
35411
35412 case ROTATE:
35413 case ASHIFTRT:
35414 case LSHIFTRT:
35415 case ROTATERT:
35416 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35417 {
35418 /* ??? Should be SSE vector operation cost. */
35419 /* At least for published AMD latencies, this really is the same
35420 as the latency for a simple fpu operation like fabs. */
35421 /* V*QImode is emulated with 1-11 insns. */
35422 if (mode == V16QImode || mode == V32QImode)
35423 {
35424 int count = 11;
35425 if (TARGET_XOP && mode == V16QImode)
35426 {
35427 /* For XOP we use vpshab, which requires a broadcast of the
35428 value to the variable shift insn. For constants this
35429 means a V16QI const in mem; even when we can perform the
35430 shift with one insn, set the cost to prefer paddb. */
35431 if (CONSTANT_P (XEXP (x, 1)))
35432 {
35433 *total = (cost->fabs
35434 + rtx_cost (XEXP (x, 0), code, 0, speed)
35435 + (speed ? 2 : COSTS_N_BYTES (16)));
35436 return true;
35437 }
35438 count = 3;
35439 }
35440 else if (TARGET_SSSE3)
35441 count = 7;
35442 *total = cost->fabs * count;
35443 }
35444 else
35445 *total = cost->fabs;
35446 }
35447 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35448 {
35449 if (CONST_INT_P (XEXP (x, 1)))
35450 {
35451 if (INTVAL (XEXP (x, 1)) > 32)
35452 *total = cost->shift_const + COSTS_N_INSNS (2);
35453 else
35454 *total = cost->shift_const * 2;
35455 }
35456 else
35457 {
35458 if (GET_CODE (XEXP (x, 1)) == AND)
35459 *total = cost->shift_var * 2;
35460 else
35461 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
35462 }
35463 }
35464 else
35465 {
35466 if (CONST_INT_P (XEXP (x, 1)))
35467 *total = cost->shift_const;
35468 else if (GET_CODE (XEXP (x, 1)) == SUBREG
35469 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
35470 {
35471 /* Return the cost after shift-and truncation. */
35472 *total = cost->shift_var;
35473 return true;
35474 }
35475 else
35476 *total = cost->shift_var;
35477 }
35478 return false;
35479
35480 case FMA:
35481 {
35482 rtx sub;
35483
35484 gcc_assert (FLOAT_MODE_P (mode));
35485 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
35486
35487 /* ??? SSE scalar/vector cost should be used here. */
35488 /* ??? Bald assumption that fma has the same cost as fmul. */
35489 *total = cost->fmul;
35490 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
35491
35492 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
35493 sub = XEXP (x, 0);
35494 if (GET_CODE (sub) == NEG)
35495 sub = XEXP (sub, 0);
35496 *total += rtx_cost (sub, FMA, 0, speed);
35497
35498 sub = XEXP (x, 2);
35499 if (GET_CODE (sub) == NEG)
35500 sub = XEXP (sub, 0);
35501 *total += rtx_cost (sub, FMA, 2, speed);
35502 return true;
35503 }
35504
35505 case MULT:
35506 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35507 {
35508 /* ??? SSE scalar cost should be used here. */
35509 *total = cost->fmul;
35510 return false;
35511 }
35512 else if (X87_FLOAT_MODE_P (mode))
35513 {
35514 *total = cost->fmul;
35515 return false;
35516 }
35517 else if (FLOAT_MODE_P (mode))
35518 {
35519 /* ??? SSE vector cost should be used here. */
35520 *total = cost->fmul;
35521 return false;
35522 }
35523 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35524 {
35525 /* V*QImode is emulated with 7-13 insns. */
35526 if (mode == V16QImode || mode == V32QImode)
35527 {
35528 int extra = 11;
35529 if (TARGET_XOP && mode == V16QImode)
35530 extra = 5;
35531 else if (TARGET_SSSE3)
35532 extra = 6;
35533 *total = cost->fmul * 2 + cost->fabs * extra;
35534 }
35535 /* V*DImode is emulated with 5-8 insns. */
35536 else if (mode == V2DImode || mode == V4DImode)
35537 {
35538 if (TARGET_XOP && mode == V2DImode)
35539 *total = cost->fmul * 2 + cost->fabs * 3;
35540 else
35541 *total = cost->fmul * 3 + cost->fabs * 5;
35542 }
35543 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
35544 insns, including two PMULUDQ. */
35545 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
35546 *total = cost->fmul * 2 + cost->fabs * 5;
35547 else
35548 *total = cost->fmul;
35549 return false;
35550 }
35551 else
35552 {
35553 rtx op0 = XEXP (x, 0);
35554 rtx op1 = XEXP (x, 1);
35555 int nbits;
35556 if (CONST_INT_P (XEXP (x, 1)))
35557 {
35558 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
35559 for (nbits = 0; value != 0; value &= value - 1)
35560 nbits++;
35561 }
35562 else
35563 /* This is arbitrary. */
35564 nbits = 7;
35565
35566 /* Compute costs correctly for widening multiplication. */
35567 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
35568 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
35569 == GET_MODE_SIZE (mode))
35570 {
35571 int is_mulwiden = 0;
35572 enum machine_mode inner_mode = GET_MODE (op0);
35573
35574 if (GET_CODE (op0) == GET_CODE (op1))
35575 is_mulwiden = 1, op1 = XEXP (op1, 0);
35576 else if (CONST_INT_P (op1))
35577 {
35578 if (GET_CODE (op0) == SIGN_EXTEND)
35579 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
35580 == INTVAL (op1);
35581 else
35582 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
35583 }
35584
35585 if (is_mulwiden)
35586 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
35587 }
35588
35589 *total = (cost->mult_init[MODE_INDEX (mode)]
35590 + nbits * cost->mult_bit
35591 + rtx_cost (op0, outer_code, opno, speed)
35592 + rtx_cost (op1, outer_code, opno, speed));
35593
35594 return true;
35595 }
35596
35597 case DIV:
35598 case UDIV:
35599 case MOD:
35600 case UMOD:
35601 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35602 /* ??? SSE cost should be used here. */
35603 *total = cost->fdiv;
35604 else if (X87_FLOAT_MODE_P (mode))
35605 *total = cost->fdiv;
35606 else if (FLOAT_MODE_P (mode))
35607 /* ??? SSE vector cost should be used here. */
35608 *total = cost->fdiv;
35609 else
35610 *total = cost->divide[MODE_INDEX (mode)];
35611 return false;
35612
35613 case PLUS:
35614 if (GET_MODE_CLASS (mode) == MODE_INT
35615 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
35616 {
35617 if (GET_CODE (XEXP (x, 0)) == PLUS
35618 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
35619 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
35620 && CONSTANT_P (XEXP (x, 1)))
35621 {
35622 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
35623 if (val == 2 || val == 4 || val == 8)
35624 {
35625 *total = cost->lea;
35626 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35627 outer_code, opno, speed);
35628 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
35629 outer_code, opno, speed);
35630 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35631 return true;
35632 }
35633 }
35634 else if (GET_CODE (XEXP (x, 0)) == MULT
35635 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
35636 {
35637 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
35638 if (val == 2 || val == 4 || val == 8)
35639 {
35640 *total = cost->lea;
35641 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35642 outer_code, opno, speed);
35643 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35644 return true;
35645 }
35646 }
35647 else if (GET_CODE (XEXP (x, 0)) == PLUS)
35648 {
35649 *total = cost->lea;
35650 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35651 outer_code, opno, speed);
35652 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35653 outer_code, opno, speed);
35654 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35655 return true;
35656 }
35657 }
35658 /* FALLTHRU */
35659
35660 case MINUS:
35661 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35662 {
35663 /* ??? SSE cost should be used here. */
35664 *total = cost->fadd;
35665 return false;
35666 }
35667 else if (X87_FLOAT_MODE_P (mode))
35668 {
35669 *total = cost->fadd;
35670 return false;
35671 }
35672 else if (FLOAT_MODE_P (mode))
35673 {
35674 /* ??? SSE vector cost should be used here. */
35675 *total = cost->fadd;
35676 return false;
35677 }
35678 /* FALLTHRU */
35679
35680 case AND:
35681 case IOR:
35682 case XOR:
35683 if (GET_MODE_CLASS (mode) == MODE_INT
35684 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35685 {
35686 *total = (cost->add * 2
35687 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
35688 << (GET_MODE (XEXP (x, 0)) != DImode))
35689 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
35690 << (GET_MODE (XEXP (x, 1)) != DImode)));
35691 return true;
35692 }
35693 /* FALLTHRU */
35694
35695 case NEG:
35696 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35697 {
35698 /* ??? SSE cost should be used here. */
35699 *total = cost->fchs;
35700 return false;
35701 }
35702 else if (X87_FLOAT_MODE_P (mode))
35703 {
35704 *total = cost->fchs;
35705 return false;
35706 }
35707 else if (FLOAT_MODE_P (mode))
35708 {
35709 /* ??? SSE vector cost should be used here. */
35710 *total = cost->fchs;
35711 return false;
35712 }
35713 /* FALLTHRU */
35714
35715 case NOT:
35716 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35717 {
35718 /* ??? Should be SSE vector operation cost. */
35719 /* At least for published AMD latencies, this really is the same
35720 as the latency for a simple fpu operation like fabs. */
35721 *total = cost->fabs;
35722 }
35723 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35724 *total = cost->add * 2;
35725 else
35726 *total = cost->add;
35727 return false;
35728
35729 case COMPARE:
35730 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
35731 && XEXP (XEXP (x, 0), 1) == const1_rtx
35732 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
35733 && XEXP (x, 1) == const0_rtx)
35734 {
35735 /* This kind of construct is implemented using test[bwl].
35736 Treat it as if we had an AND. */
35737 *total = (cost->add
35738 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
35739 + rtx_cost (const1_rtx, outer_code, opno, speed));
35740 return true;
35741 }
35742 return false;
35743
35744 case FLOAT_EXTEND:
35745 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
35746 *total = 0;
35747 return false;
35748
35749 case ABS:
35750 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35751 /* ??? SSE cost should be used here. */
35752 *total = cost->fabs;
35753 else if (X87_FLOAT_MODE_P (mode))
35754 *total = cost->fabs;
35755 else if (FLOAT_MODE_P (mode))
35756 /* ??? SSE vector cost should be used here. */
35757 *total = cost->fabs;
35758 return false;
35759
35760 case SQRT:
35761 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35762 /* ??? SSE cost should be used here. */
35763 *total = cost->fsqrt;
35764 else if (X87_FLOAT_MODE_P (mode))
35765 *total = cost->fsqrt;
35766 else if (FLOAT_MODE_P (mode))
35767 /* ??? SSE vector cost should be used here. */
35768 *total = cost->fsqrt;
35769 return false;
35770
35771 case UNSPEC:
35772 if (XINT (x, 1) == UNSPEC_TP)
35773 *total = 0;
35774 return false;
35775
35776 case VEC_SELECT:
35777 case VEC_CONCAT:
35778 case VEC_MERGE:
35779 case VEC_DUPLICATE:
35780 /* ??? Assume all of these vector manipulation patterns are
35781 recognizable, in which case they all pretty much have the
35782 same cost. */
35783 *total = cost->fabs;
35784 return true;
35785
35786 default:
35787 return false;
35788 }
35789 }
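/* To see how these cases combine, consider
   (plus:SI (mult:SI (reg:SI) (const_int 4)) (reg:SI)):
   the PLUS case above recognizes the scale of 4 and charges cost->lea plus
   the cost of the two register operands, which is why such address
   arithmetic is usually folded into a single lea instruction rather than
   a separate shift and add.  */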
35790
35791 #if TARGET_MACHO
35792
35793 static int current_machopic_label_num;
35794
35795 /* Given a symbol name and its associated stub, write out the
35796 definition of the stub. */
35797
35798 void
35799 machopic_output_stub (FILE *file, const char *symb, const char *stub)
35800 {
35801 unsigned int length;
35802 char *binder_name, *symbol_name, lazy_ptr_name[32];
35803 int label = ++current_machopic_label_num;
35804
35805 /* For 64-bit we shouldn't get here. */
35806 gcc_assert (!TARGET_64BIT);
35807
35808 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
35809 symb = targetm.strip_name_encoding (symb);
35810
35811 length = strlen (stub);
35812 binder_name = XALLOCAVEC (char, length + 32);
35813 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
35814
35815 length = strlen (symb);
35816 symbol_name = XALLOCAVEC (char, length + 32);
35817 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
35818
35819 sprintf (lazy_ptr_name, "L%d$lz", label);
35820
35821 if (MACHOPIC_ATT_STUB)
35822 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
35823 else if (MACHOPIC_PURE)
35824 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
35825 else
35826 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
35827
35828 fprintf (file, "%s:\n", stub);
35829 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35830
35831 if (MACHOPIC_ATT_STUB)
35832 {
35833 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
35834 }
35835 else if (MACHOPIC_PURE)
35836 {
35837 /* PIC stub. */
35838 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35839 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
35840 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
35841 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
35842 label, lazy_ptr_name, label);
35843 fprintf (file, "\tjmp\t*%%ecx\n");
35844 }
35845 else
35846 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
35847
35848 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
35849 it needs no stub-binding-helper. */
35850 if (MACHOPIC_ATT_STUB)
35851 return;
35852
35853 fprintf (file, "%s:\n", binder_name);
35854
35855 if (MACHOPIC_PURE)
35856 {
35857 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
35858 fprintf (file, "\tpushl\t%%ecx\n");
35859 }
35860 else
35861 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
35862
35863 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
35864
35865 /* N.B. Keep the correspondence of these
35866 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
35867 old-pic/new-pic/non-pic stubs; altering this will break
35868 compatibility with existing dylibs. */
35869 if (MACHOPIC_PURE)
35870 {
35871 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35872 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
35873 }
35874 else
35875 /* 16-byte -mdynamic-no-pic stub. */
35876 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
35877
35878 fprintf (file, "%s:\n", lazy_ptr_name);
35879 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35880 fprintf (file, ASM_LONG "%s\n", binder_name);
35881 }
35882 #endif /* TARGET_MACHO */
35883
35884 /* Order the registers for register allocator. */
35885
35886 void
35887 x86_order_regs_for_local_alloc (void)
35888 {
35889 int pos = 0;
35890 int i;
35891
35892 /* First allocate the local general purpose registers. */
35893 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35894 if (GENERAL_REGNO_P (i) && call_used_regs[i])
35895 reg_alloc_order [pos++] = i;
35896
35897 /* Global general purpose registers. */
35898 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35899 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
35900 reg_alloc_order [pos++] = i;
35901
35902 /* x87 registers come first in case we are doing FP math
35903 using them. */
35904 if (!TARGET_SSE_MATH)
35905 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35906 reg_alloc_order [pos++] = i;
35907
35908 /* SSE registers. */
35909 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
35910 reg_alloc_order [pos++] = i;
35911 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
35912 reg_alloc_order [pos++] = i;
35913
35914 /* Extended REX SSE registers. */
35915 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
35916 reg_alloc_order [pos++] = i;
35917
35918 /* Mask registers. */
35919 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
35920 reg_alloc_order [pos++] = i;
35921
35922 /* MPX bound registers. */
35923 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
35924 reg_alloc_order [pos++] = i;
35925
35926 /* x87 registers. */
35927 if (TARGET_SSE_MATH)
35928 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35929 reg_alloc_order [pos++] = i;
35930
35931 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
35932 reg_alloc_order [pos++] = i;
35933
35934 /* Initialize the rest of the array, as we do not allocate some registers
35935 at all. */
35936 while (pos < FIRST_PSEUDO_REGISTER)
35937 reg_alloc_order [pos++] = 0;
35938 }
35939
35940 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
35941 in struct attribute_spec.handler. */
35942 static tree
35943 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
35944 tree args,
35945 int flags ATTRIBUTE_UNUSED,
35946 bool *no_add_attrs)
35947 {
35948 if (TREE_CODE (*node) != FUNCTION_TYPE
35949 && TREE_CODE (*node) != METHOD_TYPE
35950 && TREE_CODE (*node) != FIELD_DECL
35951 && TREE_CODE (*node) != TYPE_DECL)
35952 {
35953 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35954 name);
35955 *no_add_attrs = true;
35956 return NULL_TREE;
35957 }
35958 if (TARGET_64BIT)
35959 {
35960 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
35961 name);
35962 *no_add_attrs = true;
35963 return NULL_TREE;
35964 }
35965 if (is_attribute_p ("callee_pop_aggregate_return", name))
35966 {
35967 tree cst;
35968
35969 cst = TREE_VALUE (args);
35970 if (TREE_CODE (cst) != INTEGER_CST)
35971 {
35972 warning (OPT_Wattributes,
35973 "%qE attribute requires an integer constant argument",
35974 name);
35975 *no_add_attrs = true;
35976 }
35977 else if (compare_tree_int (cst, 0) != 0
35978 && compare_tree_int (cst, 1) != 0)
35979 {
35980 warning (OPT_Wattributes,
35981 "argument to %qE attribute is neither zero, nor one",
35982 name);
35983 *no_add_attrs = true;
35984 }
35985
35986 return NULL_TREE;
35987 }
35988
35989 return NULL_TREE;
35990 }
35991
35992 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
35993 struct attribute_spec.handler. */
35994 static tree
35995 ix86_handle_abi_attribute (tree *node, tree name,
35996 tree args ATTRIBUTE_UNUSED,
35997 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35998 {
35999 if (TREE_CODE (*node) != FUNCTION_TYPE
36000 && TREE_CODE (*node) != METHOD_TYPE
36001 && TREE_CODE (*node) != FIELD_DECL
36002 && TREE_CODE (*node) != TYPE_DECL)
36003 {
36004 warning (OPT_Wattributes, "%qE attribute only applies to functions",
36005 name);
36006 *no_add_attrs = true;
36007 return NULL_TREE;
36008 }
36009
36010 /* The ms_abi and sysv_abi attributes are mutually exclusive. */
36011 if (is_attribute_p ("ms_abi", name))
36012 {
36013 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
36014 {
36015 error ("ms_abi and sysv_abi attributes are not compatible");
36016 }
36017
36018 return NULL_TREE;
36019 }
36020 else if (is_attribute_p ("sysv_abi", name))
36021 {
36022 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
36023 {
36024 error ("ms_abi and sysv_abi attributes are not compatible");
36025 }
36026
36027 return NULL_TREE;
36028 }
36029
36030 return NULL_TREE;
36031 }
36032
36033 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
36034 struct attribute_spec.handler. */
36035 static tree
36036 ix86_handle_struct_attribute (tree *node, tree name,
36037 tree args ATTRIBUTE_UNUSED,
36038 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
36039 {
36040 tree *type = NULL;
36041 if (DECL_P (*node))
36042 {
36043 if (TREE_CODE (*node) == TYPE_DECL)
36044 type = &TREE_TYPE (*node);
36045 }
36046 else
36047 type = node;
36048
36049 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
36050 {
36051 warning (OPT_Wattributes, "%qE attribute ignored",
36052 name);
36053 *no_add_attrs = true;
36054 }
36055
36056 else if ((is_attribute_p ("ms_struct", name)
36057 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
36058 || ((is_attribute_p ("gcc_struct", name)
36059 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
36060 {
36061 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
36062 name);
36063 *no_add_attrs = true;
36064 }
36065
36066 return NULL_TREE;
36067 }
36068
36069 static tree
36070 ix86_handle_fndecl_attribute (tree *node, tree name,
36071 tree args ATTRIBUTE_UNUSED,
36072 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
36073 {
36074 if (TREE_CODE (*node) != FUNCTION_DECL)
36075 {
36076 warning (OPT_Wattributes, "%qE attribute only applies to functions",
36077 name);
36078 *no_add_attrs = true;
36079 }
36080 return NULL_TREE;
36081 }
36082
36083 static bool
36084 ix86_ms_bitfield_layout_p (const_tree record_type)
36085 {
36086 return ((TARGET_MS_BITFIELD_LAYOUT
36087 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
36088 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
36089 }
36090
36091 /* Returns an expression indicating where the this parameter is
36092 located on entry to the FUNCTION. */
36093
36094 static rtx
36095 x86_this_parameter (tree function)
36096 {
36097 tree type = TREE_TYPE (function);
36098 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
36099 int nregs;
36100
36101 if (TARGET_64BIT)
36102 {
36103 const int *parm_regs;
36104
36105 if (ix86_function_type_abi (type) == MS_ABI)
36106 parm_regs = x86_64_ms_abi_int_parameter_registers;
36107 else
36108 parm_regs = x86_64_int_parameter_registers;
36109 return gen_rtx_REG (Pmode, parm_regs[aggr]);
36110 }
36111
36112 nregs = ix86_function_regparm (type, function);
36113
36114 if (nregs > 0 && !stdarg_p (type))
36115 {
36116 int regno;
36117 unsigned int ccvt = ix86_get_callcvt (type);
36118
36119 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
36120 regno = aggr ? DX_REG : CX_REG;
36121 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
36122 {
36123 regno = CX_REG;
36124 if (aggr)
36125 return gen_rtx_MEM (SImode,
36126 plus_constant (Pmode, stack_pointer_rtx, 4));
36127 }
36128 else
36129 {
36130 regno = AX_REG;
36131 if (aggr)
36132 {
36133 regno = DX_REG;
36134 if (nregs == 1)
36135 return gen_rtx_MEM (SImode,
36136 plus_constant (Pmode,
36137 stack_pointer_rtx, 4));
36138 }
36139 }
36140 return gen_rtx_REG (SImode, regno);
36141 }
36142
36143 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
36144 aggr ? 8 : 4));
36145 }
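/* Concretely, on 32-bit targets: a fastcall method finds `this' in %ecx,
   or in %edx when a hidden aggregate-return pointer occupies %ecx; a
   thiscall method uses %ecx, except that with an aggregate return `this'
   is found at 4(%esp); and without any register-passing convention
   `this' is read from 4(%esp), or 8(%esp) when the aggregate return
   pointer is pushed first.  */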
36146
36147 /* Determine whether x86_output_mi_thunk can succeed. */
36148
36149 static bool
36150 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
36151 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
36152 HOST_WIDE_INT vcall_offset, const_tree function)
36153 {
36154 /* 64-bit can handle anything. */
36155 if (TARGET_64BIT)
36156 return true;
36157
36158 /* For 32-bit, everything's fine if we have one free register. */
36159 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
36160 return true;
36161
36162 /* Need a free register for vcall_offset. */
36163 if (vcall_offset)
36164 return false;
36165
36166 /* Need a free register for GOT references. */
36167 if (flag_pic && !targetm.binds_local_p (function))
36168 return false;
36169
36170 /* Otherwise ok. */
36171 return true;
36172 }
36173
36174 /* Output the assembler code for a thunk function. THUNK_DECL is the
36175 declaration for the thunk function itself, FUNCTION is the decl for
36176 the target function. DELTA is an immediate constant offset to be
36177 added to THIS. If VCALL_OFFSET is nonzero, the word at
36178 *(*this + vcall_offset) should be added to THIS. */
36179
36180 static void
36181 x86_output_mi_thunk (FILE *file,
36182 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
36183 HOST_WIDE_INT vcall_offset, tree function)
36184 {
36185 rtx this_param = x86_this_parameter (function);
36186 rtx this_reg, tmp, fnaddr;
36187 unsigned int tmp_regno;
36188
36189 if (TARGET_64BIT)
36190 tmp_regno = R10_REG;
36191 else
36192 {
36193 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
36194 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
36195 tmp_regno = AX_REG;
36196 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
36197 tmp_regno = DX_REG;
36198 else
36199 tmp_regno = CX_REG;
36200 }
36201
36202 emit_note (NOTE_INSN_PROLOGUE_END);
36203
36204 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
36205 pull it in now and let DELTA benefit. */
36206 if (REG_P (this_param))
36207 this_reg = this_param;
36208 else if (vcall_offset)
36209 {
36210 /* Put the this parameter into %eax. */
36211 this_reg = gen_rtx_REG (Pmode, AX_REG);
36212 emit_move_insn (this_reg, this_param);
36213 }
36214 else
36215 this_reg = NULL_RTX;
36216
36217 /* Adjust the this parameter by a fixed constant. */
36218 if (delta)
36219 {
36220 rtx delta_rtx = GEN_INT (delta);
36221 rtx delta_dst = this_reg ? this_reg : this_param;
36222
36223 if (TARGET_64BIT)
36224 {
36225 if (!x86_64_general_operand (delta_rtx, Pmode))
36226 {
36227 tmp = gen_rtx_REG (Pmode, tmp_regno);
36228 emit_move_insn (tmp, delta_rtx);
36229 delta_rtx = tmp;
36230 }
36231 }
36232
36233 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
36234 }
36235
36236 /* Adjust the this parameter by a value stored in the vtable. */
36237 if (vcall_offset)
36238 {
36239 rtx vcall_addr, vcall_mem, this_mem;
36240
36241 tmp = gen_rtx_REG (Pmode, tmp_regno);
36242
36243 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
36244 if (Pmode != ptr_mode)
36245 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
36246 emit_move_insn (tmp, this_mem);
36247
36248 /* Adjust the this parameter. */
36249 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
36250 if (TARGET_64BIT
36251 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
36252 {
36253 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
36254 emit_move_insn (tmp2, GEN_INT (vcall_offset));
36255 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
36256 }
36257
36258 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
36259 if (Pmode != ptr_mode)
36260 emit_insn (gen_addsi_1_zext (this_reg,
36261 gen_rtx_REG (ptr_mode,
36262 REGNO (this_reg)),
36263 vcall_mem));
36264 else
36265 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
36266 }
36267
36268 /* If necessary, drop THIS back to its stack slot. */
36269 if (this_reg && this_reg != this_param)
36270 emit_move_insn (this_param, this_reg);
36271
36272 fnaddr = XEXP (DECL_RTL (function), 0);
36273 if (TARGET_64BIT)
36274 {
36275 if (!flag_pic || targetm.binds_local_p (function)
36276 || TARGET_PECOFF)
36277 ;
36278 else
36279 {
36280 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
36281 tmp = gen_rtx_CONST (Pmode, tmp);
36282 fnaddr = gen_rtx_MEM (Pmode, tmp);
36283 }
36284 }
36285 else
36286 {
36287 if (!flag_pic || targetm.binds_local_p (function))
36288 ;
36289 #if TARGET_MACHO
36290 else if (TARGET_MACHO)
36291 {
36292 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
36293 fnaddr = XEXP (fnaddr, 0);
36294 }
36295 #endif /* TARGET_MACHO */
36296 else
36297 {
36298 tmp = gen_rtx_REG (Pmode, CX_REG);
36299 output_set_got (tmp, NULL_RTX);
36300
36301 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
36302 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
36303 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
36304 }
36305 }
36306
36307 /* Our sibling call patterns do not allow memories, because we have no
36308 predicate that can distinguish between frame and non-frame memory.
36309 For our purposes here, we can get away with (ab)using a jump pattern,
36310 because we're going to do no optimization. */
36311 if (MEM_P (fnaddr))
36312 emit_jump_insn (gen_indirect_jump (fnaddr));
36313 else
36314 {
36315 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
36316 fnaddr = legitimize_pic_address (fnaddr,
36317 gen_rtx_REG (Pmode, tmp_regno));
36318
36319 if (!sibcall_insn_operand (fnaddr, word_mode))
36320 {
36321 tmp = gen_rtx_REG (word_mode, tmp_regno);
36322 if (GET_MODE (fnaddr) != word_mode)
36323 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
36324 emit_move_insn (tmp, fnaddr);
36325 fnaddr = tmp;
36326 }
36327
36328 tmp = gen_rtx_MEM (QImode, fnaddr);
36329 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
36330 tmp = emit_call_insn (tmp);
36331 SIBLING_CALL_P (tmp) = 1;
36332 }
36333 emit_barrier ();
36334
36335 /* Emit just enough of rest_of_compilation to get the insns emitted.
36336 Note that use_thunk calls assemble_start_function et al. */
36337 tmp = get_insns ();
36338 shorten_branches (tmp);
36339 final_start_function (tmp, file, 1);
36340 final (tmp, file, 1);
36341 final_end_function ();
36342 }
36343
36344 static void
36345 x86_file_start (void)
36346 {
36347 default_file_start ();
36348 #if TARGET_MACHO
36349 darwin_file_start ();
36350 #endif
36351 if (X86_FILE_START_VERSION_DIRECTIVE)
36352 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
36353 if (X86_FILE_START_FLTUSED)
36354 fputs ("\t.global\t__fltused\n", asm_out_file);
36355 if (ix86_asm_dialect == ASM_INTEL)
36356 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
36357 }
36358
36359 int
36360 x86_field_alignment (tree field, int computed)
36361 {
36362 enum machine_mode mode;
36363 tree type = TREE_TYPE (field);
36364
36365 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
36366 return computed;
36367 mode = TYPE_MODE (strip_array_types (type));
36368 if (mode == DFmode || mode == DCmode
36369 || GET_MODE_CLASS (mode) == MODE_INT
36370 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
36371 return MIN (32, computed);
36372 return computed;
36373 }
36374
36375 /* Output assembler code to FILE to increment profiler label # LABELNO
36376 for profiling a function entry. */
36377 void
36378 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
36379 {
36380 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
36381 : MCOUNT_NAME);
36382
36383 if (TARGET_64BIT)
36384 {
36385 #ifndef NO_PROFILE_COUNTERS
36386 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
36387 #endif
36388
36389 if (!TARGET_PECOFF && flag_pic)
36390 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
36391 else
36392 fprintf (file, "\tcall\t%s\n", mcount_name);
36393 }
36394 else if (flag_pic)
36395 {
36396 #ifndef NO_PROFILE_COUNTERS
36397 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
36398 LPREFIX, labelno);
36399 #endif
36400 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
36401 }
36402 else
36403 {
36404 #ifndef NO_PROFILE_COUNTERS
36405 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
36406 LPREFIX, labelno);
36407 #endif
36408 fprintf (file, "\tcall\t%s\n", mcount_name);
36409 }
36410 }
36411
36412 /* We don't have exact information about the insn sizes, but we may assume
36413 quite safely that we are informed about all 1 byte insns and memory
36414 address sizes. This is enough to eliminate unnecessary padding in
36415 99% of cases. */
36416
36417 static int
36418 min_insn_size (rtx insn)
36419 {
36420 int l = 0, len;
36421
36422 if (!INSN_P (insn) || !active_insn_p (insn))
36423 return 0;
36424
36425 /* Discard alignments we've emitted and jump instructions. */
36426 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
36427 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
36428 return 0;
36429
36430 /* Important case - calls are always 5 bytes.
36431 It is common to have many calls in a row. */
36432 if (CALL_P (insn)
36433 && symbolic_reference_mentioned_p (PATTERN (insn))
36434 && !SIBLING_CALL_P (insn))
36435 return 5;
36436 len = get_attr_length (insn);
36437 if (len <= 1)
36438 return 1;
36439
36440 /* For normal instructions we rely on get_attr_length being exact,
36441 with a few exceptions. */
36442 if (!JUMP_P (insn))
36443 {
36444 enum attr_type type = get_attr_type (insn);
36445
36446 switch (type)
36447 {
36448 case TYPE_MULTI:
36449 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
36450 || asm_noperands (PATTERN (insn)) >= 0)
36451 return 0;
36452 break;
36453 case TYPE_OTHER:
36454 case TYPE_FCMP:
36455 break;
36456 default:
36457 /* Otherwise trust get_attr_length. */
36458 return len;
36459 }
36460
36461 l = get_attr_length_address (insn);
36462 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
36463 l = 4;
36464 }
36465 if (l)
36466 return 1+l;
36467 else
36468 return 2;
36469 }
36470
36471 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36472
36473 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
36474 window. */
36475
36476 static void
36477 ix86_avoid_jump_mispredicts (void)
36478 {
36479 rtx insn, start = get_insns ();
36480 int nbytes = 0, njumps = 0;
36481 int isjump = 0;
36482
36483 /* Look for all minimal intervals of instructions containing 4 jumps.
36484 The intervals are bounded by START and INSN. NBYTES is the total
36485 size of the instructions in the interval, including INSN but not
36486 START. When NBYTES is smaller than 16, START and INSN may end up
36487 in the same 16-byte window.
36488
36489 The smallest offset at which INSN can start within that window is the
36490 case where START ends at offset 0; INSN then starts at offset
36491 NBYTES - sizeof (INSN). We therefore emit a p2align to a 16-byte
36492 boundary with a max skip of 15 - NBYTES + sizeof (INSN). */
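/* As a concrete instance of the arithmetic above: with NBYTES == 12 and a
   2-byte jump as the current INSN, the requested p2align has a max skip of
   15 - 12 + 2 = 5, i.e. the alignment is emitted only when at most 5 bytes
   of padding are needed to reach the next 16-byte boundary.  */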
36493 for (insn = start; insn; insn = NEXT_INSN (insn))
36494 {
36495 int min_size;
36496
36497 if (LABEL_P (insn))
36498 {
36499 int align = label_to_alignment (insn);
36500 int max_skip = label_to_max_skip (insn);
36501
36502 if (max_skip > 15)
36503 max_skip = 15;
36504 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
36505 already in the current 16-byte window, because otherwise
36506 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
36507 bytes to reach a 16-byte boundary. */
36508 if (align <= 0
36509 || (align <= 3 && max_skip != (1 << align) - 1))
36510 max_skip = 0;
36511 if (dump_file)
36512 fprintf (dump_file, "Label %i with max_skip %i\n",
36513 INSN_UID (insn), max_skip);
36514 if (max_skip)
36515 {
36516 while (nbytes + max_skip >= 16)
36517 {
36518 start = NEXT_INSN (start);
36519 if (JUMP_P (start) || CALL_P (start))
36520 njumps--, isjump = 1;
36521 else
36522 isjump = 0;
36523 nbytes -= min_insn_size (start);
36524 }
36525 }
36526 continue;
36527 }
36528
36529 min_size = min_insn_size (insn);
36530 nbytes += min_size;
36531 if (dump_file)
36532 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
36533 INSN_UID (insn), min_size);
36534 if (JUMP_P (insn) || CALL_P (insn))
36535 njumps++;
36536 else
36537 continue;
36538
36539 while (njumps > 3)
36540 {
36541 start = NEXT_INSN (start);
36542 if (JUMP_P (start) || CALL_P (start))
36543 njumps--, isjump = 1;
36544 else
36545 isjump = 0;
36546 nbytes -= min_insn_size (start);
36547 }
36548 gcc_assert (njumps >= 0);
36549 if (dump_file)
36550 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
36551 INSN_UID (start), INSN_UID (insn), nbytes);
36552
36553 if (njumps == 3 && isjump && nbytes < 16)
36554 {
36555 int padsize = 15 - nbytes + min_insn_size (insn);
36556
36557 if (dump_file)
36558 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
36559 INSN_UID (insn), padsize);
36560 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
36561 }
36562 }
36563 }
36564 #endif
36565
36566 /* AMD Athlon works faster
36567 when RET is not the destination of a conditional jump or directly preceded
36568 by another jump instruction. We avoid the penalty by inserting a NOP just
36569 before the RET instruction in such cases. */
36570 static void
36571 ix86_pad_returns (void)
36572 {
36573 edge e;
36574 edge_iterator ei;
36575
36576 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
36577 {
36578 basic_block bb = e->src;
36579 rtx ret = BB_END (bb);
36580 rtx prev;
36581 bool replace = false;
36582
36583 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
36584 || optimize_bb_for_size_p (bb))
36585 continue;
36586 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
36587 if (active_insn_p (prev) || LABEL_P (prev))
36588 break;
36589 if (prev && LABEL_P (prev))
36590 {
36591 edge e;
36592 edge_iterator ei;
36593
36594 FOR_EACH_EDGE (e, ei, bb->preds)
36595 if (EDGE_FREQUENCY (e) && e->src->index >= 0
36596 && !(e->flags & EDGE_FALLTHRU))
36597 {
36598 replace = true;
36599 break;
36600 }
36601 }
36602 if (!replace)
36603 {
36604 prev = prev_active_insn (ret);
36605 if (prev
36606 && ((JUMP_P (prev) && any_condjump_p (prev))
36607 || CALL_P (prev)))
36608 replace = true;
36609 /* Empty functions get a branch mispredict even when
36610 the jump destination is not visible to us. */
36611 if (!prev && !optimize_function_for_size_p (cfun))
36612 replace = true;
36613 }
36614 if (replace)
36615 {
36616 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
36617 delete_insn (ret);
36618 }
36619 }
36620 }
36621
36622 /* Count the minimum number of instructions in BB. Return 4 if the
36623 number of instructions >= 4. */
36624
36625 static int
36626 ix86_count_insn_bb (basic_block bb)
36627 {
36628 rtx insn;
36629 int insn_count = 0;
36630
36631 /* Count number of instructions in this block. Return 4 if the number
36632 of instructions >= 4. */
36633 FOR_BB_INSNS (bb, insn)
36634 {
36635 /* This only happens in exit blocks. */
36636 if (JUMP_P (insn)
36637 && ANY_RETURN_P (PATTERN (insn)))
36638 break;
36639
36640 if (NONDEBUG_INSN_P (insn)
36641 && GET_CODE (PATTERN (insn)) != USE
36642 && GET_CODE (PATTERN (insn)) != CLOBBER)
36643 {
36644 insn_count++;
36645 if (insn_count >= 4)
36646 return insn_count;
36647 }
36648 }
36649
36650 return insn_count;
36651 }
36652
36653
36654 /* Count the minimum number of instructions in a code path ending in BB.
36655 Return 4 if the number of instructions >= 4. */
36656
36657 static int
36658 ix86_count_insn (basic_block bb)
36659 {
36660 edge e;
36661 edge_iterator ei;
36662 int min_prev_count;
36663
36664 /* Only bother counting instructions along paths with no
36665 more than 2 basic blocks between entry and exit. Given
36666 that BB has an edge to exit, determine if a predecessor
36667 of BB has an edge from entry. If so, compute the number
36668 of instructions in the predecessor block. If there
36669 happen to be multiple such blocks, compute the minimum. */
36670 min_prev_count = 4;
36671 FOR_EACH_EDGE (e, ei, bb->preds)
36672 {
36673 edge prev_e;
36674 edge_iterator prev_ei;
36675
36676 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
36677 {
36678 min_prev_count = 0;
36679 break;
36680 }
36681 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
36682 {
36683 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
36684 {
36685 int count = ix86_count_insn_bb (e->src);
36686 if (count < min_prev_count)
36687 min_prev_count = count;
36688 break;
36689 }
36690 }
36691 }
36692
36693 if (min_prev_count < 4)
36694 min_prev_count += ix86_count_insn_bb (bb);
36695
36696 return min_prev_count;
36697 }
36698
36699 /* Pad short function to 4 instructions. */
36700
36701 static void
36702 ix86_pad_short_function (void)
36703 {
36704 edge e;
36705 edge_iterator ei;
36706
36707 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
36708 {
36709 rtx ret = BB_END (e->src);
36710 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
36711 {
36712 int insn_count = ix86_count_insn (e->src);
36713
36714 /* Pad short function. */
36715 if (insn_count < 4)
36716 {
36717 rtx insn = ret;
36718
36719 /* Find epilogue. */
36720 while (insn
36721 && (!NOTE_P (insn)
36722 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
36723 insn = PREV_INSN (insn);
36724
36725 if (!insn)
36726 insn = ret;
36727
36728 /* Two NOPs count as one instruction. */
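/* For instance, a body with a single counted insn receives
   2 * (4 - 1) = 6 NOPs, which by the rule above amount to three
   instructions and bring the path up to the minimum of four.  */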
36729 insn_count = 2 * (4 - insn_count);
36730 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
36731 }
36732 }
36733 }
36734 }
36735
36736 /* Fix up a Windows system unwinder issue. If an EH region falls through into
36737 the epilogue, the Windows system unwinder will apply epilogue logic and
36738 produce incorrect offsets. This can be avoided by adding a nop between
36739 the last insn that can throw and the first insn of the epilogue. */
36740
36741 static void
36742 ix86_seh_fixup_eh_fallthru (void)
36743 {
36744 edge e;
36745 edge_iterator ei;
36746
36747 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
36748 {
36749 rtx insn, next;
36750
36751 /* Find the beginning of the epilogue. */
36752 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
36753 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
36754 break;
36755 if (insn == NULL)
36756 continue;
36757
36758 /* We only care about preceding insns that can throw. */
36759 insn = prev_active_insn (insn);
36760 if (insn == NULL || !can_throw_internal (insn))
36761 continue;
36762
36763 /* Do not separate calls from their debug information. */
36764 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
36765 if (NOTE_P (next)
36766 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
36767 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
36768 insn = next;
36769 else
36770 break;
36771
36772 emit_insn_after (gen_nops (const1_rtx), insn);
36773 }
36774 }
36775
36776 /* Implement machine specific optimizations. We implement padding of returns
36777 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
36778 static void
36779 ix86_reorg (void)
36780 {
36781 /* We are freeing block_for_insn in the toplev to keep compatibility
36782 with old MDEP_REORGS that are not CFG based. Recompute it now. */
36783 compute_bb_for_insn ();
36784
36785 if (TARGET_SEH && current_function_has_exception_handlers ())
36786 ix86_seh_fixup_eh_fallthru ();
36787
36788 if (optimize && optimize_function_for_speed_p (cfun))
36789 {
36790 if (TARGET_PAD_SHORT_FUNCTION)
36791 ix86_pad_short_function ();
36792 else if (TARGET_PAD_RETURNS)
36793 ix86_pad_returns ();
36794 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36795 if (TARGET_FOUR_JUMP_LIMIT)
36796 ix86_avoid_jump_mispredicts ();
36797 #endif
36798 }
36799 }
36800
36801 /* Return nonzero when a QImode register that must be represented via a REX
36802 prefix is used. */
36803 bool
36804 x86_extended_QIreg_mentioned_p (rtx insn)
36805 {
36806 int i;
36807 extract_insn_cached (insn);
36808 for (i = 0; i < recog_data.n_operands; i++)
36809 if (GENERAL_REG_P (recog_data.operand[i])
36810 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
36811 return true;
36812 return false;
36813 }
36814
36815 /* Return nonzero when P points to a register encoded via a REX prefix.
36816 Called via for_each_rtx. */
36817 static int
36818 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
36819 {
36820 unsigned int regno;
36821 if (!REG_P (*p))
36822 return 0;
36823 regno = REGNO (*p);
36824 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
36825 }
36826
36827 /* Return true when INSN mentions a register that must be encoded using a
36828 REX prefix. */
36829 bool
36830 x86_extended_reg_mentioned_p (rtx insn)
36831 {
36832 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
36833 extended_reg_mentioned_1, NULL);
36834 }
36835
36836 /* If profitable, negate (without causing overflow) integer constant
36837 of mode MODE at location LOC. Return true in this case. */
36838 bool
36839 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
36840 {
36841 HOST_WIDE_INT val;
36842
36843 if (!CONST_INT_P (*loc))
36844 return false;
36845
36846 switch (mode)
36847 {
36848 case DImode:
36849 /* DImode x86_64 constants must fit in 32 bits. */
36850 gcc_assert (x86_64_immediate_operand (*loc, mode));
36851
36852 mode = SImode;
36853 break;
36854
36855 case SImode:
36856 case HImode:
36857 case QImode:
36858 break;
36859
36860 default:
36861 gcc_unreachable ();
36862 }
36863
36864 /* Avoid overflows. */
36865 if (mode_signbit_p (mode, *loc))
36866 return false;
36867
36868 val = INTVAL (*loc);
36869
36870 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
36871 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
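/* For example, "addl $128, %eax" needs a 32-bit immediate, while the
   equivalent "subl $-128, %eax" fits in a sign-extended 8-bit immediate.  */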
36872 if ((val < 0 && val != -128)
36873 || val == 128)
36874 {
36875 *loc = GEN_INT (-val);
36876 return true;
36877 }
36878
36879 return false;
36880 }
36881
36882 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
36883 optabs would emit if we didn't have TFmode patterns. */
36884
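/* The expansion below is equivalent to this pseudo-C, where X is the
   unsigned input, D the floating-point result, and FLOAT_T stands for
   whatever mode OUT has:

     if ((signed) X >= 0)
       D = (FLOAT_T) X;                -- ordinary signed conversion
     else
       {
         T = (X >> 1) | (X & 1);       -- halve, keeping the rounding bit
         D = (FLOAT_T) T;
         D = D + D;                    -- scale back up
       }
*/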
36885 void
36886 x86_emit_floatuns (rtx operands[2])
36887 {
36888 rtx neglab, donelab, i0, i1, f0, in, out;
36889 enum machine_mode mode, inmode;
36890
36891 inmode = GET_MODE (operands[1]);
36892 gcc_assert (inmode == SImode || inmode == DImode);
36893
36894 out = operands[0];
36895 in = force_reg (inmode, operands[1]);
36896 mode = GET_MODE (out);
36897 neglab = gen_label_rtx ();
36898 donelab = gen_label_rtx ();
36899 f0 = gen_reg_rtx (mode);
36900
36901 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
36902
36903 expand_float (out, in, 0);
36904
36905 emit_jump_insn (gen_jump (donelab));
36906 emit_barrier ();
36907
36908 emit_label (neglab);
36909
36910 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
36911 1, OPTAB_DIRECT);
36912 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
36913 1, OPTAB_DIRECT);
36914 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
36915
36916 expand_float (f0, i0, 0);
36917
36918 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
36919
36920 emit_label (donelab);
36921 }
36922 \f
36923 /* AVX512F does support 64-byte integer vector operations,
36924 thus the longest vector we are faced with is V64QImode. */
36925 #define MAX_VECT_LEN 64
36926
36927 struct expand_vec_perm_d
36928 {
36929 rtx target, op0, op1;
36930 unsigned char perm[MAX_VECT_LEN];
36931 enum machine_mode vmode;
36932 unsigned char nelt;
36933 bool one_operand_p;
36934 bool testing_p;
36935 };
36936
36937 static bool canonicalize_perm (struct expand_vec_perm_d *d);
36938 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
36939 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
36940
36941 /* Get a vector mode of the same size as the original but with elements
36942 twice as wide. This is only guaranteed to apply to integral vectors. */
36943
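/* E.g. V8HImode yields V4SImode: the same 16 bytes, but half as many
   elements, each twice as wide.  */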
36944 static inline enum machine_mode
36945 get_mode_wider_vector (enum machine_mode o)
36946 {
36947 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
36948 enum machine_mode n = GET_MODE_WIDER_MODE (o);
36949 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
36950 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
36951 return n;
36952 }
36953
36954 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36955 with all elements equal to VAR. Return true if successful. */
36956
36957 static bool
36958 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
36959 rtx target, rtx val)
36960 {
36961 bool ok;
36962
36963 switch (mode)
36964 {
36965 case V2SImode:
36966 case V2SFmode:
36967 if (!mmx_ok)
36968 return false;
36969 /* FALLTHRU */
36970
36971 case V4DFmode:
36972 case V4DImode:
36973 case V8SFmode:
36974 case V8SImode:
36975 case V2DFmode:
36976 case V2DImode:
36977 case V4SFmode:
36978 case V4SImode:
36979 {
36980 rtx insn, dup;
36981
36982 /* First attempt to recognize VAL as-is. */
36983 dup = gen_rtx_VEC_DUPLICATE (mode, val);
36984 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
36985 if (recog_memoized (insn) < 0)
36986 {
36987 rtx seq;
36988 /* If that fails, force VAL into a register. */
36989
36990 start_sequence ();
36991 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
36992 seq = get_insns ();
36993 end_sequence ();
36994 if (seq)
36995 emit_insn_before (seq, insn);
36996
36997 ok = recog_memoized (insn) >= 0;
36998 gcc_assert (ok);
36999 }
37000 }
37001 return true;
37002
37003 case V4HImode:
37004 if (!mmx_ok)
37005 return false;
37006 if (TARGET_SSE || TARGET_3DNOW_A)
37007 {
37008 rtx x;
37009
37010 val = gen_lowpart (SImode, val);
37011 x = gen_rtx_TRUNCATE (HImode, val);
37012 x = gen_rtx_VEC_DUPLICATE (mode, x);
37013 emit_insn (gen_rtx_SET (VOIDmode, target, x));
37014 return true;
37015 }
37016 goto widen;
37017
37018 case V8QImode:
37019 if (!mmx_ok)
37020 return false;
37021 goto widen;
37022
37023 case V8HImode:
37024 if (TARGET_SSE2)
37025 {
37026 struct expand_vec_perm_d dperm;
37027 rtx tmp1, tmp2;
37028
37029 permute:
37030 memset (&dperm, 0, sizeof (dperm));
37031 dperm.target = target;
37032 dperm.vmode = mode;
37033 dperm.nelt = GET_MODE_NUNITS (mode);
37034 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
37035 dperm.one_operand_p = true;
37036
37037 /* Extend to SImode using a paradoxical SUBREG. */
37038 tmp1 = gen_reg_rtx (SImode);
37039 emit_move_insn (tmp1, gen_lowpart (SImode, val));
37040
37041 /* Insert the SImode value as low element of a V4SImode vector. */
37042 tmp2 = gen_reg_rtx (V4SImode);
37043 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
37044 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
37045
37046 ok = (expand_vec_perm_1 (&dperm)
37047 || expand_vec_perm_broadcast_1 (&dperm));
37048 gcc_assert (ok);
37049 return ok;
37050 }
37051 goto widen;
37052
37053 case V16QImode:
37054 if (TARGET_SSE2)
37055 goto permute;
37056 goto widen;
37057
37058 widen:
37059 /* Replicate the value once into the next wider mode and recurse. */
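/* E.g. broadcasting a QImode VAL into V8QImode first forms the HImode
   value (VAL << 8) | VAL and broadcasts that into V4HImode instead.  */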
37060 {
37061 enum machine_mode smode, wsmode, wvmode;
37062 rtx x;
37063
37064 smode = GET_MODE_INNER (mode);
37065 wvmode = get_mode_wider_vector (mode);
37066 wsmode = GET_MODE_INNER (wvmode);
37067
37068 val = convert_modes (wsmode, smode, val, true);
37069 x = expand_simple_binop (wsmode, ASHIFT, val,
37070 GEN_INT (GET_MODE_BITSIZE (smode)),
37071 NULL_RTX, 1, OPTAB_LIB_WIDEN);
37072 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
37073
37074 x = gen_reg_rtx (wvmode);
37075 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
37076 gcc_assert (ok);
37077 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
37078 return ok;
37079 }
37080
37081 case V16HImode:
37082 case V32QImode:
37083 {
37084 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
37085 rtx x = gen_reg_rtx (hvmode);
37086
37087 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
37088 gcc_assert (ok);
37089
37090 x = gen_rtx_VEC_CONCAT (mode, x, x);
37091 emit_insn (gen_rtx_SET (VOIDmode, target, x));
37092 }
37093 return true;
37094
37095 default:
37096 return false;
37097 }
37098 }
37099
37100 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
37101 whose ONE_VAR element is VAR, and other elements are zero. Return true
37102 if successful. */
37103
37104 static bool
37105 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
37106 rtx target, rtx var, int one_var)
37107 {
37108 enum machine_mode vsimode;
37109 rtx new_target;
37110 rtx x, tmp;
37111 bool use_vector_set = false;
37112
37113 switch (mode)
37114 {
37115 case V2DImode:
37116 /* For SSE4.1, we normally use vector set. But if the second
37117 element is zero and inter-unit moves are OK, we use movq
37118 instead. */
37119 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
37120 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
37121 && one_var == 0));
37122 break;
37123 case V16QImode:
37124 case V4SImode:
37125 case V4SFmode:
37126 use_vector_set = TARGET_SSE4_1;
37127 break;
37128 case V8HImode:
37129 use_vector_set = TARGET_SSE2;
37130 break;
37131 case V4HImode:
37132 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
37133 break;
37134 case V32QImode:
37135 case V16HImode:
37136 case V8SImode:
37137 case V8SFmode:
37138 case V4DFmode:
37139 use_vector_set = TARGET_AVX;
37140 break;
37141 case V4DImode:
37142 /* Use ix86_expand_vector_set in 64bit mode only. */
37143 use_vector_set = TARGET_AVX && TARGET_64BIT;
37144 break;
37145 default:
37146 break;
37147 }
37148
37149 if (use_vector_set)
37150 {
37151 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
37152 var = force_reg (GET_MODE_INNER (mode), var);
37153 ix86_expand_vector_set (mmx_ok, target, var, one_var);
37154 return true;
37155 }
37156
37157 switch (mode)
37158 {
37159 case V2SFmode:
37160 case V2SImode:
37161 if (!mmx_ok)
37162 return false;
37163 /* FALLTHRU */
37164
37165 case V2DFmode:
37166 case V2DImode:
37167 if (one_var != 0)
37168 return false;
37169 var = force_reg (GET_MODE_INNER (mode), var);
37170 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
37171 emit_insn (gen_rtx_SET (VOIDmode, target, x));
37172 return true;
37173
37174 case V4SFmode:
37175 case V4SImode:
37176 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
37177 new_target = gen_reg_rtx (mode);
37178 else
37179 new_target = target;
37180 var = force_reg (GET_MODE_INNER (mode), var);
37181 x = gen_rtx_VEC_DUPLICATE (mode, var);
37182 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
37183 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
37184 if (one_var != 0)
37185 {
37186 /* We need to shuffle the value to the correct position, so
37187 create a new pseudo to store the intermediate result. */
37188
37189 /* With SSE2, we can use the integer shuffle insns. */
37190 if (mode != V4SFmode && TARGET_SSE2)
37191 {
37192 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
37193 const1_rtx,
37194 GEN_INT (one_var == 1 ? 0 : 1),
37195 GEN_INT (one_var == 2 ? 0 : 1),
37196 GEN_INT (one_var == 3 ? 0 : 1)));
37197 if (target != new_target)
37198 emit_move_insn (target, new_target);
37199 return true;
37200 }
37201
37202 /* Otherwise convert the intermediate result to V4SFmode and
37203 use the SSE1 shuffle instructions. */
37204 if (mode != V4SFmode)
37205 {
37206 tmp = gen_reg_rtx (V4SFmode);
37207 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
37208 }
37209 else
37210 tmp = new_target;
37211
37212 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
37213 const1_rtx,
37214 GEN_INT (one_var == 1 ? 0 : 1),
37215 GEN_INT (one_var == 2 ? 0+4 : 1+4),
37216 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
37217
37218 if (mode != V4SFmode)
37219 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
37220 else if (tmp != target)
37221 emit_move_insn (target, tmp);
37222 }
37223 else if (target != new_target)
37224 emit_move_insn (target, new_target);
37225 return true;
37226
37227 case V8HImode:
37228 case V16QImode:
37229 vsimode = V4SImode;
37230 goto widen;
37231 case V4HImode:
37232 case V8QImode:
37233 if (!mmx_ok)
37234 return false;
37235 vsimode = V2SImode;
37236 goto widen;
37237 widen:
37238 if (one_var != 0)
37239 return false;
37240
37241 /* Zero extend the variable element to SImode and recurse. */
37242 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
37243
37244 x = gen_reg_rtx (vsimode);
37245 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
37246 var, one_var))
37247 gcc_unreachable ();
37248
37249 emit_move_insn (target, gen_lowpart (mode, x));
37250 return true;
37251
37252 default:
37253 return false;
37254 }
37255 }
37256
37257 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
37258 consisting of the values in VALS. It is known that all elements
37259 except ONE_VAR are constants. Return true if successful. */
37260
37261 static bool
37262 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
37263 rtx target, rtx vals, int one_var)
37264 {
37265 rtx var = XVECEXP (vals, 0, one_var);
37266 enum machine_mode wmode;
37267 rtx const_vec, x;
37268
37269 const_vec = copy_rtx (vals);
37270 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
37271 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
37272
37273 switch (mode)
37274 {
37275 case V2DFmode:
37276 case V2DImode:
37277 case V2SFmode:
37278 case V2SImode:
37279 /* For the two element vectors, it's just as easy to use
37280 the general case. */
37281 return false;
37282
37283 case V4DImode:
37284 /* Use ix86_expand_vector_set in 64bit mode only. */
37285 if (!TARGET_64BIT)
37286 return false;
37287 case V4DFmode:
37288 case V8SFmode:
37289 case V8SImode:
37290 case V16HImode:
37291 case V32QImode:
37292 case V4SFmode:
37293 case V4SImode:
37294 case V8HImode:
37295 case V4HImode:
37296 break;
37297
37298 case V16QImode:
37299 if (TARGET_SSE4_1)
37300 break;
37301 wmode = V8HImode;
37302 goto widen;
37303 case V8QImode:
37304 wmode = V4HImode;
37305 goto widen;
37306 widen:
37307 /* There's no way to set one QImode entry easily. Combine
37308 the variable value with its adjacent constant value, and
37309 promote to an HImode set. */
37310 x = XVECEXP (vals, 0, one_var ^ 1);
37311 if (one_var & 1)
37312 {
37313 var = convert_modes (HImode, QImode, var, true);
37314 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
37315 NULL_RTX, 1, OPTAB_LIB_WIDEN);
37316 x = GEN_INT (INTVAL (x) & 0xff);
37317 }
37318 else
37319 {
37320 var = convert_modes (HImode, QImode, var, true);
37321 x = gen_int_mode (INTVAL (x) << 8, HImode);
37322 }
37323 if (x != const0_rtx)
37324 var = expand_simple_binop (HImode, IOR, var, x, var,
37325 1, OPTAB_LIB_WIDEN);
37326
37327 x = gen_reg_rtx (wmode);
37328 emit_move_insn (x, gen_lowpart (wmode, const_vec));
37329 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
37330
37331 emit_move_insn (target, gen_lowpart (mode, x));
37332 return true;
37333
37334 default:
37335 return false;
37336 }
37337
37338 emit_move_insn (target, const_vec);
37339 ix86_expand_vector_set (mmx_ok, target, var, one_var);
37340 return true;
37341 }
37342
37343 /* A subroutine of ix86_expand_vector_init_general. Use vector
37344 concatenate to handle the most general case: all values variable,
37345 and none identical. */
37346
37347 static void
37348 ix86_expand_vector_init_concat (enum machine_mode mode,
37349 rtx target, rtx *ops, int n)
37350 {
37351 enum machine_mode cmode, hmode = VOIDmode;
37352 rtx first[8], second[4];
37353 rtvec v;
37354 int i, j;
37355
37356 switch (n)
37357 {
37358 case 2:
37359 switch (mode)
37360 {
37361 case V8SImode:
37362 cmode = V4SImode;
37363 break;
37364 case V8SFmode:
37365 cmode = V4SFmode;
37366 break;
37367 case V4DImode:
37368 cmode = V2DImode;
37369 break;
37370 case V4DFmode:
37371 cmode = V2DFmode;
37372 break;
37373 case V4SImode:
37374 cmode = V2SImode;
37375 break;
37376 case V4SFmode:
37377 cmode = V2SFmode;
37378 break;
37379 case V2DImode:
37380 cmode = DImode;
37381 break;
37382 case V2SImode:
37383 cmode = SImode;
37384 break;
37385 case V2DFmode:
37386 cmode = DFmode;
37387 break;
37388 case V2SFmode:
37389 cmode = SFmode;
37390 break;
37391 default:
37392 gcc_unreachable ();
37393 }
37394
37395 if (!register_operand (ops[1], cmode))
37396 ops[1] = force_reg (cmode, ops[1]);
37397 if (!register_operand (ops[0], cmode))
37398 ops[0] = force_reg (cmode, ops[0]);
37399 emit_insn (gen_rtx_SET (VOIDmode, target,
37400 gen_rtx_VEC_CONCAT (mode, ops[0],
37401 ops[1])));
37402 break;
37403
37404 case 4:
37405 switch (mode)
37406 {
37407 case V4DImode:
37408 cmode = V2DImode;
37409 break;
37410 case V4DFmode:
37411 cmode = V2DFmode;
37412 break;
37413 case V4SImode:
37414 cmode = V2SImode;
37415 break;
37416 case V4SFmode:
37417 cmode = V2SFmode;
37418 break;
37419 default:
37420 gcc_unreachable ();
37421 }
37422 goto half;
37423
37424 case 8:
37425 switch (mode)
37426 {
37427 case V8SImode:
37428 cmode = V2SImode;
37429 hmode = V4SImode;
37430 break;
37431 case V8SFmode:
37432 cmode = V2SFmode;
37433 hmode = V4SFmode;
37434 break;
37435 default:
37436 gcc_unreachable ();
37437 }
37438 goto half;
37439
37440 half:
37441 /* FIXME: We process inputs backward to help RA. PR 36222. */
37442 i = n - 1;
37443 j = (n >> 1) - 1;
37444 for (; i > 0; i -= 2, j--)
37445 {
37446 first[j] = gen_reg_rtx (cmode);
37447 v = gen_rtvec (2, ops[i - 1], ops[i]);
37448 ix86_expand_vector_init (false, first[j],
37449 gen_rtx_PARALLEL (cmode, v));
37450 }
37451
37452 n >>= 1;
37453 if (n > 2)
37454 {
37455 gcc_assert (hmode != VOIDmode);
37456 for (i = j = 0; i < n; i += 2, j++)
37457 {
37458 second[j] = gen_reg_rtx (hmode);
37459 ix86_expand_vector_init_concat (hmode, second [j],
37460 &first [i], 2);
37461 }
37462 n >>= 1;
37463 ix86_expand_vector_init_concat (mode, target, second, n);
37464 }
37465 else
37466 ix86_expand_vector_init_concat (mode, target, first, n);
37467 break;
37468
37469 default:
37470 gcc_unreachable ();
37471 }
37472 }
37473
37474 /* A subroutine of ix86_expand_vector_init_general. Use vector
37475 interleave to handle the most general case: all values variable,
37476 and none identical. */
37477
37478 static void
37479 ix86_expand_vector_init_interleave (enum machine_mode mode,
37480 rtx target, rtx *ops, int n)
37481 {
37482 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
37483 int i, j;
37484 rtx op0, op1;
37485 rtx (*gen_load_even) (rtx, rtx, rtx);
37486 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
37487 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
37488
37489 switch (mode)
37490 {
37491 case V8HImode:
37492 gen_load_even = gen_vec_setv8hi;
37493 gen_interleave_first_low = gen_vec_interleave_lowv4si;
37494 gen_interleave_second_low = gen_vec_interleave_lowv2di;
37495 inner_mode = HImode;
37496 first_imode = V4SImode;
37497 second_imode = V2DImode;
37498 third_imode = VOIDmode;
37499 break;
37500 case V16QImode:
37501 gen_load_even = gen_vec_setv16qi;
37502 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
37503 gen_interleave_second_low = gen_vec_interleave_lowv4si;
37504 inner_mode = QImode;
37505 first_imode = V8HImode;
37506 second_imode = V4SImode;
37507 third_imode = V2DImode;
37508 break;
37509 default:
37510 gcc_unreachable ();
37511 }
37512
37513 for (i = 0; i < n; i++)
37514 {
37515 /* Extend the odd element to SImode using a paradoxical SUBREG. */
37516 op0 = gen_reg_rtx (SImode);
37517 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
37518
37519 /* Insert the SImode value as low element of V4SImode vector. */
37520 op1 = gen_reg_rtx (V4SImode);
37521 op0 = gen_rtx_VEC_MERGE (V4SImode,
37522 gen_rtx_VEC_DUPLICATE (V4SImode,
37523 op0),
37524 CONST0_RTX (V4SImode),
37525 const1_rtx);
37526 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
37527
37528 /* Cast the V4SImode vector back to a vector in the original mode. */
37529 op0 = gen_reg_rtx (mode);
37530 emit_move_insn (op0, gen_lowpart (mode, op1));
37531
37532 /* Load even elements into the second position. */
37533 emit_insn (gen_load_even (op0,
37534 force_reg (inner_mode,
37535 ops [i + i + 1]),
37536 const1_rtx));
37537
37538 /* Cast vector to FIRST_IMODE vector. */
37539 ops[i] = gen_reg_rtx (first_imode);
37540 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
37541 }
37542
37543 /* Interleave low FIRST_IMODE vectors. */
37544 for (i = j = 0; i < n; i += 2, j++)
37545 {
37546 op0 = gen_reg_rtx (first_imode);
37547 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
37548
37549 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
37550 ops[j] = gen_reg_rtx (second_imode);
37551 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
37552 }
37553
37554 /* Interleave low SECOND_IMODE vectors. */
37555 switch (second_imode)
37556 {
37557 case V4SImode:
37558 for (i = j = 0; i < n / 2; i += 2, j++)
37559 {
37560 op0 = gen_reg_rtx (second_imode);
37561 emit_insn (gen_interleave_second_low (op0, ops[i],
37562 ops[i + 1]));
37563
37564 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
37565 vector. */
37566 ops[j] = gen_reg_rtx (third_imode);
37567 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
37568 }
37569 second_imode = V2DImode;
37570 gen_interleave_second_low = gen_vec_interleave_lowv2di;
37571 /* FALLTHRU */
37572
37573 case V2DImode:
37574 op0 = gen_reg_rtx (second_imode);
37575 emit_insn (gen_interleave_second_low (op0, ops[0],
37576 ops[1]));
37577
37578 /* Cast the SECOND_IMODE vector back to a vector in the original
37579 mode. */
37580 emit_insn (gen_rtx_SET (VOIDmode, target,
37581 gen_lowpart (mode, op0)));
37582 break;
37583
37584 default:
37585 gcc_unreachable ();
37586 }
37587 }
37588
37589 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
37590 all values variable, and none identical. */
37591
37592 static void
37593 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
37594 rtx target, rtx vals)
37595 {
37596 rtx ops[32], op0, op1;
37597 enum machine_mode half_mode = VOIDmode;
37598 int n, i;
37599
37600 switch (mode)
37601 {
37602 case V2SFmode:
37603 case V2SImode:
37604 if (!mmx_ok && !TARGET_SSE)
37605 break;
37606 /* FALLTHRU */
37607
37608 case V8SFmode:
37609 case V8SImode:
37610 case V4DFmode:
37611 case V4DImode:
37612 case V4SFmode:
37613 case V4SImode:
37614 case V2DFmode:
37615 case V2DImode:
37616 n = GET_MODE_NUNITS (mode);
37617 for (i = 0; i < n; i++)
37618 ops[i] = XVECEXP (vals, 0, i);
37619 ix86_expand_vector_init_concat (mode, target, ops, n);
37620 return;
37621
37622 case V32QImode:
37623 half_mode = V16QImode;
37624 goto half;
37625
37626 case V16HImode:
37627 half_mode = V8HImode;
37628 goto half;
37629
37630 half:
37631 n = GET_MODE_NUNITS (mode);
37632 for (i = 0; i < n; i++)
37633 ops[i] = XVECEXP (vals, 0, i);
37634 op0 = gen_reg_rtx (half_mode);
37635 op1 = gen_reg_rtx (half_mode);
37636 ix86_expand_vector_init_interleave (half_mode, op0, ops,
37637 n >> 2);
37638 ix86_expand_vector_init_interleave (half_mode, op1,
37639 &ops [n >> 1], n >> 2);
37640 emit_insn (gen_rtx_SET (VOIDmode, target,
37641 gen_rtx_VEC_CONCAT (mode, op0, op1)));
37642 return;
37643
37644 case V16QImode:
37645 if (!TARGET_SSE4_1)
37646 break;
37647 /* FALLTHRU */
37648
37649 case V8HImode:
37650 if (!TARGET_SSE2)
37651 break;
37652
37653 /* Don't use ix86_expand_vector_init_interleave if we can't
37654 move from GPR to SSE register directly. */
37655 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
37656 break;
37657
37658 n = GET_MODE_NUNITS (mode);
37659 for (i = 0; i < n; i++)
37660 ops[i] = XVECEXP (vals, 0, i);
37661 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
37662 return;
37663
37664 case V4HImode:
37665 case V8QImode:
37666 break;
37667
37668 default:
37669 gcc_unreachable ();
37670 }
37671
37672 {
37673 int i, j, n_elts, n_words, n_elt_per_word;
37674 enum machine_mode inner_mode;
37675 rtx words[4], shift;
37676
37677 inner_mode = GET_MODE_INNER (mode);
37678 n_elts = GET_MODE_NUNITS (mode);
37679 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
37680 n_elt_per_word = n_elts / n_words;
37681 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
37682
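/* Pack each word-sized group of elements into an integer register,
   starting from the highest-indexed element; e.g. for V4HImode on a
   32-bit target, word 0 becomes (elt1 << 16) | elt0.  */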
37683 for (i = 0; i < n_words; ++i)
37684 {
37685 rtx word = NULL_RTX;
37686
37687 for (j = 0; j < n_elt_per_word; ++j)
37688 {
37689 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
37690 elt = convert_modes (word_mode, inner_mode, elt, true);
37691
37692 if (j == 0)
37693 word = elt;
37694 else
37695 {
37696 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
37697 word, 1, OPTAB_LIB_WIDEN);
37698 word = expand_simple_binop (word_mode, IOR, word, elt,
37699 word, 1, OPTAB_LIB_WIDEN);
37700 }
37701 }
37702
37703 words[i] = word;
37704 }
37705
37706 if (n_words == 1)
37707 emit_move_insn (target, gen_lowpart (mode, words[0]));
37708 else if (n_words == 2)
37709 {
37710 rtx tmp = gen_reg_rtx (mode);
37711 emit_clobber (tmp);
37712 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
37713 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
37714 emit_move_insn (target, tmp);
37715 }
37716 else if (n_words == 4)
37717 {
37718 rtx tmp = gen_reg_rtx (V4SImode);
37719 gcc_assert (word_mode == SImode);
37720 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
37721 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
37722 emit_move_insn (target, gen_lowpart (mode, tmp));
37723 }
37724 else
37725 gcc_unreachable ();
37726 }
37727 }
37728
37729 /* Initialize vector TARGET via VALS. Suppress the use of MMX
37730 instructions unless MMX_OK is true. */
37731
37732 void
37733 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
37734 {
37735 enum machine_mode mode = GET_MODE (target);
37736 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37737 int n_elts = GET_MODE_NUNITS (mode);
37738 int n_var = 0, one_var = -1;
37739 bool all_same = true, all_const_zero = true;
37740 int i;
37741 rtx x;
37742
37743 for (i = 0; i < n_elts; ++i)
37744 {
37745 x = XVECEXP (vals, 0, i);
37746 if (!(CONST_INT_P (x)
37747 || GET_CODE (x) == CONST_DOUBLE
37748 || GET_CODE (x) == CONST_FIXED))
37749 n_var++, one_var = i;
37750 else if (x != CONST0_RTX (inner_mode))
37751 all_const_zero = false;
37752 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
37753 all_same = false;
37754 }
37755
37756 /* Constants are best loaded from the constant pool. */
37757 if (n_var == 0)
37758 {
37759 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
37760 return;
37761 }
37762
37763 /* If all values are identical, broadcast the value. */
37764 if (all_same
37765 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
37766 XVECEXP (vals, 0, 0)))
37767 return;
37768
37769 /* Values where only one field is non-constant are best loaded from
37770 the pool and overwritten via move later. */
37771 if (n_var == 1)
37772 {
37773 if (all_const_zero
37774 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
37775 XVECEXP (vals, 0, one_var),
37776 one_var))
37777 return;
37778
37779 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
37780 return;
37781 }
37782
37783 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
37784 }
37785
37786 void
37787 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
37788 {
37789 enum machine_mode mode = GET_MODE (target);
37790 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37791 enum machine_mode half_mode;
37792 bool use_vec_merge = false;
37793 rtx tmp;
37794 static rtx (*gen_extract[6][2]) (rtx, rtx)
37795 = {
37796 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
37797 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
37798 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
37799 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
37800 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
37801 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
37802 };
37803 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
37804 = {
37805 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
37806 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
37807 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
37808 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
37809 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
37810 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
37811 };
37812 int i, j, n;
37813
37814 switch (mode)
37815 {
37816 case V2SFmode:
37817 case V2SImode:
37818 if (mmx_ok)
37819 {
37820 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37821 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
37822 if (elt == 0)
37823 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37824 else
37825 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37826 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37827 return;
37828 }
37829 break;
37830
37831 case V2DImode:
37832 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
37833 if (use_vec_merge)
37834 break;
37835
37836 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37837 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
37838 if (elt == 0)
37839 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37840 else
37841 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37842 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37843 return;
37844
37845 case V2DFmode:
37846 {
37847 rtx op0, op1;
37848
37849 /* For the two element vectors, we implement a VEC_CONCAT with
37850 the extraction of the other element. */
37851
37852 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
37853 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
37854
37855 if (elt == 0)
37856 op0 = val, op1 = tmp;
37857 else
37858 op0 = tmp, op1 = val;
37859
37860 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
37861 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37862 }
37863 return;
37864
37865 case V4SFmode:
37866 use_vec_merge = TARGET_SSE4_1;
37867 if (use_vec_merge)
37868 break;
37869
37870 switch (elt)
37871 {
37872 case 0:
37873 use_vec_merge = true;
37874 break;
37875
37876 case 1:
37877 /* tmp = target = A B C D */
37878 tmp = copy_to_reg (target);
37879 /* target = A A B B */
37880 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
37881 /* target = X A B B */
37882 ix86_expand_vector_set (false, target, val, 0);
37883 /* target = A X C D */
37884 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37885 const1_rtx, const0_rtx,
37886 GEN_INT (2+4), GEN_INT (3+4)));
37887 return;
37888
37889 case 2:
37890 /* tmp = target = A B C D */
37891 tmp = copy_to_reg (target);
37892 /* tmp = X B C D */
37893 ix86_expand_vector_set (false, tmp, val, 0);
37894 /* target = A B X D */
37895 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37896 const0_rtx, const1_rtx,
37897 GEN_INT (0+4), GEN_INT (3+4)));
37898 return;
37899
37900 case 3:
37901 /* tmp = target = A B C D */
37902 tmp = copy_to_reg (target);
37903 /* tmp = X B C D */
37904 ix86_expand_vector_set (false, tmp, val, 0);
37905 /* target = A B C X */
37906 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37907 const0_rtx, const1_rtx,
37908 GEN_INT (2+4), GEN_INT (0+4)));
37909 return;
37910
37911 default:
37912 gcc_unreachable ();
37913 }
37914 break;
37915
37916 case V4SImode:
37917 use_vec_merge = TARGET_SSE4_1;
37918 if (use_vec_merge)
37919 break;
37920
37921 /* Element 0 handled by vec_merge below. */
37922 if (elt == 0)
37923 {
37924 use_vec_merge = true;
37925 break;
37926 }
37927
37928 if (TARGET_SSE2)
37929 {
37930 /* With SSE2, use integer shuffles to swap element 0 and ELT,
37931 store into element 0, then shuffle them back. */
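/* E.g. for ELT == 2 the first pshufd reorders the vector to { C, B, A, D },
   element 0 is then overwritten with VAL, and applying the same pshufd
   again restores the layout as { A, B, VAL, D }.  */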
37932
37933 rtx order[4];
37934
37935 order[0] = GEN_INT (elt);
37936 order[1] = const1_rtx;
37937 order[2] = const2_rtx;
37938 order[3] = GEN_INT (3);
37939 order[elt] = const0_rtx;
37940
37941 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37942 order[1], order[2], order[3]));
37943
37944 ix86_expand_vector_set (false, target, val, 0);
37945
37946 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37947 order[1], order[2], order[3]));
37948 }
37949 else
37950 {
37951 /* For SSE1, we have to reuse the V4SF code. */
37952 rtx t = gen_reg_rtx (V4SFmode);
37953 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
37954 emit_move_insn (target, gen_lowpart (mode, t));
37955 }
37956 return;
37957
37958 case V8HImode:
37959 use_vec_merge = TARGET_SSE2;
37960 break;
37961 case V4HImode:
37962 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37963 break;
37964
37965 case V16QImode:
37966 use_vec_merge = TARGET_SSE4_1;
37967 break;
37968
37969 case V8QImode:
37970 break;
37971
37972 case V32QImode:
37973 half_mode = V16QImode;
37974 j = 0;
37975 n = 16;
37976 goto half;
37977
37978 case V16HImode:
37979 half_mode = V8HImode;
37980 j = 1;
37981 n = 8;
37982 goto half;
37983
37984 case V8SImode:
37985 half_mode = V4SImode;
37986 j = 2;
37987 n = 4;
37988 goto half;
37989
37990 case V4DImode:
37991 half_mode = V2DImode;
37992 j = 3;
37993 n = 2;
37994 goto half;
37995
37996 case V8SFmode:
37997 half_mode = V4SFmode;
37998 j = 4;
37999 n = 4;
38000 goto half;
38001
38002 case V4DFmode:
38003 half_mode = V2DFmode;
38004 j = 5;
38005 n = 2;
38006 goto half;
38007
38008 half:
38009 /* Compute offset. */
38010 i = elt / n;
38011 elt %= n;
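/* E.g. element 5 of a V8SImode vector is element 1 of its high V4SImode
   half, so i == 1 and elt == 1 here.  */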
38012
38013 gcc_assert (i <= 1);
38014
38015 /* Extract the half. */
38016 tmp = gen_reg_rtx (half_mode);
38017 emit_insn (gen_extract[j][i] (tmp, target));
38018
38019 /* Put val in tmp at elt. */
38020 ix86_expand_vector_set (false, tmp, val, elt);
38021
38022 /* Put it back. */
38023 emit_insn (gen_insert[j][i] (target, target, tmp));
38024 return;
38025
38026 default:
38027 break;
38028 }
38029
38030 if (use_vec_merge)
38031 {
38032 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
38033 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
38034 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
38035 }
38036 else
38037 {
38038 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
38039
38040 emit_move_insn (mem, target);
38041
38042 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
38043 emit_move_insn (tmp, val);
38044
38045 emit_move_insn (target, mem);
38046 }
38047 }
38048
38049 void
38050 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
38051 {
38052 enum machine_mode mode = GET_MODE (vec);
38053 enum machine_mode inner_mode = GET_MODE_INNER (mode);
38054 bool use_vec_extr = false;
38055 rtx tmp;
38056
38057 switch (mode)
38058 {
38059 case V2SImode:
38060 case V2SFmode:
38061 if (!mmx_ok)
38062 break;
38063 /* FALLTHRU */
38064
38065 case V2DFmode:
38066 case V2DImode:
38067 use_vec_extr = true;
38068 break;
38069
38070 case V4SFmode:
38071 use_vec_extr = TARGET_SSE4_1;
38072 if (use_vec_extr)
38073 break;
38074
38075 switch (elt)
38076 {
38077 case 0:
38078 tmp = vec;
38079 break;
38080
38081 case 1:
38082 case 3:
38083 tmp = gen_reg_rtx (mode);
38084 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
38085 GEN_INT (elt), GEN_INT (elt),
38086 GEN_INT (elt+4), GEN_INT (elt+4)));
38087 break;
38088
38089 case 2:
38090 tmp = gen_reg_rtx (mode);
38091 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
38092 break;
38093
38094 default:
38095 gcc_unreachable ();
38096 }
38097 vec = tmp;
38098 use_vec_extr = true;
38099 elt = 0;
38100 break;
38101
38102 case V4SImode:
38103 use_vec_extr = TARGET_SSE4_1;
38104 if (use_vec_extr)
38105 break;
38106
38107 if (TARGET_SSE2)
38108 {
38109 switch (elt)
38110 {
38111 case 0:
38112 tmp = vec;
38113 break;
38114
38115 case 1:
38116 case 3:
38117 tmp = gen_reg_rtx (mode);
38118 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
38119 GEN_INT (elt), GEN_INT (elt),
38120 GEN_INT (elt), GEN_INT (elt)));
38121 break;
38122
38123 case 2:
38124 tmp = gen_reg_rtx (mode);
38125 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
38126 break;
38127
38128 default:
38129 gcc_unreachable ();
38130 }
38131 vec = tmp;
38132 use_vec_extr = true;
38133 elt = 0;
38134 }
38135 else
38136 {
38137 /* For SSE1, we have to reuse the V4SF code. */
38138 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
38139 gen_lowpart (V4SFmode, vec), elt);
38140 return;
38141 }
38142 break;
38143
38144 case V8HImode:
38145 use_vec_extr = TARGET_SSE2;
38146 break;
38147 case V4HImode:
38148 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
38149 break;
38150
38151 case V16QImode:
38152 use_vec_extr = TARGET_SSE4_1;
38153 break;
38154
38155 case V8SFmode:
38156 if (TARGET_AVX)
38157 {
38158 tmp = gen_reg_rtx (V4SFmode);
38159 if (elt < 4)
38160 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
38161 else
38162 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
38163 ix86_expand_vector_extract (false, target, tmp, elt & 3);
38164 return;
38165 }
38166 break;
38167
38168 case V4DFmode:
38169 if (TARGET_AVX)
38170 {
38171 tmp = gen_reg_rtx (V2DFmode);
38172 if (elt < 2)
38173 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
38174 else
38175 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
38176 ix86_expand_vector_extract (false, target, tmp, elt & 1);
38177 return;
38178 }
38179 break;
38180
38181 case V32QImode:
38182 if (TARGET_AVX)
38183 {
38184 tmp = gen_reg_rtx (V16QImode);
38185 if (elt < 16)
38186 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
38187 else
38188 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
38189 ix86_expand_vector_extract (false, target, tmp, elt & 15);
38190 return;
38191 }
38192 break;
38193
38194 case V16HImode:
38195 if (TARGET_AVX)
38196 {
38197 tmp = gen_reg_rtx (V8HImode);
38198 if (elt < 8)
38199 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
38200 else
38201 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
38202 ix86_expand_vector_extract (false, target, tmp, elt & 7);
38203 return;
38204 }
38205 break;
38206
38207 case V8SImode:
38208 if (TARGET_AVX)
38209 {
38210 tmp = gen_reg_rtx (V4SImode);
38211 if (elt < 4)
38212 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
38213 else
38214 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
38215 ix86_expand_vector_extract (false, target, tmp, elt & 3);
38216 return;
38217 }
38218 break;
38219
38220 case V4DImode:
38221 if (TARGET_AVX)
38222 {
38223 tmp = gen_reg_rtx (V2DImode);
38224 if (elt < 2)
38225 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
38226 else
38227 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
38228 ix86_expand_vector_extract (false, target, tmp, elt & 1);
38229 return;
38230 }
38231 break;
38232
38233 case V8QImode:
38234 /* ??? Could extract the appropriate HImode element and shift. */
38235 default:
38236 break;
38237 }
38238
38239 if (use_vec_extr)
38240 {
38241 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
38242 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
38243
38244 /* Let the rtl optimizers know about the zero extension performed. */
38245 if (inner_mode == QImode || inner_mode == HImode)
38246 {
38247 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
38248 target = gen_lowpart (SImode, target);
38249 }
38250
38251 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
38252 }
38253 else
38254 {
38255 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
38256
38257 emit_move_insn (mem, vec);
38258
38259 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
38260 emit_move_insn (target, tmp);
38261 }
38262 }
38263
38264 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
38265 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
38266 The upper bits of DEST are undefined, though they shouldn't cause
38267 exceptions (some bits from src or all zeros are ok). */
38268
38269 static void
38270 emit_reduc_half (rtx dest, rtx src, int i)
38271 {
38272 rtx tem, d = dest;
38273 switch (GET_MODE (src))
38274 {
38275 case V4SFmode:
38276 if (i == 128)
38277 tem = gen_sse_movhlps (dest, src, src);
38278 else
38279 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
38280 GEN_INT (1 + 4), GEN_INT (1 + 4));
38281 break;
38282 case V2DFmode:
38283 tem = gen_vec_interleave_highv2df (dest, src, src);
38284 break;
38285 case V16QImode:
38286 case V8HImode:
38287 case V4SImode:
38288 case V2DImode:
38289 d = gen_reg_rtx (V1TImode);
38290 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
38291 GEN_INT (i / 2));
38292 break;
38293 case V8SFmode:
38294 if (i == 256)
38295 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
38296 else
38297 tem = gen_avx_shufps256 (dest, src, src,
38298 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
38299 break;
38300 case V4DFmode:
38301 if (i == 256)
38302 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
38303 else
38304 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
38305 break;
38306 case V32QImode:
38307 case V16HImode:
38308 case V8SImode:
38309 case V4DImode:
38310 if (i == 256)
38311 {
38312 if (GET_MODE (dest) != V4DImode)
38313 d = gen_reg_rtx (V4DImode);
38314 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
38315 gen_lowpart (V4DImode, src),
38316 const1_rtx);
38317 }
38318 else
38319 {
38320 d = gen_reg_rtx (V2TImode);
38321 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
38322 GEN_INT (i / 2));
38323 }
38324 break;
38325 default:
38326 gcc_unreachable ();
38327 }
38328 emit_insn (tem);
38329 if (d != dest)
38330 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
38331 }
38332
38333 /* Expand a vector reduction. FN is the binary pattern to reduce;
38334 DEST is the destination; IN is the input vector. */
38335
38336 void
38337 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
38338 {
38339 rtx half, dst, vec = in;
38340 enum machine_mode mode = GET_MODE (in);
38341 int i;
38342
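/* Informal sketch of the loop below (not emitted literally): the vector is
   repeatedly folded in half with FN, e.g. for a 4-element vector V
     half = { V[2], V[3], X, X };   V = FN (half, V);
     half = { V[1], X, X, X };      V = FN (half, V);
   where X is a don't-care element; the reduced value ends up in element 0
   of DEST.  */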
38343 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
38344 if (TARGET_SSE4_1
38345 && mode == V8HImode
38346 && fn == gen_uminv8hi3)
38347 {
38348 emit_insn (gen_sse4_1_phminposuw (dest, in));
38349 return;
38350 }
38351
38352 for (i = GET_MODE_BITSIZE (mode);
38353 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
38354 i >>= 1)
38355 {
38356 half = gen_reg_rtx (mode);
38357 emit_reduc_half (half, vec, i);
38358 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
38359 dst = dest;
38360 else
38361 dst = gen_reg_rtx (mode);
38362 emit_insn (fn (dst, half, vec));
38363 vec = dst;
38364 }
38365 }
38366 \f
38367 /* Target hook for scalar_mode_supported_p. */
38368 static bool
38369 ix86_scalar_mode_supported_p (enum machine_mode mode)
38370 {
38371 if (DECIMAL_FLOAT_MODE_P (mode))
38372 return default_decimal_float_supported_p ();
38373 else if (mode == TFmode)
38374 return true;
38375 else
38376 return default_scalar_mode_supported_p (mode);
38377 }
38378
38379 /* Implements target hook vector_mode_supported_p. */
38380 static bool
38381 ix86_vector_mode_supported_p (enum machine_mode mode)
38382 {
38383 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
38384 return true;
38385 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
38386 return true;
38387 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
38388 return true;
38389 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
38390 return true;
38391 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
38392 return true;
38393 return false;
38394 }
38395
38396 /* Target hook for c_mode_for_suffix. */
38397 static enum machine_mode
38398 ix86_c_mode_for_suffix (char suffix)
38399 {
38400 if (suffix == 'q')
38401 return TFmode;
38402 if (suffix == 'w')
38403 return XFmode;
38404
38405 return VOIDmode;
38406 }
38407
38408 /* Worker function for TARGET_MD_ASM_CLOBBERS.
38409
38410 We do this in the new i386 backend to maintain source compatibility
38411 with the old cc0-based compiler. */
38412
38413 static tree
38414 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
38415 tree inputs ATTRIBUTE_UNUSED,
38416 tree clobbers)
38417 {
38418 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
38419 clobbers);
38420 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
38421 clobbers);
38422 return clobbers;
38423 }
38424
38425 /* Implement the targetm.encode_section_info target hook. */
38426
38427 static void ATTRIBUTE_UNUSED
38428 ix86_encode_section_info (tree decl, rtx rtl, int first)
38429 {
38430 default_encode_section_info (decl, rtl, first);
38431
38432 if (TREE_CODE (decl) == VAR_DECL
38433 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
38434 && ix86_in_large_data_p (decl))
38435 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
38436 }
38437
38438 /* Worker function for REVERSE_CONDITION. */
38439
38440 enum rtx_code
38441 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
38442 {
38443 return (mode != CCFPmode && mode != CCFPUmode
38444 ? reverse_condition (code)
38445 : reverse_condition_maybe_unordered (code));
38446 }
38447
38448 /* Output code to perform an x87 FP register move, from OPERANDS[1]
38449 to OPERANDS[0]. */
38450
38451 const char *
38452 output_387_reg_move (rtx insn, rtx *operands)
38453 {
38454 if (REG_P (operands[0]))
38455 {
38456 if (REG_P (operands[1])
38457 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
38458 {
38459 if (REGNO (operands[0]) == FIRST_STACK_REG)
38460 return output_387_ffreep (operands, 0);
38461 return "fstp\t%y0";
38462 }
38463 if (STACK_TOP_P (operands[0]))
38464 return "fld%Z1\t%y1";
38465 return "fst\t%y0";
38466 }
38467 else if (MEM_P (operands[0]))
38468 {
38469 gcc_assert (REG_P (operands[1]));
38470 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
38471 return "fstp%Z0\t%y0";
38472 else
38473 {
38474 /* There is no non-popping store to memory for XFmode.
38475 So if we need one, follow the store with a load. */
38476 if (GET_MODE (operands[0]) == XFmode)
38477 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
38478 else
38479 return "fst%Z0\t%y0";
38480 }
38481 }
38482 else
38483 gcc_unreachable();
38484 }
38485
38486 /* Output code to perform a conditional jump to LABEL if the C2 flag in
38487 the FP status register is set. */
38488
38489 void
38490 ix86_emit_fp_unordered_jump (rtx label)
38491 {
38492 rtx reg = gen_reg_rtx (HImode);
38493 rtx temp;
38494
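/* Roughly, the two code sequences emitted below are (illustrative only,
   using %ax for the status word):
     with SAHF:     fnstsw %ax ; sahf ; jp LABEL
     without SAHF:  fnstsw %ax ; testb $0x04, %ah ; jne LABEL
   In both cases the branch is taken when the C2 condition bit of the FP
   status word is set.  */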
38495 emit_insn (gen_x86_fnstsw_1 (reg));
38496
38497 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
38498 {
38499 emit_insn (gen_x86_sahf_1 (reg));
38500
38501 temp = gen_rtx_REG (CCmode, FLAGS_REG);
38502 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
38503 }
38504 else
38505 {
38506 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
38507
38508 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
38509 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
38510 }
38511
38512 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
38513 gen_rtx_LABEL_REF (VOIDmode, label),
38514 pc_rtx);
38515 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
38516
38517 emit_jump_insn (temp);
38518 predict_jump (REG_BR_PROB_BASE * 10 / 100);
38519 }
38520
38521 /* Output code to perform a log1p XFmode calculation. */
38522
38523 void ix86_emit_i387_log1p (rtx op0, rtx op1)
38524 {
38525 rtx label1 = gen_label_rtx ();
38526 rtx label2 = gen_label_rtx ();
38527
38528 rtx tmp = gen_reg_rtx (XFmode);
38529 rtx tmp2 = gen_reg_rtx (XFmode);
38530 rtx test;
38531
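/* Rough C equivalent of what is emitted below (illustrative only; fyl2x
   and fyl2xp1 are written as if they were C functions):
     if (fabs (op1) >= 1.0 - sqrt (2.0) / 2.0)   // ~0.2928932...
       op0 = fyl2x (1.0 + op1, ln2);             // ln2 * log2 (1 + op1)
     else
       op0 = fyl2xp1 (op1, ln2);                 // more accurate near zero
   Multiplying log2 by ln2 yields the natural logarithm of 1 + op1.  */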
38532 emit_insn (gen_absxf2 (tmp, op1));
38533 test = gen_rtx_GE (VOIDmode, tmp,
38534 CONST_DOUBLE_FROM_REAL_VALUE (
38535 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
38536 XFmode));
38537 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
38538
38539 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
38540 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
38541 emit_jump (label2);
38542
38543 emit_label (label1);
38544 emit_move_insn (tmp, CONST1_RTX (XFmode));
38545 emit_insn (gen_addxf3 (tmp, op1, tmp));
38546 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
38547 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
38548
38549 emit_label (label2);
38550 }
38551
38552 /* Emit x87 code to compute round (OP1), storing the result in OP0. */
38553 void ix86_emit_i387_round (rtx op0, rtx op1)
38554 {
38555 enum machine_mode inmode = GET_MODE (op1);
38556 enum machine_mode outmode = GET_MODE (op0);
38557 rtx e1, e2, res, tmp, tmp1, half;
38558 rtx scratch = gen_reg_rtx (HImode);
38559 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
38560 rtx jump_label = gen_label_rtx ();
38561 rtx insn;
38562 rtx (*gen_abs) (rtx, rtx);
38563 rtx (*gen_neg) (rtx, rtx);
38564
38565 switch (inmode)
38566 {
38567 case SFmode:
38568 gen_abs = gen_abssf2;
38569 break;
38570 case DFmode:
38571 gen_abs = gen_absdf2;
38572 break;
38573 case XFmode:
38574 gen_abs = gen_absxf2;
38575 break;
38576 default:
38577 gcc_unreachable ();
38578 }
38579
38580 switch (outmode)
38581 {
38582 case SFmode:
38583 gen_neg = gen_negsf2;
38584 break;
38585 case DFmode:
38586 gen_neg = gen_negdf2;
38587 break;
38588 case XFmode:
38589 gen_neg = gen_negxf2;
38590 break;
38591 case HImode:
38592 gen_neg = gen_neghi2;
38593 break;
38594 case SImode:
38595 gen_neg = gen_negsi2;
38596 break;
38597 case DImode:
38598 gen_neg = gen_negdi2;
38599 break;
38600 default:
38601 gcc_unreachable ();
38602 }
38603
38604 e1 = gen_reg_rtx (inmode);
38605 e2 = gen_reg_rtx (inmode);
38606 res = gen_reg_rtx (outmode);
38607
38608 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
38609
38610 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
38611
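/* Scalar sketch of the steps below (illustrative only):
     e1  = fabs (op1);
     e2  = e1 + 0.5;
     res = floor (e2);          // via frndint_floor / lfloor
     if (signbit (op1))         // sign bit taken from the fxam result
       res = -res;
     op0 = res;  */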
38612 /* scratch = fxam(op1) */
38613 emit_insn (gen_rtx_SET (VOIDmode, scratch,
38614 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
38615 UNSPEC_FXAM)));
38616 /* e1 = fabs(op1) */
38617 emit_insn (gen_abs (e1, op1));
38618
38619 /* e2 = e1 + 0.5 */
38620 half = force_reg (inmode, half);
38621 emit_insn (gen_rtx_SET (VOIDmode, e2,
38622 gen_rtx_PLUS (inmode, e1, half)));
38623
38624 /* res = floor(e2) */
38625 if (inmode != XFmode)
38626 {
38627 tmp1 = gen_reg_rtx (XFmode);
38628
38629 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
38630 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
38631 }
38632 else
38633 tmp1 = e2;
38634
38635 switch (outmode)
38636 {
38637 case SFmode:
38638 case DFmode:
38639 {
38640 rtx tmp0 = gen_reg_rtx (XFmode);
38641
38642 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
38643
38644 emit_insn (gen_rtx_SET (VOIDmode, res,
38645 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
38646 UNSPEC_TRUNC_NOOP)));
38647 }
38648 break;
38649 case XFmode:
38650 emit_insn (gen_frndintxf2_floor (res, tmp1));
38651 break;
38652 case HImode:
38653 emit_insn (gen_lfloorxfhi2 (res, tmp1));
38654 break;
38655 case SImode:
38656 emit_insn (gen_lfloorxfsi2 (res, tmp1));
38657 break;
38658 case DImode:
38659 emit_insn (gen_lfloorxfdi2 (res, tmp1));
38660 break;
38661 default:
38662 gcc_unreachable ();
38663 }
38664
38665 /* flags = signbit(a) */
38666 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
38667
38668 /* if (flags) then res = -res */
38669 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
38670 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
38671 gen_rtx_LABEL_REF (VOIDmode, jump_label),
38672 pc_rtx);
38673 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38674 predict_jump (REG_BR_PROB_BASE * 50 / 100);
38675 JUMP_LABEL (insn) = jump_label;
38676
38677 emit_insn (gen_neg (res, res));
38678
38679 emit_label (jump_label);
38680 LABEL_NUSES (jump_label) = 1;
38681
38682 emit_move_insn (op0, res);
38683 }
38684
38685 /* Output code to perform a Newton-Raphson approximation of a single precision
38686 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
38687
38688 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
38689 {
38690 rtx x0, x1, e0, e1;
38691
38692 x0 = gen_reg_rtx (mode);
38693 e0 = gen_reg_rtx (mode);
38694 e1 = gen_reg_rtx (mode);
38695 x1 = gen_reg_rtx (mode);
38696
38697 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp(b))) */
38698
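/* The same Newton-Raphson step written in scalar C (illustrative only):
     x0  = rcp (b);             // hardware reciprocal estimate (~12 bits)
     e0  = x0 * b;
     e0  = x0 * e0;             // x0 * x0 * b
     e1  = x0 + x0;             // 2 * x0
     x1  = e1 - e0;             // x0 * (2 - x0 * b), refined reciprocal
     res = a * x1;
   One such step roughly doubles the accurate bits of the estimate.  */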
38699 b = force_reg (mode, b);
38700
38701 /* x0 = rcp(b) estimate */
38702 emit_insn (gen_rtx_SET (VOIDmode, x0,
38703 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
38704 UNSPEC_RCP)));
38705 /* e0 = x0 * b */
38706 emit_insn (gen_rtx_SET (VOIDmode, e0,
38707 gen_rtx_MULT (mode, x0, b)));
38708
38709 /* e0 = x0 * e0 */
38710 emit_insn (gen_rtx_SET (VOIDmode, e0,
38711 gen_rtx_MULT (mode, x0, e0)));
38712
38713 /* e1 = x0 + x0 */
38714 emit_insn (gen_rtx_SET (VOIDmode, e1,
38715 gen_rtx_PLUS (mode, x0, x0)));
38716
38717 /* x1 = e1 - e0 */
38718 emit_insn (gen_rtx_SET (VOIDmode, x1,
38719 gen_rtx_MINUS (mode, e1, e0)));
38720
38721 /* res = a * x1 */
38722 emit_insn (gen_rtx_SET (VOIDmode, res,
38723 gen_rtx_MULT (mode, a, x1)));
38724 }
38725
38726 /* Output code to perform a Newton-Raphson approximation of a
38727 single precision floating point [reciprocal] square root. */
38728
38729 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
38730 bool recip)
38731 {
38732 rtx x0, e0, e1, e2, e3, mthree, mhalf;
38733 REAL_VALUE_TYPE r;
38734
38735 x0 = gen_reg_rtx (mode);
38736 e0 = gen_reg_rtx (mode);
38737 e1 = gen_reg_rtx (mode);
38738 e2 = gen_reg_rtx (mode);
38739 e3 = gen_reg_rtx (mode);
38740
38741 real_from_integer (&r, VOIDmode, -3, SIGNED);
38742 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38743
38744 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
38745 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38746
38747 if (VECTOR_MODE_P (mode))
38748 {
38749 mthree = ix86_build_const_vector (mode, true, mthree);
38750 mhalf = ix86_build_const_vector (mode, true, mhalf);
38751 }
38752
38753 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
38754 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
38755
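/* The same computation in scalar C (illustrative only; the a == 0.0
   masking done below is omitted here):
     x0  = rsqrt (a);                   // hardware estimate
     e0  = x0 * a;
     e1  = e0 * x0;                     // a * x0 * x0, close to 1.0
     e2  = e1 + (-3.0);
     e3  = (recip ? x0 : e0) * -0.5;
     res = e2 * e3;                     // one Newton-Raphson step  */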
38756 a = force_reg (mode, a);
38757
38758 /* x0 = rsqrt(a) estimate */
38759 emit_insn (gen_rtx_SET (VOIDmode, x0,
38760 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
38761 UNSPEC_RSQRT)));
38762
38763 /* If a == 0.0, mask out the infinite rsqrt estimate so that sqrt (0.0) does not become 0 * inf = NaN. */
38764 if (!recip)
38765 {
38766 rtx zero, mask;
38767
38768 zero = gen_reg_rtx (mode);
38769 mask = gen_reg_rtx (mode);
38770
38771 zero = force_reg (mode, CONST0_RTX(mode));
38772 emit_insn (gen_rtx_SET (VOIDmode, mask,
38773 gen_rtx_NE (mode, zero, a)));
38774
38775 emit_insn (gen_rtx_SET (VOIDmode, x0,
38776 gen_rtx_AND (mode, x0, mask)));
38777 }
38778
38779 /* e0 = x0 * a */
38780 emit_insn (gen_rtx_SET (VOIDmode, e0,
38781 gen_rtx_MULT (mode, x0, a)));
38782 /* e1 = e0 * x0 */
38783 emit_insn (gen_rtx_SET (VOIDmode, e1,
38784 gen_rtx_MULT (mode, e0, x0)));
38785
38786 /* e2 = e1 - 3. */
38787 mthree = force_reg (mode, mthree);
38788 emit_insn (gen_rtx_SET (VOIDmode, e2,
38789 gen_rtx_PLUS (mode, e1, mthree)));
38790
38791 mhalf = force_reg (mode, mhalf);
38792 if (recip)
38793 /* e3 = -.5 * x0 */
38794 emit_insn (gen_rtx_SET (VOIDmode, e3,
38795 gen_rtx_MULT (mode, x0, mhalf)));
38796 else
38797 /* e3 = -.5 * e0 */
38798 emit_insn (gen_rtx_SET (VOIDmode, e3,
38799 gen_rtx_MULT (mode, e0, mhalf)));
38800 /* ret = e2 * e3 */
38801 emit_insn (gen_rtx_SET (VOIDmode, res,
38802 gen_rtx_MULT (mode, e2, e3)));
38803 }
38804
38805 #ifdef TARGET_SOLARIS
38806 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
38807
38808 static void
38809 i386_solaris_elf_named_section (const char *name, unsigned int flags,
38810 tree decl)
38811 {
38812 /* With Binutils 2.15, the "@unwind" marker must be specified on
38813 every occurrence of the ".eh_frame" section, not just the first
38814 one. */
38815 if (TARGET_64BIT
38816 && strcmp (name, ".eh_frame") == 0)
38817 {
38818 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
38819 flags & SECTION_WRITE ? "aw" : "a");
38820 return;
38821 }
38822
38823 #ifndef USE_GAS
38824 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
38825 {
38826 solaris_elf_asm_comdat_section (name, flags, decl);
38827 return;
38828 }
38829 #endif
38830
38831 default_elf_asm_named_section (name, flags, decl);
38832 }
38833 #endif /* TARGET_SOLARIS */
38834
38835 /* Return the mangling of TYPE if it is an extended fundamental type. */
38836
38837 static const char *
38838 ix86_mangle_type (const_tree type)
38839 {
38840 type = TYPE_MAIN_VARIANT (type);
38841
38842 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
38843 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
38844 return NULL;
38845
38846 switch (TYPE_MODE (type))
38847 {
38848 case TFmode:
38849 /* __float128 is "g". */
38850 return "g";
38851 case XFmode:
38852 /* "long double" or __float80 is "e". */
38853 return "e";
38854 default:
38855 return NULL;
38856 }
38857 }
38858
38859 /* For 32-bit code we can save the PIC register setup by using the
38860 hidden function __stack_chk_fail_local instead of calling
38861 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
38862 register, so it is better to call __stack_chk_fail directly. */
38863
38864 static tree ATTRIBUTE_UNUSED
38865 ix86_stack_protect_fail (void)
38866 {
38867 return TARGET_64BIT
38868 ? default_external_stack_protect_fail ()
38869 : default_hidden_stack_protect_fail ();
38870 }
38871
38872 /* Select a format to encode pointers in exception handling data. CODE
38873 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
38874 true if the symbol may be affected by dynamic relocations.
38875
38876 ??? All x86 object file formats are capable of representing this.
38877 After all, the relocation needed is the same as for the call insn.
38878 Whether or not a particular assembler allows us to enter such, I
38879 guess we'll have to see. */
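/* For example (illustrative): 64-bit -fpic code with the small PIC model
   encodes a global data pointer as
     DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4
   while non-PIC code with the small model simply uses DW_EH_PE_udata4.  */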
38880 int
38881 asm_preferred_eh_data_format (int code, int global)
38882 {
38883 if (flag_pic)
38884 {
38885 int type = DW_EH_PE_sdata8;
38886 if (!TARGET_64BIT
38887 || ix86_cmodel == CM_SMALL_PIC
38888 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
38889 type = DW_EH_PE_sdata4;
38890 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
38891 }
38892 if (ix86_cmodel == CM_SMALL
38893 || (ix86_cmodel == CM_MEDIUM && code))
38894 return DW_EH_PE_udata4;
38895 return DW_EH_PE_absptr;
38896 }
38897 \f
38898 /* Expand copysign from SIGN to the positive value ABS_VALUE
38899 storing in RESULT. If MASK is non-null, it is the mask used to clear
38900 the sign bit (all bits set except the sign bit). */
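/* In scalar terms the emitted sequence computes (illustrative only;
   SIGN_BIT names the sign-bit constant built below):
     sgn    = sign & SIGN_BIT;          // isolate the sign bit of SIGN
     result = abs_value | sgn;          // ABS_VALUE is assumed non-negative
   When MASK is supplied it is the complement of SIGN_BIT, hence the NOT
   applied to it below.  */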
38901 static void
38902 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
38903 {
38904 enum machine_mode mode = GET_MODE (sign);
38905 rtx sgn = gen_reg_rtx (mode);
38906 if (mask == NULL_RTX)
38907 {
38908 enum machine_mode vmode;
38909
38910 if (mode == SFmode)
38911 vmode = V4SFmode;
38912 else if (mode == DFmode)
38913 vmode = V2DFmode;
38914 else
38915 vmode = mode;
38916
38917 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
38918 if (!VECTOR_MODE_P (mode))
38919 {
38920 /* We need to generate a scalar mode mask in this case. */
38921 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38922 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38923 mask = gen_reg_rtx (mode);
38924 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38925 }
38926 }
38927 else
38928 mask = gen_rtx_NOT (mode, mask);
38929 emit_insn (gen_rtx_SET (VOIDmode, sgn,
38930 gen_rtx_AND (mode, mask, sign)));
38931 emit_insn (gen_rtx_SET (VOIDmode, result,
38932 gen_rtx_IOR (mode, abs_value, sgn)));
38933 }
38934
38935 /* Expand fabs (OP0) and return a new rtx that holds the result. The
38936 mask for masking out the sign-bit is stored in *SMASK, if that is
38937 non-null. */
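/* Roughly, in scalar terms for SFmode (illustrative only):
     mask = 0x7fffffff;                 // every bit except the sign bit
     xa   = op0 & mask;                 // fabs via bitwise AND
     if (smask)
       *smask = mask;  */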
38938 static rtx
38939 ix86_expand_sse_fabs (rtx op0, rtx *smask)
38940 {
38941 enum machine_mode vmode, mode = GET_MODE (op0);
38942 rtx xa, mask;
38943
38944 xa = gen_reg_rtx (mode);
38945 if (mode == SFmode)
38946 vmode = V4SFmode;
38947 else if (mode == DFmode)
38948 vmode = V2DFmode;
38949 else
38950 vmode = mode;
38951 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
38952 if (!VECTOR_MODE_P (mode))
38953 {
38954 /* We need to generate a scalar mode mask in this case. */
38955 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38956 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38957 mask = gen_reg_rtx (mode);
38958 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38959 }
38960 emit_insn (gen_rtx_SET (VOIDmode, xa,
38961 gen_rtx_AND (mode, op0, mask)));
38962
38963 if (smask)
38964 *smask = mask;
38965
38966 return xa;
38967 }
38968
38969 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
38970 swapping the operands if SWAP_OPERANDS is true. The expanded
38971 code is a forward jump to a newly created label in case the
38972 comparison is true. The generated label rtx is returned. */
38973 static rtx
38974 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
38975 bool swap_operands)
38976 {
38977 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
38978 rtx label, tmp;
38979
38980 if (swap_operands)
38981 {
38982 tmp = op0;
38983 op0 = op1;
38984 op1 = tmp;
38985 }
38986
38987 label = gen_label_rtx ();
38988 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
38989 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38990 gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
38991 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
38992 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
38993 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
38994 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38995 JUMP_LABEL (tmp) = label;
38996
38997 return label;
38998 }
38999
39000 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
39001 using comparison code CODE. Operands are swapped for the comparison if
39002 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
39003 static rtx
39004 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
39005 bool swap_operands)
39006 {
39007 rtx (*insn)(rtx, rtx, rtx, rtx);
39008 enum machine_mode mode = GET_MODE (op0);
39009 rtx mask = gen_reg_rtx (mode);
39010
39011 if (swap_operands)
39012 {
39013 rtx tmp = op0;
39014 op0 = op1;
39015 op1 = tmp;
39016 }
39017
39018 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
39019
39020 emit_insn (insn (mask, op0, op1,
39021 gen_rtx_fmt_ee (code, mode, op0, op1)));
39022 return mask;
39023 }
39024
39025 /* Generate and return an rtx of mode MODE for 2**n where n is the number
39026 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
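/* Rationale (informal): for 0 <= x < 2**n, evaluating (x + 2**n) - 2**n in
   MODE rounds x to an integer, since after the addition the mantissa has no
   bits left for a fractional part.  Several of the floor/ceil/round/trunc
   expanders below rely on this property.  */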
39027 static rtx
39028 ix86_gen_TWO52 (enum machine_mode mode)
39029 {
39030 REAL_VALUE_TYPE TWO52r;
39031 rtx TWO52;
39032
39033 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
39034 TWO52 = const_double_from_real_value (TWO52r, mode);
39035 TWO52 = force_reg (mode, TWO52);
39036
39037 return TWO52;
39038 }
39039
39040 /* Expand SSE sequence for computing lround from OP1 storing
39041 into OP0. */
39042 void
39043 ix86_expand_lround (rtx op0, rtx op1)
39044 {
39045 /* C code for the stuff we're doing below:
39046 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
39047 return (long)tmp;
39048 */
39049 enum machine_mode mode = GET_MODE (op1);
39050 const struct real_format *fmt;
39051 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39052 rtx adj;
39053
39054 /* load nextafter (0.5, 0.0) */
39055 fmt = REAL_MODE_FORMAT (mode);
39056 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39057 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39058
39059 /* adj = copysign (0.5, op1) */
39060 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
39061 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
39062
39063 /* adj = op1 + adj */
39064 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
39065
39066 /* op0 = (imode)adj */
39067 expand_fix (op0, adj, 0);
39068 }
39069
39070 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
39071 into OPERAND0. */
39072 void
39073 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
39074 {
39075 /* C code for the stuff we're doing below (for do_floor):
39076 xi = (long)op1;
39077 xi -= (double)xi > op1 ? 1 : 0;
39078 return xi;
39079 */
39080 enum machine_mode fmode = GET_MODE (op1);
39081 enum machine_mode imode = GET_MODE (op0);
39082 rtx ireg, freg, label, tmp;
39083
39084 /* reg = (long)op1 */
39085 ireg = gen_reg_rtx (imode);
39086 expand_fix (ireg, op1, 0);
39087
39088 /* freg = (double)reg */
39089 freg = gen_reg_rtx (fmode);
39090 expand_float (freg, ireg, 0);
39091
39092 /* ireg = (freg > op1) ? ireg - 1 : ireg */
39093 label = ix86_expand_sse_compare_and_jump (UNLE,
39094 freg, op1, !do_floor);
39095 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
39096 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
39097 emit_move_insn (ireg, tmp);
39098
39099 emit_label (label);
39100 LABEL_NUSES (label) = 1;
39101
39102 emit_move_insn (op0, ireg);
39103 }
39104
39105 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
39106 result in OPERAND0. */
39107 void
39108 ix86_expand_rint (rtx operand0, rtx operand1)
39109 {
39110 /* C code for the stuff we're doing below:
39111 xa = fabs (operand1);
39112 if (!isless (xa, 2**52))
39113 return operand1;
39114 xa = xa + 2**52 - 2**52;
39115 return copysign (xa, operand1);
39116 */
39117 enum machine_mode mode = GET_MODE (operand0);
39118 rtx res, xa, label, TWO52, mask;
39119
39120 res = gen_reg_rtx (mode);
39121 emit_move_insn (res, operand1);
39122
39123 /* xa = abs (operand1) */
39124 xa = ix86_expand_sse_fabs (res, &mask);
39125
39126 /* if (!isless (xa, TWO52)) goto label; */
39127 TWO52 = ix86_gen_TWO52 (mode);
39128 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39129
39130 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39131 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
39132
39133 ix86_sse_copysign_to_positive (res, xa, res, mask);
39134
39135 emit_label (label);
39136 LABEL_NUSES (label) = 1;
39137
39138 emit_move_insn (operand0, res);
39139 }
39140
39141 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
39142 into OPERAND0. */
39143 void
39144 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
39145 {
39146 /* C code for the stuff we expand below.
39147 double xa = fabs (x), x2;
39148 if (!isless (xa, TWO52))
39149 return x;
39150 xa = xa + TWO52 - TWO52;
39151 x2 = copysign (xa, x);
39152 Compensate. Floor:
39153 if (x2 > x)
39154 x2 -= 1;
39155 Compensate. Ceil:
39156 if (x2 < x)
39157 x2 -= -1;
39158 return x2;
39159 */
39160 enum machine_mode mode = GET_MODE (operand0);
39161 rtx xa, TWO52, tmp, label, one, res, mask;
39162
39163 TWO52 = ix86_gen_TWO52 (mode);
39164
39165 /* Temporary for holding the result, initialized to the input
39166 operand to ease control flow. */
39167 res = gen_reg_rtx (mode);
39168 emit_move_insn (res, operand1);
39169
39170 /* xa = abs (operand1) */
39171 xa = ix86_expand_sse_fabs (res, &mask);
39172
39173 /* if (!isless (xa, TWO52)) goto label; */
39174 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39175
39176 /* xa = xa + TWO52 - TWO52; */
39177 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39178 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
39179
39180 /* xa = copysign (xa, operand1) */
39181 ix86_sse_copysign_to_positive (xa, xa, res, mask);
39182
39183 /* generate 1.0 or -1.0 */
39184 one = force_reg (mode,
39185 const_double_from_real_value (do_floor
39186 ? dconst1 : dconstm1, mode));
39187
39188 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
39189 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
39190 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39191 gen_rtx_AND (mode, one, tmp)));
39192 /* We always need to subtract here to preserve signed zero. */
39193 tmp = expand_simple_binop (mode, MINUS,
39194 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39195 emit_move_insn (res, tmp);
39196
39197 emit_label (label);
39198 LABEL_NUSES (label) = 1;
39199
39200 emit_move_insn (operand0, res);
39201 }
39202
39203 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
39204 into OPERAND0. */
39205 void
39206 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
39207 {
39208 /* C code for the stuff we expand below.
39209 double xa = fabs (x), x2;
39210 if (!isless (xa, TWO52))
39211 return x;
39212 x2 = (double)(long)x;
39213 Compensate. Floor:
39214 if (x2 > x)
39215 x2 -= 1;
39216 Compensate. Ceil:
39217 if (x2 < x)
39218 x2 += 1;
39219 if (HONOR_SIGNED_ZEROS (mode))
39220 return copysign (x2, x);
39221 return x2;
39222 */
39223 enum machine_mode mode = GET_MODE (operand0);
39224 rtx xa, xi, TWO52, tmp, label, one, res, mask;
39225
39226 TWO52 = ix86_gen_TWO52 (mode);
39227
39228 /* Temporary for holding the result, initialized to the input
39229 operand to ease control flow. */
39230 res = gen_reg_rtx (mode);
39231 emit_move_insn (res, operand1);
39232
39233 /* xa = abs (operand1) */
39234 xa = ix86_expand_sse_fabs (res, &mask);
39235
39236 /* if (!isless (xa, TWO52)) goto label; */
39237 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39238
39239 /* xa = (double)(long)x */
39240 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39241 expand_fix (xi, res, 0);
39242 expand_float (xa, xi, 0);
39243
39244 /* generate 1.0 */
39245 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
39246
39247 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
39248 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
39249 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39250 gen_rtx_AND (mode, one, tmp)));
39251 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
39252 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39253 emit_move_insn (res, tmp);
39254
39255 if (HONOR_SIGNED_ZEROS (mode))
39256 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
39257
39258 emit_label (label);
39259 LABEL_NUSES (label) = 1;
39260
39261 emit_move_insn (operand0, res);
39262 }
39263
39264 /* Expand SSE sequence for computing round from OPERAND1 storing
39265 into OPERAND0. This sequence works without relying on DImode truncation
39266 via cvttsd2siq, which is only available on 64-bit targets. */
39267 void
39268 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
39269 {
39270 /* C code for the stuff we expand below.
39271 double xa = fabs (x), xa2, x2;
39272 if (!isless (xa, TWO52))
39273 return x;
39274 Using the absolute value and copying back sign makes
39275 -0.0 -> -0.0 correct.
39276 xa2 = xa + TWO52 - TWO52;
39277 Compensate.
39278 dxa = xa2 - xa;
39279 if (dxa <= -0.5)
39280 xa2 += 1;
39281 else if (dxa > 0.5)
39282 xa2 -= 1;
39283 x2 = copysign (xa2, x);
39284 return x2;
39285 */
39286 enum machine_mode mode = GET_MODE (operand0);
39287 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
39288
39289 TWO52 = ix86_gen_TWO52 (mode);
39290
39291 /* Temporary for holding the result, initialized to the input
39292 operand to ease control flow. */
39293 res = gen_reg_rtx (mode);
39294 emit_move_insn (res, operand1);
39295
39296 /* xa = abs (operand1) */
39297 xa = ix86_expand_sse_fabs (res, &mask);
39298
39299 /* if (!isless (xa, TWO52)) goto label; */
39300 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39301
39302 /* xa2 = xa + TWO52 - TWO52; */
39303 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39304 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
39305
39306 /* dxa = xa2 - xa; */
39307 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
39308
39309 /* generate 0.5, 1.0 and -0.5 */
39310 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
39311 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
39312 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
39313 0, OPTAB_DIRECT);
39314
39315 /* Compensate. */
39316 tmp = gen_reg_rtx (mode);
39317 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
39318 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
39319 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39320 gen_rtx_AND (mode, one, tmp)));
39321 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39322 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
39323 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
39324 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39325 gen_rtx_AND (mode, one, tmp)));
39326 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39327
39328 /* res = copysign (xa2, operand1) */
39329 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
39330
39331 emit_label (label);
39332 LABEL_NUSES (label) = 1;
39333
39334 emit_move_insn (operand0, res);
39335 }
39336
39337 /* Expand SSE sequence for computing trunc from OPERAND1 storing
39338 into OPERAND0. */
39339 void
39340 ix86_expand_trunc (rtx operand0, rtx operand1)
39341 {
39342 /* C code for SSE variant we expand below.
39343 double xa = fabs (x), x2;
39344 if (!isless (xa, TWO52))
39345 return x;
39346 x2 = (double)(long)x;
39347 if (HONOR_SIGNED_ZEROS (mode))
39348 return copysign (x2, x);
39349 return x2;
39350 */
39351 enum machine_mode mode = GET_MODE (operand0);
39352 rtx xa, xi, TWO52, label, res, mask;
39353
39354 TWO52 = ix86_gen_TWO52 (mode);
39355
39356 /* Temporary for holding the result, initialized to the input
39357 operand to ease control flow. */
39358 res = gen_reg_rtx (mode);
39359 emit_move_insn (res, operand1);
39360
39361 /* xa = abs (operand1) */
39362 xa = ix86_expand_sse_fabs (res, &mask);
39363
39364 /* if (!isless (xa, TWO52)) goto label; */
39365 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39366
39367 /* x = (double)(long)x */
39368 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39369 expand_fix (xi, res, 0);
39370 expand_float (res, xi, 0);
39371
39372 if (HONOR_SIGNED_ZEROS (mode))
39373 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
39374
39375 emit_label (label);
39376 LABEL_NUSES (label) = 1;
39377
39378 emit_move_insn (operand0, res);
39379 }
39380
39381 /* Expand SSE sequence for computing trunc from OPERAND1 storing
39382 into OPERAND0 without relying on DImode truncation via cvttsd2siq. */
39383 void
39384 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
39385 {
39386 enum machine_mode mode = GET_MODE (operand0);
39387 rtx xa, mask, TWO52, label, one, res, smask, tmp;
39388
39389 /* C code for SSE variant we expand below.
39390 double xa = fabs (x), x2;
39391 if (!isless (xa, TWO52))
39392 return x;
39393 xa2 = xa + TWO52 - TWO52;
39394 Compensate:
39395 if (xa2 > xa)
39396 xa2 -= 1.0;
39397 x2 = copysign (xa2, x);
39398 return x2;
39399 */
39400
39401 TWO52 = ix86_gen_TWO52 (mode);
39402
39403 /* Temporary for holding the result, initialized to the input
39404 operand to ease control flow. */
39405 res = gen_reg_rtx (mode);
39406 emit_move_insn (res, operand1);
39407
39408 /* xa = abs (operand1) */
39409 xa = ix86_expand_sse_fabs (res, &smask);
39410
39411 /* if (!isless (xa, TWO52)) goto label; */
39412 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39413
39414 /* res = xa + TWO52 - TWO52; */
39415 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39416 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
39417 emit_move_insn (res, tmp);
39418
39419 /* generate 1.0 */
39420 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
39421
39422 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
39423 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
39424 emit_insn (gen_rtx_SET (VOIDmode, mask,
39425 gen_rtx_AND (mode, mask, one)));
39426 tmp = expand_simple_binop (mode, MINUS,
39427 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
39428 emit_move_insn (res, tmp);
39429
39430 /* res = copysign (res, operand1) */
39431 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
39432
39433 emit_label (label);
39434 LABEL_NUSES (label) = 1;
39435
39436 emit_move_insn (operand0, res);
39437 }
39438
39439 /* Expand SSE sequence for computing round from OPERAND1 storing
39440 into OPERAND0. */
39441 void
39442 ix86_expand_round (rtx operand0, rtx operand1)
39443 {
39444 /* C code for the stuff we're doing below:
39445 double xa = fabs (x);
39446 if (!isless (xa, TWO52))
39447 return x;
39448 xa = (double)(long)(xa + nextafter (0.5, 0.0));
39449 return copysign (xa, x);
39450 */
39451 enum machine_mode mode = GET_MODE (operand0);
39452 rtx res, TWO52, xa, label, xi, half, mask;
39453 const struct real_format *fmt;
39454 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39455
39456 /* Temporary for holding the result, initialized to the input
39457 operand to ease control flow. */
39458 res = gen_reg_rtx (mode);
39459 emit_move_insn (res, operand1);
39460
39461 TWO52 = ix86_gen_TWO52 (mode);
39462 xa = ix86_expand_sse_fabs (res, &mask);
39463 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39464
39465 /* load nextafter (0.5, 0.0) */
39466 fmt = REAL_MODE_FORMAT (mode);
39467 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39468 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39469
39470 /* xa = xa + 0.5 */
39471 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
39472 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
39473
39474 /* xa = (double)(int64_t)xa */
39475 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39476 expand_fix (xi, xa, 0);
39477 expand_float (xa, xi, 0);
39478
39479 /* res = copysign (xa, operand1) */
39480 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
39481
39482 emit_label (label);
39483 LABEL_NUSES (label) = 1;
39484
39485 emit_move_insn (operand0, res);
39486 }
39487
39488 /* Expand SSE sequence for computing round
39489 from OP1 storing into OP0 using the SSE4.1 round insn. */
39490 void
39491 ix86_expand_round_sse4 (rtx op0, rtx op1)
39492 {
39493 enum machine_mode mode = GET_MODE (op0);
39494 rtx e1, e2, res, half;
39495 const struct real_format *fmt;
39496 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39497 rtx (*gen_copysign) (rtx, rtx, rtx);
39498 rtx (*gen_round) (rtx, rtx, rtx);
39499
39500 switch (mode)
39501 {
39502 case SFmode:
39503 gen_copysign = gen_copysignsf3;
39504 gen_round = gen_sse4_1_roundsf2;
39505 break;
39506 case DFmode:
39507 gen_copysign = gen_copysigndf3;
39508 gen_round = gen_sse4_1_rounddf2;
39509 break;
39510 default:
39511 gcc_unreachable ();
39512 }
39513
39514 /* round (a) = trunc (a + copysign (0.5, a)) */
39515
39516 /* load nextafter (0.5, 0.0) */
39517 fmt = REAL_MODE_FORMAT (mode);
39518 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39519 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39520 half = const_double_from_real_value (pred_half, mode);
39521
39522 /* e1 = copysign (0.5, op1) */
39523 e1 = gen_reg_rtx (mode);
39524 emit_insn (gen_copysign (e1, half, op1));
39525
39526 /* e2 = op1 + e1 */
39527 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
39528
39529 /* res = trunc (e2) */
39530 res = gen_reg_rtx (mode);
39531 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
39532
39533 emit_move_insn (op0, res);
39534 }
39535 \f
39536
39537 /* Table of valid machine attributes. */
39538 static const struct attribute_spec ix86_attribute_table[] =
39539 {
39540 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
39541 affects_type_identity } */
39542 /* Stdcall attribute says callee is responsible for popping arguments
39543 if they are not variable. */
39544 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39545 true },
39546 /* Fastcall attribute says callee is responsible for popping arguments
39547 if they are not variable. */
39548 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39549 true },
39550 /* Thiscall attribute says callee is responsible for popping arguments
39551 if they are not variable. */
39552 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39553 true },
39554 /* Cdecl attribute says the callee is a normal C declaration */
39555 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39556 true },
39557 /* Regparm attribute specifies how many integer arguments are to be
39558 passed in registers. */
39559 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
39560 true },
39561 /* Sseregparm attribute says we are using x86_64 calling conventions
39562 for FP arguments. */
39563 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39564 true },
39565 /* The transactional memory builtins are implicitly regparm or fastcall
39566 depending on the ABI. Override the generic do-nothing attribute that
39567 these builtins were declared with. */
39568 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
39569 true },
39570 /* force_align_arg_pointer says this function realigns the stack at entry. */
39571 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
39572 false, true, true, ix86_handle_cconv_attribute, false },
39573 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39574 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
39575 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
39576 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
39577 false },
39578 #endif
39579 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
39580 false },
39581 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
39582 false },
39583 #ifdef SUBTARGET_ATTRIBUTE_TABLE
39584 SUBTARGET_ATTRIBUTE_TABLE,
39585 #endif
39586 /* ms_abi and sysv_abi calling convention function attributes. */
39587 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
39588 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
39589 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
39590 false },
39591 { "callee_pop_aggregate_return", 1, 1, false, true, true,
39592 ix86_handle_callee_pop_aggregate_return, true },
39593 /* End element. */
39594 { NULL, 0, 0, false, false, false, NULL, false }
39595 };
39596
39597 /* Implement targetm.vectorize.builtin_vectorization_cost. */
39598 static int
39599 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
39600 tree vectype,
39601 int misalign ATTRIBUTE_UNUSED)
39602 {
39603 unsigned elements;
39604
39605 switch (type_of_cost)
39606 {
39607 case scalar_stmt:
39608 return ix86_cost->scalar_stmt_cost;
39609
39610 case scalar_load:
39611 return ix86_cost->scalar_load_cost;
39612
39613 case scalar_store:
39614 return ix86_cost->scalar_store_cost;
39615
39616 case vector_stmt:
39617 return ix86_cost->vec_stmt_cost;
39618
39619 case vector_load:
39620 return ix86_cost->vec_align_load_cost;
39621
39622 case vector_store:
39623 return ix86_cost->vec_store_cost;
39624
39625 case vec_to_scalar:
39626 return ix86_cost->vec_to_scalar_cost;
39627
39628 case scalar_to_vec:
39629 return ix86_cost->scalar_to_vec_cost;
39630
39631 case unaligned_load:
39632 case unaligned_store:
39633 return ix86_cost->vec_unalign_load_cost;
39634
39635 case cond_branch_taken:
39636 return ix86_cost->cond_taken_branch_cost;
39637
39638 case cond_branch_not_taken:
39639 return ix86_cost->cond_not_taken_branch_cost;
39640
39641 case vec_perm:
39642 case vec_promote_demote:
39643 return ix86_cost->vec_stmt_cost;
39644
39645 case vec_construct:
39646 elements = TYPE_VECTOR_SUBPARTS (vectype);
39647 return elements / 2 + 1;
39648
39649 default:
39650 gcc_unreachable ();
39651 }
39652 }
39653
39654 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
39655 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
39656 insn every time. */
39657
39658 static GTY(()) rtx vselect_insn;
39659
39660 /* Initialize vselect_insn. */
39661
39662 static void
39663 init_vselect_insn (void)
39664 {
39665 unsigned i;
39666 rtx x;
39667
39668 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
39669 for (i = 0; i < MAX_VECT_LEN; ++i)
39670 XVECEXP (x, 0, i) = const0_rtx;
39671 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
39672 const0_rtx), x);
39673 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
39674 start_sequence ();
39675 vselect_insn = emit_insn (x);
39676 end_sequence ();
39677 }
39678
39679 /* Construct (set target (vec_select op0 (parallel perm))) and
39680 return true if that's a valid instruction in the active ISA. */
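/* For example (illustrative), a V4SF permutation {2, 3, 0, 1} of a single
   operand is tried as
     (set (reg:V4SF target)
          (vec_select:V4SF (reg:V4SF op0)
                           (parallel [(const_int 2) (const_int 3)
                                      (const_int 0) (const_int 1)])))
   and succeeds iff recog_memoized finds a matching pattern in sse.md.  */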
39681
39682 static bool
39683 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
39684 unsigned nelt, bool testing_p)
39685 {
39686 unsigned int i;
39687 rtx x, save_vconcat;
39688 int icode;
39689
39690 if (vselect_insn == NULL_RTX)
39691 init_vselect_insn ();
39692
39693 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
39694 PUT_NUM_ELEM (XVEC (x, 0), nelt);
39695 for (i = 0; i < nelt; ++i)
39696 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
39697 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39698 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
39699 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
39700 SET_DEST (PATTERN (vselect_insn)) = target;
39701 icode = recog_memoized (vselect_insn);
39702
39703 if (icode >= 0 && !testing_p)
39704 emit_insn (copy_rtx (PATTERN (vselect_insn)));
39705
39706 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
39707 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
39708 INSN_CODE (vselect_insn) = -1;
39709
39710 return icode >= 0;
39711 }
39712
39713 /* Similar, but generate a vec_concat from op0 and op1 as well. */
39714
39715 static bool
39716 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
39717 const unsigned char *perm, unsigned nelt,
39718 bool testing_p)
39719 {
39720 enum machine_mode v2mode;
39721 rtx x;
39722 bool ok;
39723
39724 if (vselect_insn == NULL_RTX)
39725 init_vselect_insn ();
39726
39727 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
39728 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39729 PUT_MODE (x, v2mode);
39730 XEXP (x, 0) = op0;
39731 XEXP (x, 1) = op1;
39732 ok = expand_vselect (target, x, perm, nelt, testing_p);
39733 XEXP (x, 0) = const0_rtx;
39734 XEXP (x, 1) = const0_rtx;
39735 return ok;
39736 }
39737
39738 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39739 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
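/* Example (illustrative): for V4SFmode and perm = {0, 5, 2, 7}, element i
   comes from op0 when perm[i] < nelt and from op1 otherwise, so the blend
   immediate is computed as mask = 0b1010.  */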
39740
39741 static bool
39742 expand_vec_perm_blend (struct expand_vec_perm_d *d)
39743 {
39744 enum machine_mode vmode = d->vmode;
39745 unsigned i, mask, nelt = d->nelt;
39746 rtx target, op0, op1, x;
39747 rtx rperm[32], vperm;
39748
39749 if (d->one_operand_p)
39750 return false;
39751 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
39752 ;
39753 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
39754 ;
39755 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
39756 ;
39757 else
39758 return false;
39759
39760 /* This is a blend, not a permute. Elements must stay in their
39761 respective lanes. */
39762 for (i = 0; i < nelt; ++i)
39763 {
39764 unsigned e = d->perm[i];
39765 if (!(e == i || e == i + nelt))
39766 return false;
39767 }
39768
39769 if (d->testing_p)
39770 return true;
39771
39772 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
39773 decision should be extracted elsewhere, so that we only try that
39774 sequence once all budget==3 options have been tried. */
39775 target = d->target;
39776 op0 = d->op0;
39777 op1 = d->op1;
39778 mask = 0;
39779
39780 switch (vmode)
39781 {
39782 case V4DFmode:
39783 case V8SFmode:
39784 case V2DFmode:
39785 case V4SFmode:
39786 case V8HImode:
39787 case V8SImode:
39788 for (i = 0; i < nelt; ++i)
39789 mask |= (d->perm[i] >= nelt) << i;
39790 break;
39791
39792 case V2DImode:
39793 for (i = 0; i < 2; ++i)
39794 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
39795 vmode = V8HImode;
39796 goto do_subreg;
39797
39798 case V4SImode:
39799 for (i = 0; i < 4; ++i)
39800 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39801 vmode = V8HImode;
39802 goto do_subreg;
39803
39804 case V16QImode:
39805 /* See if bytes move in pairs so we can use pblendw with
39806 an immediate argument, rather than pblendvb with a vector
39807 argument. */
39808 for (i = 0; i < 16; i += 2)
39809 if (d->perm[i] + 1 != d->perm[i + 1])
39810 {
39811 use_pblendvb:
39812 for (i = 0; i < nelt; ++i)
39813 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
39814
39815 finish_pblendvb:
39816 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
39817 vperm = force_reg (vmode, vperm);
39818
39819 if (GET_MODE_SIZE (vmode) == 16)
39820 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
39821 else
39822 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
39823 if (target != d->target)
39824 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39825 return true;
39826 }
39827
39828 for (i = 0; i < 8; ++i)
39829 mask |= (d->perm[i * 2] >= 16) << i;
39830 vmode = V8HImode;
39831 /* FALLTHRU */
39832
39833 do_subreg:
39834 target = gen_reg_rtx (vmode);
39835 op0 = gen_lowpart (vmode, op0);
39836 op1 = gen_lowpart (vmode, op1);
39837 break;
39838
39839 case V32QImode:
39840 /* See if bytes move in pairs. If not, vpblendvb must be used. */
39841 for (i = 0; i < 32; i += 2)
39842 if (d->perm[i] + 1 != d->perm[i + 1])
39843 goto use_pblendvb;
39844 /* See if bytes move in quadruplets. If yes, vpblendd
39845 with immediate can be used. */
39846 for (i = 0; i < 32; i += 4)
39847 if (d->perm[i] + 2 != d->perm[i + 2])
39848 break;
39849 if (i < 32)
39850 {
39851 /* See if bytes move the same in both lanes. If yes,
39852 vpblendw with immediate can be used. */
39853 for (i = 0; i < 16; i += 2)
39854 if (d->perm[i] + 16 != d->perm[i + 16])
39855 goto use_pblendvb;
39856
39857 /* Use vpblendw. */
39858 for (i = 0; i < 16; ++i)
39859 mask |= (d->perm[i * 2] >= 32) << i;
39860 vmode = V16HImode;
39861 goto do_subreg;
39862 }
39863
39864 /* Use vpblendd. */
39865 for (i = 0; i < 8; ++i)
39866 mask |= (d->perm[i * 4] >= 32) << i;
39867 vmode = V8SImode;
39868 goto do_subreg;
39869
39870 case V16HImode:
39871 /* See if words move in pairs. If yes, vpblendd can be used. */
39872 for (i = 0; i < 16; i += 2)
39873 if (d->perm[i] + 1 != d->perm[i + 1])
39874 break;
39875 if (i < 16)
39876 {
39877 /* See if words move the same in both lanes. If not,
39878 vpblendvb must be used. */
39879 for (i = 0; i < 8; i++)
39880 if (d->perm[i] + 8 != d->perm[i + 8])
39881 {
39882 /* Use vpblendvb. */
39883 for (i = 0; i < 32; ++i)
39884 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
39885
39886 vmode = V32QImode;
39887 nelt = 32;
39888 target = gen_reg_rtx (vmode);
39889 op0 = gen_lowpart (vmode, op0);
39890 op1 = gen_lowpart (vmode, op1);
39891 goto finish_pblendvb;
39892 }
39893
39894 /* Use vpblendw. */
39895 for (i = 0; i < 16; ++i)
39896 mask |= (d->perm[i] >= 16) << i;
39897 break;
39898 }
39899
39900 /* Use vpblendd. */
39901 for (i = 0; i < 8; ++i)
39902 mask |= (d->perm[i * 2] >= 16) << i;
39903 vmode = V8SImode;
39904 goto do_subreg;
39905
39906 case V4DImode:
39907 /* Use vpblendd. */
39908 for (i = 0; i < 4; ++i)
39909 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39910 vmode = V8SImode;
39911 goto do_subreg;
39912
39913 default:
39914 gcc_unreachable ();
39915 }
39916
39917 /* This matches five different patterns with the different modes. */
39918 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
39919 x = gen_rtx_SET (VOIDmode, target, x);
39920 emit_insn (x);
39921 if (target != d->target)
39922 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39923
39924 return true;
39925 }
39926
39927 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39928 in terms of the variable form of vpermilps.
39929
39930 Note that we will have already failed the immediate input vpermilps,
39931 which requires that the high and low part shuffle be identical; the
39932 variable form doesn't require that. */
39933
39934 static bool
39935 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
39936 {
39937 rtx rperm[8], vperm;
39938 unsigned i;
39939
39940 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
39941 return false;
39942
39943 /* We can only permute within each 128-bit lane. */
39944 for (i = 0; i < 8; ++i)
39945 {
39946 unsigned e = d->perm[i];
39947 if (i < 4 ? e >= 4 : e < 4)
39948 return false;
39949 }
39950
39951 if (d->testing_p)
39952 return true;
39953
39954 for (i = 0; i < 8; ++i)
39955 {
39956 unsigned e = d->perm[i];
39957
39958 /* Within each 128-bit lane, the elements of op0 are numbered
39959 from 0 and the elements of op1 are numbered from 4. */
39960 if (e >= 8 + 4)
39961 e -= 8;
39962 else if (e >= 4)
39963 e -= 4;
39964
39965 rperm[i] = GEN_INT (e);
39966 }
39967
39968 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
39969 vperm = force_reg (V8SImode, vperm);
39970 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
39971
39972 return true;
39973 }
39974
39975 /* Return true if permutation D can be performed as VMODE permutation
39976 instead. */
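/* Example (illustrative): a V16QImode permutation such as
     { 4, 5, 6, 7,  0, 1, 2, 3,  12, 13, 14, 15,  8, 9, 10, 11 }
   only moves whole 4-byte chunks, so it can also be done as a V4SImode
   permutation; the loop below checks exactly that chunk property.  */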
39977
39978 static bool
39979 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
39980 {
39981 unsigned int i, j, chunk;
39982
39983 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
39984 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
39985 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
39986 return false;
39987
39988 if (GET_MODE_NUNITS (vmode) >= d->nelt)
39989 return true;
39990
39991 chunk = d->nelt / GET_MODE_NUNITS (vmode);
39992 for (i = 0; i < d->nelt; i += chunk)
39993 if (d->perm[i] & (chunk - 1))
39994 return false;
39995 else
39996 for (j = 1; j < chunk; ++j)
39997 if (d->perm[i] + j != d->perm[i + j])
39998 return false;
39999
40000 return true;
40001 }
40002
40003 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
40004 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
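/* pshufb control example (illustrative): for a one-operand V8HImode
   permutation {3, 2, 1, 0, 7, 6, 5, 4}, each 2-byte element E expands to
   the byte selectors { 2*E, 2*E+1 }, so the V16QImode control vector
   begins 6, 7, 4, 5, 2, 3, 0, 1, ...  */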
40005
40006 static bool
40007 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
40008 {
40009 unsigned i, nelt, eltsz, mask;
40010 unsigned char perm[32];
40011 enum machine_mode vmode = V16QImode;
40012 rtx rperm[32], vperm, target, op0, op1;
40013
40014 nelt = d->nelt;
40015
40016 if (!d->one_operand_p)
40017 {
40018 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
40019 {
40020 if (TARGET_AVX2
40021 && valid_perm_using_mode_p (V2TImode, d))
40022 {
40023 if (d->testing_p)
40024 return true;
40025
40026 /* Use vperm2i128 insn. The pattern uses
40027 V4DImode instead of V2TImode. */
40028 target = d->target;
40029 if (d->vmode != V4DImode)
40030 target = gen_reg_rtx (V4DImode);
40031 op0 = gen_lowpart (V4DImode, d->op0);
40032 op1 = gen_lowpart (V4DImode, d->op1);
40033 rperm[0]
40034 = GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
40035 || ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
40036 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
40037 if (target != d->target)
40038 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
40039 return true;
40040 }
40041 return false;
40042 }
40043 }
40044 else
40045 {
40046 if (GET_MODE_SIZE (d->vmode) == 16)
40047 {
40048 if (!TARGET_SSSE3)
40049 return false;
40050 }
40051 else if (GET_MODE_SIZE (d->vmode) == 32)
40052 {
40053 if (!TARGET_AVX2)
40054 return false;
40055
40056 /* V4DImode should be already handled through
40057 expand_vselect by vpermq instruction. */
40058 gcc_assert (d->vmode != V4DImode);
40059
40060 vmode = V32QImode;
40061 if (d->vmode == V8SImode
40062 || d->vmode == V16HImode
40063 || d->vmode == V32QImode)
40064 {
40065 /* First see if vpermq can be used for
40066 V8SImode/V16HImode/V32QImode. */
40067 if (valid_perm_using_mode_p (V4DImode, d))
40068 {
40069 for (i = 0; i < 4; i++)
40070 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
40071 if (d->testing_p)
40072 return true;
40073 target = gen_reg_rtx (V4DImode);
40074 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
40075 perm, 4, false))
40076 {
40077 emit_move_insn (d->target,
40078 gen_lowpart (d->vmode, target));
40079 return true;
40080 }
40081 return false;
40082 }
40083
40084 /* Next see if vpermd can be used. */
40085 if (valid_perm_using_mode_p (V8SImode, d))
40086 vmode = V8SImode;
40087 }
40088 /* Or if vpermps can be used. */
40089 else if (d->vmode == V8SFmode)
40090 vmode = V8SImode;
40091
40092 if (vmode == V32QImode)
40093 {
40094 /* vpshufb only works within 128-bit lanes; it is not
40095 possible to shuffle bytes between lanes. */
40096 for (i = 0; i < nelt; ++i)
40097 if ((d->perm[i] ^ i) & (nelt / 2))
40098 return false;
40099 }
40100 }
40101 else
40102 return false;
40103 }
40104
40105 if (d->testing_p)
40106 return true;
40107
40108 if (vmode == V8SImode)
40109 for (i = 0; i < 8; ++i)
40110 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
40111 else
40112 {
40113 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40114 if (!d->one_operand_p)
40115 mask = 2 * nelt - 1;
40116 else if (vmode == V16QImode)
40117 mask = nelt - 1;
40118 else
40119 mask = nelt / 2 - 1;
40120
40121 for (i = 0; i < nelt; ++i)
40122 {
40123 unsigned j, e = d->perm[i] & mask;
40124 for (j = 0; j < eltsz; ++j)
40125 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
40126 }
40127 }
40128
40129 vperm = gen_rtx_CONST_VECTOR (vmode,
40130 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
40131 vperm = force_reg (vmode, vperm);
40132
40133 target = d->target;
40134 if (d->vmode != vmode)
40135 target = gen_reg_rtx (vmode);
40136 op0 = gen_lowpart (vmode, d->op0);
40137 if (d->one_operand_p)
40138 {
40139 if (vmode == V16QImode)
40140 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
40141 else if (vmode == V32QImode)
40142 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
40143 else if (vmode == V8SFmode)
40144 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
40145 else
40146 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
40147 }
40148 else
40149 {
40150 op1 = gen_lowpart (vmode, d->op1);
40151 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
40152 }
40153 if (target != d->target)
40154 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
40155
40156 return true;
40157 }
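/* To illustrate the byte mask construction above: for a one operand
   V8HImode permutation { 1 0 3 2 5 4 7 6 } (swap adjacent words),
   eltsz is 2 and mask is 7, so the V16QImode selector becomes
   { 2 3 0 1 6 7 4 5 10 11 8 9 14 15 12 13 } and a single pshufb with
   that selector performs the whole permutation.  In the two operand
   XOP case the mask is 2*nelt-1, so the vpperm selector may also
   index into op1.  */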
40158
40159 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
40160 in a single instruction. */
40161
40162 static bool
40163 expand_vec_perm_1 (struct expand_vec_perm_d *d)
40164 {
40165 unsigned i, nelt = d->nelt;
40166 unsigned char perm2[MAX_VECT_LEN];
40167
40168 /* Check plain VEC_SELECT first, because AVX has instructions that could
40169 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
40170 input where SEL+CONCAT may not. */
40171 if (d->one_operand_p)
40172 {
40173 int mask = nelt - 1;
40174 bool identity_perm = true;
40175 bool broadcast_perm = true;
40176
40177 for (i = 0; i < nelt; i++)
40178 {
40179 perm2[i] = d->perm[i] & mask;
40180 if (perm2[i] != i)
40181 identity_perm = false;
40182 if (perm2[i])
40183 broadcast_perm = false;
40184 }
40185
40186 if (identity_perm)
40187 {
40188 if (!d->testing_p)
40189 emit_move_insn (d->target, d->op0);
40190 return true;
40191 }
40192 else if (broadcast_perm && TARGET_AVX2)
40193 {
40194 /* Use vpbroadcast{b,w,d}. */
40195 rtx (*gen) (rtx, rtx) = NULL;
40196 switch (d->vmode)
40197 {
40198 case V32QImode:
40199 gen = gen_avx2_pbroadcastv32qi_1;
40200 break;
40201 case V16HImode:
40202 gen = gen_avx2_pbroadcastv16hi_1;
40203 break;
40204 case V8SImode:
40205 gen = gen_avx2_pbroadcastv8si_1;
40206 break;
40207 case V16QImode:
40208 gen = gen_avx2_pbroadcastv16qi;
40209 break;
40210 case V8HImode:
40211 gen = gen_avx2_pbroadcastv8hi;
40212 break;
40213 case V8SFmode:
40214 gen = gen_avx2_vec_dupv8sf_1;
40215 break;
40216 /* For other modes prefer other shuffles this function creates. */
40217 default: break;
40218 }
40219 if (gen != NULL)
40220 {
40221 if (!d->testing_p)
40222 emit_insn (gen (d->target, d->op0));
40223 return true;
40224 }
40225 }
40226
40227 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
40228 return true;
40229
40230 /* There are plenty of patterns in sse.md that are written for
40231 SEL+CONCAT and are not replicated for a single op. Perhaps
40232 that should be changed, to avoid the nastiness here. */
40233
40234 /* Recognize interleave style patterns, which means incrementing
40235 every other permutation operand. */
40236 for (i = 0; i < nelt; i += 2)
40237 {
40238 perm2[i] = d->perm[i] & mask;
40239 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
40240 }
40241 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
40242 d->testing_p))
40243 return true;
40244
40245 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
40246 if (nelt >= 4)
40247 {
40248 for (i = 0; i < nelt; i += 4)
40249 {
40250 perm2[i + 0] = d->perm[i + 0] & mask;
40251 perm2[i + 1] = d->perm[i + 1] & mask;
40252 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
40253 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
40254 }
40255
40256 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
40257 d->testing_p))
40258 return true;
40259 }
40260 }
40261
40262 /* Finally, try the fully general two operand permute. */
40263 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
40264 d->testing_p))
40265 return true;
40266
40267 /* Recognize interleave style patterns with reversed operands. */
40268 if (!d->one_operand_p)
40269 {
40270 for (i = 0; i < nelt; ++i)
40271 {
40272 unsigned e = d->perm[i];
40273 if (e >= nelt)
40274 e -= nelt;
40275 else
40276 e += nelt;
40277 perm2[i] = e;
40278 }
40279
40280 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
40281 d->testing_p))
40282 return true;
40283 }
40284
40285 /* Try the SSE4.1 blend variable merge instructions. */
40286 if (expand_vec_perm_blend (d))
40287 return true;
40288
40289 /* Try one of the AVX vpermil variable permutations. */
40290 if (expand_vec_perm_vpermil (d))
40291 return true;
40292
40293 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
40294 vpshufb, vpermd, vpermps or vpermq variable permutation. */
40295 if (expand_vec_perm_pshufb (d))
40296 return true;
40297
40298 return false;
40299 }
40300
40301 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
40302 in terms of a pair of pshuflw + pshufhw instructions. */
40303
40304 static bool
40305 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
40306 {
40307 unsigned char perm2[MAX_VECT_LEN];
40308 unsigned i;
40309 bool ok;
40310
40311 if (d->vmode != V8HImode || !d->one_operand_p)
40312 return false;
40313
40314 /* The two permutations only operate in 64-bit lanes. */
40315 for (i = 0; i < 4; ++i)
40316 if (d->perm[i] >= 4)
40317 return false;
40318 for (i = 4; i < 8; ++i)
40319 if (d->perm[i] < 4)
40320 return false;
40321
40322 if (d->testing_p)
40323 return true;
40324
40325 /* Emit the pshuflw. */
40326 memcpy (perm2, d->perm, 4);
40327 for (i = 4; i < 8; ++i)
40328 perm2[i] = i;
40329 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
40330 gcc_assert (ok);
40331
40332 /* Emit the pshufhw. */
40333 memcpy (perm2 + 4, d->perm + 4, 4);
40334 for (i = 0; i < 4; ++i)
40335 perm2[i] = i;
40336 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
40337 gcc_assert (ok);
40338
40339 return true;
40340 }
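/* For instance, the V8HImode permutation { 2 0 3 1 7 5 6 4 } keeps
   the low four words in the low half and the high four in the high
   half, so it is split into a pshuflw with selector
   { 2 0 3 1 4 5 6 7 } followed by a pshufhw with selector
   { 0 1 2 3 7 5 6 4 }.  */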
40341
40342 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40343 the permutation using the SSSE3 palignr instruction. This succeeds
40344 when all of the elements in PERM fit within one vector and we merely
40345 need to shift them down so that a single vector permutation has a
40346 chance to succeed. */
40347
40348 static bool
40349 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
40350 {
40351 unsigned i, nelt = d->nelt;
40352 unsigned min, max;
40353 bool in_order, ok;
40354 rtx shift, target;
40355 struct expand_vec_perm_d dcopy;
40356
40357 /* Even with AVX, palignr only operates on 128-bit vectors. */
40358 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
40359 return false;
40360
40361 min = nelt, max = 0;
40362 for (i = 0; i < nelt; ++i)
40363 {
40364 unsigned e = d->perm[i];
40365 if (e < min)
40366 min = e;
40367 if (e > max)
40368 max = e;
40369 }
40370 if (min == 0 || max - min >= nelt)
40371 return false;
40372
40373 /* Given that we have SSSE3, we know we'll be able to implement the
40374 single operand permutation after the palignr with pshufb. */
40375 if (d->testing_p)
40376 return true;
40377
40378 dcopy = *d;
40379 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
40380 target = gen_reg_rtx (TImode);
40381 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
40382 gen_lowpart (TImode, d->op0), shift));
40383
40384 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
40385 dcopy.one_operand_p = true;
40386
40387 in_order = true;
40388 for (i = 0; i < nelt; ++i)
40389 {
40390 unsigned e = dcopy.perm[i] - min;
40391 if (e != i)
40392 in_order = false;
40393 dcopy.perm[i] = e;
40394 }
40395
40396 /* Test for the degenerate case where the alignment by itself
40397 produces the desired permutation. */
40398 if (in_order)
40399 {
40400 emit_move_insn (d->target, dcopy.op0);
40401 return true;
40402 }
40403
40404 ok = expand_vec_perm_1 (&dcopy);
40405 gcc_assert (ok);
40406
40407 return ok;
40408 }
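/* Example: for V16QImode with D->perm == { 5 6 ... 20 }, MIN is 5 and
   MAX - MIN is 15, so the palignr shifts the op1:op0 concatenation
   down by 5 bytes and the remaining permutation is the identity (the
   in_order case above).  For something like { 6 5 8 7 ... } the same
   shift is followed by a one operand shuffle of the shifted vector,
   which pshufb can always handle given TARGET_SSSE3.  */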
40409
40410 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
40411
40412 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40413 a two vector permutation into a single vector permutation by using
40414 an interleave operation to merge the vectors. */
40415
40416 static bool
40417 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
40418 {
40419 struct expand_vec_perm_d dremap, dfinal;
40420 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
40421 unsigned HOST_WIDE_INT contents;
40422 unsigned char remap[2 * MAX_VECT_LEN];
40423 rtx seq;
40424 bool ok, same_halves = false;
40425
40426 if (GET_MODE_SIZE (d->vmode) == 16)
40427 {
40428 if (d->one_operand_p)
40429 return false;
40430 }
40431 else if (GET_MODE_SIZE (d->vmode) == 32)
40432 {
40433 if (!TARGET_AVX)
40434 return false;
40435 /* For 32-byte modes allow this even for d->one_operand_p:
40436 the lack of cross-lane shuffling in some instructions
40437 might prevent a single insn shuffle. */
40438 dfinal = *d;
40439 dfinal.testing_p = true;
40440 /* If expand_vec_perm_interleave3 can expand this into
40441 a 3 insn sequence, give up and let it be expanded as
40442 a 3 insn sequence. While that is one insn longer,
40443 it doesn't need a memory operand, and in the common
40444 case where the interleave low and interleave high
40445 permutations with the same operands are adjacent, the
40446 pair needs only 4 insns in total after CSE. */
40447 if (expand_vec_perm_interleave3 (&dfinal))
40448 return false;
40449 }
40450 else
40451 return false;
40452
40453 /* Examine from whence the elements come. */
40454 contents = 0;
40455 for (i = 0; i < nelt; ++i)
40456 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
40457
40458 memset (remap, 0xff, sizeof (remap));
40459 dremap = *d;
40460
40461 if (GET_MODE_SIZE (d->vmode) == 16)
40462 {
40463 unsigned HOST_WIDE_INT h1, h2, h3, h4;
40464
40465 /* Split the two input vectors into 4 halves. */
40466 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
40467 h2 = h1 << nelt2;
40468 h3 = h2 << nelt2;
40469 h4 = h3 << nelt2;
40470
40471 /* If the elements come only from the low halves, use interleave low;
40472 similarly for interleave high. If the elements are from mis-matched
40473 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
40474 if ((contents & (h1 | h3)) == contents)
40475 {
40476 /* punpckl* */
40477 for (i = 0; i < nelt2; ++i)
40478 {
40479 remap[i] = i * 2;
40480 remap[i + nelt] = i * 2 + 1;
40481 dremap.perm[i * 2] = i;
40482 dremap.perm[i * 2 + 1] = i + nelt;
40483 }
40484 if (!TARGET_SSE2 && d->vmode == V4SImode)
40485 dremap.vmode = V4SFmode;
40486 }
40487 else if ((contents & (h2 | h4)) == contents)
40488 {
40489 /* punpckh* */
40490 for (i = 0; i < nelt2; ++i)
40491 {
40492 remap[i + nelt2] = i * 2;
40493 remap[i + nelt + nelt2] = i * 2 + 1;
40494 dremap.perm[i * 2] = i + nelt2;
40495 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
40496 }
40497 if (!TARGET_SSE2 && d->vmode == V4SImode)
40498 dremap.vmode = V4SFmode;
40499 }
40500 else if ((contents & (h1 | h4)) == contents)
40501 {
40502 /* shufps */
40503 for (i = 0; i < nelt2; ++i)
40504 {
40505 remap[i] = i;
40506 remap[i + nelt + nelt2] = i + nelt2;
40507 dremap.perm[i] = i;
40508 dremap.perm[i + nelt2] = i + nelt + nelt2;
40509 }
40510 if (nelt != 4)
40511 {
40512 /* shufpd */
40513 dremap.vmode = V2DImode;
40514 dremap.nelt = 2;
40515 dremap.perm[0] = 0;
40516 dremap.perm[1] = 3;
40517 }
40518 }
40519 else if ((contents & (h2 | h3)) == contents)
40520 {
40521 /* shufps */
40522 for (i = 0; i < nelt2; ++i)
40523 {
40524 remap[i + nelt2] = i;
40525 remap[i + nelt] = i + nelt2;
40526 dremap.perm[i] = i + nelt2;
40527 dremap.perm[i + nelt2] = i + nelt;
40528 }
40529 if (nelt != 4)
40530 {
40531 /* shufpd */
40532 dremap.vmode = V2DImode;
40533 dremap.nelt = 2;
40534 dremap.perm[0] = 1;
40535 dremap.perm[1] = 2;
40536 }
40537 }
40538 else
40539 return false;
40540 }
40541 else
40542 {
40543 unsigned int nelt4 = nelt / 4, nzcnt = 0;
40544 unsigned HOST_WIDE_INT q[8];
40545 unsigned int nonzero_halves[4];
40546
40547 /* Split the two input vectors into 8 quarters. */
40548 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
40549 for (i = 1; i < 8; ++i)
40550 q[i] = q[0] << (nelt4 * i);
40551 for (i = 0; i < 4; ++i)
40552 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
40553 {
40554 nonzero_halves[nzcnt] = i;
40555 ++nzcnt;
40556 }
40557
40558 if (nzcnt == 1)
40559 {
40560 gcc_assert (d->one_operand_p);
40561 nonzero_halves[1] = nonzero_halves[0];
40562 same_halves = true;
40563 }
40564 else if (d->one_operand_p)
40565 {
40566 gcc_assert (nonzero_halves[0] == 0);
40567 gcc_assert (nonzero_halves[1] == 1);
40568 }
40569
40570 if (nzcnt <= 2)
40571 {
40572 if (d->perm[0] / nelt2 == nonzero_halves[1])
40573 {
40574 /* Attempt to increase the likelihood that dfinal
40575 shuffle will be intra-lane. */
40576 char tmph = nonzero_halves[0];
40577 nonzero_halves[0] = nonzero_halves[1];
40578 nonzero_halves[1] = tmph;
40579 }
40580
40581 /* vperm2f128 or vperm2i128. */
40582 for (i = 0; i < nelt2; ++i)
40583 {
40584 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
40585 remap[i + nonzero_halves[0] * nelt2] = i;
40586 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
40587 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
40588 }
40589
40590 if (d->vmode != V8SFmode
40591 && d->vmode != V4DFmode
40592 && d->vmode != V8SImode)
40593 {
40594 dremap.vmode = V8SImode;
40595 dremap.nelt = 8;
40596 for (i = 0; i < 4; ++i)
40597 {
40598 dremap.perm[i] = i + nonzero_halves[0] * 4;
40599 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
40600 }
40601 }
40602 }
40603 else if (d->one_operand_p)
40604 return false;
40605 else if (TARGET_AVX2
40606 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
40607 {
40608 /* vpunpckl* */
40609 for (i = 0; i < nelt4; ++i)
40610 {
40611 remap[i] = i * 2;
40612 remap[i + nelt] = i * 2 + 1;
40613 remap[i + nelt2] = i * 2 + nelt2;
40614 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
40615 dremap.perm[i * 2] = i;
40616 dremap.perm[i * 2 + 1] = i + nelt;
40617 dremap.perm[i * 2 + nelt2] = i + nelt2;
40618 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
40619 }
40620 }
40621 else if (TARGET_AVX2
40622 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
40623 {
40624 /* vpunpckh* */
40625 for (i = 0; i < nelt4; ++i)
40626 {
40627 remap[i + nelt4] = i * 2;
40628 remap[i + nelt + nelt4] = i * 2 + 1;
40629 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
40630 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
40631 dremap.perm[i * 2] = i + nelt4;
40632 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
40633 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
40634 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
40635 }
40636 }
40637 else
40638 return false;
40639 }
40640
40641 /* Use the remapping array set up above to move the elements from their
40642 swizzled locations into their final destinations. */
40643 dfinal = *d;
40644 for (i = 0; i < nelt; ++i)
40645 {
40646 unsigned e = remap[d->perm[i]];
40647 gcc_assert (e < nelt);
40648 /* If same_halves is true, both halves of the remapped vector are the
40649 same. Avoid cross-lane accesses if possible. */
40650 if (same_halves && i >= nelt2)
40651 {
40652 gcc_assert (e < nelt2);
40653 dfinal.perm[i] = e + nelt2;
40654 }
40655 else
40656 dfinal.perm[i] = e;
40657 }
40658 dremap.target = gen_reg_rtx (dremap.vmode);
40659 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
40660 dfinal.op1 = dfinal.op0;
40661 dfinal.one_operand_p = true;
40662
40663 /* Test if the final remap can be done with a single insn. For V4SFmode or
40664 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
40665 start_sequence ();
40666 ok = expand_vec_perm_1 (&dfinal);
40667 seq = get_insns ();
40668 end_sequence ();
40669
40670 if (!ok)
40671 return false;
40672
40673 if (d->testing_p)
40674 return true;
40675
40676 if (dremap.vmode != dfinal.vmode)
40677 {
40678 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
40679 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
40680 }
40681
40682 ok = expand_vec_perm_1 (&dremap);
40683 gcc_assert (ok);
40684
40685 emit_insn (seq);
40686 return true;
40687 }
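/* A small V4SImode example of the remapping: for the two operand
   permutation { 1 4 0 5 } all elements come from the low halves, so
   DREMAP becomes the interleave-low { 0 4 1 5 } (punpckldq), giving
   the intermediate { op0[0] op1[0] op0[1] op1[1] }, and DFINAL becomes
   the single operand shuffle { 2 1 0 3 } (pshufd) on that
   intermediate, which produces the requested order.  */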
40688
40689 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40690 a single vector cross-lane permutation into vpermq followed
40691 by any of the single insn permutations. */
40692
40693 static bool
40694 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
40695 {
40696 struct expand_vec_perm_d dremap, dfinal;
40697 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
40698 unsigned contents[2];
40699 bool ok;
40700
40701 if (!(TARGET_AVX2
40702 && (d->vmode == V32QImode || d->vmode == V16HImode)
40703 && d->one_operand_p))
40704 return false;
40705
40706 contents[0] = 0;
40707 contents[1] = 0;
40708 for (i = 0; i < nelt2; ++i)
40709 {
40710 contents[0] |= 1u << (d->perm[i] / nelt4);
40711 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
40712 }
40713
40714 for (i = 0; i < 2; ++i)
40715 {
40716 unsigned int cnt = 0;
40717 for (j = 0; j < 4; ++j)
40718 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
40719 return false;
40720 }
40721
40722 if (d->testing_p)
40723 return true;
40724
40725 dremap = *d;
40726 dremap.vmode = V4DImode;
40727 dremap.nelt = 4;
40728 dremap.target = gen_reg_rtx (V4DImode);
40729 dremap.op0 = gen_lowpart (V4DImode, d->op0);
40730 dremap.op1 = dremap.op0;
40731 dremap.one_operand_p = true;
40732 for (i = 0; i < 2; ++i)
40733 {
40734 unsigned int cnt = 0;
40735 for (j = 0; j < 4; ++j)
40736 if ((contents[i] & (1u << j)) != 0)
40737 dremap.perm[2 * i + cnt++] = j;
40738 for (; cnt < 2; ++cnt)
40739 dremap.perm[2 * i + cnt] = 0;
40740 }
40741
40742 dfinal = *d;
40743 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
40744 dfinal.op1 = dfinal.op0;
40745 dfinal.one_operand_p = true;
40746 for (i = 0, j = 0; i < nelt; ++i)
40747 {
40748 if (i == nelt2)
40749 j = 2;
40750 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
40751 if ((d->perm[i] / nelt4) == dremap.perm[j])
40752 ;
40753 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
40754 dfinal.perm[i] |= nelt4;
40755 else
40756 gcc_unreachable ();
40757 }
40758
40759 ok = expand_vec_perm_1 (&dremap);
40760 gcc_assert (ok);
40761
40762 ok = expand_vec_perm_1 (&dfinal);
40763 gcc_assert (ok);
40764
40765 return true;
40766 }
40767
40768 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
40769 a vector permutation using two instructions, vperm2f128 resp.
40770 vperm2i128 followed by any single in-lane permutation. */
40771
40772 static bool
40773 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
40774 {
40775 struct expand_vec_perm_d dfirst, dsecond;
40776 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
40777 bool ok;
40778
40779 if (!TARGET_AVX
40780 || GET_MODE_SIZE (d->vmode) != 32
40781 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
40782 return false;
40783
40784 dsecond = *d;
40785 dsecond.one_operand_p = false;
40786 dsecond.testing_p = true;
40787
40788 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
40789 immediate. For perm < 16 the second permutation uses
40790 d->op0 as first operand, for perm >= 16 it uses d->op1
40791 as first operand. The second operand is the result of
40792 vperm2[fi]128. */
40793 for (perm = 0; perm < 32; perm++)
40794 {
40795 /* Ignore permutations which do not move anything cross-lane. */
40796 if (perm < 16)
40797 {
40798 /* The second shuffle for e.g. V4DFmode has
40799 0123 and ABCD operands.
40800 Ignore AB23, as 23 is already in the second lane
40801 of the first operand. */
40802 if ((perm & 0xc) == (1 << 2)) continue;
40803 /* And 01CD, as 01 is in the first lane of the first
40804 operand. */
40805 if ((perm & 3) == 0) continue;
40806 /* And 4567, as then the vperm2[fi]128 doesn't change
40807 anything on the original 4567 second operand. */
40808 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
40809 }
40810 else
40811 {
40812 /* The second shuffle for e.g. V4DFmode has
40813 4567 and ABCD operands.
40814 Ignore AB67, as 67 is already in the second lane
40815 of the first operand. */
40816 if ((perm & 0xc) == (3 << 2)) continue;
40817 /* And 45CD, as 45 is in the first lane of the first
40818 operand. */
40819 if ((perm & 3) == 2) continue;
40820 /* And 0123, as then the vperm2[fi]128 doesn't change
40821 anything on the original 0123 first operand. */
40822 if ((perm & 0xf) == (1 << 2)) continue;
40823 }
40824
40825 for (i = 0; i < nelt; i++)
40826 {
40827 j = d->perm[i] / nelt2;
40828 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
40829 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
40830 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
40831 dsecond.perm[i] = d->perm[i] & (nelt - 1);
40832 else
40833 break;
40834 }
40835
40836 if (i == nelt)
40837 {
40838 start_sequence ();
40839 ok = expand_vec_perm_1 (&dsecond);
40840 end_sequence ();
40841 }
40842 else
40843 ok = false;
40844
40845 if (ok)
40846 {
40847 if (d->testing_p)
40848 return true;
40849
40850 /* Found a usable second shuffle. dfirst will be
40851 vperm2f128 on d->op0 and d->op1. */
40852 dsecond.testing_p = false;
40853 dfirst = *d;
40854 dfirst.target = gen_reg_rtx (d->vmode);
40855 for (i = 0; i < nelt; i++)
40856 dfirst.perm[i] = (i & (nelt2 - 1))
40857 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
40858
40859 ok = expand_vec_perm_1 (&dfirst);
40860 gcc_assert (ok);
40861
40862 /* And dsecond is some single insn shuffle, taking
40863 d->op0 and result of vperm2f128 (if perm < 16) or
40864 d->op1 and result of vperm2f128 (otherwise). */
40865 dsecond.op1 = dfirst.target;
40866 if (perm >= 16)
40867 dsecond.op0 = dfirst.op1;
40868
40869 ok = expand_vec_perm_1 (&dsecond);
40870 gcc_assert (ok);
40871
40872 return true;
40873 }
40874
40875 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
40876 if (d->one_operand_p)
40877 return false;
40878 }
40879
40880 return false;
40881 }
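/* Decoding the PERM loop variable above: e.g. PERM == 9 (binary 01001)
   asks for a vperm2[fi]128 whose low lane is lane 1 of the op0:op1
   concatenation (the high half of op0) and whose high lane is lane 2
   (the low half of op1); by the formula in the comment above this is
   the immediate 0x21.  Since PERM < 16, the second, in-lane shuffle
   then combines d->op0 with that intermediate result.  */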
40882
40883 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40884 a two vector permutation using 2 intra-lane interleave insns
40885 and cross-lane shuffle for 32-byte vectors. */
40886
40887 static bool
40888 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
40889 {
40890 unsigned i, nelt;
40891 rtx (*gen) (rtx, rtx, rtx);
40892
40893 if (d->one_operand_p)
40894 return false;
40895 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
40896 ;
40897 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
40898 ;
40899 else
40900 return false;
40901
40902 nelt = d->nelt;
40903 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
40904 return false;
40905 for (i = 0; i < nelt; i += 2)
40906 if (d->perm[i] != d->perm[0] + i / 2
40907 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
40908 return false;
40909
40910 if (d->testing_p)
40911 return true;
40912
40913 switch (d->vmode)
40914 {
40915 case V32QImode:
40916 if (d->perm[0])
40917 gen = gen_vec_interleave_highv32qi;
40918 else
40919 gen = gen_vec_interleave_lowv32qi;
40920 break;
40921 case V16HImode:
40922 if (d->perm[0])
40923 gen = gen_vec_interleave_highv16hi;
40924 else
40925 gen = gen_vec_interleave_lowv16hi;
40926 break;
40927 case V8SImode:
40928 if (d->perm[0])
40929 gen = gen_vec_interleave_highv8si;
40930 else
40931 gen = gen_vec_interleave_lowv8si;
40932 break;
40933 case V4DImode:
40934 if (d->perm[0])
40935 gen = gen_vec_interleave_highv4di;
40936 else
40937 gen = gen_vec_interleave_lowv4di;
40938 break;
40939 case V8SFmode:
40940 if (d->perm[0])
40941 gen = gen_vec_interleave_highv8sf;
40942 else
40943 gen = gen_vec_interleave_lowv8sf;
40944 break;
40945 case V4DFmode:
40946 if (d->perm[0])
40947 gen = gen_vec_interleave_highv4df;
40948 else
40949 gen = gen_vec_interleave_lowv4df;
40950 break;
40951 default:
40952 gcc_unreachable ();
40953 }
40954
40955 emit_insn (gen (d->target, d->op0, d->op1));
40956 return true;
40957 }
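/* The pattern matched here is the "true" interleave of the two
   operands: e.g. for V8SImode, { 0 8 1 9 2 10 3 11 } selects the
   interleave-low expander and { 4 12 5 13 6 14 7 15 } the
   interleave-high one; d->perm[0] being 0 or nelt/2 distinguishes
   the two cases.  */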
40958
40959 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
40960 a single vector permutation using a single intra-lane vector
40961 permutation, vperm2f128 swapping the lanes and vblend* insn blending
40962 the non-swapped and swapped vectors together. */
40963
40964 static bool
40965 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
40966 {
40967 struct expand_vec_perm_d dfirst, dsecond;
40968 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
40969 rtx seq;
40970 bool ok;
40971 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
40972
40973 if (!TARGET_AVX
40974 || TARGET_AVX2
40975 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
40976 || !d->one_operand_p)
40977 return false;
40978
40979 dfirst = *d;
40980 for (i = 0; i < nelt; i++)
40981 dfirst.perm[i] = 0xff;
40982 for (i = 0, msk = 0; i < nelt; i++)
40983 {
40984 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
40985 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
40986 return false;
40987 dfirst.perm[j] = d->perm[i];
40988 if (j != i)
40989 msk |= (1 << i);
40990 }
40991 for (i = 0; i < nelt; i++)
40992 if (dfirst.perm[i] == 0xff)
40993 dfirst.perm[i] = i;
40994
40995 if (!d->testing_p)
40996 dfirst.target = gen_reg_rtx (dfirst.vmode);
40997
40998 start_sequence ();
40999 ok = expand_vec_perm_1 (&dfirst);
41000 seq = get_insns ();
41001 end_sequence ();
41002
41003 if (!ok)
41004 return false;
41005
41006 if (d->testing_p)
41007 return true;
41008
41009 emit_insn (seq);
41010
41011 dsecond = *d;
41012 dsecond.op0 = dfirst.target;
41013 dsecond.op1 = dfirst.target;
41014 dsecond.one_operand_p = true;
41015 dsecond.target = gen_reg_rtx (dsecond.vmode);
41016 for (i = 0; i < nelt; i++)
41017 dsecond.perm[i] = i ^ nelt2;
41018
41019 ok = expand_vec_perm_1 (&dsecond);
41020 gcc_assert (ok);
41021
41022 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
41023 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
41024 return true;
41025 }
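/* For example, the one operand V4DFmode rotation { 1 2 3 0 } is
   handled here as: DFIRST == { 1 0 3 2 } (an in-lane swap, e.g.
   vpermilpd), DSECOND == the lane swap of that result, and a final
   vblendpd with mask 0xa picking elements 1 and 3 from the swapped
   copy, which yields { op0[1] op0[2] op0[3] op0[0] }.  */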
41026
41027 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
41028 permutation using two vperm2f128, followed by a vshufpd insn blending
41029 the two vectors together. */
41030
41031 static bool
41032 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
41033 {
41034 struct expand_vec_perm_d dfirst, dsecond, dthird;
41035 bool ok;
41036
41037 if (!TARGET_AVX || (d->vmode != V4DFmode))
41038 return false;
41039
41040 if (d->testing_p)
41041 return true;
41042
41043 dfirst = *d;
41044 dsecond = *d;
41045 dthird = *d;
41046
41047 dfirst.perm[0] = (d->perm[0] & ~1);
41048 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
41049 dfirst.perm[2] = (d->perm[2] & ~1);
41050 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
41051 dsecond.perm[0] = (d->perm[1] & ~1);
41052 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
41053 dsecond.perm[2] = (d->perm[3] & ~1);
41054 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
41055 dthird.perm[0] = (d->perm[0] % 2);
41056 dthird.perm[1] = (d->perm[1] % 2) + 4;
41057 dthird.perm[2] = (d->perm[2] % 2) + 2;
41058 dthird.perm[3] = (d->perm[3] % 2) + 6;
41059
41060 dfirst.target = gen_reg_rtx (dfirst.vmode);
41061 dsecond.target = gen_reg_rtx (dsecond.vmode);
41062 dthird.op0 = dfirst.target;
41063 dthird.op1 = dsecond.target;
41064 dthird.one_operand_p = false;
41065
41066 canonicalize_perm (&dfirst);
41067 canonicalize_perm (&dsecond);
41068
41069 ok = expand_vec_perm_1 (&dfirst)
41070 && expand_vec_perm_1 (&dsecond)
41071 && expand_vec_perm_1 (&dthird);
41072
41073 gcc_assert (ok);
41074
41075 return true;
41076 }
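/* E.g. for D->perm == { 3 6 0 5 }: DFIRST becomes { 2 3 0 1 } (swap
   the lanes of op0), DSECOND becomes { 6 7 4 5 } (swap the lanes of
   op1), and DTHIRD becomes { 1 4 2 7 }, a vshufpd that picks
   { op0[3] op1[2] op0[0] op1[1] } from the two intermediates.  */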
41077
41078 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
41079 permutation with two pshufb insns and an ior. We should have already
41080 failed all two instruction sequences. */
41081
41082 static bool
41083 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
41084 {
41085 rtx rperm[2][16], vperm, l, h, op, m128;
41086 unsigned int i, nelt, eltsz;
41087
41088 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
41089 return false;
41090 gcc_assert (!d->one_operand_p);
41091
41092 nelt = d->nelt;
41093 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41094
41095 /* Generate two permutation masks. If the required element is within
41096 the given vector it is shuffled into the proper lane. If the required
41097 element is in the other vector, force a zero into the lane by setting
41098 bit 7 in the permutation mask. */
41099 m128 = GEN_INT (-128);
41100 for (i = 0; i < nelt; ++i)
41101 {
41102 unsigned j, e = d->perm[i];
41103 unsigned which = (e >= nelt);
41104 if (e >= nelt)
41105 e -= nelt;
41106
41107 for (j = 0; j < eltsz; ++j)
41108 {
41109 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
41110 rperm[1-which][i*eltsz + j] = m128;
41111 }
41112 }
41113
41114 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
41115 vperm = force_reg (V16QImode, vperm);
41116
41117 l = gen_reg_rtx (V16QImode);
41118 op = gen_lowpart (V16QImode, d->op0);
41119 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
41120
41121 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
41122 vperm = force_reg (V16QImode, vperm);
41123
41124 h = gen_reg_rtx (V16QImode);
41125 op = gen_lowpart (V16QImode, d->op1);
41126 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
41127
41128 op = d->target;
41129 if (d->vmode != V16QImode)
41130 op = gen_reg_rtx (V16QImode);
41131 emit_insn (gen_iorv16qi3 (op, l, h));
41132 if (op != d->target)
41133 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41134
41135 return true;
41136 }
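/* For instance, the extract-even permutation of two V16QImode
   operands ({ 0 2 4 ... 30 }) uses the masks
   { 0 2 4 6 8 10 12 14 -128 ... -128 } for op0 and
   { -128 ... -128 0 2 4 6 8 10 12 14 } for op1; each pshufb leaves
   zeros in the lanes that belong to the other operand, and the ior
   merges the two results.  */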
41137
41138 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
41139 with two vpshufb insns, vpermq and vpor. We should have already failed
41140 all two or three instruction sequences. */
41141
41142 static bool
41143 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
41144 {
41145 rtx rperm[2][32], vperm, l, h, hp, op, m128;
41146 unsigned int i, nelt, eltsz;
41147
41148 if (!TARGET_AVX2
41149 || !d->one_operand_p
41150 || (d->vmode != V32QImode && d->vmode != V16HImode))
41151 return false;
41152
41153 if (d->testing_p)
41154 return true;
41155
41156 nelt = d->nelt;
41157 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41158
41159 /* Generate two permutation masks. If the required element is within
41160 the same lane, it is shuffled in. If the required element is from the
41161 other lane, force a zero by setting bit 7 in the permutation mask.
41162 The other mask has non-negative elements wherever an element is
41163 requested from the other lane, but that element is also moved to the
41164 other lane, so that the result of vpshufb can have its two V2TImode
41165 halves swapped. */
41166 m128 = GEN_INT (-128);
41167 for (i = 0; i < nelt; ++i)
41168 {
41169 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41170 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
41171
41172 for (j = 0; j < eltsz; ++j)
41173 {
41174 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
41175 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
41176 }
41177 }
41178
41179 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
41180 vperm = force_reg (V32QImode, vperm);
41181
41182 h = gen_reg_rtx (V32QImode);
41183 op = gen_lowpart (V32QImode, d->op0);
41184 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
41185
41186 /* Swap the 128-bit lanes of h into hp. */
41187 hp = gen_reg_rtx (V4DImode);
41188 op = gen_lowpart (V4DImode, h);
41189 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
41190 const1_rtx));
41191
41192 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
41193 vperm = force_reg (V32QImode, vperm);
41194
41195 l = gen_reg_rtx (V32QImode);
41196 op = gen_lowpart (V32QImode, d->op0);
41197 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
41198
41199 op = d->target;
41200 if (d->vmode != V32QImode)
41201 op = gen_reg_rtx (V32QImode);
41202 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
41203 if (op != d->target)
41204 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41205
41206 return true;
41207 }
41208
41209 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
41210 and extract-odd permutations of two V32QImode or V16HImode operands
41211 with two vpshufb insns, vpor and vpermq. We should have already
41212 failed all two or three instruction sequences. */
41213
41214 static bool
41215 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
41216 {
41217 rtx rperm[2][32], vperm, l, h, ior, op, m128;
41218 unsigned int i, nelt, eltsz;
41219
41220 if (!TARGET_AVX2
41221 || d->one_operand_p
41222 || (d->vmode != V32QImode && d->vmode != V16HImode))
41223 return false;
41224
41225 for (i = 0; i < d->nelt; ++i)
41226 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
41227 return false;
41228
41229 if (d->testing_p)
41230 return true;
41231
41232 nelt = d->nelt;
41233 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41234
41235 /* Generate two permutation masks. In the first permutation mask
41236 the first quarter will contain indexes for the first half
41237 of the op0, the second quarter will contain bit 7 set, third quarter
41238 will contain indexes for the second half of the op0 and the
41239 last quarter bit 7 set. In the second permutation mask
41240 the first quarter will contain bit 7 set, the second quarter
41241 indexes for the first half of the op1, the third quarter bit 7 set
41242 and last quarter indexes for the second half of the op1.
41243 I.e. the first mask e.g. for V32QImode extract even will be:
41244 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
41245 (all values masked with 0xf except for -128) and second mask
41246 for extract even will be
41247 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
41248 m128 = GEN_INT (-128);
41249 for (i = 0; i < nelt; ++i)
41250 {
41251 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41252 unsigned which = d->perm[i] >= nelt;
41253 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
41254
41255 for (j = 0; j < eltsz; ++j)
41256 {
41257 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
41258 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
41259 }
41260 }
41261
41262 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
41263 vperm = force_reg (V32QImode, vperm);
41264
41265 l = gen_reg_rtx (V32QImode);
41266 op = gen_lowpart (V32QImode, d->op0);
41267 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
41268
41269 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
41270 vperm = force_reg (V32QImode, vperm);
41271
41272 h = gen_reg_rtx (V32QImode);
41273 op = gen_lowpart (V32QImode, d->op1);
41274 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
41275
41276 ior = gen_reg_rtx (V32QImode);
41277 emit_insn (gen_iorv32qi3 (ior, l, h));
41278
41279 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
41280 op = gen_reg_rtx (V4DImode);
41281 ior = gen_lowpart (V4DImode, ior);
41282 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
41283 const1_rtx, GEN_INT (3)));
41284 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41285
41286 return true;
41287 }
41288
41289 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
41290 and extract-odd permutations. */
41291
41292 static bool
41293 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
41294 {
41295 rtx t1, t2, t3, t4, t5;
41296
41297 switch (d->vmode)
41298 {
41299 case V4DFmode:
41300 t1 = gen_reg_rtx (V4DFmode);
41301 t2 = gen_reg_rtx (V4DFmode);
41302
41303 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
41304 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
41305 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
41306
41307 /* Now an unpck[lh]pd will produce the result required. */
41308 if (odd)
41309 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
41310 else
41311 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
41312 emit_insn (t3);
41313 break;
41314
41315 case V8SFmode:
41316 {
41317 int mask = odd ? 0xdd : 0x88;
41318
41319 t1 = gen_reg_rtx (V8SFmode);
41320 t2 = gen_reg_rtx (V8SFmode);
41321 t3 = gen_reg_rtx (V8SFmode);
41322
41323 /* Shuffle within the 128-bit lanes to produce:
41324 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
41325 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
41326 GEN_INT (mask)));
41327
41328 /* Shuffle the lanes around to produce:
41329 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
41330 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
41331 GEN_INT (0x3)));
41332
41333 /* Shuffle within the 128-bit lanes to produce:
41334 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
41335 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
41336
41337 /* Shuffle within the 128-bit lanes to produce:
41338 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
41339 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
41340
41341 /* Shuffle the lanes around to produce:
41342 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
41343 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
41344 GEN_INT (0x20)));
41345 }
41346 break;
41347
41348 case V2DFmode:
41349 case V4SFmode:
41350 case V2DImode:
41351 case V4SImode:
41352 /* These are always directly implementable by expand_vec_perm_1. */
41353 gcc_unreachable ();
41354
41355 case V8HImode:
41356 if (TARGET_SSSE3)
41357 return expand_vec_perm_pshufb2 (d);
41358 else
41359 {
41360 /* We need 2*log2(N)-1 operations to achieve odd/even
41361 with interleave. */
41362 t1 = gen_reg_rtx (V8HImode);
41363 t2 = gen_reg_rtx (V8HImode);
41364 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
41365 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
41366 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
41367 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
41368 if (odd)
41369 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
41370 else
41371 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
41372 emit_insn (t3);
41373 }
41374 break;
41375
41376 case V16QImode:
41377 if (TARGET_SSSE3)
41378 return expand_vec_perm_pshufb2 (d);
41379 else
41380 {
41381 t1 = gen_reg_rtx (V16QImode);
41382 t2 = gen_reg_rtx (V16QImode);
41383 t3 = gen_reg_rtx (V16QImode);
41384 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
41385 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
41386 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
41387 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
41388 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
41389 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
41390 if (odd)
41391 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
41392 else
41393 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
41394 emit_insn (t3);
41395 }
41396 break;
41397
41398 case V16HImode:
41399 case V32QImode:
41400 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
41401
41402 case V4DImode:
41403 if (!TARGET_AVX2)
41404 {
41405 struct expand_vec_perm_d d_copy = *d;
41406 d_copy.vmode = V4DFmode;
41407 d_copy.target = gen_reg_rtx (V4DFmode);
41408 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
41409 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
41410 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
41411 {
41412 if (!d->testing_p)
41413 emit_move_insn (d->target,
41414 gen_lowpart (V4DImode, d_copy.target));
41415 return true;
41416 }
41417 return false;
41418 }
41419
41420 t1 = gen_reg_rtx (V4DImode);
41421 t2 = gen_reg_rtx (V4DImode);
41422
41423 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
41424 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
41425 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
41426
41427 /* Now a vpunpck[lh]qdq will produce the result required. */
41428 if (odd)
41429 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
41430 else
41431 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
41432 emit_insn (t3);
41433 break;
41434
41435 case V8SImode:
41436 if (!TARGET_AVX2)
41437 {
41438 struct expand_vec_perm_d d_copy = *d;
41439 d_copy.vmode = V8SFmode;
41440 d_copy.target = gen_reg_rtx (V8SFmode);
41441 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
41442 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
41443 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
41444 {
41445 if (!d->testing_p)
41446 emit_move_insn (d->target,
41447 gen_lowpart (V8SImode, d_copy.target));
41448 return true;
41449 }
41450 return false;
41451 }
41452
41453 t1 = gen_reg_rtx (V8SImode);
41454 t2 = gen_reg_rtx (V8SImode);
41455 t3 = gen_reg_rtx (V4DImode);
41456 t4 = gen_reg_rtx (V4DImode);
41457 t5 = gen_reg_rtx (V4DImode);
41458
41459 /* Shuffle the lanes around into
41460 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
41461 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
41462 gen_lowpart (V4DImode, d->op1),
41463 GEN_INT (0x20)));
41464 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
41465 gen_lowpart (V4DImode, d->op1),
41466 GEN_INT (0x31)));
41467
41468 /* Swap the 2nd and 3rd position in each lane into
41469 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
41470 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
41471 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
41472 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
41473 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
41474
41475 /* Now a vpunpck[lh]qdq will produce
41476 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
41477 if (odd)
41478 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
41479 gen_lowpart (V4DImode, t2));
41480 else
41481 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
41482 gen_lowpart (V4DImode, t2));
41483 emit_insn (t3);
41484 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
41485 break;
41486
41487 default:
41488 gcc_unreachable ();
41489 }
41490
41491 return true;
41492 }
41493
41494 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
41495 extract-even and extract-odd permutations. */
41496
41497 static bool
41498 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
41499 {
41500 unsigned i, odd, nelt = d->nelt;
41501
41502 odd = d->perm[0];
41503 if (odd != 0 && odd != 1)
41504 return false;
41505
41506 for (i = 1; i < nelt; ++i)
41507 if (d->perm[i] != 2 * i + odd)
41508 return false;
41509
41510 return expand_vec_perm_even_odd_1 (d, odd);
41511 }
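/* E.g. a V8HImode selector { 1 3 5 7 9 11 13 15 } matches with
   ODD == 1 (take every second element starting from element 1 of the
   op0:op1 concatenation) and is handed to expand_vec_perm_even_odd_1.  */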
41512
41513 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
41514 permutations. We assume that expand_vec_perm_1 has already failed. */
41515
41516 static bool
41517 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
41518 {
41519 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
41520 enum machine_mode vmode = d->vmode;
41521 unsigned char perm2[4];
41522 rtx op0 = d->op0, dest;
41523 bool ok;
41524
41525 switch (vmode)
41526 {
41527 case V4DFmode:
41528 case V8SFmode:
41529 /* These are special-cased in sse.md so that we can optionally
41530 use the vbroadcast instruction. They expand to two insns
41531 if the input happens to be in a register. */
41532 gcc_unreachable ();
41533
41534 case V2DFmode:
41535 case V2DImode:
41536 case V4SFmode:
41537 case V4SImode:
41538 /* These are always implementable using standard shuffle patterns. */
41539 gcc_unreachable ();
41540
41541 case V8HImode:
41542 case V16QImode:
41543 /* These can be implemented via interleave. We save one insn by
41544 stopping once we have promoted to V4SImode and then use pshufd. */
41545 do
41546 {
41547 rtx dest;
41548 rtx (*gen) (rtx, rtx, rtx)
41549 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
41550 : gen_vec_interleave_lowv8hi;
41551
41552 if (elt >= nelt2)
41553 {
41554 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
41555 : gen_vec_interleave_highv8hi;
41556 elt -= nelt2;
41557 }
41558 nelt2 /= 2;
41559
41560 dest = gen_reg_rtx (vmode);
41561 emit_insn (gen (dest, op0, op0));
41562 vmode = get_mode_wider_vector (vmode);
41563 op0 = gen_lowpart (vmode, dest);
41564 }
41565 while (vmode != V4SImode);
41566
41567 memset (perm2, elt, 4);
41568 dest = gen_reg_rtx (V4SImode);
41569 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
41570 gcc_assert (ok);
41571 if (!d->testing_p)
41572 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
41573 return true;
41574
41575 case V32QImode:
41576 case V16HImode:
41577 case V8SImode:
41578 case V4DImode:
41579 /* For AVX2 broadcasts of the first element vpbroadcast* or
41580 vpermq should be used by expand_vec_perm_1. */
41581 gcc_assert (!TARGET_AVX2 || d->perm[0]);
41582 return false;
41583
41584 default:
41585 gcc_unreachable ();
41586 }
41587 }
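/* For example, broadcasting byte 5 of a V16QImode operand proceeds as:
   interleave low as V16QImode (element 5 is in the low half), then
   interleave high as V8HImode (5 >= 4, so ELT becomes 1), at which
   point the data is viewed as V4SImode and a pshufd with selector
   { 1 1 1 1 } replicates the dword that now holds four copies of the
   original byte.  */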
41588
41589 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
41590 broadcast permutations. */
41591
41592 static bool
41593 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
41594 {
41595 unsigned i, elt, nelt = d->nelt;
41596
41597 if (!d->one_operand_p)
41598 return false;
41599
41600 elt = d->perm[0];
41601 for (i = 1; i < nelt; ++i)
41602 if (d->perm[i] != elt)
41603 return false;
41604
41605 return expand_vec_perm_broadcast_1 (d);
41606 }
41607
41608 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
41609 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
41610 all the shorter instruction sequences. */
41611
41612 static bool
41613 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
41614 {
41615 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
41616 unsigned int i, nelt, eltsz;
41617 bool used[4];
41618
41619 if (!TARGET_AVX2
41620 || d->one_operand_p
41621 || (d->vmode != V32QImode && d->vmode != V16HImode))
41622 return false;
41623
41624 if (d->testing_p)
41625 return true;
41626
41627 nelt = d->nelt;
41628 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41629
41630 /* Generate 4 permutation masks. If the required element is within
41631 the same lane, it is shuffled in. If the required element is from the
41632 other lane, force a zero by setting bit 7 in the permutation mask.
41633 The other mask has non-negative elements wherever an element is
41634 requested from the other lane, but that element is also moved to the
41635 other lane, so that the result of vpshufb can have its two V2TImode
41636 halves swapped. */
41637 m128 = GEN_INT (-128);
41638 for (i = 0; i < 32; ++i)
41639 {
41640 rperm[0][i] = m128;
41641 rperm[1][i] = m128;
41642 rperm[2][i] = m128;
41643 rperm[3][i] = m128;
41644 }
41645 used[0] = false;
41646 used[1] = false;
41647 used[2] = false;
41648 used[3] = false;
41649 for (i = 0; i < nelt; ++i)
41650 {
41651 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41652 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
41653 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
41654
41655 for (j = 0; j < eltsz; ++j)
41656 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
41657 used[which] = true;
41658 }
41659
41660 for (i = 0; i < 2; ++i)
41661 {
41662 if (!used[2 * i + 1])
41663 {
41664 h[i] = NULL_RTX;
41665 continue;
41666 }
41667 vperm = gen_rtx_CONST_VECTOR (V32QImode,
41668 gen_rtvec_v (32, rperm[2 * i + 1]));
41669 vperm = force_reg (V32QImode, vperm);
41670 h[i] = gen_reg_rtx (V32QImode);
41671 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41672 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
41673 }
41674
41675 /* Swap the 128-bit lanes of h[X]. */
41676 for (i = 0; i < 2; ++i)
41677 {
41678 if (h[i] == NULL_RTX)
41679 continue;
41680 op = gen_reg_rtx (V4DImode);
41681 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
41682 const2_rtx, GEN_INT (3), const0_rtx,
41683 const1_rtx));
41684 h[i] = gen_lowpart (V32QImode, op);
41685 }
41686
41687 for (i = 0; i < 2; ++i)
41688 {
41689 if (!used[2 * i])
41690 {
41691 l[i] = NULL_RTX;
41692 continue;
41693 }
41694 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
41695 vperm = force_reg (V32QImode, vperm);
41696 l[i] = gen_reg_rtx (V32QImode);
41697 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41698 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
41699 }
41700
41701 for (i = 0; i < 2; ++i)
41702 {
41703 if (h[i] && l[i])
41704 {
41705 op = gen_reg_rtx (V32QImode);
41706 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
41707 l[i] = op;
41708 }
41709 else if (h[i])
41710 l[i] = h[i];
41711 }
41712
41713 gcc_assert (l[0] && l[1]);
41714 op = d->target;
41715 if (d->vmode != V32QImode)
41716 op = gen_reg_rtx (V32QImode);
41717 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
41718 if (op != d->target)
41719 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41720 return true;
41721 }
41722
41723 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
41724 With all of the interface bits taken care of, perform the expansion
41725 in D and return true on success. */
41726
41727 static bool
41728 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
41729 {
41730 /* Try a single instruction expansion. */
41731 if (expand_vec_perm_1 (d))
41732 return true;
41733
41734 /* Try sequences of two instructions. */
41735
41736 if (expand_vec_perm_pshuflw_pshufhw (d))
41737 return true;
41738
41739 if (expand_vec_perm_palignr (d))
41740 return true;
41741
41742 if (expand_vec_perm_interleave2 (d))
41743 return true;
41744
41745 if (expand_vec_perm_broadcast (d))
41746 return true;
41747
41748 if (expand_vec_perm_vpermq_perm_1 (d))
41749 return true;
41750
41751 if (expand_vec_perm_vperm2f128 (d))
41752 return true;
41753
41754 /* Try sequences of three instructions. */
41755
41756 if (expand_vec_perm_2vperm2f128_vshuf (d))
41757 return true;
41758
41759 if (expand_vec_perm_pshufb2 (d))
41760 return true;
41761
41762 if (expand_vec_perm_interleave3 (d))
41763 return true;
41764
41765 if (expand_vec_perm_vperm2f128_vblend (d))
41766 return true;
41767
41768 /* Try sequences of four instructions. */
41769
41770 if (expand_vec_perm_vpshufb2_vpermq (d))
41771 return true;
41772
41773 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
41774 return true;
41775
41776 /* ??? Look for narrow permutations whose element orderings would
41777 allow the promotion to a wider mode. */
41778
41779 /* ??? Look for sequences of interleave or a wider permute that place
41780 the data into the correct lanes for a half-vector shuffle like
41781 pshuf[lh]w or vpermilps. */
41782
41783 /* ??? Look for sequences of interleave that produce the desired results.
41784 The combinatorics of punpck[lh] get pretty ugly... */
41785
41786 if (expand_vec_perm_even_odd (d))
41787 return true;
41788
41789 /* Even longer sequences. */
41790 if (expand_vec_perm_vpshufb4_vpermq2 (d))
41791 return true;
41792
41793 return false;
41794 }
41795
41796 /* If a permutation only uses one operand, make it clear. Returns true
41797 if the permutation references both operands. */
41798
41799 static bool
41800 canonicalize_perm (struct expand_vec_perm_d *d)
41801 {
41802 int i, which, nelt = d->nelt;
41803
41804 for (i = which = 0; i < nelt; ++i)
41805 which |= (d->perm[i] < nelt ? 1 : 2);
41806
41807 d->one_operand_p = true;
41808 switch (which)
41809 {
41810 default:
41811 gcc_unreachable();
41812
41813 case 3:
41814 if (!rtx_equal_p (d->op0, d->op1))
41815 {
41816 d->one_operand_p = false;
41817 break;
41818 }
41819 /* The elements of PERM do not suggest that only the first operand
41820 is used, but both operands are identical. Allow easier matching
41821 of the permutation by folding the permutation into the single
41822 input vector. */
41823 /* FALLTHRU */
41824
41825 case 2:
41826 for (i = 0; i < nelt; ++i)
41827 d->perm[i] &= nelt - 1;
41828 d->op0 = d->op1;
41829 break;
41830
41831 case 1:
41832 d->op1 = d->op0;
41833 break;
41834 }
41835
41836 return (which == 3);
41837 }
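/* Two short examples: a V4SImode selector { 4 5 6 7 } references only
   the second operand (WHICH == 2), so it is rewritten as the identity
   { 0 1 2 3 } on op1 and the function returns false; a selector
   { 5 1 4 0 } with op0 == op1 has WHICH == 3, is folded to
   { 1 1 0 0 } on the single input, and the function still
   returns true.  */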
41838
41839 bool
41840 ix86_expand_vec_perm_const (rtx operands[4])
41841 {
41842 struct expand_vec_perm_d d;
41843 unsigned char perm[MAX_VECT_LEN];
41844 int i, nelt;
41845 bool two_args;
41846 rtx sel;
41847
41848 d.target = operands[0];
41849 d.op0 = operands[1];
41850 d.op1 = operands[2];
41851 sel = operands[3];
41852
41853 d.vmode = GET_MODE (d.target);
41854 gcc_assert (VECTOR_MODE_P (d.vmode));
41855 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41856 d.testing_p = false;
41857
41858 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
41859 gcc_assert (XVECLEN (sel, 0) == nelt);
41860 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
41861
41862 for (i = 0; i < nelt; ++i)
41863 {
41864 rtx e = XVECEXP (sel, 0, i);
41865 int ei = INTVAL (e) & (2 * nelt - 1);
41866 d.perm[i] = ei;
41867 perm[i] = ei;
41868 }
41869
41870 two_args = canonicalize_perm (&d);
41871
41872 if (ix86_expand_vec_perm_const_1 (&d))
41873 return true;
41874
41875 /* If the selector says both arguments are needed, but the operands are the
41876 same, the above tried to expand with one_operand_p and flattened selector.
41877 If that didn't work, retry without one_operand_p; we succeeded with that
41878 during testing. */
41879 if (two_args && d.one_operand_p)
41880 {
41881 d.one_operand_p = false;
41882 memcpy (d.perm, perm, sizeof (perm));
41883 return ix86_expand_vec_perm_const_1 (&d);
41884 }
41885
41886 return false;
41887 }
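/* A minimal usage sketch (illustrative only, not part of the original
   sources): reversing a V4SImode register through this entry point.
   Only functions defined in this file or in the RTL middle-end are
   referenced.

     rtx src = gen_reg_rtx (V4SImode);
     rtx sel = gen_rtx_CONST_VECTOR
       (V4SImode, gen_rtvec (4, GEN_INT (3), GEN_INT (2),
                             GEN_INT (1), GEN_INT (0)));
     rtx ops[4] = { gen_reg_rtx (V4SImode), src, src, sel };
     bool ok = ix86_expand_vec_perm_const (ops);

   The selector only references the first operand, so canonicalize_perm
   treats it as a one operand permutation, which a single pshufd can
   implement.  */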
41888
41889 /* Implement targetm.vectorize.vec_perm_const_ok. */
41890
41891 static bool
41892 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
41893 const unsigned char *sel)
41894 {
41895 struct expand_vec_perm_d d;
41896 unsigned int i, nelt, which;
41897 bool ret;
41898
41899 d.vmode = vmode;
41900 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41901 d.testing_p = true;
41902
41903 /* Given sufficient ISA support we can just return true here
41904 for selected vector modes. */
41905 if (GET_MODE_SIZE (d.vmode) == 16)
41906 {
41907 /* All implementable with a single vpperm insn. */
41908 if (TARGET_XOP)
41909 return true;
41910 /* All implementable with 2 pshufb + 1 ior. */
41911 if (TARGET_SSSE3)
41912 return true;
41913 /* All implementable with shufpd or unpck[lh]pd. */
41914 if (d.nelt == 2)
41915 return true;
41916 }
41917
41918 /* Extract the values from the vector CST into the permutation
41919 array in D. */
41920 memcpy (d.perm, sel, nelt);
41921 for (i = which = 0; i < nelt; ++i)
41922 {
41923 unsigned char e = d.perm[i];
41924 gcc_assert (e < 2 * nelt);
41925 which |= (e < nelt ? 1 : 2);
41926 }
41927
41928 /* For all elements from second vector, fold the elements to first. */
41929 if (which == 2)
41930 for (i = 0; i < nelt; ++i)
41931 d.perm[i] -= nelt;
41932
41933 /* Check whether the mask can be applied to the vector type. */
41934 d.one_operand_p = (which != 3);
41935
41936 /* Implementable with shufps or pshufd. */
41937 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
41938 return true;
41939
41940 /* Otherwise we have to go through the motions and see if we can
41941 figure out how to generate the requested permutation. */
41942 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
41943 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
41944 if (!d.one_operand_p)
41945 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
41946
41947 start_sequence ();
41948 ret = ix86_expand_vec_perm_const_1 (&d);
41949 end_sequence ();
41950
41951 return ret;
41952 }
41953
41954 void
41955 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
41956 {
41957 struct expand_vec_perm_d d;
41958 unsigned i, nelt;
41959
41960 d.target = targ;
41961 d.op0 = op0;
41962 d.op1 = op1;
41963 d.vmode = GET_MODE (targ);
41964 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41965 d.one_operand_p = false;
41966 d.testing_p = false;
41967
41968 for (i = 0; i < nelt; ++i)
41969 d.perm[i] = i * 2 + odd;
41970
41971 /* We'll either be able to implement the permutation directly... */
41972 if (expand_vec_perm_1 (&d))
41973 return;
41974
41975 /* ... or we use the special-case patterns. */
41976 expand_vec_perm_even_odd_1 (&d, odd);
41977 }
41978
41979 static void
41980 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
41981 {
41982 struct expand_vec_perm_d d;
41983 unsigned i, nelt, base;
41984 bool ok;
41985
41986 d.target = targ;
41987 d.op0 = op0;
41988 d.op1 = op1;
41989 d.vmode = GET_MODE (targ);
41990 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41991 d.one_operand_p = false;
41992 d.testing_p = false;
41993
41994 base = high_p ? nelt / 2 : 0;
41995 for (i = 0; i < nelt / 2; ++i)
41996 {
41997 d.perm[i * 2] = i + base;
41998 d.perm[i * 2 + 1] = i + base + nelt;
41999 }
42000
42001 /* Note that for AVX this isn't one instruction. */
42002 ok = ix86_expand_vec_perm_const_1 (&d);
42003 gcc_assert (ok);
42004 }
42005
42006
42007 /* Expand a vector operation CODE for a V*QImode in terms of the
42008 same operation on V*HImode. */
42009
42010 void
42011 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
42012 {
42013 enum machine_mode qimode = GET_MODE (dest);
42014 enum machine_mode himode;
42015 rtx (*gen_il) (rtx, rtx, rtx);
42016 rtx (*gen_ih) (rtx, rtx, rtx);
42017 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
42018 struct expand_vec_perm_d d;
42019 bool ok, full_interleave;
42020 bool uns_p = false;
42021 int i;
42022
42023 switch (qimode)
42024 {
42025 case V16QImode:
42026 himode = V8HImode;
42027 gen_il = gen_vec_interleave_lowv16qi;
42028 gen_ih = gen_vec_interleave_highv16qi;
42029 break;
42030 case V32QImode:
42031 himode = V16HImode;
42032 gen_il = gen_avx2_interleave_lowv32qi;
42033 gen_ih = gen_avx2_interleave_highv32qi;
42034 break;
42035 default:
42036 gcc_unreachable ();
42037 }
42038
42039 op2_l = op2_h = op2;
42040 switch (code)
42041 {
42042 case MULT:
42043 /* Unpack data such that we've got a source byte in each low byte of
42044 each word. We don't care what goes into the high byte of each word.
42045 Rather than trying to get zero in there, most convenient is to let
42046 it be a copy of the low byte. */
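/* For example, interleaving the low half of {b0, b1, ..., b15} with itself
gives the words {b0 b0, b1 b1, ...}, so the low byte of every word holds one
multiplicand and the high byte is just a copy of it. */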
42047 op2_l = gen_reg_rtx (qimode);
42048 op2_h = gen_reg_rtx (qimode);
42049 emit_insn (gen_il (op2_l, op2, op2));
42050 emit_insn (gen_ih (op2_h, op2, op2));
42051 /* FALLTHRU */
42052
42053 op1_l = gen_reg_rtx (qimode);
42054 op1_h = gen_reg_rtx (qimode);
42055 emit_insn (gen_il (op1_l, op1, op1));
42056 emit_insn (gen_ih (op1_h, op1, op1));
42057 full_interleave = qimode == V16QImode;
42058 break;
42059
42060 case ASHIFT:
42061 case LSHIFTRT:
42062 uns_p = true;
42063 /* FALLTHRU */
42064 case ASHIFTRT:
42065 op1_l = gen_reg_rtx (himode);
42066 op1_h = gen_reg_rtx (himode);
42067 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
42068 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
42069 full_interleave = true;
42070 break;
42071 default:
42072 gcc_unreachable ();
42073 }
42074
42075 /* Perform the operation. */
42076 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
42077 1, OPTAB_DIRECT);
42078 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
42079 1, OPTAB_DIRECT);
42080 gcc_assert (res_l && res_h);
42081
42082 /* Merge the data back into the right place. */
42083 d.target = dest;
42084 d.op0 = gen_lowpart (qimode, res_l);
42085 d.op1 = gen_lowpart (qimode, res_h);
42086 d.vmode = qimode;
42087 d.nelt = GET_MODE_NUNITS (qimode);
42088 d.one_operand_p = false;
42089 d.testing_p = false;
42090
42091 if (full_interleave)
42092 {
42093 /* For SSE2, we used a full interleave, so the desired
42094 results are in the even elements. */
42095 for (i = 0; i < 32; ++i)
42096 d.perm[i] = i * 2;
42097 }
42098 else
42099 {
42100 /* For AVX, the interleave used above was not cross-lane. So we extract
42101 the even elements, but with the second and third quarters swapped.
42102 Happily, that is even one insn shorter than a plain even extraction. */
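/* Concretely, for V32QImode the selector built below is
{ 0, 2, ..., 14, 32, 34, ..., 46, 16, 18, ..., 30, 48, 50, ..., 62 },
i.e. the even elements of the low lanes of res_l and res_h followed by
those of their high lanes. */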
42103 for (i = 0; i < 32; ++i)
42104 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
42105 }
42106
42107 ok = ix86_expand_vec_perm_const_1 (&d);
42108 gcc_assert (ok);
42109
42110 set_unique_reg_note (get_last_insn (), REG_EQUAL,
42111 gen_rtx_fmt_ee (code, qimode, op1, op2));
42112 }
42113
42114 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
42115 if op is CONST_VECTOR with all odd elements equal to their
42116 preceding element. */
42117
42118 static bool
42119 const_vector_equal_evenodd_p (rtx op)
42120 {
42121 enum machine_mode mode = GET_MODE (op);
42122 int i, nunits = GET_MODE_NUNITS (mode);
42123 if (GET_CODE (op) != CONST_VECTOR
42124 || nunits != CONST_VECTOR_NUNITS (op))
42125 return false;
42126 for (i = 0; i < nunits; i += 2)
42127 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
42128 return false;
42129 return true;
42130 }
42131
42132 void
42133 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
42134 bool uns_p, bool odd_p)
42135 {
42136 enum machine_mode mode = GET_MODE (op1);
42137 enum machine_mode wmode = GET_MODE (dest);
42138 rtx x;
42139 rtx orig_op1 = op1, orig_op2 = op2;
42140
42141 if (!nonimmediate_operand (op1, mode))
42142 op1 = force_reg (mode, op1);
42143 if (!nonimmediate_operand (op2, mode))
42144 op2 = force_reg (mode, op2);
42145
42146 /* We only play even/odd games with vectors of SImode. */
42147 gcc_assert (mode == V4SImode || mode == V8SImode);
42148
42149 /* If we're looking for the odd results, shift those members down to
42150 the even slots. For some cpus this is faster than a PSHUFD. */
42151 if (odd_p)
42152 {
42153 /* For XOP use vpmacsdqh, but only for signed multiplication,
42154 as the instruction handles only the signed case. */
42155 if (TARGET_XOP && mode == V4SImode && !uns_p)
42156 {
42157 x = force_reg (wmode, CONST0_RTX (wmode));
42158 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
42159 return;
42160 }
42161
42162 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
42163 if (!const_vector_equal_evenodd_p (orig_op1))
42164 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
42165 x, NULL, 1, OPTAB_DIRECT);
42166 if (!const_vector_equal_evenodd_p (orig_op2))
42167 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
42168 x, NULL, 1, OPTAB_DIRECT);
42169 op1 = gen_lowpart (mode, op1);
42170 op2 = gen_lowpart (mode, op2);
42171 }
42172
42173 if (mode == V8SImode)
42174 {
42175 if (uns_p)
42176 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
42177 else
42178 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
42179 }
42180 else if (uns_p)
42181 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
42182 else if (TARGET_SSE4_1)
42183 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
42184 else
42185 {
42186 rtx s1, s2, t0, t1, t2;
42187
42188 /* The easiest way to implement this without PMULDQ is to go through
42189 the motions as if we were performing a full 64-bit multiply, except
42190 that we need to do less shuffling of the elements. */
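/* Writing the 32-bit inputs as unsigned values a and b with sign masks
sa = (a < 0 ? -1 : 0) and sb likewise, the signed product satisfies
(a - 2^32*[a<0]) * (b - 2^32*[b<0]) == a*b + ((sa*b + sb*a) << 32) (mod 2^64),
since ((2^32-1)*b) << 32 == -(b << 32) (mod 2^64). That is exactly the
combination of unsigned even multiplies, add and shift computed below. */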
42191
42192 /* Compute the sign-extension, aka highparts, of the two operands. */
42193 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
42194 op1, pc_rtx, pc_rtx);
42195 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
42196 op2, pc_rtx, pc_rtx);
42197
42198 /* Multiply LO(A) * HI(B), and vice-versa. */
42199 t1 = gen_reg_rtx (wmode);
42200 t2 = gen_reg_rtx (wmode);
42201 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
42202 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
42203
42204 /* Multiply LO(A) * LO(B). */
42205 t0 = gen_reg_rtx (wmode);
42206 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
42207
42208 /* Combine and shift the highparts into place. */
42209 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
42210 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
42211 1, OPTAB_DIRECT);
42212
42213 /* Combine high and low parts. */
42214 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
42215 return;
42216 }
42217 emit_insn (x);
42218 }
42219
42220 void
42221 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
42222 bool uns_p, bool high_p)
42223 {
42224 enum machine_mode wmode = GET_MODE (dest);
42225 enum machine_mode mode = GET_MODE (op1);
42226 rtx t1, t2, t3, t4, mask;
42227
42228 switch (mode)
42229 {
42230 case V4SImode:
42231 t1 = gen_reg_rtx (mode);
42232 t2 = gen_reg_rtx (mode);
42233 if (TARGET_XOP && !uns_p)
42234 {
42235 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
42236 shuffle the elements once so that all elements are in the right
42237 place for immediate use: { A C B D }. */
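/* With the operands reordered to { A C B D }, the even-element widening
multiply yields the products of the original elements 0 and 1, while the
XOP odd-element multiply (vpmacsdqh) yields those of elements 2 and 3,
so no further shuffling is needed before ix86_expand_mul_widen_evenodd. */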
42238 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
42239 const1_rtx, GEN_INT (3)));
42240 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
42241 const1_rtx, GEN_INT (3)));
42242 }
42243 else
42244 {
42245 /* Put the elements into place for the multiply. */
42246 ix86_expand_vec_interleave (t1, op1, op1, high_p);
42247 ix86_expand_vec_interleave (t2, op2, op2, high_p);
42248 high_p = false;
42249 }
42250 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
42251 break;
42252
42253 case V8SImode:
42254 /* Shuffle the elements between the lanes. After this we
42255 have { A B E F | C D G H } for each operand. */
42256 t1 = gen_reg_rtx (V4DImode);
42257 t2 = gen_reg_rtx (V4DImode);
42258 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
42259 const0_rtx, const2_rtx,
42260 const1_rtx, GEN_INT (3)));
42261 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
42262 const0_rtx, const2_rtx,
42263 const1_rtx, GEN_INT (3)));
42264
42265 /* Shuffle the elements within the lanes. After this we
42266 have { A A B B | C C D D } or { E E F F | G G H H }. */
42267 t3 = gen_reg_rtx (V8SImode);
42268 t4 = gen_reg_rtx (V8SImode);
42269 mask = GEN_INT (high_p
42270 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
42271 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
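/* The pshufd immediate encodes one 2-bit source selector per destination
element, so the value built above is { 2, 2, 3, 3 } for the high halves
and { 0, 0, 1, 1 } for the low halves of each lane. */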
42272 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
42273 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
42274
42275 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
42276 break;
42277
42278 case V8HImode:
42279 case V16HImode:
42280 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
42281 uns_p, OPTAB_DIRECT);
42282 t2 = expand_binop (mode,
42283 uns_p ? umul_highpart_optab : smul_highpart_optab,
42284 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
42285 gcc_assert (t1 && t2);
42286
42287 t3 = gen_reg_rtx (mode);
42288 ix86_expand_vec_interleave (t3, t1, t2, high_p);
42289 emit_move_insn (dest, gen_lowpart (wmode, t3));
42290 break;
42291
42292 case V16QImode:
42293 case V32QImode:
42294 t1 = gen_reg_rtx (wmode);
42295 t2 = gen_reg_rtx (wmode);
42296 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
42297 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
42298
42299 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
42300 break;
42301
42302 default:
42303 gcc_unreachable ();
42304 }
42305 }
42306
42307 void
42308 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
42309 {
42310 rtx res_1, res_2, res_3, res_4;
42311
42312 res_1 = gen_reg_rtx (V4SImode);
42313 res_2 = gen_reg_rtx (V4SImode);
42314 res_3 = gen_reg_rtx (V2DImode);
42315 res_4 = gen_reg_rtx (V2DImode);
42316 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
42317 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
42318
42319 /* Move the results in element 2 down to element 1; we don't care
42320 what goes in elements 2 and 3. Then we can merge the parts
42321 back together with an interleave.
42322
42323 Note that two other sequences were tried:
42324 (1) Use interleaves at the start instead of psrldq, which allows
42325 us to use a single shufps to merge things back at the end.
42326 (2) Use shufps here to combine the two vectors, then pshufd to
42327 put the elements in the correct order.
42328 In both cases the cost of the reformatting stall was too high
42329 and the overall sequence slower. */
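/* With op1 = { a0, a1, a2, a3 } and op2 = { b0, b1, b2, b3 }, res_3 holds
the 64-bit products { a0*b0, a2*b2 } and res_4 holds { a1*b1, a3*b3 }.
The pshufd copies below move the low 32 bits of each product into
elements 0 and 1, and the final interleave produces
{ a0*b0, a1*b1, a2*b2, a3*b3 } modulo 2^32. */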
42330
42331 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
42332 const0_rtx, const2_rtx,
42333 const0_rtx, const0_rtx));
42334 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
42335 const0_rtx, const2_rtx,
42336 const0_rtx, const0_rtx));
42337 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
42338
42339 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
42340 }
42341
42342 void
42343 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
42344 {
42345 enum machine_mode mode = GET_MODE (op0);
42346 rtx t1, t2, t3, t4, t5, t6;
42347
42348 if (TARGET_XOP && mode == V2DImode)
42349 {
42350 /* op1: A,B,C,D, op2: E,F,G,H */
42351 op1 = gen_lowpart (V4SImode, op1);
42352 op2 = gen_lowpart (V4SImode, op2);
42353
42354 t1 = gen_reg_rtx (V4SImode);
42355 t2 = gen_reg_rtx (V4SImode);
42356 t3 = gen_reg_rtx (V2DImode);
42357 t4 = gen_reg_rtx (V2DImode);
42358
42359 /* t1: B,A,D,C */
42360 emit_insn (gen_sse2_pshufd_1 (t1, op1,
42361 GEN_INT (1),
42362 GEN_INT (0),
42363 GEN_INT (3),
42364 GEN_INT (2)));
42365
42366 /* t2: (B*E),(A*F),(D*G),(C*H) */
42367 emit_insn (gen_mulv4si3 (t2, t1, op2));
42368
42369 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
42370 emit_insn (gen_xop_phadddq (t3, t2));
42371
42372 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
42373 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
42374
42375 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
42376 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
42377 }
42378 else
42379 {
42380 enum machine_mode nmode;
42381 rtx (*umul) (rtx, rtx, rtx);
42382
42383 if (mode == V2DImode)
42384 {
42385 umul = gen_vec_widen_umult_even_v4si;
42386 nmode = V4SImode;
42387 }
42388 else if (mode == V4DImode)
42389 {
42390 umul = gen_vec_widen_umult_even_v8si;
42391 nmode = V8SImode;
42392 }
42393 else
42394 gcc_unreachable ();
42395
42396
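/* Split each 64-bit element into 32-bit halves hi:lo; then, modulo 2^64,
a * b = lo(a)*lo(b) + ((hi(a)*lo(b) + hi(b)*lo(a)) << 32),
so only the three unsigned 32x32->64 bit multiplies below are needed. */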
42397 /* Multiply low parts. */
42398 t1 = gen_reg_rtx (mode);
42399 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
42400
42401 /* Shift input vectors right 32 bits so we can multiply high parts. */
42402 t6 = GEN_INT (32);
42403 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
42404 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
42405
42406 /* Multiply high parts by low parts. */
42407 t4 = gen_reg_rtx (mode);
42408 t5 = gen_reg_rtx (mode);
42409 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
42410 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
42411
42412 /* Combine and shift the highparts back. */
42413 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
42414 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
42415
42416 /* Combine high and low parts. */
42417 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
42418 }
42419
42420 set_unique_reg_note (get_last_insn (), REG_EQUAL,
42421 gen_rtx_MULT (mode, op1, op2));
42422 }
42423
42424 /* Return 1 if the control transfer instruction INSN
42425 should be encoded with a bnd prefix.
42426 If INSN is NULL, return 1 when control
42427 transfer instructions should be prefixed with
42428 bnd by default for the current function. */
42429
42430 bool
42431 ix86_bnd_prefixed_insn_p (rtx insn ATTRIBUTE_UNUSED)
42432 {
42433 return false;
42434 }
42435
42436 /* Calculate integer abs() using only SSE2 instructions. */
42437
42438 void
42439 ix86_expand_sse2_abs (rtx target, rtx input)
42440 {
42441 enum machine_mode mode = GET_MODE (target);
42442 rtx tmp0, tmp1, x;
42443
42444 switch (mode)
42445 {
42446 /* For 32-bit signed integer X, the best way to calculate the absolute
42447 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
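/* E.g. for X = -5: X >> 31 == -1, (-1 ^ -5) == 4 and 4 - (-1) == 5,
while for non-negative X the shift yields 0 and X is left unchanged. */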
42448 case V4SImode:
42449 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
42450 GEN_INT (GET_MODE_BITSIZE
42451 (GET_MODE_INNER (mode)) - 1),
42452 NULL, 0, OPTAB_DIRECT);
42453 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
42454 NULL, 0, OPTAB_DIRECT);
42455 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
42456 target, 0, OPTAB_DIRECT);
42457 break;
42458
42459 /* For 16-bit signed integer X, the best way to calculate the absolute
42460 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
42461 case V8HImode:
42462 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
42463
42464 x = expand_simple_binop (mode, SMAX, tmp0, input,
42465 target, 0, OPTAB_DIRECT);
42466 break;
42467
42468 /* For 8-bit signed integer X, the best way to calculate the absolute
42469 value of X is min ((unsigned char) X, (unsigned char) (-X)),
42470 as SSE2 provides the PMINUB insn. */
42471 case V16QImode:
42472 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
42473
42474 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
42475 target, 0, OPTAB_DIRECT);
42476 break;
42477
42478 default:
42479 gcc_unreachable ();
42480 }
42481
42482 if (x != target)
42483 emit_move_insn (target, x);
42484 }
42485
42486 /* Expand an insert into a vector register through pinsr insn.
42487 Return true if successful. */
42488
42489 bool
42490 ix86_expand_pinsr (rtx *operands)
42491 {
42492 rtx dst = operands[0];
42493 rtx src = operands[3];
42494
42495 unsigned int size = INTVAL (operands[1]);
42496 unsigned int pos = INTVAL (operands[2]);
42497
42498 if (GET_CODE (dst) == SUBREG)
42499 {
42500 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
42501 dst = SUBREG_REG (dst);
42502 }
42503
42504 if (GET_CODE (src) == SUBREG)
42505 src = SUBREG_REG (src);
42506
42507 switch (GET_MODE (dst))
42508 {
42509 case V16QImode:
42510 case V8HImode:
42511 case V4SImode:
42512 case V2DImode:
42513 {
42514 enum machine_mode srcmode, dstmode;
42515 rtx (*pinsr)(rtx, rtx, rtx, rtx);
42516
42517 srcmode = mode_for_size (size, MODE_INT, 0);
42518
42519 switch (srcmode)
42520 {
42521 case QImode:
42522 if (!TARGET_SSE4_1)
42523 return false;
42524 dstmode = V16QImode;
42525 pinsr = gen_sse4_1_pinsrb;
42526 break;
42527
42528 case HImode:
42529 if (!TARGET_SSE2)
42530 return false;
42531 dstmode = V8HImode;
42532 pinsr = gen_sse2_pinsrw;
42533 break;
42534
42535 case SImode:
42536 if (!TARGET_SSE4_1)
42537 return false;
42538 dstmode = V4SImode;
42539 pinsr = gen_sse4_1_pinsrd;
42540 break;
42541
42542 case DImode:
42543 gcc_assert (TARGET_64BIT);
42544 if (!TARGET_SSE4_1)
42545 return false;
42546 dstmode = V2DImode;
42547 pinsr = gen_sse4_1_pinsrq;
42548 break;
42549
42550 default:
42551 return false;
42552 }
42553
42554 rtx d = dst;
42555 if (GET_MODE (dst) != dstmode)
42556 d = gen_reg_rtx (dstmode);
42557 src = gen_lowpart (srcmode, src);
42558
42559 pos /= size;
42560
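/* The pinsr patterns are written as a vec_merge, so the immediate passed
below is a one-bit write mask selecting the destination element rather
than the raw element index. */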
42561 emit_insn (pinsr (d, gen_lowpart (dstmode, dst), src,
42562 GEN_INT (1 << pos)));
42563 if (d != dst)
42564 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
42565 return true;
42566 }
42567
42568 default:
42569 return false;
42570 }
42571 }
42572 \f
42573 /* This function returns the calling-ABI-specific va_list type node,
42574 i.e. the va_list type appropriate for FNDECL. */
42575
42576 static tree
42577 ix86_fn_abi_va_list (tree fndecl)
42578 {
42579 if (!TARGET_64BIT)
42580 return va_list_type_node;
42581 gcc_assert (fndecl != NULL_TREE);
42582
42583 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
42584 return ms_va_list_type_node;
42585 else
42586 return sysv_va_list_type_node;
42587 }
42588
42589 /* Returns the canonical va_list type specified by TYPE. If there
42590 is no valid TYPE provided, it returns NULL_TREE. */
42591
42592 static tree
42593 ix86_canonical_va_list_type (tree type)
42594 {
42595 tree wtype, htype;
42596
42597 /* Resolve references and pointers to va_list type. */
42598 if (TREE_CODE (type) == MEM_REF)
42599 type = TREE_TYPE (type);
42600 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
42601 type = TREE_TYPE (type);
42602 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
42603 type = TREE_TYPE (type);
42604
42605 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
42606 {
42607 wtype = va_list_type_node;
42608 gcc_assert (wtype != NULL_TREE);
42609 htype = type;
42610 if (TREE_CODE (wtype) == ARRAY_TYPE)
42611 {
42612 /* If va_list is an array type, the argument may have decayed
42613 to a pointer type, e.g. by being passed to another function.
42614 In that case, unwrap both types so that we can compare the
42615 underlying records. */
42616 if (TREE_CODE (htype) == ARRAY_TYPE
42617 || POINTER_TYPE_P (htype))
42618 {
42619 wtype = TREE_TYPE (wtype);
42620 htype = TREE_TYPE (htype);
42621 }
42622 }
42623 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42624 return va_list_type_node;
42625 wtype = sysv_va_list_type_node;
42626 gcc_assert (wtype != NULL_TREE);
42627 htype = type;
42628 if (TREE_CODE (wtype) == ARRAY_TYPE)
42629 {
42630 /* If va_list is an array type, the argument may have decayed
42631 to a pointer type, e.g. by being passed to another function.
42632 In that case, unwrap both types so that we can compare the
42633 underlying records. */
42634 if (TREE_CODE (htype) == ARRAY_TYPE
42635 || POINTER_TYPE_P (htype))
42636 {
42637 wtype = TREE_TYPE (wtype);
42638 htype = TREE_TYPE (htype);
42639 }
42640 }
42641 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42642 return sysv_va_list_type_node;
42643 wtype = ms_va_list_type_node;
42644 gcc_assert (wtype != NULL_TREE);
42645 htype = type;
42646 if (TREE_CODE (wtype) == ARRAY_TYPE)
42647 {
42648 /* If va_list is an array type, the argument may have decayed
42649 to a pointer type, e.g. by being passed to another function.
42650 In that case, unwrap both types so that we can compare the
42651 underlying records. */
42652 if (TREE_CODE (htype) == ARRAY_TYPE
42653 || POINTER_TYPE_P (htype))
42654 {
42655 wtype = TREE_TYPE (wtype);
42656 htype = TREE_TYPE (htype);
42657 }
42658 }
42659 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42660 return ms_va_list_type_node;
42661 return NULL_TREE;
42662 }
42663 return std_canonical_va_list_type (type);
42664 }
42665
42666 /* Iterate through the target-specific builtin types for va_list.
42667 IDX denotes the iterator, *PTREE is set to the result type of
42668 the va_list builtin, and *PNAME to its internal type.
42669 Returns zero if there is no element for this index, otherwise
42670 IDX should be increased upon the next call.
42671 Note, do not iterate a base builtin's name like __builtin_va_list.
42672 Used from c_common_nodes_and_builtins. */
42673
42674 static int
42675 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
42676 {
42677 if (TARGET_64BIT)
42678 {
42679 switch (idx)
42680 {
42681 default:
42682 break;
42683
42684 case 0:
42685 *ptree = ms_va_list_type_node;
42686 *pname = "__builtin_ms_va_list";
42687 return 1;
42688
42689 case 1:
42690 *ptree = sysv_va_list_type_node;
42691 *pname = "__builtin_sysv_va_list";
42692 return 1;
42693 }
42694 }
42695
42696 return 0;
42697 }
42698
42699 #undef TARGET_SCHED_DISPATCH
42700 #define TARGET_SCHED_DISPATCH has_dispatch
42701 #undef TARGET_SCHED_DISPATCH_DO
42702 #define TARGET_SCHED_DISPATCH_DO do_dispatch
42703 #undef TARGET_SCHED_REASSOCIATION_WIDTH
42704 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
42705 #undef TARGET_SCHED_REORDER
42706 #define TARGET_SCHED_REORDER ix86_sched_reorder
42707 #undef TARGET_SCHED_ADJUST_PRIORITY
42708 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
42709 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
42710 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
42711 ix86_dependencies_evaluation_hook
42712
42713 /* The size of the dispatch window is the total number of bytes of
42714 object code allowed in a window. */
42715 #define DISPATCH_WINDOW_SIZE 16
42716
42717 /* Number of dispatch windows considered for scheduling. */
42718 #define MAX_DISPATCH_WINDOWS 3
42719
42720 /* Maximum number of instructions in a window. */
42721 #define MAX_INSN 4
42722
42723 /* Maximum number of immediate operands in a window. */
42724 #define MAX_IMM 4
42725
42726 /* Maximum number of immediate bits allowed in a window. */
42727 #define MAX_IMM_SIZE 128
42728
42729 /* Maximum number of 32 bit immediates allowed in a window. */
42730 #define MAX_IMM_32 4
42731
42732 /* Maximum number of 64 bit immediates allowed in a window. */
42733 #define MAX_IMM_64 2
42734
42735 /* Maximum total of loads or prefetches allowed in a window. */
42736 #define MAX_LOAD 2
42737
42738 /* Maximum total of stores allowed in a window. */
42739 #define MAX_STORE 1
42740
42741 #undef BIG
42742 #define BIG 100
42743
42744
42745 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
42746 enum dispatch_group {
42747 disp_no_group = 0,
42748 disp_load,
42749 disp_store,
42750 disp_load_store,
42751 disp_prefetch,
42752 disp_imm,
42753 disp_imm_32,
42754 disp_imm_64,
42755 disp_branch,
42756 disp_cmp,
42757 disp_jcc,
42758 disp_last
42759 };
42760
42761 /* Number of allowable groups in a dispatch window. It is an array
42762 indexed by the dispatch_group enum. 100 is used as a big number
42763 because the number of these kinds of operations does not have any
42764 effect in the dispatch window, but we need them for other reasons in
42765 the table. */
42766 static unsigned int num_allowable_groups[disp_last] = {
42767 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
42768 };
42769
42770 char group_name[disp_last + 1][16] = {
42771 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
42772 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
42773 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
42774 };
42775
42776 /* Instruction path. */
42777 enum insn_path {
42778 no_path = 0,
42779 path_single, /* Single micro op. */
42780 path_double, /* Double micro op. */
42781 path_multi, /* Instructions with more than 2 micro ops. */
42782 last_path
42783 };
42784
42785 /* sched_insn_info defines a window to the instructions scheduled in
42786 the basic block. It contains a pointer to the insn_info table and
42787 the instruction scheduled.
42788
42789 Windows are allocated for each basic block and are linked
42790 together. */
42791 typedef struct sched_insn_info_s {
42792 rtx insn;
42793 enum dispatch_group group;
42794 enum insn_path path;
42795 int byte_len;
42796 int imm_bytes;
42797 } sched_insn_info;
42798
42799 /* Linked list of dispatch windows. This is a two way list of
42800 dispatch windows of a basic block. It contains information about
42801 the number of uops in the window and the total number of
42802 instructions and of bytes in the object code for this dispatch
42803 window. */
42804 typedef struct dispatch_windows_s {
42805 int num_insn; /* Number of insn in the window. */
42806 int num_uops; /* Number of uops in the window. */
42807 int window_size; /* Number of bytes in the window. */
42808 int window_num; /* Window number, either 0 or 1. */
42809 int num_imm; /* Number of immediates in an insn. */
42810 int num_imm_32; /* Number of 32 bit immediates in an insn. */
42811 int num_imm_64; /* Number of 64 bit immediates in an insn. */
42812 int imm_size; /* Total immediates in the window. */
42813 int num_loads; /* Total memory loads in the window. */
42814 int num_stores; /* Total memory stores in the window. */
42815 int violation; /* Violation exists in window. */
42816 sched_insn_info *window; /* Pointer to the window. */
42817 struct dispatch_windows_s *next;
42818 struct dispatch_windows_s *prev;
42819 } dispatch_windows;
42820
42821 /* Immediate values used in an insn. */
42822 typedef struct imm_info_s
42823 {
42824 int imm;
42825 int imm32;
42826 int imm64;
42827 } imm_info;
42828
42829 static dispatch_windows *dispatch_window_list;
42830 static dispatch_windows *dispatch_window_list1;
42831
42832 /* Get dispatch group of insn. */
42833
42834 static enum dispatch_group
42835 get_mem_group (rtx insn)
42836 {
42837 enum attr_memory memory;
42838
42839 if (INSN_CODE (insn) < 0)
42840 return disp_no_group;
42841 memory = get_attr_memory (insn);
42842 if (memory == MEMORY_STORE)
42843 return disp_store;
42844
42845 if (memory == MEMORY_LOAD)
42846 return disp_load;
42847
42848 if (memory == MEMORY_BOTH)
42849 return disp_load_store;
42850
42851 return disp_no_group;
42852 }
42853
42854 /* Return true if insn is a compare instruction. */
42855
42856 static bool
42857 is_cmp (rtx insn)
42858 {
42859 enum attr_type type;
42860
42861 type = get_attr_type (insn);
42862 return (type == TYPE_TEST
42863 || type == TYPE_ICMP
42864 || type == TYPE_FCMP
42865 || GET_CODE (PATTERN (insn)) == COMPARE);
42866 }
42867
42868 /* Return true if a dispatch violation was encountered. */
42869
42870 static bool
42871 dispatch_violation (void)
42872 {
42873 if (dispatch_window_list->next)
42874 return dispatch_window_list->next->violation;
42875 return dispatch_window_list->violation;
42876 }
42877
42878 /* Return true if insn is a branch instruction. */
42879
42880 static bool
42881 is_branch (rtx insn)
42882 {
42883 return (CALL_P (insn) || JUMP_P (insn));
42884 }
42885
42886 /* Return true if insn is a prefetch instruction. */
42887
42888 static bool
42889 is_prefetch (rtx insn)
42890 {
42891 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
42892 }
42893
42894 /* This function initializes a dispatch window and the list container holding a
42895 pointer to the window. */
42896
42897 static void
42898 init_window (int window_num)
42899 {
42900 int i;
42901 dispatch_windows *new_list;
42902
42903 if (window_num == 0)
42904 new_list = dispatch_window_list;
42905 else
42906 new_list = dispatch_window_list1;
42907
42908 new_list->num_insn = 0;
42909 new_list->num_uops = 0;
42910 new_list->window_size = 0;
42911 new_list->next = NULL;
42912 new_list->prev = NULL;
42913 new_list->window_num = window_num;
42914 new_list->num_imm = 0;
42915 new_list->num_imm_32 = 0;
42916 new_list->num_imm_64 = 0;
42917 new_list->imm_size = 0;
42918 new_list->num_loads = 0;
42919 new_list->num_stores = 0;
42920 new_list->violation = false;
42921
42922 for (i = 0; i < MAX_INSN; i++)
42923 {
42924 new_list->window[i].insn = NULL;
42925 new_list->window[i].group = disp_no_group;
42926 new_list->window[i].path = no_path;
42927 new_list->window[i].byte_len = 0;
42928 new_list->window[i].imm_bytes = 0;
42929 }
42930 return;
42931 }
42932
42933 /* This function allocates and initializes a dispatch window and the
42934 list container holding a pointer to the window. */
42935
42936 static dispatch_windows *
42937 allocate_window (void)
42938 {
42939 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
42940 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
42941
42942 return new_list;
42943 }
42944
42945 /* This routine initializes the dispatch scheduling information. It
42946 initiates building dispatch scheduler tables and constructs the
42947 first dispatch window. */
42948
42949 static void
42950 init_dispatch_sched (void)
42951 {
42952 /* Allocate a dispatch list and a window. */
42953 dispatch_window_list = allocate_window ();
42954 dispatch_window_list1 = allocate_window ();
42955 init_window (0);
42956 init_window (1);
42957 }
42958
42959 /* This function returns true if a branch is detected. End of a basic block
42960 does not have to be a branch, but here we assume only branches end a
42961 window. */
42962
42963 static bool
42964 is_end_basic_block (enum dispatch_group group)
42965 {
42966 return group == disp_branch;
42967 }
42968
42969 /* This function is called when processing of a window has ended. */
42970
42971 static void
42972 process_end_window (void)
42973 {
42974 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
42975 if (dispatch_window_list->next)
42976 {
42977 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
42978 gcc_assert (dispatch_window_list->window_size
42979 + dispatch_window_list1->window_size <= 48);
42980 init_window (1);
42981 }
42982 init_window (0);
42983 }
42984
42985 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
42986 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
42987 for 48 bytes of instructions. Note that these windows are not dispatch
42988 windows whose sizes are DISPATCH_WINDOW_SIZE. */
42989
42990 static dispatch_windows *
42991 allocate_next_window (int window_num)
42992 {
42993 if (window_num == 0)
42994 {
42995 if (dispatch_window_list->next)
42996 init_window (1);
42997 init_window (0);
42998 return dispatch_window_list;
42999 }
43000
43001 dispatch_window_list->next = dispatch_window_list1;
43002 dispatch_window_list1->prev = dispatch_window_list;
43003
43004 return dispatch_window_list1;
43005 }
43006
43007 /* Increment the number of immediate operands of an instruction. */
43008
43009 static int
43010 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
43011 {
43012 if (*in_rtx == 0)
43013 return 0;
43014
43015 switch (GET_CODE (*in_rtx))
43016 {
43017 case CONST:
43018 case SYMBOL_REF:
43019 case CONST_INT:
43020 (imm_values->imm)++;
43021 if (x86_64_immediate_operand (*in_rtx, SImode))
43022 (imm_values->imm32)++;
43023 else
43024 (imm_values->imm64)++;
43025 break;
43026
43027 case CONST_DOUBLE:
43028 (imm_values->imm)++;
43029 (imm_values->imm64)++;
43030 break;
43031
43032 case CODE_LABEL:
43033 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
43034 {
43035 (imm_values->imm)++;
43036 (imm_values->imm32)++;
43037 }
43038 break;
43039
43040 default:
43041 break;
43042 }
43043
43044 return 0;
43045 }
43046
43047 /* Compute number of immediate operands of an instruction. */
43048
43049 static void
43050 find_constant (rtx in_rtx, imm_info *imm_values)
43051 {
43052 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
43053 (rtx_function) find_constant_1, (void *) imm_values);
43054 }
43055
43056 /* Return the total size of the immediate operands of an instruction along
43057 with the number of corresponding immediate operands. It initializes its
43058 parameters to zero before calling FIND_CONSTANT.
43059 INSN is the input instruction. IMM is the total of immediates.
43060 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
43061 bit immediates. */
43062
43063 static int
43064 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
43065 {
43066 imm_info imm_values = {0, 0, 0};
43067
43068 find_constant (insn, &imm_values);
43069 *imm = imm_values.imm;
43070 *imm32 = imm_values.imm32;
43071 *imm64 = imm_values.imm64;
43072 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
43073 }
43074
43075 /* This function indicates whether INSN has at least one immediate
43076 operand. */
43077
43078 static bool
43079 has_immediate (rtx insn)
43080 {
43081 int num_imm_operand;
43082 int num_imm32_operand;
43083 int num_imm64_operand;
43084
43085 if (insn)
43086 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43087 &num_imm64_operand);
43088 return false;
43089 }
43090
43091 /* Return single or double path for instructions. */
43092
43093 static enum insn_path
43094 get_insn_path (rtx insn)
43095 {
43096 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
43097
43098 if ((int)path == 0)
43099 return path_single;
43100
43101 if ((int)path == 1)
43102 return path_double;
43103
43104 return path_multi;
43105 }
43106
43107 /* Return insn dispatch group. */
43108
43109 static enum dispatch_group
43110 get_insn_group (rtx insn)
43111 {
43112 enum dispatch_group group = get_mem_group (insn);
43113 if (group)
43114 return group;
43115
43116 if (is_branch (insn))
43117 return disp_branch;
43118
43119 if (is_cmp (insn))
43120 return disp_cmp;
43121
43122 if (has_immediate (insn))
43123 return disp_imm;
43124
43125 if (is_prefetch (insn))
43126 return disp_prefetch;
43127
43128 return disp_no_group;
43129 }
43130
43131 /* Count number of GROUP restricted instructions in a dispatch
43132 window WINDOW_LIST. */
43133
43134 static int
43135 count_num_restricted (rtx insn, dispatch_windows *window_list)
43136 {
43137 enum dispatch_group group = get_insn_group (insn);
43138 int imm_size;
43139 int num_imm_operand;
43140 int num_imm32_operand;
43141 int num_imm64_operand;
43142
43143 if (group == disp_no_group)
43144 return 0;
43145
43146 if (group == disp_imm)
43147 {
43148 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43149 &num_imm64_operand);
43150 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
43151 || num_imm_operand + window_list->num_imm > MAX_IMM
43152 || (num_imm32_operand > 0
43153 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
43154 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
43155 || (num_imm64_operand > 0
43156 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
43157 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
43158 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
43159 && num_imm64_operand > 0
43160 && ((window_list->num_imm_64 > 0
43161 && window_list->num_insn >= 2)
43162 || window_list->num_insn >= 3)))
43163 return BIG;
43164
43165 return 1;
43166 }
43167
43168 if ((group == disp_load_store
43169 && (window_list->num_loads >= MAX_LOAD
43170 || window_list->num_stores >= MAX_STORE))
43171 || ((group == disp_load
43172 || group == disp_prefetch)
43173 && window_list->num_loads >= MAX_LOAD)
43174 || (group == disp_store
43175 && window_list->num_stores >= MAX_STORE))
43176 return BIG;
43177
43178 return 1;
43179 }
43180
43181 /* This function returns true if insn satisfies dispatch rules on the
43182 last window scheduled. */
43183
43184 static bool
43185 fits_dispatch_window (rtx insn)
43186 {
43187 dispatch_windows *window_list = dispatch_window_list;
43188 dispatch_windows *window_list_next = dispatch_window_list->next;
43189 unsigned int num_restrict;
43190 enum dispatch_group group = get_insn_group (insn);
43191 enum insn_path path = get_insn_path (insn);
43192 int sum;
43193
43194 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
43195 instructions should be given the lowest priority in the
43196 scheduling process in Haifa scheduler to make sure they will be
43197 scheduled in the same dispatch window as the reference to them. */
43198 if (group == disp_jcc || group == disp_cmp)
43199 return false;
43200
43201 /* Check nonrestricted. */
43202 if (group == disp_no_group || group == disp_branch)
43203 return true;
43204
43205 /* Get last dispatch window. */
43206 if (window_list_next)
43207 window_list = window_list_next;
43208
43209 if (window_list->window_num == 1)
43210 {
43211 sum = window_list->prev->window_size + window_list->window_size;
43212
43213 if (sum == 32
43214 || (min_insn_size (insn) + sum) >= 48)
43215 /* Window 1 is full. Go for next window. */
43216 return true;
43217 }
43218
43219 num_restrict = count_num_restricted (insn, window_list);
43220
43221 if (num_restrict > num_allowable_groups[group])
43222 return false;
43223
43224 /* See if it fits in the first window. */
43225 if (window_list->window_num == 0)
43226 {
43227 /* The first window should have only single and double path
43228 uops. */
43229 if (path == path_double
43230 && (window_list->num_uops + 2) > MAX_INSN)
43231 return false;
43232 else if (path != path_single)
43233 return false;
43234 }
43235 return true;
43236 }
43237
43238 /* Add an instruction INSN with NUM_UOPS micro-operations to the
43239 dispatch window WINDOW_LIST. */
43240
43241 static void
43242 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
43243 {
43244 int byte_len = min_insn_size (insn);
43245 int num_insn = window_list->num_insn;
43246 int imm_size;
43247 sched_insn_info *window = window_list->window;
43248 enum dispatch_group group = get_insn_group (insn);
43249 enum insn_path path = get_insn_path (insn);
43250 int num_imm_operand;
43251 int num_imm32_operand;
43252 int num_imm64_operand;
43253
43254 if (!window_list->violation && group != disp_cmp
43255 && !fits_dispatch_window (insn))
43256 window_list->violation = true;
43257
43258 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43259 &num_imm64_operand);
43260
43261 /* Initialize window with new instruction. */
43262 window[num_insn].insn = insn;
43263 window[num_insn].byte_len = byte_len;
43264 window[num_insn].group = group;
43265 window[num_insn].path = path;
43266 window[num_insn].imm_bytes = imm_size;
43267
43268 window_list->window_size += byte_len;
43269 window_list->num_insn = num_insn + 1;
43270 window_list->num_uops = window_list->num_uops + num_uops;
43271 window_list->imm_size += imm_size;
43272 window_list->num_imm += num_imm_operand;
43273 window_list->num_imm_32 += num_imm32_operand;
43274 window_list->num_imm_64 += num_imm64_operand;
43275
43276 if (group == disp_store)
43277 window_list->num_stores += 1;
43278 else if (group == disp_load
43279 || group == disp_prefetch)
43280 window_list->num_loads += 1;
43281 else if (group == disp_load_store)
43282 {
43283 window_list->num_stores += 1;
43284 window_list->num_loads += 1;
43285 }
43286 }
43287
43288 /* Adds a scheduled instruction, INSN, to the current dispatch window.
43289 If the total bytes of instructions or the number of instructions in
43290 the window exceed the allowable limits, it allocates a new window. */
43291
43292 static void
43293 add_to_dispatch_window (rtx insn)
43294 {
43295 int byte_len;
43296 dispatch_windows *window_list;
43297 dispatch_windows *next_list;
43298 dispatch_windows *window0_list;
43299 enum insn_path path;
43300 enum dispatch_group insn_group;
43301 bool insn_fits;
43302 int num_insn;
43303 int num_uops;
43304 int window_num;
43305 int insn_num_uops;
43306 int sum;
43307
43308 if (INSN_CODE (insn) < 0)
43309 return;
43310
43311 byte_len = min_insn_size (insn);
43312 window_list = dispatch_window_list;
43313 next_list = window_list->next;
43314 path = get_insn_path (insn);
43315 insn_group = get_insn_group (insn);
43316
43317 /* Get the last dispatch window. */
43318 if (next_list)
43319 window_list = dispatch_window_list->next;
43320
43321 if (path == path_single)
43322 insn_num_uops = 1;
43323 else if (path == path_double)
43324 insn_num_uops = 2;
43325 else
43326 insn_num_uops = (int) path;
43327
43328 /* If the current window is full, get a new window.
43329 Window number zero is full if MAX_INSN uops are scheduled in it.
43330 Window number one is full if window zero's bytes plus window
43331 one's bytes is 32, if the bytes of the new instruction added
43332 to the total make it greater than 48, or if it already has MAX_INSN
43333 instructions in it. */
43334 num_insn = window_list->num_insn;
43335 num_uops = window_list->num_uops;
43336 window_num = window_list->window_num;
43337 insn_fits = fits_dispatch_window (insn);
43338
43339 if (num_insn >= MAX_INSN
43340 || num_uops + insn_num_uops > MAX_INSN
43341 || !(insn_fits))
43342 {
43343 window_num = ~window_num & 1;
43344 window_list = allocate_next_window (window_num);
43345 }
43346
43347 if (window_num == 0)
43348 {
43349 add_insn_window (insn, window_list, insn_num_uops);
43350 if (window_list->num_insn >= MAX_INSN
43351 && insn_group == disp_branch)
43352 {
43353 process_end_window ();
43354 return;
43355 }
43356 }
43357 else if (window_num == 1)
43358 {
43359 window0_list = window_list->prev;
43360 sum = window0_list->window_size + window_list->window_size;
43361 if (sum == 32
43362 || (byte_len + sum) >= 48)
43363 {
43364 process_end_window ();
43365 window_list = dispatch_window_list;
43366 }
43367
43368 add_insn_window (insn, window_list, insn_num_uops);
43369 }
43370 else
43371 gcc_unreachable ();
43372
43373 if (is_end_basic_block (insn_group))
43374 {
43375 /* End of basic block is reached; do end-basic-block processing. */
43376 process_end_window ();
43377 return;
43378 }
43379 }
43380
43381 /* Print the dispatch window, WINDOW_NUM, to FILE. */
43382
43383 DEBUG_FUNCTION static void
43384 debug_dispatch_window_file (FILE *file, int window_num)
43385 {
43386 dispatch_windows *list;
43387 int i;
43388
43389 if (window_num == 0)
43390 list = dispatch_window_list;
43391 else
43392 list = dispatch_window_list1;
43393
43394 fprintf (file, "Window #%d:\n", list->window_num);
43395 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
43396 list->num_insn, list->num_uops, list->window_size);
43397 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
43398 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
43399
43400 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
43401 list->num_stores);
43402 fprintf (file, " insn info:\n");
43403
43404 for (i = 0; i < MAX_INSN; i++)
43405 {
43406 if (!list->window[i].insn)
43407 break;
43408 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
43409 i, group_name[list->window[i].group],
43410 i, (void *)list->window[i].insn,
43411 i, list->window[i].path,
43412 i, list->window[i].byte_len,
43413 i, list->window[i].imm_bytes);
43414 }
43415 }
43416
43417 /* Print to stdout a dispatch window. */
43418
43419 DEBUG_FUNCTION void
43420 debug_dispatch_window (int window_num)
43421 {
43422 debug_dispatch_window_file (stdout, window_num);
43423 }
43424
43425 /* Print INSN dispatch information to FILE. */
43426
43427 DEBUG_FUNCTION static void
43428 debug_insn_dispatch_info_file (FILE *file, rtx insn)
43429 {
43430 int byte_len;
43431 enum insn_path path;
43432 enum dispatch_group group;
43433 int imm_size;
43434 int num_imm_operand;
43435 int num_imm32_operand;
43436 int num_imm64_operand;
43437
43438 if (INSN_CODE (insn) < 0)
43439 return;
43440
43441 byte_len = min_insn_size (insn);
43442 path = get_insn_path (insn);
43443 group = get_insn_group (insn);
43444 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43445 &num_imm64_operand);
43446
43447 fprintf (file, " insn info:\n");
43448 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
43449 group_name[group], path, byte_len);
43450 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
43451 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
43452 }
43453
43454 /* Print to STDERR the status of the ready list with respect to
43455 dispatch windows. */
43456
43457 DEBUG_FUNCTION void
43458 debug_ready_dispatch (void)
43459 {
43460 int i;
43461 int no_ready = number_in_ready ();
43462
43463 fprintf (stdout, "Number of ready: %d\n", no_ready);
43464
43465 for (i = 0; i < no_ready; i++)
43466 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
43467 }
43468
43469 /* This routine is the driver of the dispatch scheduler. */
43470
43471 static void
43472 do_dispatch (rtx insn, int mode)
43473 {
43474 if (mode == DISPATCH_INIT)
43475 init_dispatch_sched ();
43476 else if (mode == ADD_TO_DISPATCH_WINDOW)
43477 add_to_dispatch_window (insn);
43478 }
43479
43480 /* Return TRUE if Dispatch Scheduling is supported. */
43481
43482 static bool
43483 has_dispatch (rtx insn, int action)
43484 {
43485 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3 || TARGET_BDVER4)
43486 && flag_dispatch_scheduler)
43487 switch (action)
43488 {
43489 default:
43490 return false;
43491
43492 case IS_DISPATCH_ON:
43493 return true;
43494 break;
43495
43496 case IS_CMP:
43497 return is_cmp (insn);
43498
43499 case DISPATCH_VIOLATION:
43500 return dispatch_violation ();
43501
43502 case FITS_DISPATCH_WINDOW:
43503 return fits_dispatch_window (insn);
43504 }
43505
43506 return false;
43507 }
43508
43509 /* Implementation of the reassociation_width target hook used by the
43510 reassoc phase to identify the parallelism level in a reassociated
43511 tree. The statement's tree_code is passed in OPC. The arguments'
43512 type is passed in MODE.
43513
43514 Currently parallel reassociation is enabled for Atom
43515 processors only and we set reassociation width to be 2
43516 because Atom may issue up to 2 instructions per cycle.
43517
43518 Return value should be fixed if parallel reassociation is
43519 enabled for other processors. */
43520
43521 static int
43522 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
43523 enum machine_mode mode)
43524 {
43525 int res = 1;
43526
43527 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
43528 res = 2;
43529 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
43530 res = 2;
43531
43532 return res;
43533 }
43534
43535 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
43536 place emms and femms instructions. */
43537
43538 static enum machine_mode
43539 ix86_preferred_simd_mode (enum machine_mode mode)
43540 {
43541 if (!TARGET_SSE)
43542 return word_mode;
43543
43544 switch (mode)
43545 {
43546 case QImode:
43547 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
43548 case HImode:
43549 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
43550 case SImode:
43551 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
43552 case DImode:
43553 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
43554
43555 case SFmode:
43556 if (TARGET_AVX && !TARGET_PREFER_AVX128)
43557 return V8SFmode;
43558 else
43559 return V4SFmode;
43560
43561 case DFmode:
43562 if (!TARGET_VECTORIZE_DOUBLE)
43563 return word_mode;
43564 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
43565 return V4DFmode;
43566 else if (TARGET_SSE2)
43567 return V2DFmode;
43568 /* FALLTHRU */
43569
43570 default:
43571 return word_mode;
43572 }
43573 }
43574
43575 /* If AVX is enabled then try vectorizing with both 256-bit and 128-bit
43576 vectors. */
43577
43578 static unsigned int
43579 ix86_autovectorize_vector_sizes (void)
43580 {
43581 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
43582 }
43583
43584 \f
43585
43586 /* Return the class of registers which could be used for a pseudo of MODE
43587 and of class RCLASS for spilling instead of memory. Return NO_REGS
43588 if it is not possible or not profitable. */
43589 static reg_class_t
43590 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
43591 {
43592 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
43593 && (mode == SImode || (TARGET_64BIT && mode == DImode))
43594 && INTEGER_CLASS_P (rclass))
43595 return ALL_SSE_REGS;
43596 return NO_REGS;
43597 }
43598
43599 /* Implement targetm.vectorize.init_cost. */
43600
43601 static void *
43602 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
43603 {
43604 unsigned *cost = XNEWVEC (unsigned, 3);
43605 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
43606 return cost;
43607 }
43608
43609 /* Implement targetm.vectorize.add_stmt_cost. */
43610
43611 static unsigned
43612 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
43613 struct _stmt_vec_info *stmt_info, int misalign,
43614 enum vect_cost_model_location where)
43615 {
43616 unsigned *cost = (unsigned *) data;
43617 unsigned retval = 0;
43618
43619 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
43620 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
43621
43622 /* Statements in an inner loop relative to the loop being
43623 vectorized are weighted more heavily. The value here is
43624 arbitrary and could potentially be improved with analysis. */
43625 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
43626 count *= 50; /* FIXME. */
43627
43628 retval = (unsigned) (count * stmt_cost);
43629 cost[where] += retval;
43630
43631 return retval;
43632 }
43633
43634 /* Implement targetm.vectorize.finish_cost. */
43635
43636 static void
43637 ix86_finish_cost (void *data, unsigned *prologue_cost,
43638 unsigned *body_cost, unsigned *epilogue_cost)
43639 {
43640 unsigned *cost = (unsigned *) data;
43641 *prologue_cost = cost[vect_prologue];
43642 *body_cost = cost[vect_body];
43643 *epilogue_cost = cost[vect_epilogue];
43644 }
43645
43646 /* Implement targetm.vectorize.destroy_cost_data. */
43647
43648 static void
43649 ix86_destroy_cost_data (void *data)
43650 {
43651 free (data);
43652 }
43653
43654 /* Validate target specific memory model bits in VAL. */
43655
43656 static unsigned HOST_WIDE_INT
43657 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
43658 {
43659 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
43660 bool strong;
43661
43662 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
43663 |MEMMODEL_MASK)
43664 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
43665 {
43666 warning (OPT_Winvalid_memory_model,
43667 "Unknown architecture specific memory model");
43668 return MEMMODEL_SEQ_CST;
43669 }
43670 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
43671 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
43672 {
43673 warning (OPT_Winvalid_memory_model,
43674 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
43675 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
43676 }
43677 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
43678 {
43679 warning (OPT_Winvalid_memory_model,
43680 "HLE_RELEASE not used with RELEASE or stronger memory model");
43681 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
43682 }
43683 return val;
43684 }
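/* As an illustrative sketch (not code from this file), a caller requests
hardware lock elision by OR-ing these target-specific bits into the C11
memory model of the __atomic built-ins:

while (__atomic_exchange_n (&lock, 1, __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
;
... critical section ...
__atomic_store_n (&lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);

ix86_memmodel_check above verifies that an HLE bit is combined with a
sufficiently strong model and falls back to MEMMODEL_SEQ_CST with a
warning otherwise. */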
43685
43686 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
43687
43688 static bool
43689 ix86_float_exceptions_rounding_supported_p (void)
43690 {
43691 /* For x87 floating point with standard excess precision handling,
43692 there is no adddf3 pattern (since x87 floating point only has
43693 XFmode operations) so the default hook implementation gets this
43694 wrong. */
43695 return TARGET_80387 || TARGET_SSE_MATH;
43696 }
43697
43698 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
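/* The three expressions produced here play the roles of feholdexcept
(*HOLD, emitted before the compare-and-exchange loop), feclearexcept
(*CLEAR, emitted before each retry) and feupdateenv (*UPDATE, emitted
after a successful store), built from the x87 fnstenv/fnclex/fnstsw/fldenv
and SSE stmxcsr/ldmxcsr built-ins as available. */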
43699
43700 static void
43701 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
43702 {
43703 if (!TARGET_80387 && !TARGET_SSE_MATH)
43704 return;
43705 tree exceptions_var = create_tmp_var (integer_type_node, NULL);
43706 if (TARGET_80387)
43707 {
43708 tree fenv_index_type = build_index_type (size_int (6));
43709 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
43710 tree fenv_var = create_tmp_var (fenv_type, NULL);
43711 mark_addressable (fenv_var);
43712 tree fenv_ptr = build_pointer_type (fenv_type);
43713 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
43714 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
43715 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
43716 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
43717 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
43718 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
43719 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
43720 tree hold_fnclex = build_call_expr (fnclex, 0);
43721 *hold = build2 (COMPOUND_EXPR, void_type_node, hold_fnstenv,
43722 hold_fnclex);
43723 *clear = build_call_expr (fnclex, 0);
43724 tree sw_var = create_tmp_var (short_unsigned_type_node, NULL);
43725 mark_addressable (sw_var);
43726 tree su_ptr = build_pointer_type (short_unsigned_type_node);
43727 tree sw_addr = build1 (ADDR_EXPR, su_ptr, sw_var);
43728 tree fnstsw_call = build_call_expr (fnstsw, 1, sw_addr);
43729 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
43730 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
43731 exceptions_var, exceptions_x87);
43732 *update = build2 (COMPOUND_EXPR, integer_type_node,
43733 fnstsw_call, update_mod);
43734 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
43735 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
43736 }
43737 if (TARGET_SSE_MATH)
43738 {
43739 tree mxcsr_orig_var = create_tmp_var (unsigned_type_node, NULL);
43740 tree mxcsr_mod_var = create_tmp_var (unsigned_type_node, NULL);
43741 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
43742 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
43743 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
43744 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
43745 mxcsr_orig_var, stmxcsr_hold_call);
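/* In MXCSR, bits 7..12 are the exception mask bits and bits 0..5 the
sticky exception flags, so OR-ing in 0x1f80 masks all exceptions and
AND-ing with 0xffffffc0 clears any pending flags. */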
43746 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
43747 mxcsr_orig_var,
43748 build_int_cst (unsigned_type_node, 0x1f80));
43749 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
43750 build_int_cst (unsigned_type_node, 0xffffffc0));
43751 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
43752 mxcsr_mod_var, hold_mod_val);
43753 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
43754 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
43755 hold_assign_orig, hold_assign_mod);
43756 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
43757 ldmxcsr_hold_call);
43758 if (*hold)
43759 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
43760 else
43761 *hold = hold_all;
43762 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
43763 if (*clear)
43764 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
43765 ldmxcsr_clear_call);
43766 else
43767 *clear = ldmxcsr_clear_call;
43768 	  tree stmxcsr_update_call = build_call_expr (stmxcsr, 0);
43769 	  tree exceptions_sse = fold_convert (integer_type_node,
43770 					      stmxcsr_update_call);
43771 if (*update)
43772 {
43773 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
43774 exceptions_var, exceptions_sse);
43775 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
43776 exceptions_var, exceptions_mod);
43777 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
43778 exceptions_assign);
43779 }
43780 else
43781 *update = build2 (MODIFY_EXPR, integer_type_node,
43782 exceptions_var, exceptions_sse);
43783 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
43784 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
43785 ldmxcsr_update_call);
43786 }
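  /* Finally, re-raise whatever was recorded in EXCEPTIONS_VAR through the
     __atomic_feraiseexcept library function, as feupdateenv would.  */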
43787 tree atomic_feraiseexcept
43788 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
43789 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
43790 1, exceptions_var);
43791 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
43792 atomic_feraiseexcept_call);
43793 }
43794
43795 /* Initialize the GCC target structure. */
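/* Each hook is #undef'd to drop the default supplied by target-def.h and
   then redefined to the i386 implementation; TARGET_INITIALIZER at the end
   of the file gathers them into the targetm vector.  */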
43796 #undef TARGET_RETURN_IN_MEMORY
43797 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
43798
43799 #undef TARGET_LEGITIMIZE_ADDRESS
43800 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
43801
43802 #undef TARGET_ATTRIBUTE_TABLE
43803 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
43804 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
43805 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
43806 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
43807 # undef TARGET_MERGE_DECL_ATTRIBUTES
43808 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
43809 #endif
43810
43811 #undef TARGET_COMP_TYPE_ATTRIBUTES
43812 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
43813
43814 #undef TARGET_INIT_BUILTINS
43815 #define TARGET_INIT_BUILTINS ix86_init_builtins
43816 #undef TARGET_BUILTIN_DECL
43817 #define TARGET_BUILTIN_DECL ix86_builtin_decl
43818 #undef TARGET_EXPAND_BUILTIN
43819 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
43820
43821 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
43822 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
43823 ix86_builtin_vectorized_function
43824
43825 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
43826 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
43827
43828 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
43829 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
43830
43831 #undef TARGET_VECTORIZE_BUILTIN_GATHER
43832 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
43833
43834 #undef TARGET_BUILTIN_RECIPROCAL
43835 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
43836
43837 #undef TARGET_ASM_FUNCTION_EPILOGUE
43838 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
43839
43840 #undef TARGET_ENCODE_SECTION_INFO
43841 #ifndef SUBTARGET_ENCODE_SECTION_INFO
43842 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
43843 #else
43844 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
43845 #endif
43846
43847 #undef TARGET_ASM_OPEN_PAREN
43848 #define TARGET_ASM_OPEN_PAREN ""
43849 #undef TARGET_ASM_CLOSE_PAREN
43850 #define TARGET_ASM_CLOSE_PAREN ""
43851
43852 #undef TARGET_ASM_BYTE_OP
43853 #define TARGET_ASM_BYTE_OP ASM_BYTE
43854
43855 #undef TARGET_ASM_ALIGNED_HI_OP
43856 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
43857 #undef TARGET_ASM_ALIGNED_SI_OP
43858 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
43859 #ifdef ASM_QUAD
43860 #undef TARGET_ASM_ALIGNED_DI_OP
43861 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
43862 #endif
43863
43864 #undef TARGET_PROFILE_BEFORE_PROLOGUE
43865 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
43866
43867 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
43868 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
43869
43870 #undef TARGET_ASM_UNALIGNED_HI_OP
43871 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
43872 #undef TARGET_ASM_UNALIGNED_SI_OP
43873 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
43874 #undef TARGET_ASM_UNALIGNED_DI_OP
43875 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
43876
43877 #undef TARGET_PRINT_OPERAND
43878 #define TARGET_PRINT_OPERAND ix86_print_operand
43879 #undef TARGET_PRINT_OPERAND_ADDRESS
43880 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
43881 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
43882 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
43883 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
43884 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
43885
43886 #undef TARGET_SCHED_INIT_GLOBAL
43887 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
43888 #undef TARGET_SCHED_ADJUST_COST
43889 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
43890 #undef TARGET_SCHED_ISSUE_RATE
43891 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
43892 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
43893 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
43894 ia32_multipass_dfa_lookahead
43895 #undef TARGET_SCHED_MACRO_FUSION_P
43896 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
43897 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
43898 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
43899
43900 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
43901 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
43902
43903 #undef TARGET_MEMMODEL_CHECK
43904 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
43905
43906 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
43907 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
43908
43909 #ifdef HAVE_AS_TLS
43910 #undef TARGET_HAVE_TLS
43911 #define TARGET_HAVE_TLS true
43912 #endif
43913 #undef TARGET_CANNOT_FORCE_CONST_MEM
43914 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
43915 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
43916 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
43917
43918 #undef TARGET_DELEGITIMIZE_ADDRESS
43919 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
43920
43921 #undef TARGET_MS_BITFIELD_LAYOUT_P
43922 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
43923
43924 #if TARGET_MACHO
43925 #undef TARGET_BINDS_LOCAL_P
43926 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
43927 #endif
43928 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
43929 #undef TARGET_BINDS_LOCAL_P
43930 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
43931 #endif
43932
43933 #undef TARGET_ASM_OUTPUT_MI_THUNK
43934 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
43935 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
43936 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
43937
43938 #undef TARGET_ASM_FILE_START
43939 #define TARGET_ASM_FILE_START x86_file_start
43940
43941 #undef TARGET_OPTION_OVERRIDE
43942 #define TARGET_OPTION_OVERRIDE ix86_option_override
43943
43944 #undef TARGET_REGISTER_MOVE_COST
43945 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
43946 #undef TARGET_MEMORY_MOVE_COST
43947 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
43948 #undef TARGET_RTX_COSTS
43949 #define TARGET_RTX_COSTS ix86_rtx_costs
43950 #undef TARGET_ADDRESS_COST
43951 #define TARGET_ADDRESS_COST ix86_address_cost
43952
43953 #undef TARGET_FIXED_CONDITION_CODE_REGS
43954 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
43955 #undef TARGET_CC_MODES_COMPATIBLE
43956 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
43957
43958 #undef TARGET_MACHINE_DEPENDENT_REORG
43959 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
43960
43961 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
43962 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
43963
43964 #undef TARGET_BUILD_BUILTIN_VA_LIST
43965 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
43966
43967 #undef TARGET_FOLD_BUILTIN
43968 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
43969
43970 #undef TARGET_COMPARE_VERSION_PRIORITY
43971 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
43972
43973 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
43974 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
43975 ix86_generate_version_dispatcher_body
43976
43977 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
43978 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
43979 ix86_get_function_versions_dispatcher
43980
43981 #undef TARGET_ENUM_VA_LIST_P
43982 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
43983
43984 #undef TARGET_FN_ABI_VA_LIST
43985 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
43986
43987 #undef TARGET_CANONICAL_VA_LIST_TYPE
43988 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
43989
43990 #undef TARGET_EXPAND_BUILTIN_VA_START
43991 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
43992
43993 #undef TARGET_MD_ASM_CLOBBERS
43994 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
43995
43996 #undef TARGET_PROMOTE_PROTOTYPES
43997 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
43998 #undef TARGET_STRUCT_VALUE_RTX
43999 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
44000 #undef TARGET_SETUP_INCOMING_VARARGS
44001 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
44002 #undef TARGET_MUST_PASS_IN_STACK
44003 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
44004 #undef TARGET_FUNCTION_ARG_ADVANCE
44005 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
44006 #undef TARGET_FUNCTION_ARG
44007 #define TARGET_FUNCTION_ARG ix86_function_arg
44008 #undef TARGET_FUNCTION_ARG_BOUNDARY
44009 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
44010 #undef TARGET_PASS_BY_REFERENCE
44011 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
44012 #undef TARGET_INTERNAL_ARG_POINTER
44013 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
44014 #undef TARGET_UPDATE_STACK_BOUNDARY
44015 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
44016 #undef TARGET_GET_DRAP_RTX
44017 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
44018 #undef TARGET_STRICT_ARGUMENT_NAMING
44019 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
44020 #undef TARGET_STATIC_CHAIN
44021 #define TARGET_STATIC_CHAIN ix86_static_chain
44022 #undef TARGET_TRAMPOLINE_INIT
44023 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
44024 #undef TARGET_RETURN_POPS_ARGS
44025 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
44026
44027 #undef TARGET_LEGITIMATE_COMBINED_INSN
44028 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
44029
44030 #undef TARGET_ASAN_SHADOW_OFFSET
44031 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
44032
44033 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
44034 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
44035
44036 #undef TARGET_SCALAR_MODE_SUPPORTED_P
44037 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
44038
44039 #undef TARGET_VECTOR_MODE_SUPPORTED_P
44040 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
44041
44042 #undef TARGET_C_MODE_FOR_SUFFIX
44043 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
44044
44045 #ifdef HAVE_AS_TLS
44046 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
44047 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
44048 #endif
44049
44050 #ifdef SUBTARGET_INSERT_ATTRIBUTES
44051 #undef TARGET_INSERT_ATTRIBUTES
44052 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
44053 #endif
44054
44055 #undef TARGET_MANGLE_TYPE
44056 #define TARGET_MANGLE_TYPE ix86_mangle_type
44057
44058 #if !TARGET_MACHO
44059 #undef TARGET_STACK_PROTECT_FAIL
44060 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
44061 #endif
44062
44063 #undef TARGET_FUNCTION_VALUE
44064 #define TARGET_FUNCTION_VALUE ix86_function_value
44065
44066 #undef TARGET_FUNCTION_VALUE_REGNO_P
44067 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
44068
44069 #undef TARGET_PROMOTE_FUNCTION_MODE
44070 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
44071
44072 #undef TARGET_MEMBER_TYPE_FORCES_BLK
44073 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
44074
44075 #undef TARGET_INSTANTIATE_DECLS
44076 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
44077
44078 #undef TARGET_SECONDARY_RELOAD
44079 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
44080
44081 #undef TARGET_CLASS_MAX_NREGS
44082 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
44083
44084 #undef TARGET_PREFERRED_RELOAD_CLASS
44085 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
44086 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
44087 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
44088 #undef TARGET_CLASS_LIKELY_SPILLED_P
44089 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
44090
44091 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
44092 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
44093 ix86_builtin_vectorization_cost
44094 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
44095 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
44096 ix86_vectorize_vec_perm_const_ok
44097 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
44098 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
44099 ix86_preferred_simd_mode
44100 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
44101 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
44102 ix86_autovectorize_vector_sizes
44103 #undef TARGET_VECTORIZE_INIT_COST
44104 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
44105 #undef TARGET_VECTORIZE_ADD_STMT_COST
44106 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
44107 #undef TARGET_VECTORIZE_FINISH_COST
44108 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
44109 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
44110 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
44111
44112 #undef TARGET_SET_CURRENT_FUNCTION
44113 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
44114
44115 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
44116 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
44117
44118 #undef TARGET_OPTION_SAVE
44119 #define TARGET_OPTION_SAVE ix86_function_specific_save
44120
44121 #undef TARGET_OPTION_RESTORE
44122 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
44123
44124 #undef TARGET_OPTION_PRINT
44125 #define TARGET_OPTION_PRINT ix86_function_specific_print
44126
44127 #undef TARGET_OPTION_FUNCTION_VERSIONS
44128 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
44129
44130 #undef TARGET_CAN_INLINE_P
44131 #define TARGET_CAN_INLINE_P ix86_can_inline_p
44132
44133 #undef TARGET_EXPAND_TO_RTL_HOOK
44134 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
44135
44136 #undef TARGET_LEGITIMATE_ADDRESS_P
44137 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
44138
44139 #undef TARGET_LRA_P
44140 #define TARGET_LRA_P hook_bool_void_true
44141
44142 #undef TARGET_REGISTER_PRIORITY
44143 #define TARGET_REGISTER_PRIORITY ix86_register_priority
44144
44145 #undef TARGET_REGISTER_USAGE_LEVELING_P
44146 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
44147
44148 #undef TARGET_LEGITIMATE_CONSTANT_P
44149 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
44150
44151 #undef TARGET_FRAME_POINTER_REQUIRED
44152 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
44153
44154 #undef TARGET_CAN_ELIMINATE
44155 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
44156
44157 #undef TARGET_EXTRA_LIVE_ON_ENTRY
44158 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
44159
44160 #undef TARGET_ASM_CODE_END
44161 #define TARGET_ASM_CODE_END ix86_code_end
44162
44163 #undef TARGET_CONDITIONAL_REGISTER_USAGE
44164 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
44165
44166 #if TARGET_MACHO
44167 #undef TARGET_INIT_LIBFUNCS
44168 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
44169 #endif
44170
44171 #undef TARGET_SPILL_CLASS
44172 #define TARGET_SPILL_CLASS ix86_spill_class
44173
44174 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
44175 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
44176 ix86_float_exceptions_rounding_supported_p
44177
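/* Build the target hook vector from the TARGET_* macros defined above.  */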
44178 struct gcc_target targetm = TARGET_INITIALIZER;
44179 \f
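/* Garbage-collection root tables for this file, generated by gengtype.  */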
44180 #include "gt-i386.h"