1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
45 #include "calls.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "flags.h"
49 #include "explow.h"
50 #include "expr.h"
51 #include "reload.h"
52 #include "langhooks.h"
53 #include "opts.h"
54 #include "params.h"
55 #include "gimplify.h"
56 #include "dwarf2.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
60 #include "dumpfile.h"
61 #include "builtins.h"
62 #include "rtl-iter.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
67 #include "selftest.h"
68 #include "selftest-rtl.h"
69
70 /* This file should be included last. */
71 #include "target-def.h"
72
73 /* Defined for convenience. */
74 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
75
76 /* Classifies an address.
77
78 ADDRESS_REG_IMM
79 A simple base register plus immediate offset.
80
81 ADDRESS_REG_WB
82 A base register indexed by immediate offset with writeback.
83
84 ADDRESS_REG_REG
85 A base register indexed by (optionally scaled) register.
86
87 ADDRESS_REG_UXTW
88 A base register indexed by (optionally scaled) zero-extended register.
89
90 ADDRESS_REG_SXTW
91 A base register indexed by (optionally scaled) sign-extended register.
92
93 ADDRESS_LO_SUM
94 A LO_SUM rtx with a base register and "LO12" symbol relocation.
95
 96 ADDRESS_SYMBOLIC
 97 A constant symbolic address, held in the pc-relative literal pool. */
98
99 enum aarch64_address_type {
100 ADDRESS_REG_IMM,
101 ADDRESS_REG_WB,
102 ADDRESS_REG_REG,
103 ADDRESS_REG_UXTW,
104 ADDRESS_REG_SXTW,
105 ADDRESS_LO_SUM,
106 ADDRESS_SYMBOLIC
107 };
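/* Purely illustrative sketch (not part of the original source): typical
   AArch64 assembly forms corresponding to each class, using x0/x1/w2 as
   arbitrary example registers:

     ADDRESS_REG_IMM    ldr x0, [x1, #16]
     ADDRESS_REG_WB     ldr x0, [x1, #16]!   or   ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     ldr x0, [x1, #:lo12:sym]   (paired with an adrp)
     ADDRESS_SYMBOLIC   ldr x0, .Lliteral          (pc-relative literal load)  */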
108
109 struct aarch64_address_info {
110 enum aarch64_address_type type;
111 rtx base;
112 rtx offset;
113 int shift;
114 enum aarch64_symbol_type symbol_type;
115 };
116
117 struct simd_immediate_info
118 {
119 rtx value;
120 int shift;
121 int element_width;
122 bool mvn;
123 bool msl;
124 };
125
126 /* The current code model. */
127 enum aarch64_code_model aarch64_cmodel;
128
129 #ifdef HAVE_AS_TLS
130 #undef TARGET_HAVE_TLS
131 #define TARGET_HAVE_TLS 1
132 #endif
133
134 static bool aarch64_composite_type_p (const_tree, machine_mode);
135 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
136 const_tree,
137 machine_mode *, int *,
138 bool *);
139 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
140 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_override_options_after_change (void);
142 static bool aarch64_vector_mode_supported_p (machine_mode);
143 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
144 const unsigned char *sel);
145 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
146 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
147 const_tree type,
148 int misalignment,
149 bool is_packed);
150
151 /* Major revision number of the ARM Architecture implemented by the target. */
152 unsigned aarch64_architecture_version;
153
154 /* The processor for which instructions should be scheduled. */
155 enum aarch64_processor aarch64_tune = cortexa53;
156
157 /* Mask to specify which instruction scheduling options should be used. */
158 unsigned long aarch64_tune_flags = 0;
159
160 /* Global flag for PC relative loads. */
161 bool aarch64_pcrelative_literal_loads;
162
163 /* Support for command line parsing of boolean flags in the tuning
164 structures. */
165 struct aarch64_flag_desc
166 {
167 const char* name;
168 unsigned int flag;
169 };
170
171 #define AARCH64_FUSION_PAIR(name, internal_name) \
172 { name, AARCH64_FUSE_##internal_name },
173 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
174 {
175 { "none", AARCH64_FUSE_NOTHING },
176 #include "aarch64-fusion-pairs.def"
177 { "all", AARCH64_FUSE_ALL },
178 { NULL, AARCH64_FUSE_NOTHING }
179 };
180
181 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
182 { name, AARCH64_EXTRA_TUNE_##internal_name },
183 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
184 {
185 { "none", AARCH64_EXTRA_TUNE_NONE },
186 #include "aarch64-tuning-flags.def"
187 { "all", AARCH64_EXTRA_TUNE_ALL },
188 { NULL, AARCH64_EXTRA_TUNE_NONE }
189 };
190
191 /* Tuning parameters. */
192
193 static const struct cpu_addrcost_table generic_addrcost_table =
194 {
195 {
196 1, /* hi */
197 0, /* si */
198 0, /* di */
199 1, /* ti */
200 },
201 0, /* pre_modify */
202 0, /* post_modify */
203 0, /* register_offset */
204 0, /* register_sextend */
205 0, /* register_zextend */
206 0 /* imm_offset */
207 };
208
209 static const struct cpu_addrcost_table cortexa57_addrcost_table =
210 {
211 {
212 1, /* hi */
213 0, /* si */
214 0, /* di */
215 1, /* ti */
216 },
217 0, /* pre_modify */
218 0, /* post_modify */
219 0, /* register_offset */
220 0, /* register_sextend */
221 0, /* register_zextend */
222 0, /* imm_offset */
223 };
224
225 static const struct cpu_addrcost_table exynosm1_addrcost_table =
226 {
227 {
228 0, /* hi */
229 0, /* si */
230 0, /* di */
231 2, /* ti */
232 },
233 0, /* pre_modify */
234 0, /* post_modify */
235 1, /* register_offset */
236 1, /* register_sextend */
237 2, /* register_zextend */
238 0, /* imm_offset */
239 };
240
241 static const struct cpu_addrcost_table xgene1_addrcost_table =
242 {
243 {
244 1, /* hi */
245 0, /* si */
246 0, /* di */
247 1, /* ti */
248 },
249 1, /* pre_modify */
250 0, /* post_modify */
251 0, /* register_offset */
252 1, /* register_sextend */
253 1, /* register_zextend */
254 0, /* imm_offset */
255 };
256
257 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
258 {
259 {
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
264 },
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
271 };
272
273 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
274 {
275 {
276 1, /* hi */
277 1, /* si */
278 1, /* di */
279 2, /* ti */
280 },
281 0, /* pre_modify */
282 0, /* post_modify */
283 2, /* register_offset */
284 3, /* register_sextend */
285 3, /* register_zextend */
286 0, /* imm_offset */
287 };
288
289 static const struct cpu_regmove_cost generic_regmove_cost =
290 {
291 1, /* GP2GP */
292 /* Avoid the use of slow int<->fp moves for spilling by setting
293 their cost higher than memmov_cost. */
294 5, /* GP2FP */
295 5, /* FP2GP */
296 2 /* FP2FP */
297 };
298
299 static const struct cpu_regmove_cost cortexa57_regmove_cost =
300 {
301 1, /* GP2GP */
302 /* Avoid the use of slow int<->fp moves for spilling by setting
303 their cost higher than memmov_cost. */
304 5, /* GP2FP */
305 5, /* FP2GP */
306 2 /* FP2FP */
307 };
308
309 static const struct cpu_regmove_cost cortexa53_regmove_cost =
310 {
311 1, /* GP2GP */
312 /* Avoid the use of slow int<->fp moves for spilling by setting
313 their cost higher than memmov_cost. */
314 5, /* GP2FP */
315 5, /* FP2GP */
316 2 /* FP2FP */
317 };
318
319 static const struct cpu_regmove_cost exynosm1_regmove_cost =
320 {
321 1, /* GP2GP */
322 /* Avoid the use of slow int<->fp moves for spilling by setting
 323 their cost higher than memmov_cost (the actual costs are 4 and 9). */
324 9, /* GP2FP */
325 9, /* FP2GP */
326 1 /* FP2FP */
327 };
328
329 static const struct cpu_regmove_cost thunderx_regmove_cost =
330 {
331 2, /* GP2GP */
332 2, /* GP2FP */
333 6, /* FP2GP */
334 4 /* FP2FP */
335 };
336
337 static const struct cpu_regmove_cost xgene1_regmove_cost =
338 {
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 8, /* GP2FP */
343 8, /* FP2GP */
344 2 /* FP2FP */
345 };
346
347 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
348 {
349 2, /* GP2GP */
350 /* Avoid the use of int<->fp moves for spilling. */
351 6, /* GP2FP */
352 6, /* FP2GP */
353 4 /* FP2FP */
354 };
355
356 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
357 {
358 1, /* GP2GP */
359 /* Avoid the use of int<->fp moves for spilling. */
360 8, /* GP2FP */
361 8, /* FP2GP */
362 4 /* FP2FP */
363 };
364
365 /* Generic costs for vector insn classes. */
366 static const struct cpu_vector_cost generic_vector_cost =
367 {
368 1, /* scalar_int_stmt_cost */
369 1, /* scalar_fp_stmt_cost */
370 1, /* scalar_load_cost */
371 1, /* scalar_store_cost */
372 1, /* vec_int_stmt_cost */
373 1, /* vec_fp_stmt_cost */
374 2, /* vec_permute_cost */
375 1, /* vec_to_scalar_cost */
376 1, /* scalar_to_vec_cost */
377 1, /* vec_align_load_cost */
378 1, /* vec_unalign_load_cost */
379 1, /* vec_unalign_store_cost */
380 1, /* vec_store_cost */
381 3, /* cond_taken_branch_cost */
382 1 /* cond_not_taken_branch_cost */
383 };
384
385 /* ThunderX costs for vector insn classes. */
386 static const struct cpu_vector_cost thunderx_vector_cost =
387 {
388 1, /* scalar_int_stmt_cost */
389 1, /* scalar_fp_stmt_cost */
390 3, /* scalar_load_cost */
391 1, /* scalar_store_cost */
392 4, /* vec_int_stmt_cost */
393 4, /* vec_fp_stmt_cost */
394 4, /* vec_permute_cost */
395 2, /* vec_to_scalar_cost */
396 2, /* scalar_to_vec_cost */
397 3, /* vec_align_load_cost */
398 10, /* vec_unalign_load_cost */
399 10, /* vec_unalign_store_cost */
400 1, /* vec_store_cost */
401 3, /* cond_taken_branch_cost */
402 3 /* cond_not_taken_branch_cost */
403 };
404
405 /* Generic costs for vector insn classes. */
406 static const struct cpu_vector_cost cortexa57_vector_cost =
407 {
408 1, /* scalar_int_stmt_cost */
409 1, /* scalar_fp_stmt_cost */
410 4, /* scalar_load_cost */
411 1, /* scalar_store_cost */
412 2, /* vec_int_stmt_cost */
413 2, /* vec_fp_stmt_cost */
414 3, /* vec_permute_cost */
415 8, /* vec_to_scalar_cost */
416 8, /* scalar_to_vec_cost */
417 4, /* vec_align_load_cost */
418 4, /* vec_unalign_load_cost */
419 1, /* vec_unalign_store_cost */
420 1, /* vec_store_cost */
421 1, /* cond_taken_branch_cost */
422 1 /* cond_not_taken_branch_cost */
423 };
424
425 static const struct cpu_vector_cost exynosm1_vector_cost =
426 {
427 1, /* scalar_int_stmt_cost */
428 1, /* scalar_fp_stmt_cost */
429 5, /* scalar_load_cost */
430 1, /* scalar_store_cost */
431 3, /* vec_int_stmt_cost */
432 3, /* vec_fp_stmt_cost */
433 3, /* vec_permute_cost */
434 3, /* vec_to_scalar_cost */
435 3, /* scalar_to_vec_cost */
436 5, /* vec_align_load_cost */
437 5, /* vec_unalign_load_cost */
438 1, /* vec_unalign_store_cost */
439 1, /* vec_store_cost */
440 1, /* cond_taken_branch_cost */
441 1 /* cond_not_taken_branch_cost */
442 };
443
444 /* Generic costs for vector insn classes. */
445 static const struct cpu_vector_cost xgene1_vector_cost =
446 {
447 1, /* scalar_int_stmt_cost */
448 1, /* scalar_fp_stmt_cost */
449 5, /* scalar_load_cost */
450 1, /* scalar_store_cost */
451 2, /* vec_int_stmt_cost */
452 2, /* vec_fp_stmt_cost */
453 2, /* vec_permute_cost */
454 4, /* vec_to_scalar_cost */
455 4, /* scalar_to_vec_cost */
456 10, /* vec_align_load_cost */
457 10, /* vec_unalign_load_cost */
458 2, /* vec_unalign_store_cost */
459 2, /* vec_store_cost */
460 2, /* cond_taken_branch_cost */
461 1 /* cond_not_taken_branch_cost */
462 };
463
464 /* Costs for vector insn classes for Vulcan. */
465 static const struct cpu_vector_cost thunderx2t99_vector_cost =
466 {
467 1, /* scalar_int_stmt_cost */
468 6, /* scalar_fp_stmt_cost */
469 4, /* scalar_load_cost */
470 1, /* scalar_store_cost */
471 5, /* vec_int_stmt_cost */
472 6, /* vec_fp_stmt_cost */
473 3, /* vec_permute_cost */
474 6, /* vec_to_scalar_cost */
475 5, /* scalar_to_vec_cost */
476 8, /* vec_align_load_cost */
477 8, /* vec_unalign_load_cost */
478 4, /* vec_unalign_store_cost */
479 4, /* vec_store_cost */
480 2, /* cond_taken_branch_cost */
481 1 /* cond_not_taken_branch_cost */
482 };
483
484 /* Generic costs for branch instructions. */
485 static const struct cpu_branch_cost generic_branch_cost =
486 {
487 1, /* Predictable. */
488 3 /* Unpredictable. */
489 };
490
491 /* Branch costs for Cortex-A57. */
492 static const struct cpu_branch_cost cortexa57_branch_cost =
493 {
494 1, /* Predictable. */
495 3 /* Unpredictable. */
496 };
497
498 /* Branch costs for Vulcan. */
499 static const struct cpu_branch_cost thunderx2t99_branch_cost =
500 {
501 1, /* Predictable. */
502 3 /* Unpredictable. */
503 };
504
505 /* Generic approximation modes. */
506 static const cpu_approx_modes generic_approx_modes =
507 {
508 AARCH64_APPROX_NONE, /* division */
509 AARCH64_APPROX_NONE, /* sqrt */
510 AARCH64_APPROX_NONE /* recip_sqrt */
511 };
512
513 /* Approximation modes for Exynos M1. */
514 static const cpu_approx_modes exynosm1_approx_modes =
515 {
516 AARCH64_APPROX_NONE, /* division */
517 AARCH64_APPROX_ALL, /* sqrt */
518 AARCH64_APPROX_ALL /* recip_sqrt */
519 };
520
521 /* Approximation modes for X-Gene 1. */
522 static const cpu_approx_modes xgene1_approx_modes =
523 {
524 AARCH64_APPROX_NONE, /* division */
525 AARCH64_APPROX_NONE, /* sqrt */
526 AARCH64_APPROX_ALL /* recip_sqrt */
527 };
528
529 /* Generic prefetch settings (which disable prefetch). */
530 static const cpu_prefetch_tune generic_prefetch_tune =
531 {
532 0, /* num_slots */
533 -1, /* l1_cache_size */
534 -1, /* l1_cache_line_size */
535 -1, /* l2_cache_size */
536 -1 /* default_opt_level */
537 };
538
539 static const cpu_prefetch_tune exynosm1_prefetch_tune =
540 {
541 0, /* num_slots */
542 -1, /* l1_cache_size */
543 64, /* l1_cache_line_size */
544 -1, /* l2_cache_size */
545 -1 /* default_opt_level */
546 };
547
548 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
549 {
550 4, /* num_slots */
551 32, /* l1_cache_size */
552 64, /* l1_cache_line_size */
553 1024, /* l2_cache_size */
554 3 /* default_opt_level */
555 };
556
557 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
558 {
559 0, /* num_slots */
560 -1, /* l1_cache_size */
561 64, /* l1_cache_line_size */
562 -1, /* l2_cache_size */
563 -1 /* default_opt_level */
564 };
565
566 static const struct tune_params generic_tunings =
567 {
568 &cortexa57_extra_costs,
569 &generic_addrcost_table,
570 &generic_regmove_cost,
571 &generic_vector_cost,
572 &generic_branch_cost,
573 &generic_approx_modes,
574 4, /* memmov_cost */
575 2, /* issue_rate */
576 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
577 8, /* function_align. */
578 4, /* jump_align. */
579 8, /* loop_align. */
580 2, /* int_reassoc_width. */
581 4, /* fp_reassoc_width. */
582 1, /* vec_reassoc_width. */
583 2, /* min_div_recip_mul_sf. */
584 2, /* min_div_recip_mul_df. */
585 0, /* max_case_values. */
586 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
587 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
588 &generic_prefetch_tune
589 };
590
591 static const struct tune_params cortexa35_tunings =
592 {
593 &cortexa53_extra_costs,
594 &generic_addrcost_table,
595 &cortexa53_regmove_cost,
596 &generic_vector_cost,
597 &cortexa57_branch_cost,
598 &generic_approx_modes,
599 4, /* memmov_cost */
600 1, /* issue_rate */
601 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
602 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
603 16, /* function_align. */
604 4, /* jump_align. */
605 8, /* loop_align. */
606 2, /* int_reassoc_width. */
607 4, /* fp_reassoc_width. */
608 1, /* vec_reassoc_width. */
609 2, /* min_div_recip_mul_sf. */
610 2, /* min_div_recip_mul_df. */
611 0, /* max_case_values. */
612 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
613 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
614 &generic_prefetch_tune
615 };
616
617 static const struct tune_params cortexa53_tunings =
618 {
619 &cortexa53_extra_costs,
620 &generic_addrcost_table,
621 &cortexa53_regmove_cost,
622 &generic_vector_cost,
623 &cortexa57_branch_cost,
624 &generic_approx_modes,
625 4, /* memmov_cost */
626 2, /* issue_rate */
627 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
628 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
629 16, /* function_align. */
630 4, /* jump_align. */
631 8, /* loop_align. */
632 2, /* int_reassoc_width. */
633 4, /* fp_reassoc_width. */
634 1, /* vec_reassoc_width. */
635 2, /* min_div_recip_mul_sf. */
636 2, /* min_div_recip_mul_df. */
637 0, /* max_case_values. */
638 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
639 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
640 &generic_prefetch_tune
641 };
642
643 static const struct tune_params cortexa57_tunings =
644 {
645 &cortexa57_extra_costs,
646 &cortexa57_addrcost_table,
647 &cortexa57_regmove_cost,
648 &cortexa57_vector_cost,
649 &cortexa57_branch_cost,
650 &generic_approx_modes,
651 4, /* memmov_cost */
652 3, /* issue_rate */
653 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
654 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
655 16, /* function_align. */
656 4, /* jump_align. */
657 8, /* loop_align. */
658 2, /* int_reassoc_width. */
659 4, /* fp_reassoc_width. */
660 1, /* vec_reassoc_width. */
661 2, /* min_div_recip_mul_sf. */
662 2, /* min_div_recip_mul_df. */
663 0, /* max_case_values. */
664 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
665 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
666 &generic_prefetch_tune
667 };
668
669 static const struct tune_params cortexa72_tunings =
670 {
671 &cortexa57_extra_costs,
672 &cortexa57_addrcost_table,
673 &cortexa57_regmove_cost,
674 &cortexa57_vector_cost,
675 &cortexa57_branch_cost,
676 &generic_approx_modes,
677 4, /* memmov_cost */
678 3, /* issue_rate */
679 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
680 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
681 16, /* function_align. */
682 4, /* jump_align. */
683 8, /* loop_align. */
684 2, /* int_reassoc_width. */
685 4, /* fp_reassoc_width. */
686 1, /* vec_reassoc_width. */
687 2, /* min_div_recip_mul_sf. */
688 2, /* min_div_recip_mul_df. */
689 0, /* max_case_values. */
690 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
691 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
692 &generic_prefetch_tune
693 };
694
695 static const struct tune_params cortexa73_tunings =
696 {
697 &cortexa57_extra_costs,
698 &cortexa57_addrcost_table,
699 &cortexa57_regmove_cost,
700 &cortexa57_vector_cost,
701 &cortexa57_branch_cost,
702 &generic_approx_modes,
703 4, /* memmov_cost. */
704 2, /* issue_rate. */
705 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
706 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
707 16, /* function_align. */
708 4, /* jump_align. */
709 8, /* loop_align. */
710 2, /* int_reassoc_width. */
711 4, /* fp_reassoc_width. */
712 1, /* vec_reassoc_width. */
713 2, /* min_div_recip_mul_sf. */
714 2, /* min_div_recip_mul_df. */
715 0, /* max_case_values. */
716 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
717 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
718 &generic_prefetch_tune
719 };
720
721
722
723 static const struct tune_params exynosm1_tunings =
724 {
725 &exynosm1_extra_costs,
726 &exynosm1_addrcost_table,
727 &exynosm1_regmove_cost,
728 &exynosm1_vector_cost,
729 &generic_branch_cost,
730 &exynosm1_approx_modes,
731 4, /* memmov_cost */
732 3, /* issue_rate */
733 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
734 4, /* function_align. */
735 4, /* jump_align. */
736 4, /* loop_align. */
737 2, /* int_reassoc_width. */
738 4, /* fp_reassoc_width. */
739 1, /* vec_reassoc_width. */
740 2, /* min_div_recip_mul_sf. */
741 2, /* min_div_recip_mul_df. */
742 48, /* max_case_values. */
743 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
745 &exynosm1_prefetch_tune
746 };
747
748 static const struct tune_params thunderx_tunings =
749 {
750 &thunderx_extra_costs,
751 &generic_addrcost_table,
752 &thunderx_regmove_cost,
753 &thunderx_vector_cost,
754 &generic_branch_cost,
755 &generic_approx_modes,
756 6, /* memmov_cost */
757 2, /* issue_rate */
758 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
759 8, /* function_align. */
760 8, /* jump_align. */
761 8, /* loop_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
769 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
770 &generic_prefetch_tune
771 };
772
773 static const struct tune_params xgene1_tunings =
774 {
775 &xgene1_extra_costs,
776 &xgene1_addrcost_table,
777 &xgene1_regmove_cost,
778 &xgene1_vector_cost,
779 &generic_branch_cost,
780 &xgene1_approx_modes,
781 6, /* memmov_cost */
782 4, /* issue_rate */
783 AARCH64_FUSE_NOTHING, /* fusible_ops */
784 16, /* function_align. */
785 8, /* jump_align. */
786 16, /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
795 &generic_prefetch_tune
796 };
797
798 static const struct tune_params qdf24xx_tunings =
799 {
800 &qdf24xx_extra_costs,
801 &qdf24xx_addrcost_table,
802 &qdf24xx_regmove_cost,
803 &generic_vector_cost,
804 &generic_branch_cost,
805 &generic_approx_modes,
806 4, /* memmov_cost */
807 4, /* issue_rate */
808 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 809 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
810 16, /* function_align. */
811 8, /* jump_align. */
812 16, /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
821 &qdf24xx_prefetch_tune
822 };
823
824 static const struct tune_params thunderx2t99_tunings =
825 {
826 &thunderx2t99_extra_costs,
827 &thunderx2t99_addrcost_table,
828 &thunderx2t99_regmove_cost,
829 &thunderx2t99_vector_cost,
830 &thunderx2t99_branch_cost,
831 &generic_approx_modes,
832 4, /* memmov_cost. */
833 4, /* issue_rate. */
834 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC), /* fusible_ops */
835 16, /* function_align. */
836 8, /* jump_align. */
837 16, /* loop_align. */
838 3, /* int_reassoc_width. */
839 2, /* fp_reassoc_width. */
840 2, /* vec_reassoc_width. */
841 2, /* min_div_recip_mul_sf. */
842 2, /* min_div_recip_mul_df. */
843 0, /* max_case_values. */
844 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
845 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
846 &thunderx2t99_prefetch_tune
847 };
848
849 /* Support for fine-grained override of the tuning structures. */
850 struct aarch64_tuning_override_function
851 {
852 const char* name;
853 void (*parse_override)(const char*, struct tune_params*);
854 };
855
856 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
857 static void aarch64_parse_tune_string (const char*, struct tune_params*);
858
859 static const struct aarch64_tuning_override_function
860 aarch64_tuning_override_functions[] =
861 {
862 { "fuse", aarch64_parse_fuse_string },
863 { "tune", aarch64_parse_tune_string },
864 { NULL, NULL }
865 };
866
867 /* A processor implementing AArch64. */
868 struct processor
869 {
870 const char *const name;
871 enum aarch64_processor ident;
872 enum aarch64_processor sched_core;
873 enum aarch64_arch arch;
874 unsigned architecture_version;
875 const unsigned long flags;
876 const struct tune_params *const tune;
877 };
878
879 /* Architectures implementing AArch64. */
880 static const struct processor all_architectures[] =
881 {
882 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
883 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
884 #include "aarch64-arches.def"
885 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
886 };
887
888 /* Processor cores implementing AArch64. */
889 static const struct processor all_cores[] =
890 {
891 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
892 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
893 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
894 FLAGS, &COSTS##_tunings},
895 #include "aarch64-cores.def"
896 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
897 AARCH64_FL_FOR_ARCH8, &generic_tunings},
898 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
899 };
900
901
902 /* Target specification. These are populated by the -march, -mtune, -mcpu
903 handling code or by target attributes. */
904 static const struct processor *selected_arch;
905 static const struct processor *selected_cpu;
906 static const struct processor *selected_tune;
907
908 /* The current tuning set. */
909 struct tune_params aarch64_tune_params = generic_tunings;
910
911 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
912
913 /* An ISA extension in the co-processor and main instruction set space. */
914 struct aarch64_option_extension
915 {
916 const char *const name;
917 const unsigned long flags_on;
918 const unsigned long flags_off;
919 };
920
921 typedef enum aarch64_cond_code
922 {
923 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
924 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
925 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
926 }
927 aarch64_cc;
928
929 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
930
931 /* The condition codes of the processor, and the inverse function. */
932 static const char * const aarch64_condition_codes[] =
933 {
934 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
935 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
936 };
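/* Illustrative note (not from the original source): the conditions are
   encoded so that each one sits next to its inverse, hence XOR-ing the low
   bit in AARCH64_INVERSE_CONDITION_CODE flips the condition, e.g.

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE   ("eq" -> "ne")
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT   ("ge" -> "lt")  */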
937
938 /* Generate code to enable conditional branches in functions over 1 MiB. */
939 const char *
940 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
941 const char * branch_format)
942 {
943 rtx_code_label * tmp_label = gen_label_rtx ();
944 char label_buf[256];
945 char buffer[128];
946 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
947 CODE_LABEL_NUMBER (tmp_label));
948 const char *label_ptr = targetm.strip_name_encoding (label_buf);
949 rtx dest_label = operands[pos_label];
950 operands[pos_label] = tmp_label;
951
952 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
953 output_asm_insn (buffer, operands);
954
955 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
956 operands[pos_label] = dest_label;
957 output_asm_insn (buffer, operands);
958 return "";
959 }
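/* A hedged sketch of what the routine above emits; the label names and the
   particular condition are made up for illustration, and the caller is
   assumed to pass BRANCH_FORMAT with the condition already inverted.  A far
   "b.eq target" then becomes a short inverted branch over an unconditional
   branch, which has a +/-128 MiB range instead of b.cond's +/-1 MiB:

     b.ne  .Ltmp0        // inverted short branch skips the far jump
     b     target        // unconditional branch reaches the real label
     .Ltmp0:
*/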
960
961 void
962 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
963 {
964 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
965 if (TARGET_GENERAL_REGS_ONLY)
966 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
967 else
968 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
969 }
970
971 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
972 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
973 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
974 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
975 cost (in this case the best class is the lowest cost one). Using ALL_REGS
 976 irrespective of its cost results in bad allocations with many redundant
977 int<->FP moves which are expensive on various cores.
978 To avoid this we don't allow ALL_REGS as the allocno class, but force a
979 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
980 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
981 Otherwise set the allocno class depending on the mode.
982 The result of this is that it is no longer inefficient to have a higher
983 memory move cost than the register move cost.
984 */
985
986 static reg_class_t
987 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
988 reg_class_t best_class)
989 {
990 enum machine_mode mode;
991
992 if (allocno_class != ALL_REGS)
993 return allocno_class;
994
995 if (best_class != ALL_REGS)
996 return best_class;
997
998 mode = PSEUDO_REGNO_MODE (regno);
999 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1000 }
1001
1002 static unsigned int
1003 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
1004 {
1005 if (GET_MODE_UNIT_SIZE (mode) == 4)
1006 return aarch64_tune_params.min_div_recip_mul_sf;
1007 return aarch64_tune_params.min_div_recip_mul_df;
1008 }
1009
1010 static int
1011 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1012 enum machine_mode mode)
1013 {
1014 if (VECTOR_MODE_P (mode))
1015 return aarch64_tune_params.vec_reassoc_width;
1016 if (INTEGRAL_MODE_P (mode))
1017 return aarch64_tune_params.int_reassoc_width;
1018 if (FLOAT_MODE_P (mode))
1019 return aarch64_tune_params.fp_reassoc_width;
1020 return 1;
1021 }
1022
1023 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1024 unsigned
1025 aarch64_dbx_register_number (unsigned regno)
1026 {
1027 if (GP_REGNUM_P (regno))
1028 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1029 else if (regno == SP_REGNUM)
1030 return AARCH64_DWARF_SP;
1031 else if (FP_REGNUM_P (regno))
1032 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1033
1034 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1035 equivalent DWARF register. */
1036 return DWARF_FRAME_REGISTERS;
1037 }
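/* Example mapping (illustrative; the AARCH64_DWARF_* bases follow the
   AArch64 DWARF ABI): x0 -> 0, x29 -> 29, sp -> 31, v0 -> 64, v31 -> 95.
   Anything else (e.g. the condition flags) maps to DWARF_FRAME_REGISTERS,
   meaning "no DWARF equivalent".  */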
1038
1039 /* Return TRUE if MODE is any of the large INT modes. */
1040 static bool
1041 aarch64_vect_struct_mode_p (machine_mode mode)
1042 {
1043 return mode == OImode || mode == CImode || mode == XImode;
1044 }
1045
1046 /* Return TRUE if MODE is any of the vector modes. */
1047 static bool
1048 aarch64_vector_mode_p (machine_mode mode)
1049 {
1050 return aarch64_vector_mode_supported_p (mode)
1051 || aarch64_vect_struct_mode_p (mode);
1052 }
1053
1054 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1055 static bool
1056 aarch64_array_mode_supported_p (machine_mode mode,
1057 unsigned HOST_WIDE_INT nelems)
1058 {
1059 if (TARGET_SIMD
1060 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1061 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1062 && (nelems >= 2 && nelems <= 4))
1063 return true;
1064
1065 return false;
1066 }
1067
1068 /* Implement HARD_REGNO_NREGS. */
1069
1070 int
1071 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1072 {
1073 switch (aarch64_regno_regclass (regno))
1074 {
1075 case FP_REGS:
1076 case FP_LO_REGS:
1077 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1078 default:
1079 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1080 }
1081 gcc_unreachable ();
1082 }
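/* Worked examples (illustrative, not from the original source), assuming
   UNITS_PER_VREG == 16 and UNITS_PER_WORD == 8:
     - TImode (16 bytes) in a general register pair needs 2 registers;
     - V4SImode (16 bytes) in an FP/SIMD register needs 1 register;
     - OImode (32 bytes, a two-vector tuple) in FP/SIMD registers needs 2.  */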
1083
1084 /* Implement HARD_REGNO_MODE_OK. */
1085
1086 int
1087 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1088 {
1089 if (GET_MODE_CLASS (mode) == MODE_CC)
1090 return regno == CC_REGNUM;
1091
1092 if (regno == SP_REGNUM)
1093 /* The purpose of comparing with ptr_mode is to support the
1094 global register variable associated with the stack pointer
1095 register via the syntax of asm ("wsp") in ILP32. */
1096 return mode == Pmode || mode == ptr_mode;
1097
1098 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1099 return mode == Pmode;
1100
1101 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1102 return 1;
1103
1104 if (FP_REGNUM_P (regno))
1105 {
1106 if (aarch64_vect_struct_mode_p (mode))
1107 return
1108 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1109 else
1110 return 1;
1111 }
1112
1113 return 0;
1114 }
1115
1116 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1117 machine_mode
1118 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1119 machine_mode mode)
1120 {
1121 /* Handle modes that fit within single registers. */
1122 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1123 {
1124 if (GET_MODE_SIZE (mode) >= 4)
1125 return mode;
1126 else
1127 return SImode;
1128 }
1129 /* Fall back to generic for multi-reg and very large modes. */
1130 else
1131 return choose_hard_reg_mode (regno, nregs, false);
1132 }
1133
1134 /* Return true if calls to DECL should be treated as
1135 long-calls (ie called via a register). */
1136 static bool
1137 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1138 {
1139 return false;
1140 }
1141
1142 /* Return true if calls to symbol-ref SYM should be treated as
1143 long-calls (ie called via a register). */
1144 bool
1145 aarch64_is_long_call_p (rtx sym)
1146 {
1147 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1148 }
1149
1150 /* Return true if calls to symbol-ref SYM should not go through
1151 plt stubs. */
1152
1153 bool
1154 aarch64_is_noplt_call_p (rtx sym)
1155 {
1156 const_tree decl = SYMBOL_REF_DECL (sym);
1157
1158 if (flag_pic
1159 && decl
1160 && (!flag_plt
1161 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1162 && !targetm.binds_local_p (decl))
1163 return true;
1164
1165 return false;
1166 }
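/* Usage note (a sketch, not from the original source): when compiling with
   -fpic and either -fno-plt or a declaration such as

     extern void ext_fn (void) __attribute__ ((noplt));

   a call to a non-locally-binding ext_fn is reported as a no-PLT call, so it
   can be emitted as an indirect call through the GOT entry rather than
   through a PLT stub.  ext_fn is a made-up name used only for illustration.  */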
1167
1168 /* Return true if the offsets to a zero/sign-extract operation
1169 represent an expression that matches an extend operation. The
 1170 operands represent the parameters from
1171
1172 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1173 bool
1174 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1175 rtx extract_imm)
1176 {
1177 HOST_WIDE_INT mult_val, extract_val;
1178
1179 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1180 return false;
1181
1182 mult_val = INTVAL (mult_imm);
1183 extract_val = INTVAL (extract_imm);
1184
1185 if (extract_val > 8
1186 && extract_val < GET_MODE_BITSIZE (mode)
1187 && exact_log2 (extract_val & ~7) > 0
1188 && (extract_val & 7) <= 4
1189 && mult_val == (1 << (extract_val & 7)))
1190 return true;
1191
1192 return false;
1193 }
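/* Worked example (illustrative): EXTRACT_IMM == 34 and MULT_IMM == 4 satisfy
   every test above, since 34 & ~7 == 32 is a power of two, 34 & 7 == 2 <= 4
   and 1 << 2 == 4.  So

     (extract:DI (mult (reg) (const_int 4)) (const_int 34) (const_int 0))

   is recognised as an extend (uxtw/sxtw style) of a 32-bit value scaled
   by 4.  */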
1194
1195 /* Emit an insn that's a simple single-set. Both the operands must be
1196 known to be valid. */
1197 inline static rtx_insn *
1198 emit_set_insn (rtx x, rtx y)
1199 {
1200 return emit_insn (gen_rtx_SET (x, y));
1201 }
1202
1203 /* X and Y are two things to compare using CODE. Emit the compare insn and
1204 return the rtx for register 0 in the proper mode. */
1205 rtx
1206 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1207 {
1208 machine_mode mode = SELECT_CC_MODE (code, x, y);
1209 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1210
1211 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1212 return cc_reg;
1213 }
1214
1215 /* Build the SYMBOL_REF for __tls_get_addr. */
1216
1217 static GTY(()) rtx tls_get_addr_libfunc;
1218
1219 rtx
1220 aarch64_tls_get_addr (void)
1221 {
1222 if (!tls_get_addr_libfunc)
1223 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1224 return tls_get_addr_libfunc;
1225 }
1226
1227 /* Return the TLS model to use for ADDR. */
1228
1229 static enum tls_model
1230 tls_symbolic_operand_type (rtx addr)
1231 {
1232 enum tls_model tls_kind = TLS_MODEL_NONE;
1233 rtx sym, addend;
1234
1235 if (GET_CODE (addr) == CONST)
1236 {
1237 split_const (addr, &sym, &addend);
1238 if (GET_CODE (sym) == SYMBOL_REF)
1239 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1240 }
1241 else if (GET_CODE (addr) == SYMBOL_REF)
1242 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1243
1244 return tls_kind;
1245 }
1246
 1247 /* We allow lo_sum expressions in our legitimate addresses so that
 1248 combine can take care of combining addresses where necessary, but
 1249 for generation purposes we generate the address
 1250 as:
1251 RTL Absolute
1252 tmp = hi (symbol_ref); adrp x1, foo
1253 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1254 nop
1255
1256 PIC TLS
1257 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1258 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1259 bl __tls_get_addr
1260 nop
1261
1262 Load TLS symbol, depending on TLS mechanism and TLS access model.
1263
1264 Global Dynamic - Traditional TLS:
1265 adrp tmp, :tlsgd:imm
1266 add dest, tmp, #:tlsgd_lo12:imm
1267 bl __tls_get_addr
1268
1269 Global Dynamic - TLS Descriptors:
1270 adrp dest, :tlsdesc:imm
1271 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1272 add dest, dest, #:tlsdesc_lo12:imm
1273 blr tmp
1274 mrs tp, tpidr_el0
1275 add dest, dest, tp
1276
1277 Initial Exec:
1278 mrs tp, tpidr_el0
1279 adrp tmp, :gottprel:imm
1280 ldr dest, [tmp, #:gottprel_lo12:imm]
1281 add dest, dest, tp
1282
1283 Local Exec:
1284 mrs tp, tpidr_el0
1285 add t0, tp, #:tprel_hi12:imm, lsl #12
1286 add t0, t0, #:tprel_lo12_nc:imm
1287 */
1288
1289 static void
1290 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1291 enum aarch64_symbol_type type)
1292 {
1293 switch (type)
1294 {
1295 case SYMBOL_SMALL_ABSOLUTE:
1296 {
1297 /* In ILP32, the mode of dest can be either SImode or DImode. */
1298 rtx tmp_reg = dest;
1299 machine_mode mode = GET_MODE (dest);
1300
1301 gcc_assert (mode == Pmode || mode == ptr_mode);
1302
1303 if (can_create_pseudo_p ())
1304 tmp_reg = gen_reg_rtx (mode);
1305
1306 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1307 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1308 return;
1309 }
1310
1311 case SYMBOL_TINY_ABSOLUTE:
1312 emit_insn (gen_rtx_SET (dest, imm));
1313 return;
1314
1315 case SYMBOL_SMALL_GOT_28K:
1316 {
1317 machine_mode mode = GET_MODE (dest);
1318 rtx gp_rtx = pic_offset_table_rtx;
1319 rtx insn;
1320 rtx mem;
1321
 1322 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
 1323 here before RTL expansion. Tree IVOPTS generates RTL patterns to
 1324 decide rtx costs, in which case pic_offset_table_rtx is not
 1325 initialized. In that case there is no need to generate the first
 1326 adrp instruction, as the final cost for global variable access is
 1327 one instruction. */
1328 if (gp_rtx != NULL)
1329 {
 1330 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
 1331 use the page base as the GOT base, the first page may be wasted;
 1332 in the worst case only 28K of GOT space is available).
 1333 
 1334 The generated instruction sequence for accessing a global variable
 1335 is:
 1336 
 1337 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
 1338 
 1339 Only one instruction is needed. But we must initialize
 1340 pic_offset_table_rtx properly. We generate an initialization insn for
 1341 every global access and rely on CSE to remove the redundant ones.
 1342 
 1343 The final instruction sequence for accessing multiple global
 1344 variables will then look like the following.
1345
1346 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1347
1348 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1349 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1350 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1351 ... */
1352
1353 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1354 crtl->uses_pic_offset_table = 1;
1355 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1356
1357 if (mode != GET_MODE (gp_rtx))
1358 gp_rtx = gen_lowpart (mode, gp_rtx);
1359
1360 }
1361
1362 if (mode == ptr_mode)
1363 {
1364 if (mode == DImode)
1365 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1366 else
1367 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1368
1369 mem = XVECEXP (SET_SRC (insn), 0, 0);
1370 }
1371 else
1372 {
1373 gcc_assert (mode == Pmode);
1374
1375 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1376 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1377 }
1378
 1379 /* The operand is expected to be a MEM. Whenever the related insn
 1380 pattern changes, the above code which calculates MEM should be
 1381 updated. */
1382 gcc_assert (GET_CODE (mem) == MEM);
1383 MEM_READONLY_P (mem) = 1;
1384 MEM_NOTRAP_P (mem) = 1;
1385 emit_insn (insn);
1386 return;
1387 }
1388
1389 case SYMBOL_SMALL_GOT_4G:
1390 {
1391 /* In ILP32, the mode of dest can be either SImode or DImode,
1392 while the got entry is always of SImode size. The mode of
1393 dest depends on how dest is used: if dest is assigned to a
 1394 pointer (e.g. stored in memory), it has SImode; it may have
 1395 DImode if dest is dereferenced to access the memory.
1396 This is why we have to handle three different ldr_got_small
1397 patterns here (two patterns for ILP32). */
1398
1399 rtx insn;
1400 rtx mem;
1401 rtx tmp_reg = dest;
1402 machine_mode mode = GET_MODE (dest);
1403
1404 if (can_create_pseudo_p ())
1405 tmp_reg = gen_reg_rtx (mode);
1406
1407 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1408 if (mode == ptr_mode)
1409 {
1410 if (mode == DImode)
1411 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1412 else
1413 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1414
1415 mem = XVECEXP (SET_SRC (insn), 0, 0);
1416 }
1417 else
1418 {
1419 gcc_assert (mode == Pmode);
1420
1421 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1422 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1423 }
1424
1425 gcc_assert (GET_CODE (mem) == MEM);
1426 MEM_READONLY_P (mem) = 1;
1427 MEM_NOTRAP_P (mem) = 1;
1428 emit_insn (insn);
1429 return;
1430 }
1431
1432 case SYMBOL_SMALL_TLSGD:
1433 {
1434 rtx_insn *insns;
1435 machine_mode mode = GET_MODE (dest);
1436 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1437
1438 start_sequence ();
1439 if (TARGET_ILP32)
1440 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1441 else
1442 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1443 insns = get_insns ();
1444 end_sequence ();
1445
1446 RTL_CONST_CALL_P (insns) = 1;
1447 emit_libcall_block (insns, dest, result, imm);
1448 return;
1449 }
1450
1451 case SYMBOL_SMALL_TLSDESC:
1452 {
1453 machine_mode mode = GET_MODE (dest);
1454 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1455 rtx tp;
1456
1457 gcc_assert (mode == Pmode || mode == ptr_mode);
1458
1459 /* In ILP32, the got entry is always of SImode size. Unlike
1460 small GOT, the dest is fixed at reg 0. */
1461 if (TARGET_ILP32)
1462 emit_insn (gen_tlsdesc_small_si (imm));
1463 else
1464 emit_insn (gen_tlsdesc_small_di (imm));
1465 tp = aarch64_load_tp (NULL);
1466
1467 if (mode != Pmode)
1468 tp = gen_lowpart (mode, tp);
1469
1470 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1471 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1472 return;
1473 }
1474
1475 case SYMBOL_SMALL_TLSIE:
1476 {
1477 /* In ILP32, the mode of dest can be either SImode or DImode,
1478 while the got entry is always of SImode size. The mode of
1479 dest depends on how dest is used: if dest is assigned to a
 1480 pointer (e.g. stored in memory), it has SImode; it may have
 1481 DImode if dest is dereferenced to access the memory.
1482 This is why we have to handle three different tlsie_small
1483 patterns here (two patterns for ILP32). */
1484 machine_mode mode = GET_MODE (dest);
1485 rtx tmp_reg = gen_reg_rtx (mode);
1486 rtx tp = aarch64_load_tp (NULL);
1487
1488 if (mode == ptr_mode)
1489 {
1490 if (mode == DImode)
1491 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1492 else
1493 {
1494 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1495 tp = gen_lowpart (mode, tp);
1496 }
1497 }
1498 else
1499 {
1500 gcc_assert (mode == Pmode);
1501 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1502 }
1503
1504 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1505 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1506 return;
1507 }
1508
1509 case SYMBOL_TLSLE12:
1510 case SYMBOL_TLSLE24:
1511 case SYMBOL_TLSLE32:
1512 case SYMBOL_TLSLE48:
1513 {
1514 machine_mode mode = GET_MODE (dest);
1515 rtx tp = aarch64_load_tp (NULL);
1516
1517 if (mode != Pmode)
1518 tp = gen_lowpart (mode, tp);
1519
1520 switch (type)
1521 {
1522 case SYMBOL_TLSLE12:
1523 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1524 (dest, tp, imm));
1525 break;
1526 case SYMBOL_TLSLE24:
1527 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1528 (dest, tp, imm));
1529 break;
1530 case SYMBOL_TLSLE32:
1531 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1532 (dest, imm));
1533 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1534 (dest, dest, tp));
1535 break;
1536 case SYMBOL_TLSLE48:
1537 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1538 (dest, imm));
1539 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1540 (dest, dest, tp));
1541 break;
1542 default:
1543 gcc_unreachable ();
1544 }
1545
1546 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1547 return;
1548 }
1549
1550 case SYMBOL_TINY_GOT:
1551 emit_insn (gen_ldr_got_tiny (dest, imm));
1552 return;
1553
1554 case SYMBOL_TINY_TLSIE:
1555 {
1556 machine_mode mode = GET_MODE (dest);
1557 rtx tp = aarch64_load_tp (NULL);
1558
1559 if (mode == ptr_mode)
1560 {
1561 if (mode == DImode)
1562 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1563 else
1564 {
1565 tp = gen_lowpart (mode, tp);
1566 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1567 }
1568 }
1569 else
1570 {
1571 gcc_assert (mode == Pmode);
1572 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1573 }
1574
1575 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1576 return;
1577 }
1578
1579 default:
1580 gcc_unreachable ();
1581 }
1582 }
1583
1584 /* Emit a move from SRC to DEST. Assume that the move expanders can
1585 handle all moves if !can_create_pseudo_p (). The distinction is
1586 important because, unlike emit_move_insn, the move expanders know
1587 how to force Pmode objects into the constant pool even when the
1588 constant pool address is not itself legitimate. */
1589 static rtx
1590 aarch64_emit_move (rtx dest, rtx src)
1591 {
1592 return (can_create_pseudo_p ()
1593 ? emit_move_insn (dest, src)
1594 : emit_move_insn_1 (dest, src));
1595 }
1596
1597 /* Split a 128-bit move operation into two 64-bit move operations,
1598 taking care to handle partial overlap of register to register
1599 copies. Special cases are needed when moving between GP regs and
1600 FP regs. SRC can be a register, constant or memory; DST a register
1601 or memory. If either operand is memory it must not have any side
1602 effects. */
1603 void
1604 aarch64_split_128bit_move (rtx dst, rtx src)
1605 {
1606 rtx dst_lo, dst_hi;
1607 rtx src_lo, src_hi;
1608
1609 machine_mode mode = GET_MODE (dst);
1610
1611 gcc_assert (mode == TImode || mode == TFmode);
1612 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1613 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1614
1615 if (REG_P (dst) && REG_P (src))
1616 {
1617 int src_regno = REGNO (src);
1618 int dst_regno = REGNO (dst);
1619
1620 /* Handle FP <-> GP regs. */
1621 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1622 {
1623 src_lo = gen_lowpart (word_mode, src);
1624 src_hi = gen_highpart (word_mode, src);
1625
1626 if (mode == TImode)
1627 {
1628 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1629 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1630 }
1631 else
1632 {
1633 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1634 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1635 }
1636 return;
1637 }
1638 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1639 {
1640 dst_lo = gen_lowpart (word_mode, dst);
1641 dst_hi = gen_highpart (word_mode, dst);
1642
1643 if (mode == TImode)
1644 {
1645 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1646 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1647 }
1648 else
1649 {
1650 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1651 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1652 }
1653 return;
1654 }
1655 }
1656
1657 dst_lo = gen_lowpart (word_mode, dst);
1658 dst_hi = gen_highpart (word_mode, dst);
1659 src_lo = gen_lowpart (word_mode, src);
1660 src_hi = gen_highpart_mode (word_mode, mode, src);
1661
1662 /* At most one pairing may overlap. */
1663 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1664 {
1665 aarch64_emit_move (dst_hi, src_hi);
1666 aarch64_emit_move (dst_lo, src_lo);
1667 }
1668 else
1669 {
1670 aarch64_emit_move (dst_lo, src_lo);
1671 aarch64_emit_move (dst_hi, src_hi);
1672 }
1673 }
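/* Overlap example (illustrative register choices): splitting a TImode copy
   whose source lives in {x1,x2} and whose destination is {x2,x3} must move
   the high halves first, since x2 is both the destination low half and the
   source high half; the reg_overlap_mentioned_p test above selects that
   ordering.  */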
1674
1675 bool
1676 aarch64_split_128bit_move_p (rtx dst, rtx src)
1677 {
1678 return (! REG_P (src)
1679 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1680 }
1681
1682 /* Split a complex SIMD combine. */
1683
1684 void
1685 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1686 {
1687 machine_mode src_mode = GET_MODE (src1);
1688 machine_mode dst_mode = GET_MODE (dst);
1689
1690 gcc_assert (VECTOR_MODE_P (dst_mode));
1691
1692 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1693 {
1694 rtx (*gen) (rtx, rtx, rtx);
1695
1696 switch (src_mode)
1697 {
1698 case V8QImode:
1699 gen = gen_aarch64_simd_combinev8qi;
1700 break;
1701 case V4HImode:
1702 gen = gen_aarch64_simd_combinev4hi;
1703 break;
1704 case V2SImode:
1705 gen = gen_aarch64_simd_combinev2si;
1706 break;
1707 case V4HFmode:
1708 gen = gen_aarch64_simd_combinev4hf;
1709 break;
1710 case V2SFmode:
1711 gen = gen_aarch64_simd_combinev2sf;
1712 break;
1713 case DImode:
1714 gen = gen_aarch64_simd_combinedi;
1715 break;
1716 case DFmode:
1717 gen = gen_aarch64_simd_combinedf;
1718 break;
1719 default:
1720 gcc_unreachable ();
1721 }
1722
1723 emit_insn (gen (dst, src1, src2));
1724 return;
1725 }
1726 }
1727
1728 /* Split a complex SIMD move. */
1729
1730 void
1731 aarch64_split_simd_move (rtx dst, rtx src)
1732 {
1733 machine_mode src_mode = GET_MODE (src);
1734 machine_mode dst_mode = GET_MODE (dst);
1735
1736 gcc_assert (VECTOR_MODE_P (dst_mode));
1737
1738 if (REG_P (dst) && REG_P (src))
1739 {
1740 rtx (*gen) (rtx, rtx);
1741
1742 gcc_assert (VECTOR_MODE_P (src_mode));
1743
1744 switch (src_mode)
1745 {
1746 case V16QImode:
1747 gen = gen_aarch64_split_simd_movv16qi;
1748 break;
1749 case V8HImode:
1750 gen = gen_aarch64_split_simd_movv8hi;
1751 break;
1752 case V4SImode:
1753 gen = gen_aarch64_split_simd_movv4si;
1754 break;
1755 case V2DImode:
1756 gen = gen_aarch64_split_simd_movv2di;
1757 break;
1758 case V8HFmode:
1759 gen = gen_aarch64_split_simd_movv8hf;
1760 break;
1761 case V4SFmode:
1762 gen = gen_aarch64_split_simd_movv4sf;
1763 break;
1764 case V2DFmode:
1765 gen = gen_aarch64_split_simd_movv2df;
1766 break;
1767 default:
1768 gcc_unreachable ();
1769 }
1770
1771 emit_insn (gen (dst, src));
1772 return;
1773 }
1774 }
1775
1776 bool
1777 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1778 machine_mode ymode, rtx y)
1779 {
1780 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1781 gcc_assert (r != NULL);
1782 return rtx_equal_p (x, r);
1783 }
1784
1785
1786 static rtx
1787 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1788 {
1789 if (can_create_pseudo_p ())
1790 return force_reg (mode, value);
1791 else
1792 {
1793 x = aarch64_emit_move (x, value);
1794 return x;
1795 }
1796 }
1797
1798
1799 static rtx
1800 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1801 {
1802 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1803 {
1804 rtx high;
1805 /* Load the full offset into a register. This
1806 might be improvable in the future. */
1807 high = GEN_INT (offset);
1808 offset = 0;
1809 high = aarch64_force_temporary (mode, temp, high);
1810 reg = aarch64_force_temporary (mode, temp,
1811 gen_rtx_PLUS (mode, high, reg));
1812 }
1813 return plus_constant (mode, reg, offset);
1814 }
1815
1816 static int
1817 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1818 machine_mode mode)
1819 {
1820 int i;
1821 unsigned HOST_WIDE_INT val, val2, mask;
1822 int one_match, zero_match;
1823 int num_insns;
1824
1825 val = INTVAL (imm);
1826
1827 if (aarch64_move_imm (val, mode))
1828 {
1829 if (generate)
1830 emit_insn (gen_rtx_SET (dest, imm));
1831 return 1;
1832 }
1833
1834 if ((val >> 32) == 0 || mode == SImode)
1835 {
1836 if (generate)
1837 {
1838 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1839 if (mode == SImode)
1840 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1841 GEN_INT ((val >> 16) & 0xffff)));
1842 else
1843 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1844 GEN_INT ((val >> 16) & 0xffff)));
1845 }
1846 return 2;
1847 }
1848
1849 /* Remaining cases are all for DImode. */
1850
1851 mask = 0xffff;
1852 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1853 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1854 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1855 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1856
1857 if (zero_match != 2 && one_match != 2)
1858 {
1859 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1860 For a 64-bit bitmask try whether changing 16 bits to all ones or
1861 zeroes creates a valid bitmask. To check any repeated bitmask,
1862 try using 16 bits from the other 32-bit half of val. */
1863
1864 for (i = 0; i < 64; i += 16, mask <<= 16)
1865 {
1866 val2 = val & ~mask;
1867 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1868 break;
1869 val2 = val | mask;
1870 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1871 break;
1872 val2 = val2 & ~mask;
1873 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1874 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1875 break;
1876 }
1877 if (i != 64)
1878 {
1879 if (generate)
1880 {
1881 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1882 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1883 GEN_INT ((val >> i) & 0xffff)));
1884 }
1885 return 2;
1886 }
1887 }
1888
1889 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1890 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1891 otherwise skip zero bits. */
1892
1893 num_insns = 1;
1894 mask = 0xffff;
1895 val2 = one_match > zero_match ? ~val : val;
1896 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1897
1898 if (generate)
1899 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1900 ? (val | ~(mask << i))
1901 : (val & (mask << i)))));
1902 for (i += 16; i < 64; i += 16)
1903 {
1904 if ((val2 & (mask << i)) == 0)
1905 continue;
1906 if (generate)
1907 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1908 GEN_INT ((val >> i) & 0xffff)));
 1909 num_insns++;
1910 }
1911
1912 return num_insns;
1913 }
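/* Illustrative example (w0 is an arbitrary register choice, not from the
   original source): the SImode constant 0x12345678 is neither a MOV-wide nor
   a bitmask immediate, so the SImode path above emits two instructions:

     mov  w0, #0x5678
     movk w0, #0x1234, lsl #16
*/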
1914
1915
1916 void
1917 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1918 {
1919 machine_mode mode = GET_MODE (dest);
1920
1921 gcc_assert (mode == SImode || mode == DImode);
1922
1923 /* Check on what type of symbol it is. */
1924 if (GET_CODE (imm) == SYMBOL_REF
1925 || GET_CODE (imm) == LABEL_REF
1926 || GET_CODE (imm) == CONST)
1927 {
1928 rtx mem, base, offset;
1929 enum aarch64_symbol_type sty;
1930
1931 /* If we have (const (plus symbol offset)), separate out the offset
1932 before we start classifying the symbol. */
1933 split_const (imm, &base, &offset);
1934
1935 sty = aarch64_classify_symbol (base, offset);
1936 switch (sty)
1937 {
1938 case SYMBOL_FORCE_TO_MEM:
1939 if (offset != const0_rtx
1940 && targetm.cannot_force_const_mem (mode, imm))
1941 {
1942 gcc_assert (can_create_pseudo_p ());
1943 base = aarch64_force_temporary (mode, dest, base);
1944 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1945 aarch64_emit_move (dest, base);
1946 return;
1947 }
1948
1949 mem = force_const_mem (ptr_mode, imm);
1950 gcc_assert (mem);
1951
1952 /* If we aren't generating PC relative literals, then
1953 we need to expand the literal pool access carefully.
1954 This is something that needs to be done in a number
1955 of places, so could well live as a separate function. */
1956 if (!aarch64_pcrelative_literal_loads)
1957 {
1958 gcc_assert (can_create_pseudo_p ());
1959 base = gen_reg_rtx (ptr_mode);
1960 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1961 mem = gen_rtx_MEM (ptr_mode, base);
1962 }
1963
1964 if (mode != ptr_mode)
1965 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1966
1967 emit_insn (gen_rtx_SET (dest, mem));
1968
1969 return;
1970
1971 case SYMBOL_SMALL_TLSGD:
1972 case SYMBOL_SMALL_TLSDESC:
1973 case SYMBOL_SMALL_TLSIE:
1974 case SYMBOL_SMALL_GOT_28K:
1975 case SYMBOL_SMALL_GOT_4G:
1976 case SYMBOL_TINY_GOT:
1977 case SYMBOL_TINY_TLSIE:
1978 if (offset != const0_rtx)
1979 {
1980 	      gcc_assert (can_create_pseudo_p ());
1981 base = aarch64_force_temporary (mode, dest, base);
1982 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1983 aarch64_emit_move (dest, base);
1984 return;
1985 }
1986 /* FALLTHRU */
1987
1988 case SYMBOL_SMALL_ABSOLUTE:
1989 case SYMBOL_TINY_ABSOLUTE:
1990 case SYMBOL_TLSLE12:
1991 case SYMBOL_TLSLE24:
1992 case SYMBOL_TLSLE32:
1993 case SYMBOL_TLSLE48:
1994 aarch64_load_symref_appropriately (dest, imm, sty);
1995 return;
1996
1997 default:
1998 gcc_unreachable ();
1999 }
2000 }
2001
2002 if (!CONST_INT_P (imm))
2003 {
2004 if (GET_CODE (imm) == HIGH)
2005 emit_insn (gen_rtx_SET (dest, imm));
2006 else
2007 {
2008 rtx mem = force_const_mem (mode, imm);
2009 gcc_assert (mem);
2010 emit_insn (gen_rtx_SET (dest, mem));
2011 }
2012
2013 return;
2014 }
2015
2016 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
2017 }
2018
2019 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2020 temporary value if necessary. FRAME_RELATED_P should be true if
2021 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2022 to the generated instructions. If SCRATCHREG is known to hold
2023 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2024 immediate again.
2025
2026 Since this function may be used to adjust the stack pointer, we must
2027 ensure that it cannot cause transient stack deallocation (for example
2028 by first incrementing SP and then decrementing when adjusting by a
2029 large immediate). */
2030
2031 static void
2032 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
2033 HOST_WIDE_INT delta, bool frame_related_p,
2034 bool emit_move_imm)
2035 {
2036 HOST_WIDE_INT mdelta = abs_hwi (delta);
2037 rtx this_rtx = gen_rtx_REG (mode, regnum);
2038 rtx_insn *insn;
2039
2040 if (!mdelta)
2041 return;
2042
2043 /* Single instruction adjustment. */
2044 if (aarch64_uimm12_shift (mdelta))
2045 {
2046 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2047 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2048 return;
2049 }
2050
2051   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2052      Only do this if mdelta is not representable as a move immediate,
2053      since adjusting with a single move is better.  */
2054 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2055 {
2056 HOST_WIDE_INT low_off = mdelta & 0xfff;
2057
2058 low_off = delta < 0 ? -low_off : low_off;
2059 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2060 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2061 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2062 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2063 return;
2064 }
2065
2066 /* Emit a move immediate if required and an addition/subtraction. */
2067 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2068 if (emit_move_imm)
2069 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2070 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2071 : gen_add2_insn (this_rtx, scratch_rtx));
2072 if (frame_related_p)
2073 {
2074 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2075 rtx adj = plus_constant (mode, this_rtx, delta);
2076       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2077 }
2078 }
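/* A worked example of the cases above: an adjustment of 0x123456 is
   neither a 12-bit shifted immediate nor a move immediate, so it takes
   the two-addition path and is split into
	add	reg, reg, #0x456
	add	reg, reg, #0x123000
   Larger deltas (or ones that are themselves move immediates) instead
   move abs (delta) into SCRATCHREG and perform a single register
   add or sub.  */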
2079
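/* Add DELTA to register REGNUM in mode MODE, using SCRATCHREG as a
   temporary if needed.  The adjustment is not marked frame-related.  */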
2080 static inline void
2081 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2082 HOST_WIDE_INT delta)
2083 {
2084 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2085 }
2086
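/* Add DELTA to the stack pointer, marking the instructions frame-related.
   SCRATCHREG and EMIT_MOVE_IMM are as for aarch64_add_constant_internal.  */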
2087 static inline void
2088 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2089 {
2090 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2091 true, emit_move_imm);
2092 }
2093
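/* Subtract DELTA from the stack pointer, marking the instructions
   frame-related if FRAME_RELATED_P.  SCRATCHREG may be used to hold a
   temporary value.  */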
2094 static inline void
2095 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2096 {
2097 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2098 frame_related_p, true);
2099 }
2100
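/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */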
2101 static bool
2102 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2103 tree exp ATTRIBUTE_UNUSED)
2104 {
2105 /* Currently, always true. */
2106 return true;
2107 }
2108
2109 /* Implement TARGET_PASS_BY_REFERENCE. */
2110
2111 static bool
2112 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2113 machine_mode mode,
2114 const_tree type,
2115 bool named ATTRIBUTE_UNUSED)
2116 {
2117 HOST_WIDE_INT size;
2118 machine_mode dummymode;
2119 int nregs;
2120
2121 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2122 size = (mode == BLKmode && type)
2123 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2124
2125 /* Aggregates are passed by reference based on their size. */
2126 if (type && AGGREGATE_TYPE_P (type))
2127 {
2128 size = int_size_in_bytes (type);
2129 }
2130
2131   /* Variable sized arguments are always passed by reference.  */
2132 if (size < 0)
2133 return true;
2134
2135 /* Can this be a candidate to be passed in fp/simd register(s)? */
2136 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2137 &dummymode, &nregs,
2138 NULL))
2139 return false;
2140
2141 /* Arguments which are variable sized or larger than 2 registers are
2142      passed by reference unless they are a homogeneous floating-point
2143      aggregate.  */
2144 return size > 2 * UNITS_PER_WORD;
2145 }
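/* For example, a 24-byte structure of integers is passed by reference
   (it would need three registers), whereas a structure of three doubles,
   although also 24 bytes, is a homogeneous floating-point aggregate and
   is therefore a candidate for the SIMD/FP registers and passed by
   value.  */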
2146
2147 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2148 static bool
2149 aarch64_return_in_msb (const_tree valtype)
2150 {
2151 machine_mode dummy_mode;
2152 int dummy_int;
2153
2154 /* Never happens in little-endian mode. */
2155 if (!BYTES_BIG_ENDIAN)
2156 return false;
2157
2158 /* Only composite types smaller than or equal to 16 bytes can
2159 be potentially returned in registers. */
2160 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2161 || int_size_in_bytes (valtype) <= 0
2162 || int_size_in_bytes (valtype) > 16)
2163 return false;
2164
2165 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2166 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2167 is always passed/returned in the least significant bits of fp/simd
2168 register(s). */
2169 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2170 &dummy_mode, &dummy_int, NULL))
2171 return false;
2172
2173 return true;
2174 }
2175
2176 /* Implement TARGET_FUNCTION_VALUE.
2177 Define how to find the value returned by a function. */
2178
2179 static rtx
2180 aarch64_function_value (const_tree type, const_tree func,
2181 bool outgoing ATTRIBUTE_UNUSED)
2182 {
2183 machine_mode mode;
2184 int unsignedp;
2185 int count;
2186 machine_mode ag_mode;
2187
2188 mode = TYPE_MODE (type);
2189 if (INTEGRAL_TYPE_P (type))
2190 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2191
2192 if (aarch64_return_in_msb (type))
2193 {
2194 HOST_WIDE_INT size = int_size_in_bytes (type);
2195
2196 if (size % UNITS_PER_WORD != 0)
2197 {
2198 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2199 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2200 }
2201 }
2202
2203 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2204 &ag_mode, &count, NULL))
2205 {
2206 if (!aarch64_composite_type_p (type, mode))
2207 {
2208 gcc_assert (count == 1 && mode == ag_mode);
2209 return gen_rtx_REG (mode, V0_REGNUM);
2210 }
2211 else
2212 {
2213 int i;
2214 rtx par;
2215
2216 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2217 for (i = 0; i < count; i++)
2218 {
2219 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2220 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2221 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2222 XVECEXP (par, 0, i) = tmp;
2223 }
2224 return par;
2225 }
2226 }
2227 else
2228 return gen_rtx_REG (mode, R0_REGNUM);
2229 }
2230
2231 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
2232    Return true if REGNO is the number of a hard register in which the values
2233    of a called function may come back.  */
2234
2235 static bool
2236 aarch64_function_value_regno_p (const unsigned int regno)
2237 {
2238 /* Maximum of 16 bytes can be returned in the general registers. Examples
2239 of 16-byte return values are: 128-bit integers and 16-byte small
2240 structures (excluding homogeneous floating-point aggregates). */
2241 if (regno == R0_REGNUM || regno == R1_REGNUM)
2242 return true;
2243
2244 /* Up to four fp/simd registers can return a function value, e.g. a
2245 homogeneous floating-point aggregate having four members. */
2246 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2247 return TARGET_FLOAT;
2248
2249 return false;
2250 }
2251
2252 /* Implement TARGET_RETURN_IN_MEMORY.
2253
2254 If the type T of the result of a function is such that
2255 void func (T arg)
2256 would require that arg be passed as a value in a register (or set of
2257 registers) according to the parameter passing rules, then the result
2258 is returned in the same registers as would be used for such an
2259 argument. */
2260
2261 static bool
2262 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2263 {
2264 HOST_WIDE_INT size;
2265 machine_mode ag_mode;
2266 int count;
2267
2268 if (!AGGREGATE_TYPE_P (type)
2269 && TREE_CODE (type) != COMPLEX_TYPE
2270 && TREE_CODE (type) != VECTOR_TYPE)
2271     /* Simple scalar types are always returned in registers.  */
2272 return false;
2273
2274 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2275 type,
2276 &ag_mode,
2277 &count,
2278 NULL))
2279 return false;
2280
2281   /* Types larger than 2 registers are returned in memory.  */
2282 size = int_size_in_bytes (type);
2283 return (size < 0 || size > 2 * UNITS_PER_WORD);
2284 }
2285
2286 static bool
2287 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2288 const_tree type, int *nregs)
2289 {
2290 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2291 return aarch64_vfp_is_call_or_return_candidate (mode,
2292 type,
2293 &pcum->aapcs_vfp_rmode,
2294 nregs,
2295 NULL);
2296 }
2297
2298 /* Given MODE and TYPE of a function argument, return the alignment in
2299 bits. The idea is to suppress any stronger alignment requested by
2300 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2301 This is a helper function for local use only. */
2302
2303 static unsigned int
2304 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2305 {
2306 if (!type)
2307 return GET_MODE_ALIGNMENT (mode);
2308
2309 if (integer_zerop (TYPE_SIZE (type)))
2310 return 0;
2311
2312 gcc_assert (TYPE_MODE (type) == mode);
2313
2314 if (!AGGREGATE_TYPE_P (type))
2315 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2316
2317 if (TREE_CODE (type) == ARRAY_TYPE)
2318 return TYPE_ALIGN (TREE_TYPE (type));
2319
2320 unsigned int alignment = 0;
2321 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2322 if (TREE_CODE (field) == FIELD_DECL)
2323 alignment = std::max (alignment, DECL_ALIGN (field));
2324
2325 return alignment;
2326 }
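/* For instance, for
	struct s { int i; double d; };
   the field walk above yields max (32, 64) == 64 bits, independently of
   any stronger alignment the user may have requested on the struct type
   itself (see the comment before this function).  */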
2327
2328 /* Lay out a function argument according to the AAPCS64 rules.  The rule
2329    numbers refer to the corresponding rules in the AAPCS64 document.  */
2330
2331 static void
2332 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2333 const_tree type,
2334 bool named ATTRIBUTE_UNUSED)
2335 {
2336 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2337 int ncrn, nvrn, nregs;
2338 bool allocate_ncrn, allocate_nvrn;
2339 HOST_WIDE_INT size;
2340
2341 /* We need to do this once per argument. */
2342 if (pcum->aapcs_arg_processed)
2343 return;
2344
2345 pcum->aapcs_arg_processed = true;
2346
2347   /* Size in bytes, rounded up to the nearest multiple of 8 bytes.  */
2348 size
2349 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2350 UNITS_PER_WORD);
2351
2352 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2353 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2354 mode,
2355 type,
2356 &nregs);
2357
2358   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite
2359      reliable.  The code below therefore handles passing by SIMD/FP registers first.  */
2360
2361 nvrn = pcum->aapcs_nvrn;
2362
2363   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2364      and homogeneous short-vector aggregates (HVA).  */
2365 if (allocate_nvrn)
2366 {
2367 if (!TARGET_FLOAT)
2368 aarch64_err_no_fpadvsimd (mode, "argument");
2369
2370 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2371 {
2372 pcum->aapcs_nextnvrn = nvrn + nregs;
2373 if (!aarch64_composite_type_p (type, mode))
2374 {
2375 gcc_assert (nregs == 1);
2376 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2377 }
2378 else
2379 {
2380 rtx par;
2381 int i;
2382 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2383 for (i = 0; i < nregs; i++)
2384 {
2385 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2386 V0_REGNUM + nvrn + i);
2387 tmp = gen_rtx_EXPR_LIST
2388 (VOIDmode, tmp,
2389 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2390 XVECEXP (par, 0, i) = tmp;
2391 }
2392 pcum->aapcs_reg = par;
2393 }
2394 return;
2395 }
2396 else
2397 {
2398 /* C.3 NSRN is set to 8. */
2399 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2400 goto on_stack;
2401 }
2402 }
2403
2404 ncrn = pcum->aapcs_ncrn;
2405 nregs = size / UNITS_PER_WORD;
2406
2407   /* C6 - C9, though the sign and zero extension semantics are
2408      handled elsewhere.  This is the case where the argument fits
2409      entirely in general registers.  */
2410 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2411 {
2412
2413 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2414
2415       /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
2416 	  rounded up to the next even number.  */
2417 if (nregs == 2
2418 && ncrn % 2
2419 	  /* The == 16 * BITS_PER_UNIT comparison is used instead of
2420 	     >= 16 * BITS_PER_UNIT because for alignments greater than
2421 	     16 * BITS_PER_UNIT nregs would be > 2, so the argument would be
2422 	     passed by reference rather than by value.  */
2423 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2424 {
2425 ++ncrn;
2426 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2427 }
2428
2429 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2430 A reg is still generated for it, but the caller should be smart
2431 enough not to use it. */
2432 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2433 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2434 else
2435 {
2436 rtx par;
2437 int i;
2438
2439 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2440 for (i = 0; i < nregs; i++)
2441 {
2442 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2443 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2444 GEN_INT (i * UNITS_PER_WORD));
2445 XVECEXP (par, 0, i) = tmp;
2446 }
2447 pcum->aapcs_reg = par;
2448 }
2449
2450 pcum->aapcs_nextncrn = ncrn + nregs;
2451 return;
2452 }
2453
2454 /* C.11 */
2455 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2456
2457   /* The argument is passed on the stack; record the number of words needed
2458      for this argument and align the total size if necessary.  */
2459 on_stack:
2460 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2461
2462 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2463 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2464 16 / UNITS_PER_WORD);
2465 return;
2466 }
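/* As an example of rule C.8 above: a __int128 argument (16-byte
   alignment, nregs == 2) arriving when NCRN is odd causes NCRN to be
   rounded up first, so the value is passed in an even/odd register pair
   and the skipped register is left unused for this argument list.  */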
2467
2468 /* Implement TARGET_FUNCTION_ARG. */
2469
2470 static rtx
2471 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2472 const_tree type, bool named)
2473 {
2474 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2475 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2476
2477 if (mode == VOIDmode)
2478 return NULL_RTX;
2479
2480 aarch64_layout_arg (pcum_v, mode, type, named);
2481 return pcum->aapcs_reg;
2482 }
2483
2484 void
2485 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2486 const_tree fntype ATTRIBUTE_UNUSED,
2487 rtx libname ATTRIBUTE_UNUSED,
2488 const_tree fndecl ATTRIBUTE_UNUSED,
2489 unsigned n_named ATTRIBUTE_UNUSED)
2490 {
2491 pcum->aapcs_ncrn = 0;
2492 pcum->aapcs_nvrn = 0;
2493 pcum->aapcs_nextncrn = 0;
2494 pcum->aapcs_nextnvrn = 0;
2495 pcum->pcs_variant = ARM_PCS_AAPCS64;
2496 pcum->aapcs_reg = NULL_RTX;
2497 pcum->aapcs_arg_processed = false;
2498 pcum->aapcs_stack_words = 0;
2499 pcum->aapcs_stack_size = 0;
2500
2501 if (!TARGET_FLOAT
2502 && fndecl && TREE_PUBLIC (fndecl)
2503 && fntype && fntype != error_mark_node)
2504 {
2505 const_tree type = TREE_TYPE (fntype);
2506 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2507 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2508 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2509 &mode, &nregs, NULL))
2510 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2511 }
2512 return;
2513 }
2514
2515 static void
2516 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2517 machine_mode mode,
2518 const_tree type,
2519 bool named)
2520 {
2521 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2522 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2523 {
2524 aarch64_layout_arg (pcum_v, mode, type, named);
2525 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2526 != (pcum->aapcs_stack_words != 0));
2527 pcum->aapcs_arg_processed = false;
2528 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2529 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2530 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2531 pcum->aapcs_stack_words = 0;
2532 pcum->aapcs_reg = NULL_RTX;
2533 }
2534 }
2535
2536 bool
2537 aarch64_function_arg_regno_p (unsigned regno)
2538 {
2539 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2540 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2541 }
2542
2543 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2544 PARM_BOUNDARY bits of alignment, but will be given anything up
2545 to STACK_BOUNDARY bits if the type requires it. This makes sure
2546 that both before and after the layout of each argument, the Next
2547 Stacked Argument Address (NSAA) will have a minimum alignment of
2548 8 bytes. */
2549
2550 static unsigned int
2551 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2552 {
2553 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2554 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2555 }
2556
2557 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2558
2559 Return true if an argument passed on the stack should be padded upwards,
2560 i.e. if the least-significant byte of the stack slot has useful data.
2561
2562 Small aggregate types are placed in the lowest memory address.
2563
2564 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2565
2566 bool
2567 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2568 {
2569 /* On little-endian targets, the least significant byte of every stack
2570 argument is passed at the lowest byte address of the stack slot. */
2571 if (!BYTES_BIG_ENDIAN)
2572 return true;
2573
2574 /* Otherwise, integral, floating-point and pointer types are padded downward:
2575 the least significant byte of a stack argument is passed at the highest
2576 byte address of the stack slot. */
2577 if (type
2578 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2579 || POINTER_TYPE_P (type))
2580 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2581 return false;
2582
2583 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2584 return true;
2585 }
2586
2587 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2588
2589    It specifies padding for the last (possibly the only)
2590    element of a block move between registers and memory.  Assuming
2591    the block is in memory, padding upward means that
2592    the last element is padded after its most significant byte,
2593    while with downward padding the last element is padded on
2594    its least significant byte side.
2595
2596 Small aggregates and small complex types are always padded
2597 upwards.
2598
2599 We don't need to worry about homogeneous floating-point or
2600 short-vector aggregates; their move is not affected by the
2601 padding direction determined here. Regardless of endianness,
2602 each element of such an aggregate is put in the least
2603 significant bits of a fp/simd register.
2604
2605 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2606 register has useful data, and return the opposite if the most
2607 significant byte does. */
2608
2609 bool
2610 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2611 bool first ATTRIBUTE_UNUSED)
2612 {
2613
2614 /* Small composite types are always padded upward. */
2615 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2616 {
2617 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2618 : GET_MODE_SIZE (mode));
2619 if (size < 2 * UNITS_PER_WORD)
2620 return true;
2621 }
2622
2623 /* Otherwise, use the default padding. */
2624 return !BYTES_BIG_ENDIAN;
2625 }
2626
2627 static machine_mode
2628 aarch64_libgcc_cmp_return_mode (void)
2629 {
2630 return SImode;
2631 }
2632
2633 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2634
2635 /* We use the 12-bit shifted immediate arithmetic instructions so values
2636    must be a multiple of (1 << 12), i.e. 4096.  */
2637 #define ARITH_FACTOR 4096
2638
2639 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2640 #error Cannot use simple address calculation for stack probing
2641 #endif
2642
2643 /* The pair of scratch registers used for stack probing. */
2644 #define PROBE_STACK_FIRST_REG 9
2645 #define PROBE_STACK_SECOND_REG 10
2646
2647 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2648 inclusive. These are offsets from the current stack pointer. */
2649
2650 static void
2651 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2652 {
2653 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2654
2655 /* See the same assertion on PROBE_INTERVAL above. */
2656 gcc_assert ((first % ARITH_FACTOR) == 0);
2657
2658 /* See if we have a constant small number of probes to generate. If so,
2659 that's the easy case. */
2660 if (size <= PROBE_INTERVAL)
2661 {
2662 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2663
2664 emit_set_insn (reg1,
2665 plus_constant (Pmode,
2666 stack_pointer_rtx, -(first + base)));
2667 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2668 }
2669
2670 /* The run-time loop is made up of 8 insns in the generic case while the
2671      compile-time loop is made up of 4+2*(n-2) insns for n intervals.  */
2672 else if (size <= 4 * PROBE_INTERVAL)
2673 {
2674 HOST_WIDE_INT i, rem;
2675
2676 emit_set_insn (reg1,
2677 plus_constant (Pmode,
2678 stack_pointer_rtx,
2679 -(first + PROBE_INTERVAL)));
2680 emit_stack_probe (reg1);
2681
2682 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2683 it exceeds SIZE. If only two probes are needed, this will not
2684 generate any code. Then probe at FIRST + SIZE. */
2685 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2686 {
2687 emit_set_insn (reg1,
2688 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2689 emit_stack_probe (reg1);
2690 }
2691
2692 rem = size - (i - PROBE_INTERVAL);
2693 if (rem > 256)
2694 {
2695 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2696
2697 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2698 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2699 }
2700 else
2701 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2702 }
2703
2704 /* Otherwise, do the same as above, but in a loop. Note that we must be
2705 extra careful with variables wrapping around because we might be at
2706 the very top (or the very bottom) of the address space and we have
2707 to be able to handle this case properly; in particular, we use an
2708 equality test for the loop condition. */
2709 else
2710 {
2711 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2712
2713 /* Step 1: round SIZE to the previous multiple of the interval. */
2714
2715 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2716
2717
2718 /* Step 2: compute initial and final value of the loop counter. */
2719
2720 /* TEST_ADDR = SP + FIRST. */
2721 emit_set_insn (reg1,
2722 plus_constant (Pmode, stack_pointer_rtx, -first));
2723
2724 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2725 emit_set_insn (reg2,
2726 plus_constant (Pmode, stack_pointer_rtx,
2727 -(first + rounded_size)));
2728
2729
2730 /* Step 3: the loop
2731
2732 do
2733 {
2734 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2735 probe at TEST_ADDR
2736 }
2737 while (TEST_ADDR != LAST_ADDR)
2738
2739 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2740 until it is equal to ROUNDED_SIZE. */
2741
2742 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2743
2744
2745 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2746 that SIZE is equal to ROUNDED_SIZE. */
2747
2748 if (size != rounded_size)
2749 {
2750 HOST_WIDE_INT rem = size - rounded_size;
2751
2752 if (rem > 256)
2753 {
2754 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2755
2756 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2757 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2758 }
2759 else
2760 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2761 }
2762 }
2763
2764 /* Make sure nothing is scheduled before we are done. */
2765 emit_insn (gen_blockage ());
2766 }
2767
2768 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2769 absolute addresses. */
2770
2771 const char *
2772 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2773 {
2774 static int labelno = 0;
2775 char loop_lab[32];
2776 rtx xops[2];
2777
2778 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2779
2780 /* Loop. */
2781 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2782
2783 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2784 xops[0] = reg1;
2785 xops[1] = GEN_INT (PROBE_INTERVAL);
2786 output_asm_insn ("sub\t%0, %0, %1", xops);
2787
2788 /* Probe at TEST_ADDR. */
2789 output_asm_insn ("str\txzr, [%0]", xops);
2790
2791 /* Test if TEST_ADDR == LAST_ADDR. */
2792 xops[1] = reg2;
2793 output_asm_insn ("cmp\t%0, %1", xops);
2794
2795 /* Branch. */
2796 fputs ("\tb.ne\t", asm_out_file);
2797 assemble_name_raw (asm_out_file, loop_lab);
2798 fputc ('\n', asm_out_file);
2799
2800 return "";
2801 }
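/* With the default 4 KiB probe interval and the scratch registers chosen
   above (x9 and x10), the emitted loop looks roughly like:
	.LPSRL0:
		sub	x9, x9, #4096
		str	xzr, [x9]
		cmp	x9, x10
		b.ne	.LPSRL0
   i.e. one store of xzr per PROBE_INTERVAL until TEST_ADDR reaches
   LAST_ADDR.  */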
2802
2803 static bool
2804 aarch64_frame_pointer_required (void)
2805 {
2806 /* In aarch64_override_options_after_change
2807 flag_omit_leaf_frame_pointer turns off the frame pointer by
2808 default. Turn it back on now if we've not got a leaf
2809 function. */
2810 if (flag_omit_leaf_frame_pointer
2811 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2812 return true;
2813
2814 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2815 if (crtl->calls_eh_return)
2816 return true;
2817
2818 return false;
2819 }
2820
2821 /* Mark the registers that need to be saved by the callee and calculate
2822 the size of the callee-saved registers area and frame record (both FP
2823 and LR may be omitted). */
2824 static void
2825 aarch64_layout_frame (void)
2826 {
2827 HOST_WIDE_INT offset = 0;
2828 int regno, last_fp_reg = INVALID_REGNUM;
2829
2830 if (reload_completed && cfun->machine->frame.laid_out)
2831 return;
2832
2833 #define SLOT_NOT_REQUIRED (-2)
2834 #define SLOT_REQUIRED (-1)
2835
2836 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2837 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2838
2839 /* First mark all the registers that really need to be saved... */
2840 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2841 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2842
2843 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2844 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2845
2846 /* ... that includes the eh data registers (if needed)... */
2847 if (crtl->calls_eh_return)
2848 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2849 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2850 = SLOT_REQUIRED;
2851
2852 /* ... and any callee saved register that dataflow says is live. */
2853 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2854 if (df_regs_ever_live_p (regno)
2855 && (regno == R30_REGNUM
2856 || !call_used_regs[regno]))
2857 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2858
2859 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2860 if (df_regs_ever_live_p (regno)
2861 && !call_used_regs[regno])
2862 {
2863 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2864 last_fp_reg = regno;
2865 }
2866
2867 if (frame_pointer_needed)
2868 {
2869 /* FP and LR are placed in the linkage record. */
2870 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2871 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2872 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2873 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2874 offset += 2 * UNITS_PER_WORD;
2875 }
2876
2877 /* Now assign stack slots for them. */
2878 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2879 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2880 {
2881 cfun->machine->frame.reg_offset[regno] = offset;
2882 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2883 cfun->machine->frame.wb_candidate1 = regno;
2884 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2885 cfun->machine->frame.wb_candidate2 = regno;
2886 offset += UNITS_PER_WORD;
2887 }
2888
2889 HOST_WIDE_INT max_int_offset = offset;
2890 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2891 bool has_align_gap = offset != max_int_offset;
2892
2893 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2894 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2895 {
2896 /* If there is an alignment gap between integer and fp callee-saves,
2897 allocate the last fp register to it if possible. */
2898 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2899 {
2900 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2901 break;
2902 }
2903
2904 cfun->machine->frame.reg_offset[regno] = offset;
2905 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2906 cfun->machine->frame.wb_candidate1 = regno;
2907 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2908 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2909 cfun->machine->frame.wb_candidate2 = regno;
2910 offset += UNITS_PER_WORD;
2911 }
2912
2913 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2914
2915 cfun->machine->frame.saved_regs_size = offset;
2916
2917 HOST_WIDE_INT varargs_and_saved_regs_size
2918 = offset + cfun->machine->frame.saved_varargs_size;
2919
2920 cfun->machine->frame.hard_fp_offset
2921 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2922 STACK_BOUNDARY / BITS_PER_UNIT);
2923
2924 cfun->machine->frame.frame_size
2925 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2926 + crtl->outgoing_args_size,
2927 STACK_BOUNDARY / BITS_PER_UNIT);
2928
2929 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2930
2931 cfun->machine->frame.initial_adjust = 0;
2932 cfun->machine->frame.final_adjust = 0;
2933 cfun->machine->frame.callee_adjust = 0;
2934 cfun->machine->frame.callee_offset = 0;
2935
2936 HOST_WIDE_INT max_push_offset = 0;
2937 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2938 max_push_offset = 512;
2939 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2940 max_push_offset = 256;
2941
2942 if (cfun->machine->frame.frame_size < max_push_offset
2943 && crtl->outgoing_args_size == 0)
2944 {
2945 /* Simple, small frame with no outgoing arguments:
2946 stp reg1, reg2, [sp, -frame_size]!
2947 stp reg3, reg4, [sp, 16] */
2948 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2949 }
2950 else if ((crtl->outgoing_args_size
2951 + cfun->machine->frame.saved_regs_size < 512)
2952 && !(cfun->calls_alloca
2953 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2954 {
2955 /* Frame with small outgoing arguments:
2956 sub sp, sp, frame_size
2957 stp reg1, reg2, [sp, outgoing_args_size]
2958 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2959 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2960 cfun->machine->frame.callee_offset
2961 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2962 }
2963 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2964 {
2965 /* Frame with large outgoing arguments but a small local area:
2966 stp reg1, reg2, [sp, -hard_fp_offset]!
2967 stp reg3, reg4, [sp, 16]
2968 sub sp, sp, outgoing_args_size */
2969 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2970 cfun->machine->frame.final_adjust
2971 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2972 }
2973 else if (!frame_pointer_needed
2974 && varargs_and_saved_regs_size < max_push_offset)
2975 {
2976 /* Frame with large local area and outgoing arguments (this pushes the
2977 callee-saves first, followed by the locals and outgoing area):
2978 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2979 stp reg3, reg4, [sp, 16]
2980 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2981 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2982 cfun->machine->frame.final_adjust
2983 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2984 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2985 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2986 }
2987 else
2988 {
2989 /* Frame with large local area and outgoing arguments using frame pointer:
2990 sub sp, sp, hard_fp_offset
2991 stp x29, x30, [sp, 0]
2992 add x29, sp, 0
2993 stp reg3, reg4, [sp, 16]
2994 sub sp, sp, outgoing_args_size */
2995 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2996 cfun->machine->frame.final_adjust
2997 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2998 }
2999
3000 cfun->machine->frame.laid_out = true;
3001 }
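/* For example, a 48-byte frame with no outgoing arguments is typically
   handled by the first case above: callee_adjust is set to 48, so the
   whole frame is allocated by the write-back store
	stp	reg1, reg2, [sp, -48]!
   while initial_adjust and final_adjust stay 0.  */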
3002
3003 /* Return true if the register REGNO is saved on entry to
3004 the current function. */
3005
3006 static bool
3007 aarch64_register_saved_on_entry (int regno)
3008 {
3009 return cfun->machine->frame.reg_offset[regno] >= 0;
3010 }
3011
3012 /* Return the next register, from REGNO up to LIMIT, that the callee
3013    needs to save.  */
3014
3015 static unsigned
3016 aarch64_next_callee_save (unsigned regno, unsigned limit)
3017 {
3018 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3019 regno ++;
3020 return regno;
3021 }
3022
3023 /* Push register number REGNO of mode MODE to the stack, using a write-back
3024    store that decrements the stack pointer by ADJUSTMENT.  */
3025
3026 static void
3027 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3028 HOST_WIDE_INT adjustment)
3029 {
3030 rtx base_rtx = stack_pointer_rtx;
3031 rtx insn, reg, mem;
3032
3033 reg = gen_rtx_REG (mode, regno);
3034 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3035 plus_constant (Pmode, base_rtx, -adjustment));
3036 mem = gen_rtx_MEM (mode, mem);
3037
3038 insn = emit_move_insn (mem, reg);
3039 RTX_FRAME_RELATED_P (insn) = 1;
3040 }
3041
3042 /* Generate and return an instruction to store the pair of registers
3043 REG and REG2 of mode MODE to location BASE with write-back adjusting
3044 the stack location BASE by ADJUSTMENT. */
3045
3046 static rtx
3047 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3048 HOST_WIDE_INT adjustment)
3049 {
3050 switch (mode)
3051 {
3052 case DImode:
3053 return gen_storewb_pairdi_di (base, base, reg, reg2,
3054 GEN_INT (-adjustment),
3055 GEN_INT (UNITS_PER_WORD - adjustment));
3056 case DFmode:
3057 return gen_storewb_pairdf_di (base, base, reg, reg2,
3058 GEN_INT (-adjustment),
3059 GEN_INT (UNITS_PER_WORD - adjustment));
3060 default:
3061 gcc_unreachable ();
3062 }
3063 }
3064
3065 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3066 stack pointer by ADJUSTMENT. */
3067
3068 static void
3069 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3070 {
3071 rtx_insn *insn;
3072 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3073
3074 if (regno2 == INVALID_REGNUM)
3075 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3076
3077 rtx reg1 = gen_rtx_REG (mode, regno1);
3078 rtx reg2 = gen_rtx_REG (mode, regno2);
3079
3080 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3081 reg2, adjustment));
3082 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3083 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3084 RTX_FRAME_RELATED_P (insn) = 1;
3085 }
3086
3087 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
3088    adjusting BASE by ADJUSTMENT afterwards.  */
3089
3090 static rtx
3091 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3092 HOST_WIDE_INT adjustment)
3093 {
3094 switch (mode)
3095 {
3096 case DImode:
3097 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3098 GEN_INT (UNITS_PER_WORD));
3099 case DFmode:
3100 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3101 GEN_INT (UNITS_PER_WORD));
3102 default:
3103 gcc_unreachable ();
3104 }
3105 }
3106
3107 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3108 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3109 into CFI_OPS. */
3110
3111 static void
3112 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3113 rtx *cfi_ops)
3114 {
3115 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3116 rtx reg1 = gen_rtx_REG (mode, regno1);
3117
3118 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3119
3120 if (regno2 == INVALID_REGNUM)
3121 {
3122 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3123 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3124 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3125 }
3126 else
3127 {
3128 rtx reg2 = gen_rtx_REG (mode, regno2);
3129 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3130 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3131 reg2, adjustment));
3132 }
3133 }
3134
3135 /* Generate and return a store pair instruction of mode MODE to store
3136 register REG1 to MEM1 and register REG2 to MEM2. */
3137
3138 static rtx
3139 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3140 rtx reg2)
3141 {
3142 switch (mode)
3143 {
3144 case DImode:
3145 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3146
3147 case DFmode:
3148 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3149
3150 default:
3151 gcc_unreachable ();
3152 }
3153 }
3154
3155 /* Generate and return a load pair instruction of mode MODE to load register
3156    REG1 from MEM1 and register REG2 from MEM2.  */
3157
3158 static rtx
3159 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3160 rtx mem2)
3161 {
3162 switch (mode)
3163 {
3164 case DImode:
3165 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3166
3167 case DFmode:
3168 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3169
3170 default:
3171 gcc_unreachable ();
3172 }
3173 }
3174
3175 /* Return TRUE if return address signing should be enabled for the current
3176 function, otherwise return FALSE. */
3177
3178 bool
3179 aarch64_return_address_signing_enabled (void)
3180 {
3181   /* This function should only be called after the frame is laid out.  */
3182 gcc_assert (cfun->machine->frame.laid_out);
3183
3184   /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
3185      function if its LR is pushed onto the stack.  */
3186 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3187 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3188 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3189 }
3190
3191 /* Emit code to save the callee-saved registers from register number START
3192 to LIMIT to the stack at the location starting at offset START_OFFSET,
3193 skipping any write-back candidates if SKIP_WB is true. */
3194
3195 static void
3196 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3197 unsigned start, unsigned limit, bool skip_wb)
3198 {
3199 rtx_insn *insn;
3200 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3201 ? gen_frame_mem : gen_rtx_MEM);
3202 unsigned regno;
3203 unsigned regno2;
3204
3205 for (regno = aarch64_next_callee_save (start, limit);
3206 regno <= limit;
3207 regno = aarch64_next_callee_save (regno + 1, limit))
3208 {
3209 rtx reg, mem;
3210 HOST_WIDE_INT offset;
3211
3212 if (skip_wb
3213 && (regno == cfun->machine->frame.wb_candidate1
3214 || regno == cfun->machine->frame.wb_candidate2))
3215 continue;
3216
3217 if (cfun->machine->reg_is_wrapped_separately[regno])
3218 continue;
3219
3220 reg = gen_rtx_REG (mode, regno);
3221 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3222 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3223 offset));
3224
3225 regno2 = aarch64_next_callee_save (regno + 1, limit);
3226
3227 if (regno2 <= limit
3228 && !cfun->machine->reg_is_wrapped_separately[regno2]
3229 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3230 == cfun->machine->frame.reg_offset[regno2]))
3231
3232 {
3233 rtx reg2 = gen_rtx_REG (mode, regno2);
3234 rtx mem2;
3235
3236 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3237 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3238 offset));
3239 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3240 reg2));
3241
3242 /* The first part of a frame-related parallel insn is
3243 always assumed to be relevant to the frame
3244 	     calculations; subsequent parts are only
3245 frame-related if explicitly marked. */
3246 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3247 regno = regno2;
3248 }
3249 else
3250 insn = emit_move_insn (mem, reg);
3251
3252 RTX_FRAME_RELATED_P (insn) = 1;
3253 }
3254 }
3255
3256 /* Emit code to restore the callee registers of mode MODE from register
3257 number START up to and including LIMIT. Restore from the stack offset
3258 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3259 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3260
3261 static void
3262 aarch64_restore_callee_saves (machine_mode mode,
3263 HOST_WIDE_INT start_offset, unsigned start,
3264 unsigned limit, bool skip_wb, rtx *cfi_ops)
3265 {
3266 rtx base_rtx = stack_pointer_rtx;
3267 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3268 ? gen_frame_mem : gen_rtx_MEM);
3269 unsigned regno;
3270 unsigned regno2;
3271 HOST_WIDE_INT offset;
3272
3273 for (regno = aarch64_next_callee_save (start, limit);
3274 regno <= limit;
3275 regno = aarch64_next_callee_save (regno + 1, limit))
3276 {
3277 if (cfun->machine->reg_is_wrapped_separately[regno])
3278 continue;
3279
3280 rtx reg, mem;
3281
3282 if (skip_wb
3283 && (regno == cfun->machine->frame.wb_candidate1
3284 || regno == cfun->machine->frame.wb_candidate2))
3285 continue;
3286
3287 reg = gen_rtx_REG (mode, regno);
3288 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3289 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3290
3291 regno2 = aarch64_next_callee_save (regno + 1, limit);
3292
3293 if (regno2 <= limit
3294 && !cfun->machine->reg_is_wrapped_separately[regno2]
3295 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3296 == cfun->machine->frame.reg_offset[regno2]))
3297 {
3298 rtx reg2 = gen_rtx_REG (mode, regno2);
3299 rtx mem2;
3300
3301 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3302 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3303 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3304
3305 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3306 regno = regno2;
3307 }
3308 else
3309 emit_move_insn (reg, mem);
3310 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3311 }
3312 }
3313
3314 static inline bool
3315 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3316 HOST_WIDE_INT offset)
3317 {
3318 return offset >= -256 && offset < 256;
3319 }
3320
3321 static inline bool
3322 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3323 {
3324 return (offset >= 0
3325 && offset < 4096 * GET_MODE_SIZE (mode)
3326 && offset % GET_MODE_SIZE (mode) == 0);
3327 }
3328
3329 bool
3330 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3331 {
3332 return (offset >= -64 * GET_MODE_SIZE (mode)
3333 && offset < 64 * GET_MODE_SIZE (mode)
3334 && offset % GET_MODE_SIZE (mode) == 0);
3335 }
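/* Taking DImode (8-byte) accesses as an example, the predicates above
   accept:
     9-bit signed unscaled (LDUR/STUR):	-256 .. 255, any byte offset
     12-bit unsigned scaled (LDR/STR):	   0 .. 32760, multiples of 8
     7-bit signed scaled (LDP/STP):	-512 .. 504, multiples of 8  */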
3336
3337 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3338
3339 static sbitmap
3340 aarch64_get_separate_components (void)
3341 {
3342 aarch64_layout_frame ();
3343
3344 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3345 bitmap_clear (components);
3346
3347 /* The registers we need saved to the frame. */
3348 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3349 if (aarch64_register_saved_on_entry (regno))
3350 {
3351 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3352 if (!frame_pointer_needed)
3353 offset += cfun->machine->frame.frame_size
3354 - cfun->machine->frame.hard_fp_offset;
3355 /* Check that we can access the stack slot of the register with one
3356 direct load with no adjustments needed. */
3357 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3358 bitmap_set_bit (components, regno);
3359 }
3360
3361 /* Don't mess with the hard frame pointer. */
3362 if (frame_pointer_needed)
3363 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3364
3365 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3366 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3367 /* If aarch64_layout_frame has chosen registers to store/restore with
3368 writeback don't interfere with them to avoid having to output explicit
3369 stack adjustment instructions. */
3370 if (reg2 != INVALID_REGNUM)
3371 bitmap_clear_bit (components, reg2);
3372 if (reg1 != INVALID_REGNUM)
3373 bitmap_clear_bit (components, reg1);
3374
3375 bitmap_clear_bit (components, LR_REGNUM);
3376 bitmap_clear_bit (components, SP_REGNUM);
3377
3378 return components;
3379 }
3380
3381 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3382
3383 static sbitmap
3384 aarch64_components_for_bb (basic_block bb)
3385 {
3386 bitmap in = DF_LIVE_IN (bb);
3387 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3388 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3389
3390 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3391 bitmap_clear (components);
3392
3393 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3394 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3395 if ((!call_used_regs[regno])
3396 && (bitmap_bit_p (in, regno)
3397 || bitmap_bit_p (gen, regno)
3398 || bitmap_bit_p (kill, regno)))
3399 bitmap_set_bit (components, regno);
3400
3401 return components;
3402 }
3403
3404 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3405 Nothing to do for aarch64. */
3406
3407 static void
3408 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3409 {
3410 }
3411
3412 /* Return the next set bit in BMP from START onwards. Return the total number
3413 of bits in BMP if no set bit is found at or after START. */
3414
3415 static unsigned int
3416 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3417 {
3418 unsigned int nbits = SBITMAP_SIZE (bmp);
3419 if (start == nbits)
3420 return start;
3421
3422 gcc_assert (start < nbits);
3423 for (unsigned int i = start; i < nbits; i++)
3424 if (bitmap_bit_p (bmp, i))
3425 return i;
3426
3427 return nbits;
3428 }
3429
3430 /* Do the work for aarch64_emit_prologue_components and
3431 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3432 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3433 for these components or the epilogue sequence. That is, it determines
3434 whether we should emit stores or loads and what kind of CFA notes to attach
3435 to the insns. Otherwise the logic for the two sequences is very
3436 similar. */
3437
3438 static void
3439 aarch64_process_components (sbitmap components, bool prologue_p)
3440 {
3441 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3442 ? HARD_FRAME_POINTER_REGNUM
3443 : STACK_POINTER_REGNUM);
3444
3445 unsigned last_regno = SBITMAP_SIZE (components);
3446 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3447 rtx_insn *insn = NULL;
3448
3449 while (regno != last_regno)
3450 {
3451       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
3452 	 so DFmode for the vector registers is enough.  */
3453 machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode;
3454 rtx reg = gen_rtx_REG (mode, regno);
3455 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3456 if (!frame_pointer_needed)
3457 offset += cfun->machine->frame.frame_size
3458 - cfun->machine->frame.hard_fp_offset;
3459 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3460 rtx mem = gen_frame_mem (mode, addr);
3461
3462 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3463 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3464 /* No more registers to handle after REGNO.
3465 Emit a single save/restore and exit. */
3466 if (regno2 == last_regno)
3467 {
3468 insn = emit_insn (set);
3469 RTX_FRAME_RELATED_P (insn) = 1;
3470 if (prologue_p)
3471 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3472 else
3473 add_reg_note (insn, REG_CFA_RESTORE, reg);
3474 break;
3475 }
3476
3477 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3478 /* The next register is not of the same class or its offset is not
3479 mergeable with the current one into a pair. */
3480 if (!satisfies_constraint_Ump (mem)
3481 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3482 || (offset2 - cfun->machine->frame.reg_offset[regno])
3483 != GET_MODE_SIZE (mode))
3484 {
3485 insn = emit_insn (set);
3486 RTX_FRAME_RELATED_P (insn) = 1;
3487 if (prologue_p)
3488 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3489 else
3490 add_reg_note (insn, REG_CFA_RESTORE, reg);
3491
3492 regno = regno2;
3493 continue;
3494 }
3495
3496 /* REGNO2 can be saved/restored in a pair with REGNO. */
3497 rtx reg2 = gen_rtx_REG (mode, regno2);
3498 if (!frame_pointer_needed)
3499 offset2 += cfun->machine->frame.frame_size
3500 - cfun->machine->frame.hard_fp_offset;
3501 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3502 rtx mem2 = gen_frame_mem (mode, addr2);
3503 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3504 : gen_rtx_SET (reg2, mem2);
3505
3506 if (prologue_p)
3507 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3508 else
3509 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3510
3511 RTX_FRAME_RELATED_P (insn) = 1;
3512 if (prologue_p)
3513 {
3514 add_reg_note (insn, REG_CFA_OFFSET, set);
3515 add_reg_note (insn, REG_CFA_OFFSET, set2);
3516 }
3517 else
3518 {
3519 add_reg_note (insn, REG_CFA_RESTORE, reg);
3520 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3521 }
3522
3523 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3524 }
3525 }
3526
3527 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3528
3529 static void
3530 aarch64_emit_prologue_components (sbitmap components)
3531 {
3532 aarch64_process_components (components, true);
3533 }
3534
3535 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3536
3537 static void
3538 aarch64_emit_epilogue_components (sbitmap components)
3539 {
3540 aarch64_process_components (components, false);
3541 }
3542
3543 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3544
3545 static void
3546 aarch64_set_handled_components (sbitmap components)
3547 {
3548 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3549 if (bitmap_bit_p (components, regno))
3550 cfun->machine->reg_is_wrapped_separately[regno] = true;
3551 }
3552
3553 /* AArch64 stack frames generated by this compiler look like:
3554
3555 +-------------------------------+
3556 | |
3557 | incoming stack arguments |
3558 | |
3559 +-------------------------------+
3560 | | <-- incoming stack pointer (aligned)
3561 | callee-allocated save area |
3562 | for register varargs |
3563 | |
3564 +-------------------------------+
3565 | local variables | <-- frame_pointer_rtx
3566 | |
3567 +-------------------------------+
3568 | padding0 | \
3569 +-------------------------------+ |
3570 | callee-saved registers | | frame.saved_regs_size
3571 +-------------------------------+ |
3572 | LR' | |
3573 +-------------------------------+ |
3574 | FP' | / <- hard_frame_pointer_rtx (aligned)
3575 +-------------------------------+
3576 | dynamic allocation |
3577 +-------------------------------+
3578 | padding |
3579 +-------------------------------+
3580 | outgoing stack arguments | <-- arg_pointer
3581 | |
3582 +-------------------------------+
3583 | | <-- stack_pointer_rtx (aligned)
3584
3585 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3586 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3587 unchanged. */
3588
3589 /* Generate the prologue instructions for entry into a function.
3590    Establish the stack frame by decreasing the stack pointer by a
3591    properly calculated size and, if necessary, create a frame record
3592    filled with the values of LR and the previous frame pointer.  The
3593 current FP is also set up if it is in use. */
3594
3595 void
3596 aarch64_expand_prologue (void)
3597 {
3598 aarch64_layout_frame ();
3599
3600 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3601 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3602 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3603 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3604 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3605 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3606 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3607 rtx_insn *insn;
3608
3609 /* Sign return address for functions. */
3610 if (aarch64_return_address_signing_enabled ())
3611 {
3612 insn = emit_insn (gen_pacisp ());
3613 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3614 RTX_FRAME_RELATED_P (insn) = 1;
3615 }
3616
3617 if (flag_stack_usage_info)
3618 current_function_static_stack_size = frame_size;
3619
3620 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3621 {
3622 if (crtl->is_leaf && !cfun->calls_alloca)
3623 {
3624 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3625 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3626 frame_size - STACK_CHECK_PROTECT);
3627 }
3628 else if (frame_size > 0)
3629 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3630 }
3631
3632 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3633
3634 if (callee_adjust != 0)
3635 aarch64_push_regs (reg1, reg2, callee_adjust);
3636
3637 if (frame_pointer_needed)
3638 {
3639 if (callee_adjust == 0)
3640 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3641 R30_REGNUM, false);
3642 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3643 stack_pointer_rtx,
3644 GEN_INT (callee_offset)));
3645 RTX_FRAME_RELATED_P (insn) = 1;
3646 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3647 }
3648
3649 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3650 callee_adjust != 0 || frame_pointer_needed);
3651 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3652 callee_adjust != 0 || frame_pointer_needed);
3653 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3654 }
3655
3656 /* Return TRUE if we can use a simple_return insn.
3657
3658    This function checks whether the callee-saved stack is empty, which
3659    means no restore actions are needed.  The pro_and_epilogue pass uses
3660    this to check whether the shrink-wrapping optimization is feasible.  */
3661
3662 bool
3663 aarch64_use_return_insn_p (void)
3664 {
3665 if (!reload_completed)
3666 return false;
3667
3668 if (crtl->profile)
3669 return false;
3670
3671 aarch64_layout_frame ();
3672
3673 return cfun->machine->frame.frame_size == 0;
3674 }
3675
3676 /* Generate the epilogue instructions for returning from a function.
3677 This is almost exactly the reverse of the prologue sequence, except
3678 that we need to insert barriers to avoid scheduling loads that read
3679 from a deallocated stack, and we optimize the unwind records by
3680 emitting them all together if possible. */
3681 void
3682 aarch64_expand_epilogue (bool for_sibcall)
3683 {
3684 aarch64_layout_frame ();
3685
3686 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3687 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3688 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3689 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3690 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3691 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3692 rtx cfi_ops = NULL;
3693 rtx_insn *insn;
3694
3695 /* We need a memory barrier to prevent reads from the deallocated stack. */
3696 bool need_barrier_p = (get_frame_size ()
3697 + cfun->machine->frame.saved_varargs_size) != 0;
3698
3699 /* Emit a barrier to prevent loads from a deallocated stack. */
3700 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3701 || crtl->calls_eh_return)
3702 {
3703 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3704 need_barrier_p = false;
3705 }
3706
3707 /* Restore the stack pointer from the frame pointer if it may not
3708 be the same as the stack pointer. */
3709 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3710 {
3711 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3712 hard_frame_pointer_rtx,
3713 GEN_INT (-callee_offset)));
3714 /* If writeback is used when restoring callee-saves, the CFA
3715 is restored on the instruction doing the writeback. */
3716 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3717 }
3718 else
3719 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3720
3721 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3722 callee_adjust != 0, &cfi_ops);
3723 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3724 callee_adjust != 0, &cfi_ops);
3725
3726 if (need_barrier_p)
3727 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3728
3729 if (callee_adjust != 0)
3730 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3731
3732 if (callee_adjust != 0 || initial_adjust > 65536)
3733 {
3734 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3735 insn = get_last_insn ();
3736 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3737 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3738 RTX_FRAME_RELATED_P (insn) = 1;
3739 cfi_ops = NULL;
3740 }
3741
3742 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3743
3744 if (cfi_ops)
3745 {
3746 /* Emit delayed restores and reset the CFA to be SP. */
3747 insn = get_last_insn ();
3748 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3749 REG_NOTES (insn) = cfi_ops;
3750 RTX_FRAME_RELATED_P (insn) = 1;
3751 }
3752
3753 /* We prefer to emit the combined return/authenticate instruction RETAA,
3754 however there are three cases in which we must instead emit an explicit
3755 authentication instruction.
3756
3757 1) Sibcalls don't return in a normal way, so if we're about to call one
3758 we must authenticate.
3759
3760 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3761 generating code for !TARGET_ARMV8_3 we can't use it and must
3762 explicitly authenticate.
3763
3764 3) On an eh_return path we make extra stack adjustments to update the
3765 canonical frame address to be the exception handler's CFA. We want
3766 to authenticate using the CFA of the function which calls eh_return.
3767 */
3768 if (aarch64_return_address_signing_enabled ()
3769 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3770 {
3771 insn = emit_insn (gen_autisp ());
3772 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3773 RTX_FRAME_RELATED_P (insn) = 1;
3774 }
3775
3776 /* Stack adjustment for exception handler. */
3777 if (crtl->calls_eh_return)
3778 {
3779 /* We need to unwind the stack by the offset computed by
3780 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3781 to be SP; letting the CFA move during this adjustment
3782 is just as correct as retaining the CFA from the body
3783 of the function. Therefore, do nothing special. */
3784 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3785 }
3786
3787 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3788 if (!for_sibcall)
3789 emit_jump_insn (ret_rtx);
3790 }
3791
3792 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3793 normally or return to a previous frame after unwinding.
3794
3795 An EH return uses a single shared return sequence. The epilogue is
3796 exactly like a normal epilogue except that it has an extra input
3797 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3798 that must be applied after the frame has been destroyed. An extra label
3799 is inserted before the epilogue which initializes this register to zero,
3800 and this is the entry point for a normal return.
3801
3802 An actual EH return updates the return address, initializes the stack
3803 adjustment and jumps directly into the epilogue (bypassing the zeroing
3804 of the adjustment). Since the return address is typically saved on the
3805 stack when a function makes a call, the saved LR must be updated outside
3806 the epilogue.
3807
3808 This poses problems as the store is generated well before the epilogue,
3809 so the offset of LR is not known yet. Also optimizations will remove the
3810 store as it appears dead, even after the epilogue is generated (as the
3811 base or offset for loading LR is different in many cases).
3812
3813 To avoid these problems this implementation forces the frame pointer
3814 in eh_return functions so that the location of LR is fixed and known early.
3815 It also marks the store volatile, so no optimization is permitted to
3816 remove the store. */
3817 rtx
3818 aarch64_eh_return_handler_rtx (void)
3819 {
3820 rtx tmp = gen_frame_mem (Pmode,
3821 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3822
3823 /* Mark the store volatile, so no optimization is permitted to remove it. */
3824 MEM_VOLATILE_P (tmp) = true;
3825 return tmp;
3826 }
3827
3828 /* Output code to add DELTA to the first argument, and then jump
3829 to FUNCTION. Used for C++ multiple inheritance. */
3830 static void
3831 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3832 HOST_WIDE_INT delta,
3833 HOST_WIDE_INT vcall_offset,
3834 tree function)
3835 {
3836 /* The this pointer is always in x0. Note that this differs from
3837 Arm where the this pointer may be bumped to r1 if r0 is required
3838 to return a pointer to an aggregate. On AArch64 a result value
3839 pointer will be in x8. */
3840 int this_regno = R0_REGNUM;
3841 rtx this_rtx, temp0, temp1, addr, funexp;
3842 rtx_insn *insn;
3843
3844 reload_completed = 1;
3845 emit_note (NOTE_INSN_PROLOGUE_END);
3846
3847 if (vcall_offset == 0)
3848 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3849 else
3850 {
3851 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3852
3853 this_rtx = gen_rtx_REG (Pmode, this_regno);
3854 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3855 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3856
3857 addr = this_rtx;
3858 if (delta != 0)
3859 {
3860 if (delta >= -256 && delta < 256)
3861 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3862 plus_constant (Pmode, this_rtx, delta));
3863 else
3864 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3865 }
3866
3867 if (Pmode == ptr_mode)
3868 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3869 else
3870 aarch64_emit_move (temp0,
3871 gen_rtx_ZERO_EXTEND (Pmode,
3872 gen_rtx_MEM (ptr_mode, addr)));
3873
3874 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3875 addr = plus_constant (Pmode, temp0, vcall_offset);
3876 else
3877 {
3878 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3879 Pmode);
3880 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3881 }
3882
3883 if (Pmode == ptr_mode)
3884 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3885 else
3886 aarch64_emit_move (temp1,
3887 gen_rtx_SIGN_EXTEND (Pmode,
3888 gen_rtx_MEM (ptr_mode, addr)));
3889
3890 emit_insn (gen_add2_insn (this_rtx, temp1));
3891 }
3892
3893 /* Generate a tail call to the target function. */
3894 if (!TREE_USED (function))
3895 {
3896 assemble_external (function);
3897 TREE_USED (function) = 1;
3898 }
3899 funexp = XEXP (DECL_RTL (function), 0);
3900 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3901 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3902 SIBLING_CALL_P (insn) = 1;
3903
3904 insn = get_insns ();
3905 shorten_branches (insn);
3906 final_start_function (insn, file, 1);
3907 final (insn, file, 1);
3908 final_end_function ();
3909
3910 /* Stop pretending to be a post-reload pass. */
3911 reload_completed = 0;
3912 }
3913
3914 static bool
3915 aarch64_tls_referenced_p (rtx x)
3916 {
3917 if (!TARGET_HAVE_TLS)
3918 return false;
3919 subrtx_iterator::array_type array;
3920 FOR_EACH_SUBRTX (iter, array, x, ALL)
3921 {
3922 const_rtx x = *iter;
3923 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3924 return true;
3925 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3926 TLS offsets, not real symbol references. */
3927 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3928 iter.skip_subrtxes ();
3929 }
3930 return false;
3931 }
3932
3933
3934 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3935 a left shift of 0 or 12 bits. */
3936 bool
3937 aarch64_uimm12_shift (HOST_WIDE_INT val)
3938 {
3939 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3940 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3941 );
3942 }
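
/* A minimal standalone sketch of the same check, kept out of the build;
   the names here are illustrative and not part of the compiler.  */
#if 0
#include <stdint.h>
#include <stdbool.h>

static bool
uimm12_shift_example (uint64_t val)
{
  /* A 12-bit immediate, optionally shifted left by 12, as accepted by
     ADD/SUB (immediate).  */
  return (val & 0xfffull) == val || (val & (0xfffull << 12)) == val;
}

/* uimm12_shift_example (0xabc)    -> true   (12 bits, shift 0)
   uimm12_shift_example (0xabc000) -> true   (12 bits, shift 12)
   uimm12_shift_example (0xabc00)  -> false  (would need a shift of 8)  */
#endif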
3943
3944
3945 /* Return true if val is an immediate that can be loaded into a
3946 register by a MOVZ instruction. */
3947 static bool
3948 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3949 {
3950 if (GET_MODE_SIZE (mode) > 4)
3951 {
3952 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3953 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3954 return 1;
3955 }
3956 else
3957 {
3958 /* Ignore sign extension. */
3959 val &= (HOST_WIDE_INT) 0xffffffff;
3960 }
3961 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3962 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3963 }
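
/* For illustration, a standalone sketch (not built) of the property the
   64-bit path above relies on: a value is MOVZ-loadable when all of its
   set bits fall inside one aligned 16-bit field.  */
#if 0
#include <stdint.h>
#include <stdbool.h>

static bool
movz_imm_example (uint64_t val)
{
  for (int shift = 0; shift < 64; shift += 16)
    if ((val & (0xffffull << shift)) == val)
      return true;
  return false;
}
#endif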
3964
3965 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3966
3967 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3968 {
3969 0x0000000100000001ull,
3970 0x0001000100010001ull,
3971 0x0101010101010101ull,
3972 0x1111111111111111ull,
3973 0x5555555555555555ull,
3974 };
3975
3976
3977 /* Return true if val is a valid bitmask immediate. */
3978
3979 bool
3980 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3981 {
3982 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3983 int bits;
3984
3985 /* Check for a single sequence of one bits and return quickly if so.
3986 The special cases of all ones and all zeroes return false. */
3987 val = (unsigned HOST_WIDE_INT) val_in;
3988 tmp = val + (val & -val);
3989
3990 if (tmp == (tmp & -tmp))
3991 return (val + 1) > 1;
3992
3993 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3994 if (mode == SImode)
3995 val = (val << 32) | (val & 0xffffffff);
3996
3997 /* Invert if the immediate doesn't start with a zero bit - this means we
3998 only need to search for sequences of one bits. */
3999 if (val & 1)
4000 val = ~val;
4001
4002 /* Find the first set bit and set tmp to val with the first sequence of one
4003 bits removed. Return success if there is a single sequence of ones. */
4004 first_one = val & -val;
4005 tmp = val & (val + first_one);
4006
4007 if (tmp == 0)
4008 return true;
4009
4010 /* Find the next set bit and compute the difference in bit position. */
4011 next_one = tmp & -tmp;
4012 bits = clz_hwi (first_one) - clz_hwi (next_one);
4013 mask = val ^ tmp;
4014
4015 /* Check the bit position difference is a power of 2, and that the first
4016 sequence of one bits fits within 'bits' bits. */
4017 if ((mask >> bits) != 0 || bits != (bits & -bits))
4018 return false;
4019
4020 /* Check the sequence of one bits is repeated 64/bits times. */
4021 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
4022 }
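
/* For reference, a brute-force sketch (not built, independent of the code
   above) of the property being tested: a bitmask immediate is an element
   of 2, 4, 8, 16, 32 or 64 bits containing a single rotated run of ones,
   replicated across the 64-bit value; all-zeros and all-ones are excluded.  */
#if 0
#include <stdint.h>
#include <stdbool.h>

static bool
bitmask_imm_bruteforce_example (uint64_t val)
{
  if (val == 0 || ~val == 0)
    return false;

  for (unsigned size = 2; size <= 64; size *= 2)
    {
      uint64_t emask = size == 64 ? ~0ull : (1ull << size) - 1;
      uint64_t elt = val & emask;

      /* The element must replicate across the whole register.  */
      uint64_t rep = 0;
      for (unsigned i = 0; i < 64; i += size)
        rep |= elt << i;
      if (rep != val)
        continue;

      /* Some rotation of the element must be a contiguous run of ones
         starting at bit 0, i.e. of the form 2^n - 1.  */
      for (unsigned rot = 0; rot < size; rot++)
        {
          uint64_t r = rot ? ((elt >> rot) | (elt << (size - rot))) & emask
                           : elt;
          if (((r + 1) & r) == 0)
            return true;
        }
    }
  return false;
}
#endif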
4023
4024 /* Create a mask of ones covering the range from the lowest to the highest
4025 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
4026
4027 unsigned HOST_WIDE_INT
4028 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4029 {
4030 int lowest_bit_set = ctz_hwi (val_in);
4031 int highest_bit_set = floor_log2 (val_in);
4032 gcc_assert (val_in != 0);
4033
4034 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4035 (HOST_WIDE_INT_1U << lowest_bit_set));
4036 }
4037
4038 /* Create a constant where all bits outside the range from the lowest set
4039 bit to the highest set bit of VAL_IN are set to 1. */
4040
4041 unsigned HOST_WIDE_INT
4042 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4043 {
4044 return val_in | ~aarch64_and_split_imm1 (val_in);
4045 }
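
/* A small worked example (not built; the constants are chosen here purely
   for illustration): an AND with a constant that is not itself a valid
   bitmask immediate can sometimes be performed as two ANDs, using the two
   masks produced above.  */
#if 0
#include <assert.h>
#include <stdint.h>

static void
and_split_example (void)
{
  uint64_t c    = 0x00ff00ff00000000ull; /* not a single bitmask immediate */
  uint64_t imm1 = 0x00ffffff00000000ull; /* ones from lowest to highest set bit */
  uint64_t imm2 = 0xffff00ffffffffffull; /* c | ~imm1 */

  assert ((c | ~imm1) == imm2);
  assert ((imm1 & imm2) == c); /* so (x & imm1) & imm2 == x & c */
}
#endif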
4046
4047 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4048
4049 bool
4050 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4051 {
4052 if (aarch64_bitmask_imm (val_in, mode))
4053 return false;
4054
4055 if (aarch64_move_imm (val_in, mode))
4056 return false;
4057
4058 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4059
4060 return aarch64_bitmask_imm (imm2, mode);
4061 }
4062
4063 /* Return true if val is an immediate that can be loaded into a
4064 register in a single instruction. */
4065 bool
4066 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4067 {
4068 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4069 return 1;
4070 return aarch64_bitmask_imm (val, mode);
4071 }
4072
4073 static bool
4074 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4075 {
4076 rtx base, offset;
4077
4078 if (GET_CODE (x) == HIGH)
4079 return true;
4080
4081 split_const (x, &base, &offset);
4082 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4083 {
4084 if (aarch64_classify_symbol (base, offset)
4085 != SYMBOL_FORCE_TO_MEM)
4086 return true;
4087 else
4088 /* Avoid generating a 64-bit relocation in ILP32; leave it
4089 to aarch64_expand_mov_immediate to handle properly. */
4090 return mode != ptr_mode;
4091 }
4092
4093 return aarch64_tls_referenced_p (x);
4094 }
4095
4096 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4097 The expansion for a table switch is quite expensive due to the number
4098 of instructions, the table lookup and the hard-to-predict indirect jump.
4099 When optimizing for speed with -O3 enabled, use the per-core tuning if
4100 set, otherwise use tables for > 16 cases as a tradeoff between size and
4101 performance. When optimizing for size, use the default setting. */
4102
4103 static unsigned int
4104 aarch64_case_values_threshold (void)
4105 {
4106 /* Use the specified limit for the number of cases before using jump
4107 tables at higher optimization levels. */
4108 if (optimize > 2
4109 && selected_cpu->tune->max_case_values != 0)
4110 return selected_cpu->tune->max_case_values;
4111 else
4112 return optimize_size ? default_case_values_threshold () : 17;
4113 }
4114
4115 /* Return true if register REGNO is a valid index register.
4116 STRICT_P is true if REG_OK_STRICT is in effect. */
4117
4118 bool
4119 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4120 {
4121 if (!HARD_REGISTER_NUM_P (regno))
4122 {
4123 if (!strict_p)
4124 return true;
4125
4126 if (!reg_renumber)
4127 return false;
4128
4129 regno = reg_renumber[regno];
4130 }
4131 return GP_REGNUM_P (regno);
4132 }
4133
4134 /* Return true if register REGNO is a valid base register for mode MODE.
4135 STRICT_P is true if REG_OK_STRICT is in effect. */
4136
4137 bool
4138 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4139 {
4140 if (!HARD_REGISTER_NUM_P (regno))
4141 {
4142 if (!strict_p)
4143 return true;
4144
4145 if (!reg_renumber)
4146 return false;
4147
4148 regno = reg_renumber[regno];
4149 }
4150
4151 /* The fake registers will be eliminated to either the stack or
4152 hard frame pointer, both of which are usually valid base registers.
4153 Reload deals with the cases where the eliminated form isn't valid. */
4154 return (GP_REGNUM_P (regno)
4155 || regno == SP_REGNUM
4156 || regno == FRAME_POINTER_REGNUM
4157 || regno == ARG_POINTER_REGNUM);
4158 }
4159
4160 /* Return true if X is a valid base register for mode MODE.
4161 STRICT_P is true if REG_OK_STRICT is in effect. */
4162
4163 static bool
4164 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4165 {
4166 if (!strict_p && GET_CODE (x) == SUBREG)
4167 x = SUBREG_REG (x);
4168
4169 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4170 }
4171
4172 /* Return true if the address offset X is a valid index. If it is, fill in INFO
4173 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4174
4175 static bool
4176 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4177 machine_mode mode, bool strict_p)
4178 {
4179 enum aarch64_address_type type;
4180 rtx index;
4181 int shift;
4182
4183 /* (reg:P) */
4184 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4185 && GET_MODE (x) == Pmode)
4186 {
4187 type = ADDRESS_REG_REG;
4188 index = x;
4189 shift = 0;
4190 }
4191 /* (sign_extend:DI (reg:SI)) */
4192 else if ((GET_CODE (x) == SIGN_EXTEND
4193 || GET_CODE (x) == ZERO_EXTEND)
4194 && GET_MODE (x) == DImode
4195 && GET_MODE (XEXP (x, 0)) == SImode)
4196 {
4197 type = (GET_CODE (x) == SIGN_EXTEND)
4198 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4199 index = XEXP (x, 0);
4200 shift = 0;
4201 }
4202 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4203 else if (GET_CODE (x) == MULT
4204 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4205 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4206 && GET_MODE (XEXP (x, 0)) == DImode
4207 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4208 && CONST_INT_P (XEXP (x, 1)))
4209 {
4210 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4211 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4212 index = XEXP (XEXP (x, 0), 0);
4213 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4214 }
4215 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4216 else if (GET_CODE (x) == ASHIFT
4217 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4218 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4219 && GET_MODE (XEXP (x, 0)) == DImode
4220 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4221 && CONST_INT_P (XEXP (x, 1)))
4222 {
4223 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4224 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4225 index = XEXP (XEXP (x, 0), 0);
4226 shift = INTVAL (XEXP (x, 1));
4227 }
4228 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4229 else if ((GET_CODE (x) == SIGN_EXTRACT
4230 || GET_CODE (x) == ZERO_EXTRACT)
4231 && GET_MODE (x) == DImode
4232 && GET_CODE (XEXP (x, 0)) == MULT
4233 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4234 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4235 {
4236 type = (GET_CODE (x) == SIGN_EXTRACT)
4237 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4238 index = XEXP (XEXP (x, 0), 0);
4239 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4240 if (INTVAL (XEXP (x, 1)) != 32 + shift
4241 || INTVAL (XEXP (x, 2)) != 0)
4242 shift = -1;
4243 }
4244 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4245 (const_int 0xffffffff<<shift)) */
4246 else if (GET_CODE (x) == AND
4247 && GET_MODE (x) == DImode
4248 && GET_CODE (XEXP (x, 0)) == MULT
4249 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4250 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4251 && CONST_INT_P (XEXP (x, 1)))
4252 {
4253 type = ADDRESS_REG_UXTW;
4254 index = XEXP (XEXP (x, 0), 0);
4255 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4256 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4257 shift = -1;
4258 }
4259 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4260 else if ((GET_CODE (x) == SIGN_EXTRACT
4261 || GET_CODE (x) == ZERO_EXTRACT)
4262 && GET_MODE (x) == DImode
4263 && GET_CODE (XEXP (x, 0)) == ASHIFT
4264 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4265 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4266 {
4267 type = (GET_CODE (x) == SIGN_EXTRACT)
4268 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4269 index = XEXP (XEXP (x, 0), 0);
4270 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4271 if (INTVAL (XEXP (x, 1)) != 32 + shift
4272 || INTVAL (XEXP (x, 2)) != 0)
4273 shift = -1;
4274 }
4275 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4276 (const_int 0xffffffff<<shift)) */
4277 else if (GET_CODE (x) == AND
4278 && GET_MODE (x) == DImode
4279 && GET_CODE (XEXP (x, 0)) == ASHIFT
4280 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4281 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4282 && CONST_INT_P (XEXP (x, 1)))
4283 {
4284 type = ADDRESS_REG_UXTW;
4285 index = XEXP (XEXP (x, 0), 0);
4286 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4287 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4288 shift = -1;
4289 }
4290 /* (mult:P (reg:P) (const_int scale)) */
4291 else if (GET_CODE (x) == MULT
4292 && GET_MODE (x) == Pmode
4293 && GET_MODE (XEXP (x, 0)) == Pmode
4294 && CONST_INT_P (XEXP (x, 1)))
4295 {
4296 type = ADDRESS_REG_REG;
4297 index = XEXP (x, 0);
4298 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4299 }
4300 /* (ashift:P (reg:P) (const_int shift)) */
4301 else if (GET_CODE (x) == ASHIFT
4302 && GET_MODE (x) == Pmode
4303 && GET_MODE (XEXP (x, 0)) == Pmode
4304 && CONST_INT_P (XEXP (x, 1)))
4305 {
4306 type = ADDRESS_REG_REG;
4307 index = XEXP (x, 0);
4308 shift = INTVAL (XEXP (x, 1));
4309 }
4310 else
4311 return false;
4312
4313 if (GET_CODE (index) == SUBREG)
4314 index = SUBREG_REG (index);
4315
4316 if ((shift == 0
4317 || (shift > 0 && shift <= 3
4318 && (1 << shift) == GET_MODE_SIZE (mode)))
4319 && REG_P (index)
4320 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4321 {
4322 info->type = type;
4323 info->offset = index;
4324 info->shift = shift;
4325 return true;
4326 }
4327
4328 return false;
4329 }
4330
4331 /* Return true if MODE is one of the modes for which we
4332 support LDP/STP operations. */
4333
4334 static bool
4335 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4336 {
4337 return mode == SImode || mode == DImode
4338 || mode == SFmode || mode == DFmode
4339 || (aarch64_vector_mode_supported_p (mode)
4340 && GET_MODE_SIZE (mode) == 8);
4341 }
4342
4343 /* Return true if REGNO is a virtual pointer register, or an eliminable
4344 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4345 include stack_pointer or hard_frame_pointer. */
4346 static bool
4347 virt_or_elim_regno_p (unsigned regno)
4348 {
4349 return ((regno >= FIRST_VIRTUAL_REGISTER
4350 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4351 || regno == FRAME_POINTER_REGNUM
4352 || regno == ARG_POINTER_REGNUM);
4353 }
4354
4355 /* Return true if X is a valid address for machine mode MODE. If it is,
4356 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4357 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4358
4359 static bool
4360 aarch64_classify_address (struct aarch64_address_info *info,
4361 rtx x, machine_mode mode,
4362 RTX_CODE outer_code, bool strict_p)
4363 {
4364 enum rtx_code code = GET_CODE (x);
4365 rtx op0, op1;
4366
4367 /* On BE, we use load/store pair for all large int mode load/stores.
4368 TI/TFmode may also use a load/store pair. */
4369 bool load_store_pair_p = (outer_code == PARALLEL
4370 || mode == TImode
4371 || mode == TFmode
4372 || (BYTES_BIG_ENDIAN
4373 && aarch64_vect_struct_mode_p (mode)));
4374
4375 bool allow_reg_index_p =
4376 !load_store_pair_p
4377 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4378 && !aarch64_vect_struct_mode_p (mode);
4379
4380 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4381 REG addressing. */
4382 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4383 && (code != POST_INC && code != REG))
4384 return false;
4385
4386 switch (code)
4387 {
4388 case REG:
4389 case SUBREG:
4390 info->type = ADDRESS_REG_IMM;
4391 info->base = x;
4392 info->offset = const0_rtx;
4393 return aarch64_base_register_rtx_p (x, strict_p);
4394
4395 case PLUS:
4396 op0 = XEXP (x, 0);
4397 op1 = XEXP (x, 1);
4398
4399 if (! strict_p
4400 && REG_P (op0)
4401 && virt_or_elim_regno_p (REGNO (op0))
4402 && CONST_INT_P (op1))
4403 {
4404 info->type = ADDRESS_REG_IMM;
4405 info->base = op0;
4406 info->offset = op1;
4407
4408 return true;
4409 }
4410
4411 if (GET_MODE_SIZE (mode) != 0
4412 && CONST_INT_P (op1)
4413 && aarch64_base_register_rtx_p (op0, strict_p))
4414 {
4415 HOST_WIDE_INT offset = INTVAL (op1);
4416
4417 info->type = ADDRESS_REG_IMM;
4418 info->base = op0;
4419 info->offset = op1;
4420
4421 /* TImode and TFmode values are allowed in both pairs of X
4422 registers and individual Q registers. The available
4423 address modes are:
4424 X,X: 7-bit signed scaled offset
4425 Q: 9-bit signed offset
4426 We conservatively require an offset representable in either mode.
4427 When performing the check for pairs of X registers i.e. LDP/STP
4428 pass down DImode since that is the natural size of the LDP/STP
4429 instruction memory accesses. */
4430 if (mode == TImode || mode == TFmode)
4431 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4432 && (offset_9bit_signed_unscaled_p (mode, offset)
4433 || offset_12bit_unsigned_scaled_p (mode, offset)));
4434
4435 /* A 7-bit offset check because OImode will emit an ldp/stp
4436 instruction (only big endian will get here).
4437 For ldp/stp instructions, the offset is scaled for the size of a
4438 single element of the pair. */
4439 if (mode == OImode)
4440 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4441
4442 /* Three 9/12-bit offset checks because CImode will emit three
4443 ldr/str instructions (only big endian will get here). */
4444 if (mode == CImode)
4445 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4446 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4447 || offset_12bit_unsigned_scaled_p (V16QImode,
4448 offset + 32)));
4449
4450 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4451 instructions (only big endian will get here). */
4452 if (mode == XImode)
4453 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4454 && aarch64_offset_7bit_signed_scaled_p (TImode,
4455 offset + 32));
4456
4457 if (load_store_pair_p)
4458 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4459 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4460 else
4461 return (offset_9bit_signed_unscaled_p (mode, offset)
4462 || offset_12bit_unsigned_scaled_p (mode, offset));
4463 }
4464
4465 if (allow_reg_index_p)
4466 {
4467 /* Look for base + (scaled/extended) index register. */
4468 if (aarch64_base_register_rtx_p (op0, strict_p)
4469 && aarch64_classify_index (info, op1, mode, strict_p))
4470 {
4471 info->base = op0;
4472 return true;
4473 }
4474 if (aarch64_base_register_rtx_p (op1, strict_p)
4475 && aarch64_classify_index (info, op0, mode, strict_p))
4476 {
4477 info->base = op1;
4478 return true;
4479 }
4480 }
4481
4482 return false;
4483
4484 case POST_INC:
4485 case POST_DEC:
4486 case PRE_INC:
4487 case PRE_DEC:
4488 info->type = ADDRESS_REG_WB;
4489 info->base = XEXP (x, 0);
4490 info->offset = NULL_RTX;
4491 return aarch64_base_register_rtx_p (info->base, strict_p);
4492
4493 case POST_MODIFY:
4494 case PRE_MODIFY:
4495 info->type = ADDRESS_REG_WB;
4496 info->base = XEXP (x, 0);
4497 if (GET_CODE (XEXP (x, 1)) == PLUS
4498 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4499 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4500 && aarch64_base_register_rtx_p (info->base, strict_p))
4501 {
4502 HOST_WIDE_INT offset;
4503 info->offset = XEXP (XEXP (x, 1), 1);
4504 offset = INTVAL (info->offset);
4505
4506 /* TImode and TFmode values are allowed in both pairs of X
4507 registers and individual Q registers. The available
4508 address modes are:
4509 X,X: 7-bit signed scaled offset
4510 Q: 9-bit signed offset
4511 We conservatively require an offset representable in either mode.
4512 */
4513 if (mode == TImode || mode == TFmode)
4514 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4515 && offset_9bit_signed_unscaled_p (mode, offset));
4516
4517 if (load_store_pair_p)
4518 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4519 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4520 else
4521 return offset_9bit_signed_unscaled_p (mode, offset);
4522 }
4523 return false;
4524
4525 case CONST:
4526 case SYMBOL_REF:
4527 case LABEL_REF:
4528 /* load literal: pc-relative constant pool entry. Only supported
4529 for SI mode or larger. */
4530 info->type = ADDRESS_SYMBOLIC;
4531
4532 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4533 {
4534 rtx sym, addend;
4535
4536 split_const (x, &sym, &addend);
4537 return ((GET_CODE (sym) == LABEL_REF
4538 || (GET_CODE (sym) == SYMBOL_REF
4539 && CONSTANT_POOL_ADDRESS_P (sym)
4540 && aarch64_pcrelative_literal_loads)));
4541 }
4542 return false;
4543
4544 case LO_SUM:
4545 info->type = ADDRESS_LO_SUM;
4546 info->base = XEXP (x, 0);
4547 info->offset = XEXP (x, 1);
4548 if (allow_reg_index_p
4549 && aarch64_base_register_rtx_p (info->base, strict_p))
4550 {
4551 rtx sym, offs;
4552 split_const (info->offset, &sym, &offs);
4553 if (GET_CODE (sym) == SYMBOL_REF
4554 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4555 {
4556 /* The symbol and offset must be aligned to the access size. */
4557 unsigned int align;
4558 unsigned int ref_size;
4559
4560 if (CONSTANT_POOL_ADDRESS_P (sym))
4561 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4562 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4563 {
4564 tree exp = SYMBOL_REF_DECL (sym);
4565 align = TYPE_ALIGN (TREE_TYPE (exp));
4566 align = CONSTANT_ALIGNMENT (exp, align);
4567 }
4568 else if (SYMBOL_REF_DECL (sym))
4569 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4570 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4571 && SYMBOL_REF_BLOCK (sym) != NULL)
4572 align = SYMBOL_REF_BLOCK (sym)->alignment;
4573 else
4574 align = BITS_PER_UNIT;
4575
4576 ref_size = GET_MODE_SIZE (mode);
4577 if (ref_size == 0)
4578 ref_size = GET_MODE_SIZE (DImode);
4579
4580 return ((INTVAL (offs) & (ref_size - 1)) == 0
4581 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4582 }
4583 }
4584 return false;
4585
4586 default:
4587 return false;
4588 }
4589 }
4590
4591 /* Return true if the address X is valid for a PRFM instruction.
4592 STRICT_P is true if we should do strict checking with
4593 aarch64_classify_address. */
4594
4595 bool
4596 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4597 {
4598 struct aarch64_address_info addr;
4599
4600 /* PRFM accepts the same addresses as DImode... */
4601 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4602 if (!res)
4603 return false;
4604
4605 /* ... except writeback forms. */
4606 return addr.type != ADDRESS_REG_WB;
4607 }
4608
4609 bool
4610 aarch64_symbolic_address_p (rtx x)
4611 {
4612 rtx offset;
4613
4614 split_const (x, &x, &offset);
4615 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4616 }
4617
4618 /* Classify the base of symbolic expression X. */
4619
4620 enum aarch64_symbol_type
4621 aarch64_classify_symbolic_expression (rtx x)
4622 {
4623 rtx offset;
4624
4625 split_const (x, &x, &offset);
4626 return aarch64_classify_symbol (x, offset);
4627 }
4628
4629
4630 /* Return TRUE if X is a legitimate address for accessing memory in
4631 mode MODE. */
4632 static bool
4633 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4634 {
4635 struct aarch64_address_info addr;
4636
4637 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4638 }
4639
4640 /* Return TRUE if X is a legitimate address for accessing memory in
4641 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4642 pair operation. */
4643 bool
4644 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4645 RTX_CODE outer_code, bool strict_p)
4646 {
4647 struct aarch64_address_info addr;
4648
4649 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4650 }
4651
4652 /* Split an out-of-range address displacement into a base and offset.
4653 Use a 4KB range for 1- and 2-byte accesses and a 16KB range otherwise,
4654 to increase opportunities for sharing the base address across access sizes.
4655 For unaligned accesses and TI/TF mode, use the signed 9-bit range. */
4656 static bool
4657 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4658 {
4659 HOST_WIDE_INT offset = INTVAL (*disp);
4660 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4661
4662 if (mode == TImode || mode == TFmode
4663 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4664 base = (offset + 0x100) & ~0x1ff;
4665
4666 *off = GEN_INT (base);
4667 *disp = GEN_INT (offset - base);
4668 return true;
4669 }
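
/* A standalone sketch of the split performed above, kept out of the build;
   the function name and parameters here are illustrative only.  */
#if 0
#include <stdint.h>

static int64_t
split_displacement_example (int64_t offset, unsigned access_size,
                            int unaligned_or_ti_tf, int64_t *remainder)
{
  int64_t anchor;

  if (unaligned_or_ti_tf)
    /* Signed 9-bit window around the offset.  */
    anchor = (offset + 0x100) & ~(int64_t) 0x1ff;
  else
    /* 4KB range for 1- and 2-byte accesses, 16KB otherwise.  */
    anchor = offset & ~(int64_t) (access_size < 4 ? 0xfff : 0x3ffc);

  *remainder = offset - anchor;
  return anchor;
}
#endif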
4670
4671 /* Return TRUE if rtx X is immediate constant 0.0 */
4672 bool
4673 aarch64_float_const_zero_rtx_p (rtx x)
4674 {
4675 if (GET_MODE (x) == VOIDmode)
4676 return false;
4677
4678 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4679 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4680 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4681 }
4682
4683 /* Return the fixed registers used for condition codes. */
4684
4685 static bool
4686 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4687 {
4688 *p1 = CC_REGNUM;
4689 *p2 = INVALID_REGNUM;
4690 return true;
4691 }
4692
4693 /* This function is used by the call expanders of the machine description.
4694 RESULT is the register in which the result is returned. It's NULL for
4695 "call" and "sibcall".
4696 MEM is the location of the function call.
4697 SIBCALL indicates whether this function call is a normal call or a sibling
4698 call; a different pattern is generated accordingly. */
4699
4700 void
4701 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4702 {
4703 rtx call, callee, tmp;
4704 rtvec vec;
4705 machine_mode mode;
4706
4707 gcc_assert (MEM_P (mem));
4708 callee = XEXP (mem, 0);
4709 mode = GET_MODE (callee);
4710 gcc_assert (mode == Pmode);
4711
4712 /* Decide if we should generate indirect calls by loading the
4713 address of the callee into a register before performing
4714 the branch-and-link. */
4715 if (SYMBOL_REF_P (callee)
4716 ? (aarch64_is_long_call_p (callee)
4717 || aarch64_is_noplt_call_p (callee))
4718 : !REG_P (callee))
4719 XEXP (mem, 0) = force_reg (mode, callee);
4720
4721 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4722
4723 if (result != NULL_RTX)
4724 call = gen_rtx_SET (result, call);
4725
4726 if (sibcall)
4727 tmp = ret_rtx;
4728 else
4729 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4730
4731 vec = gen_rtvec (2, call, tmp);
4732 call = gen_rtx_PARALLEL (VOIDmode, vec);
4733
4734 aarch64_emit_call_insn (call);
4735 }
4736
4737 /* Emit call insn with PAT and do aarch64-specific handling. */
4738
4739 void
4740 aarch64_emit_call_insn (rtx pat)
4741 {
4742 rtx insn = emit_call_insn (pat);
4743
4744 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4745 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4746 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4747 }
4748
4749 machine_mode
4750 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4751 {
4752 /* All floating point compares return CCFP if it is an equality
4753 comparison, and CCFPE otherwise. */
4754 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4755 {
4756 switch (code)
4757 {
4758 case EQ:
4759 case NE:
4760 case UNORDERED:
4761 case ORDERED:
4762 case UNLT:
4763 case UNLE:
4764 case UNGT:
4765 case UNGE:
4766 case UNEQ:
4767 case LTGT:
4768 return CCFPmode;
4769
4770 case LT:
4771 case LE:
4772 case GT:
4773 case GE:
4774 return CCFPEmode;
4775
4776 default:
4777 gcc_unreachable ();
4778 }
4779 }
4780
4781 /* Equality comparisons of short modes against zero can be performed
4782 using the TST instruction with the appropriate bitmask. */
4783 if (y == const0_rtx && REG_P (x)
4784 && (code == EQ || code == NE)
4785 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4786 return CC_NZmode;
4787
4788 /* Similarly, comparisons of zero_extends from shorter modes can
4789 be performed using an ANDS with an immediate mask. */
4790 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4791 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4792 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4793 && (code == EQ || code == NE))
4794 return CC_NZmode;
4795
4796 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4797 && y == const0_rtx
4798 && (code == EQ || code == NE || code == LT || code == GE)
4799 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4800 || GET_CODE (x) == NEG
4801 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4802 && CONST_INT_P (XEXP (x, 2)))))
4803 return CC_NZmode;
4804
4805 /* A compare with a shifted operand. Because of canonicalization,
4806 the comparison will have to be swapped when we emit the assembly
4807 code. */
4808 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4809 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4810 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4811 || GET_CODE (x) == LSHIFTRT
4812 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4813 return CC_SWPmode;
4814
4815 /* Similarly for a negated operand, but we can only do this for
4816 equalities. */
4817 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4818 && (REG_P (y) || GET_CODE (y) == SUBREG)
4819 && (code == EQ || code == NE)
4820 && GET_CODE (x) == NEG)
4821 return CC_Zmode;
4822
4823 /* A test for unsigned overflow. */
4824 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4825 && code == NE
4826 && GET_CODE (x) == PLUS
4827 && GET_CODE (y) == ZERO_EXTEND)
4828 return CC_Cmode;
4829
4830 /* For everything else, return CCmode. */
4831 return CCmode;
4832 }
4833
4834 static int
4835 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4836
4837 int
4838 aarch64_get_condition_code (rtx x)
4839 {
4840 machine_mode mode = GET_MODE (XEXP (x, 0));
4841 enum rtx_code comp_code = GET_CODE (x);
4842
4843 if (GET_MODE_CLASS (mode) != MODE_CC)
4844 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4845 return aarch64_get_condition_code_1 (mode, comp_code);
4846 }
4847
4848 static int
4849 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4850 {
4851 switch (mode)
4852 {
4853 case CCFPmode:
4854 case CCFPEmode:
4855 switch (comp_code)
4856 {
4857 case GE: return AARCH64_GE;
4858 case GT: return AARCH64_GT;
4859 case LE: return AARCH64_LS;
4860 case LT: return AARCH64_MI;
4861 case NE: return AARCH64_NE;
4862 case EQ: return AARCH64_EQ;
4863 case ORDERED: return AARCH64_VC;
4864 case UNORDERED: return AARCH64_VS;
4865 case UNLT: return AARCH64_LT;
4866 case UNLE: return AARCH64_LE;
4867 case UNGT: return AARCH64_HI;
4868 case UNGE: return AARCH64_PL;
4869 default: return -1;
4870 }
4871 break;
4872
4873 case CCmode:
4874 switch (comp_code)
4875 {
4876 case NE: return AARCH64_NE;
4877 case EQ: return AARCH64_EQ;
4878 case GE: return AARCH64_GE;
4879 case GT: return AARCH64_GT;
4880 case LE: return AARCH64_LE;
4881 case LT: return AARCH64_LT;
4882 case GEU: return AARCH64_CS;
4883 case GTU: return AARCH64_HI;
4884 case LEU: return AARCH64_LS;
4885 case LTU: return AARCH64_CC;
4886 default: return -1;
4887 }
4888 break;
4889
4890 case CC_SWPmode:
4891 switch (comp_code)
4892 {
4893 case NE: return AARCH64_NE;
4894 case EQ: return AARCH64_EQ;
4895 case GE: return AARCH64_LE;
4896 case GT: return AARCH64_LT;
4897 case LE: return AARCH64_GE;
4898 case LT: return AARCH64_GT;
4899 case GEU: return AARCH64_LS;
4900 case GTU: return AARCH64_CC;
4901 case LEU: return AARCH64_CS;
4902 case LTU: return AARCH64_HI;
4903 default: return -1;
4904 }
4905 break;
4906
4907 case CC_NZmode:
4908 switch (comp_code)
4909 {
4910 case NE: return AARCH64_NE;
4911 case EQ: return AARCH64_EQ;
4912 case GE: return AARCH64_PL;
4913 case LT: return AARCH64_MI;
4914 default: return -1;
4915 }
4916 break;
4917
4918 case CC_Zmode:
4919 switch (comp_code)
4920 {
4921 case NE: return AARCH64_NE;
4922 case EQ: return AARCH64_EQ;
4923 default: return -1;
4924 }
4925 break;
4926
4927 case CC_Cmode:
4928 switch (comp_code)
4929 {
4930 case NE: return AARCH64_CS;
4931 case EQ: return AARCH64_CC;
4932 default: return -1;
4933 }
4934 break;
4935
4936 default:
4937 return -1;
4938 }
4939
4940 return -1;
4941 }
4942
4943 bool
4944 aarch64_const_vec_all_same_in_range_p (rtx x,
4945 HOST_WIDE_INT minval,
4946 HOST_WIDE_INT maxval)
4947 {
4948 HOST_WIDE_INT firstval;
4949 int count, i;
4950
4951 if (GET_CODE (x) != CONST_VECTOR
4952 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4953 return false;
4954
4955 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4956 if (firstval < minval || firstval > maxval)
4957 return false;
4958
4959 count = CONST_VECTOR_NUNITS (x);
4960 for (i = 1; i < count; i++)
4961 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4962 return false;
4963
4964 return true;
4965 }
4966
4967 bool
4968 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4969 {
4970 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4971 }
4972
4973
4974 /* N Z C V. */
4975 #define AARCH64_CC_V 1
4976 #define AARCH64_CC_C (1 << 1)
4977 #define AARCH64_CC_Z (1 << 2)
4978 #define AARCH64_CC_N (1 << 3)
4979
4980 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4981 static const int aarch64_nzcv_codes[] =
4982 {
4983 0, /* EQ, Z == 1. */
4984 AARCH64_CC_Z, /* NE, Z == 0. */
4985 0, /* CS, C == 1. */
4986 AARCH64_CC_C, /* CC, C == 0. */
4987 0, /* MI, N == 1. */
4988 AARCH64_CC_N, /* PL, N == 0. */
4989 0, /* VS, V == 1. */
4990 AARCH64_CC_V, /* VC, V == 0. */
4991 0, /* HI, C == 1 && Z == 0. */
4992 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4993 AARCH64_CC_V, /* GE, N == V. */
4994 0, /* LT, N != V. */
4995 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4996 0, /* LE, !(Z == 0 && N == V). */
4997 0, /* AL, Any. */
4998 0 /* NV, Any. */
4999 };
5000
5001 static void
5002 aarch64_print_operand (FILE *f, rtx x, int code)
5003 {
5004 switch (code)
5005 {
5006 /* An integer or symbol address without a preceding # sign. */
5007 case 'c':
5008 switch (GET_CODE (x))
5009 {
5010 case CONST_INT:
5011 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5012 break;
5013
5014 case SYMBOL_REF:
5015 output_addr_const (f, x);
5016 break;
5017
5018 case CONST:
5019 if (GET_CODE (XEXP (x, 0)) == PLUS
5020 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5021 {
5022 output_addr_const (f, x);
5023 break;
5024 }
5025 /* Fall through. */
5026
5027 default:
5028 output_operand_lossage ("Unsupported operand for code '%c'", code);
5029 }
5030 break;
5031
5032 case 'e':
5033 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
5034 {
5035 int n;
5036
5037 if (!CONST_INT_P (x)
5038 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5039 {
5040 output_operand_lossage ("invalid operand for '%%%c'", code);
5041 return;
5042 }
5043
5044 switch (n)
5045 {
5046 case 3:
5047 fputc ('b', f);
5048 break;
5049 case 4:
5050 fputc ('h', f);
5051 break;
5052 case 5:
5053 fputc ('w', f);
5054 break;
5055 default:
5056 output_operand_lossage ("invalid operand for '%%%c'", code);
5057 return;
5058 }
5059 }
5060 break;
5061
5062 case 'p':
5063 {
5064 int n;
5065
5066 /* Print N such that 2^N == X. */
5067 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5068 {
5069 output_operand_lossage ("invalid operand for '%%%c'", code);
5070 return;
5071 }
5072
5073 asm_fprintf (f, "%d", n);
5074 }
5075 break;
5076
5077 case 'P':
5078 /* Print the number of non-zero bits in X (a const_int). */
5079 if (!CONST_INT_P (x))
5080 {
5081 output_operand_lossage ("invalid operand for '%%%c'", code);
5082 return;
5083 }
5084
5085 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5086 break;
5087
5088 case 'H':
5089 /* Print the higher numbered register of a pair (TImode) of regs. */
5090 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5091 {
5092 output_operand_lossage ("invalid operand for '%%%c'", code);
5093 return;
5094 }
5095
5096 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5097 break;
5098
5099 case 'M':
5100 case 'm':
5101 {
5102 int cond_code;
5103 /* Print a condition (eq, ne, etc) or its inverse. */
5104
5105 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5106 if (x == const_true_rtx)
5107 {
5108 if (code == 'M')
5109 fputs ("nv", f);
5110 return;
5111 }
5112
5113 if (!COMPARISON_P (x))
5114 {
5115 output_operand_lossage ("invalid operand for '%%%c'", code);
5116 return;
5117 }
5118
5119 cond_code = aarch64_get_condition_code (x);
5120 gcc_assert (cond_code >= 0);
5121 if (code == 'M')
5122 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5123 fputs (aarch64_condition_codes[cond_code], f);
5124 }
5125 break;
5126
5127 case 'b':
5128 case 'h':
5129 case 's':
5130 case 'd':
5131 case 'q':
5132 /* Print a scalar FP/SIMD register name. */
5133 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5134 {
5135 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5136 return;
5137 }
5138 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5139 break;
5140
5141 case 'S':
5142 case 'T':
5143 case 'U':
5144 case 'V':
5145 /* Print the first FP/SIMD register name in a list. */
5146 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5147 {
5148 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5149 return;
5150 }
5151 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5152 break;
5153
5154 case 'R':
5155 /* Print a scalar FP/SIMD register name + 1. */
5156 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5157 {
5158 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5159 return;
5160 }
5161 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5162 break;
5163
5164 case 'X':
5165 /* Print bottom 16 bits of integer constant in hex. */
5166 if (!CONST_INT_P (x))
5167 {
5168 output_operand_lossage ("invalid operand for '%%%c'", code);
5169 return;
5170 }
5171 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5172 break;
5173
5174 case 'w':
5175 case 'x':
5176 /* Print a general register name or the zero register (32-bit or
5177 64-bit). */
5178 if (x == const0_rtx
5179 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5180 {
5181 asm_fprintf (f, "%czr", code);
5182 break;
5183 }
5184
5185 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5186 {
5187 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5188 break;
5189 }
5190
5191 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5192 {
5193 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5194 break;
5195 }
5196
5197 /* Fall through */
5198
5199 case 0:
5200 /* Print a normal operand. If it's a general register, then we
5201 assume DImode. */
5202 if (x == NULL)
5203 {
5204 output_operand_lossage ("missing operand");
5205 return;
5206 }
5207
5208 switch (GET_CODE (x))
5209 {
5210 case REG:
5211 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5212 break;
5213
5214 case MEM:
5215 output_address (GET_MODE (x), XEXP (x, 0));
5216 break;
5217
5218 case CONST:
5219 case LABEL_REF:
5220 case SYMBOL_REF:
5221 output_addr_const (asm_out_file, x);
5222 break;
5223
5224 case CONST_INT:
5225 asm_fprintf (f, "%wd", INTVAL (x));
5226 break;
5227
5228 case CONST_VECTOR:
5229 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5230 {
5231 gcc_assert (
5232 aarch64_const_vec_all_same_in_range_p (x,
5233 HOST_WIDE_INT_MIN,
5234 HOST_WIDE_INT_MAX));
5235 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5236 }
5237 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5238 {
5239 fputc ('0', f);
5240 }
5241 else
5242 gcc_unreachable ();
5243 break;
5244
5245 case CONST_DOUBLE:
5246 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5247 be getting CONST_DOUBLEs holding integers. */
5248 gcc_assert (GET_MODE (x) != VOIDmode);
5249 if (aarch64_float_const_zero_rtx_p (x))
5250 {
5251 fputc ('0', f);
5252 break;
5253 }
5254 else if (aarch64_float_const_representable_p (x))
5255 {
5256 #define buf_size 20
5257 char float_buf[buf_size] = {'\0'};
5258 real_to_decimal_for_mode (float_buf,
5259 CONST_DOUBLE_REAL_VALUE (x),
5260 buf_size, buf_size,
5261 1, GET_MODE (x));
5262 asm_fprintf (asm_out_file, "%s", float_buf);
5263 break;
5264 #undef buf_size
5265 }
5266 output_operand_lossage ("invalid constant");
5267 return;
5268 default:
5269 output_operand_lossage ("invalid operand");
5270 return;
5271 }
5272 break;
5273
5274 case 'A':
5275 if (GET_CODE (x) == HIGH)
5276 x = XEXP (x, 0);
5277
5278 switch (aarch64_classify_symbolic_expression (x))
5279 {
5280 case SYMBOL_SMALL_GOT_4G:
5281 asm_fprintf (asm_out_file, ":got:");
5282 break;
5283
5284 case SYMBOL_SMALL_TLSGD:
5285 asm_fprintf (asm_out_file, ":tlsgd:");
5286 break;
5287
5288 case SYMBOL_SMALL_TLSDESC:
5289 asm_fprintf (asm_out_file, ":tlsdesc:");
5290 break;
5291
5292 case SYMBOL_SMALL_TLSIE:
5293 asm_fprintf (asm_out_file, ":gottprel:");
5294 break;
5295
5296 case SYMBOL_TLSLE24:
5297 asm_fprintf (asm_out_file, ":tprel:");
5298 break;
5299
5300 case SYMBOL_TINY_GOT:
5301 gcc_unreachable ();
5302 break;
5303
5304 default:
5305 break;
5306 }
5307 output_addr_const (asm_out_file, x);
5308 break;
5309
5310 case 'L':
5311 switch (aarch64_classify_symbolic_expression (x))
5312 {
5313 case SYMBOL_SMALL_GOT_4G:
5314 asm_fprintf (asm_out_file, ":lo12:");
5315 break;
5316
5317 case SYMBOL_SMALL_TLSGD:
5318 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5319 break;
5320
5321 case SYMBOL_SMALL_TLSDESC:
5322 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5323 break;
5324
5325 case SYMBOL_SMALL_TLSIE:
5326 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5327 break;
5328
5329 case SYMBOL_TLSLE12:
5330 asm_fprintf (asm_out_file, ":tprel_lo12:");
5331 break;
5332
5333 case SYMBOL_TLSLE24:
5334 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5335 break;
5336
5337 case SYMBOL_TINY_GOT:
5338 asm_fprintf (asm_out_file, ":got:");
5339 break;
5340
5341 case SYMBOL_TINY_TLSIE:
5342 asm_fprintf (asm_out_file, ":gottprel:");
5343 break;
5344
5345 default:
5346 break;
5347 }
5348 output_addr_const (asm_out_file, x);
5349 break;
5350
5351 case 'G':
5352
5353 switch (aarch64_classify_symbolic_expression (x))
5354 {
5355 case SYMBOL_TLSLE24:
5356 asm_fprintf (asm_out_file, ":tprel_hi12:");
5357 break;
5358 default:
5359 break;
5360 }
5361 output_addr_const (asm_out_file, x);
5362 break;
5363
5364 case 'k':
5365 {
5366 HOST_WIDE_INT cond_code;
5367 /* Print nzcv. */
5368
5369 if (!CONST_INT_P (x))
5370 {
5371 output_operand_lossage ("invalid operand for '%%%c'", code);
5372 return;
5373 }
5374
5375 cond_code = INTVAL (x);
5376 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5377 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5378 }
5379 break;
5380
5381 default:
5382 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5383 return;
5384 }
5385 }
5386
5387 static void
5388 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5389 {
5390 struct aarch64_address_info addr;
5391
5392 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5393 switch (addr.type)
5394 {
5395 case ADDRESS_REG_IMM:
5396 if (addr.offset == const0_rtx)
5397 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5398 else
5399 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5400 INTVAL (addr.offset));
5401 return;
5402
5403 case ADDRESS_REG_REG:
5404 if (addr.shift == 0)
5405 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5406 reg_names [REGNO (addr.offset)]);
5407 else
5408 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5409 reg_names [REGNO (addr.offset)], addr.shift);
5410 return;
5411
5412 case ADDRESS_REG_UXTW:
5413 if (addr.shift == 0)
5414 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5415 REGNO (addr.offset) - R0_REGNUM);
5416 else
5417 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5418 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5419 return;
5420
5421 case ADDRESS_REG_SXTW:
5422 if (addr.shift == 0)
5423 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5424 REGNO (addr.offset) - R0_REGNUM);
5425 else
5426 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5427 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5428 return;
5429
5430 case ADDRESS_REG_WB:
5431 switch (GET_CODE (x))
5432 {
5433 case PRE_INC:
5434 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5435 GET_MODE_SIZE (mode));
5436 return;
5437 case POST_INC:
5438 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5439 GET_MODE_SIZE (mode));
5440 return;
5441 case PRE_DEC:
5442 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5443 GET_MODE_SIZE (mode));
5444 return;
5445 case POST_DEC:
5446 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5447 GET_MODE_SIZE (mode));
5448 return;
5449 case PRE_MODIFY:
5450 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5451 INTVAL (addr.offset));
5452 return;
5453 case POST_MODIFY:
5454 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5455 INTVAL (addr.offset));
5456 return;
5457 default:
5458 break;
5459 }
5460 break;
5461
5462 case ADDRESS_LO_SUM:
5463 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5464 output_addr_const (f, addr.offset);
5465 asm_fprintf (f, "]");
5466 return;
5467
5468 case ADDRESS_SYMBOLIC:
5469 break;
5470 }
5471
5472 output_addr_const (f, x);
5473 }
5474
5475 bool
5476 aarch64_label_mentioned_p (rtx x)
5477 {
5478 const char *fmt;
5479 int i;
5480
5481 if (GET_CODE (x) == LABEL_REF)
5482 return true;
5483
5484 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5485 referencing instruction, but they are constant offsets, not
5486 symbols. */
5487 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5488 return false;
5489
5490 fmt = GET_RTX_FORMAT (GET_CODE (x));
5491 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5492 {
5493 if (fmt[i] == 'E')
5494 {
5495 int j;
5496
5497 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5498 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5499 return 1;
5500 }
5501 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5502 return 1;
5503 }
5504
5505 return 0;
5506 }
5507
5508 /* Implement REGNO_REG_CLASS. */
5509
5510 enum reg_class
5511 aarch64_regno_regclass (unsigned regno)
5512 {
5513 if (GP_REGNUM_P (regno))
5514 return GENERAL_REGS;
5515
5516 if (regno == SP_REGNUM)
5517 return STACK_REG;
5518
5519 if (regno == FRAME_POINTER_REGNUM
5520 || regno == ARG_POINTER_REGNUM)
5521 return POINTER_REGS;
5522
5523 if (FP_REGNUM_P (regno))
5524 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5525
5526 return NO_REGS;
5527 }
5528
5529 static rtx
5530 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5531 {
5532 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5533 where mask is selected by alignment and size of the offset.
5534 We try to pick as large a range for the offset as possible to
5535 maximize the chance of a CSE. However, for aligned addresses
5536 we limit the range to 4k so that structures with different sized
5537 elements are likely to use the same base. We need to be careful
5538 not to split a CONST for some forms of address expression, otherwise
5539 it will generate sub-optimal code. */
5540
5541 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5542 {
5543 rtx base = XEXP (x, 0);
5544 rtx offset_rtx = XEXP (x, 1);
5545 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5546
5547 if (GET_CODE (base) == PLUS)
5548 {
5549 rtx op0 = XEXP (base, 0);
5550 rtx op1 = XEXP (base, 1);
5551
5552 /* Force any scaling into a temp for CSE. */
5553 op0 = force_reg (Pmode, op0);
5554 op1 = force_reg (Pmode, op1);
5555
5556 /* Let the pointer register be in op0. */
5557 if (REG_POINTER (op1))
5558 std::swap (op0, op1);
5559
5560 /* If the pointer is virtual or frame related, then we know that
5561 virtual register instantiation or register elimination is going
5562 to apply a second constant. We want the two constants folded
5563 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5564 if (virt_or_elim_regno_p (REGNO (op0)))
5565 {
5566 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5567 NULL_RTX, true, OPTAB_DIRECT);
5568 return gen_rtx_PLUS (Pmode, base, op1);
5569 }
5570
5571 /* Otherwise, in order to encourage CSE (and thence loop strength
5572 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5573 base = expand_binop (Pmode, add_optab, op0, op1,
5574 NULL_RTX, true, OPTAB_DIRECT);
5575 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5576 }
5577
5578 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5579 HOST_WIDE_INT base_offset;
5580 if (GET_MODE_SIZE (mode) > 16)
5581 base_offset = (offset + 0x400) & ~0x7f0;
5582 /* For offsets that aren't a multiple of the access size, the limit is
5583 -256...255. */
5584 else if (offset & (GET_MODE_SIZE (mode) - 1))
5585 {
5586 base_offset = (offset + 0x100) & ~0x1ff;
5587
5588 /* BLKmode typically uses LDP of X-registers. */
5589 if (mode == BLKmode)
5590 base_offset = (offset + 512) & ~0x3ff;
5591 }
5592 /* Small negative offsets are supported. */
5593 else if (IN_RANGE (offset, -256, 0))
5594 base_offset = 0;
5595 else if (mode == TImode || mode == TFmode)
5596 base_offset = (offset + 0x100) & ~0x1ff;
5597 /* Otherwise use an unsigned 12-bit offset scaled by the access size. */
5598 else
5599 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5600
5601 if (base_offset != 0)
5602 {
5603 base = plus_constant (Pmode, base, base_offset);
5604 base = force_operand (base, NULL_RTX);
5605 return plus_constant (Pmode, base, offset - base_offset);
5606 }
5607 }
5608
5609 return x;
5610 }
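/* Illustrative example of the splitting above (editorial, not from the
   original source): with MODE == SImode and X == (plus (reg X1)
   (const_int 0x13004)), none of the earlier special cases apply, so
   base_offset = 0x13004 & ~0x3fff = 0x10000 and we return
   (plus (X1 + 0x10000) (const_int 0x3004)). The residual 0x3004 is a
   multiple of 4 and fits the scaled unsigned 12-bit LDR/STR offset range
   (0..16380 for 4-byte accesses), while the anchor X1 + 0x10000 can be
   CSEd across neighbouring accesses. */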
5611
5612 /* Return the reload icode required for a constant pool entry of mode MODE. */
5613 static enum insn_code
5614 aarch64_constant_pool_reload_icode (machine_mode mode)
5615 {
5616 switch (mode)
5617 {
5618 case SFmode:
5619 return CODE_FOR_aarch64_reload_movcpsfdi;
5620
5621 case DFmode:
5622 return CODE_FOR_aarch64_reload_movcpdfdi;
5623
5624 case TFmode:
5625 return CODE_FOR_aarch64_reload_movcptfdi;
5626
5627 case V8QImode:
5628 return CODE_FOR_aarch64_reload_movcpv8qidi;
5629
5630 case V16QImode:
5631 return CODE_FOR_aarch64_reload_movcpv16qidi;
5632
5633 case V4HImode:
5634 return CODE_FOR_aarch64_reload_movcpv4hidi;
5635
5636 case V8HImode:
5637 return CODE_FOR_aarch64_reload_movcpv8hidi;
5638
5639 case V2SImode:
5640 return CODE_FOR_aarch64_reload_movcpv2sidi;
5641
5642 case V4SImode:
5643 return CODE_FOR_aarch64_reload_movcpv4sidi;
5644
5645 case V2DImode:
5646 return CODE_FOR_aarch64_reload_movcpv2didi;
5647
5648 case V2DFmode:
5649 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5650
5651 default:
5652 gcc_unreachable ();
5653 }
5654
5655 gcc_unreachable ();
5656 }
5657 static reg_class_t
5658 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5659 reg_class_t rclass,
5660 machine_mode mode,
5661 secondary_reload_info *sri)
5662 {
5663
5664 /* If we have to disable direct literal pool loads and stores because the
5665 function is too big, then we need a scratch register. */
5666 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5667 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5668 || targetm.vector_mode_supported_p (GET_MODE (x)))
5669 && !aarch64_pcrelative_literal_loads)
5670 {
5671 sri->icode = aarch64_constant_pool_reload_icode (mode);
5672 return NO_REGS;
5673 }
5674
5675 /* Without the TARGET_SIMD instructions we cannot move a Q register
5676 to a Q register directly. We need a scratch. */
5677 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5678 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5679 && reg_class_subset_p (rclass, FP_REGS))
5680 {
5681 if (mode == TFmode)
5682 sri->icode = CODE_FOR_aarch64_reload_movtf;
5683 else if (mode == TImode)
5684 sri->icode = CODE_FOR_aarch64_reload_movti;
5685 return NO_REGS;
5686 }
5687
5688 /* A TFmode or TImode memory access should be handled via an FP_REG
5689 because AArch64 has richer addressing modes for LDR/STR instructions
5690 than for LDP/STP instructions. */
5691 if (TARGET_FLOAT && rclass == GENERAL_REGS
5692 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5693 return FP_REGS;
5694
5695 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5696 return GENERAL_REGS;
5697
5698 return NO_REGS;
5699 }
5700
5701 static bool
5702 aarch64_can_eliminate (const int from, const int to)
5703 {
5704 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5705 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5706
5707 if (frame_pointer_needed)
5708 {
5709 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5710 return true;
5711 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5712 return false;
5713 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5714 && !cfun->calls_alloca)
5715 return true;
5716 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5717 return true;
5718
5719 return false;
5720 }
5721 else
5722 {
5723 /* If we decided that we didn't need a leaf frame pointer but then used
5724 LR in the function, then we'll want a frame pointer after all, so
5725 prevent this elimination to ensure a frame pointer is used. */
5726 if (to == STACK_POINTER_REGNUM
5727 && flag_omit_leaf_frame_pointer
5728 && df_regs_ever_live_p (LR_REGNUM))
5729 return false;
5730 }
5731
5732 return true;
5733 }
5734
5735 HOST_WIDE_INT
5736 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5737 {
5738 aarch64_layout_frame ();
5739
5740 if (to == HARD_FRAME_POINTER_REGNUM)
5741 {
5742 if (from == ARG_POINTER_REGNUM)
5743 return cfun->machine->frame.hard_fp_offset;
5744
5745 if (from == FRAME_POINTER_REGNUM)
5746 return cfun->machine->frame.hard_fp_offset
5747 - cfun->machine->frame.locals_offset;
5748 }
5749
5750 if (to == STACK_POINTER_REGNUM)
5751 {
5752 if (from == FRAME_POINTER_REGNUM)
5753 return cfun->machine->frame.frame_size
5754 - cfun->machine->frame.locals_offset;
5755 }
5756
5757 return cfun->machine->frame.frame_size;
5758 }
5759
5760 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5761 previous frame. */
5762
5763 rtx
5764 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5765 {
5766 if (count != 0)
5767 return const0_rtx;
5768 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5769 }
5770
5771
5772 static void
5773 aarch64_asm_trampoline_template (FILE *f)
5774 {
5775 if (TARGET_ILP32)
5776 {
5777 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5778 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5779 }
5780 else
5781 {
5782 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5783 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5784 }
5785 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5786 assemble_aligned_integer (4, const0_rtx);
5787 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5788 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5789 }
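/* For illustration (editorial; assumes the usual LP64 register assignment
   where IP1 is x17 and the static chain register is x18): the template
   above expands to roughly

	ldr	x17, .+16	// load the target function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	.word	0		// pad the code to 16 bytes
	.xword	0		// patched with the function address
	.xword	0		// patched with the static chain

   aarch64_trampoline_init below fills in the two trailing pointer-sized
   slots at offsets 16 and 24. */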
5790
5791 static void
5792 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5793 {
5794 rtx fnaddr, mem, a_tramp;
5795 const int tramp_code_sz = 16;
5796
5797 /* We don't need to copy the trailing D-words; we fill those in below. */
5798 emit_block_move (m_tramp, assemble_trampoline_template (),
5799 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5800 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5801 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5802 if (GET_MODE (fnaddr) != ptr_mode)
5803 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5804 emit_move_insn (mem, fnaddr);
5805
5806 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5807 emit_move_insn (mem, chain_value);
5808
5809 /* XXX We should really define a "clear_cache" pattern and use
5810 gen_clear_cache(). */
5811 a_tramp = XEXP (m_tramp, 0);
5812 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5813 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5814 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5815 ptr_mode);
5816 }
5817
5818 static unsigned char
5819 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5820 {
5821 switch (regclass)
5822 {
5823 case CALLER_SAVE_REGS:
5824 case POINTER_REGS:
5825 case GENERAL_REGS:
5826 case ALL_REGS:
5827 case FP_REGS:
5828 case FP_LO_REGS:
5829 return
5830 aarch64_vector_mode_p (mode)
5831 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5832 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5833 case STACK_REG:
5834 return 1;
5835
5836 case NO_REGS:
5837 return 0;
5838
5839 default:
5840 break;
5841 }
5842 gcc_unreachable ();
5843 }
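/* Worked example (illustrative, assuming LP64 with UNITS_PER_WORD == 8 and
   UNITS_PER_VREG == 16): a TImode value in GENERAL_REGS needs
   (16 + 8 - 1) / 8 == 2 X-registers, whereas a V4SImode value in FP_REGS
   is a vector mode and needs (16 + 16 - 1) / 16 == 1 Q-register. */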
5844
5845 static reg_class_t
5846 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5847 {
5848 if (regclass == POINTER_REGS)
5849 return GENERAL_REGS;
5850
5851 if (regclass == STACK_REG)
5852 {
5853 if (REG_P(x)
5854 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5855 return regclass;
5856
5857 return NO_REGS;
5858 }
5859
5860 /* If it's an integer immediate that MOVI can't handle, then
5861 FP_REGS is not an option, so we return NO_REGS instead. */
5862 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5863 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5864 return NO_REGS;
5865
5866 /* Register elimination can result in a request for
5867 SP+constant->FP_REGS. We cannot support such operations, which
5868 use SP as source and an FP_REG as destination, so reject them
5869 outright now. */
5870 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5871 {
5872 rtx lhs = XEXP (x, 0);
5873
5874 /* Look through a possible SUBREG introduced by ILP32. */
5875 if (GET_CODE (lhs) == SUBREG)
5876 lhs = SUBREG_REG (lhs);
5877
5878 gcc_assert (REG_P (lhs));
5879 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5880 POINTER_REGS));
5881 return NO_REGS;
5882 }
5883
5884 return regclass;
5885 }
5886
5887 void
5888 aarch64_asm_output_labelref (FILE* f, const char *name)
5889 {
5890 asm_fprintf (f, "%U%s", name);
5891 }
5892
5893 static void
5894 aarch64_elf_asm_constructor (rtx symbol, int priority)
5895 {
5896 if (priority == DEFAULT_INIT_PRIORITY)
5897 default_ctor_section_asm_out_constructor (symbol, priority);
5898 else
5899 {
5900 section *s;
5901 /* Although PRIORITY is known to be in the range [0, 65535], and so
5902 18 bytes would be enough, the compiler might not know that. To avoid
5903 a -Wformat-truncation false positive, use a larger size. */
5904 char buf[23];
5905 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5906 s = get_section (buf, SECTION_WRITE, NULL);
5907 switch_to_section (s);
5908 assemble_align (POINTER_SIZE);
5909 assemble_aligned_integer (POINTER_BYTES, symbol);
5910 }
5911 }
5912
5913 static void
5914 aarch64_elf_asm_destructor (rtx symbol, int priority)
5915 {
5916 if (priority == DEFAULT_INIT_PRIORITY)
5917 default_dtor_section_asm_out_destructor (symbol, priority);
5918 else
5919 {
5920 section *s;
5921 /* Although PRIORITY is known to be in the range [0, 65535], and so
5922 18 bytes would be enough, the compiler might not know that. To avoid
5923 a -Wformat-truncation false positive, use a larger size. */
5924 char buf[23];
5925 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5926 s = get_section (buf, SECTION_WRITE, NULL);
5927 switch_to_section (s);
5928 assemble_align (POINTER_SIZE);
5929 assemble_aligned_integer (POINTER_BYTES, symbol);
5930 }
5931 }
5932
5933 const char*
5934 aarch64_output_casesi (rtx *operands)
5935 {
5936 char buf[100];
5937 char label[100];
5938 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5939 int index;
5940 static const char *const patterns[4][2] =
5941 {
5942 {
5943 "ldrb\t%w3, [%0,%w1,uxtw]",
5944 "add\t%3, %4, %w3, sxtb #2"
5945 },
5946 {
5947 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5948 "add\t%3, %4, %w3, sxth #2"
5949 },
5950 {
5951 "ldr\t%w3, [%0,%w1,uxtw #2]",
5952 "add\t%3, %4, %w3, sxtw #2"
5953 },
5954 /* We assume that DImode is only generated when not optimizing and
5955 that we don't really need 64-bit address offsets. That would
5956 imply an object file with 8GB of code in a single function! */
5957 {
5958 "ldr\t%w3, [%0,%w1,uxtw #2]",
5959 "add\t%3, %4, %w3, sxtw #2"
5960 }
5961 };
5962
5963 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5964
5965 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5966
5967 gcc_assert (index >= 0 && index <= 3);
5968
5969 /* Need to implement table size reduction, by changing the code below. */
5970 output_asm_insn (patterns[index][0], operands);
5971 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5972 snprintf (buf, sizeof (buf),
5973 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5974 output_asm_insn (buf, operands);
5975 output_asm_insn (patterns[index][1], operands);
5976 output_asm_insn ("br\t%3", operands);
5977 assemble_label (asm_out_file, label);
5978 return "";
5979 }
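/* For illustration (hypothetical operand assignment, not from the source):
   with a HImode dispatch table, index 1 above selects the second pattern
   pair and the emitted sequence looks roughly like

	ldrh	w3, [x0, w1, uxtw #1]	// load the scaled table entry
	adr	x4, .LrtxN		// base of the table
	add	x3, x4, w3, sxth #2	// entries are label differences / 4
	br	x3
   .LrtxN:

   where x0 holds the table address, w1 the index, and x3/x4 are the
   scratch operands 3 and 4. */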
5980
5981
5982 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5983 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5984 operator. */
5985
5986 int
5987 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5988 {
5989 if (shift >= 0 && shift <= 3)
5990 {
5991 int size;
5992 for (size = 8; size <= 32; size *= 2)
5993 {
5994 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5995 if (mask == bits << shift)
5996 return size;
5997 }
5998 }
5999 return 0;
6000 }
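/* Illustrative values: aarch64_uxt_size (1, 0x1fe) == 8, since 0x1fe is
   0xff shifted left by one (a UXTB operand); aarch64_uxt_size (0, 0xffff)
   == 16 (UXTH); aarch64_uxt_size (2, 0xff) == 0, because 0xff is not
   0xff, 0xffff or 0xffffffff shifted left by two. */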
6001
6002 /* Constant pools are per-function only when PC-relative
6003 literal loads are enabled or we are using the large memory
6004 model. */
6005
6006 static inline bool
6007 aarch64_can_use_per_function_literal_pools_p (void)
6008 {
6009 return (aarch64_pcrelative_literal_loads
6010 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6011 }
6012
6013 static bool
6014 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6015 {
6016 /* FIXME: In an ideal world this would work similarly
6017 to the logic in aarch64_select_rtx_section, but this
6018 breaks bootstrap in gccgo. For now we work around
6019 this by returning false here. */
6020 return false;
6021 }
6022
6023 /* Select appropriate section for constants depending
6024 on where we place literal pools. */
6025
6026 static section *
6027 aarch64_select_rtx_section (machine_mode mode,
6028 rtx x,
6029 unsigned HOST_WIDE_INT align)
6030 {
6031 if (aarch64_can_use_per_function_literal_pools_p ())
6032 return function_section (current_function_decl);
6033
6034 return default_elf_select_rtx_section (mode, x, align);
6035 }
6036
6037 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6038 void
6039 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6040 HOST_WIDE_INT offset)
6041 {
6042 /* When using per-function literal pools, we must ensure that any code
6043 section is aligned to the minimal instruction length, lest we get
6044 errors from the assembler re "unaligned instructions". */
6045 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6046 ASM_OUTPUT_ALIGN (f, 2);
6047 }
6048
6049 /* Costs. */
6050
6051 /* Helper function for rtx cost calculation. Strip a shift expression
6052 from X. Returns the inner operand if successful, or the original
6053 expression on failure. */
6054 static rtx
6055 aarch64_strip_shift (rtx x)
6056 {
6057 rtx op = x;
6058
6059 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6060 we can convert both to ROR during final output. */
6061 if ((GET_CODE (op) == ASHIFT
6062 || GET_CODE (op) == ASHIFTRT
6063 || GET_CODE (op) == LSHIFTRT
6064 || GET_CODE (op) == ROTATERT
6065 || GET_CODE (op) == ROTATE)
6066 && CONST_INT_P (XEXP (op, 1)))
6067 return XEXP (op, 0);
6068
6069 if (GET_CODE (op) == MULT
6070 && CONST_INT_P (XEXP (op, 1))
6071 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6072 return XEXP (op, 0);
6073
6074 return x;
6075 }
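/* Examples (illustrative): (ashift (reg X1) (const_int 3)) and
   (mult (reg X1) (const_int 8)) both strip to (reg X1), since a multiply
   by a power of two is costed as a shift; (ashift (reg X1) (reg X2)) is
   returned unchanged because the shift amount is not constant. */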
6076
6077 /* Helper function for rtx cost calculation. Strip an extend
6078 expression from X. Returns the inner operand if successful, or the
6079 original expression on failure. We deal with a number of possible
6080 canonicalization variations here. */
6081 static rtx
6082 aarch64_strip_extend (rtx x)
6083 {
6084 rtx op = x;
6085
6086 /* Zero and sign extraction of a widened value. */
6087 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6088 && XEXP (op, 2) == const0_rtx
6089 && GET_CODE (XEXP (op, 0)) == MULT
6090 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
6091 XEXP (op, 1)))
6092 return XEXP (XEXP (op, 0), 0);
6093
6094 /* It can also be represented (for zero-extend) as an AND with an
6095 immediate. */
6096 if (GET_CODE (op) == AND
6097 && GET_CODE (XEXP (op, 0)) == MULT
6098 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6099 && CONST_INT_P (XEXP (op, 1))
6100 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6101 INTVAL (XEXP (op, 1))) != 0)
6102 return XEXP (XEXP (op, 0), 0);
6103
6104 /* Now handle extended register, as this may also have an optional
6105 left shift by 1..4. */
6106 if (GET_CODE (op) == ASHIFT
6107 && CONST_INT_P (XEXP (op, 1))
6108 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6109 op = XEXP (op, 0);
6110
6111 if (GET_CODE (op) == ZERO_EXTEND
6112 || GET_CODE (op) == SIGN_EXTEND)
6113 op = XEXP (op, 0);
6114
6115 if (op != x)
6116 return op;
6117
6118 return x;
6119 }
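/* Examples (illustrative): (zero_extend:DI (reg:SI W1)) strips to the
   inner register, as does (ashift:DI (sign_extend:DI (reg:SI W1))
   (const_int 2)), which is an extended register with a left shift of
   1..4. A shift amount of 5 or more falls outside the extended-register
   form, so such an expression is returned unchanged. */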
6120
6121 /* Return true iff CODE is a shift supported in combination
6122 with arithmetic instructions. */
6123
6124 static bool
6125 aarch64_shift_p (enum rtx_code code)
6126 {
6127 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6128 }
6129
6130 /* Helper function for rtx cost calculation. Calculate the cost of
6131 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6132 Return the calculated cost of the expression, recursing manually into
6133 operands where needed. */
6134
6135 static int
6136 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6137 {
6138 rtx op0, op1;
6139 const struct cpu_cost_table *extra_cost
6140 = aarch64_tune_params.insn_extra_cost;
6141 int cost = 0;
6142 bool compound_p = (outer == PLUS || outer == MINUS);
6143 machine_mode mode = GET_MODE (x);
6144
6145 gcc_checking_assert (code == MULT);
6146
6147 op0 = XEXP (x, 0);
6148 op1 = XEXP (x, 1);
6149
6150 if (VECTOR_MODE_P (mode))
6151 mode = GET_MODE_INNER (mode);
6152
6153 /* Integer multiply/fma. */
6154 if (GET_MODE_CLASS (mode) == MODE_INT)
6155 {
6156 /* The multiply will be canonicalized as a shift, so cost it as such. */
6157 if (aarch64_shift_p (GET_CODE (x))
6158 || (CONST_INT_P (op1)
6159 && exact_log2 (INTVAL (op1)) > 0))
6160 {
6161 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6162 || GET_CODE (op0) == SIGN_EXTEND;
6163 if (speed)
6164 {
6165 if (compound_p)
6166 {
6167 if (REG_P (op1))
6168 /* ARITH + shift-by-register. */
6169 cost += extra_cost->alu.arith_shift_reg;
6170 else if (is_extend)
6171 /* ARITH + extended register. We don't have a cost field
6172 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6173 cost += extra_cost->alu.extend_arith;
6174 else
6175 /* ARITH + shift-by-immediate. */
6176 cost += extra_cost->alu.arith_shift;
6177 }
6178 else
6179 /* LSL (immediate). */
6180 cost += extra_cost->alu.shift;
6181
6182 }
6183 /* Strip extends as we will have costed them in the case above. */
6184 if (is_extend)
6185 op0 = aarch64_strip_extend (op0);
6186
6187 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6188
6189 return cost;
6190 }
6191
6192 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6193 compound and let the below cases handle it. After all, MNEG is a
6194 special-case alias of MSUB. */
6195 if (GET_CODE (op0) == NEG)
6196 {
6197 op0 = XEXP (op0, 0);
6198 compound_p = true;
6199 }
6200
6201 /* Integer multiplies or FMAs have zero/sign extending variants. */
6202 if ((GET_CODE (op0) == ZERO_EXTEND
6203 && GET_CODE (op1) == ZERO_EXTEND)
6204 || (GET_CODE (op0) == SIGN_EXTEND
6205 && GET_CODE (op1) == SIGN_EXTEND))
6206 {
6207 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6208 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6209
6210 if (speed)
6211 {
6212 if (compound_p)
6213 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6214 cost += extra_cost->mult[0].extend_add;
6215 else
6216 /* MUL/SMULL/UMULL. */
6217 cost += extra_cost->mult[0].extend;
6218 }
6219
6220 return cost;
6221 }
6222
6223 /* This is either an integer multiply or a MADD. In both cases
6224 we want to recurse and cost the operands. */
6225 cost += rtx_cost (op0, mode, MULT, 0, speed);
6226 cost += rtx_cost (op1, mode, MULT, 1, speed);
6227
6228 if (speed)
6229 {
6230 if (compound_p)
6231 /* MADD/MSUB. */
6232 cost += extra_cost->mult[mode == DImode].add;
6233 else
6234 /* MUL. */
6235 cost += extra_cost->mult[mode == DImode].simple;
6236 }
6237
6238 return cost;
6239 }
6240 else
6241 {
6242 if (speed)
6243 {
6244 /* Floating-point FMA/FMUL can also support negations of the
6245 operands, unless the rounding mode is upward or downward, in
6246 which case FNMUL is different from FMUL with operand negation. */
6247 bool neg0 = GET_CODE (op0) == NEG;
6248 bool neg1 = GET_CODE (op1) == NEG;
6249 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6250 {
6251 if (neg0)
6252 op0 = XEXP (op0, 0);
6253 if (neg1)
6254 op1 = XEXP (op1, 0);
6255 }
6256
6257 if (compound_p)
6258 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6259 cost += extra_cost->fp[mode == DFmode].fma;
6260 else
6261 /* FMUL/FNMUL. */
6262 cost += extra_cost->fp[mode == DFmode].mult;
6263 }
6264
6265 cost += rtx_cost (op0, mode, MULT, 0, speed);
6266 cost += rtx_cost (op1, mode, MULT, 1, speed);
6267 return cost;
6268 }
6269 }
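/* A descriptive note on the costing above (editorial, for illustration):
   a bare (mult (reg) (reg)) in an integer mode is costed as MUL, while
   the same multiply appearing under an outer PLUS or MINUS is costed as
   the MADD/MSUB accumulate form; (mult (reg) (const_int 4)) is costed as
   a shift (LSL #2), or as an ARITH + shift or extended-register operation
   when it is part of a compound PLUS/MINUS. */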
6270
6271 static int
6272 aarch64_address_cost (rtx x,
6273 machine_mode mode,
6274 addr_space_t as ATTRIBUTE_UNUSED,
6275 bool speed)
6276 {
6277 enum rtx_code c = GET_CODE (x);
6278 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6279 struct aarch64_address_info info;
6280 int cost = 0;
6281 info.shift = 0;
6282
6283 if (!aarch64_classify_address (&info, x, mode, c, false))
6284 {
6285 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6286 {
6287 /* This is a CONST or SYMBOL ref which will be split
6288 in a different way depending on the code model in use.
6289 Cost it through the generic infrastructure. */
6290 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6291 /* Divide through by the cost of one instruction to
6292 bring it to the same units as the address costs. */
6293 cost_symbol_ref /= COSTS_N_INSNS (1);
6294 /* The cost is then the cost of preparing the address,
6295 followed by an immediate (possibly 0) offset. */
6296 return cost_symbol_ref + addr_cost->imm_offset;
6297 }
6298 else
6299 {
6300 /* This is most likely a jump table from a case
6301 statement. */
6302 return addr_cost->register_offset;
6303 }
6304 }
6305
6306 switch (info.type)
6307 {
6308 case ADDRESS_LO_SUM:
6309 case ADDRESS_SYMBOLIC:
6310 case ADDRESS_REG_IMM:
6311 cost += addr_cost->imm_offset;
6312 break;
6313
6314 case ADDRESS_REG_WB:
6315 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6316 cost += addr_cost->pre_modify;
6317 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6318 cost += addr_cost->post_modify;
6319 else
6320 gcc_unreachable ();
6321
6322 break;
6323
6324 case ADDRESS_REG_REG:
6325 cost += addr_cost->register_offset;
6326 break;
6327
6328 case ADDRESS_REG_SXTW:
6329 cost += addr_cost->register_sextend;
6330 break;
6331
6332 case ADDRESS_REG_UXTW:
6333 cost += addr_cost->register_zextend;
6334 break;
6335
6336 default:
6337 gcc_unreachable ();
6338 }
6339
6340
6341 if (info.shift > 0)
6342 {
6343 /* For the sake of calculating the cost of the shifted register
6344 component, we can treat same sized modes in the same way. */
6345 switch (GET_MODE_BITSIZE (mode))
6346 {
6347 case 16:
6348 cost += addr_cost->addr_scale_costs.hi;
6349 break;
6350
6351 case 32:
6352 cost += addr_cost->addr_scale_costs.si;
6353 break;
6354
6355 case 64:
6356 cost += addr_cost->addr_scale_costs.di;
6357 break;
6358
6359 /* We can't tell, or this is a 128-bit vector. */
6360 default:
6361 cost += addr_cost->addr_scale_costs.ti;
6362 break;
6363 }
6364 }
6365
6366 return cost;
6367 }
6368
6369 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6370 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6371 to be taken. */
6372
6373 int
6374 aarch64_branch_cost (bool speed_p, bool predictable_p)
6375 {
6376 /* When optimizing for speed, use the cost of unpredictable branches. */
6377 const struct cpu_branch_cost *branch_costs =
6378 aarch64_tune_params.branch_costs;
6379
6380 if (!speed_p || predictable_p)
6381 return branch_costs->predictable;
6382 else
6383 return branch_costs->unpredictable;
6384 }
6385
6386 /* Return true if the RTX X in mode MODE is a zero or sign extract
6387 usable in an ADD or SUB (extended register) instruction. */
6388 static bool
6389 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6390 {
6391 /* Catch add with a sign extract.
6392 This is add_<optab><mode>_multp2. */
6393 if (GET_CODE (x) == SIGN_EXTRACT
6394 || GET_CODE (x) == ZERO_EXTRACT)
6395 {
6396 rtx op0 = XEXP (x, 0);
6397 rtx op1 = XEXP (x, 1);
6398 rtx op2 = XEXP (x, 2);
6399
6400 if (GET_CODE (op0) == MULT
6401 && CONST_INT_P (op1)
6402 && op2 == const0_rtx
6403 && CONST_INT_P (XEXP (op0, 1))
6404 && aarch64_is_extend_from_extract (mode,
6405 XEXP (op0, 1),
6406 op1))
6407 {
6408 return true;
6409 }
6410 }
6411 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6412 No shift. */
6413 else if (GET_CODE (x) == SIGN_EXTEND
6414 || GET_CODE (x) == ZERO_EXTEND)
6415 return REG_P (XEXP (x, 0));
6416
6417 return false;
6418 }
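/* Examples (illustrative): (sign_extend:DI (reg:SI W1)) returns true and
   corresponds to the extended-register form ADD Xd, Xn, Wm, SXTW. The
   extract form also matches e.g. (sign_extract:DI (mult:DI (reg:DI)
   (const_int 4)) (const_int 34) (const_int 0)), where the extract width
   encodes the extend size plus the shift amount (32 + 2 here, i.e.
   SXTW with LSL #2). */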
6419
6420 static bool
6421 aarch64_frint_unspec_p (unsigned int u)
6422 {
6423 switch (u)
6424 {
6425 case UNSPEC_FRINTZ:
6426 case UNSPEC_FRINTP:
6427 case UNSPEC_FRINTM:
6428 case UNSPEC_FRINTA:
6429 case UNSPEC_FRINTN:
6430 case UNSPEC_FRINTX:
6431 case UNSPEC_FRINTI:
6432 return true;
6433
6434 default:
6435 return false;
6436 }
6437 }
6438
6439 /* Return true iff X is an rtx that will match an extr instruction
6440 i.e. as described in the *extr<mode>5_insn family of patterns.
6441 OP0 and OP1 will be set to the operands of the shifts involved
6442 on success and will be NULL_RTX otherwise. */
6443
6444 static bool
6445 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6446 {
6447 rtx op0, op1;
6448 machine_mode mode = GET_MODE (x);
6449
6450 *res_op0 = NULL_RTX;
6451 *res_op1 = NULL_RTX;
6452
6453 if (GET_CODE (x) != IOR)
6454 return false;
6455
6456 op0 = XEXP (x, 0);
6457 op1 = XEXP (x, 1);
6458
6459 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6460 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6461 {
6462 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6463 if (GET_CODE (op1) == ASHIFT)
6464 std::swap (op0, op1);
6465
6466 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6467 return false;
6468
6469 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6470 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6471
6472 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6473 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6474 {
6475 *res_op0 = XEXP (op0, 0);
6476 *res_op1 = XEXP (op1, 0);
6477 return true;
6478 }
6479 }
6480
6481 return false;
6482 }
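/* Illustrative match: in DImode, (ior (ashift (reg X1) (const_int 48))
   (lshiftrt (reg X2) (const_int 16))) satisfies 48 + 16 == 64, so
   *res_op0 is set to X1 and *res_op1 to X2; the whole expression can be
   emitted as a single EXTR Xd, X1, X2, #16. */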
6483
6484 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6485 storing it in *COST. Result is true if the total cost of the operation
6486 has now been calculated. */
6487 static bool
6488 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6489 {
6490 rtx inner;
6491 rtx comparator;
6492 enum rtx_code cmpcode;
6493
6494 if (COMPARISON_P (op0))
6495 {
6496 inner = XEXP (op0, 0);
6497 comparator = XEXP (op0, 1);
6498 cmpcode = GET_CODE (op0);
6499 }
6500 else
6501 {
6502 inner = op0;
6503 comparator = const0_rtx;
6504 cmpcode = NE;
6505 }
6506
6507 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6508 {
6509 /* Conditional branch. */
6510 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6511 return true;
6512 else
6513 {
6514 if (cmpcode == NE || cmpcode == EQ)
6515 {
6516 if (comparator == const0_rtx)
6517 {
6518 /* TBZ/TBNZ/CBZ/CBNZ. */
6519 if (GET_CODE (inner) == ZERO_EXTRACT)
6520 /* TBZ/TBNZ. */
6521 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6522 ZERO_EXTRACT, 0, speed);
6523 else
6524 /* CBZ/CBNZ. */
6525 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6526
6527 return true;
6528 }
6529 }
6530 else if (cmpcode == LT || cmpcode == GE)
6531 {
6532 /* TBZ/TBNZ. */
6533 if (comparator == const0_rtx)
6534 return true;
6535 }
6536 }
6537 }
6538 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6539 {
6540 /* CCMP. */
6541 if (GET_CODE (op1) == COMPARE)
6542 {
6543 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6544 if (XEXP (op1, 1) == const0_rtx)
6545 *cost += 1;
6546 if (speed)
6547 {
6548 machine_mode mode = GET_MODE (XEXP (op1, 0));
6549 const struct cpu_cost_table *extra_cost
6550 = aarch64_tune_params.insn_extra_cost;
6551
6552 if (GET_MODE_CLASS (mode) == MODE_INT)
6553 *cost += extra_cost->alu.arith;
6554 else
6555 *cost += extra_cost->fp[mode == DFmode].compare;
6556 }
6557 return true;
6558 }
6559
6560 /* It's a conditional operation based on the status flags,
6561 so it must be some flavor of CSEL. */
6562
6563 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6564 if (GET_CODE (op1) == NEG
6565 || GET_CODE (op1) == NOT
6566 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6567 op1 = XEXP (op1, 0);
6568 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6569 {
6570 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6571 op1 = XEXP (op1, 0);
6572 op2 = XEXP (op2, 0);
6573 }
6574
6575 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6576 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6577 return true;
6578 }
6579
6580 /* We don't know what this is, cost all operands. */
6581 return false;
6582 }
6583
6584 /* Check whether X is a bitfield operation of the form shift + extend that
6585 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6586 operand to which the bitfield operation is applied. Otherwise return
6587 NULL_RTX. */
6588
6589 static rtx
6590 aarch64_extend_bitfield_pattern_p (rtx x)
6591 {
6592 rtx_code outer_code = GET_CODE (x);
6593 machine_mode outer_mode = GET_MODE (x);
6594
6595 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6596 && outer_mode != SImode && outer_mode != DImode)
6597 return NULL_RTX;
6598
6599 rtx inner = XEXP (x, 0);
6600 rtx_code inner_code = GET_CODE (inner);
6601 machine_mode inner_mode = GET_MODE (inner);
6602 rtx op = NULL_RTX;
6603
6604 switch (inner_code)
6605 {
6606 case ASHIFT:
6607 if (CONST_INT_P (XEXP (inner, 1))
6608 && (inner_mode == QImode || inner_mode == HImode))
6609 op = XEXP (inner, 0);
6610 break;
6611 case LSHIFTRT:
6612 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6613 && (inner_mode == QImode || inner_mode == HImode))
6614 op = XEXP (inner, 0);
6615 break;
6616 case ASHIFTRT:
6617 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6618 && (inner_mode == QImode || inner_mode == HImode))
6619 op = XEXP (inner, 0);
6620 break;
6621 default:
6622 break;
6623 }
6624
6625 return op;
6626 }
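/* Examples (illustrative): (zero_extend:SI (lshiftrt:HI (reg:HI)
   (const_int 3))) returns the inner register and corresponds to a UBFX,
   while (sign_extend:DI (ashift:QI (reg:QI) (const_int 2))) corresponds
   to an SBFIZ-style bitfield insert-in-zero. */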
6627
6628 /* Return true if the mask and a shift amount from an RTX of the form
6629 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6630 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6631
6632 bool
6633 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6634 {
6635 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6636 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6637 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6638 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6639 }
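/* Worked example (illustrative): for SImode with MASK == 0xff0 and
   SHFT_AMNT == 4, (0xff0 >> 4) + 1 == 0x100 is a power of two and no mask
   bits lie below bit 4, so the combination is accepted and corresponds to
   UBFIZ Wd, Wn, #4, #8. A mask of 0xff1 would be rejected by the final
   low-bits check. */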
6640
6641 /* Calculate the cost of calculating X, storing it in *COST. Result
6642 is true if the total cost of the operation has now been calculated. */
6643 static bool
6644 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6645 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6646 {
6647 rtx op0, op1, op2;
6648 const struct cpu_cost_table *extra_cost
6649 = aarch64_tune_params.insn_extra_cost;
6650 int code = GET_CODE (x);
6651
6652 /* By default, assume that everything has equivalent cost to the
6653 cheapest instruction. Any additional costs are applied as a delta
6654 above this default. */
6655 *cost = COSTS_N_INSNS (1);
6656
6657 switch (code)
6658 {
6659 case SET:
6660 /* The cost depends entirely on the operands to SET. */
6661 *cost = 0;
6662 op0 = SET_DEST (x);
6663 op1 = SET_SRC (x);
6664
6665 switch (GET_CODE (op0))
6666 {
6667 case MEM:
6668 if (speed)
6669 {
6670 rtx address = XEXP (op0, 0);
6671 if (VECTOR_MODE_P (mode))
6672 *cost += extra_cost->ldst.storev;
6673 else if (GET_MODE_CLASS (mode) == MODE_INT)
6674 *cost += extra_cost->ldst.store;
6675 else if (mode == SFmode)
6676 *cost += extra_cost->ldst.storef;
6677 else if (mode == DFmode)
6678 *cost += extra_cost->ldst.stored;
6679
6680 *cost +=
6681 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6682 0, speed));
6683 }
6684
6685 *cost += rtx_cost (op1, mode, SET, 1, speed);
6686 return true;
6687
6688 case SUBREG:
6689 if (! REG_P (SUBREG_REG (op0)))
6690 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6691
6692 /* Fall through. */
6693 case REG:
6694 /* The cost is one per vector-register copied. */
6695 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6696 {
6697 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6698 / GET_MODE_SIZE (V4SImode);
6699 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6700 }
6701 /* const0_rtx is in general free, but we will use an
6702 instruction to set a register to 0. */
6703 else if (REG_P (op1) || op1 == const0_rtx)
6704 {
6705 /* The cost is 1 per register copied. */
6706 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6707 / UNITS_PER_WORD;
6708 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6709 }
6710 else
6711 /* Cost is just the cost of the RHS of the set. */
6712 *cost += rtx_cost (op1, mode, SET, 1, speed);
6713 return true;
6714
6715 case ZERO_EXTRACT:
6716 case SIGN_EXTRACT:
6717 /* Bit-field insertion. Strip any redundant widening of
6718 the RHS to meet the width of the target. */
6719 if (GET_CODE (op1) == SUBREG)
6720 op1 = SUBREG_REG (op1);
6721 if ((GET_CODE (op1) == ZERO_EXTEND
6722 || GET_CODE (op1) == SIGN_EXTEND)
6723 && CONST_INT_P (XEXP (op0, 1))
6724 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6725 >= INTVAL (XEXP (op0, 1))))
6726 op1 = XEXP (op1, 0);
6727
6728 if (CONST_INT_P (op1))
6729 {
6730 /* MOV immediate is assumed to always be cheap. */
6731 *cost = COSTS_N_INSNS (1);
6732 }
6733 else
6734 {
6735 /* BFM. */
6736 if (speed)
6737 *cost += extra_cost->alu.bfi;
6738 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6739 }
6740
6741 return true;
6742
6743 default:
6744 /* We can't make sense of this, assume default cost. */
6745 *cost = COSTS_N_INSNS (1);
6746 return false;
6747 }
6748 return false;
6749
6750 case CONST_INT:
6751 /* If an instruction can incorporate a constant within the
6752 instruction, the instruction's expression avoids calling
6753 rtx_cost() on the constant. If rtx_cost() is called on a
6754 constant, then it is usually because the constant must be
6755 moved into a register by one or more instructions.
6756
6757 The exception is constant 0, which can be expressed
6758 as XZR/WZR and is therefore free. The exception to this is
6759 if we have (set (reg) (const0_rtx)) in which case we must cost
6760 the move. However, we can catch that when we cost the SET, so
6761 we don't need to consider that here. */
6762 if (x == const0_rtx)
6763 *cost = 0;
6764 else
6765 {
6766 /* To an approximation, the cost of building any other constant is
6767 proportional to the number of instructions required to build
6768 that constant. This is true whether we are compiling for SPEED
6769 or otherwise. */
6770 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6771 (NULL_RTX, x, false, mode));
6772 }
6773 return true;
6774
6775 case CONST_DOUBLE:
6776 if (speed)
6777 {
6778 /* mov[df,sf]_aarch64. */
6779 if (aarch64_float_const_representable_p (x))
6780 /* FMOV (scalar immediate). */
6781 *cost += extra_cost->fp[mode == DFmode].fpconst;
6782 else if (!aarch64_float_const_zero_rtx_p (x))
6783 {
6784 /* This will be a load from memory. */
6785 if (mode == DFmode)
6786 *cost += extra_cost->ldst.loadd;
6787 else
6788 *cost += extra_cost->ldst.loadf;
6789 }
6790 else
6791 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6792 or MOV v0.s[0], wzr - neither of which are modeled by the
6793 cost tables. Just use the default cost. */
6794 {
6795 }
6796 }
6797
6798 return true;
6799
6800 case MEM:
6801 if (speed)
6802 {
6803 /* For loads we want the base cost of a load, plus an
6804 approximation for the additional cost of the addressing
6805 mode. */
6806 rtx address = XEXP (x, 0);
6807 if (VECTOR_MODE_P (mode))
6808 *cost += extra_cost->ldst.loadv;
6809 else if (GET_MODE_CLASS (mode) == MODE_INT)
6810 *cost += extra_cost->ldst.load;
6811 else if (mode == SFmode)
6812 *cost += extra_cost->ldst.loadf;
6813 else if (mode == DFmode)
6814 *cost += extra_cost->ldst.loadd;
6815
6816 *cost +=
6817 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6818 0, speed));
6819 }
6820
6821 return true;
6822
6823 case NEG:
6824 op0 = XEXP (x, 0);
6825
6826 if (VECTOR_MODE_P (mode))
6827 {
6828 if (speed)
6829 {
6830 /* FNEG. */
6831 *cost += extra_cost->vect.alu;
6832 }
6833 return false;
6834 }
6835
6836 if (GET_MODE_CLASS (mode) == MODE_INT)
6837 {
6838 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6839 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6840 {
6841 /* CSETM. */
6842 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6843 return true;
6844 }
6845
6846 /* Cost this as SUB wzr, X. */
6847 op0 = CONST0_RTX (mode);
6848 op1 = XEXP (x, 0);
6849 goto cost_minus;
6850 }
6851
6852 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6853 {
6854 /* Support (neg(fma...)) as a single instruction only if
6855 sign of zeros is unimportant. This matches the decision
6856 making in aarch64.md. */
6857 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6858 {
6859 /* FNMADD. */
6860 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6861 return true;
6862 }
6863 if (GET_CODE (op0) == MULT)
6864 {
6865 /* FNMUL. */
6866 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6867 return true;
6868 }
6869 if (speed)
6870 /* FNEG. */
6871 *cost += extra_cost->fp[mode == DFmode].neg;
6872 return false;
6873 }
6874
6875 return false;
6876
6877 case CLRSB:
6878 case CLZ:
6879 if (speed)
6880 {
6881 if (VECTOR_MODE_P (mode))
6882 *cost += extra_cost->vect.alu;
6883 else
6884 *cost += extra_cost->alu.clz;
6885 }
6886
6887 return false;
6888
6889 case COMPARE:
6890 op0 = XEXP (x, 0);
6891 op1 = XEXP (x, 1);
6892
6893 if (op1 == const0_rtx
6894 && GET_CODE (op0) == AND)
6895 {
6896 x = op0;
6897 mode = GET_MODE (op0);
6898 goto cost_logic;
6899 }
6900
6901 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6902 {
6903 /* TODO: A write to the CC flags possibly costs extra; this
6904 needs encoding in the cost tables. */
6905
6906 mode = GET_MODE (op0);
6907 /* ANDS. */
6908 if (GET_CODE (op0) == AND)
6909 {
6910 x = op0;
6911 goto cost_logic;
6912 }
6913
6914 if (GET_CODE (op0) == PLUS)
6915 {
6916 /* ADDS (and CMN alias). */
6917 x = op0;
6918 goto cost_plus;
6919 }
6920
6921 if (GET_CODE (op0) == MINUS)
6922 {
6923 /* SUBS. */
6924 x = op0;
6925 goto cost_minus;
6926 }
6927
6928 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6929 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6930 && CONST_INT_P (XEXP (op0, 2)))
6931 {
6932 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6933 Handle it here directly rather than going to cost_logic
6934 since we know the immediate generated for the TST is valid
6935 so we can avoid creating an intermediate rtx for it only
6936 for costing purposes. */
6937 if (speed)
6938 *cost += extra_cost->alu.logical;
6939
6940 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6941 ZERO_EXTRACT, 0, speed);
6942 return true;
6943 }
6944
6945 if (GET_CODE (op1) == NEG)
6946 {
6947 /* CMN. */
6948 if (speed)
6949 *cost += extra_cost->alu.arith;
6950
6951 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6952 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6953 return true;
6954 }
6955
6956 /* CMP.
6957
6958 Compare can freely swap the order of operands, and
6959 canonicalization puts the more complex operation first.
6960 But the integer MINUS logic expects the shift/extend
6961 operation in op1. */
6962 if (! (REG_P (op0)
6963 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6964 {
6965 op0 = XEXP (x, 1);
6966 op1 = XEXP (x, 0);
6967 }
6968 goto cost_minus;
6969 }
6970
6971 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6972 {
6973 /* FCMP. */
6974 if (speed)
6975 *cost += extra_cost->fp[mode == DFmode].compare;
6976
6977 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6978 {
6979 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6980 /* FCMP supports constant 0.0 for no extra cost. */
6981 return true;
6982 }
6983 return false;
6984 }
6985
6986 if (VECTOR_MODE_P (mode))
6987 {
6988 /* Vector compare. */
6989 if (speed)
6990 *cost += extra_cost->vect.alu;
6991
6992 if (aarch64_float_const_zero_rtx_p (op1))
6993 {
6994 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6995 cost. */
6996 return true;
6997 }
6998 return false;
6999 }
7000 return false;
7001
7002 case MINUS:
7003 {
7004 op0 = XEXP (x, 0);
7005 op1 = XEXP (x, 1);
7006
7007 cost_minus:
7008 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7009
7010 /* Detect valid immediates. */
7011 if ((GET_MODE_CLASS (mode) == MODE_INT
7012 || (GET_MODE_CLASS (mode) == MODE_CC
7013 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7014 && CONST_INT_P (op1)
7015 && aarch64_uimm12_shift (INTVAL (op1)))
7016 {
7017 if (speed)
7018 /* SUB(S) (immediate). */
7019 *cost += extra_cost->alu.arith;
7020 return true;
7021 }
7022
7023 /* Look for SUB (extended register). */
7024 if (aarch64_rtx_arith_op_extract_p (op1, mode))
7025 {
7026 if (speed)
7027 *cost += extra_cost->alu.extend_arith;
7028
7029 op1 = aarch64_strip_extend (op1);
7030 *cost += rtx_cost (op1, VOIDmode,
7031 (enum rtx_code) GET_CODE (op1), 0, speed);
7032 return true;
7033 }
7034
7035 rtx new_op1 = aarch64_strip_extend (op1);
7036
7037 /* Cost this as an FMA-alike operation. */
7038 if ((GET_CODE (new_op1) == MULT
7039 || aarch64_shift_p (GET_CODE (new_op1)))
7040 && code != COMPARE)
7041 {
7042 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7043 (enum rtx_code) code,
7044 speed);
7045 return true;
7046 }
7047
7048 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7049
7050 if (speed)
7051 {
7052 if (VECTOR_MODE_P (mode))
7053 {
7054 /* Vector SUB. */
7055 *cost += extra_cost->vect.alu;
7056 }
7057 else if (GET_MODE_CLASS (mode) == MODE_INT)
7058 {
7059 /* SUB(S). */
7060 *cost += extra_cost->alu.arith;
7061 }
7062 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7063 {
7064 /* FSUB. */
7065 *cost += extra_cost->fp[mode == DFmode].addsub;
7066 }
7067 }
7068 return true;
7069 }
7070
7071 case PLUS:
7072 {
7073 rtx new_op0;
7074
7075 op0 = XEXP (x, 0);
7076 op1 = XEXP (x, 1);
7077
7078 cost_plus:
7079 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7080 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7081 {
7082 /* CSINC. */
7083 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7084 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7085 return true;
7086 }
7087
7088 if (GET_MODE_CLASS (mode) == MODE_INT
7089 && CONST_INT_P (op1)
7090 && aarch64_uimm12_shift (INTVAL (op1)))
7091 {
7092 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7093
7094 if (speed)
7095 /* ADD (immediate). */
7096 *cost += extra_cost->alu.arith;
7097 return true;
7098 }
7099
7100 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7101
7102 /* Look for ADD (extended register). */
7103 if (aarch64_rtx_arith_op_extract_p (op0, mode))
7104 {
7105 if (speed)
7106 *cost += extra_cost->alu.extend_arith;
7107
7108 op0 = aarch64_strip_extend (op0);
7109 *cost += rtx_cost (op0, VOIDmode,
7110 (enum rtx_code) GET_CODE (op0), 0, speed);
7111 return true;
7112 }
7113
7114 /* Strip any extend, leave shifts behind as we will
7115 cost them through mult_cost. */
7116 new_op0 = aarch64_strip_extend (op0);
7117
7118 if (GET_CODE (new_op0) == MULT
7119 || aarch64_shift_p (GET_CODE (new_op0)))
7120 {
7121 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7122 speed);
7123 return true;
7124 }
7125
7126 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7127
7128 if (speed)
7129 {
7130 if (VECTOR_MODE_P (mode))
7131 {
7132 /* Vector ADD. */
7133 *cost += extra_cost->vect.alu;
7134 }
7135 else if (GET_MODE_CLASS (mode) == MODE_INT)
7136 {
7137 /* ADD. */
7138 *cost += extra_cost->alu.arith;
7139 }
7140 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7141 {
7142 /* FADD. */
7143 *cost += extra_cost->fp[mode == DFmode].addsub;
7144 }
7145 }
7146 return true;
7147 }
7148
7149 case BSWAP:
7150 *cost = COSTS_N_INSNS (1);
7151
7152 if (speed)
7153 {
7154 if (VECTOR_MODE_P (mode))
7155 *cost += extra_cost->vect.alu;
7156 else
7157 *cost += extra_cost->alu.rev;
7158 }
7159 return false;
7160
7161 case IOR:
7162 if (aarch_rev16_p (x))
7163 {
7164 *cost = COSTS_N_INSNS (1);
7165
7166 if (speed)
7167 {
7168 if (VECTOR_MODE_P (mode))
7169 *cost += extra_cost->vect.alu;
7170 else
7171 *cost += extra_cost->alu.rev;
7172 }
7173 return true;
7174 }
7175
7176 if (aarch64_extr_rtx_p (x, &op0, &op1))
7177 {
7178 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7179 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7180 if (speed)
7181 *cost += extra_cost->alu.shift;
7182
7183 return true;
7184 }
7185 /* Fall through. */
7186 case XOR:
7187 case AND:
7188 cost_logic:
7189 op0 = XEXP (x, 0);
7190 op1 = XEXP (x, 1);
7191
7192 if (VECTOR_MODE_P (mode))
7193 {
7194 if (speed)
7195 *cost += extra_cost->vect.alu;
7196 return true;
7197 }
7198
7199 if (code == AND
7200 && GET_CODE (op0) == MULT
7201 && CONST_INT_P (XEXP (op0, 1))
7202 && CONST_INT_P (op1)
7203 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7204 INTVAL (op1)) != 0)
7205 {
7206 /* This is a UBFM/SBFM. */
7207 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7208 if (speed)
7209 *cost += extra_cost->alu.bfx;
7210 return true;
7211 }
7212
7213 if (GET_MODE_CLASS (mode) == MODE_INT)
7214 {
7215 if (CONST_INT_P (op1))
7216 {
7217 /* We have a mask + shift version of a UBFIZ
7218 i.e. the *andim_ashift<mode>_bfiz pattern. */
7219 if (GET_CODE (op0) == ASHIFT
7220 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7221 XEXP (op0, 1)))
7222 {
7223 *cost += rtx_cost (XEXP (op0, 0), mode,
7224 (enum rtx_code) code, 0, speed);
7225 if (speed)
7226 *cost += extra_cost->alu.bfx;
7227
7228 return true;
7229 }
7230 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7231 {
7232 /* We possibly get the immediate for free; this is not
7233 modelled. */
7234 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7235 if (speed)
7236 *cost += extra_cost->alu.logical;
7237
7238 return true;
7239 }
7240 }
7241 else
7242 {
7243 rtx new_op0 = op0;
7244
7245 /* Handle ORN, EON, or BIC. */
7246 if (GET_CODE (op0) == NOT)
7247 op0 = XEXP (op0, 0);
7248
7249 new_op0 = aarch64_strip_shift (op0);
7250
7251 /* If we had a shift on op0 then this is a logical-shift-
7252 by-register/immediate operation. Otherwise, this is just
7253 a logical operation. */
7254 if (speed)
7255 {
7256 if (new_op0 != op0)
7257 {
7258 /* Shift by immediate. */
7259 if (CONST_INT_P (XEXP (op0, 1)))
7260 *cost += extra_cost->alu.log_shift;
7261 else
7262 *cost += extra_cost->alu.log_shift_reg;
7263 }
7264 else
7265 *cost += extra_cost->alu.logical;
7266 }
7267
7268 /* In both cases we want to cost both operands. */
7269 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7270 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7271
7272 return true;
7273 }
7274 }
7275 return false;
7276
7277 case NOT:
7278 x = XEXP (x, 0);
7279 op0 = aarch64_strip_shift (x);
7280
7281 if (VECTOR_MODE_P (mode))
7282 {
7283 /* Vector NOT. */
7284 *cost += extra_cost->vect.alu;
7285 return false;
7286 }
7287
7288 /* MVN-shifted-reg. */
7289 if (op0 != x)
7290 {
7291 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7292
7293 if (speed)
7294 *cost += extra_cost->alu.log_shift;
7295
7296 return true;
7297 }
7298 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7299 Handle the second form here taking care that 'a' in the above can
7300 be a shift. */
7301 else if (GET_CODE (op0) == XOR)
7302 {
7303 rtx newop0 = XEXP (op0, 0);
7304 rtx newop1 = XEXP (op0, 1);
7305 rtx op0_stripped = aarch64_strip_shift (newop0);
7306
7307 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7308 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7309
7310 if (speed)
7311 {
7312 if (op0_stripped != newop0)
7313 *cost += extra_cost->alu.log_shift;
7314 else
7315 *cost += extra_cost->alu.logical;
7316 }
7317
7318 return true;
7319 }
7320 /* MVN. */
7321 if (speed)
7322 *cost += extra_cost->alu.logical;
7323
7324 return false;
7325
7326 case ZERO_EXTEND:
7327
7328 op0 = XEXP (x, 0);
7329 /* If a value is written in SI mode, then zero extended to DI
7330 mode, the operation will in general be free as a write to
7331 a 'w' register implicitly zeroes the upper bits of an 'x'
7332 register. However, if this is
7333
7334 (set (reg) (zero_extend (reg)))
7335
7336 we must cost the explicit register move. */
7337 if (mode == DImode
7338 && GET_MODE (op0) == SImode
7339 && outer == SET)
7340 {
7341 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7342
7343 /* If OP_COST is non-zero, then the cost of the zero extend
7344 is effectively the cost of the inner operation. Otherwise
7345 we have a MOV instruction and we take the cost from the MOV
7346 itself. This is true independently of whether we are
7347 optimizing for space or time. */
7348 if (op_cost)
7349 *cost = op_cost;
7350
7351 return true;
7352 }
7353 else if (MEM_P (op0))
7354 {
7355 /* All loads can zero extend to any size for free. */
7356 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7357 return true;
7358 }
7359
7360 op0 = aarch64_extend_bitfield_pattern_p (x);
7361 if (op0)
7362 {
7363 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7364 if (speed)
7365 *cost += extra_cost->alu.bfx;
7366 return true;
7367 }
7368
7369 if (speed)
7370 {
7371 if (VECTOR_MODE_P (mode))
7372 {
7373 /* UMOV. */
7374 *cost += extra_cost->vect.alu;
7375 }
7376 else
7377 {
7378 /* We generate an AND instead of UXTB/UXTH. */
7379 *cost += extra_cost->alu.logical;
7380 }
7381 }
7382 return false;
7383
7384 case SIGN_EXTEND:
7385 if (MEM_P (XEXP (x, 0)))
7386 {
7387 /* LDRSH. */
7388 if (speed)
7389 {
7390 rtx address = XEXP (XEXP (x, 0), 0);
7391 *cost += extra_cost->ldst.load_sign_extend;
7392
7393 *cost +=
7394 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7395 0, speed));
7396 }
7397 return true;
7398 }
7399
7400 op0 = aarch64_extend_bitfield_pattern_p (x);
7401 if (op0)
7402 {
7403 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7404 if (speed)
7405 *cost += extra_cost->alu.bfx;
7406 return true;
7407 }
7408
7409 if (speed)
7410 {
7411 if (VECTOR_MODE_P (mode))
7412 *cost += extra_cost->vect.alu;
7413 else
7414 *cost += extra_cost->alu.extend;
7415 }
7416 return false;
7417
7418 case ASHIFT:
7419 op0 = XEXP (x, 0);
7420 op1 = XEXP (x, 1);
7421
7422 if (CONST_INT_P (op1))
7423 {
7424 if (speed)
7425 {
7426 if (VECTOR_MODE_P (mode))
7427 {
7428 /* Vector shift (immediate). */
7429 *cost += extra_cost->vect.alu;
7430 }
7431 else
7432 {
7433 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7434 aliases. */
7435 *cost += extra_cost->alu.shift;
7436 }
7437 }
7438
7439 /* We can incorporate zero/sign extend for free. */
7440 if (GET_CODE (op0) == ZERO_EXTEND
7441 || GET_CODE (op0) == SIGN_EXTEND)
7442 op0 = XEXP (op0, 0);
7443
7444 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7445 return true;
7446 }
7447 else
7448 {
7449 if (speed)
7450 {
7451 if (VECTOR_MODE_P (mode))
7452 {
7453 /* Vector shift (register). */
7454 *cost += extra_cost->vect.alu;
7455 }
7456 else
7457 {
7458 /* LSLV. */
7459 *cost += extra_cost->alu.shift_reg;
7460 }
7461 }
7462 return false; /* All arguments need to be in registers. */
7463 }
7464
7465 case ROTATE:
7466 case ROTATERT:
7467 case LSHIFTRT:
7468 case ASHIFTRT:
7469 op0 = XEXP (x, 0);
7470 op1 = XEXP (x, 1);
7471
7472 if (CONST_INT_P (op1))
7473 {
7474 /* ASR (immediate) and friends. */
7475 if (speed)
7476 {
7477 if (VECTOR_MODE_P (mode))
7478 *cost += extra_cost->vect.alu;
7479 else
7480 *cost += extra_cost->alu.shift;
7481 }
7482
7483 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7484 return true;
7485 }
7486 else
7487 {
7488
7489 /* ASR (register) and friends. */
7490 if (speed)
7491 {
7492 if (VECTOR_MODE_P (mode))
7493 *cost += extra_cost->vect.alu;
7494 else
7495 *cost += extra_cost->alu.shift_reg;
7496 }
7497 return false; /* All arguments need to be in registers. */
7498 }
7499
7500 case SYMBOL_REF:
7501
7502 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7503 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7504 {
7505 /* LDR. */
7506 if (speed)
7507 *cost += extra_cost->ldst.load;
7508 }
7509 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7510 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7511 {
7512 /* ADRP, followed by ADD. */
7513 *cost += COSTS_N_INSNS (1);
7514 if (speed)
7515 *cost += 2 * extra_cost->alu.arith;
7516 }
7517 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7518 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7519 {
7520 /* ADR. */
7521 if (speed)
7522 *cost += extra_cost->alu.arith;
7523 }
7524
7525 if (flag_pic)
7526 {
7527 /* One extra load instruction, after accessing the GOT. */
7528 *cost += COSTS_N_INSNS (1);
7529 if (speed)
7530 *cost += extra_cost->ldst.load;
7531 }
7532 return true;
7533
7534 case HIGH:
7535 case LO_SUM:
7536 /* ADRP/ADD (immediate). */
7537 if (speed)
7538 *cost += extra_cost->alu.arith;
7539 return true;
7540
7541 case ZERO_EXTRACT:
7542 case SIGN_EXTRACT:
7543 /* UBFX/SBFX. */
7544 if (speed)
7545 {
7546 if (VECTOR_MODE_P (mode))
7547 *cost += extra_cost->vect.alu;
7548 else
7549 *cost += extra_cost->alu.bfx;
7550 }
7551
7552 /* We can trust that the immediates used will be correct (there
7553 are no by-register forms), so we need only cost op0. */
7554 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7555 return true;
7556
7557 case MULT:
7558 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7559 /* aarch64_rtx_mult_cost always handles recursion to its
7560 operands. */
7561 return true;
7562
7563 case MOD:
7564 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7565 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7566 an unconditional negate. This case should only ever be reached through
7567 the set_smod_pow2_cheap check in expmed.c. */
7568 if (CONST_INT_P (XEXP (x, 1))
7569 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7570 && (mode == SImode || mode == DImode))
7571 {
7572 /* We expand to 4 instructions. Reset the baseline. */
7573 *cost = COSTS_N_INSNS (4);
7574
7575 if (speed)
7576 *cost += 2 * extra_cost->alu.logical
7577 + 2 * extra_cost->alu.arith;
7578
7579 return true;
7580 }
7581
7582 /* Fall-through. */
7583 case UMOD:
7584 if (speed)
7585 {
7586 /* Slightly prefer UMOD over SMOD. */
7587 if (VECTOR_MODE_P (mode))
7588 *cost += extra_cost->vect.alu;
7589 else if (GET_MODE_CLASS (mode) == MODE_INT)
7590 *cost += (extra_cost->mult[mode == DImode].add
7591 + extra_cost->mult[mode == DImode].idiv
7592 + (code == MOD ? 1 : 0));
7593 }
7594 return false; /* All arguments need to be in registers. */
7595
7596 case DIV:
7597 case UDIV:
7598 case SQRT:
7599 if (speed)
7600 {
7601 if (VECTOR_MODE_P (mode))
7602 *cost += extra_cost->vect.alu;
7603 else if (GET_MODE_CLASS (mode) == MODE_INT)
7604 /* There is no integer SQRT, so only DIV and UDIV can get
7605 here. */
7606 *cost += (extra_cost->mult[mode == DImode].idiv
7607 /* Slightly prefer UDIV over SDIV. */
7608 + (code == DIV ? 1 : 0));
7609 else
7610 *cost += extra_cost->fp[mode == DFmode].div;
7611 }
7612 return false; /* All arguments need to be in registers. */
7613
7614 case IF_THEN_ELSE:
7615 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7616 XEXP (x, 2), cost, speed);
7617
7618 case EQ:
7619 case NE:
7620 case GT:
7621 case GTU:
7622 case LT:
7623 case LTU:
7624 case GE:
7625 case GEU:
7626 case LE:
7627 case LEU:
7628
7629 return false; /* All arguments must be in registers. */
7630
7631 case FMA:
7632 op0 = XEXP (x, 0);
7633 op1 = XEXP (x, 1);
7634 op2 = XEXP (x, 2);
7635
7636 if (speed)
7637 {
7638 if (VECTOR_MODE_P (mode))
7639 *cost += extra_cost->vect.alu;
7640 else
7641 *cost += extra_cost->fp[mode == DFmode].fma;
7642 }
7643
7644 /* FMSUB, FNMADD, and FNMSUB are free. */
7645 if (GET_CODE (op0) == NEG)
7646 op0 = XEXP (op0, 0);
7647
7648 if (GET_CODE (op2) == NEG)
7649 op2 = XEXP (op2, 0);
7650
7651 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7652 and the by-element operand as operand 0. */
7653 if (GET_CODE (op1) == NEG)
7654 op1 = XEXP (op1, 0);
7655
7656 /* Catch vector-by-element operations. The by-element operand can
7657 either be (vec_duplicate (vec_select (x))) or just
7658 (vec_select (x)), depending on whether we are multiplying by
7659 a vector or a scalar.
7660
7661 Canonicalization is not very good in these cases: FMA4 will put the
7662 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7663 if (GET_CODE (op0) == VEC_DUPLICATE)
7664 op0 = XEXP (op0, 0);
7665 else if (GET_CODE (op1) == VEC_DUPLICATE)
7666 op1 = XEXP (op1, 0);
7667
7668 if (GET_CODE (op0) == VEC_SELECT)
7669 op0 = XEXP (op0, 0);
7670 else if (GET_CODE (op1) == VEC_SELECT)
7671 op1 = XEXP (op1, 0);
7672
7673 /* If the remaining parameters are not registers,
7674 get the cost to put them into registers. */
7675 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7676 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7677 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7678 return true;
7679
7680 case FLOAT:
7681 case UNSIGNED_FLOAT:
7682 if (speed)
7683 *cost += extra_cost->fp[mode == DFmode].fromint;
7684 return false;
7685
7686 case FLOAT_EXTEND:
7687 if (speed)
7688 {
7689 if (VECTOR_MODE_P (mode))
7690 {
7691 /* Vector widening conversion. */
7692 *cost += extra_cost->vect.alu;
7693 }
7694 else
7695 *cost += extra_cost->fp[mode == DFmode].widen;
7696 }
7697 return false;
7698
7699 case FLOAT_TRUNCATE:
7700 if (speed)
7701 {
7702 if (VECTOR_MODE_P (mode))
7703 {
7704 /* Vector conversion. */
7705 *cost += extra_cost->vect.alu;
7706 }
7707 else
7708 *cost += extra_cost->fp[mode == DFmode].narrow;
7709 }
7710 return false;
7711
7712 case FIX:
7713 case UNSIGNED_FIX:
7714 x = XEXP (x, 0);
7715 /* Strip the rounding part. They will all be implemented
7716 by the fcvt* family of instructions anyway. */
7717 if (GET_CODE (x) == UNSPEC)
7718 {
7719 unsigned int uns_code = XINT (x, 1);
7720
7721 if (uns_code == UNSPEC_FRINTA
7722 || uns_code == UNSPEC_FRINTM
7723 || uns_code == UNSPEC_FRINTN
7724 || uns_code == UNSPEC_FRINTP
7725 || uns_code == UNSPEC_FRINTZ)
7726 x = XVECEXP (x, 0, 0);
7727 }
7728
7729 if (speed)
7730 {
7731 if (VECTOR_MODE_P (mode))
7732 *cost += extra_cost->vect.alu;
7733 else
7734 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7735 }
7736
7737 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7738 fixed-point fcvt. */
7739 if (GET_CODE (x) == MULT
7740 && ((VECTOR_MODE_P (mode)
7741 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7742 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7743 {
7744 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7745 0, speed);
7746 return true;
7747 }
7748
7749 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7750 return true;
7751
7752 case ABS:
7753 if (VECTOR_MODE_P (mode))
7754 {
7755 /* ABS (vector). */
7756 if (speed)
7757 *cost += extra_cost->vect.alu;
7758 }
7759 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7760 {
7761 op0 = XEXP (x, 0);
7762
7763 /* FABD, which is analogous to FADD. */
7764 if (GET_CODE (op0) == MINUS)
7765 {
7766 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7767 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7768 if (speed)
7769 *cost += extra_cost->fp[mode == DFmode].addsub;
7770
7771 return true;
7772 }
7773 /* Simple FABS is analogous to FNEG. */
7774 if (speed)
7775 *cost += extra_cost->fp[mode == DFmode].neg;
7776 }
7777 else
7778 {
7779 /* Integer ABS will either be split into
7780 two arithmetic instructions, or will be an ABS
7781 (scalar), which we don't model. */
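/* An illustrative (not authoritative) two-instruction form would be a
   compare against zero followed by a conditional negate, e.g.
       cmp     w0, 0
       csneg   w0, w0, w0, ge  */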
7782 *cost = COSTS_N_INSNS (2);
7783 if (speed)
7784 *cost += 2 * extra_cost->alu.arith;
7785 }
7786 return false;
7787
7788 case SMAX:
7789 case SMIN:
7790 if (speed)
7791 {
7792 if (VECTOR_MODE_P (mode))
7793 *cost += extra_cost->vect.alu;
7794 else
7795 {
7796 /* FMAXNM/FMINNM/FMAX/FMIN.
7797 TODO: This may not be accurate for all implementations, but
7798 we do not model this in the cost tables. */
7799 *cost += extra_cost->fp[mode == DFmode].addsub;
7800 }
7801 }
7802 return false;
7803
7804 case UNSPEC:
7805 /* The floating point round to integer frint* instructions. */
7806 if (aarch64_frint_unspec_p (XINT (x, 1)))
7807 {
7808 if (speed)
7809 *cost += extra_cost->fp[mode == DFmode].roundint;
7810
7811 return false;
7812 }
7813
7814 if (XINT (x, 1) == UNSPEC_RBIT)
7815 {
7816 if (speed)
7817 *cost += extra_cost->alu.rev;
7818
7819 return false;
7820 }
7821 break;
7822
7823 case TRUNCATE:
7824
7825 /* Decompose <su>muldi3_highpart. */
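/* The condition below matches, piece by piece, an RTL expression of the
   overall shape
     (truncate:DI
       (lshiftrt:TI
         (mult:TI (ANY_EXTEND:TI (reg:DI)) (ANY_EXTEND:TI (reg:DI)))
         (const_int 64)))
   i.e. the high half of a widening 64x64->128-bit multiply.  */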
7826 if (/* (truncate:DI */
7827 mode == DImode
7828 /* (lshiftrt:TI */
7829 && GET_MODE (XEXP (x, 0)) == TImode
7830 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7831 /* (mult:TI */
7832 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7833 /* (ANY_EXTEND:TI (reg:DI))
7834 (ANY_EXTEND:TI (reg:DI))) */
7835 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7836 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7837 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7838 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7839 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7840 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7841 /* (const_int 64) */
7842 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7843 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7844 {
7845 /* UMULH/SMULH. */
7846 if (speed)
7847 *cost += extra_cost->mult[mode == DImode].extend;
7848 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7849 mode, MULT, 0, speed);
7850 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7851 mode, MULT, 1, speed);
7852 return true;
7853 }
7854
7855 /* Fall through. */
7856 default:
7857 break;
7858 }
7859
7860 if (dump_file
7861 && flag_aarch64_verbose_cost)
7862 fprintf (dump_file,
7863 "\nFailed to cost RTX. Assuming default cost.\n");
7864
7865 return true;
7866 }
7867
7868 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7869 calculated for X. This cost is stored in *COST. Returns true
7870 if the total cost of X was calculated. */
7871 static bool
7872 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7873 int param, int *cost, bool speed)
7874 {
7875 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7876
7877 if (dump_file
7878 && flag_aarch64_verbose_cost)
7879 {
7880 print_rtl_single (dump_file, x);
7881 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7882 speed ? "Hot" : "Cold",
7883 *cost, result ? "final" : "partial");
7884 }
7885
7886 return result;
7887 }
7888
7889 static int
7890 aarch64_register_move_cost (machine_mode mode,
7891 reg_class_t from_i, reg_class_t to_i)
7892 {
7893 enum reg_class from = (enum reg_class) from_i;
7894 enum reg_class to = (enum reg_class) to_i;
7895 const struct cpu_regmove_cost *regmove_cost
7896 = aarch64_tune_params.regmove_cost;
7897
7898 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7899 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7900 to = GENERAL_REGS;
7901
7902 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7903 from = GENERAL_REGS;
7904
7905 /* Moving between GPR and stack cost is the same as GP2GP. */
7906 if ((from == GENERAL_REGS && to == STACK_REG)
7907 || (to == GENERAL_REGS && from == STACK_REG))
7908 return regmove_cost->GP2GP;
7909
7910 /* To/From the stack register, we move via the gprs. */
7911 if (to == STACK_REG || from == STACK_REG)
7912 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7913 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7914
7915 if (GET_MODE_SIZE (mode) == 16)
7916 {
7917 /* 128-bit operations on general registers require 2 instructions. */
7918 if (from == GENERAL_REGS && to == GENERAL_REGS)
7919 return regmove_cost->GP2GP * 2;
7920 else if (from == GENERAL_REGS)
7921 return regmove_cost->GP2FP * 2;
7922 else if (to == GENERAL_REGS)
7923 return regmove_cost->FP2GP * 2;
7924
7925 /* When AdvSIMD instructions are disabled it is not possible to move
7926 a 128-bit value directly between Q registers. This is handled in
7927 secondary reload. A general register is used as a scratch to move
7928 the upper DI value and the lower DI value is moved directly,
7929 hence the cost is the sum of three moves. */
7930 if (! TARGET_SIMD)
7931 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7932
7933 return regmove_cost->FP2FP;
7934 }
7935
7936 if (from == GENERAL_REGS && to == GENERAL_REGS)
7937 return regmove_cost->GP2GP;
7938 else if (from == GENERAL_REGS)
7939 return regmove_cost->GP2FP;
7940 else if (to == GENERAL_REGS)
7941 return regmove_cost->FP2GP;
7942
7943 return regmove_cost->FP2FP;
7944 }
7945
7946 static int
7947 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7948 reg_class_t rclass ATTRIBUTE_UNUSED,
7949 bool in ATTRIBUTE_UNUSED)
7950 {
7951 return aarch64_tune_params.memmov_cost;
7952 }
7953
7954 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7955 to optimize 1.0/sqrt. */
7956
7957 static bool
7958 use_rsqrt_p (machine_mode mode)
7959 {
7960 return (!flag_trapping_math
7961 && flag_unsafe_math_optimizations
7962 && ((aarch64_tune_params.approx_modes->recip_sqrt
7963 & AARCH64_APPROX_MODE (mode))
7964 || flag_mrecip_low_precision_sqrt));
7965 }
7966
7967 /* Function to decide when to use the approximate reciprocal square root
7968 builtin. */
7969
7970 static tree
7971 aarch64_builtin_reciprocal (tree fndecl)
7972 {
7973 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7974
7975 if (!use_rsqrt_p (mode))
7976 return NULL_TREE;
7977 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7978 }
7979
7980 typedef rtx (*rsqrte_type) (rtx, rtx);
7981
7982 /* Select reciprocal square root initial estimate insn depending on machine
7983 mode. */
7984
7985 static rsqrte_type
7986 get_rsqrte_type (machine_mode mode)
7987 {
7988 switch (mode)
7989 {
7990 case DFmode: return gen_aarch64_rsqrtedf;
7991 case SFmode: return gen_aarch64_rsqrtesf;
7992 case V2DFmode: return gen_aarch64_rsqrtev2df;
7993 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7994 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7995 default: gcc_unreachable ();
7996 }
7997 }
7998
7999 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8000
8001 /* Select reciprocal square root series step insn depending on machine mode. */
8002
8003 static rsqrts_type
8004 get_rsqrts_type (machine_mode mode)
8005 {
8006 switch (mode)
8007 {
8008 case DFmode: return gen_aarch64_rsqrtsdf;
8009 case SFmode: return gen_aarch64_rsqrtssf;
8010 case V2DFmode: return gen_aarch64_rsqrtsv2df;
8011 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
8012 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
8013 default: gcc_unreachable ();
8014 }
8015 }
8016
8017 /* Emit instruction sequence to compute either the approximate square root
8018 or its approximate reciprocal, depending on the flag RECP, and return
8019 whether the sequence was emitted or not. */
8020
8021 bool
8022 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8023 {
8024 machine_mode mode = GET_MODE (dst);
8025
8026 if (GET_MODE_INNER (mode) == HFmode)
8027 {
8028 gcc_assert (!recp);
8029 return false;
8030 }
8031
8032 machine_mode mmsk
8033 = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
8034 GET_MODE_NUNITS (mode));
8035 if (!recp)
8036 {
8037 if (!(flag_mlow_precision_sqrt
8038 || (aarch64_tune_params.approx_modes->sqrt
8039 & AARCH64_APPROX_MODE (mode))))
8040 return false;
8041
8042 if (flag_finite_math_only
8043 || flag_trapping_math
8044 || !flag_unsafe_math_optimizations
8045 || optimize_function_for_size_p (cfun))
8046 return false;
8047 }
8048 else
8049 /* Caller assumes we cannot fail. */
8050 gcc_assert (use_rsqrt_p (mode));
8051
8052
8053 rtx xmsk = gen_reg_rtx (mmsk);
8054 if (!recp)
8055 /* When calculating the approximate square root, compare the
8056 argument with 0.0 and create a mask. */
8057 emit_insn (gen_rtx_SET (xmsk,
8058 gen_rtx_NEG (mmsk,
8059 gen_rtx_EQ (mmsk, src,
8060 CONST0_RTX (mode)))));
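/* The resulting mask has all bits set in lanes where SRC is 0.0 and is zero
   elsewhere; it is used below to force the result to 0.0 for a 0.0 input,
   since the reciprocal square root estimate of 0.0 is infinite.  */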
8061
8062 /* Estimate the approximate reciprocal square root. */
8063 rtx xdst = gen_reg_rtx (mode);
8064 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8065
8066 /* Iterate over the series twice for SF and thrice for DF. */
8067 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8068
8069 /* Optionally run one fewer iteration of the series for faster performance,
8070 at the cost of some accuracy. */
8071 if ((recp && flag_mrecip_low_precision_sqrt)
8072 || (!recp && flag_mlow_precision_sqrt))
8073 iterations--;
8074
8075 /* Iterate over the series to calculate the approximate reciprocal square
8076 root. */
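/* This loop is Newton's method for 1/sqrt(src): each FRSQRTS step computes
   (3 - a*b) / 2, giving the update
       x1   = (3 - src * xdst*xdst) / 2
       xdst = xdst * x1
   where the final multiply by x1 is deferred to the finalization below
   (after multiplying by src when computing sqrt rather than rsqrt).  */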
8077 rtx x1 = gen_reg_rtx (mode);
8078 while (iterations--)
8079 {
8080 rtx x2 = gen_reg_rtx (mode);
8081 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8082
8083 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8084
8085 if (iterations > 0)
8086 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8087 }
8088
8089 if (!recp)
8090 {
8091 /* Qualify the approximate reciprocal square root when the argument is
8092 0.0 by squashing the intermediate result to 0.0. */
8093 rtx xtmp = gen_reg_rtx (mmsk);
8094 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8095 gen_rtx_SUBREG (mmsk, xdst, 0)));
8096 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8097
8098 /* Calculate the approximate square root. */
8099 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8100 }
8101
8102 /* Finalize the approximation. */
8103 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8104
8105 return true;
8106 }
8107
8108 typedef rtx (*recpe_type) (rtx, rtx);
8109
8110 /* Select reciprocal initial estimate insn depending on machine mode. */
8111
8112 static recpe_type
8113 get_recpe_type (machine_mode mode)
8114 {
8115 switch (mode)
8116 {
8117 case SFmode: return (gen_aarch64_frecpesf);
8118 case V2SFmode: return (gen_aarch64_frecpev2sf);
8119 case V4SFmode: return (gen_aarch64_frecpev4sf);
8120 case DFmode: return (gen_aarch64_frecpedf);
8121 case V2DFmode: return (gen_aarch64_frecpev2df);
8122 default: gcc_unreachable ();
8123 }
8124 }
8125
8126 typedef rtx (*recps_type) (rtx, rtx, rtx);
8127
8128 /* Select reciprocal series step insn depending on machine mode. */
8129
8130 static recps_type
8131 get_recps_type (machine_mode mode)
8132 {
8133 switch (mode)
8134 {
8135 case SFmode: return (gen_aarch64_frecpssf);
8136 case V2SFmode: return (gen_aarch64_frecpsv2sf);
8137 case V4SFmode: return (gen_aarch64_frecpsv4sf);
8138 case DFmode: return (gen_aarch64_frecpsdf);
8139 case V2DFmode: return (gen_aarch64_frecpsv2df);
8140 default: gcc_unreachable ();
8141 }
8142 }
8143
8144 /* Emit the instruction sequence to compute the approximation for the division
8145 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8146
8147 bool
8148 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8149 {
8150 machine_mode mode = GET_MODE (quo);
8151
8152 if (GET_MODE_INNER (mode) == HFmode)
8153 return false;
8154
8155 bool use_approx_division_p = (flag_mlow_precision_div
8156 || (aarch64_tune_params.approx_modes->division
8157 & AARCH64_APPROX_MODE (mode)));
8158
8159 if (!flag_finite_math_only
8160 || flag_trapping_math
8161 || !flag_unsafe_math_optimizations
8162 || optimize_function_for_size_p (cfun)
8163 || !use_approx_division_p)
8164 return false;
8165
8166 /* Estimate the approximate reciprocal. */
8167 rtx xrcp = gen_reg_rtx (mode);
8168 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8169
8170 /* Iterate over the series twice for SF and thrice for DF. */
8171 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8172
8173 /* Optionally run one fewer iteration of the series for faster performance,
8174 at the cost of some accuracy. */
8175 if (flag_mlow_precision_div)
8176 iterations--;
8177
8178 /* Iterate over the series to calculate the approximate reciprocal. */
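/* This loop is Newton's method for 1/den: each FRECPS step computes
   (2 - a*b), giving the update
       xtmp = 2 - den * xrcp
       xrcp = xrcp * xtmp
   where the final multiply by xtmp is deferred to the finalization below
   (after optionally scaling by NUM).  */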
8179 rtx xtmp = gen_reg_rtx (mode);
8180 while (iterations--)
8181 {
8182 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8183
8184 if (iterations > 0)
8185 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8186 }
8187
8188 if (num != CONST1_RTX (mode))
8189 {
8190 /* As the approximate reciprocal of DEN is already calculated, only
8191 calculate the approximate division when NUM is not 1.0. */
8192 rtx xnum = force_reg (mode, num);
8193 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8194 }
8195
8196 /* Finalize the approximation. */
8197 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8198 return true;
8199 }
8200
8201 /* Return the number of instructions that can be issued per cycle. */
8202 static int
8203 aarch64_sched_issue_rate (void)
8204 {
8205 return aarch64_tune_params.issue_rate;
8206 }
8207
8208 static int
8209 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8210 {
8211 int issue_rate = aarch64_sched_issue_rate ();
8212
8213 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8214 }
8215
8216
8217 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8218 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8219 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8220
8221 static int
8222 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8223 int ready_index)
8224 {
8225 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8226 }
8227
8228
8229 /* Vectorizer cost model target hooks. */
8230
8231 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8232 static int
8233 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8234 tree vectype,
8235 int misalign ATTRIBUTE_UNUSED)
8236 {
8237 unsigned elements;
8238 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8239 bool fp = false;
8240
8241 if (vectype != NULL)
8242 fp = FLOAT_TYPE_P (vectype);
8243
8244 switch (type_of_cost)
8245 {
8246 case scalar_stmt:
8247 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8248
8249 case scalar_load:
8250 return costs->scalar_load_cost;
8251
8252 case scalar_store:
8253 return costs->scalar_store_cost;
8254
8255 case vector_stmt:
8256 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8257
8258 case vector_load:
8259 return costs->vec_align_load_cost;
8260
8261 case vector_store:
8262 return costs->vec_store_cost;
8263
8264 case vec_to_scalar:
8265 return costs->vec_to_scalar_cost;
8266
8267 case scalar_to_vec:
8268 return costs->scalar_to_vec_cost;
8269
8270 case unaligned_load:
8271 return costs->vec_unalign_load_cost;
8272
8273 case unaligned_store:
8274 return costs->vec_unalign_store_cost;
8275
8276 case cond_branch_taken:
8277 return costs->cond_taken_branch_cost;
8278
8279 case cond_branch_not_taken:
8280 return costs->cond_not_taken_branch_cost;
8281
8282 case vec_perm:
8283 return costs->vec_permute_cost;
8284
8285 case vec_promote_demote:
8286 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8287
8288 case vec_construct:
8289 elements = TYPE_VECTOR_SUBPARTS (vectype);
8290 return elements / 2 + 1;
8291
8292 default:
8293 gcc_unreachable ();
8294 }
8295 }
8296
8297 /* Implement targetm.vectorize.add_stmt_cost. */
8298 static unsigned
8299 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8300 struct _stmt_vec_info *stmt_info, int misalign,
8301 enum vect_cost_model_location where)
8302 {
8303 unsigned *cost = (unsigned *) data;
8304 unsigned retval = 0;
8305
8306 if (flag_vect_cost_model)
8307 {
8308 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8309 int stmt_cost =
8310 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8311
8312 /* Statements in an inner loop relative to the loop being
8313 vectorized are weighted more heavily. The value here is
8314 arbitrary and could potentially be improved with analysis. */
8315 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8316 count *= 50; /* FIXME */
8317
8318 retval = (unsigned) (count * stmt_cost);
8319 cost[where] += retval;
8320 }
8321
8322 return retval;
8323 }
8324
8325 static void initialize_aarch64_code_model (struct gcc_options *);
8326
8327 /* Parse the TO_PARSE string and put the architecture struct that it
8328 selects into RES and the architectural features into ISA_FLAGS.
8329 Return an aarch64_parse_opt_result describing the parse result.
8330 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
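/* For example (purely illustrative), "armv8-a+crc" matches the "armv8-a"
   entry in all_architectures and then hands the "+crc" suffix to
   aarch64_parse_extension to adjust the ISA flags.  */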
8331
8332 static enum aarch64_parse_opt_result
8333 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8334 unsigned long *isa_flags)
8335 {
8336 char *ext;
8337 const struct processor *arch;
8338 char *str = (char *) alloca (strlen (to_parse) + 1);
8339 size_t len;
8340
8341 strcpy (str, to_parse);
8342
8343 ext = strchr (str, '+');
8344
8345 if (ext != NULL)
8346 len = ext - str;
8347 else
8348 len = strlen (str);
8349
8350 if (len == 0)
8351 return AARCH64_PARSE_MISSING_ARG;
8352
8353
8354 /* Loop through the list of supported ARCHes to find a match. */
8355 for (arch = all_architectures; arch->name != NULL; arch++)
8356 {
8357 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8358 {
8359 unsigned long isa_temp = arch->flags;
8360
8361 if (ext != NULL)
8362 {
8363 /* TO_PARSE string contains at least one extension. */
8364 enum aarch64_parse_opt_result ext_res
8365 = aarch64_parse_extension (ext, &isa_temp);
8366
8367 if (ext_res != AARCH64_PARSE_OK)
8368 return ext_res;
8369 }
8370 /* Extension parsing was successful. Confirm the result
8371 arch and ISA flags. */
8372 *res = arch;
8373 *isa_flags = isa_temp;
8374 return AARCH64_PARSE_OK;
8375 }
8376 }
8377
8378 /* ARCH name not found in list. */
8379 return AARCH64_PARSE_INVALID_ARG;
8380 }
8381
8382 /* Parse the TO_PARSE string and put the CPU that it selects into RES and the
8383 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8384 describing the parse result. If there is an error parsing, RES and
8385 ISA_FLAGS are left unchanged. */
8386
8387 static enum aarch64_parse_opt_result
8388 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8389 unsigned long *isa_flags)
8390 {
8391 char *ext;
8392 const struct processor *cpu;
8393 char *str = (char *) alloca (strlen (to_parse) + 1);
8394 size_t len;
8395
8396 strcpy (str, to_parse);
8397
8398 ext = strchr (str, '+');
8399
8400 if (ext != NULL)
8401 len = ext - str;
8402 else
8403 len = strlen (str);
8404
8405 if (len == 0)
8406 return AARCH64_PARSE_MISSING_ARG;
8407
8408
8409 /* Loop through the list of supported CPUs to find a match. */
8410 for (cpu = all_cores; cpu->name != NULL; cpu++)
8411 {
8412 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8413 {
8414 unsigned long isa_temp = cpu->flags;
8415
8416
8417 if (ext != NULL)
8418 {
8419 /* TO_PARSE string contains at least one extension. */
8420 enum aarch64_parse_opt_result ext_res
8421 = aarch64_parse_extension (ext, &isa_temp);
8422
8423 if (ext_res != AARCH64_PARSE_OK)
8424 return ext_res;
8425 }
8426 /* Extension parsing was successful. Confirm the result
8427 cpu and ISA flags. */
8428 *res = cpu;
8429 *isa_flags = isa_temp;
8430 return AARCH64_PARSE_OK;
8431 }
8432 }
8433
8434 /* CPU name not found in list. */
8435 return AARCH64_PARSE_INVALID_ARG;
8436 }
8437
8438 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8439 Return an aarch64_parse_opt_result describing the parse result.
8440 If the parsing fails, RES does not change. */
8441
8442 static enum aarch64_parse_opt_result
8443 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8444 {
8445 const struct processor *cpu;
8446 char *str = (char *) alloca (strlen (to_parse) + 1);
8447
8448 strcpy (str, to_parse);
8449
8450 /* Loop through the list of supported CPUs to find a match. */
8451 for (cpu = all_cores; cpu->name != NULL; cpu++)
8452 {
8453 if (strcmp (cpu->name, str) == 0)
8454 {
8455 *res = cpu;
8456 return AARCH64_PARSE_OK;
8457 }
8458 }
8459
8460 /* CPU name not found in list. */
8461 return AARCH64_PARSE_INVALID_ARG;
8462 }
8463
8464 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8465 described in FLAG. If it is, return the index bit for that fusion type.
8466 If not, error (printing OPTION_NAME) and return zero. */
8467
8468 static unsigned int
8469 aarch64_parse_one_option_token (const char *token,
8470 size_t length,
8471 const struct aarch64_flag_desc *flag,
8472 const char *option_name)
8473 {
8474 for (; flag->name != NULL; flag++)
8475 {
8476 if (length == strlen (flag->name)
8477 && !strncmp (flag->name, token, length))
8478 return flag->flag;
8479 }
8480
8481 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8482 return 0;
8483 }
8484
8485 /* Parse OPTION which is a '.'-separated list of flags to enable.
8486 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8487 default state we inherit from the CPU tuning structures. OPTION_NAME
8488 gives the top-level option we are parsing in the -moverride string,
8489 for use in error messages. */
8490
8491 static unsigned int
8492 aarch64_parse_boolean_options (const char *option,
8493 const struct aarch64_flag_desc *flags,
8494 unsigned int initial_state,
8495 const char *option_name)
8496 {
8497 const char separator = '.';
8498 const char* specs = option;
8499 const char* ntoken = option;
8500 unsigned int found_flags = initial_state;
8501
8502 while ((ntoken = strchr (specs, separator)))
8503 {
8504 size_t token_length = ntoken - specs;
8505 unsigned token_ops = aarch64_parse_one_option_token (specs,
8506 token_length,
8507 flags,
8508 option_name);
8509 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8510 in the token stream, reset the supported operations. So:
8511
8512 adrp+add.cmp+branch.none.adrp+add
8513
8514 would have the result of turning on only adrp+add fusion. */
8515 if (!token_ops)
8516 found_flags = 0;
8517
8518 found_flags |= token_ops;
8519 specs = ++ntoken;
8520 }
8521
8522 /* The string ended with a trailing separator; report it as ill-formed. */
8523 if (!(*specs))
8524 {
8525 error ("%s string ill-formed\n", option_name);
8526 return 0;
8527 }
8528
8529 /* We still have one more token to parse. */
8530 size_t token_length = strlen (specs);
8531 unsigned token_ops = aarch64_parse_one_option_token (specs,
8532 token_length,
8533 flags,
8534 option_name);
8535 if (!token_ops)
8536 found_flags = 0;
8537
8538 found_flags |= token_ops;
8539 return found_flags;
8540 }
8541
8542 /* Support for overriding instruction fusion. */
8543
8544 static void
8545 aarch64_parse_fuse_string (const char *fuse_string,
8546 struct tune_params *tune)
8547 {
8548 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8549 aarch64_fusible_pairs,
8550 tune->fusible_ops,
8551 "fuse=");
8552 }
8553
8554 /* Support for overriding other tuning flags. */
8555
8556 static void
8557 aarch64_parse_tune_string (const char *tune_string,
8558 struct tune_params *tune)
8559 {
8560 tune->extra_tuning_flags
8561 = aarch64_parse_boolean_options (tune_string,
8562 aarch64_tuning_flags,
8563 tune->extra_tuning_flags,
8564 "tune=");
8565 }
8566
8567 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8568 we understand. If it is, extract the option string and hand it off to
8569 the appropriate function. */
8570
8571 void
8572 aarch64_parse_one_override_token (const char* token,
8573 size_t length,
8574 struct tune_params *tune)
8575 {
8576 const struct aarch64_tuning_override_function *fn
8577 = aarch64_tuning_override_functions;
8578
8579 const char *option_part = strchr (token, '=');
8580 if (!option_part)
8581 {
8582 error ("tuning string missing in option (%s)", token);
8583 return;
8584 }
8585
8586 /* Get the length of the option name. */
8587 length = option_part - token;
8588 /* Skip the '=' to get to the option string. */
8589 option_part++;
8590
8591 for (; fn->name != NULL; fn++)
8592 {
8593 if (!strncmp (fn->name, token, length))
8594 {
8595 fn->parse_override (option_part, tune);
8596 return;
8597 }
8598 }
8599
8600 error ("unknown tuning option (%s)",token);
8601 return;
8602 }
8603
8604 /* Validate and clamp the TLS size for the code model selected in OPTS. */
8605
8606 static void
8607 initialize_aarch64_tls_size (struct gcc_options *opts)
8608 {
8609 if (aarch64_tls_size == 0)
8610 aarch64_tls_size = 24;
8611
8612 switch (opts->x_aarch64_cmodel_var)
8613 {
8614 case AARCH64_CMODEL_TINY:
8615 /* Both the default and maximum TLS size allowed under tiny are 1M, which
8616 needs two instructions to address, so we clamp the size to 24 bits. */
8617 if (aarch64_tls_size > 24)
8618 aarch64_tls_size = 24;
8619 break;
8620 case AARCH64_CMODEL_SMALL:
8621 /* The maximum TLS size allowed under small is 4G. */
8622 if (aarch64_tls_size > 32)
8623 aarch64_tls_size = 32;
8624 break;
8625 case AARCH64_CMODEL_LARGE:
8626 /* The maximum TLS size allowed under large is 16E.
8627 FIXME: 16E would need a 64-bit offset, but we only support 48-bit offsets now. */
8628 if (aarch64_tls_size > 48)
8629 aarch64_tls_size = 48;
8630 break;
8631 default:
8632 gcc_unreachable ();
8633 }
8634
8635 return;
8636 }
8637
8638 /* Parse STRING looking for options in the format:
8639 string :: option:string
8640 option :: name=substring
8641 name :: {a-z}
8642 substring :: defined by option. */
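/* For example (purely illustrative), an override string such as
     fuse=adrp+add.cmp+branch:tune=rename_fma_regs
   contains two options separated by ':', each dispatched to the matching
   entry in aarch64_tuning_override_functions.  */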
8643
8644 static void
8645 aarch64_parse_override_string (const char* input_string,
8646 struct tune_params* tune)
8647 {
8648 const char separator = ':';
8649 size_t string_length = strlen (input_string) + 1;
8650 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8651 char *string = string_root;
8652 strncpy (string, input_string, string_length);
8653 string[string_length - 1] = '\0';
8654
8655 char* ntoken = string;
8656
8657 while ((ntoken = strchr (string, separator)))
8658 {
8659 size_t token_length = ntoken - string;
8660 /* Make this substring look like a string. */
8661 *ntoken = '\0';
8662 aarch64_parse_one_override_token (string, token_length, tune);
8663 string = ++ntoken;
8664 }
8665
8666 /* One last option to parse. */
8667 aarch64_parse_one_override_token (string, strlen (string), tune);
8668 free (string_root);
8669 }
8670
8671
8672 static void
8673 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8674 {
8675 /* The logic here is that if we are disabling all frame pointer generation
8676 then we do not need to disable leaf frame pointer generation as a
8677 separate operation. But if we are *only* disabling leaf frame pointer
8678 generation then we set flag_omit_frame_pointer to true, but in
8679 aarch64_frame_pointer_required we return false only for leaf functions.
8680
8681 PR 70044: We have to be careful about being called multiple times for the
8682 same function. Once we have decided to set flag_omit_frame_pointer just
8683 so that we can omit leaf frame pointers, we must then not interpret a
8684 second call as meaning that all frame pointer generation should be
8685 omitted. We do this by setting flag_omit_frame_pointer to a special,
8686 non-zero value. */
8687 if (opts->x_flag_omit_frame_pointer == 2)
8688 opts->x_flag_omit_frame_pointer = 0;
8689
8690 if (opts->x_flag_omit_frame_pointer)
8691 opts->x_flag_omit_leaf_frame_pointer = false;
8692 else if (opts->x_flag_omit_leaf_frame_pointer)
8693 opts->x_flag_omit_frame_pointer = 2;
8694
8695 /* If not optimizing for size, set the default
8696 alignment to what the target wants. */
8697 if (!opts->x_optimize_size)
8698 {
8699 if (opts->x_align_loops <= 0)
8700 opts->x_align_loops = aarch64_tune_params.loop_align;
8701 if (opts->x_align_jumps <= 0)
8702 opts->x_align_jumps = aarch64_tune_params.jump_align;
8703 if (opts->x_align_functions <= 0)
8704 opts->x_align_functions = aarch64_tune_params.function_align;
8705 }
8706
8707 /* We default to no pc-relative literal loads. */
8708
8709 aarch64_pcrelative_literal_loads = false;
8710
8711 /* If -mpc-relative-literal-loads is set on the command line, this
8712 implies that the user asked for PC relative literal loads. */
8713 if (opts->x_pcrelative_literal_loads == 1)
8714 aarch64_pcrelative_literal_loads = true;
8715
8716 /* This is PR70113. When building the Linux kernel with
8717 CONFIG_ARM64_ERRATUM_843419, support for relocations
8718 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8719 removed from the kernel to avoid loading objects with possibly
8720 offending sequences. Without -mpc-relative-literal-loads we would
8721 generate such relocations, preventing the kernel build from
8722 succeeding. */
8723 if (opts->x_pcrelative_literal_loads == 2
8724 && TARGET_FIX_ERR_A53_843419)
8725 aarch64_pcrelative_literal_loads = true;
8726
8727 /* In the tiny memory model it makes no sense to disallow PC relative
8728 literal pool loads. */
8729 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8730 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8731 aarch64_pcrelative_literal_loads = true;
8732
8733 /* When enabling the lower precision Newton series for the square root, also
8734 enable it for the reciprocal square root, since the latter is an
8735 intermediary step for the former. */
8736 if (flag_mlow_precision_sqrt)
8737 flag_mrecip_low_precision_sqrt = true;
8738 }
8739
8740 /* 'Unpack' the internal tuning structs and update the options
8741 in OPTS. The caller must have set up selected_tune and selected_arch
8742 as all the other target-specific codegen decisions are
8743 derived from them. */
8744
8745 void
8746 aarch64_override_options_internal (struct gcc_options *opts)
8747 {
8748 aarch64_tune_flags = selected_tune->flags;
8749 aarch64_tune = selected_tune->sched_core;
8750 /* Make a copy of the tuning parameters attached to the core, which
8751 we may later overwrite. */
8752 aarch64_tune_params = *(selected_tune->tune);
8753 aarch64_architecture_version = selected_arch->architecture_version;
8754
8755 if (opts->x_aarch64_override_tune_string)
8756 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8757 &aarch64_tune_params);
8758
8759 /* This target defaults to strict volatile bitfields. */
8760 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8761 opts->x_flag_strict_volatile_bitfields = 1;
8762
8763 initialize_aarch64_code_model (opts);
8764 initialize_aarch64_tls_size (opts);
8765
8766 int queue_depth = 0;
8767 switch (aarch64_tune_params.autoprefetcher_model)
8768 {
8769 case tune_params::AUTOPREFETCHER_OFF:
8770 queue_depth = -1;
8771 break;
8772 case tune_params::AUTOPREFETCHER_WEAK:
8773 queue_depth = 0;
8774 break;
8775 case tune_params::AUTOPREFETCHER_STRONG:
8776 queue_depth = max_insn_queue_index + 1;
8777 break;
8778 default:
8779 gcc_unreachable ();
8780 }
8781
8782 /* We don't mind passing in global_options_set here as we don't use
8783 the *options_set structs anyway. */
8784 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8785 queue_depth,
8786 opts->x_param_values,
8787 global_options_set.x_param_values);
8788
8789 /* Set up parameters to be used in prefetching algorithm. Do not
8790 override the defaults unless we are tuning for a core we have
8791 researched values for. */
8792 if (aarch64_tune_params.prefetch->num_slots > 0)
8793 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
8794 aarch64_tune_params.prefetch->num_slots,
8795 opts->x_param_values,
8796 global_options_set.x_param_values);
8797 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
8798 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
8799 aarch64_tune_params.prefetch->l1_cache_size,
8800 opts->x_param_values,
8801 global_options_set.x_param_values);
8802 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
8803 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8804 aarch64_tune_params.prefetch->l1_cache_line_size,
8805 opts->x_param_values,
8806 global_options_set.x_param_values);
8807 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
8808 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
8809 aarch64_tune_params.prefetch->l2_cache_size,
8810 opts->x_param_values,
8811 global_options_set.x_param_values);
8812
8813 /* Enable software prefetching at the specified optimization level for
8814 CPUs that have prefetch tuning data. Lower the optimization level
8815 threshold by 1 when profiling is enabled. */
8816 if (opts->x_flag_prefetch_loop_arrays < 0
8817 && !opts->x_optimize_size
8818 && aarch64_tune_params.prefetch->default_opt_level >= 0
8819 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
8820 opts->x_flag_prefetch_loop_arrays = 1;
8821
8822 aarch64_override_options_after_change_1 (opts);
8823 }
8824
8825 /* Print a hint with a suggestion for a core or architecture name that
8826 most closely resembles what the user passed in STR. ARCH is true if
8827 the user is asking for an architecture name. ARCH is false if the user
8828 is asking for a core name. */
8829
8830 static void
8831 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
8832 {
8833 auto_vec<const char *> candidates;
8834 const struct processor *entry = arch ? all_architectures : all_cores;
8835 for (; entry->name != NULL; entry++)
8836 candidates.safe_push (entry->name);
8837 char *s;
8838 const char *hint = candidates_list_and_hint (str, s, candidates);
8839 if (hint)
8840 inform (input_location, "valid arguments are: %s;"
8841 " did you mean %qs?", s, hint);
8842 XDELETEVEC (s);
8843 }
8844
8845 /* Print a hint with a suggestion for a core name that most closely resembles
8846 what the user passed in STR. */
8847
8848 inline static void
8849 aarch64_print_hint_for_core (const char *str)
8850 {
8851 aarch64_print_hint_for_core_or_arch (str, false);
8852 }
8853
8854 /* Print a hint with a suggestion for an architecture name that most closely
8855 resembles what the user passed in STR. */
8856
8857 inline static void
8858 aarch64_print_hint_for_arch (const char *str)
8859 {
8860 aarch64_print_hint_for_core_or_arch (str, true);
8861 }
8862
8863 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8864 specified in STR and throw errors if appropriate. Put the results, if
8865 they are valid, in RES and ISA_FLAGS. Return whether the option is
8866 valid. */
8867
8868 static bool
8869 aarch64_validate_mcpu (const char *str, const struct processor **res,
8870 unsigned long *isa_flags)
8871 {
8872 enum aarch64_parse_opt_result parse_res
8873 = aarch64_parse_cpu (str, res, isa_flags);
8874
8875 if (parse_res == AARCH64_PARSE_OK)
8876 return true;
8877
8878 switch (parse_res)
8879 {
8880 case AARCH64_PARSE_MISSING_ARG:
8881 error ("missing cpu name in %<-mcpu=%s%>", str);
8882 break;
8883 case AARCH64_PARSE_INVALID_ARG:
8884 error ("unknown value %qs for -mcpu", str);
8885 aarch64_print_hint_for_core (str);
8886 break;
8887 case AARCH64_PARSE_INVALID_FEATURE:
8888 error ("invalid feature modifier in %<-mcpu=%s%>", str);
8889 break;
8890 default:
8891 gcc_unreachable ();
8892 }
8893
8894 return false;
8895 }
8896
8897 /* Validate a command-line -march option. Parse the arch and extensions
8898 (if any) specified in STR and throw errors if appropriate. Put the
8899 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8900 option is valid. */
8901
8902 static bool
8903 aarch64_validate_march (const char *str, const struct processor **res,
8904 unsigned long *isa_flags)
8905 {
8906 enum aarch64_parse_opt_result parse_res
8907 = aarch64_parse_arch (str, res, isa_flags);
8908
8909 if (parse_res == AARCH64_PARSE_OK)
8910 return true;
8911
8912 switch (parse_res)
8913 {
8914 case AARCH64_PARSE_MISSING_ARG:
8915 error ("missing arch name in %<-march=%s%>", str);
8916 break;
8917 case AARCH64_PARSE_INVALID_ARG:
8918 error ("unknown value %qs for -march", str);
8919 aarch64_print_hint_for_arch (str);
8920 break;
8921 case AARCH64_PARSE_INVALID_FEATURE:
8922 error ("invalid feature modifier in %<-march=%s%>", str);
8923 break;
8924 default:
8925 gcc_unreachable ();
8926 }
8927
8928 return false;
8929 }
8930
8931 /* Validate a command-line -mtune option. Parse the cpu
8932 specified in STR and throw errors if appropriate. Put the
8933 result, if it is valid, in RES. Return whether the option is
8934 valid. */
8935
8936 static bool
8937 aarch64_validate_mtune (const char *str, const struct processor **res)
8938 {
8939 enum aarch64_parse_opt_result parse_res
8940 = aarch64_parse_tune (str, res);
8941
8942 if (parse_res == AARCH64_PARSE_OK)
8943 return true;
8944
8945 switch (parse_res)
8946 {
8947 case AARCH64_PARSE_MISSING_ARG:
8948 error ("missing cpu name in %<-mtune=%s%>", str);
8949 break;
8950 case AARCH64_PARSE_INVALID_ARG:
8951 error ("unknown value %qs for -mtune", str);
8952 aarch64_print_hint_for_core (str);
8953 break;
8954 default:
8955 gcc_unreachable ();
8956 }
8957 return false;
8958 }
8959
8960 /* Return the CPU corresponding to the enum CPU.
8961 If it doesn't specify a cpu, return the default. */
8962
8963 static const struct processor *
8964 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8965 {
8966 if (cpu != aarch64_none)
8967 return &all_cores[cpu];
8968
8969 /* The & 0x3f is to extract the bottom 6 bits that encode the
8970 default cpu as selected by the --with-cpu GCC configure option
8971 in config.gcc.
8972 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8973 flags mechanism should be reworked to make it more sane. */
8974 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8975 }
8976
8977 /* Return the architecture corresponding to the enum ARCH.
8978 If it doesn't specify a valid architecture, return the default. */
8979
8980 static const struct processor *
8981 aarch64_get_arch (enum aarch64_arch arch)
8982 {
8983 if (arch != aarch64_no_arch)
8984 return &all_architectures[arch];
8985
8986 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8987
8988 return &all_architectures[cpu->arch];
8989 }
8990
8991 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8992 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8993 tuning structs. In particular it must set selected_tune and
8994 aarch64_isa_flags that define the available ISA features and tuning
8995 decisions. It must also set selected_arch as this will be used to
8996 output the .arch asm tags for each function. */
8997
8998 static void
8999 aarch64_override_options (void)
9000 {
9001 unsigned long cpu_isa = 0;
9002 unsigned long arch_isa = 0;
9003 aarch64_isa_flags = 0;
9004
9005 bool valid_cpu = true;
9006 bool valid_tune = true;
9007 bool valid_arch = true;
9008
9009 selected_cpu = NULL;
9010 selected_arch = NULL;
9011 selected_tune = NULL;
9012
9013 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9014 If either of -march or -mtune is given, they override their
9015 respective component of -mcpu. */
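/* For example (illustrative only), "-mcpu=cortex-a57 -mtune=cortex-a72"
   takes the architecture and ISA features implied by cortex-a57 but tunes
   for cortex-a72.  */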
9016 if (aarch64_cpu_string)
9017 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9018 &cpu_isa);
9019
9020 if (aarch64_arch_string)
9021 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9022 &arch_isa);
9023
9024 if (aarch64_tune_string)
9025 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9026
9027 /* If the user did not specify a processor, choose the default
9028 one for them. This will be the CPU set during configuration using
9029 --with-cpu, otherwise it is "generic". */
9030 if (!selected_cpu)
9031 {
9032 if (selected_arch)
9033 {
9034 selected_cpu = &all_cores[selected_arch->ident];
9035 aarch64_isa_flags = arch_isa;
9036 explicit_arch = selected_arch->arch;
9037 }
9038 else
9039 {
9040 /* Get default configure-time CPU. */
9041 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9042 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9043 }
9044
9045 if (selected_tune)
9046 explicit_tune_core = selected_tune->ident;
9047 }
9048 /* If both -mcpu and -march are specified, check that they are architecturally
9049 compatible; warn if they are not, and prefer the -march ISA flags. */
9050 else if (selected_arch)
9051 {
9052 if (selected_arch->arch != selected_cpu->arch)
9053 {
9054 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9055 all_architectures[selected_cpu->arch].name,
9056 selected_arch->name);
9057 }
9058 aarch64_isa_flags = arch_isa;
9059 explicit_arch = selected_arch->arch;
9060 explicit_tune_core = selected_tune ? selected_tune->ident
9061 : selected_cpu->ident;
9062 }
9063 else
9064 {
9065 /* -mcpu but no -march. */
9066 aarch64_isa_flags = cpu_isa;
9067 explicit_tune_core = selected_tune ? selected_tune->ident
9068 : selected_cpu->ident;
9069 gcc_assert (selected_cpu);
9070 selected_arch = &all_architectures[selected_cpu->arch];
9071 explicit_arch = selected_arch->arch;
9072 }
9073
9074 /* Set the arch as well, as we will need it when outputting
9075 the .arch directive in assembly. */
9076 if (!selected_arch)
9077 {
9078 gcc_assert (selected_cpu);
9079 selected_arch = &all_architectures[selected_cpu->arch];
9080 }
9081
9082 if (!selected_tune)
9083 selected_tune = selected_cpu;
9084
9085 #ifndef HAVE_AS_MABI_OPTION
9086 /* The compiler may have been configured with 2.23.* binutils, which does
9087 not have support for ILP32. */
9088 if (TARGET_ILP32)
9089 error ("Assembler does not support -mabi=ilp32");
9090 #endif
9091
9092 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9093 sorry ("Return address signing is only supported for -mabi=lp64");
9094
9095 /* Make sure we properly set up the explicit options. */
9096 if ((aarch64_cpu_string && valid_cpu)
9097 || (aarch64_tune_string && valid_tune))
9098 gcc_assert (explicit_tune_core != aarch64_none);
9099
9100 if ((aarch64_cpu_string && valid_cpu)
9101 || (aarch64_arch_string && valid_arch))
9102 gcc_assert (explicit_arch != aarch64_no_arch);
9103
9104 aarch64_override_options_internal (&global_options);
9105
9106 /* Save these options as the default ones in case we push and pop them later
9107 while processing functions with potential target attributes. */
9108 target_option_default_node = target_option_current_node
9109 = build_target_option_node (&global_options);
9110 }
9111
9112 /* Implement targetm.override_options_after_change. */
9113
9114 static void
9115 aarch64_override_options_after_change (void)
9116 {
9117 aarch64_override_options_after_change_1 (&global_options);
9118 }
9119
9120 static struct machine_function *
9121 aarch64_init_machine_status (void)
9122 {
9123 struct machine_function *machine;
9124 machine = ggc_cleared_alloc<machine_function> ();
9125 return machine;
9126 }
9127
9128 void
9129 aarch64_init_expanders (void)
9130 {
9131 init_machine_status = aarch64_init_machine_status;
9132 }
9133
9134 /* Select and validate the code model to use, based on the options in OPTS. */
9135 static void
9136 initialize_aarch64_code_model (struct gcc_options *opts)
9137 {
9138 if (opts->x_flag_pic)
9139 {
9140 switch (opts->x_aarch64_cmodel_var)
9141 {
9142 case AARCH64_CMODEL_TINY:
9143 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9144 break;
9145 case AARCH64_CMODEL_SMALL:
9146 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9147 aarch64_cmodel = (flag_pic == 2
9148 ? AARCH64_CMODEL_SMALL_PIC
9149 : AARCH64_CMODEL_SMALL_SPIC);
9150 #else
9151 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9152 #endif
9153 break;
9154 case AARCH64_CMODEL_LARGE:
9155 sorry ("code model %qs with -f%s", "large",
9156 opts->x_flag_pic > 1 ? "PIC" : "pic");
9157 break;
9158 default:
9159 gcc_unreachable ();
9160 }
9161 }
9162 else
9163 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9164 }
9165
9166 /* Implement TARGET_OPTION_SAVE. */
9167
9168 static void
9169 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9170 {
9171 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9172 }
9173
9174 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9175 using the information saved in PTR. */
9176
9177 static void
9178 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9179 {
9180 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9181 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9182 opts->x_explicit_arch = ptr->x_explicit_arch;
9183 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9184 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9185
9186 aarch64_override_options_internal (opts);
9187 }
9188
9189 /* Implement TARGET_OPTION_PRINT. */
9190
9191 static void
9192 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9193 {
9194 const struct processor *cpu
9195 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9196 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9197 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9198 std::string extension
9199 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9200
9201 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9202 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9203 arch->name, extension.c_str ());
9204 }
9205
9206 static GTY(()) tree aarch64_previous_fndecl;
9207
9208 void
9209 aarch64_reset_previous_fndecl (void)
9210 {
9211 aarch64_previous_fndecl = NULL;
9212 }
9213
9214 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9215 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9216 make sure optab availability predicates are recomputed when necessary. */
9217
9218 void
9219 aarch64_save_restore_target_globals (tree new_tree)
9220 {
9221 if (TREE_TARGET_GLOBALS (new_tree))
9222 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9223 else if (new_tree == target_option_default_node)
9224 restore_target_globals (&default_target_globals);
9225 else
9226 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9227 }
9228
9229 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9230 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9231 of the function, if such exists. This function may be called multiple
9232 times on a single function so use aarch64_previous_fndecl to avoid
9233 setting up identical state. */
9234
9235 static void
9236 aarch64_set_current_function (tree fndecl)
9237 {
9238 if (!fndecl || fndecl == aarch64_previous_fndecl)
9239 return;
9240
9241 tree old_tree = (aarch64_previous_fndecl
9242 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9243 : NULL_TREE);
9244
9245 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9246
9247 /* If current function has no attributes but the previous one did,
9248 use the default node. */
9249 if (!new_tree && old_tree)
9250 new_tree = target_option_default_node;
9251
9252 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9253 the default have been handled by aarch64_save_restore_target_globals from
9254 aarch64_pragma_target_parse. */
9255 if (old_tree == new_tree)
9256 return;
9257
9258 aarch64_previous_fndecl = fndecl;
9259
9260 /* First set the target options. */
9261 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9262
9263 aarch64_save_restore_target_globals (new_tree);
9264 }
9265
9266 /* Enum describing the various ways we can handle attributes.
9267 In many cases we can reuse the generic option handling machinery. */
9268
9269 enum aarch64_attr_opt_type
9270 {
9271 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9272 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9273 aarch64_attr_enum, /* Attribute sets an enum variable. */
9274 aarch64_attr_custom /* Attribute requires a custom handling function. */
9275 };
9276
9277 /* All the information needed to handle a target attribute.
9278 NAME is the name of the attribute.
9279 ATTR_TYPE specifies the type of behavior of the attribute as described
9280 in the definition of enum aarch64_attr_opt_type.
9281 ALLOW_NEG is true if the attribute supports a "no-" form.
9282 HANDLER is the function that takes the attribute string and whether
9283 it is a pragma or attribute and handles the option. It is needed only
9284 when the ATTR_TYPE is aarch64_attr_custom.
9285 OPT_NUM is the enum specifying the option that the attribute modifies.
9286 This is needed for attributes that mirror the behavior of a command-line
9287 option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9288 aarch64_attr_enum. */
9289
9290 struct aarch64_attribute_info
9291 {
9292 const char *name;
9293 enum aarch64_attr_opt_type attr_type;
9294 bool allow_neg;
9295 bool (*handler) (const char *, const char *);
9296 enum opt_code opt_num;
9297 };
9298
9299 /* Handle the ARCH_STR argument to the arch= target attribute.
9300 PRAGMA_OR_ATTR is used in potential error messages. */
9301
9302 static bool
9303 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9304 {
9305 const struct processor *tmp_arch = NULL;
9306 enum aarch64_parse_opt_result parse_res
9307 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9308
9309 if (parse_res == AARCH64_PARSE_OK)
9310 {
9311 gcc_assert (tmp_arch);
9312 selected_arch = tmp_arch;
9313 explicit_arch = selected_arch->arch;
9314 return true;
9315 }
9316
9317 switch (parse_res)
9318 {
9319 case AARCH64_PARSE_MISSING_ARG:
9320 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9321 break;
9322 case AARCH64_PARSE_INVALID_ARG:
9323 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9324 aarch64_print_hint_for_arch (str);
9325 break;
9326 case AARCH64_PARSE_INVALID_FEATURE:
9327 error ("invalid feature modifier %qs for 'arch' target %s",
9328 str, pragma_or_attr);
9329 break;
9330 default:
9331 gcc_unreachable ();
9332 }
9333
9334 return false;
9335 }
9336
9337 /* Handle the argument CPU_STR to the cpu= target attribute.
9338 PRAGMA_OR_ATTR is used in potential error messages. */
9339
9340 static bool
9341 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9342 {
9343 const struct processor *tmp_cpu = NULL;
9344 enum aarch64_parse_opt_result parse_res
9345 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9346
9347 if (parse_res == AARCH64_PARSE_OK)
9348 {
9349 gcc_assert (tmp_cpu);
9350 selected_tune = tmp_cpu;
9351 explicit_tune_core = selected_tune->ident;
9352
9353 selected_arch = &all_architectures[tmp_cpu->arch];
9354 explicit_arch = selected_arch->arch;
9355 return true;
9356 }
9357
9358 switch (parse_res)
9359 {
9360 case AARCH64_PARSE_MISSING_ARG:
9361 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9362 break;
9363 case AARCH64_PARSE_INVALID_ARG:
9364 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9365 aarch64_print_hint_for_core (str);
9366 break;
9367 case AARCH64_PARSE_INVALID_FEATURE:
9368 error ("invalid feature modifier %qs for 'cpu' target %s",
9369 str, pragma_or_attr);
9370 break;
9371 default:
9372 gcc_unreachable ();
9373 }
9374
9375 return false;
9376 }
9377
9378 /* Handle the argument STR to the tune= target attribute.
9379 PRAGMA_OR_ATTR is used in potential error messages. */
9380
9381 static bool
9382 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9383 {
9384 const struct processor *tmp_tune = NULL;
9385 enum aarch64_parse_opt_result parse_res
9386 = aarch64_parse_tune (str, &tmp_tune);
9387
9388 if (parse_res == AARCH64_PARSE_OK)
9389 {
9390 gcc_assert (tmp_tune);
9391 selected_tune = tmp_tune;
9392 explicit_tune_core = selected_tune->ident;
9393 return true;
9394 }
9395
9396 switch (parse_res)
9397 {
9398 case AARCH64_PARSE_INVALID_ARG:
9399 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9400 aarch64_print_hint_for_core (str);
9401 break;
9402 default:
9403 gcc_unreachable ();
9404 }
9405
9406 return false;
9407 }
9408
9409 /* Parse the architecture-extension target attribute string in STR,
9410 for example "+fp+nosimd". Report any errors if needed. Return TRUE
9411 if successful and update aarch64_isa_flags to reflect the ISA features
9412 that were modified.
9413 PRAGMA_OR_ATTR is used in potential error messages. */
9414
9415 static bool
9416 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9417 {
9418 enum aarch64_parse_opt_result parse_res;
9419 unsigned long isa_flags = aarch64_isa_flags;
9420
9421 /* We allow "+nothing" in the beginning to clear out all architectural
9422 features if the user wants to handpick specific features. */
9423 if (strncmp ("+nothing", str, 8) == 0)
9424 {
9425 isa_flags = 0;
9426 str += 8;
9427 }
9428
9429 parse_res = aarch64_parse_extension (str, &isa_flags);
9430
9431 if (parse_res == AARCH64_PARSE_OK)
9432 {
9433 aarch64_isa_flags = isa_flags;
9434 return true;
9435 }
9436
9437 switch (parse_res)
9438 {
9439 case AARCH64_PARSE_MISSING_ARG:
9440 error ("missing feature modifier in target %s %qs",
9441 pragma_or_attr, str);
9442 break;
9443
9444 case AARCH64_PARSE_INVALID_FEATURE:
9445 error ("invalid feature modifier in target %s %qs",
9446 pragma_or_attr, str);
9447 break;
9448
9449 default:
9450 gcc_unreachable ();
9451 }
9452
9453 return false;
9454 }
9455
9456 /* The target attributes that we support. On top of these we also support just
9457 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9458 handled explicitly in aarch64_process_one_target_attr. */
9459
9460 static const struct aarch64_attribute_info aarch64_attributes[] =
9461 {
9462 { "general-regs-only", aarch64_attr_mask, false, NULL,
9463 OPT_mgeneral_regs_only },
9464 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9465 OPT_mfix_cortex_a53_835769 },
9466 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9467 OPT_mfix_cortex_a53_843419 },
9468 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9469 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9470 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9471 OPT_momit_leaf_frame_pointer },
9472 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9473 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9474 OPT_march_ },
9475 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9476 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9477 OPT_mtune_ },
9478 { "sign-return-address", aarch64_attr_enum, false, NULL,
9479 OPT_msign_return_address_ },
9480 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9481 };
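
/* Editor's note: an illustrative sketch (not part of this file) of the kind
   of user-level code the table above is meant to handle.  The function names
   below are hypothetical; the attribute strings map onto the table entries
   and onto the "+..." ISA-extension form handled separately in
   aarch64_process_one_target_attr.  */
#if 0
/* Maps to the "arch" entry via aarch64_handle_attr_arch.  */
__attribute__ ((target ("arch=armv8-a+crc")))
int with_crc (int x) { return x; }

/* "omit-leaf-frame-pointer" allows a negated form (allow_neg == true);
   "cmodel" is an aarch64_attr_enum and therefore requires an argument.  */
__attribute__ ((target ("no-omit-leaf-frame-pointer,cmodel=small")))
int leaf (void) { return 0; }

/* Pure ISA-extension string, handled by aarch64_handle_attr_isa_flags;
   "+nothing" first clears all extension bits.  */
#pragma GCC target ("+nothing+fp+simd")
#endif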
9482
9483 /* Parse ARG_STR which contains the definition of one target attribute.
9484 Show appropriate errors if any or return true if the attribute is valid.
9485 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9486 we're processing a target attribute or pragma. */
9487
9488 static bool
9489 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9490 {
9491 bool invert = false;
9492
9493 size_t len = strlen (arg_str);
9494
9495 if (len == 0)
9496 {
9497 error ("malformed target %s", pragma_or_attr);
9498 return false;
9499 }
9500
9501 char *str_to_check = (char *) alloca (len + 1);
9502 strcpy (str_to_check, arg_str);
9503
9504 /* Skip leading whitespace. */
9505 while (*str_to_check == ' ' || *str_to_check == '\t')
9506 str_to_check++;
9507
9508 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9509 It is easier to detect and handle it explicitly here rather than going
9510 through the machinery for the rest of the target attributes in this
9511 function. */
9512 if (*str_to_check == '+')
9513 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9514
9515 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9516 {
9517 invert = true;
9518 str_to_check += 3;
9519 }
9520 char *arg = strchr (str_to_check, '=');
9521
9522 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9523 and point ARG to "foo". */
9524 if (arg)
9525 {
9526 *arg = '\0';
9527 arg++;
9528 }
9529 const struct aarch64_attribute_info *p_attr;
9530 bool found = false;
9531 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9532 {
9533 /* If the names don't match up, or the user has given an argument
9534 to an attribute that doesn't accept one, or didn't give an argument
9535 to an attribute that expects one, fail to match. */
9536 if (strcmp (str_to_check, p_attr->name) != 0)
9537 continue;
9538
9539 found = true;
9540 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9541 || p_attr->attr_type == aarch64_attr_enum;
9542
9543 if (attr_need_arg_p ^ (arg != NULL))
9544 {
9545 error ("target %s %qs does not accept an argument",
9546 pragma_or_attr, str_to_check);
9547 return false;
9548 }
9549
9550 /* If the name matches but the attribute does not allow "no-" versions
9551 then we can't match. */
9552 if (invert && !p_attr->allow_neg)
9553 {
9554 error ("target %s %qs does not allow a negated form",
9555 pragma_or_attr, str_to_check);
9556 return false;
9557 }
9558
9559 switch (p_attr->attr_type)
9560 {
9561 /* Has a custom handler registered.
9562 For example, cpu=, arch=, tune=. */
9563 case aarch64_attr_custom:
9564 gcc_assert (p_attr->handler);
9565 if (!p_attr->handler (arg, pragma_or_attr))
9566 return false;
9567 break;
9568
9569 /* Either set or unset a boolean option. */
9570 case aarch64_attr_bool:
9571 {
9572 struct cl_decoded_option decoded;
9573
9574 generate_option (p_attr->opt_num, NULL, !invert,
9575 CL_TARGET, &decoded);
9576 aarch64_handle_option (&global_options, &global_options_set,
9577 &decoded, input_location);
9578 break;
9579 }
9580 /* Set or unset a bit in the target_flags. aarch64_handle_option
9581 should know what mask to apply given the option number. */
9582 case aarch64_attr_mask:
9583 {
9584 struct cl_decoded_option decoded;
9585 /* We only need to specify the option number.
9586 aarch64_handle_option will know which mask to apply. */
9587 decoded.opt_index = p_attr->opt_num;
9588 decoded.value = !invert;
9589 aarch64_handle_option (&global_options, &global_options_set,
9590 &decoded, input_location);
9591 break;
9592 }
9593 /* Use the option setting machinery to set an option to an enum. */
9594 case aarch64_attr_enum:
9595 {
9596 gcc_assert (arg);
9597 bool valid;
9598 int value;
9599 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9600 &value, CL_TARGET);
9601 if (valid)
9602 {
9603 set_option (&global_options, NULL, p_attr->opt_num, value,
9604 NULL, DK_UNSPECIFIED, input_location,
9605 global_dc);
9606 }
9607 else
9608 {
9609 error ("target %s %s=%s is not valid",
9610 pragma_or_attr, str_to_check, arg);
9611 }
9612 break;
9613 }
9614 default:
9615 gcc_unreachable ();
9616 }
9617 }
9618
9619 /* If we reached here we either have found an attribute and validated
9620 it or didn't match any. If we matched an attribute but its arguments
9621 were malformed we will have returned false already. */
9622 return found;
9623 }
9624
9625 /* Count how many times the character C appears in
9626 NULL-terminated string STR. */
9627
9628 static unsigned int
9629 num_occurences_in_str (char c, char *str)
9630 {
9631 unsigned int res = 0;
9632 while (*str != '\0')
9633 {
9634 if (*str == c)
9635 res++;
9636
9637 str++;
9638 }
9639
9640 return res;
9641 }
9642
9643 /* Parse the tree in ARGS that contains the target attribute information
9644 and update the global target options space. PRAGMA_OR_ATTR is a string
9645 to be used in error messages, specifying whether this is processing
9646 a target attribute or a target pragma. */
9647
9648 bool
9649 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9650 {
9651 if (TREE_CODE (args) == TREE_LIST)
9652 {
9653 do
9654 {
9655 tree head = TREE_VALUE (args);
9656 if (head)
9657 {
9658 if (!aarch64_process_target_attr (head, pragma_or_attr))
9659 return false;
9660 }
9661 args = TREE_CHAIN (args);
9662 } while (args);
9663
9664 return true;
9665 }
9666
9667 if (TREE_CODE (args) != STRING_CST)
9668 {
9669 error ("attribute %<target%> argument not a string");
9670 return false;
9671 }
9672
9673 size_t len = strlen (TREE_STRING_POINTER (args));
9674 char *str_to_check = (char *) alloca (len + 1);
9675 strcpy (str_to_check, TREE_STRING_POINTER (args));
9676
9677 if (len == 0)
9678 {
9679 error ("malformed target %s value", pragma_or_attr);
9680 return false;
9681 }
9682
9683 /* Used to catch empty entries between commas, e.g.
9684 attribute ((target ("attr1,,attr2"))). */
9685 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9686
9687 /* Handle multiple target attributes separated by ','. */
9688 char *token = strtok (str_to_check, ",");
9689
9690 unsigned int num_attrs = 0;
9691 while (token)
9692 {
9693 num_attrs++;
9694 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9695 {
9696 error ("target %s %qs is invalid", pragma_or_attr, token);
9697 return false;
9698 }
9699
9700 token = strtok (NULL, ",");
9701 }
9702
9703 if (num_attrs != num_commas + 1)
9704 {
9705 error ("malformed target %s list %qs",
9706 pragma_or_attr, TREE_STRING_POINTER (args));
9707 return false;
9708 }
9709
9710 return true;
9711 }
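
/* Editor's note: a worked example of the comma handling above (the attribute
   strings are examples only).  For
   attribute ((target ("fix-cortex-a53-835769,tune=cortex-a57"))) strtok
   yields two tokens and the string contains one comma, so
   num_attrs == num_commas + 1 and both tokens are processed.  For
   attribute ((target ("strict-align,,arch=armv8-a"))) strtok skips the empty
   entry and yields only two tokens while the string contains two commas, so
   num_attrs (2) != num_commas + 1 (3) and the "malformed target ... list"
   error is reported.  */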
9712
9713 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9714 process attribute ((target ("..."))). */
9715
9716 static bool
9717 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9718 {
9719 struct cl_target_option cur_target;
9720 bool ret;
9721 tree old_optimize;
9722 tree new_target, new_optimize;
9723 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9724
9725 /* If what we're processing is the current pragma string then the
9726 target option node is already stored in target_option_current_node
9727 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9728 having to re-parse the string. This is especially useful to keep
9729 arm_neon.h compile times down since that header contains a lot
9730 of intrinsics enclosed in pragmas. */
9731 if (!existing_target && args == current_target_pragma)
9732 {
9733 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9734 return true;
9735 }
9736 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9737
9738 old_optimize = build_optimization_node (&global_options);
9739 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9740
9741 /* If the function changed the optimization levels as well as setting
9742 target options, start with the optimizations specified. */
9743 if (func_optimize && func_optimize != old_optimize)
9744 cl_optimization_restore (&global_options,
9745 TREE_OPTIMIZATION (func_optimize));
9746
9747 /* Save the current target options to restore at the end. */
9748 cl_target_option_save (&cur_target, &global_options);
9749
9750 /* If fndecl already has some target attributes applied to it, unpack
9751 them so that we add this attribute on top of them, rather than
9752 overwriting them. */
9753 if (existing_target)
9754 {
9755 struct cl_target_option *existing_options
9756 = TREE_TARGET_OPTION (existing_target);
9757
9758 if (existing_options)
9759 cl_target_option_restore (&global_options, existing_options);
9760 }
9761 else
9762 cl_target_option_restore (&global_options,
9763 TREE_TARGET_OPTION (target_option_current_node));
9764
9765
9766 ret = aarch64_process_target_attr (args, "attribute");
9767
9768 /* Set up any additional state. */
9769 if (ret)
9770 {
9771 aarch64_override_options_internal (&global_options);
9772 /* Initialize SIMD builtins if we haven't already.
9773 Set current_target_pragma to NULL for the duration so that
9774 the builtin initialization code doesn't try to tag the functions
9775 being built with the attributes specified by any current pragma, thus
9776 going into an infinite recursion. */
9777 if (TARGET_SIMD)
9778 {
9779 tree saved_current_target_pragma = current_target_pragma;
9780 current_target_pragma = NULL;
9781 aarch64_init_simd_builtins ();
9782 current_target_pragma = saved_current_target_pragma;
9783 }
9784 new_target = build_target_option_node (&global_options);
9785 }
9786 else
9787 new_target = NULL;
9788
9789 new_optimize = build_optimization_node (&global_options);
9790
9791 if (fndecl && ret)
9792 {
9793 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9794
9795 if (old_optimize != new_optimize)
9796 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9797 }
9798
9799 cl_target_option_restore (&global_options, &cur_target);
9800
9801 if (old_optimize != new_optimize)
9802 cl_optimization_restore (&global_options,
9803 TREE_OPTIMIZATION (old_optimize));
9804 return ret;
9805 }
9806
9807 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9808 tri-bool options (yes, no, don't care) and the default value is
9809 DEF, determine whether to reject inlining. */
9810
9811 static bool
9812 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9813 int dont_care, int def)
9814 {
9815 /* If the callee doesn't care, always allow inlining. */
9816 if (callee == dont_care)
9817 return true;
9818
9819 /* If the caller doesn't care, always allow inlining. */
9820 if (caller == dont_care)
9821 return true;
9822
9823 /* Otherwise, allow inlining if either the callee and caller values
9824 agree, or if the callee is using the default value. */
9825 return (callee == caller || callee == def);
9826 }
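
/* Editor's note: a small illustrative truth table for the helper above,
   using DONT_CARE == 2 as at the call sites below:
     caller == 1, callee == 2            -> true  (callee doesn't care)
     caller == 2, callee == 0            -> true  (caller doesn't care)
     caller == 1, callee == 1, def == 0  -> true  (values agree)
     caller == 1, callee == 0, def == 0  -> true  (callee uses the default)
     caller == 0, callee == 1, def == 0  -> false (explicit mismatch).  */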
9827
9828 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9829 to inline CALLEE into CALLER based on target-specific info.
9830 Make sure that the caller and callee have compatible architectural
9831 features. Then go through the other possible target attributes
9832 and see if they can block inlining. Try not to reject always_inline
9833 callees unless they are incompatible architecturally. */
9834
9835 static bool
9836 aarch64_can_inline_p (tree caller, tree callee)
9837 {
9838 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9839 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9840
9841 /* If callee has no option attributes, then it is ok to inline. */
9842 if (!callee_tree)
9843 return true;
9844
9845 struct cl_target_option *caller_opts
9846 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9847 : target_option_default_node);
9848
9849 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9850
9851
9852 /* Callee's ISA flags should be a subset of the caller's. */
9853 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9854 != callee_opts->x_aarch64_isa_flags)
9855 return false;
9856
9857 /* Allow non-strict aligned functions inlining into strict
9858 aligned ones. */
9859 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9860 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9861 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9862 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9863 return false;
9864
9865 bool always_inline = lookup_attribute ("always_inline",
9866 DECL_ATTRIBUTES (callee));
9867
9868 /* If the architectural features match up and the callee is always_inline
9869 then the other attributes don't matter. */
9870 if (always_inline)
9871 return true;
9872
9873 if (caller_opts->x_aarch64_cmodel_var
9874 != callee_opts->x_aarch64_cmodel_var)
9875 return false;
9876
9877 if (caller_opts->x_aarch64_tls_dialect
9878 != callee_opts->x_aarch64_tls_dialect)
9879 return false;
9880
9881 /* Honour explicit requests to work around errata. */
9882 if (!aarch64_tribools_ok_for_inlining_p (
9883 caller_opts->x_aarch64_fix_a53_err835769,
9884 callee_opts->x_aarch64_fix_a53_err835769,
9885 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9886 return false;
9887
9888 if (!aarch64_tribools_ok_for_inlining_p (
9889 caller_opts->x_aarch64_fix_a53_err843419,
9890 callee_opts->x_aarch64_fix_a53_err843419,
9891 2, TARGET_FIX_ERR_A53_843419))
9892 return false;
9893
9894 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9895 caller and callee and they don't match up, reject inlining. */
9896 if (!aarch64_tribools_ok_for_inlining_p (
9897 caller_opts->x_flag_omit_leaf_frame_pointer,
9898 callee_opts->x_flag_omit_leaf_frame_pointer,
9899 2, 1))
9900 return false;
9901
9902 /* If the callee has specific tuning overrides, respect them. */
9903 if (callee_opts->x_aarch64_override_tune_string != NULL
9904 && caller_opts->x_aarch64_override_tune_string == NULL)
9905 return false;
9906
9907 /* If the user specified tuning override strings for the
9908 caller and callee and they don't match up, reject inlining.
9909 We just do a string compare here, we don't analyze the meaning
9910 of the string, as it would be too costly for little gain. */
9911 if (callee_opts->x_aarch64_override_tune_string
9912 && caller_opts->x_aarch64_override_tune_string
9913 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9914 caller_opts->x_aarch64_override_tune_string) != 0))
9915 return false;
9916
9917 return true;
9918 }
9919
9920 /* Return true if SYMBOL_REF X binds locally. */
9921
9922 static bool
9923 aarch64_symbol_binds_local_p (const_rtx x)
9924 {
9925 return (SYMBOL_REF_DECL (x)
9926 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9927 : SYMBOL_REF_LOCAL_P (x));
9928 }
9929
9930 /* Return true if SYMBOL_REF X is thread local */
9931 static bool
9932 aarch64_tls_symbol_p (rtx x)
9933 {
9934 if (! TARGET_HAVE_TLS)
9935 return false;
9936
9937 if (GET_CODE (x) != SYMBOL_REF)
9938 return false;
9939
9940 return SYMBOL_REF_TLS_MODEL (x) != 0;
9941 }
9942
9943 /* Classify a TLS symbol into one of the TLS kinds. */
9944 enum aarch64_symbol_type
9945 aarch64_classify_tls_symbol (rtx x)
9946 {
9947 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9948
9949 switch (tls_kind)
9950 {
9951 case TLS_MODEL_GLOBAL_DYNAMIC:
9952 case TLS_MODEL_LOCAL_DYNAMIC:
9953 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9954
9955 case TLS_MODEL_INITIAL_EXEC:
9956 switch (aarch64_cmodel)
9957 {
9958 case AARCH64_CMODEL_TINY:
9959 case AARCH64_CMODEL_TINY_PIC:
9960 return SYMBOL_TINY_TLSIE;
9961 default:
9962 return SYMBOL_SMALL_TLSIE;
9963 }
9964
9965 case TLS_MODEL_LOCAL_EXEC:
9966 if (aarch64_tls_size == 12)
9967 return SYMBOL_TLSLE12;
9968 else if (aarch64_tls_size == 24)
9969 return SYMBOL_TLSLE24;
9970 else if (aarch64_tls_size == 32)
9971 return SYMBOL_TLSLE32;
9972 else if (aarch64_tls_size == 48)
9973 return SYMBOL_TLSLE48;
9974 else
9975 gcc_unreachable ();
9976
9977 case TLS_MODEL_EMULATED:
9978 case TLS_MODEL_NONE:
9979 return SYMBOL_FORCE_TO_MEM;
9980
9981 default:
9982 gcc_unreachable ();
9983 }
9984 }
9985
9986 /* Return the method that should be used to access SYMBOL_REF or
9987 LABEL_REF X. */
9988
9989 enum aarch64_symbol_type
9990 aarch64_classify_symbol (rtx x, rtx offset)
9991 {
9992 if (GET_CODE (x) == LABEL_REF)
9993 {
9994 switch (aarch64_cmodel)
9995 {
9996 case AARCH64_CMODEL_LARGE:
9997 return SYMBOL_FORCE_TO_MEM;
9998
9999 case AARCH64_CMODEL_TINY_PIC:
10000 case AARCH64_CMODEL_TINY:
10001 return SYMBOL_TINY_ABSOLUTE;
10002
10003 case AARCH64_CMODEL_SMALL_SPIC:
10004 case AARCH64_CMODEL_SMALL_PIC:
10005 case AARCH64_CMODEL_SMALL:
10006 return SYMBOL_SMALL_ABSOLUTE;
10007
10008 default:
10009 gcc_unreachable ();
10010 }
10011 }
10012
10013 if (GET_CODE (x) == SYMBOL_REF)
10014 {
10015 if (aarch64_tls_symbol_p (x))
10016 return aarch64_classify_tls_symbol (x);
10017
10018 switch (aarch64_cmodel)
10019 {
10020 case AARCH64_CMODEL_TINY:
10021 /* When we retrieve symbol + offset address, we have to make sure
10022 the offset does not cause overflow of the final address. But
10023 we have no way of knowing the address of symbol at compile time
10024 so we can't accurately say if the distance between the PC and
10025 symbol + offset is outside the addressable range of +/-1M in the
10026 TINY code model. So we rely on images not being greater than
10027 1M, cap the offset at 1M, and require anything beyond that to
10028 be loaded using an alternative mechanism. Furthermore, if the
10029 symbol is a weak reference to something that isn't known to
10030 resolve to a symbol in this module, then force to memory. */
10031 if ((SYMBOL_REF_WEAK (x)
10032 && !aarch64_symbol_binds_local_p (x))
10033 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10034 return SYMBOL_FORCE_TO_MEM;
10035 return SYMBOL_TINY_ABSOLUTE;
10036
10037 case AARCH64_CMODEL_SMALL:
10038 /* Same reasoning as the tiny code model, but the offset cap here is
10039 4G. */
10040 if ((SYMBOL_REF_WEAK (x)
10041 && !aarch64_symbol_binds_local_p (x))
10042 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10043 HOST_WIDE_INT_C (4294967264)))
10044 return SYMBOL_FORCE_TO_MEM;
10045 return SYMBOL_SMALL_ABSOLUTE;
10046
10047 case AARCH64_CMODEL_TINY_PIC:
10048 if (!aarch64_symbol_binds_local_p (x))
10049 return SYMBOL_TINY_GOT;
10050 return SYMBOL_TINY_ABSOLUTE;
10051
10052 case AARCH64_CMODEL_SMALL_SPIC:
10053 case AARCH64_CMODEL_SMALL_PIC:
10054 if (!aarch64_symbol_binds_local_p (x))
10055 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10056 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10057 return SYMBOL_SMALL_ABSOLUTE;
10058
10059 case AARCH64_CMODEL_LARGE:
10060 /* This is alright even in PIC code as the constant
10061 pool reference is always PC relative and within
10062 the same translation unit. */
10063 if (CONSTANT_POOL_ADDRESS_P (x))
10064 return SYMBOL_SMALL_ABSOLUTE;
10065 else
10066 return SYMBOL_FORCE_TO_MEM;
10067
10068 default:
10069 gcc_unreachable ();
10070 }
10071 }
10072
10073 /* By default push everything into the constant pool. */
10074 return SYMBOL_FORCE_TO_MEM;
10075 }
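
/* Editor's note: roughly, the classifications above correspond to the
   following addressing sequences emitted elsewhere in this backend
   (illustrative only; "sym" is a placeholder):
     SYMBOL_TINY_ABSOLUTE    adr  x0, sym
     SYMBOL_SMALL_ABSOLUTE   adrp x0, sym
                             add  x0, x0, :lo12:sym
     SYMBOL_SMALL_GOT_4G     adrp x0, :got:sym
                             ldr  x0, [x0, :got_lo12:sym]
     SYMBOL_FORCE_TO_MEM     the address is placed in the literal/constant
                             pool and loaded with a pc-relative ldr.  */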
10076
10077 bool
10078 aarch64_constant_address_p (rtx x)
10079 {
10080 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10081 }
10082
10083 bool
10084 aarch64_legitimate_pic_operand_p (rtx x)
10085 {
10086 if (GET_CODE (x) == SYMBOL_REF
10087 || (GET_CODE (x) == CONST
10088 && GET_CODE (XEXP (x, 0)) == PLUS
10089 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10090 return false;
10091
10092 return true;
10093 }
10094
10095 /* Return true if X holds either a quarter-precision or
10096 floating-point +0.0 constant. */
10097 static bool
10098 aarch64_valid_floating_const (machine_mode mode, rtx x)
10099 {
10100 if (!CONST_DOUBLE_P (x))
10101 return false;
10102
10103 if (aarch64_float_const_zero_rtx_p (x))
10104 return true;
10105
10106 /* For modes other than SFmode and DFmode, only the +0.0 case above is accepted. */
10107 if (!(mode == SFmode || mode == DFmode))
10108 return false;
10109
10110 return aarch64_float_const_representable_p (x);
10111 }
10112
10113 static bool
10114 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10115 {
10116 /* Do not allow vector struct mode constants. We could support
10117 0 and -1 easily, but they need support in aarch64-simd.md. */
10118 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10119 return false;
10120
10121 /* This could probably go away because
10122 we now decompose CONST_INTs according to expand_mov_immediate. */
10123 if ((GET_CODE (x) == CONST_VECTOR
10124 && aarch64_simd_valid_immediate (x, mode, false, NULL))
10125 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
10126 return !targetm.cannot_force_const_mem (mode, x);
10127
10128 if (GET_CODE (x) == HIGH
10129 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10130 return true;
10131
10132 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10133 so spilling them is better than rematerialization. */
10134 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10135 return true;
10136
10137 return aarch64_constant_address_p (x);
10138 }
10139
10140 rtx
10141 aarch64_load_tp (rtx target)
10142 {
10143 if (!target
10144 || GET_MODE (target) != Pmode
10145 || !register_operand (target, Pmode))
10146 target = gen_reg_rtx (Pmode);
10147
10148 /* Can return in any reg. */
10149 emit_insn (gen_aarch64_load_tp_hard (target));
10150 return target;
10151 }
10152
10153 /* On AAPCS systems, this is the "struct __va_list". */
10154 static GTY(()) tree va_list_type;
10155
10156 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10157 Return the type to use as __builtin_va_list.
10158
10159 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10160
10161 struct __va_list
10162 {
10163 void *__stack;
10164 void *__gr_top;
10165 void *__vr_top;
10166 int __gr_offs;
10167 int __vr_offs;
10168 }; */
10169
10170 static tree
10171 aarch64_build_builtin_va_list (void)
10172 {
10173 tree va_list_name;
10174 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10175
10176 /* Create the type. */
10177 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10178 /* Give it the required name. */
10179 va_list_name = build_decl (BUILTINS_LOCATION,
10180 TYPE_DECL,
10181 get_identifier ("__va_list"),
10182 va_list_type);
10183 DECL_ARTIFICIAL (va_list_name) = 1;
10184 TYPE_NAME (va_list_type) = va_list_name;
10185 TYPE_STUB_DECL (va_list_type) = va_list_name;
10186
10187 /* Create the fields. */
10188 f_stack = build_decl (BUILTINS_LOCATION,
10189 FIELD_DECL, get_identifier ("__stack"),
10190 ptr_type_node);
10191 f_grtop = build_decl (BUILTINS_LOCATION,
10192 FIELD_DECL, get_identifier ("__gr_top"),
10193 ptr_type_node);
10194 f_vrtop = build_decl (BUILTINS_LOCATION,
10195 FIELD_DECL, get_identifier ("__vr_top"),
10196 ptr_type_node);
10197 f_groff = build_decl (BUILTINS_LOCATION,
10198 FIELD_DECL, get_identifier ("__gr_offs"),
10199 integer_type_node);
10200 f_vroff = build_decl (BUILTINS_LOCATION,
10201 FIELD_DECL, get_identifier ("__vr_offs"),
10202 integer_type_node);
10203
10204 /* Tell tree-stdarg pass about our internal offset fields.
10205 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
10206 purposes, to identify whether the code is updating va_list internal
10207 offset fields in an irregular way. */
10208 va_list_gpr_counter_field = f_groff;
10209 va_list_fpr_counter_field = f_vroff;
10210
10211 DECL_ARTIFICIAL (f_stack) = 1;
10212 DECL_ARTIFICIAL (f_grtop) = 1;
10213 DECL_ARTIFICIAL (f_vrtop) = 1;
10214 DECL_ARTIFICIAL (f_groff) = 1;
10215 DECL_ARTIFICIAL (f_vroff) = 1;
10216
10217 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10218 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10219 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10220 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10221 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10222
10223 TYPE_FIELDS (va_list_type) = f_stack;
10224 DECL_CHAIN (f_stack) = f_grtop;
10225 DECL_CHAIN (f_grtop) = f_vrtop;
10226 DECL_CHAIN (f_vrtop) = f_groff;
10227 DECL_CHAIN (f_groff) = f_vroff;
10228
10229 /* Compute its layout. */
10230 layout_type (va_list_type);
10231
10232 return va_list_type;
10233 }
10234
10235 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10236 static void
10237 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10238 {
10239 const CUMULATIVE_ARGS *cum;
10240 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10241 tree stack, grtop, vrtop, groff, vroff;
10242 tree t;
10243 int gr_save_area_size = cfun->va_list_gpr_size;
10244 int vr_save_area_size = cfun->va_list_fpr_size;
10245 int vr_offset;
10246
10247 cum = &crtl->args.info;
10248 if (cfun->va_list_gpr_size)
10249 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10250 cfun->va_list_gpr_size);
10251 if (cfun->va_list_fpr_size)
10252 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10253 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10254
10255 if (!TARGET_FLOAT)
10256 {
10257 gcc_assert (cum->aapcs_nvrn == 0);
10258 vr_save_area_size = 0;
10259 }
10260
10261 f_stack = TYPE_FIELDS (va_list_type_node);
10262 f_grtop = DECL_CHAIN (f_stack);
10263 f_vrtop = DECL_CHAIN (f_grtop);
10264 f_groff = DECL_CHAIN (f_vrtop);
10265 f_vroff = DECL_CHAIN (f_groff);
10266
10267 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10268 NULL_TREE);
10269 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10270 NULL_TREE);
10271 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10272 NULL_TREE);
10273 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10274 NULL_TREE);
10275 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10276 NULL_TREE);
10277
10278 /* Emit code to initialize STACK, which points to the next varargs stack
10279 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10280 by named arguments. STACK is 8-byte aligned. */
10281 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10282 if (cum->aapcs_stack_size > 0)
10283 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10284 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10285 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10286
10287 /* Emit code to initialize GRTOP, the top of the GR save area.
10288 virtual_incoming_args_rtx should have been 16 byte aligned. */
10289 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10290 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10291 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10292
10293 /* Emit code to initialize VRTOP, the top of the VR save area.
10294 This address is gr_save_area_bytes below GRTOP, rounded
10295 down to the next 16-byte boundary. */
10296 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10297 vr_offset = ROUND_UP (gr_save_area_size,
10298 STACK_BOUNDARY / BITS_PER_UNIT);
10299
10300 if (vr_offset)
10301 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10302 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10303 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10304
10305 /* Emit code to initialize GROFF, the offset from GRTOP of the
10306 next GPR argument. */
10307 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10308 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10309 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10310
10311 /* Likewise emit code to initialize VROFF, the offset from FTOP
10312 of the next VR argument. */
10313 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10314 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10315 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10316 }
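
/* Editor's note: a worked example of the initialization above, assuming a
   definition such as "int f (int fixed, ...)" compiled with floating point
   enabled and the default va_list save sizes.  After the single named GPR
   argument, aapcs_ncrn == 1, aapcs_nvrn == 0 and aapcs_stack_size == 0, so
     gr_save_area_size = (8 - 1) * 8  = 56 bytes
     vr_save_area_size = (8 - 0) * 16 = 128 bytes
   and va_start sets
     __stack   = virtual_incoming_args_rtx (no named stack arguments)
     __gr_top  = virtual_incoming_args_rtx
     __vr_top  = __gr_top - ROUND_UP (56, 16) = __gr_top - 64
     __gr_offs = -56
     __vr_offs = -128.  */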
10317
10318 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10319
10320 static tree
10321 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10322 gimple_seq *post_p ATTRIBUTE_UNUSED)
10323 {
10324 tree addr;
10325 bool indirect_p;
10326 bool is_ha; /* is HFA or HVA. */
10327 bool dw_align; /* double-word align. */
10328 machine_mode ag_mode = VOIDmode;
10329 int nregs;
10330 machine_mode mode;
10331
10332 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10333 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10334 HOST_WIDE_INT size, rsize, adjust, align;
10335 tree t, u, cond1, cond2;
10336
10337 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10338 if (indirect_p)
10339 type = build_pointer_type (type);
10340
10341 mode = TYPE_MODE (type);
10342
10343 f_stack = TYPE_FIELDS (va_list_type_node);
10344 f_grtop = DECL_CHAIN (f_stack);
10345 f_vrtop = DECL_CHAIN (f_grtop);
10346 f_groff = DECL_CHAIN (f_vrtop);
10347 f_vroff = DECL_CHAIN (f_groff);
10348
10349 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10350 f_stack, NULL_TREE);
10351 size = int_size_in_bytes (type);
10352 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10353
10354 dw_align = false;
10355 adjust = 0;
10356 if (aarch64_vfp_is_call_or_return_candidate (mode,
10357 type,
10358 &ag_mode,
10359 &nregs,
10360 &is_ha))
10361 {
10362 /* TYPE passed in fp/simd registers. */
10363 if (!TARGET_FLOAT)
10364 aarch64_err_no_fpadvsimd (mode, "varargs");
10365
10366 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10367 unshare_expr (valist), f_vrtop, NULL_TREE);
10368 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10369 unshare_expr (valist), f_vroff, NULL_TREE);
10370
10371 rsize = nregs * UNITS_PER_VREG;
10372
10373 if (is_ha)
10374 {
10375 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10376 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10377 }
10378 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10379 && size < UNITS_PER_VREG)
10380 {
10381 adjust = UNITS_PER_VREG - size;
10382 }
10383 }
10384 else
10385 {
10386 /* TYPE passed in general registers. */
10387 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10388 unshare_expr (valist), f_grtop, NULL_TREE);
10389 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10390 unshare_expr (valist), f_groff, NULL_TREE);
10391 rsize = ROUND_UP (size, UNITS_PER_WORD);
10392 nregs = rsize / UNITS_PER_WORD;
10393
10394 if (align > 8)
10395 dw_align = true;
10396
10397 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10398 && size < UNITS_PER_WORD)
10399 {
10400 adjust = UNITS_PER_WORD - size;
10401 }
10402 }
10403
10404 /* Get a local temporary for the field value. */
10405 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10406
10407 /* Emit code to branch if off >= 0. */
10408 t = build2 (GE_EXPR, boolean_type_node, off,
10409 build_int_cst (TREE_TYPE (off), 0));
10410 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10411
10412 if (dw_align)
10413 {
10414 /* Emit: offs = (offs + 15) & -16. */
10415 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10416 build_int_cst (TREE_TYPE (off), 15));
10417 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10418 build_int_cst (TREE_TYPE (off), -16));
10419 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10420 }
10421 else
10422 roundup = NULL;
10423
10424 /* Update ap.__[g|v]r_offs */
10425 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10426 build_int_cst (TREE_TYPE (off), rsize));
10427 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10428
10429 /* String up. */
10430 if (roundup)
10431 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10432
10433 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10434 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10435 build_int_cst (TREE_TYPE (f_off), 0));
10436 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10437
10438 /* String up: make sure the assignment happens before the use. */
10439 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10440 COND_EXPR_ELSE (cond1) = t;
10441
10442 /* Prepare the trees handling the argument that is passed on the stack;
10443 the top-level node will be stored in ON_STACK. */
10444 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10445 if (align > 8)
10446 {
10447 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10448 t = fold_convert (intDI_type_node, arg);
10449 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10450 build_int_cst (TREE_TYPE (t), 15));
10451 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10452 build_int_cst (TREE_TYPE (t), -16));
10453 t = fold_convert (TREE_TYPE (arg), t);
10454 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10455 }
10456 else
10457 roundup = NULL;
10458 /* Advance ap.__stack */
10459 t = fold_convert (intDI_type_node, arg);
10460 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10461 build_int_cst (TREE_TYPE (t), size + 7));
10462 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10463 build_int_cst (TREE_TYPE (t), -8));
10464 t = fold_convert (TREE_TYPE (arg), t);
10465 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10466 /* String up roundup and advance. */
10467 if (roundup)
10468 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10469 /* String up with arg */
10470 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10471 /* Big-endianness related address adjustment. */
10472 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10473 && size < UNITS_PER_WORD)
10474 {
10475 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10476 size_int (UNITS_PER_WORD - size));
10477 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10478 }
10479
10480 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10481 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10482
10483 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10484 t = off;
10485 if (adjust)
10486 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10487 build_int_cst (TREE_TYPE (off), adjust));
10488
10489 t = fold_convert (sizetype, t);
10490 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10491
10492 if (is_ha)
10493 {
10494 /* type ha; // treat as "struct {ftype field[n];}"
10495 ... [computing offs]
10496 for (i = 0; i < nregs; ++i, offs += 16)
10497 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10498 return ha; */
10499 int i;
10500 tree tmp_ha, field_t, field_ptr_t;
10501
10502 /* Declare a local variable. */
10503 tmp_ha = create_tmp_var_raw (type, "ha");
10504 gimple_add_tmp_var (tmp_ha);
10505
10506 /* Establish the base type. */
10507 switch (ag_mode)
10508 {
10509 case SFmode:
10510 field_t = float_type_node;
10511 field_ptr_t = float_ptr_type_node;
10512 break;
10513 case DFmode:
10514 field_t = double_type_node;
10515 field_ptr_t = double_ptr_type_node;
10516 break;
10517 case TFmode:
10518 field_t = long_double_type_node;
10519 field_ptr_t = long_double_ptr_type_node;
10520 break;
10521 case HFmode:
10522 field_t = aarch64_fp16_type_node;
10523 field_ptr_t = aarch64_fp16_ptr_type_node;
10524 break;
10525 case V2SImode:
10526 case V4SImode:
10527 {
10528 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10529 field_t = build_vector_type_for_mode (innertype, ag_mode);
10530 field_ptr_t = build_pointer_type (field_t);
10531 }
10532 break;
10533 default:
10534 gcc_assert (0);
10535 }
10536
10537 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area)  */
10538 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10539 addr = t;
10540 t = fold_convert (field_ptr_t, addr);
10541 t = build2 (MODIFY_EXPR, field_t,
10542 build1 (INDIRECT_REF, field_t, tmp_ha),
10543 build1 (INDIRECT_REF, field_t, t));
10544
10545 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10546 for (i = 1; i < nregs; ++i)
10547 {
10548 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10549 u = fold_convert (field_ptr_t, addr);
10550 u = build2 (MODIFY_EXPR, field_t,
10551 build2 (MEM_REF, field_t, tmp_ha,
10552 build_int_cst (field_ptr_t,
10553 (i *
10554 int_size_in_bytes (field_t)))),
10555 build1 (INDIRECT_REF, field_t, u));
10556 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10557 }
10558
10559 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10560 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10561 }
10562
10563 COND_EXPR_ELSE (cond2) = t;
10564 addr = fold_convert (build_pointer_type (type), cond1);
10565 addr = build_va_arg_indirect_ref (addr);
10566
10567 if (indirect_p)
10568 addr = build_va_arg_indirect_ref (addr);
10569
10570 return addr;
10571 }
10572
10573 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10574
10575 static void
10576 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10577 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10578 int no_rtl)
10579 {
10580 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10581 CUMULATIVE_ARGS local_cum;
10582 int gr_saved = cfun->va_list_gpr_size;
10583 int vr_saved = cfun->va_list_fpr_size;
10584
10585 /* The caller has advanced CUM up to, but not beyond, the last named
10586 argument. Advance a local copy of CUM past the last "real" named
10587 argument, to find out how many registers are left over. */
10588 local_cum = *cum;
10589 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
10590
10591 /* Find out how many registers we need to save.
10592 Honor tree-stdarg analysis results. */
10593 if (cfun->va_list_gpr_size)
10594 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10595 cfun->va_list_gpr_size / UNITS_PER_WORD);
10596 if (cfun->va_list_fpr_size)
10597 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10598 cfun->va_list_fpr_size / UNITS_PER_VREG);
10599
10600 if (!TARGET_FLOAT)
10601 {
10602 gcc_assert (local_cum.aapcs_nvrn == 0);
10603 vr_saved = 0;
10604 }
10605
10606 if (!no_rtl)
10607 {
10608 if (gr_saved > 0)
10609 {
10610 rtx ptr, mem;
10611
10612 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10613 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10614 - gr_saved * UNITS_PER_WORD);
10615 mem = gen_frame_mem (BLKmode, ptr);
10616 set_mem_alias_set (mem, get_varargs_alias_set ());
10617
10618 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10619 mem, gr_saved);
10620 }
10621 if (vr_saved > 0)
10622 {
10623 /* We can't use move_block_from_reg, because it will use
10624 the wrong mode, storing D regs only. */
10625 machine_mode mode = TImode;
10626 int off, i, vr_start;
10627
10628 /* Set OFF to the offset from virtual_incoming_args_rtx of
10629 the first vector register. The VR save area lies below
10630 the GR one, and is aligned to 16 bytes. */
10631 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10632 STACK_BOUNDARY / BITS_PER_UNIT);
10633 off -= vr_saved * UNITS_PER_VREG;
10634
10635 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10636 for (i = 0; i < vr_saved; ++i)
10637 {
10638 rtx ptr, mem;
10639
10640 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10641 mem = gen_frame_mem (mode, ptr);
10642 set_mem_alias_set (mem, get_varargs_alias_set ());
10643 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10644 off += UNITS_PER_VREG;
10645 }
10646 }
10647 }
10648
10649 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10650 any complication of having crtl->args.pretend_args_size changed. */
10651 cfun->machine->frame.saved_varargs_size
10652 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10653 STACK_BOUNDARY / BITS_PER_UNIT)
10654 + vr_saved * UNITS_PER_VREG);
10655 }
10656
10657 static void
10658 aarch64_conditional_register_usage (void)
10659 {
10660 int i;
10661 if (!TARGET_FLOAT)
10662 {
10663 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10664 {
10665 fixed_regs[i] = 1;
10666 call_used_regs[i] = 1;
10667 }
10668 }
10669 }
10670
10671 /* Walk down the type tree of TYPE counting consecutive base elements.
10672 If *MODEP is VOIDmode, then set it to the first valid floating point
10673 type. If a non-floating point type is found, or if a floating point
10674 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10675 otherwise return the count in the sub-tree. */
10676 static int
10677 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10678 {
10679 machine_mode mode;
10680 HOST_WIDE_INT size;
10681
10682 switch (TREE_CODE (type))
10683 {
10684 case REAL_TYPE:
10685 mode = TYPE_MODE (type);
10686 if (mode != DFmode && mode != SFmode
10687 && mode != TFmode && mode != HFmode)
10688 return -1;
10689
10690 if (*modep == VOIDmode)
10691 *modep = mode;
10692
10693 if (*modep == mode)
10694 return 1;
10695
10696 break;
10697
10698 case COMPLEX_TYPE:
10699 mode = TYPE_MODE (TREE_TYPE (type));
10700 if (mode != DFmode && mode != SFmode
10701 && mode != TFmode && mode != HFmode)
10702 return -1;
10703
10704 if (*modep == VOIDmode)
10705 *modep = mode;
10706
10707 if (*modep == mode)
10708 return 2;
10709
10710 break;
10711
10712 case VECTOR_TYPE:
10713 /* Use V2SImode and V4SImode as representatives of all 64-bit
10714 and 128-bit vector types. */
10715 size = int_size_in_bytes (type);
10716 switch (size)
10717 {
10718 case 8:
10719 mode = V2SImode;
10720 break;
10721 case 16:
10722 mode = V4SImode;
10723 break;
10724 default:
10725 return -1;
10726 }
10727
10728 if (*modep == VOIDmode)
10729 *modep = mode;
10730
10731 /* Vector modes are considered to be opaque: two vectors are
10732 equivalent for the purposes of being homogeneous aggregates
10733 if they are the same size. */
10734 if (*modep == mode)
10735 return 1;
10736
10737 break;
10738
10739 case ARRAY_TYPE:
10740 {
10741 int count;
10742 tree index = TYPE_DOMAIN (type);
10743
10744 /* Can't handle incomplete types nor sizes that are not
10745 fixed. */
10746 if (!COMPLETE_TYPE_P (type)
10747 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10748 return -1;
10749
10750 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10751 if (count == -1
10752 || !index
10753 || !TYPE_MAX_VALUE (index)
10754 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10755 || !TYPE_MIN_VALUE (index)
10756 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10757 || count < 0)
10758 return -1;
10759
10760 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10761 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10762
10763 /* There must be no padding. */
10764 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10765 return -1;
10766
10767 return count;
10768 }
10769
10770 case RECORD_TYPE:
10771 {
10772 int count = 0;
10773 int sub_count;
10774 tree field;
10775
10776 /* Can't handle incomplete types nor sizes that are not
10777 fixed. */
10778 if (!COMPLETE_TYPE_P (type)
10779 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10780 return -1;
10781
10782 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10783 {
10784 if (TREE_CODE (field) != FIELD_DECL)
10785 continue;
10786
10787 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10788 if (sub_count < 0)
10789 return -1;
10790 count += sub_count;
10791 }
10792
10793 /* There must be no padding. */
10794 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10795 return -1;
10796
10797 return count;
10798 }
10799
10800 case UNION_TYPE:
10801 case QUAL_UNION_TYPE:
10802 {
10803 /* These aren't very interesting except in a degenerate case. */
10804 int count = 0;
10805 int sub_count;
10806 tree field;
10807
10808 /* Can't handle incomplete types nor sizes that are not
10809 fixed. */
10810 if (!COMPLETE_TYPE_P (type)
10811 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10812 return -1;
10813
10814 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10815 {
10816 if (TREE_CODE (field) != FIELD_DECL)
10817 continue;
10818
10819 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10820 if (sub_count < 0)
10821 return -1;
10822 count = count > sub_count ? count : sub_count;
10823 }
10824
10825 /* There must be no padding. */
10826 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10827 return -1;
10828
10829 return count;
10830 }
10831
10832 default:
10833 break;
10834 }
10835
10836 return -1;
10837 }
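
/* Editor's note: illustrative results of the walk above (the types are
   examples only):
     struct { float x, y, z; }        -> 3, *modep == SFmode
     struct { double d[4]; }          -> 4, *modep == DFmode
     _Complex double                  -> 2, *modep == DFmode
     struct { float f; double d; }    -> -1 (mixed element modes)
     struct { float f; int i; }       -> -1 (non-floating-point member).  */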
10838
10839 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10840 type as described in AAPCS64 \S 4.1.2.
10841
10842 See the comment above aarch64_composite_type_p for the notes on MODE. */
10843
10844 static bool
10845 aarch64_short_vector_p (const_tree type,
10846 machine_mode mode)
10847 {
10848 HOST_WIDE_INT size = -1;
10849
10850 if (type && TREE_CODE (type) == VECTOR_TYPE)
10851 size = int_size_in_bytes (type);
10852 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10853 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10854 size = GET_MODE_SIZE (mode);
10855
10856 return (size == 8 || size == 16);
10857 }
10858
10859 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10860 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10861 array types. The C99 floating-point complex types are also considered
10862 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10863 types, which are GCC extensions and out of the scope of AAPCS64, are
10864 treated as composite types here as well.
10865
10866 Note that MODE itself is not sufficient in determining whether a type
10867 is such a composite type or not. This is because
10868 stor-layout.c:compute_record_mode may have already changed the MODE
10869 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10870 structure with only one field may have its MODE set to the mode of the
10871 field. Also an integer mode whose size matches the size of the
10872 RECORD_TYPE type may be used to substitute the original mode
10873 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10874 solely relied on. */
10875
10876 static bool
10877 aarch64_composite_type_p (const_tree type,
10878 machine_mode mode)
10879 {
10880 if (aarch64_short_vector_p (type, mode))
10881 return false;
10882
10883 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10884 return true;
10885
10886 if (mode == BLKmode
10887 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10888 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10889 return true;
10890
10891 return false;
10892 }
10893
10894 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10895 shall be passed or returned in simd/fp register(s) (providing these
10896 parameter passing registers are available).
10897
10898 Upon successful return, *COUNT returns the number of needed registers,
10899 *BASE_MODE returns the mode of the individual register and, when IS_HA
10900 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10901 floating-point aggregate or a homogeneous short-vector aggregate. */
10902
10903 static bool
10904 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10905 const_tree type,
10906 machine_mode *base_mode,
10907 int *count,
10908 bool *is_ha)
10909 {
10910 machine_mode new_mode = VOIDmode;
10911 bool composite_p = aarch64_composite_type_p (type, mode);
10912
10913 if (is_ha != NULL) *is_ha = false;
10914
10915 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10916 || aarch64_short_vector_p (type, mode))
10917 {
10918 *count = 1;
10919 new_mode = mode;
10920 }
10921 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10922 {
10923 if (is_ha != NULL) *is_ha = true;
10924 *count = 2;
10925 new_mode = GET_MODE_INNER (mode);
10926 }
10927 else if (type && composite_p)
10928 {
10929 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10930
10931 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10932 {
10933 if (is_ha != NULL) *is_ha = true;
10934 *count = ag_count;
10935 }
10936 else
10937 return false;
10938 }
10939 else
10940 return false;
10941
10942 *base_mode = new_mode;
10943 return true;
10944 }
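
/* Editor's note: a sketch of how the predicate above behaves for a few
   example types (assuming the FP/SIMD registers are available):
     double                       -> true,  *count == 1, *base_mode == DFmode
     _Complex float               -> true,  *count == 2, *base_mode == SFmode,
                                    *is_ha == true
     struct { double d[3]; }      -> true,  *count == 3, *base_mode == DFmode,
                                    *is_ha == true
     struct { double d[5]; }      -> false (more than HA_MAX_NUM_FLDS members)
     struct { double d; int i; }  -> false (not homogeneous).  */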
10945
10946 /* Implement TARGET_STRUCT_VALUE_RTX. */
10947
10948 static rtx
10949 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10950 int incoming ATTRIBUTE_UNUSED)
10951 {
10952 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10953 }
10954
10955 /* Implements target hook vector_mode_supported_p. */
10956 static bool
10957 aarch64_vector_mode_supported_p (machine_mode mode)
10958 {
10959 if (TARGET_SIMD
10960 && (mode == V4SImode || mode == V8HImode
10961 || mode == V16QImode || mode == V2DImode
10962 || mode == V2SImode || mode == V4HImode
10963 || mode == V8QImode || mode == V2SFmode
10964 || mode == V4SFmode || mode == V2DFmode
10965 || mode == V4HFmode || mode == V8HFmode
10966 || mode == V1DFmode))
10967 return true;
10968
10969 return false;
10970 }
10971
10972 /* Return appropriate SIMD container
10973 for MODE within a vector of WIDTH bits. */
10974 static machine_mode
10975 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10976 {
10977 gcc_assert (width == 64 || width == 128);
10978 if (TARGET_SIMD)
10979 {
10980 if (width == 128)
10981 switch (mode)
10982 {
10983 case DFmode:
10984 return V2DFmode;
10985 case SFmode:
10986 return V4SFmode;
10987 case HFmode:
10988 return V8HFmode;
10989 case SImode:
10990 return V4SImode;
10991 case HImode:
10992 return V8HImode;
10993 case QImode:
10994 return V16QImode;
10995 case DImode:
10996 return V2DImode;
10997 default:
10998 break;
10999 }
11000 else
11001 switch (mode)
11002 {
11003 case SFmode:
11004 return V2SFmode;
11005 case HFmode:
11006 return V4HFmode;
11007 case SImode:
11008 return V2SImode;
11009 case HImode:
11010 return V4HImode;
11011 case QImode:
11012 return V8QImode;
11013 default:
11014 break;
11015 }
11016 }
11017 return word_mode;
11018 }
11019
11020 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11021 static machine_mode
11022 aarch64_preferred_simd_mode (machine_mode mode)
11023 {
11024 return aarch64_simd_container_mode (mode, 128);
11025 }
11026
11027 /* Return the bitmask of possible vector sizes for the vectorizer
11028 to iterate over. */
11029 static unsigned int
11030 aarch64_autovectorize_vector_sizes (void)
11031 {
11032 return (16 | 8);
11033 }
11034
11035 /* Implement TARGET_MANGLE_TYPE. */
11036
11037 static const char *
11038 aarch64_mangle_type (const_tree type)
11039 {
11040 /* The AArch64 ABI documents say that "__va_list" has to be
11041 mangled as if it were in the "std" namespace. */
11042 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11043 return "St9__va_list";
11044
11045 /* Half-precision float. */
11046 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11047 return "Dh";
11048
11049 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11050 builtin types. */
11051 if (TYPE_NAME (type) != NULL)
11052 return aarch64_mangle_builtin_type (type);
11053
11054 /* Use the default mangling. */
11055 return NULL;
11056 }
11057
11058 /* Find the first rtx_insn before insn that will generate an assembly
11059 instruction. */
11060
11061 static rtx_insn *
11062 aarch64_prev_real_insn (rtx_insn *insn)
11063 {
11064 if (!insn)
11065 return NULL;
11066
11067 do
11068 {
11069 insn = prev_real_insn (insn);
11070 }
11071 while (insn && recog_memoized (insn) < 0);
11072
11073 return insn;
11074 }
11075
11076 static bool
11077 is_madd_op (enum attr_type t1)
11078 {
11079 unsigned int i;
11080 /* A number of these may be AArch32 only. */
11081 enum attr_type mlatypes[] = {
11082 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11083 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11084 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11085 };
11086
11087 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11088 {
11089 if (t1 == mlatypes[i])
11090 return true;
11091 }
11092
11093 return false;
11094 }
11095
11096 /* Check if there is a register dependency between a load and the insn
11097 for which we hold recog_data. */
11098
11099 static bool
11100 dep_between_memop_and_curr (rtx memop)
11101 {
11102 rtx load_reg;
11103 int opno;
11104
11105 gcc_assert (GET_CODE (memop) == SET);
11106
11107 if (!REG_P (SET_DEST (memop)))
11108 return false;
11109
11110 load_reg = SET_DEST (memop);
11111 for (opno = 1; opno < recog_data.n_operands; opno++)
11112 {
11113 rtx operand = recog_data.operand[opno];
11114 if (REG_P (operand)
11115 && reg_overlap_mentioned_p (load_reg, operand))
11116 return true;
11117
11118 }
11119 return false;
11120 }
11121
11122
11123 /* When working around the Cortex-A53 erratum 835769,
11124 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11125 instruction and has a preceding memory instruction such that a NOP
11126 should be inserted between them. */
11127
11128 bool
11129 aarch64_madd_needs_nop (rtx_insn* insn)
11130 {
11131 enum attr_type attr_type;
11132 rtx_insn *prev;
11133 rtx body;
11134
11135 if (!TARGET_FIX_ERR_A53_835769)
11136 return false;
11137
11138 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11139 return false;
11140
11141 attr_type = get_attr_type (insn);
11142 if (!is_madd_op (attr_type))
11143 return false;
11144
11145 prev = aarch64_prev_real_insn (insn);
11146 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11147 Restore recog state to INSN to avoid state corruption. */
11148 extract_constrain_insn_cached (insn);
11149
11150 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11151 return false;
11152
11153 body = single_set (prev);
11154
11155 /* If the previous insn is a memory op and there is no dependency between
11156 it and the DImode madd, emit a NOP between them. If body is NULL then we
11157 have a complex memory operation, probably a load/store pair.
11158 Be conservative for now and emit a NOP. */
11159 if (GET_MODE (recog_data.operand[0]) == DImode
11160 && (!body || !dep_between_memop_and_curr (body)))
11161 return true;
11162
11163 return false;
11164
11165 }
11166
11167
11168 /* Implement FINAL_PRESCAN_INSN. */
11169
11170 void
11171 aarch64_final_prescan_insn (rtx_insn *insn)
11172 {
11173 if (aarch64_madd_needs_nop (insn))
11174 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11175 }
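
/* Editor's note: an illustrative (not verbatim) fragment of the output when
   the erratum 835769 workaround fires, i.e. a 64-bit multiply-accumulate
   directly preceded by a memory operation:
       ldr   x3, [x2]
       nop // between mem op and mult-accumulate
       madd  x0, x1, x4, x5
   When the workaround is disabled the nop is not emitted.  */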
11176
11177
11178 /* Return the equivalent letter for size. */
11179 static char
11180 sizetochar (int size)
11181 {
11182 switch (size)
11183 {
11184 case 64: return 'd';
11185 case 32: return 's';
11186 case 16: return 'h';
11187 case 8 : return 'b';
11188 default: gcc_unreachable ();
11189 }
11190 }
11191
11192 /* Return true iff X is a uniform vector of floating-point
11193 constants, and the constant can be represented in
11194 quarter-precision form. Note, as aarch64_float_const_representable_p
11195 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11196 static bool
11197 aarch64_vect_float_const_representable_p (rtx x)
11198 {
11199 rtx elt;
11200 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11201 && const_vec_duplicate_p (x, &elt)
11202 && aarch64_float_const_representable_p (elt));
11203 }
11204
11205 /* Return true if OP is a valid AdvSIMD immediate for MODE; if INFO is nonnull, describe how to generate it. */
11206 bool
11207 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11208 struct simd_immediate_info *info)
11209 {
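  /* Helper for the checks below: test whether every STRIDE-th group of bytes
     matches TEST; if so, record the immediate class CLASS, element size
     ELSIZE, shift amount SHIFT and whether the constant is in negated (MVN)
     form NEG, then stop trying further encodings.  */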
11210 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11211 matches = 1; \
11212 for (i = 0; i < idx; i += (STRIDE)) \
11213 if (!(TEST)) \
11214 matches = 0; \
11215 if (matches) \
11216 { \
11217 immtype = (CLASS); \
11218 elsize = (ELSIZE); \
11219 eshift = (SHIFT); \
11220 emvn = (NEG); \
11221 break; \
11222 }
11223
11224 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11225 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11226 unsigned char bytes[16];
11227 int immtype = -1, matches;
11228 unsigned int invmask = inverse ? 0xff : 0;
11229 int eshift, emvn;
11230
11231 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11232 {
11233 if (! (aarch64_simd_imm_zero_p (op, mode)
11234 || aarch64_vect_float_const_representable_p (op)))
11235 return false;
11236
11237 if (info)
11238 {
11239 info->value = CONST_VECTOR_ELT (op, 0);
11240 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11241 info->mvn = false;
11242 info->shift = 0;
11243 }
11244
11245 return true;
11246 }
11247
11248 /* Splat vector constant out into a byte vector. */
11249 for (i = 0; i < n_elts; i++)
11250 {
11251 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11252 it must be laid out in the vector register in reverse order. */
11253 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11254 unsigned HOST_WIDE_INT elpart;
11255
11256 gcc_assert (CONST_INT_P (el));
11257 elpart = INTVAL (el);
11258
11259 for (unsigned int byte = 0; byte < innersize; byte++)
11260 {
11261 bytes[idx++] = (elpart & 0xff) ^ invmask;
11262 elpart >>= BITS_PER_UNIT;
11263 }
11264
11265 }
11266
11267 /* Sanity check. */
11268 gcc_assert (idx == GET_MODE_SIZE (mode));
11269
11270 do
11271 {
11272 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11273 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11274
11275 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11276 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11277
11278 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11279 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11280
11281 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11282 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11283
11284 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11285
11286 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11287
11288 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11289 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11290
11291 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11292 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11293
11294 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11295 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11296
11297 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11298 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11299
11300 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11301
11302 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11303
11304 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11305 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11306
11307 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11308 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11309
11310 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11311 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11312
11313 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11314 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11315
11316 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11317
11318 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11319 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11320 }
11321 while (0);
11322
11323 if (immtype == -1)
11324 return false;
11325
11326 if (info)
11327 {
11328 info->element_width = elsize;
11329 info->mvn = emvn != 0;
11330 info->shift = eshift;
11331
11332 unsigned HOST_WIDE_INT imm = 0;
11333
11334 if (immtype >= 12 && immtype <= 15)
11335 info->msl = true;
11336
11337 /* Un-invert bytes of recognized vector, if necessary. */
11338 if (invmask != 0)
11339 for (i = 0; i < idx; i++)
11340 bytes[i] ^= invmask;
11341
11342 if (immtype == 17)
11343 {
11344 /* FIXME: Broken on 32-bit H_W_I hosts. */
11345 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11346
11347 for (i = 0; i < 8; i++)
11348 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11349 << (i * BITS_PER_UNIT);
11350
11351
11352 info->value = GEN_INT (imm);
11353 }
11354 else
11355 {
11356 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11357 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11358
11359 /* Construct 'abcdefgh' because the assembler cannot handle
11360 generic constants. */
11361 if (info->mvn)
11362 imm = ~imm;
11363 imm = (imm >> info->shift) & 0xff;
11364 info->value = GEN_INT (imm);
11365 }
11366 }
11367
11368 return true;
11369 #undef CHECK
11370 }
11371
11372 /* Check that immediate shift constants are within range. */
11373 bool
11374 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11375 {
11376 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11377 if (left)
11378 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11379 else
11380 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11381 }
11382
11383 /* Return true if X is a uniform vector where all elements
11384 are either the floating-point constant 0.0 or the
11385 integer constant 0. */
11386 bool
11387 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11388 {
11389 return x == CONST0_RTX (mode);
11390 }
11391
11392
11393 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11394 operation of width WIDTH at bit position POS. */
11395
11396 rtx
11397 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11398 {
11399 gcc_assert (CONST_INT_P (width));
11400 gcc_assert (CONST_INT_P (pos));
11401
11402 unsigned HOST_WIDE_INT mask
11403 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11404 return GEN_INT (mask << UINTVAL (pos));
11405 }
11406
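/* Return true if X is a 64-bit integer immediate in which every byte is
   either 0x00 or 0xff, i.e. a candidate for a single scalar MOVI onto an
   FP/SIMD register.  */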
11407 bool
11408 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
11409 {
11410 HOST_WIDE_INT imm = INTVAL (x);
11411 int i;
11412
11413 for (i = 0; i < 8; i++)
11414 {
11415 unsigned int byte = imm & 0xff;
11416 if (byte != 0xff && byte != 0)
11417 return false;
11418 imm >>= 8;
11419 }
11420
11421 return true;
11422 }
11423
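/* Return true if X is a legitimate constant operand for a scalar MOV in
   MODE: a HIGH of a valid symbolic reference, any CONST_INT, a constant
   DImode SYMBOL_REF address, or a symbol classified as SYMBOL_TINY_ABSOLUTE.  */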
11424 bool
11425 aarch64_mov_operand_p (rtx x, machine_mode mode)
11426 {
11427 if (GET_CODE (x) == HIGH
11428 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11429 return true;
11430
11431 if (CONST_INT_P (x))
11432 return true;
11433
11434 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11435 return true;
11436
11437 return aarch64_classify_symbolic_expression (x)
11438 == SYMBOL_TINY_ABSOLUTE;
11439 }
11440
11441 /* Return a CONST_VECTOR of MODE in which every element is VAL. */
11442 rtx
11443 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11444 {
11445 int nunits = GET_MODE_NUNITS (mode);
11446 rtvec v = rtvec_alloc (nunits);
11447 int i;
11448
11449 rtx cache = GEN_INT (val);
11450
11451 for (i = 0; i < nunits; i++)
11452 RTVEC_ELT (v, i) = cache;
11453
11454 return gen_rtx_CONST_VECTOR (mode, v);
11455 }
11456
11457 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11458
11459 bool
11460 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11461 {
11462 machine_mode vmode;
11463
11464 gcc_assert (!VECTOR_MODE_P (mode));
11465 vmode = aarch64_preferred_simd_mode (mode);
11466 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11467 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11468 }
11469
11470 /* Construct and return a PARALLEL RTX vector with elements numbering the
11471 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11472 the vector - from the perspective of the architecture. This does not
11473 line up with GCC's perspective on lane numbers, so we end up with
11474 different masks depending on our target endianness. The diagram
11475 below may help. We must draw the distinction when building masks
11476 which select one half of the vector. An instruction selecting
11477 architectural low-lanes for a big-endian target must be described using
11478 a mask selecting GCC high-lanes.
11479
11480 Big-Endian Little-Endian
11481
11482 GCC 0 1 2 3 3 2 1 0
11483 | x | x | x | x | | x | x | x | x |
11484 Architecture 3 2 1 0 3 2 1 0
11485
11486 Low Mask: { 2, 3 } { 0, 1 }
11487 High Mask: { 0, 1 } { 2, 3 }
11488 */
11489
11490 rtx
11491 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11492 {
11493 int nunits = GET_MODE_NUNITS (mode);
11494 rtvec v = rtvec_alloc (nunits / 2);
11495 int high_base = nunits / 2;
11496 int low_base = 0;
11497 int base;
11498 rtx t1;
11499 int i;
11500
11501 if (BYTES_BIG_ENDIAN)
11502 base = high ? low_base : high_base;
11503 else
11504 base = high ? high_base : low_base;
11505
11506 for (i = 0; i < nunits / 2; i++)
11507 RTVEC_ELT (v, i) = GEN_INT (base + i);
11508
11509 t1 = gen_rtx_PARALLEL (mode, v);
11510 return t1;
11511 }
11512
11513 /* Check OP for validity as a PARALLEL RTX vector with elements
11514 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11515 from the perspective of the architecture. See the diagram above
11516 aarch64_simd_vect_par_cnst_half for more details. */
11517
11518 bool
11519 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11520 bool high)
11521 {
11522 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11523 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11524 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11525 int i = 0;
11526
11527 if (!VECTOR_MODE_P (mode))
11528 return false;
11529
11530 if (count_op != count_ideal)
11531 return false;
11532
11533 for (i = 0; i < count_ideal; i++)
11534 {
11535 rtx elt_op = XVECEXP (op, 0, i);
11536 rtx elt_ideal = XVECEXP (ideal, 0, i);
11537
11538 if (!CONST_INT_P (elt_op)
11539 || INTVAL (elt_ideal) != INTVAL (elt_op))
11540 return false;
11541 }
11542 return true;
11543 }
11544
11545 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11546 HIGH (exclusive). */
11547 void
11548 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11549 const_tree exp)
11550 {
11551 HOST_WIDE_INT lane;
11552 gcc_assert (CONST_INT_P (operand));
11553 lane = INTVAL (operand);
11554
11555 if (lane < low || lane >= high)
11556 {
11557 if (exp)
11558 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11559 else
11560 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11561 }
11562 }
11563
11564 /* Return TRUE if OP is a valid vector addressing mode. */
11565 bool
11566 aarch64_simd_mem_operand_p (rtx op)
11567 {
11568 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11569 || REG_P (XEXP (op, 0)));
11570 }
11571
11572 /* Emit a register copy from operands[1] to operands[0], taking care not to
11573 early-clobber source registers in the process.
11574
11575 COUNT is the number of components into which the copy needs to be
11576 decomposed. */
11577 void
11578 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11579 unsigned int count)
11580 {
11581 unsigned int i;
11582 int rdest = REGNO (operands[0]);
11583 int rsrc = REGNO (operands[1]);
11584
11585 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11586 || rdest < rsrc)
11587 for (i = 0; i < count; i++)
11588 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11589 gen_rtx_REG (mode, rsrc + i));
11590 else
11591 for (i = 0; i < count; i++)
11592 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11593 gen_rtx_REG (mode, rsrc + count - i - 1));
11594 }
11595
11596 /* Compute and return the length in bytes of aarch64_simd_reglist<mode>, where <mode> is
11597 one of VSTRUCT modes: OI, CI, or XI. */
11598 int
11599 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11600 {
11601 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11602 }
11603
11604 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11605 alignment of a vector to 128 bits. */
11606 static HOST_WIDE_INT
11607 aarch64_simd_vector_alignment (const_tree type)
11608 {
11609 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11610 return MIN (align, 128);
11611 }
11612
11613 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11614 static bool
11615 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11616 {
11617 if (is_packed)
11618 return false;
11619
11620 /* We guarantee alignment for vectors up to 128 bits. */
11621 if (tree_int_cst_compare (TYPE_SIZE (type),
11622 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11623 return false;
11624
11625 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11626 return true;
11627 }
11628
11629 /* Return true if the vector misalignment factor is supported by the
11630 target. */
11631 static bool
11632 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11633 const_tree type, int misalignment,
11634 bool is_packed)
11635 {
11636 if (TARGET_SIMD && STRICT_ALIGNMENT)
11637 {
11638 /* Return false if the movmisalign pattern is not supported for this mode. */
11639 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11640 return false;
11641
11642 if (misalignment == -1)
11643 {
11644 /* Misalignment factor is unknown at compile time but we know
11645 it's word aligned. */
11646 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11647 {
11648 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11649
11650 if (element_size != 64)
11651 return true;
11652 }
11653 return false;
11654 }
11655 }
11656 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11657 is_packed);
11658 }
11659
11660 /* If VALS is a vector constant that can be loaded into a register
11661 using DUP, generate instructions to do so and return an RTX to
11662 assign to the register. Otherwise return NULL_RTX. */
11663 static rtx
11664 aarch64_simd_dup_constant (rtx vals)
11665 {
11666 machine_mode mode = GET_MODE (vals);
11667 machine_mode inner_mode = GET_MODE_INNER (mode);
11668 rtx x;
11669
11670 if (!const_vec_duplicate_p (vals, &x))
11671 return NULL_RTX;
11672
11673 /* We can load this constant by using DUP and a constant in a
11674 single ARM register. This will be cheaper than a vector
11675 load. */
11676 x = copy_to_mode_reg (inner_mode, x);
11677 return gen_rtx_VEC_DUPLICATE (mode, x);
11678 }
11679
11680
11681 /* Generate code to load VALS, which is a PARALLEL containing only
11682 constants (for vec_init) or CONST_VECTOR, efficiently into a
11683 register. Returns an RTX to copy into the register, or NULL_RTX
11684 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11685 static rtx
11686 aarch64_simd_make_constant (rtx vals)
11687 {
11688 machine_mode mode = GET_MODE (vals);
11689 rtx const_dup;
11690 rtx const_vec = NULL_RTX;
11691 int n_elts = GET_MODE_NUNITS (mode);
11692 int n_const = 0;
11693 int i;
11694
11695 if (GET_CODE (vals) == CONST_VECTOR)
11696 const_vec = vals;
11697 else if (GET_CODE (vals) == PARALLEL)
11698 {
11699 /* A CONST_VECTOR must contain only CONST_INTs and
11700 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11701 Only store valid constants in a CONST_VECTOR. */
11702 for (i = 0; i < n_elts; ++i)
11703 {
11704 rtx x = XVECEXP (vals, 0, i);
11705 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11706 n_const++;
11707 }
11708 if (n_const == n_elts)
11709 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11710 }
11711 else
11712 gcc_unreachable ();
11713
11714 if (const_vec != NULL_RTX
11715 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11716 /* Load using MOVI/MVNI. */
11717 return const_vec;
11718 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11719 /* Loaded using DUP. */
11720 return const_dup;
11721 else if (const_vec != NULL_RTX)
11722 /* Load from constant pool. We cannot take advantage of single-cycle
11723 LD1 because we need a PC-relative addressing mode. */
11724 return const_vec;
11725 else
11726 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11727 We cannot construct an initializer. */
11728 return NULL_RTX;
11729 }
11730
11731 /* Expand a vector initialisation sequence, such that TARGET is
11732 initialised to contain VALS. */
11733
11734 void
11735 aarch64_expand_vector_init (rtx target, rtx vals)
11736 {
11737 machine_mode mode = GET_MODE (target);
11738 machine_mode inner_mode = GET_MODE_INNER (mode);
11739 /* The number of vector elements. */
11740 int n_elts = GET_MODE_NUNITS (mode);
11741 /* The number of vector elements which are not constant. */
11742 int n_var = 0;
11743 rtx any_const = NULL_RTX;
11744 /* The first element of vals. */
11745 rtx v0 = XVECEXP (vals, 0, 0);
11746 bool all_same = true;
11747
11748 /* Count the number of variable elements to initialise. */
11749 for (int i = 0; i < n_elts; ++i)
11750 {
11751 rtx x = XVECEXP (vals, 0, i);
11752 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11753 ++n_var;
11754 else
11755 any_const = x;
11756
11757 all_same &= rtx_equal_p (x, v0);
11758 }
11759
11760 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11761 how best to handle this. */
11762 if (n_var == 0)
11763 {
11764 rtx constant = aarch64_simd_make_constant (vals);
11765 if (constant != NULL_RTX)
11766 {
11767 emit_move_insn (target, constant);
11768 return;
11769 }
11770 }
11771
11772 /* Splat a single non-constant element if we can. */
11773 if (all_same)
11774 {
11775 rtx x = copy_to_mode_reg (inner_mode, v0);
11776 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11777 return;
11778 }
11779
11780 enum insn_code icode = optab_handler (vec_set_optab, mode);
11781 gcc_assert (icode != CODE_FOR_nothing);
11782
11783 /* If there are only variable elements, try to optimize
11784 the insertion using dup for the most common element
11785 followed by insertions. */
11786
11787 /* The algorithm will fill matches[*][0] with the earliest matching element,
11788 and matches[X][1] with the count of duplicate elements (if X is the
11789 earliest element which has duplicates). */
11790
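  /* A purely illustrative example: for VALS = { a, b, a, c } the loop below
     produces matches[0] = { 0, 2 }, matches[1] = { 1, 1 },
     matches[2] = { 0, 0 } and matches[3] = { 3, 1 }; element 0 is the most
     common, so 'a' is DUPed into every lane and lanes 1 and 3 are then
     inserted individually.  */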
11791 if (n_var == n_elts && n_elts <= 16)
11792 {
11793 int matches[16][2] = {0};
11794 for (int i = 0; i < n_elts; i++)
11795 {
11796 for (int j = 0; j <= i; j++)
11797 {
11798 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
11799 {
11800 matches[i][0] = j;
11801 matches[j][1]++;
11802 break;
11803 }
11804 }
11805 }
11806 int maxelement = 0;
11807 int maxv = 0;
11808 for (int i = 0; i < n_elts; i++)
11809 if (matches[i][1] > maxv)
11810 {
11811 maxelement = i;
11812 maxv = matches[i][1];
11813 }
11814
11815 /* Create a duplicate of the most common element. */
11816 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
11817 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11818
11819 /* Insert the rest. */
11820 for (int i = 0; i < n_elts; i++)
11821 {
11822 rtx x = XVECEXP (vals, 0, i);
11823 if (matches[i][0] == maxelement)
11824 continue;
11825 x = copy_to_mode_reg (inner_mode, x);
11826 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11827 }
11828 return;
11829 }
11830
11831 /* Initialise a vector which is part-variable. We want to first try
11832 to build those lanes which are constant in the most efficient way we
11833 can. */
11834 if (n_var != n_elts)
11835 {
11836 rtx copy = copy_rtx (vals);
11837
11838 /* Load constant part of vector. We really don't care what goes into the
11839 parts we will overwrite, but we're more likely to be able to load the
11840 constant efficiently if it has fewer, larger, repeating parts
11841 (see aarch64_simd_valid_immediate). */
11842 for (int i = 0; i < n_elts; i++)
11843 {
11844 rtx x = XVECEXP (vals, 0, i);
11845 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11846 continue;
11847 rtx subst = any_const;
11848 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11849 {
11850 /* Look in the copied vector, as more elements are const. */
11851 rtx test = XVECEXP (copy, 0, i ^ bit);
11852 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11853 {
11854 subst = test;
11855 break;
11856 }
11857 }
11858 XVECEXP (copy, 0, i) = subst;
11859 }
11860 aarch64_expand_vector_init (target, copy);
11861 }
11862
11863 /* Insert the variable lanes directly. */
11864 for (int i = 0; i < n_elts; i++)
11865 {
11866 rtx x = XVECEXP (vals, 0, i);
11867 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11868 continue;
11869 x = copy_to_mode_reg (inner_mode, x);
11870 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11871 }
11872 }
11873
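/* Implement TARGET_SHIFT_TRUNCATION_MASK.  Return the mask applied to a
   shift count, or 0 if shift counts are not truncated: vector shifts never
   truncate, and scalar shifts only do so when SHIFT_COUNT_TRUNCATED is in
   effect.  */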
11874 static unsigned HOST_WIDE_INT
11875 aarch64_shift_truncation_mask (machine_mode mode)
11876 {
11877 return
11878 (!SHIFT_COUNT_TRUNCATED
11879 || aarch64_vector_mode_supported_p (mode)
11880 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11881 }
11882
11883 /* Select a format to encode pointers in exception handling data. */
11884 int
11885 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11886 {
11887 int type;
11888 switch (aarch64_cmodel)
11889 {
11890 case AARCH64_CMODEL_TINY:
11891 case AARCH64_CMODEL_TINY_PIC:
11892 case AARCH64_CMODEL_SMALL:
11893 case AARCH64_CMODEL_SMALL_PIC:
11894 case AARCH64_CMODEL_SMALL_SPIC:
11895 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11896 for everything. */
11897 type = DW_EH_PE_sdata4;
11898 break;
11899 default:
11900 /* No assumptions here. 8-byte relocs required. */
11901 type = DW_EH_PE_sdata8;
11902 break;
11903 }
11904 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11905 }
11906
11907 /* The last .arch and .tune assembly strings that we printed. */
11908 static std::string aarch64_last_printed_arch_string;
11909 static std::string aarch64_last_printed_tune_string;
11910
11911 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11912 by the function fndecl. */
11913
11914 void
11915 aarch64_declare_function_name (FILE *stream, const char* name,
11916 tree fndecl)
11917 {
11918 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11919
11920 struct cl_target_option *targ_options;
11921 if (target_parts)
11922 targ_options = TREE_TARGET_OPTION (target_parts);
11923 else
11924 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11925 gcc_assert (targ_options);
11926
11927 const struct processor *this_arch
11928 = aarch64_get_arch (targ_options->x_explicit_arch);
11929
11930 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11931 std::string extension
11932 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11933 this_arch->flags);
11934 /* Only update the assembler .arch string if it is distinct from the last
11935 such string we printed. */
11936 std::string to_print = this_arch->name + extension;
11937 if (to_print != aarch64_last_printed_arch_string)
11938 {
11939 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11940 aarch64_last_printed_arch_string = to_print;
11941 }
11942
11943 /* Print the CPU name we're tuning for in a comment; it might be
11944 useful to readers of the generated asm. Do it only when it changes
11945 from function to function and verbose assembly is requested. */
11946 const struct processor *this_tune
11947 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11948
11949 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11950 {
11951 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11952 this_tune->name);
11953 aarch64_last_printed_tune_string = this_tune->name;
11954 }
11955
11956 /* Don't forget the type directive for ELF. */
11957 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11958 ASM_OUTPUT_LABEL (stream, name);
11959 }
11960
11961 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11962
11963 static void
11964 aarch64_start_file (void)
11965 {
11966 struct cl_target_option *default_options
11967 = TREE_TARGET_OPTION (target_option_default_node);
11968
11969 const struct processor *default_arch
11970 = aarch64_get_arch (default_options->x_explicit_arch);
11971 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11972 std::string extension
11973 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11974 default_arch->flags);
11975
11976 aarch64_last_printed_arch_string = default_arch->name + extension;
11977 aarch64_last_printed_tune_string = "";
11978 asm_fprintf (asm_out_file, "\t.arch %s\n",
11979 aarch64_last_printed_arch_string.c_str ());
11980
11981 default_file_start ();
11982 }
11983
11984 /* Emit load exclusive. */
11985
11986 static void
11987 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11988 rtx mem, rtx model_rtx)
11989 {
11990 rtx (*gen) (rtx, rtx, rtx);
11991
11992 switch (mode)
11993 {
11994 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11995 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11996 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11997 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11998 default:
11999 gcc_unreachable ();
12000 }
12001
12002 emit_insn (gen (rval, mem, model_rtx));
12003 }
12004
12005 /* Emit store exclusive. */
12006
12007 static void
12008 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12009 rtx rval, rtx mem, rtx model_rtx)
12010 {
12011 rtx (*gen) (rtx, rtx, rtx, rtx);
12012
12013 switch (mode)
12014 {
12015 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
12016 case HImode: gen = gen_aarch64_store_exclusivehi; break;
12017 case SImode: gen = gen_aarch64_store_exclusivesi; break;
12018 case DImode: gen = gen_aarch64_store_exclusivedi; break;
12019 default:
12020 gcc_unreachable ();
12021 }
12022
12023 emit_insn (gen (bval, rval, mem, model_rtx));
12024 }
12025
12026 /* Emit jump instruction INSN and mark it as very unlikely to be taken. */
12027
12028 static void
12029 aarch64_emit_unlikely_jump (rtx insn)
12030 {
12031 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
12032
12033 rtx_insn *jump = emit_jump_insn (insn);
12034 add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
12035 }
12036
12037 /* Expand a compare and swap pattern. */
12038
12039 void
12040 aarch64_expand_compare_and_swap (rtx operands[])
12041 {
12042 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12043 machine_mode mode, cmp_mode;
12044 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12045 int idx;
12046 gen_cas_fn gen;
12047 const gen_cas_fn split_cas[] =
12048 {
12049 gen_aarch64_compare_and_swapqi,
12050 gen_aarch64_compare_and_swaphi,
12051 gen_aarch64_compare_and_swapsi,
12052 gen_aarch64_compare_and_swapdi
12053 };
12054 const gen_cas_fn atomic_cas[] =
12055 {
12056 gen_aarch64_compare_and_swapqi_lse,
12057 gen_aarch64_compare_and_swaphi_lse,
12058 gen_aarch64_compare_and_swapsi_lse,
12059 gen_aarch64_compare_and_swapdi_lse
12060 };
12061
12062 bval = operands[0];
12063 rval = operands[1];
12064 mem = operands[2];
12065 oldval = operands[3];
12066 newval = operands[4];
12067 is_weak = operands[5];
12068 mod_s = operands[6];
12069 mod_f = operands[7];
12070 mode = GET_MODE (mem);
12071 cmp_mode = mode;
12072
12073 /* Normally the succ memory model must be stronger than fail, but in the
12074 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12075 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12076
12077 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12078 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12079 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12080
12081 switch (mode)
12082 {
12083 case QImode:
12084 case HImode:
12085 /* For short modes, we're going to perform the comparison in SImode,
12086 so do the zero-extension now. */
12087 cmp_mode = SImode;
12088 rval = gen_reg_rtx (SImode);
12089 oldval = convert_modes (SImode, mode, oldval, true);
12090 /* Fall through. */
12091
12092 case SImode:
12093 case DImode:
12094 /* Force the value into a register if needed. */
12095 if (!aarch64_plus_operand (oldval, mode))
12096 oldval = force_reg (cmp_mode, oldval);
12097 break;
12098
12099 default:
12100 gcc_unreachable ();
12101 }
12102
12103 switch (mode)
12104 {
12105 case QImode: idx = 0; break;
12106 case HImode: idx = 1; break;
12107 case SImode: idx = 2; break;
12108 case DImode: idx = 3; break;
12109 default:
12110 gcc_unreachable ();
12111 }
12112 if (TARGET_LSE)
12113 gen = atomic_cas[idx];
12114 else
12115 gen = split_cas[idx];
12116
12117 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12118
12119 if (mode == QImode || mode == HImode)
12120 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12121
12122 x = gen_rtx_REG (CCmode, CC_REGNUM);
12123 x = gen_rtx_EQ (SImode, x, const0_rtx);
12124 emit_insn (gen_rtx_SET (bval, x));
12125 }
12126
12127 /* Test whether the target supports using an atomic load-operate instruction.
12128 CODE is the operation and AFTER is TRUE if the data in memory after the
12129 operation should be returned and FALSE if the data before the operation
12130 should be returned. Returns FALSE if the operation isn't supported by the
12131 architecture. */
12132
12133 bool
12134 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12135 {
12136 if (!TARGET_LSE)
12137 return false;
12138
12139 switch (code)
12140 {
12141 case SET:
12142 case AND:
12143 case IOR:
12144 case XOR:
12145 case MINUS:
12146 case PLUS:
12147 return true;
12148 default:
12149 return false;
12150 }
12151 }
12152
12153 /* Emit a barrier appropriate for memory model MODEL at the end of a
12154 sequence implementing an atomic operation. */
12155
12156 static void
12157 aarch64_emit_post_barrier (enum memmodel model)
12158 {
12159 const enum memmodel base_model = memmodel_base (model);
12160
12161 if (is_mm_sync (model)
12162 && (base_model == MEMMODEL_ACQUIRE
12163 || base_model == MEMMODEL_ACQ_REL
12164 || base_model == MEMMODEL_SEQ_CST))
12165 {
12166 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12167 }
12168 }
12169
12170 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12171 for the data in memory. EXPECTED is the value expected to be in memory.
12172 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12173 is the memory ordering to use. */
12174
12175 void
12176 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12177 rtx expected, rtx desired,
12178 rtx model)
12179 {
12180 rtx (*gen) (rtx, rtx, rtx, rtx);
12181 machine_mode mode;
12182
12183 mode = GET_MODE (mem);
12184
12185 switch (mode)
12186 {
12187 case QImode: gen = gen_aarch64_atomic_casqi; break;
12188 case HImode: gen = gen_aarch64_atomic_cashi; break;
12189 case SImode: gen = gen_aarch64_atomic_cassi; break;
12190 case DImode: gen = gen_aarch64_atomic_casdi; break;
12191 default:
12192 gcc_unreachable ();
12193 }
12194
12195 /* Move the expected value into the CAS destination register. */
12196 emit_insn (gen_rtx_SET (rval, expected));
12197
12198 /* Emit the CAS. */
12199 emit_insn (gen (rval, mem, desired, model));
12200
12201 /* Compare the expected value with the value loaded by the CAS, to establish
12202 whether the swap was made. */
12203 aarch64_gen_compare_reg (EQ, rval, expected);
12204 }
12205
12206 /* Split a compare and swap pattern. */
12207
12208 void
12209 aarch64_split_compare_and_swap (rtx operands[])
12210 {
12211 rtx rval, mem, oldval, newval, scratch;
12212 machine_mode mode;
12213 bool is_weak;
12214 rtx_code_label *label1, *label2;
12215 rtx x, cond;
12216 enum memmodel model;
12217 rtx model_rtx;
12218
12219 rval = operands[0];
12220 mem = operands[1];
12221 oldval = operands[2];
12222 newval = operands[3];
12223 is_weak = (operands[4] != const0_rtx);
12224 model_rtx = operands[5];
12225 scratch = operands[7];
12226 mode = GET_MODE (mem);
12227 model = memmodel_from_int (INTVAL (model_rtx));
12228
12229 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12230 loop:
12231 .label1:
12232 LD[A]XR rval, [mem]
12233 CBNZ rval, .label2
12234 ST[L]XR scratch, newval, [mem]
12235 CBNZ scratch, .label1
12236 .label2:
12237 CMP rval, 0. */
12238 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12239
12240 label1 = NULL;
12241 if (!is_weak)
12242 {
12243 label1 = gen_label_rtx ();
12244 emit_label (label1);
12245 }
12246 label2 = gen_label_rtx ();
12247
12248 /* The initial load can be relaxed for a __sync operation since a final
12249 barrier will be emitted to stop code hoisting. */
12250 if (is_mm_sync (model))
12251 aarch64_emit_load_exclusive (mode, rval, mem,
12252 GEN_INT (MEMMODEL_RELAXED));
12253 else
12254 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12255
12256 if (strong_zero_p)
12257 {
12258 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12259 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12260 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12261 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12262 }
12263 else
12264 {
12265 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12266 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12267 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12268 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12269 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12270 }
12271
12272 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12273
12274 if (!is_weak)
12275 {
12276 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12277 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12278 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12279 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12280 }
12281 else
12282 {
12283 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12284 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12285 emit_insn (gen_rtx_SET (cond, x));
12286 }
12287
12288 emit_label (label2);
12289 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
12290 to set the condition flags. If this is not used it will be removed by
12291 later passes. */
12292 if (strong_zero_p)
12293 {
12294 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12295 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12296 emit_insn (gen_rtx_SET (cond, x));
12297 }
12298 /* Emit any final barrier needed for a __sync operation. */
12299 if (is_mm_sync (model))
12300 aarch64_emit_post_barrier (model);
12301 }
12302
12303 /* Emit a BIC instruction. */
12304
12305 static void
12306 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12307 {
12308 rtx shift_rtx = GEN_INT (shift);
12309 rtx (*gen) (rtx, rtx, rtx, rtx);
12310
12311 switch (mode)
12312 {
12313 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12314 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12315 default:
12316 gcc_unreachable ();
12317 }
12318
12319 emit_insn (gen (dst, s2, shift_rtx, s1));
12320 }
12321
12322 /* Emit an atomic swap. */
12323
12324 static void
12325 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12326 rtx mem, rtx model)
12327 {
12328 rtx (*gen) (rtx, rtx, rtx, rtx);
12329
12330 switch (mode)
12331 {
12332 case QImode: gen = gen_aarch64_atomic_swpqi; break;
12333 case HImode: gen = gen_aarch64_atomic_swphi; break;
12334 case SImode: gen = gen_aarch64_atomic_swpsi; break;
12335 case DImode: gen = gen_aarch64_atomic_swpdi; break;
12336 default:
12337 gcc_unreachable ();
12338 }
12339
12340 emit_insn (gen (dst, mem, value, model));
12341 }
12342
12343 /* Operations supported by aarch64_emit_atomic_load_op. */
12344
12345 enum aarch64_atomic_load_op_code
12346 {
12347 AARCH64_LDOP_PLUS, /* A + B */
12348 AARCH64_LDOP_XOR, /* A ^ B */
12349 AARCH64_LDOP_OR, /* A | B */
12350 AARCH64_LDOP_BIC /* A & ~B */
12351 };
12352
12353 /* Emit an atomic load-operate. */
12354
12355 static void
12356 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12357 machine_mode mode, rtx dst, rtx src,
12358 rtx mem, rtx model)
12359 {
12360 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12361 const aarch64_atomic_load_op_fn plus[] =
12362 {
12363 gen_aarch64_atomic_loadaddqi,
12364 gen_aarch64_atomic_loadaddhi,
12365 gen_aarch64_atomic_loadaddsi,
12366 gen_aarch64_atomic_loadadddi
12367 };
12368 const aarch64_atomic_load_op_fn eor[] =
12369 {
12370 gen_aarch64_atomic_loadeorqi,
12371 gen_aarch64_atomic_loadeorhi,
12372 gen_aarch64_atomic_loadeorsi,
12373 gen_aarch64_atomic_loadeordi
12374 };
12375 const aarch64_atomic_load_op_fn ior[] =
12376 {
12377 gen_aarch64_atomic_loadsetqi,
12378 gen_aarch64_atomic_loadsethi,
12379 gen_aarch64_atomic_loadsetsi,
12380 gen_aarch64_atomic_loadsetdi
12381 };
12382 const aarch64_atomic_load_op_fn bic[] =
12383 {
12384 gen_aarch64_atomic_loadclrqi,
12385 gen_aarch64_atomic_loadclrhi,
12386 gen_aarch64_atomic_loadclrsi,
12387 gen_aarch64_atomic_loadclrdi
12388 };
12389 aarch64_atomic_load_op_fn gen;
12390 int idx = 0;
12391
12392 switch (mode)
12393 {
12394 case QImode: idx = 0; break;
12395 case HImode: idx = 1; break;
12396 case SImode: idx = 2; break;
12397 case DImode: idx = 3; break;
12398 default:
12399 gcc_unreachable ();
12400 }
12401
12402 switch (code)
12403 {
12404 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12405 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12406 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12407 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12408 default:
12409 gcc_unreachable ();
12410 }
12411
12412 emit_insn (gen (dst, mem, src, model));
12413 }
12414
12415 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12416 location to store the data read from memory. OUT_RESULT is the location to
12417 store the result of the operation. MEM is the memory location to read and
12418 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12419 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12420 be NULL. */
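/* For illustration: with CODE == MINUS the value is negated and an atomic
   LDADD is issued, and with CODE == AND the value is complemented and an
   atomic LDCLR (bit clear, A & ~B) is issued, since the LSE instructions
   provide no direct atomic subtract or atomic AND forms.  */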
12421
12422 void
12423 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12424 rtx mem, rtx value, rtx model_rtx)
12425 {
12426 machine_mode mode = GET_MODE (mem);
12427 machine_mode wmode = (mode == DImode ? DImode : SImode);
12428 const bool short_mode = (mode < SImode);
12429 aarch64_atomic_load_op_code ldop_code;
12430 rtx src;
12431 rtx x;
12432
12433 if (out_data)
12434 out_data = gen_lowpart (mode, out_data);
12435
12436 if (out_result)
12437 out_result = gen_lowpart (mode, out_result);
12438
12439 /* Make sure the value is in a register, putting it into a destination
12440 register if it needs to be manipulated. */
12441 if (!register_operand (value, mode)
12442 || code == AND || code == MINUS)
12443 {
12444 src = out_result ? out_result : out_data;
12445 emit_move_insn (src, gen_lowpart (mode, value));
12446 }
12447 else
12448 src = value;
12449 gcc_assert (register_operand (src, mode));
12450
12451 /* Preprocess the data for the operation as necessary. If the operation is
12452 a SET then emit a swap instruction and finish. */
12453 switch (code)
12454 {
12455 case SET:
12456 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12457 return;
12458
12459 case MINUS:
12460 /* Negate the value and treat it as a PLUS. */
12461 {
12462 rtx neg_src;
12463
12464 /* Resize the value if necessary. */
12465 if (short_mode)
12466 src = gen_lowpart (wmode, src);
12467
12468 neg_src = gen_rtx_NEG (wmode, src);
12469 emit_insn (gen_rtx_SET (src, neg_src));
12470
12471 if (short_mode)
12472 src = gen_lowpart (mode, src);
12473 }
12474 /* Fall-through. */
12475 case PLUS:
12476 ldop_code = AARCH64_LDOP_PLUS;
12477 break;
12478
12479 case IOR:
12480 ldop_code = AARCH64_LDOP_OR;
12481 break;
12482
12483 case XOR:
12484 ldop_code = AARCH64_LDOP_XOR;
12485 break;
12486
12487 case AND:
12488 {
12489 rtx not_src;
12490
12491 /* Resize the value if necessary. */
12492 if (short_mode)
12493 src = gen_lowpart (wmode, src);
12494
12495 not_src = gen_rtx_NOT (wmode, src);
12496 emit_insn (gen_rtx_SET (src, not_src));
12497
12498 if (short_mode)
12499 src = gen_lowpart (mode, src);
12500 }
12501 ldop_code = AARCH64_LDOP_BIC;
12502 break;
12503
12504 default:
12505 /* The operation can't be done with atomic instructions. */
12506 gcc_unreachable ();
12507 }
12508
12509 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12510
12511 /* If necessary, calculate the data in memory after the update by redoing the
12512 operation from values in registers. */
12513 if (!out_result)
12514 return;
12515
12516 if (short_mode)
12517 {
12518 src = gen_lowpart (wmode, src);
12519 out_data = gen_lowpart (wmode, out_data);
12520 out_result = gen_lowpart (wmode, out_result);
12521 }
12522
12523 x = NULL_RTX;
12524
12525 switch (code)
12526 {
12527 case MINUS:
12528 case PLUS:
12529 x = gen_rtx_PLUS (wmode, out_data, src);
12530 break;
12531 case IOR:
12532 x = gen_rtx_IOR (wmode, out_data, src);
12533 break;
12534 case XOR:
12535 x = gen_rtx_XOR (wmode, out_data, src);
12536 break;
12537 case AND:
12538 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12539 return;
12540 default:
12541 gcc_unreachable ();
12542 }
12543
12544 emit_set_insn (out_result, x);
12545
12546 return;
12547 }
12548
12549 /* Split an atomic operation. */
12550
12551 void
12552 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12553 rtx value, rtx model_rtx, rtx cond)
12554 {
12555 machine_mode mode = GET_MODE (mem);
12556 machine_mode wmode = (mode == DImode ? DImode : SImode);
12557 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12558 const bool is_sync = is_mm_sync (model);
12559 rtx_code_label *label;
12560 rtx x;
12561
12562 /* Split the atomic operation into a sequence. */
12563 label = gen_label_rtx ();
12564 emit_label (label);
12565
12566 if (new_out)
12567 new_out = gen_lowpart (wmode, new_out);
12568 if (old_out)
12569 old_out = gen_lowpart (wmode, old_out);
12570 else
12571 old_out = new_out;
12572 value = simplify_gen_subreg (wmode, value, mode, 0);
12573
12574 /* The initial load can be relaxed for a __sync operation since a final
12575 barrier will be emitted to stop code hoisting. */
12576 if (is_sync)
12577 aarch64_emit_load_exclusive (mode, old_out, mem,
12578 GEN_INT (MEMMODEL_RELAXED));
12579 else
12580 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12581
12582 switch (code)
12583 {
12584 case SET:
12585 new_out = value;
12586 break;
12587
12588 case NOT:
12589 x = gen_rtx_AND (wmode, old_out, value);
12590 emit_insn (gen_rtx_SET (new_out, x));
12591 x = gen_rtx_NOT (wmode, new_out);
12592 emit_insn (gen_rtx_SET (new_out, x));
12593 break;
12594
12595 case MINUS:
12596 if (CONST_INT_P (value))
12597 {
12598 value = GEN_INT (-INTVAL (value));
12599 code = PLUS;
12600 }
12601 /* Fall through. */
12602
12603 default:
12604 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12605 emit_insn (gen_rtx_SET (new_out, x));
12606 break;
12607 }
12608
12609 aarch64_emit_store_exclusive (mode, cond, mem,
12610 gen_lowpart (mode, new_out), model_rtx);
12611
12612 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12613 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12614 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12615 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12616
12617 /* Emit any final barrier needed for a __sync operation. */
12618 if (is_sync)
12619 aarch64_emit_post_barrier (model);
12620 }
12621
12622 static void
12623 aarch64_init_libfuncs (void)
12624 {
12625 /* Half-precision float operations. The compiler handles all operations
12626 with NULL libfuncs by converting to SFmode. */
12627
12628 /* Conversions. */
12629 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12630 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12631
12632 /* Arithmetic. */
12633 set_optab_libfunc (add_optab, HFmode, NULL);
12634 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12635 set_optab_libfunc (smul_optab, HFmode, NULL);
12636 set_optab_libfunc (neg_optab, HFmode, NULL);
12637 set_optab_libfunc (sub_optab, HFmode, NULL);
12638
12639 /* Comparisons. */
12640 set_optab_libfunc (eq_optab, HFmode, NULL);
12641 set_optab_libfunc (ne_optab, HFmode, NULL);
12642 set_optab_libfunc (lt_optab, HFmode, NULL);
12643 set_optab_libfunc (le_optab, HFmode, NULL);
12644 set_optab_libfunc (ge_optab, HFmode, NULL);
12645 set_optab_libfunc (gt_optab, HFmode, NULL);
12646 set_optab_libfunc (unord_optab, HFmode, NULL);
12647 }
12648
12649 /* Target hook for c_mode_for_suffix. */
12650 static machine_mode
12651 aarch64_c_mode_for_suffix (char suffix)
12652 {
12653 if (suffix == 'q')
12654 return TFmode;
12655
12656 return VOIDmode;
12657 }
12658
12659 /* We can only represent floating point constants which will fit in
12660 "quarter-precision" values. These values are characterised by
12661 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
12662 by:
12663
12664 (-1)^s * (n/16) * 2^r
12665
12666 Where:
12667 's' is the sign bit.
12668 'n' is an integer in the range 16 <= n <= 31.
12669 'r' is an integer in the range -3 <= r <= 4. */
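/* For example (illustrative values only): 1.5 = (24/16) * 2^0 and
   0.25 = (16/16) * 2^-2 are representable, whereas 48.0 would need
   (24/16) * 2^5, i.e. r = 5, which is out of range.  The representable
   magnitudes therefore run from 0.125 (16/16 * 2^-3) up to 31.0
   (31/16 * 2^4).  */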
12670
12671 /* Return true iff X can be represented by a quarter-precision
12672 floating point immediate operand. Note, we cannot represent 0.0. */
12673 bool
12674 aarch64_float_const_representable_p (rtx x)
12675 {
12676 /* This represents our current view of how many bits
12677 make up the mantissa. */
12678 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12679 int exponent;
12680 unsigned HOST_WIDE_INT mantissa, mask;
12681 REAL_VALUE_TYPE r, m;
12682 bool fail;
12683
12684 if (!CONST_DOUBLE_P (x))
12685 return false;
12686
12687 /* We don't support HFmode constants yet. */
12688 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12689 return false;
12690
12691 r = *CONST_DOUBLE_REAL_VALUE (x);
12692
12693 /* We cannot represent infinities, NaNs or +/-zero. We won't
12694 know if we have +zero until we analyse the mantissa, but we
12695 can reject the other invalid values. */
12696 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12697 || REAL_VALUE_MINUS_ZERO (r))
12698 return false;
12699
12700 /* Extract exponent. */
12701 r = real_value_abs (&r);
12702 exponent = REAL_EXP (&r);
12703
12704 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12705 highest (sign) bit, with a fixed binary point at bit point_pos.
12706 the low element of W holds the low part of the mantissa, its high element the high part.
12707 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12708 bits for the mantissa, this can fail (low bits will be lost). */
12709 real_ldexp (&m, &r, point_pos - exponent);
12710 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12711
12712 /* If the low part of the mantissa has bits set we cannot represent
12713 the value. */
12714 if (w.ulow () != 0)
12715 return false;
12716 /* We have rejected the lower HOST_WIDE_INT, so update our
12717 understanding of how many bits lie in the mantissa and
12718 look only at the high HOST_WIDE_INT. */
12719 mantissa = w.elt (1);
12720 point_pos -= HOST_BITS_PER_WIDE_INT;
12721
12722 /* We can only represent values with a mantissa of the form 1.xxxx. */
12723 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12724 if ((mantissa & mask) != 0)
12725 return false;
12726
12727 /* Having filtered unrepresentable values, we may now remove all
12728 but the highest 5 bits. */
12729 mantissa >>= point_pos - 5;
12730
12731 /* We cannot represent the value 0.0, so reject it. This is handled
12732 elsewhere. */
12733 if (mantissa == 0)
12734 return false;
12735
12736 /* Then, as bit 4 is always set, we can mask it off, leaving
12737 the mantissa in the range [0, 15]. */
12738 mantissa &= ~(1 << 4);
12739 gcc_assert (mantissa <= 15);
12740
12741 /* GCC internally does not use IEEE754-like encoding (where normalized
12742 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12743 Our mantissa values are shifted 4 places to the left relative to
12744 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12745 by 5 places to correct for GCC's representation. */
12746 exponent = 5 - exponent;
12747
12748 return (exponent >= 0 && exponent <= 7);
12749 }
12750
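/* Return the assembly template for a SIMD immediate move of CONST_VECTOR,
   which has mode MODE and total width WIDTH bits.  The result is written
   into a static buffer, so it must be consumed before the next call.  */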
12751 char*
12752 aarch64_output_simd_mov_immediate (rtx const_vector,
12753 machine_mode mode,
12754 unsigned width)
12755 {
12756 bool is_valid;
12757 static char templ[40];
12758 const char *mnemonic;
12759 const char *shift_op;
12760 unsigned int lane_count = 0;
12761 char element_char;
12762
12763 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12764
12765 /* This will return true to show const_vector is legal for use as an
12766 AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
12767 also update INFO to show how the immediate should be generated. */
12768 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12769 gcc_assert (is_valid);
12770
12771 element_char = sizetochar (info.element_width);
12772 lane_count = width / info.element_width;
12773
12774 mode = GET_MODE_INNER (mode);
12775 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12776 {
12777 gcc_assert (info.shift == 0 && ! info.mvn);
12778 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12779 move immediate path. */
12780 if (aarch64_float_const_zero_rtx_p (info.value))
12781 info.value = GEN_INT (0);
12782 else
12783 {
12784 const unsigned int buf_size = 20;
12785 char float_buf[buf_size] = {'\0'};
12786 real_to_decimal_for_mode (float_buf,
12787 CONST_DOUBLE_REAL_VALUE (info.value),
12788 buf_size, buf_size, 1, mode);
12789
12790 if (lane_count == 1)
12791 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12792 else
12793 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12794 lane_count, element_char, float_buf);
12795 return templ;
12796 }
12797 }
12798
12799 mnemonic = info.mvn ? "mvni" : "movi";
12800 shift_op = info.msl ? "msl" : "lsl";
12801
12802 gcc_assert (CONST_INT_P (info.value));
12803 if (lane_count == 1)
12804 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12805 mnemonic, UINTVAL (info.value));
12806 else if (info.shift)
12807 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12808 ", %s %d", mnemonic, lane_count, element_char,
12809 UINTVAL (info.value), shift_op, info.shift);
12810 else
12811 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12812 mnemonic, lane_count, element_char, UINTVAL (info.value));
12813 return templ;
12814 }
12815
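/* As above, but for a scalar IMMEDIATE, which is first duplicated into a
   64-bit vector of the container mode for MODE before the template is
   built.  */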
12816 char*
12817 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12818 machine_mode mode)
12819 {
12820 machine_mode vmode;
12821
12822 gcc_assert (!VECTOR_MODE_P (mode));
12823 vmode = aarch64_simd_container_mode (mode, 64);
12824 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12825 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12826 }
12827
12828 /* Split operands into moves from op[1] + op[2] into op[0]. */
12829
12830 void
12831 aarch64_split_combinev16qi (rtx operands[3])
12832 {
12833 unsigned int dest = REGNO (operands[0]);
12834 unsigned int src1 = REGNO (operands[1]);
12835 unsigned int src2 = REGNO (operands[2]);
12836 machine_mode halfmode = GET_MODE (operands[1]);
12837 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12838 rtx destlo, desthi;
12839
12840 gcc_assert (halfmode == V16QImode);
12841
12842 if (src1 == dest && src2 == dest + halfregs)
12843 {
12844 /* No-op move. Can't split to nothing; emit something. */
12845 emit_note (NOTE_INSN_DELETED);
12846 return;
12847 }
12848
12849 /* Preserve register attributes for variable tracking. */
12850 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12851 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12852 GET_MODE_SIZE (halfmode));
12853
12854 /* Special case of reversed high/low parts. */
12855 if (reg_overlap_mentioned_p (operands[2], destlo)
12856 && reg_overlap_mentioned_p (operands[1], desthi))
12857 {
12858 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12859 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12860 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12861 }
12862 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12863 {
12864 /* Try to avoid unnecessary moves if part of the result
12865 is in the right place already. */
12866 if (src1 != dest)
12867 emit_move_insn (destlo, operands[1]);
12868 if (src2 != dest + halfregs)
12869 emit_move_insn (desthi, operands[2]);
12870 }
12871 else
12872 {
12873 if (src2 != dest + halfregs)
12874 emit_move_insn (desthi, operands[2]);
12875 if (src1 != dest)
12876 emit_move_insn (destlo, operands[1]);
12877 }
12878 }
12879
12880 /* vec_perm support. */
12881
12882 #define MAX_VECT_LEN 16
12883
12884 struct expand_vec_perm_d
12885 {
12886 rtx target, op0, op1;
12887 unsigned char perm[MAX_VECT_LEN];
12888 machine_mode vmode;
12889 unsigned char nelt;
12890 bool one_vector_p;
12891 bool testing_p;
12892 };
12893
12894 /* Generate a variable permutation. */
12895
12896 static void
12897 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12898 {
12899 machine_mode vmode = GET_MODE (target);
12900 bool one_vector_p = rtx_equal_p (op0, op1);
12901
12902 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12903 gcc_checking_assert (GET_MODE (op0) == vmode);
12904 gcc_checking_assert (GET_MODE (op1) == vmode);
12905 gcc_checking_assert (GET_MODE (sel) == vmode);
12906 gcc_checking_assert (TARGET_SIMD);
12907
12908 if (one_vector_p)
12909 {
12910 if (vmode == V8QImode)
12911 {
12912 /* Expand the argument to a V16QI mode by duplicating it. */
12913 rtx pair = gen_reg_rtx (V16QImode);
12914 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12915 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12916 }
12917 else
12918 {
12919 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12920 }
12921 }
12922 else
12923 {
12924 rtx pair;
12925
12926 if (vmode == V8QImode)
12927 {
12928 pair = gen_reg_rtx (V16QImode);
12929 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12930 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12931 }
12932 else
12933 {
12934 pair = gen_reg_rtx (OImode);
12935 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12936 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12937 }
12938 }
12939 }
12940
12941 void
12942 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12943 {
12944 machine_mode vmode = GET_MODE (target);
12945 unsigned int nelt = GET_MODE_NUNITS (vmode);
12946 bool one_vector_p = rtx_equal_p (op0, op1);
12947 rtx mask;
12948
12949 /* The TBL instruction does not use a modulo index, so we must take care
12950 of that ourselves. */
12951 mask = aarch64_simd_gen_const_vector_dup (vmode,
12952 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12953 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12954
12955 /* For big-endian, we also need to reverse the index within the vector
12956 (but not which vector). */
12957 if (BYTES_BIG_ENDIAN)
12958 {
12959 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12960 if (!one_vector_p)
12961 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12962 sel = expand_simple_binop (vmode, XOR, sel, mask,
12963 NULL, 0, OPTAB_LIB_WIDEN);
12964 }
12965 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
12966 }
12967
12968 /* Recognize patterns suitable for the TRN instructions. */
12969 static bool
12970 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12971 {
12972 unsigned int i, odd, mask, nelt = d->nelt;
12973 rtx out, in0, in1, x;
12974 rtx (*gen) (rtx, rtx, rtx);
12975 machine_mode vmode = d->vmode;
12976
12977 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12978 return false;
12979
12980 /* Note that these are little-endian tests.
12981 We correct for big-endian later. */
12982 if (d->perm[0] == 0)
12983 odd = 0;
12984 else if (d->perm[0] == 1)
12985 odd = 1;
12986 else
12987 return false;
12988 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12989
12990 for (i = 0; i < nelt; i += 2)
12991 {
12992 if (d->perm[i] != i + odd)
12993 return false;
12994 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12995 return false;
12996 }
12997
12998 /* Success! */
12999 if (d->testing_p)
13000 return true;
13001
13002 in0 = d->op0;
13003 in1 = d->op1;
13004 if (BYTES_BIG_ENDIAN)
13005 {
13006 x = in0, in0 = in1, in1 = x;
13007 odd = !odd;
13008 }
13009 out = d->target;
13010
13011 if (odd)
13012 {
13013 switch (vmode)
13014 {
13015 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
13016 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
13017 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
13018 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
13019 case V4SImode: gen = gen_aarch64_trn2v4si; break;
13020 case V2SImode: gen = gen_aarch64_trn2v2si; break;
13021 case V2DImode: gen = gen_aarch64_trn2v2di; break;
13022 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13023 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13024 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13025 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13026 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
13027 default:
13028 return false;
13029 }
13030 }
13031 else
13032 {
13033 switch (vmode)
13034 {
13035 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
13036 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
13037 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
13038 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
13039 case V4SImode: gen = gen_aarch64_trn1v4si; break;
13040 case V2SImode: gen = gen_aarch64_trn1v2si; break;
13041 case V2DImode: gen = gen_aarch64_trn1v2di; break;
13042 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13043 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13044 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13045 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13046 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
13047 default:
13048 return false;
13049 }
13050 }
13051
13052 emit_insn (gen (out, in0, in1));
13053 return true;
13054 }
13055
13056 /* Recognize patterns suitable for the UZP instructions. */
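/* For example, for V4SI inputs A = {a0, a1, a2, a3} and B = {b0, b1, b2, b3},
   UZP1 gives {a0, a2, b0, b2} (selector 0, 2, 4, 6) and UZP2 gives
   {a1, a3, b1, b3} (selector 1, 3, 5, 7).  */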
13057 static bool
13058 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13059 {
13060 unsigned int i, odd, mask, nelt = d->nelt;
13061 rtx out, in0, in1, x;
13062 rtx (*gen) (rtx, rtx, rtx);
13063 machine_mode vmode = d->vmode;
13064
13065 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13066 return false;
13067
13068 /* Note that these are little-endian tests.
13069 We correct for big-endian later. */
13070 if (d->perm[0] == 0)
13071 odd = 0;
13072 else if (d->perm[0] == 1)
13073 odd = 1;
13074 else
13075 return false;
13076 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13077
13078 for (i = 0; i < nelt; i++)
13079 {
13080 unsigned elt = (i * 2 + odd) & mask;
13081 if (d->perm[i] != elt)
13082 return false;
13083 }
13084
13085 /* Success! */
13086 if (d->testing_p)
13087 return true;
13088
13089 in0 = d->op0;
13090 in1 = d->op1;
13091 if (BYTES_BIG_ENDIAN)
13092 {
13093 x = in0, in0 = in1, in1 = x;
13094 odd = !odd;
13095 }
13096 out = d->target;
13097
13098 if (odd)
13099 {
13100 switch (vmode)
13101 {
13102 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13103 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13104 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13105 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13106 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
13107 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
13108 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
13109 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13110 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13111 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13112 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13113 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13114 default:
13115 return false;
13116 }
13117 }
13118 else
13119 {
13120 switch (vmode)
13121 {
13122 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13123 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13124 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13125 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13126 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
13127 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
13128 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
13129 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13130 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13131 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13132 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13133 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13134 default:
13135 return false;
13136 }
13137 }
13138
13139 emit_insn (gen (out, in0, in1));
13140 return true;
13141 }
13142
13143 /* Recognize patterns suitable for the ZIP instructions. */
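/* For example, for V4SI inputs A = {a0, a1, a2, a3} and B = {b0, b1, b2, b3},
   ZIP1 interleaves the low halves to give {a0, b0, a1, b1} (selector
   0, 4, 1, 5) and ZIP2 interleaves the high halves to give {a2, b2, a3, b3}
   (selector 2, 6, 3, 7).  */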
13144 static bool
13145 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13146 {
13147 unsigned int i, high, mask, nelt = d->nelt;
13148 rtx out, in0, in1, x;
13149 rtx (*gen) (rtx, rtx, rtx);
13150 machine_mode vmode = d->vmode;
13151
13152 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13153 return false;
13154
13155 /* Note that these are little-endian tests.
13156 We correct for big-endian later. */
13157 high = nelt / 2;
13158 if (d->perm[0] == high)
13159 /* Do Nothing. */
13160 ;
13161 else if (d->perm[0] == 0)
13162 high = 0;
13163 else
13164 return false;
13165 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13166
13167 for (i = 0; i < nelt / 2; i++)
13168 {
13169 unsigned elt = (i + high) & mask;
13170 if (d->perm[i * 2] != elt)
13171 return false;
13172 elt = (elt + nelt) & mask;
13173 if (d->perm[i * 2 + 1] != elt)
13174 return false;
13175 }
13176
13177 /* Success! */
13178 if (d->testing_p)
13179 return true;
13180
13181 in0 = d->op0;
13182 in1 = d->op1;
13183 if (BYTES_BIG_ENDIAN)
13184 {
13185 x = in0, in0 = in1, in1 = x;
13186 high = !high;
13187 }
13188 out = d->target;
13189
13190 if (high)
13191 {
13192 switch (vmode)
13193 {
13194 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
13195 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
13196 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
13197 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
13198 case V4SImode: gen = gen_aarch64_zip2v4si; break;
13199 case V2SImode: gen = gen_aarch64_zip2v2si; break;
13200 case V2DImode: gen = gen_aarch64_zip2v2di; break;
13201 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13202 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13203 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13204 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13205 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
13206 default:
13207 return false;
13208 }
13209 }
13210 else
13211 {
13212 switch (vmode)
13213 {
13214 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
13215 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
13216 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
13217 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
13218 case V4SImode: gen = gen_aarch64_zip1v4si; break;
13219 case V2SImode: gen = gen_aarch64_zip1v2si; break;
13220 case V2DImode: gen = gen_aarch64_zip1v2di; break;
13221 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13222 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13223 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13224 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13225 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
13226 default:
13227 return false;
13228 }
13229 }
13230
13231 emit_insn (gen (out, in0, in1));
13232 return true;
13233 }
13234
13235 /* Recognize patterns for the EXT insn. */
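/* EXT with immediate N selects NELT consecutive elements starting at element
   N of the concatenated inputs, so the permutation must be N, N+1, ...,
   N+NELT-1, wrapping modulo NELT when both inputs are the same vector.  */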
13236
13237 static bool
13238 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13239 {
13240 unsigned int i, nelt = d->nelt;
13241 rtx (*gen) (rtx, rtx, rtx, rtx);
13242 rtx offset;
13243
13244 unsigned int location = d->perm[0]; /* Always < nelt. */
13245
13246 /* Check if the extracted indices are increasing by one. */
13247 for (i = 1; i < nelt; i++)
13248 {
13249 unsigned int required = location + i;
13250 if (d->one_vector_p)
13251 {
13252 /* We'll pass the same vector in twice, so allow indices to wrap. */
13253 required &= (nelt - 1);
13254 }
13255 if (d->perm[i] != required)
13256 return false;
13257 }
13258
13259 switch (d->vmode)
13260 {
13261 case V16QImode: gen = gen_aarch64_extv16qi; break;
13262 case V8QImode: gen = gen_aarch64_extv8qi; break;
13263 case V4HImode: gen = gen_aarch64_extv4hi; break;
13264 case V8HImode: gen = gen_aarch64_extv8hi; break;
13265 case V2SImode: gen = gen_aarch64_extv2si; break;
13266 case V4SImode: gen = gen_aarch64_extv4si; break;
13267 case V4HFmode: gen = gen_aarch64_extv4hf; break;
13268 case V8HFmode: gen = gen_aarch64_extv8hf; break;
13269 case V2SFmode: gen = gen_aarch64_extv2sf; break;
13270 case V4SFmode: gen = gen_aarch64_extv4sf; break;
13271 case V2DImode: gen = gen_aarch64_extv2di; break;
13272 case V2DFmode: gen = gen_aarch64_extv2df; break;
13273 default:
13274 return false;
13275 }
13276
13277 /* Success! */
13278 if (d->testing_p)
13279 return true;
13280
13281 /* The case where (location == 0) is a no-op for both big- and little-endian,
13282 and is removed by the mid-end at optimization levels -O1 and higher. */
13283
13284 if (BYTES_BIG_ENDIAN && (location != 0))
13285 {
13286 /* After setup, we want the high elements of the first vector (stored
13287 at the LSB end of the register), and the low elements of the second
13288 vector (stored at the MSB end of the register). So swap. */
13289 std::swap (d->op0, d->op1);
13290 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13291 location = nelt - location;
13292 }
13293
13294 offset = GEN_INT (location);
13295 emit_insn (gen (d->target, d->op0, d->op1, offset));
13296 return true;
13297 }
13298
13299 /* Recognize patterns for the REV insns. */
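/* A REV permutation reverses the elements within each group of a fixed
   width, so D->PERM[0] identifies the variant: for example REV64 on V8HI
   is {3, 2, 1, 0, 7, 6, 5, 4} (diff == 3), REV16 on V16QI has diff == 1
   and REV64 on V16QI has diff == 7.  */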
13300
13301 static bool
13302 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13303 {
13304 unsigned int i, j, diff, nelt = d->nelt;
13305 rtx (*gen) (rtx, rtx);
13306
13307 if (!d->one_vector_p)
13308 return false;
13309
13310 diff = d->perm[0];
13311 switch (diff)
13312 {
13313 case 7:
13314 switch (d->vmode)
13315 {
13316 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
13317 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
13318 default:
13319 return false;
13320 }
13321 break;
13322 case 3:
13323 switch (d->vmode)
13324 {
13325 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
13326 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
13327 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
13328 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
13329 default:
13330 return false;
13331 }
13332 break;
13333 case 1:
13334 switch (d->vmode)
13335 {
13336 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
13337 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
13338 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
13339 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
13340 case V4SImode: gen = gen_aarch64_rev64v4si; break;
13341 case V2SImode: gen = gen_aarch64_rev64v2si; break;
13342 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13343 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13344 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13345 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13346 default:
13347 return false;
13348 }
13349 break;
13350 default:
13351 return false;
13352 }
13353
13354 for (i = 0; i < nelt ; i += diff + 1)
13355 for (j = 0; j <= diff; j += 1)
13356 {
13357 /* This is guaranteed to be true as the value of diff
13358 is 7, 3 or 1, and we should have enough elements in the
13359 queue to generate this. Getting a vector mask with a
13360 value of diff other than these values implies that
13361 something is wrong by the time we get here. */
13362 gcc_assert (i + j < nelt);
13363 if (d->perm[i + j] != i + diff - j)
13364 return false;
13365 }
13366
13367 /* Success! */
13368 if (d->testing_p)
13369 return true;
13370
13371 emit_insn (gen (d->target, d->op0));
13372 return true;
13373 }
13374
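/* Recognize permutations that broadcast a single element, which map to the
   DUP (element) instruction: every index in D->PERM must be the same.  */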
13375 static bool
13376 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13377 {
13378 rtx (*gen) (rtx, rtx, rtx);
13379 rtx out = d->target;
13380 rtx in0;
13381 machine_mode vmode = d->vmode;
13382 unsigned int i, elt, nelt = d->nelt;
13383 rtx lane;
13384
13385 elt = d->perm[0];
13386 for (i = 1; i < nelt; i++)
13387 {
13388 if (elt != d->perm[i])
13389 return false;
13390 }
13391
13392 /* The generic preparation in aarch64_expand_vec_perm_const_1
13393 swaps the operand order and the permute indices if it finds
13394 d->perm[0] to be in the second operand. Thus, we can always
13395 use d->op0 and need not do any extra arithmetic to get the
13396 correct lane number. */
13397 in0 = d->op0;
13398 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13399
13400 switch (vmode)
13401 {
13402 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13403 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13404 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13405 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13406 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13407 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13408 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13409 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13410 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13411 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13412 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13413 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13414 default:
13415 return false;
13416 }
13417
13418 emit_insn (gen (out, in0, lane));
13419 return true;
13420 }
13421
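/* Fall back to a full table lookup: force D->PERM into a constant selector
   vector and expand through TBL.  This handles any permutation, but is
   usually more expensive than the single-instruction cases above.  */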
13422 static bool
13423 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13424 {
13425 rtx rperm[MAX_VECT_LEN], sel;
13426 machine_mode vmode = d->vmode;
13427 unsigned int i, nelt = d->nelt;
13428
13429 if (d->testing_p)
13430 return true;
13431
13432 /* Generic code will try constant permutation twice: once with the
13433 original mode and again with the elements lowered to QImode.
13434 So wait and don't do the selector expansion ourselves. */
13435 if (vmode != V8QImode && vmode != V16QImode)
13436 return false;
13437
13438 for (i = 0; i < nelt; ++i)
13439 {
13440 int nunits = GET_MODE_NUNITS (vmode);
13441
13442 /* If big-endian and two vectors we end up with a weird mixed-endian
13443 mode on NEON. Reverse the index within each word but not the word
13444 itself. */
13445 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13446 : d->perm[i]);
13447 }
13448 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13449 sel = force_reg (vmode, sel);
13450
13451 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13452 return true;
13453 }
13454
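/* Try to expand the constant permutation described by D, preferring the
   single-instruction patterns (REV, EXT, DUP, ZIP, UZP, TRN) and falling
   back to a generic TBL sequence.  Return true on success.  */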
13455 static bool
13456 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13457 {
13458 /* The pattern matching functions above are written to look for a small
13459 number to begin the sequence (0, 1, N/2). If we begin with an index
13460 from the second operand, we can swap the operands. */
13461 if (d->perm[0] >= d->nelt)
13462 {
13463 unsigned i, nelt = d->nelt;
13464
13465 gcc_assert (nelt == (nelt & -nelt));
13466 for (i = 0; i < nelt; ++i)
13467 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13468
13469 std::swap (d->op0, d->op1);
13470 }
13471
13472 if (TARGET_SIMD)
13473 {
13474 if (aarch64_evpc_rev (d))
13475 return true;
13476 else if (aarch64_evpc_ext (d))
13477 return true;
13478 else if (aarch64_evpc_dup (d))
13479 return true;
13480 else if (aarch64_evpc_zip (d))
13481 return true;
13482 else if (aarch64_evpc_uzp (d))
13483 return true;
13484 else if (aarch64_evpc_trn (d))
13485 return true;
13486 return aarch64_evpc_tbl (d);
13487 }
13488 return false;
13489 }
13490
13491 /* Expand a vec_perm_const pattern. */
13492
13493 bool
13494 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13495 {
13496 struct expand_vec_perm_d d;
13497 int i, nelt, which;
13498
13499 d.target = target;
13500 d.op0 = op0;
13501 d.op1 = op1;
13502
13503 d.vmode = GET_MODE (target);
13504 gcc_assert (VECTOR_MODE_P (d.vmode));
13505 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13506 d.testing_p = false;
13507
13508 for (i = which = 0; i < nelt; ++i)
13509 {
13510 rtx e = XVECEXP (sel, 0, i);
13511 int ei = INTVAL (e) & (2 * nelt - 1);
13512 which |= (ei < nelt ? 1 : 2);
13513 d.perm[i] = ei;
13514 }
13515
13516 switch (which)
13517 {
13518 default:
13519 gcc_unreachable ();
13520
13521 case 3:
13522 d.one_vector_p = false;
13523 if (!rtx_equal_p (op0, op1))
13524 break;
13525
13526 /* The elements of PERM do not suggest that only the first operand
13527 is used, but both operands are identical. Allow easier matching
13528 of the permutation by folding the permutation into the single
13529 input vector. */
13530 /* Fall Through. */
13531 case 2:
13532 for (i = 0; i < nelt; ++i)
13533 d.perm[i] &= nelt - 1;
13534 d.op0 = op1;
13535 d.one_vector_p = true;
13536 break;
13537
13538 case 1:
13539 d.op1 = op0;
13540 d.one_vector_p = true;
13541 break;
13542 }
13543
13544 return aarch64_expand_vec_perm_const_1 (&d);
13545 }
13546
13547 static bool
13548 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13549 const unsigned char *sel)
13550 {
13551 struct expand_vec_perm_d d;
13552 unsigned int i, nelt, which;
13553 bool ret;
13554
13555 d.vmode = vmode;
13556 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13557 d.testing_p = true;
13558 memcpy (d.perm, sel, nelt);
13559
13560 /* Calculate whether all elements are in one vector. */
13561 for (i = which = 0; i < nelt; ++i)
13562 {
13563 unsigned char e = d.perm[i];
13564 gcc_assert (e < 2 * nelt);
13565 which |= (e < nelt ? 1 : 2);
13566 }
13567
13568 /* If all elements are from the second vector, reindex as if from the
13569 first vector. */
13570 if (which == 2)
13571 for (i = 0; i < nelt; ++i)
13572 d.perm[i] -= nelt;
13573
13574 /* Check whether the mask can be applied to a single vector. */
13575 d.one_vector_p = (which != 3);
13576
13577 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13578 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13579 if (!d.one_vector_p)
13580 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13581
13582 start_sequence ();
13583 ret = aarch64_expand_vec_perm_const_1 (&d);
13584 end_sequence ();
13585
13586 return ret;
13587 }
13588
13589 rtx
13590 aarch64_reverse_mask (enum machine_mode mode)
13591 {
13592 /* We have to reverse each vector because we don't have
13593 a permuted load that can reverse-load according to ABI rules. */
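/* For example, for V4SImode the mask is
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 },
   i.e. a byte reversal within each 32-bit element.  */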
13594 rtx mask;
13595 rtvec v = rtvec_alloc (16);
13596 int i, j;
13597 int nunits = GET_MODE_NUNITS (mode);
13598 int usize = GET_MODE_UNIT_SIZE (mode);
13599
13600 gcc_assert (BYTES_BIG_ENDIAN);
13601 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13602
13603 for (i = 0; i < nunits; i++)
13604 for (j = 0; j < usize; j++)
13605 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13606 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13607 return force_reg (V16QImode, mask);
13608 }
13609
13610 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13611 However, due to issues with register allocation it is preferable to avoid
13612 tying integer scalar and FP scalar modes. Executing integer operations
13613 in general registers is better than treating them as scalar vector
13614 operations. This reduces latency and avoids redundant int<->FP moves.
13615 So tie modes if they are either the same class, or vector modes with
13616 other vector modes, vector structs or any scalar mode.
13617 */
13618
13619 bool
13620 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13621 {
13622 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13623 return true;
13624
13625 /* We specifically want to allow elements of "structure" modes to
13626 be tieable to the structure. This more general condition allows
13627 other rarer situations too. */
13628 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13629 return true;
13630
13631 /* Also allow any scalar modes with vectors. */
13632 if (aarch64_vector_mode_supported_p (mode1)
13633 || aarch64_vector_mode_supported_p (mode2))
13634 return true;
13635
13636 return false;
13637 }
13638
13639 /* Return a new RTX holding the result of moving POINTER forward by
13640 AMOUNT bytes. */
13641
13642 static rtx
13643 aarch64_move_pointer (rtx pointer, int amount)
13644 {
13645 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13646
13647 return adjust_automodify_address (pointer, GET_MODE (pointer),
13648 next, amount);
13649 }
13650
13651 /* Return a new RTX holding the result of moving POINTER forward by the
13652 size of the mode it points to. */
13653
13654 static rtx
13655 aarch64_progress_pointer (rtx pointer)
13656 {
13657 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13658
13659 return aarch64_move_pointer (pointer, amount);
13660 }
13661
13662 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13663 MODE bytes. */
13664
13665 static void
13666 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13667 machine_mode mode)
13668 {
13669 rtx reg = gen_reg_rtx (mode);
13670
13671 /* "Cast" the pointers to the correct mode. */
13672 *src = adjust_address (*src, mode, 0);
13673 *dst = adjust_address (*dst, mode, 0);
13674 /* Emit the memcpy. */
13675 emit_move_insn (reg, *src);
13676 emit_move_insn (*dst, reg);
13677 /* Move the pointers forward. */
13678 *src = aarch64_progress_pointer (*src);
13679 *dst = aarch64_progress_pointer (*dst);
13680 }
13681
13682 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13683 we succeed, otherwise return false. */
13684
13685 bool
13686 aarch64_expand_movmem (rtx *operands)
13687 {
13688 unsigned int n;
13689 rtx dst = operands[0];
13690 rtx src = operands[1];
13691 rtx base;
13692 bool speed_p = !optimize_function_for_size_p (cfun);
13693
13694 /* When optimizing for size, give a better estimate of the length of a
13695 memcpy call, but use the default otherwise. */
13696 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13697
13698 /* We can't do anything smart if the amount to copy is not constant. */
13699 if (!CONST_INT_P (operands[2]))
13700 return false;
13701
13702 n = UINTVAL (operands[2]);
13703
13704 /* Try to keep the number of instructions low. For cases below 16 bytes we
13705 need to make at most two moves. For cases above 16 bytes it will be one
13706 move for each 16 byte chunk, then at most two additional moves. */
13707 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13708 return false;
13709
13710 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13711 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13712
13713 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13714 src = adjust_automodify_address (src, VOIDmode, base, 0);
13715
13716 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13717 1-byte chunk. */
13718 if (n < 4)
13719 {
13720 if (n >= 2)
13721 {
13722 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13723 n -= 2;
13724 }
13725
13726 if (n == 1)
13727 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13728
13729 return true;
13730 }
13731
13732 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13733 4-byte chunk, partially overlapping with the previously copied chunk. */
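/* For example, a 7-byte copy becomes a 4-byte move of bytes 0-3 followed by
   a 4-byte move of bytes 3-6; byte 3 is simply written twice with the same
   value.  */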
13734 if (n < 8)
13735 {
13736 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13737 n -= 4;
13738 if (n > 0)
13739 {
13740 int move = n - 4;
13741
13742 src = aarch64_move_pointer (src, move);
13743 dst = aarch64_move_pointer (dst, move);
13744 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13745 }
13746 return true;
13747 }
13748
13749 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13750 them, then (if applicable) an 8-byte chunk. */
13751 while (n >= 8)
13752 {
13753 if (n / 16)
13754 {
13755 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13756 n -= 16;
13757 }
13758 else
13759 {
13760 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13761 n -= 8;
13762 }
13763 }
13764
13765 /* Finish the final bytes of the copy. We can always do this in one
13766 instruction. We either copy the exact amount we need, or partially
13767 overlap with the previous chunk we copied and copy 8 bytes. */
13768 if (n == 0)
13769 return true;
13770 else if (n == 1)
13771 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13772 else if (n == 2)
13773 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13774 else if (n == 4)
13775 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13776 else
13777 {
13778 if (n == 3)
13779 {
13780 src = aarch64_move_pointer (src, -1);
13781 dst = aarch64_move_pointer (dst, -1);
13782 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13783 }
13784 else
13785 {
13786 int move = n - 8;
13787
13788 src = aarch64_move_pointer (src, move);
13789 dst = aarch64_move_pointer (dst, move);
13790 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13791 }
13792 }
13793
13794 return true;
13795 }
13796
13797 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13798 SImode stores. Handle the case when the constant has identical
13799 bottom and top halves. This is beneficial when the two stores can be
13800 merged into an STP and we avoid synthesising potentially expensive
13801 immediates twice. Return true if such a split is possible. */
13802
13803 bool
13804 aarch64_split_dimode_const_store (rtx dst, rtx src)
13805 {
13806 rtx lo = gen_lowpart (SImode, src);
13807 rtx hi = gen_highpart_mode (SImode, DImode, src);
13808
13809 bool size_p = optimize_function_for_size_p (cfun);
13810
13811 if (!rtx_equal_p (lo, hi))
13812 return false;
13813
13814 unsigned int orig_cost
13815 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13816 unsigned int lo_cost
13817 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13818
13819 /* We want to transform:
13820 MOV x1, 49370
13821 MOVK x1, 0x140, lsl 16
13822 MOVK x1, 0xc0da, lsl 32
13823 MOVK x1, 0x140, lsl 48
13824 STR x1, [x0]
13825 into:
13826 MOV w1, 49370
13827 MOVK w1, 0x140, lsl 16
13828 STP w1, w1, [x0]
13829 So we want to perform this only when we save two instructions
13830 or more. When optimizing for size, however, accept any code size
13831 savings we can. */
13832 if (size_p && orig_cost <= lo_cost)
13833 return false;
13834
13835 if (!size_p
13836 && (orig_cost <= lo_cost + 1))
13837 return false;
13838
13839 rtx mem_lo = adjust_address (dst, SImode, 0);
13840 if (!aarch64_mem_pair_operand (mem_lo, SImode))
13841 return false;
13842
13843 rtx tmp_reg = gen_reg_rtx (SImode);
13844 aarch64_expand_mov_immediate (tmp_reg, lo);
13845 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13846 /* Don't emit an explicit store pair as this may not always be profitable.
13847 Let the sched-fusion logic decide whether to merge them. */
13848 emit_move_insn (mem_lo, tmp_reg);
13849 emit_move_insn (mem_hi, tmp_reg);
13850
13851 return true;
13852 }
13853
13854 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
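/* AddressSanitizer maps an application address A to shadow memory at
   (A >> 3) + this offset, so the AArch64 shadow region starts at 1 << 36.  */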
13855
13856 static unsigned HOST_WIDE_INT
13857 aarch64_asan_shadow_offset (void)
13858 {
13859 return (HOST_WIDE_INT_1 << 36);
13860 }
13861
13862 static bool
13863 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13864 unsigned int align,
13865 enum by_pieces_operation op,
13866 bool speed_p)
13867 {
13868 /* STORE_BY_PIECES can be used when copying a constant string, but
13869 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13870 For now we always fail this and let the move_by_pieces code copy
13871 the string from read-only memory. */
13872 if (op == STORE_BY_PIECES)
13873 return false;
13874
13875 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13876 }
13877
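/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first comparison (CODE applied
   to TREEOP0 and TREEOP1) of a conditional-compare chain: PREP_SEQ receives
   the insns that prepare the operands and GEN_SEQ the compare itself.  Return
   the comparison against the CC register, or NULL_RTX if the operands cannot
   be handled.  */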
13878 static rtx
13879 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13880 int code, tree treeop0, tree treeop1)
13881 {
13882 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13883 rtx op0, op1;
13884 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13885 insn_code icode;
13886 struct expand_operand ops[4];
13887
13888 start_sequence ();
13889 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13890
13891 op_mode = GET_MODE (op0);
13892 if (op_mode == VOIDmode)
13893 op_mode = GET_MODE (op1);
13894
13895 switch (op_mode)
13896 {
13897 case QImode:
13898 case HImode:
13899 case SImode:
13900 cmp_mode = SImode;
13901 icode = CODE_FOR_cmpsi;
13902 break;
13903
13904 case DImode:
13905 cmp_mode = DImode;
13906 icode = CODE_FOR_cmpdi;
13907 break;
13908
13909 case SFmode:
13910 cmp_mode = SFmode;
13911 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13912 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13913 break;
13914
13915 case DFmode:
13916 cmp_mode = DFmode;
13917 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13918 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13919 break;
13920
13921 default:
13922 end_sequence ();
13923 return NULL_RTX;
13924 }
13925
13926 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13927 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13928 if (!op0 || !op1)
13929 {
13930 end_sequence ();
13931 return NULL_RTX;
13932 }
13933 *prep_seq = get_insns ();
13934 end_sequence ();
13935
13936 create_fixed_operand (&ops[0], op0);
13937 create_fixed_operand (&ops[1], op1);
13938
13939 start_sequence ();
13940 if (!maybe_expand_insn (icode, 2, ops))
13941 {
13942 end_sequence ();
13943 return NULL_RTX;
13944 }
13945 *gen_seq = get_insns ();
13946 end_sequence ();
13947
13948 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13949 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13950 }
13951
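/* Implement TARGET_GEN_CCMP_NEXT.  Expand a subsequent compare of a
   conditional-compare chain: PREV is the comparison generated so far,
   CMP_CODE the new comparison of TREEOP0 and TREEOP1, and BIT_CODE the way
   the two are combined (AND or IOR).  For IOR the previous condition and the
   AArch64 condition code are inverted before emitting the CCMP pattern.  */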
13952 static rtx
13953 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
13954 int cmp_code, tree treeop0, tree treeop1, int bit_code)
13955 {
13956 rtx op0, op1, target;
13957 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13958 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13959 insn_code icode;
13960 struct expand_operand ops[6];
13961 int aarch64_cond;
13962
13963 push_to_sequence (*prep_seq);
13964 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13965
13966 op_mode = GET_MODE (op0);
13967 if (op_mode == VOIDmode)
13968 op_mode = GET_MODE (op1);
13969
13970 switch (op_mode)
13971 {
13972 case QImode:
13973 case HImode:
13974 case SImode:
13975 cmp_mode = SImode;
13976 icode = CODE_FOR_ccmpsi;
13977 break;
13978
13979 case DImode:
13980 cmp_mode = DImode;
13981 icode = CODE_FOR_ccmpdi;
13982 break;
13983
13984 case SFmode:
13985 cmp_mode = SFmode;
13986 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13987 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13988 break;
13989
13990 case DFmode:
13991 cmp_mode = DFmode;
13992 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13993 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13994 break;
13995
13996 default:
13997 end_sequence ();
13998 return NULL_RTX;
13999 }
14000
14001 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14002 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14003 if (!op0 || !op1)
14004 {
14005 end_sequence ();
14006 return NULL_RTX;
14007 }
14008 *prep_seq = get_insns ();
14009 end_sequence ();
14010
14011 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14012 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14013
14014 if (bit_code != AND)
14015 {
14016 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14017 GET_MODE (XEXP (prev, 0))),
14018 VOIDmode, XEXP (prev, 0), const0_rtx);
14019 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14020 }
14021
14022 create_fixed_operand (&ops[0], XEXP (prev, 0));
14023 create_fixed_operand (&ops[1], target);
14024 create_fixed_operand (&ops[2], op0);
14025 create_fixed_operand (&ops[3], op1);
14026 create_fixed_operand (&ops[4], prev);
14027 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14028
14029 push_to_sequence (*gen_seq);
14030 if (!maybe_expand_insn (icode, 6, ops))
14031 {
14032 end_sequence ();
14033 return NULL_RTX;
14034 }
14035
14036 *gen_seq = get_insns ();
14037 end_sequence ();
14038
14039 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14040 }
14041
14042 #undef TARGET_GEN_CCMP_FIRST
14043 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14044
14045 #undef TARGET_GEN_CCMP_NEXT
14046 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14047
14048 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14049 instruction fusion of some sort. */
14050
14051 static bool
14052 aarch64_macro_fusion_p (void)
14053 {
14054 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14055 }
14056
14057
14058 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14059 should be kept together during scheduling. */
14060
14061 static bool
14062 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14063 {
14064 rtx set_dest;
14065 rtx prev_set = single_set (prev);
14066 rtx curr_set = single_set (curr);
14067 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
14068 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14069
14070 if (!aarch64_macro_fusion_p ())
14071 return false;
14072
14073 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14074 {
14075 /* We are trying to match:
14076 prev (mov) == (set (reg r0) (const_int imm16))
14077 curr (movk) == (set (zero_extract (reg r0)
14078 (const_int 16)
14079 (const_int 16))
14080 (const_int imm16_1)) */
14081
14082 set_dest = SET_DEST (curr_set);
14083
14084 if (GET_CODE (set_dest) == ZERO_EXTRACT
14085 && CONST_INT_P (SET_SRC (curr_set))
14086 && CONST_INT_P (SET_SRC (prev_set))
14087 && CONST_INT_P (XEXP (set_dest, 2))
14088 && INTVAL (XEXP (set_dest, 2)) == 16
14089 && REG_P (XEXP (set_dest, 0))
14090 && REG_P (SET_DEST (prev_set))
14091 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14092 {
14093 return true;
14094 }
14095 }
14096
14097 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14098 {
14099
14100 /* We're trying to match:
14101 prev (adrp) == (set (reg r1)
14102 (high (symbol_ref ("SYM"))))
14103 curr (add) == (set (reg r0)
14104 (lo_sum (reg r1)
14105 (symbol_ref ("SYM"))))
14106 Note that r0 need not necessarily be the same as r1, especially
14107 during pre-regalloc scheduling. */
14108
14109 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14110 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14111 {
14112 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14113 && REG_P (XEXP (SET_SRC (curr_set), 0))
14114 && REGNO (XEXP (SET_SRC (curr_set), 0))
14115 == REGNO (SET_DEST (prev_set))
14116 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14117 XEXP (SET_SRC (curr_set), 1)))
14118 return true;
14119 }
14120 }
14121
14122 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14123 {
14124
14125 /* We're trying to match:
14126 prev (movk) == (set (zero_extract (reg r0)
14127 (const_int 16)
14128 (const_int 32))
14129 (const_int imm16_1))
14130 curr (movk) == (set (zero_extract (reg r0)
14131 (const_int 16)
14132 (const_int 48))
14133 (const_int imm16_2)) */
14134
14135 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14136 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14137 && REG_P (XEXP (SET_DEST (prev_set), 0))
14138 && REG_P (XEXP (SET_DEST (curr_set), 0))
14139 && REGNO (XEXP (SET_DEST (prev_set), 0))
14140 == REGNO (XEXP (SET_DEST (curr_set), 0))
14141 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14142 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14143 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14144 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14145 && CONST_INT_P (SET_SRC (prev_set))
14146 && CONST_INT_P (SET_SRC (curr_set)))
14147 return true;
14148
14149 }
14150 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14151 {
14152 /* We're trying to match:
14153 prev (adrp) == (set (reg r0)
14154 (high (symbol_ref ("SYM"))))
14155 curr (ldr) == (set (reg r1)
14156 (mem (lo_sum (reg r0)
14157 (symbol_ref ("SYM")))))
14158 or
14159 curr (ldr) == (set (reg r1)
14160 (zero_extend (mem
14161 (lo_sum (reg r0)
14162 (symbol_ref ("SYM")))))) */
14163 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14164 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14165 {
14166 rtx curr_src = SET_SRC (curr_set);
14167
14168 if (GET_CODE (curr_src) == ZERO_EXTEND)
14169 curr_src = XEXP (curr_src, 0);
14170
14171 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14172 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14173 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14174 == REGNO (SET_DEST (prev_set))
14175 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14176 XEXP (SET_SRC (prev_set), 0)))
14177 return true;
14178 }
14179 }
14180
14181 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14182 && aarch_crypto_can_dual_issue (prev, curr))
14183 return true;
14184
14185 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14186 && any_condjump_p (curr))
14187 {
14188 enum attr_type prev_type = get_attr_type (prev);
14189
14190 /* FIXME: this misses some cases that ThunderX considers simple
14191 arithmetic instructions. Simple shifts are missed here. */
14192 if (prev_type == TYPE_ALUS_SREG
14193 || prev_type == TYPE_ALUS_IMM
14194 || prev_type == TYPE_LOGICS_REG
14195 || prev_type == TYPE_LOGICS_IMM)
14196 return true;
14197 }
14198
14199 return false;
14200 }
14201
14202 /* Return true iff the instruction fusion described by OP is enabled. */
14203
14204 bool
14205 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14206 {
14207 return (aarch64_tune_params.fusible_ops & op) != 0;
14208 }
14209
14210 /* If MEM is in the form of [base+offset], extract the two parts
14211 of the address into BASE and OFFSET, otherwise return false
14212 after clearing BASE and OFFSET. */
14213
14214 bool
14215 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14216 {
14217 rtx addr;
14218
14219 gcc_assert (MEM_P (mem));
14220
14221 addr = XEXP (mem, 0);
14222
14223 if (REG_P (addr))
14224 {
14225 *base = addr;
14226 *offset = const0_rtx;
14227 return true;
14228 }
14229
14230 if (GET_CODE (addr) == PLUS
14231 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14232 {
14233 *base = XEXP (addr, 0);
14234 *offset = XEXP (addr, 1);
14235 return true;
14236 }
14237
14238 *base = NULL_RTX;
14239 *offset = NULL_RTX;
14240
14241 return false;
14242 }
14243
14244 /* Types for scheduling fusion. */
14245 enum sched_fusion_type
14246 {
14247 SCHED_FUSION_NONE = 0,
14248 SCHED_FUSION_LD_SIGN_EXTEND,
14249 SCHED_FUSION_LD_ZERO_EXTEND,
14250 SCHED_FUSION_LD,
14251 SCHED_FUSION_ST,
14252 SCHED_FUSION_NUM
14253 };
14254
14255 /* If INSN is a load or store with an address in the form of [base+offset],
14256 extract the two parts into BASE and OFFSET. Return the scheduling
14257 fusion type of this INSN. */
14258
14259 static enum sched_fusion_type
14260 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14261 {
14262 rtx x, dest, src;
14263 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14264
14265 gcc_assert (INSN_P (insn));
14266 x = PATTERN (insn);
14267 if (GET_CODE (x) != SET)
14268 return SCHED_FUSION_NONE;
14269
14270 src = SET_SRC (x);
14271 dest = SET_DEST (x);
14272
14273 machine_mode dest_mode = GET_MODE (dest);
14274
14275 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14276 return SCHED_FUSION_NONE;
14277
14278 if (GET_CODE (src) == SIGN_EXTEND)
14279 {
14280 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14281 src = XEXP (src, 0);
14282 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14283 return SCHED_FUSION_NONE;
14284 }
14285 else if (GET_CODE (src) == ZERO_EXTEND)
14286 {
14287 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14288 src = XEXP (src, 0);
14289 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14290 return SCHED_FUSION_NONE;
14291 }
14292
14293 if (GET_CODE (src) == MEM && REG_P (dest))
14294 extract_base_offset_in_addr (src, base, offset);
14295 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14296 {
14297 fusion = SCHED_FUSION_ST;
14298 extract_base_offset_in_addr (dest, base, offset);
14299 }
14300 else
14301 return SCHED_FUSION_NONE;
14302
14303 if (*base == NULL_RTX || *offset == NULL_RTX)
14304 fusion = SCHED_FUSION_NONE;
14305
14306 return fusion;
14307 }
14308
14309 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14310
14311 Currently we only support fusing ldr and str instructions, so FUSION_PRI
14312 and PRI are only calculated for these instructions. For other instructions,
14313 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14314 types of instruction fusion can be added by returning different priorities.
14315
14316 It's important that irrelevant instructions get the largest FUSION_PRI. */
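/* For example, two SImode loads from [x1, 8] and [x1, 12] get the same
   FUSION_PRI (same fusion type and base register) but PRI values that order
   them by offset, so the scheduler can keep them adjacent for the ldp
   peephole patterns.  */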
14317
14318 static void
14319 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14320 int *fusion_pri, int *pri)
14321 {
14322 int tmp, off_val;
14323 rtx base, offset;
14324 enum sched_fusion_type fusion;
14325
14326 gcc_assert (INSN_P (insn));
14327
14328 tmp = max_pri - 1;
14329 fusion = fusion_load_store (insn, &base, &offset);
14330 if (fusion == SCHED_FUSION_NONE)
14331 {
14332 *pri = tmp;
14333 *fusion_pri = tmp;
14334 return;
14335 }
14336
14337 /* Set FUSION_PRI according to fusion type and base register. */
14338 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14339
14340 /* Calculate PRI. */
14341 tmp /= 2;
14342
14343 /* INSN with smaller offset goes first. */
14344 off_val = (int)(INTVAL (offset));
14345 if (off_val >= 0)
14346 tmp -= (off_val & 0xfffff);
14347 else
14348 tmp += ((- off_val) & 0xfffff);
14349
14350 *pri = tmp;
14351 return;
14352 }
14353
14354 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14355 Adjust priority of sha1h instructions so they are scheduled before
14356 other SHA1 instructions. */
14357
14358 static int
14359 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14360 {
14361 rtx x = PATTERN (insn);
14362
14363 if (GET_CODE (x) == SET)
14364 {
14365 x = SET_SRC (x);
14366
14367 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14368 return priority + 10;
14369 }
14370
14371 return priority;
14372 }
14373
14374 /* Given OPERANDS of consecutive load/store, check if we can merge
14375 them into ldp/stp. LOAD is true if they are load instructions.
14376 MODE is the mode of memory operands. */
14377
14378 bool
14379 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14380 enum machine_mode mode)
14381 {
14382 HOST_WIDE_INT offval_1, offval_2, msize;
14383 enum reg_class rclass_1, rclass_2;
14384 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14385
14386 if (load)
14387 {
14388 mem_1 = operands[1];
14389 mem_2 = operands[3];
14390 reg_1 = operands[0];
14391 reg_2 = operands[2];
14392 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14393 if (REGNO (reg_1) == REGNO (reg_2))
14394 return false;
14395 }
14396 else
14397 {
14398 mem_1 = operands[0];
14399 mem_2 = operands[2];
14400 reg_1 = operands[1];
14401 reg_2 = operands[3];
14402 }
14403
14404 /* The mems cannot be volatile. */
14405 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14406 return false;
14407
14408 /* If we have SImode and slow unaligned ldp,
14409 check that the alignment is at least 8 bytes. */
14410 if (mode == SImode
14411 && (aarch64_tune_params.extra_tuning_flags
14412 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14413 && !optimize_size
14414 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14415 return false;
14416
14417 /* Check if the addresses are in the form of [base+offset]. */
14418 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14419 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14420 return false;
14421 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14422 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14423 return false;
14424
14425 /* Check if the bases are same. */
14426 if (!rtx_equal_p (base_1, base_2))
14427 return false;
14428
14429 offval_1 = INTVAL (offset_1);
14430 offval_2 = INTVAL (offset_2);
14431 msize = GET_MODE_SIZE (mode);
14432 /* Check if the offsets are consecutive. */
14433 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14434 return false;
14435
14436 /* Check if the addresses are clobbered by load. */
14437 if (load)
14438 {
14439 if (reg_mentioned_p (reg_1, mem_1))
14440 return false;
14441
14442 /* In increasing order, the last load can clobber the address. */
14443 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14444 return false;
14445 }
14446
14447 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14448 rclass_1 = FP_REGS;
14449 else
14450 rclass_1 = GENERAL_REGS;
14451
14452 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14453 rclass_2 = FP_REGS;
14454 else
14455 rclass_2 = GENERAL_REGS;
14456
14457 /* Check if the registers are of same class. */
14458 if (rclass_1 != rclass_2)
14459 return false;
14460
14461 return true;
14462 }
14463
14464 /* Given OPERANDS of consecutive load/store, check if we can merge
14465 them into ldp/stp by adjusting the offset. LOAD is true if they
14466 are load instructions. MODE is the mode of memory operands.
14467
14468 Given below consecutive stores:
14469
14470 str w1, [xb, 0x100]
14471 str w1, [xb, 0x104]
14472 str w1, [xb, 0x108]
14473 str w1, [xb, 0x10c]
14474
14475 Though the offsets are out of the range supported by stp, we can
14476 still pair them after adjusting the offset, like:
14477
14478 add scratch, xb, 0x100
14479 stp w1, w1, [scratch]
14480 stp w1, w1, [scratch, 0x8]
14481
14482 The peephole patterns detecting this opportunity should guarantee
14483 the scratch register is available. */
14484
14485 bool
14486 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14487 enum machine_mode mode)
14488 {
14489 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14490 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14491 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14492 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14493
14494 if (load)
14495 {
14496 reg_1 = operands[0];
14497 mem_1 = operands[1];
14498 reg_2 = operands[2];
14499 mem_2 = operands[3];
14500 reg_3 = operands[4];
14501 mem_3 = operands[5];
14502 reg_4 = operands[6];
14503 mem_4 = operands[7];
14504 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14505 && REG_P (reg_3) && REG_P (reg_4));
14506 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14507 return false;
14508 }
14509 else
14510 {
14511 mem_1 = operands[0];
14512 reg_1 = operands[1];
14513 mem_2 = operands[2];
14514 reg_2 = operands[3];
14515 mem_3 = operands[4];
14516 reg_3 = operands[5];
14517 mem_4 = operands[6];
14518 reg_4 = operands[7];
14519 }
14520 /* Skip if memory operand is by itself valid for ldp/stp. */
14521 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14522 return false;
14523
14524 /* The mems cannot be volatile. */
14525 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14526 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14527 return false;
14528
14529 /* Check if the addresses are in the form of [base+offset]. */
14530 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14531 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14532 return false;
14533 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14534 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14535 return false;
14536 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14537 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14538 return false;
14539 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14540 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14541 return false;
14542
14543 /* Check if the bases are same. */
14544 if (!rtx_equal_p (base_1, base_2)
14545 || !rtx_equal_p (base_2, base_3)
14546 || !rtx_equal_p (base_3, base_4))
14547 return false;
14548
14549 offval_1 = INTVAL (offset_1);
14550 offval_2 = INTVAL (offset_2);
14551 offval_3 = INTVAL (offset_3);
14552 offval_4 = INTVAL (offset_4);
14553 msize = GET_MODE_SIZE (mode);
14554 /* Check if the offsets are consecutive. */
14555 if ((offval_1 != (offval_2 + msize)
14556 || offval_1 != (offval_3 + msize * 2)
14557 || offval_1 != (offval_4 + msize * 3))
14558 && (offval_4 != (offval_3 + msize)
14559 || offval_4 != (offval_2 + msize * 2)
14560 || offval_4 != (offval_1 + msize * 3)))
14561 return false;
14562
14563 /* Check if the addresses are clobbered by load. */
14564 if (load)
14565 {
14566 if (reg_mentioned_p (reg_1, mem_1)
14567 || reg_mentioned_p (reg_2, mem_2)
14568 || reg_mentioned_p (reg_3, mem_3))
14569 return false;
14570
14571 /* In increasing order, the last load can clobber the address. */
14572 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14573 return false;
14574 }
14575
14576 /* If we have SImode and slow unaligned ldp,
14577 check that the alignment is at least 8 bytes. */
14578 if (mode == SImode
14579 && (aarch64_tune_params.extra_tuning_flags
14580 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14581 && !optimize_size
14582 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14583 return false;
14584
14585 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14586 rclass_1 = FP_REGS;
14587 else
14588 rclass_1 = GENERAL_REGS;
14589
14590 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14591 rclass_2 = FP_REGS;
14592 else
14593 rclass_2 = GENERAL_REGS;
14594
14595 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14596 rclass_3 = FP_REGS;
14597 else
14598 rclass_3 = GENERAL_REGS;
14599
14600 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14601 rclass_4 = FP_REGS;
14602 else
14603 rclass_4 = GENERAL_REGS;
14604
14605 /* Check if the registers are of same class. */
14606 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14607 return false;
14608
14609 return true;
14610 }
14611
14612 /* Given OPERANDS of consecutive load/store, this function pairs them
14613 into ldp/stp after adjusting the offset. It depends on the fact
14614 that addresses of load/store instructions are in increasing order.
14615 MODE is the mode of memory operands. CODE is the rtl operator
14616 which should be applied to all memory operands, it's SIGN_EXTEND,
14617 ZERO_EXTEND or UNKNOWN. */
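/* For example, for the SImode stores at offsets 0x100-0x10c shown above
   aarch64_operands_adjust_ok_for_ldpstp, the limit is 0x40 * 4 == 0x100,
   so adj_off becomes 0x100 and new_off 0, giving the add/stp sequence in
   that comment.  */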
14618
14619 bool
14620 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14621 enum machine_mode mode, RTX_CODE code)
14622 {
14623 rtx base, offset, t1, t2;
14624 rtx mem_1, mem_2, mem_3, mem_4;
14625 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14626
14627 if (load)
14628 {
14629 mem_1 = operands[1];
14630 mem_2 = operands[3];
14631 mem_3 = operands[5];
14632 mem_4 = operands[7];
14633 }
14634 else
14635 {
14636 mem_1 = operands[0];
14637 mem_2 = operands[2];
14638 mem_3 = operands[4];
14639 mem_4 = operands[6];
14640 gcc_assert (code == UNKNOWN);
14641 }
14642
14643 extract_base_offset_in_addr (mem_1, &base, &offset);
14644 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14645
14646 /* Adjust the offset so it can fit in an ldp/stp instruction. */
14647 msize = GET_MODE_SIZE (mode);
14648 stp_off_limit = msize * 0x40;
14649 off_val = INTVAL (offset);
14650 abs_off = (off_val < 0) ? -off_val : off_val;
14651 new_off = abs_off % stp_off_limit;
14652 adj_off = abs_off - new_off;
14653
14654 /* Further adjust to make sure all offsets are OK. */
14655 if ((new_off + msize * 2) >= stp_off_limit)
14656 {
14657 adj_off += stp_off_limit;
14658 new_off -= stp_off_limit;
14659 }
14660
14661 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14662 if (adj_off >= 0x1000)
14663 return false;
14664
14665 if (off_val < 0)
14666 {
14667 adj_off = -adj_off;
14668 new_off = -new_off;
14669 }
14670
14671 /* Create new memory references. */
14672 mem_1 = change_address (mem_1, VOIDmode,
14673 plus_constant (DImode, operands[8], new_off));
14674
14675 /* Check if the adjusted address is OK for ldp/stp. */
14676 if (!aarch64_mem_pair_operand (mem_1, mode))
14677 return false;
14678
14679 msize = GET_MODE_SIZE (mode);
14680 mem_2 = change_address (mem_2, VOIDmode,
14681 plus_constant (DImode,
14682 operands[8],
14683 new_off + msize));
14684 mem_3 = change_address (mem_3, VOIDmode,
14685 plus_constant (DImode,
14686 operands[8],
14687 new_off + msize * 2));
14688 mem_4 = change_address (mem_4, VOIDmode,
14689 plus_constant (DImode,
14690 operands[8],
14691 new_off + msize * 3));
14692
14693 if (code == ZERO_EXTEND)
14694 {
14695 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14696 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14697 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14698 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14699 }
14700 else if (code == SIGN_EXTEND)
14701 {
14702 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14703 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14704 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14705 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14706 }
14707
14708 if (load)
14709 {
14710 operands[1] = mem_1;
14711 operands[3] = mem_2;
14712 operands[5] = mem_3;
14713 operands[7] = mem_4;
14714 }
14715 else
14716 {
14717 operands[0] = mem_1;
14718 operands[2] = mem_2;
14719 operands[4] = mem_3;
14720 operands[6] = mem_4;
14721 }
14722
14723 /* Emit adjusting instruction. */
14724 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14725 /* Emit ldp/stp instructions. */
14726 t1 = gen_rtx_SET (operands[0], operands[1]);
14727 t2 = gen_rtx_SET (operands[2], operands[3]);
14728 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14729 t1 = gen_rtx_SET (operands[4], operands[5]);
14730 t2 = gen_rtx_SET (operands[6], operands[7]);
14731 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14732 return true;
14733 }
14734
14735 /* Return 1 if a pseudo register should be created and used to hold
14736 the GOT address for PIC code. */
14737
14738 bool
14739 aarch64_use_pseudo_pic_reg (void)
14740 {
14741 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14742 }
14743
14744 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14745
14746 static int
14747 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14748 {
14749 switch (XINT (x, 1))
14750 {
14751 case UNSPEC_GOTSMALLPIC:
14752 case UNSPEC_GOTSMALLPIC28K:
14753 case UNSPEC_GOTTINYPIC:
14754 return 0;
14755 default:
14756 break;
14757 }
14758
14759 return default_unspec_may_trap_p (x, flags);
14760 }
14761
14762
14763 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
14764 return the log2 of that value. Otherwise return -1. */
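/* For example, a CONST_DOUBLE of 4.0 yields 2, while 0.75, 3.0 and -2.0 all
   yield -1.  */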
14765
14766 int
14767 aarch64_fpconst_pow_of_2 (rtx x)
14768 {
14769 const REAL_VALUE_TYPE *r;
14770
14771 if (!CONST_DOUBLE_P (x))
14772 return -1;
14773
14774 r = CONST_DOUBLE_REAL_VALUE (x);
14775
14776 if (REAL_VALUE_NEGATIVE (*r)
14777 || REAL_VALUE_ISNAN (*r)
14778 || REAL_VALUE_ISINF (*r)
14779 || !real_isinteger (r, DFmode))
14780 return -1;
14781
14782 return exact_log2 (real_to_integer (r));
14783 }
14784
14785 /* If X is a vector of equal CONST_DOUBLE values and that value is
14786 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14787
14788 int
14789 aarch64_vec_fpconst_pow_of_2 (rtx x)
14790 {
14791 if (GET_CODE (x) != CONST_VECTOR)
14792 return -1;
14793
14794 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14795 return -1;
14796
14797 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14798 if (firstval <= 0)
14799 return -1;
14800
14801 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14802 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14803 return -1;
14804
14805 return firstval;
14806 }
14807
14808 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
14809 to float.
14810
14811 __fp16 always promotes through this hook.
14812 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
14813 through the generic excess precision logic rather than here. */
14814
14815 static tree
14816 aarch64_promoted_type (const_tree t)
14817 {
14818 if (SCALAR_FLOAT_TYPE_P (t)
14819 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
14820 return float_type_node;
14821
14822 return NULL_TREE;
14823 }
14824
14825 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14826
14827 static bool
14828 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14829 optimization_type opt_type)
14830 {
14831 switch (op)
14832 {
14833 case rsqrt_optab:
14834 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14835
14836 default:
14837 return true;
14838 }
14839 }
14840
14841 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
14842 if MODE is HFmode, and punt to the generic implementation otherwise. */
14843
14844 static bool
14845 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
14846 {
14847 return (mode == HFmode
14848 ? true
14849 : default_libgcc_floating_mode_supported_p (mode));
14850 }
14851
14852 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
14853 if MODE is HFmode, and punt to the generic implementation otherwise. */
14854
14855 static bool
14856 aarch64_scalar_mode_supported_p (machine_mode mode)
14857 {
14858 return (mode == HFmode
14859 ? true
14860 : default_scalar_mode_supported_p (mode));
14861 }
14862
14863 /* Set the value of FLT_EVAL_METHOD.
14864 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
14865
14866 0: evaluate all operations and constants, whose semantic type has at
14867 most the range and precision of type float, to the range and
14868 precision of float; evaluate all other operations and constants to
14869 the range and precision of the semantic type;
14870
14871 N, where _FloatN is a supported interchange floating type:
14872 evaluate all operations and constants, whose semantic type has at
14873 most the range and precision of the _FloatN type, to the range and
14874 precision of the _FloatN type; evaluate all other operations and
14875 constants to the range and precision of the semantic type;
14876
14877 If we have the ARMv8.2-A extensions then we support _Float16 in native
14878 precision, so we should set this to 16. Otherwise, we support the type,
14879 but want to evaluate expressions in float precision, so set this to
14880 0. */
14881
14882 static enum flt_eval_method
14883 aarch64_excess_precision (enum excess_precision_type type)
14884 {
14885 switch (type)
14886 {
14887 case EXCESS_PRECISION_TYPE_FAST:
14888 case EXCESS_PRECISION_TYPE_STANDARD:
14889 /* We can calculate either in 16-bit range and precision or
14890 32-bit range and precision. Make that decision based on whether
14891 we have native support for the ARMv8.2-A 16-bit floating-point
14892 instructions or not. */
14893 return (TARGET_FP_F16INST
14894 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
14895 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
14896 case EXCESS_PRECISION_TYPE_IMPLICIT:
14897 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
14898 default:
14899 gcc_unreachable ();
14900 }
14901 return FLT_EVAL_METHOD_UNPREDICTABLE;
14902 }
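/* Illustrative summary of the hook above (added commentary): when the
   ARMv8.2-A half-precision instructions are available (TARGET_FP_F16INST),
   the standard and fast cases report FLT_EVAL_METHOD == 16 and _Float16
   arithmetic is evaluated in half precision; otherwise they report 0 and
   _Float16 operations are evaluated in float.  */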
14903
14904 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
14905 scheduled for speculative execution. Reject the long-running division
14906 and square-root instructions. */
14907
14908 static bool
14909 aarch64_sched_can_speculate_insn (rtx_insn *insn)
14910 {
14911 switch (get_attr_type (insn))
14912 {
14913 case TYPE_SDIV:
14914 case TYPE_UDIV:
14915 case TYPE_FDIVS:
14916 case TYPE_FDIVD:
14917 case TYPE_FSQRTS:
14918 case TYPE_FSQRTD:
14919 case TYPE_NEON_FP_SQRT_S:
14920 case TYPE_NEON_FP_SQRT_D:
14921 case TYPE_NEON_FP_SQRT_S_Q:
14922 case TYPE_NEON_FP_SQRT_D_Q:
14923 case TYPE_NEON_FP_DIV_S:
14924 case TYPE_NEON_FP_DIV_D:
14925 case TYPE_NEON_FP_DIV_S_Q:
14926 case TYPE_NEON_FP_DIV_D_Q:
14927 return false;
14928 default:
14929 return true;
14930 }
14931 }
14932
14933 /* Target-specific selftests. */
14934
14935 #if CHECKING_P
14936
14937 namespace selftest {
14938
14939 /* Selftest for the RTL loader.
14940 Verify that the RTL loader copes with a dump from
14941 print_rtx_function. This is essentially just a test that class
14942 function_reader can handle a real dump, but it also verifies
14943 that lookup_reg_by_dump_name correctly handles hard regs.
14944 The presence of hard reg names in the dump means that the test is
14945 target-specific, hence it is in this file. */
14946
14947 static void
14948 aarch64_test_loading_full_dump ()
14949 {
14950 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
14951
14952 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
14953
14954 rtx_insn *insn_1 = get_insn_by_uid (1);
14955 ASSERT_EQ (NOTE, GET_CODE (insn_1));
14956
14957 rtx_insn *insn_15 = get_insn_by_uid (15);
14958 ASSERT_EQ (INSN, GET_CODE (insn_15));
14959 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
14960
14961 /* Verify crtl->return_rtx. */
14962 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
14963 ASSERT_EQ (0, REGNO (crtl->return_rtx));
14964 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
14965 }
14966
14967 /* Run all target-specific selftests. */
14968
14969 static void
14970 aarch64_run_selftests (void)
14971 {
14972 aarch64_test_loading_full_dump ();
14973 }
14974
14975 } // namespace selftest
14976
14977 #endif /* #if CHECKING_P */
14978
14979 #undef TARGET_ADDRESS_COST
14980 #define TARGET_ADDRESS_COST aarch64_address_cost
14981
14982 /* This hook determines whether unnamed bitfields affect the alignment
14983 of the containing structure. The hook returns true if the structure
14984 should inherit the alignment requirements of an unnamed bitfield's
14985 type. */
14986 #undef TARGET_ALIGN_ANON_BITFIELD
14987 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14988
14989 #undef TARGET_ASM_ALIGNED_DI_OP
14990 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14991
14992 #undef TARGET_ASM_ALIGNED_HI_OP
14993 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14994
14995 #undef TARGET_ASM_ALIGNED_SI_OP
14996 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14997
14998 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14999 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15000 hook_bool_const_tree_hwi_hwi_const_tree_true
15001
15002 #undef TARGET_ASM_FILE_START
15003 #define TARGET_ASM_FILE_START aarch64_start_file
15004
15005 #undef TARGET_ASM_OUTPUT_MI_THUNK
15006 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15007
15008 #undef TARGET_ASM_SELECT_RTX_SECTION
15009 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15010
15011 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15012 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15013
15014 #undef TARGET_BUILD_BUILTIN_VA_LIST
15015 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15016
15017 #undef TARGET_CALLEE_COPIES
15018 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15019
15020 #undef TARGET_CAN_ELIMINATE
15021 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15022
15023 #undef TARGET_CAN_INLINE_P
15024 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15025
15026 #undef TARGET_CANNOT_FORCE_CONST_MEM
15027 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15028
15029 #undef TARGET_CASE_VALUES_THRESHOLD
15030 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15031
15032 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15033 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15034
15035 /* Only the least significant bit is used for initialization guard
15036 variables. */
15037 #undef TARGET_CXX_GUARD_MASK_BIT
15038 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15039
15040 #undef TARGET_C_MODE_FOR_SUFFIX
15041 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15042
15043 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15044 #undef TARGET_DEFAULT_TARGET_FLAGS
15045 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15046 #endif
15047
15048 #undef TARGET_CLASS_MAX_NREGS
15049 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15050
15051 #undef TARGET_BUILTIN_DECL
15052 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15053
15054 #undef TARGET_BUILTIN_RECIPROCAL
15055 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15056
15057 #undef TARGET_C_EXCESS_PRECISION
15058 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15059
15060 #undef TARGET_EXPAND_BUILTIN
15061 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15062
15063 #undef TARGET_EXPAND_BUILTIN_VA_START
15064 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15065
15066 #undef TARGET_FOLD_BUILTIN
15067 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15068
15069 #undef TARGET_FUNCTION_ARG
15070 #define TARGET_FUNCTION_ARG aarch64_function_arg
15071
15072 #undef TARGET_FUNCTION_ARG_ADVANCE
15073 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15074
15075 #undef TARGET_FUNCTION_ARG_BOUNDARY
15076 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15077
15078 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15079 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15080
15081 #undef TARGET_FUNCTION_VALUE
15082 #define TARGET_FUNCTION_VALUE aarch64_function_value
15083
15084 #undef TARGET_FUNCTION_VALUE_REGNO_P
15085 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15086
15087 #undef TARGET_FRAME_POINTER_REQUIRED
15088 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15089
15090 #undef TARGET_GIMPLE_FOLD_BUILTIN
15091 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15092
15093 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15094 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15095
15096 #undef TARGET_INIT_BUILTINS
15097 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15098
15099 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15100 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15101 aarch64_ira_change_pseudo_allocno_class
15102
15103 #undef TARGET_LEGITIMATE_ADDRESS_P
15104 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15105
15106 #undef TARGET_LEGITIMATE_CONSTANT_P
15107 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15108
15109 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15110 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15111 aarch64_legitimize_address_displacement
15112
15113 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15114 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15115
15116 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15117 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15118 aarch64_libgcc_floating_mode_supported_p
15119
15120 #undef TARGET_MANGLE_TYPE
15121 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15122
15123 #undef TARGET_MEMORY_MOVE_COST
15124 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15125
15126 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15127 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15128
15129 #undef TARGET_MUST_PASS_IN_STACK
15130 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15131
15132 /* This target hook should return true if accesses to volatile bitfields
15133 should use the narrowest mode possible. It should return false if these
15134 accesses should use the bitfield container type. */
15135 #undef TARGET_NARROW_VOLATILE_BITFIELD
15136 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15137
15138 #undef TARGET_OPTION_OVERRIDE
15139 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15140
15141 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15142 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15143 aarch64_override_options_after_change
15144
15145 #undef TARGET_OPTION_SAVE
15146 #define TARGET_OPTION_SAVE aarch64_option_save
15147
15148 #undef TARGET_OPTION_RESTORE
15149 #define TARGET_OPTION_RESTORE aarch64_option_restore
15150
15151 #undef TARGET_OPTION_PRINT
15152 #define TARGET_OPTION_PRINT aarch64_option_print
15153
15154 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15155 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15156
15157 #undef TARGET_SET_CURRENT_FUNCTION
15158 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15159
15160 #undef TARGET_PASS_BY_REFERENCE
15161 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15162
15163 #undef TARGET_PREFERRED_RELOAD_CLASS
15164 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15165
15166 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15167 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15168
15169 #undef TARGET_PROMOTED_TYPE
15170 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15171
15172 #undef TARGET_SECONDARY_RELOAD
15173 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15174
15175 #undef TARGET_SHIFT_TRUNCATION_MASK
15176 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15177
15178 #undef TARGET_SETUP_INCOMING_VARARGS
15179 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15180
15181 #undef TARGET_STRUCT_VALUE_RTX
15182 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15183
15184 #undef TARGET_REGISTER_MOVE_COST
15185 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15186
15187 #undef TARGET_RETURN_IN_MEMORY
15188 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15189
15190 #undef TARGET_RETURN_IN_MSB
15191 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15192
15193 #undef TARGET_RTX_COSTS
15194 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15195
15196 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15197 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15198
15199 #undef TARGET_SCHED_ISSUE_RATE
15200 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15201
15202 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15203 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15204 aarch64_sched_first_cycle_multipass_dfa_lookahead
15205
15206 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15207 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15208 aarch64_first_cycle_multipass_dfa_lookahead_guard
15209
15210 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15211 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15212 aarch64_get_separate_components
15213
15214 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15215 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15216 aarch64_components_for_bb
15217
15218 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15219 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15220 aarch64_disqualify_components
15221
15222 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15223 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15224 aarch64_emit_prologue_components
15225
15226 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15227 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15228 aarch64_emit_epilogue_components
15229
15230 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15231 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15232 aarch64_set_handled_components
15233
15234 #undef TARGET_TRAMPOLINE_INIT
15235 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15236
15237 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15238 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15239
15240 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15241 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15242
15243 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15244 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15245 aarch64_builtin_support_vector_misalignment
15246
15247 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15248 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15249
15250 #undef TARGET_VECTORIZE_ADD_STMT_COST
15251 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15252
15253 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15254 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15255 aarch64_builtin_vectorization_cost
15256
15257 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15258 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15259
15260 #undef TARGET_VECTORIZE_BUILTINS
15261 #define TARGET_VECTORIZE_BUILTINS
15262
15263 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15264 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15265 aarch64_builtin_vectorized_function
15266
15267 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15268 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15269 aarch64_autovectorize_vector_sizes
15270
15271 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15272 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15273 aarch64_atomic_assign_expand_fenv
15274
15275 /* Section anchor support. */
15276
15277 #undef TARGET_MIN_ANCHOR_OFFSET
15278 #define TARGET_MIN_ANCHOR_OFFSET -256
15279
15280 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15281 byte offset; we can do much more for larger data types, but have no way
15282 to determine the size of the access. We assume accesses are aligned. */
15283 #undef TARGET_MAX_ANCHOR_OFFSET
15284 #define TARGET_MAX_ANCHOR_OFFSET 4095
15285
15286 #undef TARGET_VECTOR_ALIGNMENT
15287 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15288
15289 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15290 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15291 aarch64_simd_vector_alignment_reachable
15292
15293 /* vec_perm support. */
15294
15295 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15296 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15297 aarch64_vectorize_vec_perm_const_ok
15298
15299 #undef TARGET_INIT_LIBFUNCS
15300 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15301
15302 #undef TARGET_FIXED_CONDITION_CODE_REGS
15303 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15304
15305 #undef TARGET_FLAGS_REGNUM
15306 #define TARGET_FLAGS_REGNUM CC_REGNUM
15307
15308 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15309 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15310
15311 #undef TARGET_ASAN_SHADOW_OFFSET
15312 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15313
15314 #undef TARGET_LEGITIMIZE_ADDRESS
15315 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15316
15317 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15318 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15319 aarch64_use_by_pieces_infrastructure_p
15320
15321 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15322 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15323
15324 #undef TARGET_CAN_USE_DOLOOP_P
15325 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15326
15327 #undef TARGET_SCHED_ADJUST_PRIORITY
15328 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15329
15330 #undef TARGET_SCHED_MACRO_FUSION_P
15331 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15332
15333 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15334 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15335
15336 #undef TARGET_SCHED_FUSION_PRIORITY
15337 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15338
15339 #undef TARGET_UNSPEC_MAY_TRAP_P
15340 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15341
15342 #undef TARGET_USE_PSEUDO_PIC_REG
15343 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15344
15345 #undef TARGET_PRINT_OPERAND
15346 #define TARGET_PRINT_OPERAND aarch64_print_operand
15347
15348 #undef TARGET_PRINT_OPERAND_ADDRESS
15349 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15350
15351 #undef TARGET_OPTAB_SUPPORTED_P
15352 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15353
15354 #undef TARGET_OMIT_STRUCT_RETURN_REG
15355 #define TARGET_OMIT_STRUCT_RETURN_REG true
15356
15357 /* The architecture reserves bits 0 and 1, so use bit 2 (value 4) for descriptors. */
15358 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15359 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15360
15361 #if CHECKING_P
15362 #undef TARGET_RUN_TARGET_SELFTESTS
15363 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15364 #endif /* #if CHECKING_P */
15365
15366 struct gcc_target targetm = TARGET_INITIALIZER;
15367
15368 #include "gt-aarch64.h"