]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/config/aarch64/aarch64.c
037339d431d80c49699446e548d6b2707883b6a8
[thirdparty/gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
45 #include "calls.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "flags.h"
49 #include "explow.h"
50 #include "expr.h"
51 #include "reload.h"
52 #include "langhooks.h"
53 #include "opts.h"
54 #include "params.h"
55 #include "gimplify.h"
56 #include "dwarf2.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
60 #include "dumpfile.h"
61 #include "builtins.h"
62 #include "rtl-iter.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
67 #include "selftest.h"
68 #include "selftest-rtl.h"
69
70 /* This file should be included last. */
71 #include "target-def.h"
72
73 /* Defined for convenience. */
74 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
75
76 /* Classifies an address.
77
78 ADDRESS_REG_IMM
79 A simple base register plus immediate offset.
80
81 ADDRESS_REG_WB
82 A base register indexed by immediate offset with writeback.
83
84 ADDRESS_REG_REG
85 A base register indexed by (optionally scaled) register.
86
87 ADDRESS_REG_UXTW
88 A base register indexed by (optionally scaled) zero-extended register.
89
90 ADDRESS_REG_SXTW
91 A base register indexed by (optionally scaled) sign-extended register.
92
93 ADDRESS_LO_SUM
94 A LO_SUM rtx with a base register and "LO12" symbol relocation.
95
96 ADDRESS_SYMBOLIC:
97 A constant symbolic address, in pc-relative literal pool. */
98
99 enum aarch64_address_type {
100 ADDRESS_REG_IMM,
101 ADDRESS_REG_WB,
102 ADDRESS_REG_REG,
103 ADDRESS_REG_UXTW,
104 ADDRESS_REG_SXTW,
105 ADDRESS_LO_SUM,
106 ADDRESS_SYMBOLIC
107 };
108
109 struct aarch64_address_info {
110 enum aarch64_address_type type;
111 rtx base;
112 rtx offset;
113 int shift;
114 enum aarch64_symbol_type symbol_type;
115 };
116
117 struct simd_immediate_info
118 {
119 rtx value;
120 int shift;
121 int element_width;
122 bool mvn;
123 bool msl;
124 };
125
126 /* The current code model. */
127 enum aarch64_code_model aarch64_cmodel;
128
129 #ifdef HAVE_AS_TLS
130 #undef TARGET_HAVE_TLS
131 #define TARGET_HAVE_TLS 1
132 #endif
133
134 static bool aarch64_composite_type_p (const_tree, machine_mode);
135 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
136 const_tree,
137 machine_mode *, int *,
138 bool *);
139 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
140 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_override_options_after_change (void);
142 static bool aarch64_vector_mode_supported_p (machine_mode);
143 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
144 const unsigned char *sel);
145 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
146 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
147 const_tree type,
148 int misalignment,
149 bool is_packed);
150
151 /* Major revision number of the ARM Architecture implemented by the target. */
152 unsigned aarch64_architecture_version;
153
154 /* The processor for which instructions should be scheduled. */
155 enum aarch64_processor aarch64_tune = cortexa53;
156
157 /* Mask to specify which instruction scheduling options should be used. */
158 unsigned long aarch64_tune_flags = 0;
159
160 /* Global flag for PC relative loads. */
161 bool aarch64_pcrelative_literal_loads;
162
163 /* Support for command line parsing of boolean flags in the tuning
164 structures. */
165 struct aarch64_flag_desc
166 {
167 const char* name;
168 unsigned int flag;
169 };
170
171 #define AARCH64_FUSION_PAIR(name, internal_name) \
172 { name, AARCH64_FUSE_##internal_name },
173 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
174 {
175 { "none", AARCH64_FUSE_NOTHING },
176 #include "aarch64-fusion-pairs.def"
177 { "all", AARCH64_FUSE_ALL },
178 { NULL, AARCH64_FUSE_NOTHING }
179 };
180
181 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
182 { name, AARCH64_EXTRA_TUNE_##internal_name },
183 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
184 {
185 { "none", AARCH64_EXTRA_TUNE_NONE },
186 #include "aarch64-tuning-flags.def"
187 { "all", AARCH64_EXTRA_TUNE_ALL },
188 { NULL, AARCH64_EXTRA_TUNE_NONE }
189 };
190
191 /* Tuning parameters. */
192
193 static const struct cpu_addrcost_table generic_addrcost_table =
194 {
195 {
196 1, /* hi */
197 0, /* si */
198 0, /* di */
199 1, /* ti */
200 },
201 0, /* pre_modify */
202 0, /* post_modify */
203 0, /* register_offset */
204 0, /* register_sextend */
205 0, /* register_zextend */
206 0 /* imm_offset */
207 };
208
209 static const struct cpu_addrcost_table cortexa57_addrcost_table =
210 {
211 {
212 1, /* hi */
213 0, /* si */
214 0, /* di */
215 1, /* ti */
216 },
217 0, /* pre_modify */
218 0, /* post_modify */
219 0, /* register_offset */
220 0, /* register_sextend */
221 0, /* register_zextend */
222 0, /* imm_offset */
223 };
224
225 static const struct cpu_addrcost_table exynosm1_addrcost_table =
226 {
227 {
228 0, /* hi */
229 0, /* si */
230 0, /* di */
231 2, /* ti */
232 },
233 0, /* pre_modify */
234 0, /* post_modify */
235 1, /* register_offset */
236 1, /* register_sextend */
237 2, /* register_zextend */
238 0, /* imm_offset */
239 };
240
241 static const struct cpu_addrcost_table xgene1_addrcost_table =
242 {
243 {
244 1, /* hi */
245 0, /* si */
246 0, /* di */
247 1, /* ti */
248 },
249 1, /* pre_modify */
250 0, /* post_modify */
251 0, /* register_offset */
252 1, /* register_sextend */
253 1, /* register_zextend */
254 0, /* imm_offset */
255 };
256
257 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
258 {
259 {
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
264 },
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
271 };
272
273 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
274 {
275 {
276 1, /* hi */
277 1, /* si */
278 1, /* di */
279 2, /* ti */
280 },
281 0, /* pre_modify */
282 0, /* post_modify */
283 2, /* register_offset */
284 3, /* register_sextend */
285 3, /* register_zextend */
286 0, /* imm_offset */
287 };
288
289 static const struct cpu_regmove_cost generic_regmove_cost =
290 {
291 1, /* GP2GP */
292 /* Avoid the use of slow int<->fp moves for spilling by setting
293 their cost higher than memmov_cost. */
294 5, /* GP2FP */
295 5, /* FP2GP */
296 2 /* FP2FP */
297 };
298
299 static const struct cpu_regmove_cost cortexa57_regmove_cost =
300 {
301 1, /* GP2GP */
302 /* Avoid the use of slow int<->fp moves for spilling by setting
303 their cost higher than memmov_cost. */
304 5, /* GP2FP */
305 5, /* FP2GP */
306 2 /* FP2FP */
307 };
308
309 static const struct cpu_regmove_cost cortexa53_regmove_cost =
310 {
311 1, /* GP2GP */
312 /* Avoid the use of slow int<->fp moves for spilling by setting
313 their cost higher than memmov_cost. */
314 5, /* GP2FP */
315 5, /* FP2GP */
316 2 /* FP2FP */
317 };
318
319 static const struct cpu_regmove_cost exynosm1_regmove_cost =
320 {
321 1, /* GP2GP */
322 /* Avoid the use of slow int<->fp moves for spilling by setting
323 their cost higher than memmov_cost (actual, 4 and 9). */
324 9, /* GP2FP */
325 9, /* FP2GP */
326 1 /* FP2FP */
327 };
328
329 static const struct cpu_regmove_cost thunderx_regmove_cost =
330 {
331 2, /* GP2GP */
332 2, /* GP2FP */
333 6, /* FP2GP */
334 4 /* FP2FP */
335 };
336
337 static const struct cpu_regmove_cost xgene1_regmove_cost =
338 {
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 8, /* GP2FP */
343 8, /* FP2GP */
344 2 /* FP2FP */
345 };
346
347 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
348 {
349 2, /* GP2GP */
350 /* Avoid the use of int<->fp moves for spilling. */
351 6, /* GP2FP */
352 6, /* FP2GP */
353 4 /* FP2FP */
354 };
355
356 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
357 {
358 1, /* GP2GP */
359 /* Avoid the use of int<->fp moves for spilling. */
360 8, /* GP2FP */
361 8, /* FP2GP */
362 4 /* FP2FP */
363 };
364
365 /* Generic costs for vector insn classes. */
366 static const struct cpu_vector_cost generic_vector_cost =
367 {
368 1, /* scalar_int_stmt_cost */
369 1, /* scalar_fp_stmt_cost */
370 1, /* scalar_load_cost */
371 1, /* scalar_store_cost */
372 1, /* vec_int_stmt_cost */
373 1, /* vec_fp_stmt_cost */
374 2, /* vec_permute_cost */
375 1, /* vec_to_scalar_cost */
376 1, /* scalar_to_vec_cost */
377 1, /* vec_align_load_cost */
378 1, /* vec_unalign_load_cost */
379 1, /* vec_unalign_store_cost */
380 1, /* vec_store_cost */
381 3, /* cond_taken_branch_cost */
382 1 /* cond_not_taken_branch_cost */
383 };
384
385 /* ThunderX costs for vector insn classes. */
386 static const struct cpu_vector_cost thunderx_vector_cost =
387 {
388 1, /* scalar_int_stmt_cost */
389 1, /* scalar_fp_stmt_cost */
390 3, /* scalar_load_cost */
391 1, /* scalar_store_cost */
392 4, /* vec_int_stmt_cost */
393 4, /* vec_fp_stmt_cost */
394 4, /* vec_permute_cost */
395 2, /* vec_to_scalar_cost */
396 2, /* scalar_to_vec_cost */
397 3, /* vec_align_load_cost */
398 10, /* vec_unalign_load_cost */
399 10, /* vec_unalign_store_cost */
400 1, /* vec_store_cost */
401 3, /* cond_taken_branch_cost */
402 3 /* cond_not_taken_branch_cost */
403 };
404
405 /* Generic costs for vector insn classes. */
406 static const struct cpu_vector_cost cortexa57_vector_cost =
407 {
408 1, /* scalar_int_stmt_cost */
409 1, /* scalar_fp_stmt_cost */
410 4, /* scalar_load_cost */
411 1, /* scalar_store_cost */
412 2, /* vec_int_stmt_cost */
413 2, /* vec_fp_stmt_cost */
414 3, /* vec_permute_cost */
415 8, /* vec_to_scalar_cost */
416 8, /* scalar_to_vec_cost */
417 4, /* vec_align_load_cost */
418 4, /* vec_unalign_load_cost */
419 1, /* vec_unalign_store_cost */
420 1, /* vec_store_cost */
421 1, /* cond_taken_branch_cost */
422 1 /* cond_not_taken_branch_cost */
423 };
424
425 static const struct cpu_vector_cost exynosm1_vector_cost =
426 {
427 1, /* scalar_int_stmt_cost */
428 1, /* scalar_fp_stmt_cost */
429 5, /* scalar_load_cost */
430 1, /* scalar_store_cost */
431 3, /* vec_int_stmt_cost */
432 3, /* vec_fp_stmt_cost */
433 3, /* vec_permute_cost */
434 3, /* vec_to_scalar_cost */
435 3, /* scalar_to_vec_cost */
436 5, /* vec_align_load_cost */
437 5, /* vec_unalign_load_cost */
438 1, /* vec_unalign_store_cost */
439 1, /* vec_store_cost */
440 1, /* cond_taken_branch_cost */
441 1 /* cond_not_taken_branch_cost */
442 };
443
444 /* Generic costs for vector insn classes. */
445 static const struct cpu_vector_cost xgene1_vector_cost =
446 {
447 1, /* scalar_int_stmt_cost */
448 1, /* scalar_fp_stmt_cost */
449 5, /* scalar_load_cost */
450 1, /* scalar_store_cost */
451 2, /* vec_int_stmt_cost */
452 2, /* vec_fp_stmt_cost */
453 2, /* vec_permute_cost */
454 4, /* vec_to_scalar_cost */
455 4, /* scalar_to_vec_cost */
456 10, /* vec_align_load_cost */
457 10, /* vec_unalign_load_cost */
458 2, /* vec_unalign_store_cost */
459 2, /* vec_store_cost */
460 2, /* cond_taken_branch_cost */
461 1 /* cond_not_taken_branch_cost */
462 };
463
464 /* Costs for vector insn classes for Vulcan. */
465 static const struct cpu_vector_cost thunderx2t99_vector_cost =
466 {
467 1, /* scalar_int_stmt_cost */
468 6, /* scalar_fp_stmt_cost */
469 4, /* scalar_load_cost */
470 1, /* scalar_store_cost */
471 5, /* vec_int_stmt_cost */
472 6, /* vec_fp_stmt_cost */
473 3, /* vec_permute_cost */
474 6, /* vec_to_scalar_cost */
475 5, /* scalar_to_vec_cost */
476 8, /* vec_align_load_cost */
477 8, /* vec_unalign_load_cost */
478 4, /* vec_unalign_store_cost */
479 4, /* vec_store_cost */
480 2, /* cond_taken_branch_cost */
481 1 /* cond_not_taken_branch_cost */
482 };
483
484 /* Generic costs for branch instructions. */
485 static const struct cpu_branch_cost generic_branch_cost =
486 {
487 1, /* Predictable. */
488 3 /* Unpredictable. */
489 };
490
491 /* Branch costs for Cortex-A57. */
492 static const struct cpu_branch_cost cortexa57_branch_cost =
493 {
494 1, /* Predictable. */
495 3 /* Unpredictable. */
496 };
497
498 /* Branch costs for Vulcan. */
499 static const struct cpu_branch_cost thunderx2t99_branch_cost =
500 {
501 1, /* Predictable. */
502 3 /* Unpredictable. */
503 };
504
505 /* Generic approximation modes. */
506 static const cpu_approx_modes generic_approx_modes =
507 {
508 AARCH64_APPROX_NONE, /* division */
509 AARCH64_APPROX_NONE, /* sqrt */
510 AARCH64_APPROX_NONE /* recip_sqrt */
511 };
512
513 /* Approximation modes for Exynos M1. */
514 static const cpu_approx_modes exynosm1_approx_modes =
515 {
516 AARCH64_APPROX_NONE, /* division */
517 AARCH64_APPROX_ALL, /* sqrt */
518 AARCH64_APPROX_ALL /* recip_sqrt */
519 };
520
521 /* Approximation modes for X-Gene 1. */
522 static const cpu_approx_modes xgene1_approx_modes =
523 {
524 AARCH64_APPROX_NONE, /* division */
525 AARCH64_APPROX_NONE, /* sqrt */
526 AARCH64_APPROX_ALL /* recip_sqrt */
527 };
528
529 /* Generic prefetch settings (which disable prefetch). */
530 static const cpu_prefetch_tune generic_prefetch_tune =
531 {
532 0, /* num_slots */
533 -1, /* l1_cache_size */
534 -1, /* l1_cache_line_size */
535 -1, /* l2_cache_size */
536 -1 /* default_opt_level */
537 };
538
539 static const cpu_prefetch_tune exynosm1_prefetch_tune =
540 {
541 0, /* num_slots */
542 -1, /* l1_cache_size */
543 64, /* l1_cache_line_size */
544 -1, /* l2_cache_size */
545 -1 /* default_opt_level */
546 };
547
548 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
549 {
550 4, /* num_slots */
551 32, /* l1_cache_size */
552 64, /* l1_cache_line_size */
553 1024, /* l2_cache_size */
554 3 /* default_opt_level */
555 };
556
557 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
558 {
559 8, /* num_slots */
560 32, /* l1_cache_size */
561 128, /* l1_cache_line_size */
562 16*1024, /* l2_cache_size */
563 3 /* default_opt_level */
564 };
565
566 static const cpu_prefetch_tune thunderx_prefetch_tune =
567 {
568 8, /* num_slots */
569 32, /* l1_cache_size */
570 128, /* l1_cache_line_size */
571 -1, /* l2_cache_size */
572 -1 /* default_opt_level */
573 };
574
575 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
576 {
577 8, /* num_slots */
578 32, /* l1_cache_size */
579 64, /* l1_cache_line_size */
580 256, /* l2_cache_size */
581 -1 /* default_opt_level */
582 };
583
584 static const struct tune_params generic_tunings =
585 {
586 &cortexa57_extra_costs,
587 &generic_addrcost_table,
588 &generic_regmove_cost,
589 &generic_vector_cost,
590 &generic_branch_cost,
591 &generic_approx_modes,
592 4, /* memmov_cost */
593 2, /* issue_rate */
594 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
595 8, /* function_align. */
596 4, /* jump_align. */
597 8, /* loop_align. */
598 2, /* int_reassoc_width. */
599 4, /* fp_reassoc_width. */
600 1, /* vec_reassoc_width. */
601 2, /* min_div_recip_mul_sf. */
602 2, /* min_div_recip_mul_df. */
603 0, /* max_case_values. */
604 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
605 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
606 &generic_prefetch_tune
607 };
608
609 static const struct tune_params cortexa35_tunings =
610 {
611 &cortexa53_extra_costs,
612 &generic_addrcost_table,
613 &cortexa53_regmove_cost,
614 &generic_vector_cost,
615 &cortexa57_branch_cost,
616 &generic_approx_modes,
617 4, /* memmov_cost */
618 1, /* issue_rate */
619 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
620 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
621 16, /* function_align. */
622 4, /* jump_align. */
623 8, /* loop_align. */
624 2, /* int_reassoc_width. */
625 4, /* fp_reassoc_width. */
626 1, /* vec_reassoc_width. */
627 2, /* min_div_recip_mul_sf. */
628 2, /* min_div_recip_mul_df. */
629 0, /* max_case_values. */
630 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
631 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
632 &generic_prefetch_tune
633 };
634
635 static const struct tune_params cortexa53_tunings =
636 {
637 &cortexa53_extra_costs,
638 &generic_addrcost_table,
639 &cortexa53_regmove_cost,
640 &generic_vector_cost,
641 &cortexa57_branch_cost,
642 &generic_approx_modes,
643 4, /* memmov_cost */
644 2, /* issue_rate */
645 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
646 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
647 16, /* function_align. */
648 4, /* jump_align. */
649 8, /* loop_align. */
650 2, /* int_reassoc_width. */
651 4, /* fp_reassoc_width. */
652 1, /* vec_reassoc_width. */
653 2, /* min_div_recip_mul_sf. */
654 2, /* min_div_recip_mul_df. */
655 0, /* max_case_values. */
656 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
657 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
658 &generic_prefetch_tune
659 };
660
661 static const struct tune_params cortexa57_tunings =
662 {
663 &cortexa57_extra_costs,
664 &cortexa57_addrcost_table,
665 &cortexa57_regmove_cost,
666 &cortexa57_vector_cost,
667 &cortexa57_branch_cost,
668 &generic_approx_modes,
669 4, /* memmov_cost */
670 3, /* issue_rate */
671 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
672 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
673 16, /* function_align. */
674 4, /* jump_align. */
675 8, /* loop_align. */
676 2, /* int_reassoc_width. */
677 4, /* fp_reassoc_width. */
678 1, /* vec_reassoc_width. */
679 2, /* min_div_recip_mul_sf. */
680 2, /* min_div_recip_mul_df. */
681 0, /* max_case_values. */
682 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
683 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
684 &generic_prefetch_tune
685 };
686
687 static const struct tune_params cortexa72_tunings =
688 {
689 &cortexa57_extra_costs,
690 &cortexa57_addrcost_table,
691 &cortexa57_regmove_cost,
692 &cortexa57_vector_cost,
693 &cortexa57_branch_cost,
694 &generic_approx_modes,
695 4, /* memmov_cost */
696 3, /* issue_rate */
697 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
698 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
699 16, /* function_align. */
700 4, /* jump_align. */
701 8, /* loop_align. */
702 2, /* int_reassoc_width. */
703 4, /* fp_reassoc_width. */
704 1, /* vec_reassoc_width. */
705 2, /* min_div_recip_mul_sf. */
706 2, /* min_div_recip_mul_df. */
707 0, /* max_case_values. */
708 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
709 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
710 &generic_prefetch_tune
711 };
712
713 static const struct tune_params cortexa73_tunings =
714 {
715 &cortexa57_extra_costs,
716 &cortexa57_addrcost_table,
717 &cortexa57_regmove_cost,
718 &cortexa57_vector_cost,
719 &cortexa57_branch_cost,
720 &generic_approx_modes,
721 4, /* memmov_cost. */
722 2, /* issue_rate. */
723 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
724 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
725 16, /* function_align. */
726 4, /* jump_align. */
727 8, /* loop_align. */
728 2, /* int_reassoc_width. */
729 4, /* fp_reassoc_width. */
730 1, /* vec_reassoc_width. */
731 2, /* min_div_recip_mul_sf. */
732 2, /* min_div_recip_mul_df. */
733 0, /* max_case_values. */
734 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
735 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
736 &generic_prefetch_tune
737 };
738
739
740
741 static const struct tune_params exynosm1_tunings =
742 {
743 &exynosm1_extra_costs,
744 &exynosm1_addrcost_table,
745 &exynosm1_regmove_cost,
746 &exynosm1_vector_cost,
747 &generic_branch_cost,
748 &exynosm1_approx_modes,
749 4, /* memmov_cost */
750 3, /* issue_rate */
751 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
752 4, /* function_align. */
753 4, /* jump_align. */
754 4, /* loop_align. */
755 2, /* int_reassoc_width. */
756 4, /* fp_reassoc_width. */
757 1, /* vec_reassoc_width. */
758 2, /* min_div_recip_mul_sf. */
759 2, /* min_div_recip_mul_df. */
760 48, /* max_case_values. */
761 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
762 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
763 &exynosm1_prefetch_tune
764 };
765
766 static const struct tune_params thunderxt88_tunings =
767 {
768 &thunderx_extra_costs,
769 &generic_addrcost_table,
770 &thunderx_regmove_cost,
771 &thunderx_vector_cost,
772 &generic_branch_cost,
773 &generic_approx_modes,
774 6, /* memmov_cost */
775 2, /* issue_rate */
776 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
777 8, /* function_align. */
778 8, /* jump_align. */
779 8, /* loop_align. */
780 2, /* int_reassoc_width. */
781 4, /* fp_reassoc_width. */
782 1, /* vec_reassoc_width. */
783 2, /* min_div_recip_mul_sf. */
784 2, /* min_div_recip_mul_df. */
785 0, /* max_case_values. */
786 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
787 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
788 &thunderxt88_prefetch_tune
789 };
790
791 static const struct tune_params thunderx_tunings =
792 {
793 &thunderx_extra_costs,
794 &generic_addrcost_table,
795 &thunderx_regmove_cost,
796 &thunderx_vector_cost,
797 &generic_branch_cost,
798 &generic_approx_modes,
799 6, /* memmov_cost */
800 2, /* issue_rate */
801 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
802 8, /* function_align. */
803 8, /* jump_align. */
804 8, /* loop_align. */
805 2, /* int_reassoc_width. */
806 4, /* fp_reassoc_width. */
807 1, /* vec_reassoc_width. */
808 2, /* min_div_recip_mul_sf. */
809 2, /* min_div_recip_mul_df. */
810 0, /* max_case_values. */
811 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
812 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
813 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
814 &thunderx_prefetch_tune
815 };
816
817 static const struct tune_params xgene1_tunings =
818 {
819 &xgene1_extra_costs,
820 &xgene1_addrcost_table,
821 &xgene1_regmove_cost,
822 &xgene1_vector_cost,
823 &generic_branch_cost,
824 &xgene1_approx_modes,
825 6, /* memmov_cost */
826 4, /* issue_rate */
827 AARCH64_FUSE_NOTHING, /* fusible_ops */
828 16, /* function_align. */
829 8, /* jump_align. */
830 16, /* loop_align. */
831 2, /* int_reassoc_width. */
832 4, /* fp_reassoc_width. */
833 1, /* vec_reassoc_width. */
834 2, /* min_div_recip_mul_sf. */
835 2, /* min_div_recip_mul_df. */
836 0, /* max_case_values. */
837 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
838 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
839 &generic_prefetch_tune
840 };
841
842 static const struct tune_params qdf24xx_tunings =
843 {
844 &qdf24xx_extra_costs,
845 &qdf24xx_addrcost_table,
846 &qdf24xx_regmove_cost,
847 &generic_vector_cost,
848 &generic_branch_cost,
849 &generic_approx_modes,
850 4, /* memmov_cost */
851 4, /* issue_rate */
852 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
853 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
854 16, /* function_align. */
855 8, /* jump_align. */
856 16, /* loop_align. */
857 2, /* int_reassoc_width. */
858 4, /* fp_reassoc_width. */
859 1, /* vec_reassoc_width. */
860 2, /* min_div_recip_mul_sf. */
861 2, /* min_div_recip_mul_df. */
862 0, /* max_case_values. */
863 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
864 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
865 &qdf24xx_prefetch_tune
866 };
867
868 static const struct tune_params thunderx2t99_tunings =
869 {
870 &thunderx2t99_extra_costs,
871 &thunderx2t99_addrcost_table,
872 &thunderx2t99_regmove_cost,
873 &thunderx2t99_vector_cost,
874 &thunderx2t99_branch_cost,
875 &generic_approx_modes,
876 4, /* memmov_cost. */
877 4, /* issue_rate. */
878 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
879 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
880 16, /* function_align. */
881 8, /* jump_align. */
882 16, /* loop_align. */
883 3, /* int_reassoc_width. */
884 2, /* fp_reassoc_width. */
885 2, /* vec_reassoc_width. */
886 2, /* min_div_recip_mul_sf. */
887 2, /* min_div_recip_mul_df. */
888 0, /* max_case_values. */
889 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
890 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
891 &thunderx2t99_prefetch_tune
892 };
893
894 /* Support for fine-grained override of the tuning structures. */
895 struct aarch64_tuning_override_function
896 {
897 const char* name;
898 void (*parse_override)(const char*, struct tune_params*);
899 };
900
901 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
902 static void aarch64_parse_tune_string (const char*, struct tune_params*);
903
904 static const struct aarch64_tuning_override_function
905 aarch64_tuning_override_functions[] =
906 {
907 { "fuse", aarch64_parse_fuse_string },
908 { "tune", aarch64_parse_tune_string },
909 { NULL, NULL }
910 };
911
912 /* A processor implementing AArch64. */
913 struct processor
914 {
915 const char *const name;
916 enum aarch64_processor ident;
917 enum aarch64_processor sched_core;
918 enum aarch64_arch arch;
919 unsigned architecture_version;
920 const unsigned long flags;
921 const struct tune_params *const tune;
922 };
923
924 /* Architectures implementing AArch64. */
925 static const struct processor all_architectures[] =
926 {
927 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
928 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
929 #include "aarch64-arches.def"
930 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
931 };
932
933 /* Processor cores implementing AArch64. */
934 static const struct processor all_cores[] =
935 {
936 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
937 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
938 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
939 FLAGS, &COSTS##_tunings},
940 #include "aarch64-cores.def"
941 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
942 AARCH64_FL_FOR_ARCH8, &generic_tunings},
943 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
944 };
945
946
947 /* Target specification. These are populated by the -march, -mtune, -mcpu
948 handling code or by target attributes. */
949 static const struct processor *selected_arch;
950 static const struct processor *selected_cpu;
951 static const struct processor *selected_tune;
952
953 /* The current tuning set. */
954 struct tune_params aarch64_tune_params = generic_tunings;
955
956 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
957
958 /* An ISA extension in the co-processor and main instruction set space. */
959 struct aarch64_option_extension
960 {
961 const char *const name;
962 const unsigned long flags_on;
963 const unsigned long flags_off;
964 };
965
966 typedef enum aarch64_cond_code
967 {
968 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
969 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
970 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
971 }
972 aarch64_cc;
973
974 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
975
976 /* The condition codes of the processor, and the inverse function. */
977 static const char * const aarch64_condition_codes[] =
978 {
979 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
980 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
981 };
982
983 /* Generate code to enable conditional branches in functions over 1 MiB. */
984 const char *
985 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
986 const char * branch_format)
987 {
988 rtx_code_label * tmp_label = gen_label_rtx ();
989 char label_buf[256];
990 char buffer[128];
991 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
992 CODE_LABEL_NUMBER (tmp_label));
993 const char *label_ptr = targetm.strip_name_encoding (label_buf);
994 rtx dest_label = operands[pos_label];
995 operands[pos_label] = tmp_label;
996
997 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
998 output_asm_insn (buffer, operands);
999
1000 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1001 operands[pos_label] = dest_label;
1002 output_asm_insn (buffer, operands);
1003 return "";
1004 }
1005
1006 void
1007 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1008 {
1009 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1010 if (TARGET_GENERAL_REGS_ONLY)
1011 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1012 else
1013 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1014 }
1015
1016 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1017 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1018 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1019 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1020 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1021 irrespectively of its cost results in bad allocations with many redundant
1022 int<->FP moves which are expensive on various cores.
1023 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1024 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1025 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1026 Otherwise set the allocno class depending on the mode.
1027 The result of this is that it is no longer inefficient to have a higher
1028 memory move cost than the register move cost.
1029 */
1030
1031 static reg_class_t
1032 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1033 reg_class_t best_class)
1034 {
1035 enum machine_mode mode;
1036
1037 if (allocno_class != ALL_REGS)
1038 return allocno_class;
1039
1040 if (best_class != ALL_REGS)
1041 return best_class;
1042
1043 mode = PSEUDO_REGNO_MODE (regno);
1044 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1045 }
1046
1047 static unsigned int
1048 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
1049 {
1050 if (GET_MODE_UNIT_SIZE (mode) == 4)
1051 return aarch64_tune_params.min_div_recip_mul_sf;
1052 return aarch64_tune_params.min_div_recip_mul_df;
1053 }
1054
1055 static int
1056 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1057 enum machine_mode mode)
1058 {
1059 if (VECTOR_MODE_P (mode))
1060 return aarch64_tune_params.vec_reassoc_width;
1061 if (INTEGRAL_MODE_P (mode))
1062 return aarch64_tune_params.int_reassoc_width;
1063 if (FLOAT_MODE_P (mode))
1064 return aarch64_tune_params.fp_reassoc_width;
1065 return 1;
1066 }
1067
1068 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1069 unsigned
1070 aarch64_dbx_register_number (unsigned regno)
1071 {
1072 if (GP_REGNUM_P (regno))
1073 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1074 else if (regno == SP_REGNUM)
1075 return AARCH64_DWARF_SP;
1076 else if (FP_REGNUM_P (regno))
1077 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1078
1079 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1080 equivalent DWARF register. */
1081 return DWARF_FRAME_REGISTERS;
1082 }
1083
1084 /* Return TRUE if MODE is any of the large INT modes. */
1085 static bool
1086 aarch64_vect_struct_mode_p (machine_mode mode)
1087 {
1088 return mode == OImode || mode == CImode || mode == XImode;
1089 }
1090
1091 /* Return TRUE if MODE is any of the vector modes. */
1092 static bool
1093 aarch64_vector_mode_p (machine_mode mode)
1094 {
1095 return aarch64_vector_mode_supported_p (mode)
1096 || aarch64_vect_struct_mode_p (mode);
1097 }
1098
1099 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1100 static bool
1101 aarch64_array_mode_supported_p (machine_mode mode,
1102 unsigned HOST_WIDE_INT nelems)
1103 {
1104 if (TARGET_SIMD
1105 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1106 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1107 && (nelems >= 2 && nelems <= 4))
1108 return true;
1109
1110 return false;
1111 }
1112
1113 /* Implement HARD_REGNO_NREGS. */
1114
1115 int
1116 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1117 {
1118 switch (aarch64_regno_regclass (regno))
1119 {
1120 case FP_REGS:
1121 case FP_LO_REGS:
1122 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1123 default:
1124 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1125 }
1126 gcc_unreachable ();
1127 }
1128
1129 /* Implement HARD_REGNO_MODE_OK. */
1130
1131 int
1132 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1133 {
1134 if (GET_MODE_CLASS (mode) == MODE_CC)
1135 return regno == CC_REGNUM;
1136
1137 if (regno == SP_REGNUM)
1138 /* The purpose of comparing with ptr_mode is to support the
1139 global register variable associated with the stack pointer
1140 register via the syntax of asm ("wsp") in ILP32. */
1141 return mode == Pmode || mode == ptr_mode;
1142
1143 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1144 return mode == Pmode;
1145
1146 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1147 return 1;
1148
1149 if (FP_REGNUM_P (regno))
1150 {
1151 if (aarch64_vect_struct_mode_p (mode))
1152 return
1153 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1154 else
1155 return 1;
1156 }
1157
1158 return 0;
1159 }
1160
1161 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1162 machine_mode
1163 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1164 machine_mode mode)
1165 {
1166 /* Handle modes that fit within single registers. */
1167 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1168 {
1169 if (GET_MODE_SIZE (mode) >= 4)
1170 return mode;
1171 else
1172 return SImode;
1173 }
1174 /* Fall back to generic for multi-reg and very large modes. */
1175 else
1176 return choose_hard_reg_mode (regno, nregs, false);
1177 }
1178
1179 /* Return true if calls to DECL should be treated as
1180 long-calls (ie called via a register). */
1181 static bool
1182 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1183 {
1184 return false;
1185 }
1186
1187 /* Return true if calls to symbol-ref SYM should be treated as
1188 long-calls (ie called via a register). */
1189 bool
1190 aarch64_is_long_call_p (rtx sym)
1191 {
1192 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1193 }
1194
1195 /* Return true if calls to symbol-ref SYM should not go through
1196 plt stubs. */
1197
1198 bool
1199 aarch64_is_noplt_call_p (rtx sym)
1200 {
1201 const_tree decl = SYMBOL_REF_DECL (sym);
1202
1203 if (flag_pic
1204 && decl
1205 && (!flag_plt
1206 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1207 && !targetm.binds_local_p (decl))
1208 return true;
1209
1210 return false;
1211 }
1212
1213 /* Return true if the offsets to a zero/sign-extract operation
1214 represent an expression that matches an extend operation. The
1215 operands represent the paramters from
1216
1217 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1218 bool
1219 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1220 rtx extract_imm)
1221 {
1222 HOST_WIDE_INT mult_val, extract_val;
1223
1224 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1225 return false;
1226
1227 mult_val = INTVAL (mult_imm);
1228 extract_val = INTVAL (extract_imm);
1229
1230 if (extract_val > 8
1231 && extract_val < GET_MODE_BITSIZE (mode)
1232 && exact_log2 (extract_val & ~7) > 0
1233 && (extract_val & 7) <= 4
1234 && mult_val == (1 << (extract_val & 7)))
1235 return true;
1236
1237 return false;
1238 }
1239
1240 /* Emit an insn that's a simple single-set. Both the operands must be
1241 known to be valid. */
1242 inline static rtx_insn *
1243 emit_set_insn (rtx x, rtx y)
1244 {
1245 return emit_insn (gen_rtx_SET (x, y));
1246 }
1247
1248 /* X and Y are two things to compare using CODE. Emit the compare insn and
1249 return the rtx for register 0 in the proper mode. */
1250 rtx
1251 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1252 {
1253 machine_mode mode = SELECT_CC_MODE (code, x, y);
1254 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1255
1256 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1257 return cc_reg;
1258 }
1259
1260 /* Build the SYMBOL_REF for __tls_get_addr. */
1261
1262 static GTY(()) rtx tls_get_addr_libfunc;
1263
1264 rtx
1265 aarch64_tls_get_addr (void)
1266 {
1267 if (!tls_get_addr_libfunc)
1268 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1269 return tls_get_addr_libfunc;
1270 }
1271
1272 /* Return the TLS model to use for ADDR. */
1273
1274 static enum tls_model
1275 tls_symbolic_operand_type (rtx addr)
1276 {
1277 enum tls_model tls_kind = TLS_MODEL_NONE;
1278 rtx sym, addend;
1279
1280 if (GET_CODE (addr) == CONST)
1281 {
1282 split_const (addr, &sym, &addend);
1283 if (GET_CODE (sym) == SYMBOL_REF)
1284 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1285 }
1286 else if (GET_CODE (addr) == SYMBOL_REF)
1287 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1288
1289 return tls_kind;
1290 }
1291
1292 /* We'll allow lo_sum's in addresses in our legitimate addresses
1293 so that combine would take care of combining addresses where
1294 necessary, but for generation purposes, we'll generate the address
1295 as :
1296 RTL Absolute
1297 tmp = hi (symbol_ref); adrp x1, foo
1298 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1299 nop
1300
1301 PIC TLS
1302 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1303 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1304 bl __tls_get_addr
1305 nop
1306
1307 Load TLS symbol, depending on TLS mechanism and TLS access model.
1308
1309 Global Dynamic - Traditional TLS:
1310 adrp tmp, :tlsgd:imm
1311 add dest, tmp, #:tlsgd_lo12:imm
1312 bl __tls_get_addr
1313
1314 Global Dynamic - TLS Descriptors:
1315 adrp dest, :tlsdesc:imm
1316 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1317 add dest, dest, #:tlsdesc_lo12:imm
1318 blr tmp
1319 mrs tp, tpidr_el0
1320 add dest, dest, tp
1321
1322 Initial Exec:
1323 mrs tp, tpidr_el0
1324 adrp tmp, :gottprel:imm
1325 ldr dest, [tmp, #:gottprel_lo12:imm]
1326 add dest, dest, tp
1327
1328 Local Exec:
1329 mrs tp, tpidr_el0
1330 add t0, tp, #:tprel_hi12:imm, lsl #12
1331 add t0, t0, #:tprel_lo12_nc:imm
1332 */
1333
1334 static void
1335 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1336 enum aarch64_symbol_type type)
1337 {
1338 switch (type)
1339 {
1340 case SYMBOL_SMALL_ABSOLUTE:
1341 {
1342 /* In ILP32, the mode of dest can be either SImode or DImode. */
1343 rtx tmp_reg = dest;
1344 machine_mode mode = GET_MODE (dest);
1345
1346 gcc_assert (mode == Pmode || mode == ptr_mode);
1347
1348 if (can_create_pseudo_p ())
1349 tmp_reg = gen_reg_rtx (mode);
1350
1351 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1352 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1353 return;
1354 }
1355
1356 case SYMBOL_TINY_ABSOLUTE:
1357 emit_insn (gen_rtx_SET (dest, imm));
1358 return;
1359
1360 case SYMBOL_SMALL_GOT_28K:
1361 {
1362 machine_mode mode = GET_MODE (dest);
1363 rtx gp_rtx = pic_offset_table_rtx;
1364 rtx insn;
1365 rtx mem;
1366
1367 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1368 here before rtl expand. Tree IVOPT will generate rtl pattern to
1369 decide rtx costs, in which case pic_offset_table_rtx is not
1370 initialized. For that case no need to generate the first adrp
1371 instruction as the final cost for global variable access is
1372 one instruction. */
1373 if (gp_rtx != NULL)
1374 {
1375 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1376 using the page base as GOT base, the first page may be wasted,
1377 in the worst scenario, there is only 28K space for GOT).
1378
1379 The generate instruction sequence for accessing global variable
1380 is:
1381
1382 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1383
1384 Only one instruction needed. But we must initialize
1385 pic_offset_table_rtx properly. We generate initialize insn for
1386 every global access, and allow CSE to remove all redundant.
1387
1388 The final instruction sequences will look like the following
1389 for multiply global variables access.
1390
1391 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1392
1393 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1394 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1395 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1396 ... */
1397
1398 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1399 crtl->uses_pic_offset_table = 1;
1400 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1401
1402 if (mode != GET_MODE (gp_rtx))
1403 gp_rtx = gen_lowpart (mode, gp_rtx);
1404
1405 }
1406
1407 if (mode == ptr_mode)
1408 {
1409 if (mode == DImode)
1410 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1411 else
1412 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1413
1414 mem = XVECEXP (SET_SRC (insn), 0, 0);
1415 }
1416 else
1417 {
1418 gcc_assert (mode == Pmode);
1419
1420 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1421 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1422 }
1423
1424 /* The operand is expected to be MEM. Whenever the related insn
1425 pattern changed, above code which calculate mem should be
1426 updated. */
1427 gcc_assert (GET_CODE (mem) == MEM);
1428 MEM_READONLY_P (mem) = 1;
1429 MEM_NOTRAP_P (mem) = 1;
1430 emit_insn (insn);
1431 return;
1432 }
1433
1434 case SYMBOL_SMALL_GOT_4G:
1435 {
1436 /* In ILP32, the mode of dest can be either SImode or DImode,
1437 while the got entry is always of SImode size. The mode of
1438 dest depends on how dest is used: if dest is assigned to a
1439 pointer (e.g. in the memory), it has SImode; it may have
1440 DImode if dest is dereferenced to access the memeory.
1441 This is why we have to handle three different ldr_got_small
1442 patterns here (two patterns for ILP32). */
1443
1444 rtx insn;
1445 rtx mem;
1446 rtx tmp_reg = dest;
1447 machine_mode mode = GET_MODE (dest);
1448
1449 if (can_create_pseudo_p ())
1450 tmp_reg = gen_reg_rtx (mode);
1451
1452 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1453 if (mode == ptr_mode)
1454 {
1455 if (mode == DImode)
1456 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1457 else
1458 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1459
1460 mem = XVECEXP (SET_SRC (insn), 0, 0);
1461 }
1462 else
1463 {
1464 gcc_assert (mode == Pmode);
1465
1466 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1467 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1468 }
1469
1470 gcc_assert (GET_CODE (mem) == MEM);
1471 MEM_READONLY_P (mem) = 1;
1472 MEM_NOTRAP_P (mem) = 1;
1473 emit_insn (insn);
1474 return;
1475 }
1476
1477 case SYMBOL_SMALL_TLSGD:
1478 {
1479 rtx_insn *insns;
1480 machine_mode mode = GET_MODE (dest);
1481 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1482
1483 start_sequence ();
1484 if (TARGET_ILP32)
1485 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1486 else
1487 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1488 insns = get_insns ();
1489 end_sequence ();
1490
1491 RTL_CONST_CALL_P (insns) = 1;
1492 emit_libcall_block (insns, dest, result, imm);
1493 return;
1494 }
1495
1496 case SYMBOL_SMALL_TLSDESC:
1497 {
1498 machine_mode mode = GET_MODE (dest);
1499 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1500 rtx tp;
1501
1502 gcc_assert (mode == Pmode || mode == ptr_mode);
1503
1504 /* In ILP32, the got entry is always of SImode size. Unlike
1505 small GOT, the dest is fixed at reg 0. */
1506 if (TARGET_ILP32)
1507 emit_insn (gen_tlsdesc_small_si (imm));
1508 else
1509 emit_insn (gen_tlsdesc_small_di (imm));
1510 tp = aarch64_load_tp (NULL);
1511
1512 if (mode != Pmode)
1513 tp = gen_lowpart (mode, tp);
1514
1515 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1516 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1517 return;
1518 }
1519
1520 case SYMBOL_SMALL_TLSIE:
1521 {
1522 /* In ILP32, the mode of dest can be either SImode or DImode,
1523 while the got entry is always of SImode size. The mode of
1524 dest depends on how dest is used: if dest is assigned to a
1525 pointer (e.g. in the memory), it has SImode; it may have
1526 DImode if dest is dereferenced to access the memeory.
1527 This is why we have to handle three different tlsie_small
1528 patterns here (two patterns for ILP32). */
1529 machine_mode mode = GET_MODE (dest);
1530 rtx tmp_reg = gen_reg_rtx (mode);
1531 rtx tp = aarch64_load_tp (NULL);
1532
1533 if (mode == ptr_mode)
1534 {
1535 if (mode == DImode)
1536 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1537 else
1538 {
1539 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1540 tp = gen_lowpart (mode, tp);
1541 }
1542 }
1543 else
1544 {
1545 gcc_assert (mode == Pmode);
1546 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1547 }
1548
1549 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1550 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1551 return;
1552 }
1553
1554 case SYMBOL_TLSLE12:
1555 case SYMBOL_TLSLE24:
1556 case SYMBOL_TLSLE32:
1557 case SYMBOL_TLSLE48:
1558 {
1559 machine_mode mode = GET_MODE (dest);
1560 rtx tp = aarch64_load_tp (NULL);
1561
1562 if (mode != Pmode)
1563 tp = gen_lowpart (mode, tp);
1564
1565 switch (type)
1566 {
1567 case SYMBOL_TLSLE12:
1568 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1569 (dest, tp, imm));
1570 break;
1571 case SYMBOL_TLSLE24:
1572 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1573 (dest, tp, imm));
1574 break;
1575 case SYMBOL_TLSLE32:
1576 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1577 (dest, imm));
1578 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1579 (dest, dest, tp));
1580 break;
1581 case SYMBOL_TLSLE48:
1582 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1583 (dest, imm));
1584 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1585 (dest, dest, tp));
1586 break;
1587 default:
1588 gcc_unreachable ();
1589 }
1590
1591 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1592 return;
1593 }
1594
1595 case SYMBOL_TINY_GOT:
1596 emit_insn (gen_ldr_got_tiny (dest, imm));
1597 return;
1598
1599 case SYMBOL_TINY_TLSIE:
1600 {
1601 machine_mode mode = GET_MODE (dest);
1602 rtx tp = aarch64_load_tp (NULL);
1603
1604 if (mode == ptr_mode)
1605 {
1606 if (mode == DImode)
1607 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1608 else
1609 {
1610 tp = gen_lowpart (mode, tp);
1611 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1612 }
1613 }
1614 else
1615 {
1616 gcc_assert (mode == Pmode);
1617 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1618 }
1619
1620 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1621 return;
1622 }
1623
1624 default:
1625 gcc_unreachable ();
1626 }
1627 }
1628
1629 /* Emit a move from SRC to DEST. Assume that the move expanders can
1630 handle all moves if !can_create_pseudo_p (). The distinction is
1631 important because, unlike emit_move_insn, the move expanders know
1632 how to force Pmode objects into the constant pool even when the
1633 constant pool address is not itself legitimate. */
1634 static rtx
1635 aarch64_emit_move (rtx dest, rtx src)
1636 {
1637 return (can_create_pseudo_p ()
1638 ? emit_move_insn (dest, src)
1639 : emit_move_insn_1 (dest, src));
1640 }
1641
1642 /* Split a 128-bit move operation into two 64-bit move operations,
1643 taking care to handle partial overlap of register to register
1644 copies. Special cases are needed when moving between GP regs and
1645 FP regs. SRC can be a register, constant or memory; DST a register
1646 or memory. If either operand is memory it must not have any side
1647 effects. */
1648 void
1649 aarch64_split_128bit_move (rtx dst, rtx src)
1650 {
1651 rtx dst_lo, dst_hi;
1652 rtx src_lo, src_hi;
1653
1654 machine_mode mode = GET_MODE (dst);
1655
1656 gcc_assert (mode == TImode || mode == TFmode);
1657 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1658 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1659
1660 if (REG_P (dst) && REG_P (src))
1661 {
1662 int src_regno = REGNO (src);
1663 int dst_regno = REGNO (dst);
1664
1665 /* Handle FP <-> GP regs. */
1666 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1667 {
1668 src_lo = gen_lowpart (word_mode, src);
1669 src_hi = gen_highpart (word_mode, src);
1670
1671 if (mode == TImode)
1672 {
1673 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1674 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1675 }
1676 else
1677 {
1678 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1679 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1680 }
1681 return;
1682 }
1683 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1684 {
1685 dst_lo = gen_lowpart (word_mode, dst);
1686 dst_hi = gen_highpart (word_mode, dst);
1687
1688 if (mode == TImode)
1689 {
1690 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1691 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1692 }
1693 else
1694 {
1695 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1696 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1697 }
1698 return;
1699 }
1700 }
1701
1702 dst_lo = gen_lowpart (word_mode, dst);
1703 dst_hi = gen_highpart (word_mode, dst);
1704 src_lo = gen_lowpart (word_mode, src);
1705 src_hi = gen_highpart_mode (word_mode, mode, src);
1706
1707 /* At most one pairing may overlap. */
1708 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1709 {
1710 aarch64_emit_move (dst_hi, src_hi);
1711 aarch64_emit_move (dst_lo, src_lo);
1712 }
1713 else
1714 {
1715 aarch64_emit_move (dst_lo, src_lo);
1716 aarch64_emit_move (dst_hi, src_hi);
1717 }
1718 }
1719
1720 bool
1721 aarch64_split_128bit_move_p (rtx dst, rtx src)
1722 {
1723 return (! REG_P (src)
1724 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1725 }
1726
1727 /* Split a complex SIMD combine. */
1728
1729 void
1730 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1731 {
1732 machine_mode src_mode = GET_MODE (src1);
1733 machine_mode dst_mode = GET_MODE (dst);
1734
1735 gcc_assert (VECTOR_MODE_P (dst_mode));
1736 gcc_assert (register_operand (dst, dst_mode)
1737 && register_operand (src1, src_mode)
1738 && register_operand (src2, src_mode));
1739
1740 rtx (*gen) (rtx, rtx, rtx);
1741
1742 switch (src_mode)
1743 {
1744 case V8QImode:
1745 gen = gen_aarch64_simd_combinev8qi;
1746 break;
1747 case V4HImode:
1748 gen = gen_aarch64_simd_combinev4hi;
1749 break;
1750 case V2SImode:
1751 gen = gen_aarch64_simd_combinev2si;
1752 break;
1753 case V4HFmode:
1754 gen = gen_aarch64_simd_combinev4hf;
1755 break;
1756 case V2SFmode:
1757 gen = gen_aarch64_simd_combinev2sf;
1758 break;
1759 case DImode:
1760 gen = gen_aarch64_simd_combinedi;
1761 break;
1762 case DFmode:
1763 gen = gen_aarch64_simd_combinedf;
1764 break;
1765 default:
1766 gcc_unreachable ();
1767 }
1768
1769 emit_insn (gen (dst, src1, src2));
1770 return;
1771 }
1772
1773 /* Split a complex SIMD move. */
1774
1775 void
1776 aarch64_split_simd_move (rtx dst, rtx src)
1777 {
1778 machine_mode src_mode = GET_MODE (src);
1779 machine_mode dst_mode = GET_MODE (dst);
1780
1781 gcc_assert (VECTOR_MODE_P (dst_mode));
1782
1783 if (REG_P (dst) && REG_P (src))
1784 {
1785 rtx (*gen) (rtx, rtx);
1786
1787 gcc_assert (VECTOR_MODE_P (src_mode));
1788
1789 switch (src_mode)
1790 {
1791 case V16QImode:
1792 gen = gen_aarch64_split_simd_movv16qi;
1793 break;
1794 case V8HImode:
1795 gen = gen_aarch64_split_simd_movv8hi;
1796 break;
1797 case V4SImode:
1798 gen = gen_aarch64_split_simd_movv4si;
1799 break;
1800 case V2DImode:
1801 gen = gen_aarch64_split_simd_movv2di;
1802 break;
1803 case V8HFmode:
1804 gen = gen_aarch64_split_simd_movv8hf;
1805 break;
1806 case V4SFmode:
1807 gen = gen_aarch64_split_simd_movv4sf;
1808 break;
1809 case V2DFmode:
1810 gen = gen_aarch64_split_simd_movv2df;
1811 break;
1812 default:
1813 gcc_unreachable ();
1814 }
1815
1816 emit_insn (gen (dst, src));
1817 return;
1818 }
1819 }
1820
1821 bool
1822 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1823 machine_mode ymode, rtx y)
1824 {
1825 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1826 gcc_assert (r != NULL);
1827 return rtx_equal_p (x, r);
1828 }
1829
1830
1831 static rtx
1832 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1833 {
1834 if (can_create_pseudo_p ())
1835 return force_reg (mode, value);
1836 else
1837 {
1838 x = aarch64_emit_move (x, value);
1839 return x;
1840 }
1841 }
1842
1843
1844 static rtx
1845 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1846 {
1847 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1848 {
1849 rtx high;
1850 /* Load the full offset into a register. This
1851 might be improvable in the future. */
1852 high = GEN_INT (offset);
1853 offset = 0;
1854 high = aarch64_force_temporary (mode, temp, high);
1855 reg = aarch64_force_temporary (mode, temp,
1856 gen_rtx_PLUS (mode, high, reg));
1857 }
1858 return plus_constant (mode, reg, offset);
1859 }
1860
1861 static int
1862 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1863 machine_mode mode)
1864 {
1865 int i;
1866 unsigned HOST_WIDE_INT val, val2, mask;
1867 int one_match, zero_match;
1868 int num_insns;
1869
1870 val = INTVAL (imm);
1871
1872 if (aarch64_move_imm (val, mode))
1873 {
1874 if (generate)
1875 emit_insn (gen_rtx_SET (dest, imm));
1876 return 1;
1877 }
1878
1879 if ((val >> 32) == 0 || mode == SImode)
1880 {
1881 if (generate)
1882 {
1883 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1884 if (mode == SImode)
1885 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1886 GEN_INT ((val >> 16) & 0xffff)));
1887 else
1888 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1889 GEN_INT ((val >> 16) & 0xffff)));
1890 }
1891 return 2;
1892 }
1893
1894 /* Remaining cases are all for DImode. */
1895
1896 mask = 0xffff;
1897 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1898 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1899 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1900 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1901
1902 if (zero_match != 2 && one_match != 2)
1903 {
1904 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1905 For a 64-bit bitmask try whether changing 16 bits to all ones or
1906 zeroes creates a valid bitmask. To check any repeated bitmask,
1907 try using 16 bits from the other 32-bit half of val. */
1908
1909 for (i = 0; i < 64; i += 16, mask <<= 16)
1910 {
1911 val2 = val & ~mask;
1912 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1913 break;
1914 val2 = val | mask;
1915 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1916 break;
1917 val2 = val2 & ~mask;
1918 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1919 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1920 break;
1921 }
1922 if (i != 64)
1923 {
1924 if (generate)
1925 {
1926 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1927 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1928 GEN_INT ((val >> i) & 0xffff)));
1929 }
1930 return 2;
1931 }
1932 }
1933
1934 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1935 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1936 otherwise skip zero bits. */
1937
1938 num_insns = 1;
1939 mask = 0xffff;
1940 val2 = one_match > zero_match ? ~val : val;
1941 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1942
1943 if (generate)
1944 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1945 ? (val | ~(mask << i))
1946 : (val & (mask << i)))));
1947 for (i += 16; i < 64; i += 16)
1948 {
1949 if ((val2 & (mask << i)) == 0)
1950 continue;
1951 if (generate)
1952 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1953 GEN_INT ((val >> i) & 0xffff)));
1954 num_insns ++;
1955 }
1956
1957 return num_insns;
1958 }
1959
1960
1961 void
1962 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1963 {
1964 machine_mode mode = GET_MODE (dest);
1965
1966 gcc_assert (mode == SImode || mode == DImode);
1967
1968 /* Check on what type of symbol it is. */
1969 if (GET_CODE (imm) == SYMBOL_REF
1970 || GET_CODE (imm) == LABEL_REF
1971 || GET_CODE (imm) == CONST)
1972 {
1973 rtx mem, base, offset;
1974 enum aarch64_symbol_type sty;
1975
1976 /* If we have (const (plus symbol offset)), separate out the offset
1977 before we start classifying the symbol. */
1978 split_const (imm, &base, &offset);
1979
1980 sty = aarch64_classify_symbol (base, offset);
1981 switch (sty)
1982 {
1983 case SYMBOL_FORCE_TO_MEM:
1984 if (offset != const0_rtx
1985 && targetm.cannot_force_const_mem (mode, imm))
1986 {
1987 gcc_assert (can_create_pseudo_p ());
1988 base = aarch64_force_temporary (mode, dest, base);
1989 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1990 aarch64_emit_move (dest, base);
1991 return;
1992 }
1993
1994 mem = force_const_mem (ptr_mode, imm);
1995 gcc_assert (mem);
1996
1997 /* If we aren't generating PC relative literals, then
1998 we need to expand the literal pool access carefully.
1999 This is something that needs to be done in a number
2000 of places, so could well live as a separate function. */
2001 if (!aarch64_pcrelative_literal_loads)
2002 {
2003 gcc_assert (can_create_pseudo_p ());
2004 base = gen_reg_rtx (ptr_mode);
2005 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2006 if (ptr_mode != Pmode)
2007 base = convert_memory_address (Pmode, base);
2008 mem = gen_rtx_MEM (ptr_mode, base);
2009 }
2010
2011 if (mode != ptr_mode)
2012 mem = gen_rtx_ZERO_EXTEND (mode, mem);
2013
2014 emit_insn (gen_rtx_SET (dest, mem));
2015
2016 return;
2017
2018 case SYMBOL_SMALL_TLSGD:
2019 case SYMBOL_SMALL_TLSDESC:
2020 case SYMBOL_SMALL_TLSIE:
2021 case SYMBOL_SMALL_GOT_28K:
2022 case SYMBOL_SMALL_GOT_4G:
2023 case SYMBOL_TINY_GOT:
2024 case SYMBOL_TINY_TLSIE:
2025 if (offset != const0_rtx)
2026 {
2027 gcc_assert(can_create_pseudo_p ());
2028 base = aarch64_force_temporary (mode, dest, base);
2029 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
2030 aarch64_emit_move (dest, base);
2031 return;
2032 }
2033 /* FALLTHRU */
2034
2035 case SYMBOL_SMALL_ABSOLUTE:
2036 case SYMBOL_TINY_ABSOLUTE:
2037 case SYMBOL_TLSLE12:
2038 case SYMBOL_TLSLE24:
2039 case SYMBOL_TLSLE32:
2040 case SYMBOL_TLSLE48:
2041 aarch64_load_symref_appropriately (dest, imm, sty);
2042 return;
2043
2044 default:
2045 gcc_unreachable ();
2046 }
2047 }
2048
2049 if (!CONST_INT_P (imm))
2050 {
2051 if (GET_CODE (imm) == HIGH)
2052 emit_insn (gen_rtx_SET (dest, imm));
2053 else
2054 {
2055 rtx mem = force_const_mem (mode, imm);
2056 gcc_assert (mem);
2057 emit_insn (gen_rtx_SET (dest, mem));
2058 }
2059
2060 return;
2061 }
2062
2063 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
2064 }
2065
2066 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2067 temporary value if necessary. FRAME_RELATED_P should be true if
2068 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2069 to the generated instructions. If SCRATCHREG is known to hold
2070 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2071 immediate again.
2072
2073 Since this function may be used to adjust the stack pointer, we must
2074 ensure that it cannot cause transient stack deallocation (for example
2075 by first incrementing SP and then decrementing when adjusting by a
2076 large immediate). */
2077
2078 static void
2079 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
2080 HOST_WIDE_INT delta, bool frame_related_p,
2081 bool emit_move_imm)
2082 {
2083 HOST_WIDE_INT mdelta = abs_hwi (delta);
2084 rtx this_rtx = gen_rtx_REG (mode, regnum);
2085 rtx_insn *insn;
2086
2087 if (!mdelta)
2088 return;
2089
2090 /* Single instruction adjustment. */
2091 if (aarch64_uimm12_shift (mdelta))
2092 {
2093 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2094 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2095 return;
2096 }
2097
2098 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2099 Only do this if mdelta is not a 16-bit move as adjusting using a move
2100 is better. */
2101 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2102 {
2103 HOST_WIDE_INT low_off = mdelta & 0xfff;
2104
2105 low_off = delta < 0 ? -low_off : low_off;
2106 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2107 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2108 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2109 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2110 return;
2111 }
2112
2113 /* Emit a move immediate if required and an addition/subtraction. */
2114 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2115 if (emit_move_imm)
2116 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2117 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2118 : gen_add2_insn (this_rtx, scratch_rtx));
2119 if (frame_related_p)
2120 {
2121 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2122 rtx adj = plus_constant (mode, this_rtx, delta);
2123 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2124 }
2125 }
2126
2127 static inline void
2128 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2129 HOST_WIDE_INT delta)
2130 {
2131 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2132 }
2133
2134 static inline void
2135 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2136 {
2137 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2138 true, emit_move_imm);
2139 }
2140
2141 static inline void
2142 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2143 {
2144 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2145 frame_related_p, true);
2146 }
2147
2148 static bool
2149 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2150 tree exp ATTRIBUTE_UNUSED)
2151 {
2152 /* Currently, always true. */
2153 return true;
2154 }
2155
2156 /* Implement TARGET_PASS_BY_REFERENCE. */
2157
2158 static bool
2159 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2160 machine_mode mode,
2161 const_tree type,
2162 bool named ATTRIBUTE_UNUSED)
2163 {
2164 HOST_WIDE_INT size;
2165 machine_mode dummymode;
2166 int nregs;
2167
2168 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2169 size = (mode == BLKmode && type)
2170 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2171
2172 /* Aggregates are passed by reference based on their size. */
2173 if (type && AGGREGATE_TYPE_P (type))
2174 {
2175 size = int_size_in_bytes (type);
2176 }
2177
2178 /* Variable sized arguments are always returned by reference. */
2179 if (size < 0)
2180 return true;
2181
2182 /* Can this be a candidate to be passed in fp/simd register(s)? */
2183 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2184 &dummymode, &nregs,
2185 NULL))
2186 return false;
2187
2188 /* Arguments which are variable sized or larger than 2 registers are
2189 passed by reference unless they are a homogenous floating point
2190 aggregate. */
2191 return size > 2 * UNITS_PER_WORD;
2192 }
2193
2194 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2195 static bool
2196 aarch64_return_in_msb (const_tree valtype)
2197 {
2198 machine_mode dummy_mode;
2199 int dummy_int;
2200
2201 /* Never happens in little-endian mode. */
2202 if (!BYTES_BIG_ENDIAN)
2203 return false;
2204
2205 /* Only composite types smaller than or equal to 16 bytes can
2206 be potentially returned in registers. */
2207 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2208 || int_size_in_bytes (valtype) <= 0
2209 || int_size_in_bytes (valtype) > 16)
2210 return false;
2211
2212 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2213 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2214 is always passed/returned in the least significant bits of fp/simd
2215 register(s). */
2216 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2217 &dummy_mode, &dummy_int, NULL))
2218 return false;
2219
2220 return true;
2221 }
2222
2223 /* Implement TARGET_FUNCTION_VALUE.
2224 Define how to find the value returned by a function. */
2225
2226 static rtx
2227 aarch64_function_value (const_tree type, const_tree func,
2228 bool outgoing ATTRIBUTE_UNUSED)
2229 {
2230 machine_mode mode;
2231 int unsignedp;
2232 int count;
2233 machine_mode ag_mode;
2234
2235 mode = TYPE_MODE (type);
2236 if (INTEGRAL_TYPE_P (type))
2237 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2238
2239 if (aarch64_return_in_msb (type))
2240 {
2241 HOST_WIDE_INT size = int_size_in_bytes (type);
2242
2243 if (size % UNITS_PER_WORD != 0)
2244 {
2245 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2246 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2247 }
2248 }
2249
2250 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2251 &ag_mode, &count, NULL))
2252 {
2253 if (!aarch64_composite_type_p (type, mode))
2254 {
2255 gcc_assert (count == 1 && mode == ag_mode);
2256 return gen_rtx_REG (mode, V0_REGNUM);
2257 }
2258 else
2259 {
2260 int i;
2261 rtx par;
2262
2263 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2264 for (i = 0; i < count; i++)
2265 {
2266 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2267 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2268 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2269 XVECEXP (par, 0, i) = tmp;
2270 }
2271 return par;
2272 }
2273 }
2274 else
2275 return gen_rtx_REG (mode, R0_REGNUM);
2276 }
2277
2278 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2279 Return true if REGNO is the number of a hard register in which the values
2280 of called function may come back. */
2281
2282 static bool
2283 aarch64_function_value_regno_p (const unsigned int regno)
2284 {
2285 /* Maximum of 16 bytes can be returned in the general registers. Examples
2286 of 16-byte return values are: 128-bit integers and 16-byte small
2287 structures (excluding homogeneous floating-point aggregates). */
2288 if (regno == R0_REGNUM || regno == R1_REGNUM)
2289 return true;
2290
2291 /* Up to four fp/simd registers can return a function value, e.g. a
2292 homogeneous floating-point aggregate having four members. */
2293 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2294 return TARGET_FLOAT;
2295
2296 return false;
2297 }
2298
2299 /* Implement TARGET_RETURN_IN_MEMORY.
2300
2301 If the type T of the result of a function is such that
2302 void func (T arg)
2303 would require that arg be passed as a value in a register (or set of
2304 registers) according to the parameter passing rules, then the result
2305 is returned in the same registers as would be used for such an
2306 argument. */
2307
2308 static bool
2309 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2310 {
2311 HOST_WIDE_INT size;
2312 machine_mode ag_mode;
2313 int count;
2314
2315 if (!AGGREGATE_TYPE_P (type)
2316 && TREE_CODE (type) != COMPLEX_TYPE
2317 && TREE_CODE (type) != VECTOR_TYPE)
2318 /* Simple scalar types always returned in registers. */
2319 return false;
2320
2321 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2322 type,
2323 &ag_mode,
2324 &count,
2325 NULL))
2326 return false;
2327
2328 /* Types larger than 2 registers returned in memory. */
2329 size = int_size_in_bytes (type);
2330 return (size < 0 || size > 2 * UNITS_PER_WORD);
2331 }
2332
2333 static bool
2334 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2335 const_tree type, int *nregs)
2336 {
2337 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2338 return aarch64_vfp_is_call_or_return_candidate (mode,
2339 type,
2340 &pcum->aapcs_vfp_rmode,
2341 nregs,
2342 NULL);
2343 }
2344
2345 /* Given MODE and TYPE of a function argument, return the alignment in
2346 bits. The idea is to suppress any stronger alignment requested by
2347 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2348 This is a helper function for local use only. */
2349
2350 static unsigned int
2351 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2352 {
2353 if (!type)
2354 return GET_MODE_ALIGNMENT (mode);
2355
2356 if (integer_zerop (TYPE_SIZE (type)))
2357 return 0;
2358
2359 gcc_assert (TYPE_MODE (type) == mode);
2360
2361 if (!AGGREGATE_TYPE_P (type))
2362 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2363
2364 if (TREE_CODE (type) == ARRAY_TYPE)
2365 return TYPE_ALIGN (TREE_TYPE (type));
2366
2367 unsigned int alignment = 0;
2368 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2369 if (TREE_CODE (field) == FIELD_DECL)
2370 alignment = std::max (alignment, DECL_ALIGN (field));
2371
2372 return alignment;
2373 }
2374
2375 /* Layout a function argument according to the AAPCS64 rules. The rule
2376 numbers refer to the rule numbers in the AAPCS64. */
2377
2378 static void
2379 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2380 const_tree type,
2381 bool named ATTRIBUTE_UNUSED)
2382 {
2383 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2384 int ncrn, nvrn, nregs;
2385 bool allocate_ncrn, allocate_nvrn;
2386 HOST_WIDE_INT size;
2387
2388 /* We need to do this once per argument. */
2389 if (pcum->aapcs_arg_processed)
2390 return;
2391
2392 pcum->aapcs_arg_processed = true;
2393
2394 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2395 size
2396 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2397 UNITS_PER_WORD);
2398
2399 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2400 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2401 mode,
2402 type,
2403 &nregs);
2404
2405 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2406 The following code thus handles passing by SIMD/FP registers first. */
2407
2408 nvrn = pcum->aapcs_nvrn;
2409
2410 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2411 and homogenous short-vector aggregates (HVA). */
2412 if (allocate_nvrn)
2413 {
2414 if (!TARGET_FLOAT)
2415 aarch64_err_no_fpadvsimd (mode, "argument");
2416
2417 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2418 {
2419 pcum->aapcs_nextnvrn = nvrn + nregs;
2420 if (!aarch64_composite_type_p (type, mode))
2421 {
2422 gcc_assert (nregs == 1);
2423 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2424 }
2425 else
2426 {
2427 rtx par;
2428 int i;
2429 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2430 for (i = 0; i < nregs; i++)
2431 {
2432 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2433 V0_REGNUM + nvrn + i);
2434 tmp = gen_rtx_EXPR_LIST
2435 (VOIDmode, tmp,
2436 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2437 XVECEXP (par, 0, i) = tmp;
2438 }
2439 pcum->aapcs_reg = par;
2440 }
2441 return;
2442 }
2443 else
2444 {
2445 /* C.3 NSRN is set to 8. */
2446 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2447 goto on_stack;
2448 }
2449 }
2450
2451 ncrn = pcum->aapcs_ncrn;
2452 nregs = size / UNITS_PER_WORD;
2453
2454 /* C6 - C9. though the sign and zero extension semantics are
2455 handled elsewhere. This is the case where the argument fits
2456 entirely general registers. */
2457 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2458 {
2459
2460 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2461
2462 /* C.8 if the argument has an alignment of 16 then the NGRN is
2463 rounded up to the next even number. */
2464 if (nregs == 2
2465 && ncrn % 2
2466 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2467 comparison is there because for > 16 * BITS_PER_UNIT
2468 alignment nregs should be > 2 and therefore it should be
2469 passed by reference rather than value. */
2470 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2471 {
2472 ++ncrn;
2473 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2474 }
2475
2476 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2477 A reg is still generated for it, but the caller should be smart
2478 enough not to use it. */
2479 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2480 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2481 else
2482 {
2483 rtx par;
2484 int i;
2485
2486 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2487 for (i = 0; i < nregs; i++)
2488 {
2489 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2490 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2491 GEN_INT (i * UNITS_PER_WORD));
2492 XVECEXP (par, 0, i) = tmp;
2493 }
2494 pcum->aapcs_reg = par;
2495 }
2496
2497 pcum->aapcs_nextncrn = ncrn + nregs;
2498 return;
2499 }
2500
2501 /* C.11 */
2502 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2503
2504 /* The argument is passed on stack; record the needed number of words for
2505 this argument and align the total size if necessary. */
2506 on_stack:
2507 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2508
2509 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2510 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2511 16 / UNITS_PER_WORD);
2512 return;
2513 }
2514
2515 /* Implement TARGET_FUNCTION_ARG. */
2516
2517 static rtx
2518 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2519 const_tree type, bool named)
2520 {
2521 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2522 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2523
2524 if (mode == VOIDmode)
2525 return NULL_RTX;
2526
2527 aarch64_layout_arg (pcum_v, mode, type, named);
2528 return pcum->aapcs_reg;
2529 }
2530
2531 void
2532 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2533 const_tree fntype ATTRIBUTE_UNUSED,
2534 rtx libname ATTRIBUTE_UNUSED,
2535 const_tree fndecl ATTRIBUTE_UNUSED,
2536 unsigned n_named ATTRIBUTE_UNUSED)
2537 {
2538 pcum->aapcs_ncrn = 0;
2539 pcum->aapcs_nvrn = 0;
2540 pcum->aapcs_nextncrn = 0;
2541 pcum->aapcs_nextnvrn = 0;
2542 pcum->pcs_variant = ARM_PCS_AAPCS64;
2543 pcum->aapcs_reg = NULL_RTX;
2544 pcum->aapcs_arg_processed = false;
2545 pcum->aapcs_stack_words = 0;
2546 pcum->aapcs_stack_size = 0;
2547
2548 if (!TARGET_FLOAT
2549 && fndecl && TREE_PUBLIC (fndecl)
2550 && fntype && fntype != error_mark_node)
2551 {
2552 const_tree type = TREE_TYPE (fntype);
2553 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2554 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2555 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2556 &mode, &nregs, NULL))
2557 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2558 }
2559 return;
2560 }
2561
2562 static void
2563 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2564 machine_mode mode,
2565 const_tree type,
2566 bool named)
2567 {
2568 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2569 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2570 {
2571 aarch64_layout_arg (pcum_v, mode, type, named);
2572 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2573 != (pcum->aapcs_stack_words != 0));
2574 pcum->aapcs_arg_processed = false;
2575 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2576 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2577 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2578 pcum->aapcs_stack_words = 0;
2579 pcum->aapcs_reg = NULL_RTX;
2580 }
2581 }
2582
2583 bool
2584 aarch64_function_arg_regno_p (unsigned regno)
2585 {
2586 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2587 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2588 }
2589
2590 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2591 PARM_BOUNDARY bits of alignment, but will be given anything up
2592 to STACK_BOUNDARY bits if the type requires it. This makes sure
2593 that both before and after the layout of each argument, the Next
2594 Stacked Argument Address (NSAA) will have a minimum alignment of
2595 8 bytes. */
2596
2597 static unsigned int
2598 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2599 {
2600 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2601 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2602 }
2603
2604 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2605
2606 Return true if an argument passed on the stack should be padded upwards,
2607 i.e. if the least-significant byte of the stack slot has useful data.
2608
2609 Small aggregate types are placed in the lowest memory address.
2610
2611 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2612
2613 bool
2614 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2615 {
2616 /* On little-endian targets, the least significant byte of every stack
2617 argument is passed at the lowest byte address of the stack slot. */
2618 if (!BYTES_BIG_ENDIAN)
2619 return true;
2620
2621 /* Otherwise, integral, floating-point and pointer types are padded downward:
2622 the least significant byte of a stack argument is passed at the highest
2623 byte address of the stack slot. */
2624 if (type
2625 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2626 || POINTER_TYPE_P (type))
2627 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2628 return false;
2629
2630 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2631 return true;
2632 }
2633
2634 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2635
2636 It specifies padding for the last (may also be the only)
2637 element of a block move between registers and memory. If
2638 assuming the block is in the memory, padding upward means that
2639 the last element is padded after its highest significant byte,
2640 while in downward padding, the last element is padded at the
2641 its least significant byte side.
2642
2643 Small aggregates and small complex types are always padded
2644 upwards.
2645
2646 We don't need to worry about homogeneous floating-point or
2647 short-vector aggregates; their move is not affected by the
2648 padding direction determined here. Regardless of endianness,
2649 each element of such an aggregate is put in the least
2650 significant bits of a fp/simd register.
2651
2652 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2653 register has useful data, and return the opposite if the most
2654 significant byte does. */
2655
2656 bool
2657 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2658 bool first ATTRIBUTE_UNUSED)
2659 {
2660
2661 /* Small composite types are always padded upward. */
2662 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2663 {
2664 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2665 : GET_MODE_SIZE (mode));
2666 if (size < 2 * UNITS_PER_WORD)
2667 return true;
2668 }
2669
2670 /* Otherwise, use the default padding. */
2671 return !BYTES_BIG_ENDIAN;
2672 }
2673
2674 static machine_mode
2675 aarch64_libgcc_cmp_return_mode (void)
2676 {
2677 return SImode;
2678 }
2679
2680 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2681
2682 /* We use the 12-bit shifted immediate arithmetic instructions so values
2683 must be multiple of (1 << 12), i.e. 4096. */
2684 #define ARITH_FACTOR 4096
2685
2686 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2687 #error Cannot use simple address calculation for stack probing
2688 #endif
2689
2690 /* The pair of scratch registers used for stack probing. */
2691 #define PROBE_STACK_FIRST_REG 9
2692 #define PROBE_STACK_SECOND_REG 10
2693
2694 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2695 inclusive. These are offsets from the current stack pointer. */
2696
2697 static void
2698 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2699 {
2700 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2701
2702 /* See the same assertion on PROBE_INTERVAL above. */
2703 gcc_assert ((first % ARITH_FACTOR) == 0);
2704
2705 /* See if we have a constant small number of probes to generate. If so,
2706 that's the easy case. */
2707 if (size <= PROBE_INTERVAL)
2708 {
2709 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2710
2711 emit_set_insn (reg1,
2712 plus_constant (Pmode,
2713 stack_pointer_rtx, -(first + base)));
2714 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2715 }
2716
2717 /* The run-time loop is made up of 8 insns in the generic case while the
2718 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
2719 else if (size <= 4 * PROBE_INTERVAL)
2720 {
2721 HOST_WIDE_INT i, rem;
2722
2723 emit_set_insn (reg1,
2724 plus_constant (Pmode,
2725 stack_pointer_rtx,
2726 -(first + PROBE_INTERVAL)));
2727 emit_stack_probe (reg1);
2728
2729 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2730 it exceeds SIZE. If only two probes are needed, this will not
2731 generate any code. Then probe at FIRST + SIZE. */
2732 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2733 {
2734 emit_set_insn (reg1,
2735 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2736 emit_stack_probe (reg1);
2737 }
2738
2739 rem = size - (i - PROBE_INTERVAL);
2740 if (rem > 256)
2741 {
2742 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2743
2744 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2745 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2746 }
2747 else
2748 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2749 }
2750
2751 /* Otherwise, do the same as above, but in a loop. Note that we must be
2752 extra careful with variables wrapping around because we might be at
2753 the very top (or the very bottom) of the address space and we have
2754 to be able to handle this case properly; in particular, we use an
2755 equality test for the loop condition. */
2756 else
2757 {
2758 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2759
2760 /* Step 1: round SIZE to the previous multiple of the interval. */
2761
2762 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2763
2764
2765 /* Step 2: compute initial and final value of the loop counter. */
2766
2767 /* TEST_ADDR = SP + FIRST. */
2768 emit_set_insn (reg1,
2769 plus_constant (Pmode, stack_pointer_rtx, -first));
2770
2771 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2772 HOST_WIDE_INT adjustment = - (first + rounded_size);
2773 if (! aarch64_uimm12_shift (adjustment))
2774 {
2775 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2776 true, Pmode);
2777 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2778 }
2779 else
2780 {
2781 emit_set_insn (reg2,
2782 plus_constant (Pmode, stack_pointer_rtx, adjustment));
2783 }
2784
2785 /* Step 3: the loop
2786
2787 do
2788 {
2789 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2790 probe at TEST_ADDR
2791 }
2792 while (TEST_ADDR != LAST_ADDR)
2793
2794 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2795 until it is equal to ROUNDED_SIZE. */
2796
2797 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2798
2799
2800 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2801 that SIZE is equal to ROUNDED_SIZE. */
2802
2803 if (size != rounded_size)
2804 {
2805 HOST_WIDE_INT rem = size - rounded_size;
2806
2807 if (rem > 256)
2808 {
2809 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2810
2811 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2812 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2813 }
2814 else
2815 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2816 }
2817 }
2818
2819 /* Make sure nothing is scheduled before we are done. */
2820 emit_insn (gen_blockage ());
2821 }
2822
2823 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2824 absolute addresses. */
2825
2826 const char *
2827 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2828 {
2829 static int labelno = 0;
2830 char loop_lab[32];
2831 rtx xops[2];
2832
2833 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2834
2835 /* Loop. */
2836 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2837
2838 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2839 xops[0] = reg1;
2840 xops[1] = GEN_INT (PROBE_INTERVAL);
2841 output_asm_insn ("sub\t%0, %0, %1", xops);
2842
2843 /* Probe at TEST_ADDR. */
2844 output_asm_insn ("str\txzr, [%0]", xops);
2845
2846 /* Test if TEST_ADDR == LAST_ADDR. */
2847 xops[1] = reg2;
2848 output_asm_insn ("cmp\t%0, %1", xops);
2849
2850 /* Branch. */
2851 fputs ("\tb.ne\t", asm_out_file);
2852 assemble_name_raw (asm_out_file, loop_lab);
2853 fputc ('\n', asm_out_file);
2854
2855 return "";
2856 }
2857
2858 static bool
2859 aarch64_frame_pointer_required (void)
2860 {
2861 /* In aarch64_override_options_after_change
2862 flag_omit_leaf_frame_pointer turns off the frame pointer by
2863 default. Turn it back on now if we've not got a leaf
2864 function. */
2865 if (flag_omit_leaf_frame_pointer
2866 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2867 return true;
2868
2869 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2870 if (crtl->calls_eh_return)
2871 return true;
2872
2873 return false;
2874 }
2875
2876 /* Mark the registers that need to be saved by the callee and calculate
2877 the size of the callee-saved registers area and frame record (both FP
2878 and LR may be omitted). */
2879 static void
2880 aarch64_layout_frame (void)
2881 {
2882 HOST_WIDE_INT offset = 0;
2883 int regno, last_fp_reg = INVALID_REGNUM;
2884
2885 if (reload_completed && cfun->machine->frame.laid_out)
2886 return;
2887
2888 #define SLOT_NOT_REQUIRED (-2)
2889 #define SLOT_REQUIRED (-1)
2890
2891 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2892 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2893
2894 /* First mark all the registers that really need to be saved... */
2895 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2896 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2897
2898 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2899 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2900
2901 /* ... that includes the eh data registers (if needed)... */
2902 if (crtl->calls_eh_return)
2903 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2904 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2905 = SLOT_REQUIRED;
2906
2907 /* ... and any callee saved register that dataflow says is live. */
2908 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2909 if (df_regs_ever_live_p (regno)
2910 && (regno == R30_REGNUM
2911 || !call_used_regs[regno]))
2912 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2913
2914 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2915 if (df_regs_ever_live_p (regno)
2916 && !call_used_regs[regno])
2917 {
2918 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2919 last_fp_reg = regno;
2920 }
2921
2922 if (frame_pointer_needed)
2923 {
2924 /* FP and LR are placed in the linkage record. */
2925 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2926 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2927 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2928 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2929 offset += 2 * UNITS_PER_WORD;
2930 }
2931
2932 /* Now assign stack slots for them. */
2933 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2934 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2935 {
2936 cfun->machine->frame.reg_offset[regno] = offset;
2937 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2938 cfun->machine->frame.wb_candidate1 = regno;
2939 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2940 cfun->machine->frame.wb_candidate2 = regno;
2941 offset += UNITS_PER_WORD;
2942 }
2943
2944 HOST_WIDE_INT max_int_offset = offset;
2945 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2946 bool has_align_gap = offset != max_int_offset;
2947
2948 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2949 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2950 {
2951 /* If there is an alignment gap between integer and fp callee-saves,
2952 allocate the last fp register to it if possible. */
2953 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2954 {
2955 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2956 break;
2957 }
2958
2959 cfun->machine->frame.reg_offset[regno] = offset;
2960 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2961 cfun->machine->frame.wb_candidate1 = regno;
2962 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2963 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2964 cfun->machine->frame.wb_candidate2 = regno;
2965 offset += UNITS_PER_WORD;
2966 }
2967
2968 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2969
2970 cfun->machine->frame.saved_regs_size = offset;
2971
2972 HOST_WIDE_INT varargs_and_saved_regs_size
2973 = offset + cfun->machine->frame.saved_varargs_size;
2974
2975 cfun->machine->frame.hard_fp_offset
2976 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2977 STACK_BOUNDARY / BITS_PER_UNIT);
2978
2979 cfun->machine->frame.frame_size
2980 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2981 + crtl->outgoing_args_size,
2982 STACK_BOUNDARY / BITS_PER_UNIT);
2983
2984 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2985
2986 cfun->machine->frame.initial_adjust = 0;
2987 cfun->machine->frame.final_adjust = 0;
2988 cfun->machine->frame.callee_adjust = 0;
2989 cfun->machine->frame.callee_offset = 0;
2990
2991 HOST_WIDE_INT max_push_offset = 0;
2992 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2993 max_push_offset = 512;
2994 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2995 max_push_offset = 256;
2996
2997 if (cfun->machine->frame.frame_size < max_push_offset
2998 && crtl->outgoing_args_size == 0)
2999 {
3000 /* Simple, small frame with no outgoing arguments:
3001 stp reg1, reg2, [sp, -frame_size]!
3002 stp reg3, reg4, [sp, 16] */
3003 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
3004 }
3005 else if ((crtl->outgoing_args_size
3006 + cfun->machine->frame.saved_regs_size < 512)
3007 && !(cfun->calls_alloca
3008 && cfun->machine->frame.hard_fp_offset < max_push_offset))
3009 {
3010 /* Frame with small outgoing arguments:
3011 sub sp, sp, frame_size
3012 stp reg1, reg2, [sp, outgoing_args_size]
3013 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3014 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3015 cfun->machine->frame.callee_offset
3016 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3017 }
3018 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3019 {
3020 /* Frame with large outgoing arguments but a small local area:
3021 stp reg1, reg2, [sp, -hard_fp_offset]!
3022 stp reg3, reg4, [sp, 16]
3023 sub sp, sp, outgoing_args_size */
3024 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3025 cfun->machine->frame.final_adjust
3026 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3027 }
3028 else if (!frame_pointer_needed
3029 && varargs_and_saved_regs_size < max_push_offset)
3030 {
3031 /* Frame with large local area and outgoing arguments (this pushes the
3032 callee-saves first, followed by the locals and outgoing area):
3033 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3034 stp reg3, reg4, [sp, 16]
3035 sub sp, sp, frame_size - varargs_and_saved_regs_size */
3036 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3037 cfun->machine->frame.final_adjust
3038 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3039 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3040 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3041 }
3042 else
3043 {
3044 /* Frame with large local area and outgoing arguments using frame pointer:
3045 sub sp, sp, hard_fp_offset
3046 stp x29, x30, [sp, 0]
3047 add x29, sp, 0
3048 stp reg3, reg4, [sp, 16]
3049 sub sp, sp, outgoing_args_size */
3050 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3051 cfun->machine->frame.final_adjust
3052 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3053 }
3054
3055 cfun->machine->frame.laid_out = true;
3056 }
3057
3058 /* Return true if the register REGNO is saved on entry to
3059 the current function. */
3060
3061 static bool
3062 aarch64_register_saved_on_entry (int regno)
3063 {
3064 return cfun->machine->frame.reg_offset[regno] >= 0;
3065 }
3066
3067 /* Return the next register up from REGNO up to LIMIT for the callee
3068 to save. */
3069
3070 static unsigned
3071 aarch64_next_callee_save (unsigned regno, unsigned limit)
3072 {
3073 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3074 regno ++;
3075 return regno;
3076 }
3077
3078 /* Push the register number REGNO of mode MODE to the stack with write-back
3079 adjusting the stack by ADJUSTMENT. */
3080
3081 static void
3082 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3083 HOST_WIDE_INT adjustment)
3084 {
3085 rtx base_rtx = stack_pointer_rtx;
3086 rtx insn, reg, mem;
3087
3088 reg = gen_rtx_REG (mode, regno);
3089 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3090 plus_constant (Pmode, base_rtx, -adjustment));
3091 mem = gen_rtx_MEM (mode, mem);
3092
3093 insn = emit_move_insn (mem, reg);
3094 RTX_FRAME_RELATED_P (insn) = 1;
3095 }
3096
3097 /* Generate and return an instruction to store the pair of registers
3098 REG and REG2 of mode MODE to location BASE with write-back adjusting
3099 the stack location BASE by ADJUSTMENT. */
3100
3101 static rtx
3102 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3103 HOST_WIDE_INT adjustment)
3104 {
3105 switch (mode)
3106 {
3107 case DImode:
3108 return gen_storewb_pairdi_di (base, base, reg, reg2,
3109 GEN_INT (-adjustment),
3110 GEN_INT (UNITS_PER_WORD - adjustment));
3111 case DFmode:
3112 return gen_storewb_pairdf_di (base, base, reg, reg2,
3113 GEN_INT (-adjustment),
3114 GEN_INT (UNITS_PER_WORD - adjustment));
3115 default:
3116 gcc_unreachable ();
3117 }
3118 }
3119
3120 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3121 stack pointer by ADJUSTMENT. */
3122
3123 static void
3124 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3125 {
3126 rtx_insn *insn;
3127 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3128
3129 if (regno2 == INVALID_REGNUM)
3130 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3131
3132 rtx reg1 = gen_rtx_REG (mode, regno1);
3133 rtx reg2 = gen_rtx_REG (mode, regno2);
3134
3135 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3136 reg2, adjustment));
3137 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3138 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3139 RTX_FRAME_RELATED_P (insn) = 1;
3140 }
3141
3142 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3143 adjusting it by ADJUSTMENT afterwards. */
3144
3145 static rtx
3146 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3147 HOST_WIDE_INT adjustment)
3148 {
3149 switch (mode)
3150 {
3151 case DImode:
3152 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3153 GEN_INT (UNITS_PER_WORD));
3154 case DFmode:
3155 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3156 GEN_INT (UNITS_PER_WORD));
3157 default:
3158 gcc_unreachable ();
3159 }
3160 }
3161
3162 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3163 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3164 into CFI_OPS. */
3165
3166 static void
3167 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3168 rtx *cfi_ops)
3169 {
3170 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3171 rtx reg1 = gen_rtx_REG (mode, regno1);
3172
3173 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3174
3175 if (regno2 == INVALID_REGNUM)
3176 {
3177 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3178 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3179 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3180 }
3181 else
3182 {
3183 rtx reg2 = gen_rtx_REG (mode, regno2);
3184 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3185 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3186 reg2, adjustment));
3187 }
3188 }
3189
3190 /* Generate and return a store pair instruction of mode MODE to store
3191 register REG1 to MEM1 and register REG2 to MEM2. */
3192
3193 static rtx
3194 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3195 rtx reg2)
3196 {
3197 switch (mode)
3198 {
3199 case DImode:
3200 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3201
3202 case DFmode:
3203 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3204
3205 default:
3206 gcc_unreachable ();
3207 }
3208 }
3209
3210 /* Generate and regurn a load pair isntruction of mode MODE to load register
3211 REG1 from MEM1 and register REG2 from MEM2. */
3212
3213 static rtx
3214 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3215 rtx mem2)
3216 {
3217 switch (mode)
3218 {
3219 case DImode:
3220 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3221
3222 case DFmode:
3223 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3224
3225 default:
3226 gcc_unreachable ();
3227 }
3228 }
3229
3230 /* Return TRUE if return address signing should be enabled for the current
3231 function, otherwise return FALSE. */
3232
3233 bool
3234 aarch64_return_address_signing_enabled (void)
3235 {
3236 /* This function should only be called after frame laid out. */
3237 gcc_assert (cfun->machine->frame.laid_out);
3238
3239 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3240 if it's LR is pushed onto stack. */
3241 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3242 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3243 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3244 }
3245
3246 /* Emit code to save the callee-saved registers from register number START
3247 to LIMIT to the stack at the location starting at offset START_OFFSET,
3248 skipping any write-back candidates if SKIP_WB is true. */
3249
3250 static void
3251 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3252 unsigned start, unsigned limit, bool skip_wb)
3253 {
3254 rtx_insn *insn;
3255 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3256 ? gen_frame_mem : gen_rtx_MEM);
3257 unsigned regno;
3258 unsigned regno2;
3259
3260 for (regno = aarch64_next_callee_save (start, limit);
3261 regno <= limit;
3262 regno = aarch64_next_callee_save (regno + 1, limit))
3263 {
3264 rtx reg, mem;
3265 HOST_WIDE_INT offset;
3266
3267 if (skip_wb
3268 && (regno == cfun->machine->frame.wb_candidate1
3269 || regno == cfun->machine->frame.wb_candidate2))
3270 continue;
3271
3272 if (cfun->machine->reg_is_wrapped_separately[regno])
3273 continue;
3274
3275 reg = gen_rtx_REG (mode, regno);
3276 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3277 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3278 offset));
3279
3280 regno2 = aarch64_next_callee_save (regno + 1, limit);
3281
3282 if (regno2 <= limit
3283 && !cfun->machine->reg_is_wrapped_separately[regno2]
3284 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3285 == cfun->machine->frame.reg_offset[regno2]))
3286
3287 {
3288 rtx reg2 = gen_rtx_REG (mode, regno2);
3289 rtx mem2;
3290
3291 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3292 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3293 offset));
3294 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3295 reg2));
3296
3297 /* The first part of a frame-related parallel insn is
3298 always assumed to be relevant to the frame
3299 calculations; subsequent parts, are only
3300 frame-related if explicitly marked. */
3301 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3302 regno = regno2;
3303 }
3304 else
3305 insn = emit_move_insn (mem, reg);
3306
3307 RTX_FRAME_RELATED_P (insn) = 1;
3308 }
3309 }
3310
3311 /* Emit code to restore the callee registers of mode MODE from register
3312 number START up to and including LIMIT. Restore from the stack offset
3313 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3314 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3315
3316 static void
3317 aarch64_restore_callee_saves (machine_mode mode,
3318 HOST_WIDE_INT start_offset, unsigned start,
3319 unsigned limit, bool skip_wb, rtx *cfi_ops)
3320 {
3321 rtx base_rtx = stack_pointer_rtx;
3322 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3323 ? gen_frame_mem : gen_rtx_MEM);
3324 unsigned regno;
3325 unsigned regno2;
3326 HOST_WIDE_INT offset;
3327
3328 for (regno = aarch64_next_callee_save (start, limit);
3329 regno <= limit;
3330 regno = aarch64_next_callee_save (regno + 1, limit))
3331 {
3332 if (cfun->machine->reg_is_wrapped_separately[regno])
3333 continue;
3334
3335 rtx reg, mem;
3336
3337 if (skip_wb
3338 && (regno == cfun->machine->frame.wb_candidate1
3339 || regno == cfun->machine->frame.wb_candidate2))
3340 continue;
3341
3342 reg = gen_rtx_REG (mode, regno);
3343 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3344 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3345
3346 regno2 = aarch64_next_callee_save (regno + 1, limit);
3347
3348 if (regno2 <= limit
3349 && !cfun->machine->reg_is_wrapped_separately[regno2]
3350 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3351 == cfun->machine->frame.reg_offset[regno2]))
3352 {
3353 rtx reg2 = gen_rtx_REG (mode, regno2);
3354 rtx mem2;
3355
3356 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3357 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3358 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3359
3360 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3361 regno = regno2;
3362 }
3363 else
3364 emit_move_insn (reg, mem);
3365 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3366 }
3367 }
3368
3369 static inline bool
3370 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3371 HOST_WIDE_INT offset)
3372 {
3373 return offset >= -256 && offset < 256;
3374 }
3375
3376 static inline bool
3377 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3378 {
3379 return (offset >= 0
3380 && offset < 4096 * GET_MODE_SIZE (mode)
3381 && offset % GET_MODE_SIZE (mode) == 0);
3382 }
3383
3384 bool
3385 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3386 {
3387 return (offset >= -64 * GET_MODE_SIZE (mode)
3388 && offset < 64 * GET_MODE_SIZE (mode)
3389 && offset % GET_MODE_SIZE (mode) == 0);
3390 }
3391
3392 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3393
3394 static sbitmap
3395 aarch64_get_separate_components (void)
3396 {
3397 aarch64_layout_frame ();
3398
3399 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3400 bitmap_clear (components);
3401
3402 /* The registers we need saved to the frame. */
3403 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3404 if (aarch64_register_saved_on_entry (regno))
3405 {
3406 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3407 if (!frame_pointer_needed)
3408 offset += cfun->machine->frame.frame_size
3409 - cfun->machine->frame.hard_fp_offset;
3410 /* Check that we can access the stack slot of the register with one
3411 direct load with no adjustments needed. */
3412 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3413 bitmap_set_bit (components, regno);
3414 }
3415
3416 /* Don't mess with the hard frame pointer. */
3417 if (frame_pointer_needed)
3418 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3419
3420 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3421 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3422 /* If aarch64_layout_frame has chosen registers to store/restore with
3423 writeback don't interfere with them to avoid having to output explicit
3424 stack adjustment instructions. */
3425 if (reg2 != INVALID_REGNUM)
3426 bitmap_clear_bit (components, reg2);
3427 if (reg1 != INVALID_REGNUM)
3428 bitmap_clear_bit (components, reg1);
3429
3430 bitmap_clear_bit (components, LR_REGNUM);
3431 bitmap_clear_bit (components, SP_REGNUM);
3432
3433 return components;
3434 }
3435
3436 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3437
3438 static sbitmap
3439 aarch64_components_for_bb (basic_block bb)
3440 {
3441 bitmap in = DF_LIVE_IN (bb);
3442 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3443 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3444
3445 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3446 bitmap_clear (components);
3447
3448 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3449 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3450 if ((!call_used_regs[regno])
3451 && (bitmap_bit_p (in, regno)
3452 || bitmap_bit_p (gen, regno)
3453 || bitmap_bit_p (kill, regno)))
3454 bitmap_set_bit (components, regno);
3455
3456 return components;
3457 }
3458
3459 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3460 Nothing to do for aarch64. */
3461
3462 static void
3463 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3464 {
3465 }
3466
3467 /* Return the next set bit in BMP from START onwards. Return the total number
3468 of bits in BMP if no set bit is found at or after START. */
3469
3470 static unsigned int
3471 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3472 {
3473 unsigned int nbits = SBITMAP_SIZE (bmp);
3474 if (start == nbits)
3475 return start;
3476
3477 gcc_assert (start < nbits);
3478 for (unsigned int i = start; i < nbits; i++)
3479 if (bitmap_bit_p (bmp, i))
3480 return i;
3481
3482 return nbits;
3483 }
3484
3485 /* Do the work for aarch64_emit_prologue_components and
3486 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3487 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3488 for these components or the epilogue sequence. That is, it determines
3489 whether we should emit stores or loads and what kind of CFA notes to attach
3490 to the insns. Otherwise the logic for the two sequences is very
3491 similar. */
3492
3493 static void
3494 aarch64_process_components (sbitmap components, bool prologue_p)
3495 {
3496 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3497 ? HARD_FRAME_POINTER_REGNUM
3498 : STACK_POINTER_REGNUM);
3499
3500 unsigned last_regno = SBITMAP_SIZE (components);
3501 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3502 rtx_insn *insn = NULL;
3503
3504 while (regno != last_regno)
3505 {
3506 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3507 so DFmode for the vector registers is enough. */
3508 machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode;
3509 rtx reg = gen_rtx_REG (mode, regno);
3510 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3511 if (!frame_pointer_needed)
3512 offset += cfun->machine->frame.frame_size
3513 - cfun->machine->frame.hard_fp_offset;
3514 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3515 rtx mem = gen_frame_mem (mode, addr);
3516
3517 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3518 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3519 /* No more registers to handle after REGNO.
3520 Emit a single save/restore and exit. */
3521 if (regno2 == last_regno)
3522 {
3523 insn = emit_insn (set);
3524 RTX_FRAME_RELATED_P (insn) = 1;
3525 if (prologue_p)
3526 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3527 else
3528 add_reg_note (insn, REG_CFA_RESTORE, reg);
3529 break;
3530 }
3531
3532 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3533 /* The next register is not of the same class or its offset is not
3534 mergeable with the current one into a pair. */
3535 if (!satisfies_constraint_Ump (mem)
3536 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3537 || (offset2 - cfun->machine->frame.reg_offset[regno])
3538 != GET_MODE_SIZE (mode))
3539 {
3540 insn = emit_insn (set);
3541 RTX_FRAME_RELATED_P (insn) = 1;
3542 if (prologue_p)
3543 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3544 else
3545 add_reg_note (insn, REG_CFA_RESTORE, reg);
3546
3547 regno = regno2;
3548 continue;
3549 }
3550
3551 /* REGNO2 can be saved/restored in a pair with REGNO. */
3552 rtx reg2 = gen_rtx_REG (mode, regno2);
3553 if (!frame_pointer_needed)
3554 offset2 += cfun->machine->frame.frame_size
3555 - cfun->machine->frame.hard_fp_offset;
3556 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3557 rtx mem2 = gen_frame_mem (mode, addr2);
3558 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3559 : gen_rtx_SET (reg2, mem2);
3560
3561 if (prologue_p)
3562 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3563 else
3564 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3565
3566 RTX_FRAME_RELATED_P (insn) = 1;
3567 if (prologue_p)
3568 {
3569 add_reg_note (insn, REG_CFA_OFFSET, set);
3570 add_reg_note (insn, REG_CFA_OFFSET, set2);
3571 }
3572 else
3573 {
3574 add_reg_note (insn, REG_CFA_RESTORE, reg);
3575 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3576 }
3577
3578 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3579 }
3580 }
3581
3582 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3583
3584 static void
3585 aarch64_emit_prologue_components (sbitmap components)
3586 {
3587 aarch64_process_components (components, true);
3588 }
3589
3590 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3591
3592 static void
3593 aarch64_emit_epilogue_components (sbitmap components)
3594 {
3595 aarch64_process_components (components, false);
3596 }
3597
3598 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3599
3600 static void
3601 aarch64_set_handled_components (sbitmap components)
3602 {
3603 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3604 if (bitmap_bit_p (components, regno))
3605 cfun->machine->reg_is_wrapped_separately[regno] = true;
3606 }
3607
3608 /* AArch64 stack frames generated by this compiler look like:
3609
3610 +-------------------------------+
3611 | |
3612 | incoming stack arguments |
3613 | |
3614 +-------------------------------+
3615 | | <-- incoming stack pointer (aligned)
3616 | callee-allocated save area |
3617 | for register varargs |
3618 | |
3619 +-------------------------------+
3620 | local variables | <-- frame_pointer_rtx
3621 | |
3622 +-------------------------------+
3623 | padding0 | \
3624 +-------------------------------+ |
3625 | callee-saved registers | | frame.saved_regs_size
3626 +-------------------------------+ |
3627 | LR' | |
3628 +-------------------------------+ |
3629 | FP' | / <- hard_frame_pointer_rtx (aligned)
3630 +-------------------------------+
3631 | dynamic allocation |
3632 +-------------------------------+
3633 | padding |
3634 +-------------------------------+
3635 | outgoing stack arguments | <-- arg_pointer
3636 | |
3637 +-------------------------------+
3638 | | <-- stack_pointer_rtx (aligned)
3639
3640 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3641 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3642 unchanged. */
3643
3644 /* Generate the prologue instructions for entry into a function.
3645 Establish the stack frame by decreasing the stack pointer with a
3646 properly calculated size and, if necessary, create a frame record
3647 filled with the values of LR and previous frame pointer. The
3648 current FP is also set up if it is in use. */
3649
3650 void
3651 aarch64_expand_prologue (void)
3652 {
3653 aarch64_layout_frame ();
3654
3655 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3656 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3657 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3658 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3659 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3660 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3661 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3662 rtx_insn *insn;
3663
3664 /* Sign return address for functions. */
3665 if (aarch64_return_address_signing_enabled ())
3666 {
3667 insn = emit_insn (gen_pacisp ());
3668 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3669 RTX_FRAME_RELATED_P (insn) = 1;
3670 }
3671
3672 if (flag_stack_usage_info)
3673 current_function_static_stack_size = frame_size;
3674
3675 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3676 {
3677 if (crtl->is_leaf && !cfun->calls_alloca)
3678 {
3679 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3680 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3681 frame_size - STACK_CHECK_PROTECT);
3682 }
3683 else if (frame_size > 0)
3684 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3685 }
3686
3687 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3688
3689 if (callee_adjust != 0)
3690 aarch64_push_regs (reg1, reg2, callee_adjust);
3691
3692 if (frame_pointer_needed)
3693 {
3694 if (callee_adjust == 0)
3695 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3696 R30_REGNUM, false);
3697 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3698 stack_pointer_rtx,
3699 GEN_INT (callee_offset)));
3700 RTX_FRAME_RELATED_P (insn) = 1;
3701 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3702 }
3703
3704 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3705 callee_adjust != 0 || frame_pointer_needed);
3706 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3707 callee_adjust != 0 || frame_pointer_needed);
3708 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3709 }
3710
3711 /* Return TRUE if we can use a simple_return insn.
3712
3713 This function checks whether the callee saved stack is empty, which
3714 means no restore actions are need. The pro_and_epilogue will use
3715 this to check whether shrink-wrapping opt is feasible. */
3716
3717 bool
3718 aarch64_use_return_insn_p (void)
3719 {
3720 if (!reload_completed)
3721 return false;
3722
3723 if (crtl->profile)
3724 return false;
3725
3726 aarch64_layout_frame ();
3727
3728 return cfun->machine->frame.frame_size == 0;
3729 }
3730
3731 /* Generate the epilogue instructions for returning from a function.
3732 This is almost exactly the reverse of the prolog sequence, except
3733 that we need to insert barriers to avoid scheduling loads that read
3734 from a deallocated stack, and we optimize the unwind records by
3735 emitting them all together if possible. */
3736 void
3737 aarch64_expand_epilogue (bool for_sibcall)
3738 {
3739 aarch64_layout_frame ();
3740
3741 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3742 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3743 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3744 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3745 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3746 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3747 rtx cfi_ops = NULL;
3748 rtx_insn *insn;
3749
3750 /* We need to add memory barrier to prevent read from deallocated stack. */
3751 bool need_barrier_p = (get_frame_size ()
3752 + cfun->machine->frame.saved_varargs_size) != 0;
3753
3754 /* Emit a barrier to prevent loads from a deallocated stack. */
3755 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3756 || crtl->calls_eh_return)
3757 {
3758 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3759 need_barrier_p = false;
3760 }
3761
3762 /* Restore the stack pointer from the frame pointer if it may not
3763 be the same as the stack pointer. */
3764 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3765 {
3766 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3767 hard_frame_pointer_rtx,
3768 GEN_INT (-callee_offset)));
3769 /* If writeback is used when restoring callee-saves, the CFA
3770 is restored on the instruction doing the writeback. */
3771 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3772 }
3773 else
3774 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3775
3776 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3777 callee_adjust != 0, &cfi_ops);
3778 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3779 callee_adjust != 0, &cfi_ops);
3780
3781 if (need_barrier_p)
3782 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3783
3784 if (callee_adjust != 0)
3785 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3786
3787 if (callee_adjust != 0 || initial_adjust > 65536)
3788 {
3789 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3790 insn = get_last_insn ();
3791 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3792 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3793 RTX_FRAME_RELATED_P (insn) = 1;
3794 cfi_ops = NULL;
3795 }
3796
3797 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3798
3799 if (cfi_ops)
3800 {
3801 /* Emit delayed restores and reset the CFA to be SP. */
3802 insn = get_last_insn ();
3803 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3804 REG_NOTES (insn) = cfi_ops;
3805 RTX_FRAME_RELATED_P (insn) = 1;
3806 }
3807
3808 /* We prefer to emit the combined return/authenticate instruction RETAA,
3809 however there are three cases in which we must instead emit an explicit
3810 authentication instruction.
3811
3812 1) Sibcalls don't return in a normal way, so if we're about to call one
3813 we must authenticate.
3814
3815 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3816 generating code for !TARGET_ARMV8_3 we can't use it and must
3817 explicitly authenticate.
3818
3819 3) On an eh_return path we make extra stack adjustments to update the
3820 canonical frame address to be the exception handler's CFA. We want
3821 to authenticate using the CFA of the function which calls eh_return.
3822 */
3823 if (aarch64_return_address_signing_enabled ()
3824 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3825 {
3826 insn = emit_insn (gen_autisp ());
3827 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3828 RTX_FRAME_RELATED_P (insn) = 1;
3829 }
3830
3831 /* Stack adjustment for exception handler. */
3832 if (crtl->calls_eh_return)
3833 {
3834 /* We need to unwind the stack by the offset computed by
3835 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3836 to be SP; letting the CFA move during this adjustment
3837 is just as correct as retaining the CFA from the body
3838 of the function. Therefore, do nothing special. */
3839 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3840 }
3841
3842 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3843 if (!for_sibcall)
3844 emit_jump_insn (ret_rtx);
3845 }
3846
3847 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3848 normally or return to a previous frame after unwinding.
3849
3850 An EH return uses a single shared return sequence. The epilogue is
3851 exactly like a normal epilogue except that it has an extra input
3852 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3853 that must be applied after the frame has been destroyed. An extra label
3854 is inserted before the epilogue which initializes this register to zero,
3855 and this is the entry point for a normal return.
3856
3857 An actual EH return updates the return address, initializes the stack
3858 adjustment and jumps directly into the epilogue (bypassing the zeroing
3859 of the adjustment). Since the return address is typically saved on the
3860 stack when a function makes a call, the saved LR must be updated outside
3861 the epilogue.
3862
3863 This poses problems as the store is generated well before the epilogue,
3864 so the offset of LR is not known yet. Also optimizations will remove the
3865 store as it appears dead, even after the epilogue is generated (as the
3866 base or offset for loading LR is different in many cases).
3867
3868 To avoid these problems this implementation forces the frame pointer
3869 in eh_return functions so that the location of LR is fixed and known early.
3870 It also marks the store volatile, so no optimization is permitted to
3871 remove the store. */
3872 rtx
3873 aarch64_eh_return_handler_rtx (void)
3874 {
3875 rtx tmp = gen_frame_mem (Pmode,
3876 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3877
3878 /* Mark the store volatile, so no optimization is permitted to remove it. */
3879 MEM_VOLATILE_P (tmp) = true;
3880 return tmp;
3881 }
3882
3883 /* Output code to add DELTA to the first argument, and then jump
3884 to FUNCTION. Used for C++ multiple inheritance. */
3885 static void
3886 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3887 HOST_WIDE_INT delta,
3888 HOST_WIDE_INT vcall_offset,
3889 tree function)
3890 {
3891 /* The this pointer is always in x0. Note that this differs from
3892 Arm where the this pointer maybe bumped to r1 if r0 is required
3893 to return a pointer to an aggregate. On AArch64 a result value
3894 pointer will be in x8. */
3895 int this_regno = R0_REGNUM;
3896 rtx this_rtx, temp0, temp1, addr, funexp;
3897 rtx_insn *insn;
3898
3899 reload_completed = 1;
3900 emit_note (NOTE_INSN_PROLOGUE_END);
3901
3902 if (vcall_offset == 0)
3903 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3904 else
3905 {
3906 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3907
3908 this_rtx = gen_rtx_REG (Pmode, this_regno);
3909 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3910 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3911
3912 addr = this_rtx;
3913 if (delta != 0)
3914 {
3915 if (delta >= -256 && delta < 256)
3916 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3917 plus_constant (Pmode, this_rtx, delta));
3918 else
3919 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3920 }
3921
3922 if (Pmode == ptr_mode)
3923 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3924 else
3925 aarch64_emit_move (temp0,
3926 gen_rtx_ZERO_EXTEND (Pmode,
3927 gen_rtx_MEM (ptr_mode, addr)));
3928
3929 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3930 addr = plus_constant (Pmode, temp0, vcall_offset);
3931 else
3932 {
3933 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3934 Pmode);
3935 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3936 }
3937
3938 if (Pmode == ptr_mode)
3939 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3940 else
3941 aarch64_emit_move (temp1,
3942 gen_rtx_SIGN_EXTEND (Pmode,
3943 gen_rtx_MEM (ptr_mode, addr)));
3944
3945 emit_insn (gen_add2_insn (this_rtx, temp1));
3946 }
3947
3948 /* Generate a tail call to the target function. */
3949 if (!TREE_USED (function))
3950 {
3951 assemble_external (function);
3952 TREE_USED (function) = 1;
3953 }
3954 funexp = XEXP (DECL_RTL (function), 0);
3955 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3956 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3957 SIBLING_CALL_P (insn) = 1;
3958
3959 insn = get_insns ();
3960 shorten_branches (insn);
3961 final_start_function (insn, file, 1);
3962 final (insn, file, 1);
3963 final_end_function ();
3964
3965 /* Stop pretending to be a post-reload pass. */
3966 reload_completed = 0;
3967 }
3968
3969 static bool
3970 aarch64_tls_referenced_p (rtx x)
3971 {
3972 if (!TARGET_HAVE_TLS)
3973 return false;
3974 subrtx_iterator::array_type array;
3975 FOR_EACH_SUBRTX (iter, array, x, ALL)
3976 {
3977 const_rtx x = *iter;
3978 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3979 return true;
3980 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3981 TLS offsets, not real symbol references. */
3982 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3983 iter.skip_subrtxes ();
3984 }
3985 return false;
3986 }
3987
3988
3989 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3990 a left shift of 0 or 12 bits. */
3991 bool
3992 aarch64_uimm12_shift (HOST_WIDE_INT val)
3993 {
3994 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3995 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3996 );
3997 }
3998
3999
4000 /* Return true if val is an immediate that can be loaded into a
4001 register by a MOVZ instruction. */
4002 static bool
4003 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
4004 {
4005 if (GET_MODE_SIZE (mode) > 4)
4006 {
4007 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
4008 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
4009 return 1;
4010 }
4011 else
4012 {
4013 /* Ignore sign extension. */
4014 val &= (HOST_WIDE_INT) 0xffffffff;
4015 }
4016 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4017 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
4018 }
4019
4020 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4021
4022 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4023 {
4024 0x0000000100000001ull,
4025 0x0001000100010001ull,
4026 0x0101010101010101ull,
4027 0x1111111111111111ull,
4028 0x5555555555555555ull,
4029 };
4030
4031
4032 /* Return true if val is a valid bitmask immediate. */
4033
4034 bool
4035 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4036 {
4037 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4038 int bits;
4039
4040 /* Check for a single sequence of one bits and return quickly if so.
4041 The special cases of all ones and all zeroes returns false. */
4042 val = (unsigned HOST_WIDE_INT) val_in;
4043 tmp = val + (val & -val);
4044
4045 if (tmp == (tmp & -tmp))
4046 return (val + 1) > 1;
4047
4048 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4049 if (mode == SImode)
4050 val = (val << 32) | (val & 0xffffffff);
4051
4052 /* Invert if the immediate doesn't start with a zero bit - this means we
4053 only need to search for sequences of one bits. */
4054 if (val & 1)
4055 val = ~val;
4056
4057 /* Find the first set bit and set tmp to val with the first sequence of one
4058 bits removed. Return success if there is a single sequence of ones. */
4059 first_one = val & -val;
4060 tmp = val & (val + first_one);
4061
4062 if (tmp == 0)
4063 return true;
4064
4065 /* Find the next set bit and compute the difference in bit position. */
4066 next_one = tmp & -tmp;
4067 bits = clz_hwi (first_one) - clz_hwi (next_one);
4068 mask = val ^ tmp;
4069
4070 /* Check the bit position difference is a power of 2, and that the first
4071 sequence of one bits fits within 'bits' bits. */
4072 if ((mask >> bits) != 0 || bits != (bits & -bits))
4073 return false;
4074
4075 /* Check the sequence of one bits is repeated 64/bits times. */
4076 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
4077 }
4078
4079 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4080 Assumed precondition: VAL_IN Is not zero. */
4081
4082 unsigned HOST_WIDE_INT
4083 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4084 {
4085 int lowest_bit_set = ctz_hwi (val_in);
4086 int highest_bit_set = floor_log2 (val_in);
4087 gcc_assert (val_in != 0);
4088
4089 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4090 (HOST_WIDE_INT_1U << lowest_bit_set));
4091 }
4092
4093 /* Create constant where bits outside of lowest bit set to highest bit set
4094 are set to 1. */
4095
4096 unsigned HOST_WIDE_INT
4097 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4098 {
4099 return val_in | ~aarch64_and_split_imm1 (val_in);
4100 }
4101
4102 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4103
4104 bool
4105 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4106 {
4107 if (aarch64_bitmask_imm (val_in, mode))
4108 return false;
4109
4110 if (aarch64_move_imm (val_in, mode))
4111 return false;
4112
4113 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4114
4115 return aarch64_bitmask_imm (imm2, mode);
4116 }
4117
4118 /* Return true if val is an immediate that can be loaded into a
4119 register in a single instruction. */
4120 bool
4121 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4122 {
4123 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4124 return 1;
4125 return aarch64_bitmask_imm (val, mode);
4126 }
4127
4128 static bool
4129 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4130 {
4131 rtx base, offset;
4132
4133 if (GET_CODE (x) == HIGH)
4134 return true;
4135
4136 split_const (x, &base, &offset);
4137 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4138 {
4139 if (aarch64_classify_symbol (base, offset)
4140 != SYMBOL_FORCE_TO_MEM)
4141 return true;
4142 else
4143 /* Avoid generating a 64-bit relocation in ILP32; leave
4144 to aarch64_expand_mov_immediate to handle it properly. */
4145 return mode != ptr_mode;
4146 }
4147
4148 return aarch64_tls_referenced_p (x);
4149 }
4150
4151 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4152 The expansion for a table switch is quite expensive due to the number
4153 of instructions, the table lookup and hard to predict indirect jump.
4154 When optimizing for speed, and -O3 enabled, use the per-core tuning if
4155 set, otherwise use tables for > 16 cases as a tradeoff between size and
4156 performance. When optimizing for size, use the default setting. */
4157
4158 static unsigned int
4159 aarch64_case_values_threshold (void)
4160 {
4161 /* Use the specified limit for the number of cases before using jump
4162 tables at higher optimization levels. */
4163 if (optimize > 2
4164 && selected_cpu->tune->max_case_values != 0)
4165 return selected_cpu->tune->max_case_values;
4166 else
4167 return optimize_size ? default_case_values_threshold () : 17;
4168 }
4169
4170 /* Return true if register REGNO is a valid index register.
4171 STRICT_P is true if REG_OK_STRICT is in effect. */
4172
4173 bool
4174 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4175 {
4176 if (!HARD_REGISTER_NUM_P (regno))
4177 {
4178 if (!strict_p)
4179 return true;
4180
4181 if (!reg_renumber)
4182 return false;
4183
4184 regno = reg_renumber[regno];
4185 }
4186 return GP_REGNUM_P (regno);
4187 }
4188
4189 /* Return true if register REGNO is a valid base register for mode MODE.
4190 STRICT_P is true if REG_OK_STRICT is in effect. */
4191
4192 bool
4193 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4194 {
4195 if (!HARD_REGISTER_NUM_P (regno))
4196 {
4197 if (!strict_p)
4198 return true;
4199
4200 if (!reg_renumber)
4201 return false;
4202
4203 regno = reg_renumber[regno];
4204 }
4205
4206 /* The fake registers will be eliminated to either the stack or
4207 hard frame pointer, both of which are usually valid base registers.
4208 Reload deals with the cases where the eliminated form isn't valid. */
4209 return (GP_REGNUM_P (regno)
4210 || regno == SP_REGNUM
4211 || regno == FRAME_POINTER_REGNUM
4212 || regno == ARG_POINTER_REGNUM);
4213 }
4214
4215 /* Return true if X is a valid base register for mode MODE.
4216 STRICT_P is true if REG_OK_STRICT is in effect. */
4217
4218 static bool
4219 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4220 {
4221 if (!strict_p && GET_CODE (x) == SUBREG)
4222 x = SUBREG_REG (x);
4223
4224 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4225 }
4226
4227 /* Return true if address offset is a valid index. If it is, fill in INFO
4228 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4229
4230 static bool
4231 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4232 machine_mode mode, bool strict_p)
4233 {
4234 enum aarch64_address_type type;
4235 rtx index;
4236 int shift;
4237
4238 /* (reg:P) */
4239 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4240 && GET_MODE (x) == Pmode)
4241 {
4242 type = ADDRESS_REG_REG;
4243 index = x;
4244 shift = 0;
4245 }
4246 /* (sign_extend:DI (reg:SI)) */
4247 else if ((GET_CODE (x) == SIGN_EXTEND
4248 || GET_CODE (x) == ZERO_EXTEND)
4249 && GET_MODE (x) == DImode
4250 && GET_MODE (XEXP (x, 0)) == SImode)
4251 {
4252 type = (GET_CODE (x) == SIGN_EXTEND)
4253 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4254 index = XEXP (x, 0);
4255 shift = 0;
4256 }
4257 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4258 else if (GET_CODE (x) == MULT
4259 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4260 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4261 && GET_MODE (XEXP (x, 0)) == DImode
4262 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4263 && CONST_INT_P (XEXP (x, 1)))
4264 {
4265 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4266 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4267 index = XEXP (XEXP (x, 0), 0);
4268 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4269 }
4270 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4271 else if (GET_CODE (x) == ASHIFT
4272 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4273 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4274 && GET_MODE (XEXP (x, 0)) == DImode
4275 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4276 && CONST_INT_P (XEXP (x, 1)))
4277 {
4278 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4279 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4280 index = XEXP (XEXP (x, 0), 0);
4281 shift = INTVAL (XEXP (x, 1));
4282 }
4283 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4284 else if ((GET_CODE (x) == SIGN_EXTRACT
4285 || GET_CODE (x) == ZERO_EXTRACT)
4286 && GET_MODE (x) == DImode
4287 && GET_CODE (XEXP (x, 0)) == MULT
4288 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4289 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4290 {
4291 type = (GET_CODE (x) == SIGN_EXTRACT)
4292 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4293 index = XEXP (XEXP (x, 0), 0);
4294 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4295 if (INTVAL (XEXP (x, 1)) != 32 + shift
4296 || INTVAL (XEXP (x, 2)) != 0)
4297 shift = -1;
4298 }
4299 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4300 (const_int 0xffffffff<<shift)) */
4301 else if (GET_CODE (x) == AND
4302 && GET_MODE (x) == DImode
4303 && GET_CODE (XEXP (x, 0)) == MULT
4304 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4305 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4306 && CONST_INT_P (XEXP (x, 1)))
4307 {
4308 type = ADDRESS_REG_UXTW;
4309 index = XEXP (XEXP (x, 0), 0);
4310 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4311 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4312 shift = -1;
4313 }
4314 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4315 else if ((GET_CODE (x) == SIGN_EXTRACT
4316 || GET_CODE (x) == ZERO_EXTRACT)
4317 && GET_MODE (x) == DImode
4318 && GET_CODE (XEXP (x, 0)) == ASHIFT
4319 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4320 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4321 {
4322 type = (GET_CODE (x) == SIGN_EXTRACT)
4323 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4324 index = XEXP (XEXP (x, 0), 0);
4325 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4326 if (INTVAL (XEXP (x, 1)) != 32 + shift
4327 || INTVAL (XEXP (x, 2)) != 0)
4328 shift = -1;
4329 }
4330 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4331 (const_int 0xffffffff<<shift)) */
4332 else if (GET_CODE (x) == AND
4333 && GET_MODE (x) == DImode
4334 && GET_CODE (XEXP (x, 0)) == ASHIFT
4335 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4336 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4337 && CONST_INT_P (XEXP (x, 1)))
4338 {
4339 type = ADDRESS_REG_UXTW;
4340 index = XEXP (XEXP (x, 0), 0);
4341 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4342 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4343 shift = -1;
4344 }
4345 /* (mult:P (reg:P) (const_int scale)) */
4346 else if (GET_CODE (x) == MULT
4347 && GET_MODE (x) == Pmode
4348 && GET_MODE (XEXP (x, 0)) == Pmode
4349 && CONST_INT_P (XEXP (x, 1)))
4350 {
4351 type = ADDRESS_REG_REG;
4352 index = XEXP (x, 0);
4353 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4354 }
4355 /* (ashift:P (reg:P) (const_int shift)) */
4356 else if (GET_CODE (x) == ASHIFT
4357 && GET_MODE (x) == Pmode
4358 && GET_MODE (XEXP (x, 0)) == Pmode
4359 && CONST_INT_P (XEXP (x, 1)))
4360 {
4361 type = ADDRESS_REG_REG;
4362 index = XEXP (x, 0);
4363 shift = INTVAL (XEXP (x, 1));
4364 }
4365 else
4366 return false;
4367
4368 if (GET_CODE (index) == SUBREG)
4369 index = SUBREG_REG (index);
4370
4371 if ((shift == 0 ||
4372 (shift > 0 && shift <= 3
4373 && (1 << shift) == GET_MODE_SIZE (mode)))
4374 && REG_P (index)
4375 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4376 {
4377 info->type = type;
4378 info->offset = index;
4379 info->shift = shift;
4380 return true;
4381 }
4382
4383 return false;
4384 }
4385
4386 /* Return true if MODE is one of the modes for which we
4387 support LDP/STP operations. */
4388
4389 static bool
4390 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4391 {
4392 return mode == SImode || mode == DImode
4393 || mode == SFmode || mode == DFmode
4394 || (aarch64_vector_mode_supported_p (mode)
4395 && GET_MODE_SIZE (mode) == 8);
4396 }
4397
4398 /* Return true if REGNO is a virtual pointer register, or an eliminable
4399 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4400 include stack_pointer or hard_frame_pointer. */
4401 static bool
4402 virt_or_elim_regno_p (unsigned regno)
4403 {
4404 return ((regno >= FIRST_VIRTUAL_REGISTER
4405 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4406 || regno == FRAME_POINTER_REGNUM
4407 || regno == ARG_POINTER_REGNUM);
4408 }
4409
4410 /* Return true if X is a valid address for machine mode MODE. If it is,
4411 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4412 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4413
4414 static bool
4415 aarch64_classify_address (struct aarch64_address_info *info,
4416 rtx x, machine_mode mode,
4417 RTX_CODE outer_code, bool strict_p)
4418 {
4419 enum rtx_code code = GET_CODE (x);
4420 rtx op0, op1;
4421
4422 /* On BE, we use load/store pair for all large int mode load/stores.
4423 TI/TFmode may also use a load/store pair. */
4424 bool load_store_pair_p = (outer_code == PARALLEL
4425 || mode == TImode
4426 || mode == TFmode
4427 || (BYTES_BIG_ENDIAN
4428 && aarch64_vect_struct_mode_p (mode)));
4429
4430 bool allow_reg_index_p =
4431 !load_store_pair_p
4432 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4433 && !aarch64_vect_struct_mode_p (mode);
4434
4435 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4436 REG addressing. */
4437 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4438 && (code != POST_INC && code != REG))
4439 return false;
4440
4441 switch (code)
4442 {
4443 case REG:
4444 case SUBREG:
4445 info->type = ADDRESS_REG_IMM;
4446 info->base = x;
4447 info->offset = const0_rtx;
4448 return aarch64_base_register_rtx_p (x, strict_p);
4449
4450 case PLUS:
4451 op0 = XEXP (x, 0);
4452 op1 = XEXP (x, 1);
4453
4454 if (! strict_p
4455 && REG_P (op0)
4456 && virt_or_elim_regno_p (REGNO (op0))
4457 && CONST_INT_P (op1))
4458 {
4459 info->type = ADDRESS_REG_IMM;
4460 info->base = op0;
4461 info->offset = op1;
4462
4463 return true;
4464 }
4465
4466 if (GET_MODE_SIZE (mode) != 0
4467 && CONST_INT_P (op1)
4468 && aarch64_base_register_rtx_p (op0, strict_p))
4469 {
4470 HOST_WIDE_INT offset = INTVAL (op1);
4471
4472 info->type = ADDRESS_REG_IMM;
4473 info->base = op0;
4474 info->offset = op1;
4475
4476 /* TImode and TFmode values are allowed in both pairs of X
4477 registers and individual Q registers. The available
4478 address modes are:
4479 X,X: 7-bit signed scaled offset
4480 Q: 9-bit signed offset
4481 We conservatively require an offset representable in either mode.
4482 When performing the check for pairs of X registers i.e. LDP/STP
4483 pass down DImode since that is the natural size of the LDP/STP
4484 instruction memory accesses. */
4485 if (mode == TImode || mode == TFmode)
4486 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4487 && (offset_9bit_signed_unscaled_p (mode, offset)
4488 || offset_12bit_unsigned_scaled_p (mode, offset)));
4489
4490 /* A 7bit offset check because OImode will emit a ldp/stp
4491 instruction (only big endian will get here).
4492 For ldp/stp instructions, the offset is scaled for the size of a
4493 single element of the pair. */
4494 if (mode == OImode)
4495 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4496
4497 /* Three 9/12 bit offsets checks because CImode will emit three
4498 ldr/str instructions (only big endian will get here). */
4499 if (mode == CImode)
4500 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4501 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4502 || offset_12bit_unsigned_scaled_p (V16QImode,
4503 offset + 32)));
4504
4505 /* Two 7bit offsets checks because XImode will emit two ldp/stp
4506 instructions (only big endian will get here). */
4507 if (mode == XImode)
4508 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4509 && aarch64_offset_7bit_signed_scaled_p (TImode,
4510 offset + 32));
4511
4512 if (load_store_pair_p)
4513 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4514 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4515 else
4516 return (offset_9bit_signed_unscaled_p (mode, offset)
4517 || offset_12bit_unsigned_scaled_p (mode, offset));
4518 }
4519
4520 if (allow_reg_index_p)
4521 {
4522 /* Look for base + (scaled/extended) index register. */
4523 if (aarch64_base_register_rtx_p (op0, strict_p)
4524 && aarch64_classify_index (info, op1, mode, strict_p))
4525 {
4526 info->base = op0;
4527 return true;
4528 }
4529 if (aarch64_base_register_rtx_p (op1, strict_p)
4530 && aarch64_classify_index (info, op0, mode, strict_p))
4531 {
4532 info->base = op1;
4533 return true;
4534 }
4535 }
4536
4537 return false;
4538
4539 case POST_INC:
4540 case POST_DEC:
4541 case PRE_INC:
4542 case PRE_DEC:
4543 info->type = ADDRESS_REG_WB;
4544 info->base = XEXP (x, 0);
4545 info->offset = NULL_RTX;
4546 return aarch64_base_register_rtx_p (info->base, strict_p);
4547
4548 case POST_MODIFY:
4549 case PRE_MODIFY:
4550 info->type = ADDRESS_REG_WB;
4551 info->base = XEXP (x, 0);
4552 if (GET_CODE (XEXP (x, 1)) == PLUS
4553 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4554 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4555 && aarch64_base_register_rtx_p (info->base, strict_p))
4556 {
4557 HOST_WIDE_INT offset;
4558 info->offset = XEXP (XEXP (x, 1), 1);
4559 offset = INTVAL (info->offset);
4560
4561 /* TImode and TFmode values are allowed in both pairs of X
4562 registers and individual Q registers. The available
4563 address modes are:
4564 X,X: 7-bit signed scaled offset
4565 Q: 9-bit signed offset
4566 We conservatively require an offset representable in either mode.
4567 */
4568 if (mode == TImode || mode == TFmode)
4569 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4570 && offset_9bit_signed_unscaled_p (mode, offset));
4571
4572 if (load_store_pair_p)
4573 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4574 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4575 else
4576 return offset_9bit_signed_unscaled_p (mode, offset);
4577 }
4578 return false;
4579
4580 case CONST:
4581 case SYMBOL_REF:
4582 case LABEL_REF:
4583 /* load literal: pc-relative constant pool entry. Only supported
4584 for SI mode or larger. */
4585 info->type = ADDRESS_SYMBOLIC;
4586
4587 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4588 {
4589 rtx sym, addend;
4590
4591 split_const (x, &sym, &addend);
4592 return ((GET_CODE (sym) == LABEL_REF
4593 || (GET_CODE (sym) == SYMBOL_REF
4594 && CONSTANT_POOL_ADDRESS_P (sym)
4595 && aarch64_pcrelative_literal_loads)));
4596 }
4597 return false;
4598
4599 case LO_SUM:
4600 info->type = ADDRESS_LO_SUM;
4601 info->base = XEXP (x, 0);
4602 info->offset = XEXP (x, 1);
4603 if (allow_reg_index_p
4604 && aarch64_base_register_rtx_p (info->base, strict_p))
4605 {
4606 rtx sym, offs;
4607 split_const (info->offset, &sym, &offs);
4608 if (GET_CODE (sym) == SYMBOL_REF
4609 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4610 {
4611 /* The symbol and offset must be aligned to the access size. */
4612 unsigned int align;
4613 unsigned int ref_size;
4614
4615 if (CONSTANT_POOL_ADDRESS_P (sym))
4616 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4617 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4618 {
4619 tree exp = SYMBOL_REF_DECL (sym);
4620 align = TYPE_ALIGN (TREE_TYPE (exp));
4621 align = CONSTANT_ALIGNMENT (exp, align);
4622 }
4623 else if (SYMBOL_REF_DECL (sym))
4624 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4625 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4626 && SYMBOL_REF_BLOCK (sym) != NULL)
4627 align = SYMBOL_REF_BLOCK (sym)->alignment;
4628 else
4629 align = BITS_PER_UNIT;
4630
4631 ref_size = GET_MODE_SIZE (mode);
4632 if (ref_size == 0)
4633 ref_size = GET_MODE_SIZE (DImode);
4634
4635 return ((INTVAL (offs) & (ref_size - 1)) == 0
4636 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4637 }
4638 }
4639 return false;
4640
4641 default:
4642 return false;
4643 }
4644 }
4645
4646 /* Return true if the address X is valid for a PRFM instruction.
4647 STRICT_P is true if we should do strict checking with
4648 aarch64_classify_address. */
4649
4650 bool
4651 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4652 {
4653 struct aarch64_address_info addr;
4654
4655 /* PRFM accepts the same addresses as DImode... */
4656 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4657 if (!res)
4658 return false;
4659
4660 /* ... except writeback forms. */
4661 return addr.type != ADDRESS_REG_WB;
4662 }
4663
4664 bool
4665 aarch64_symbolic_address_p (rtx x)
4666 {
4667 rtx offset;
4668
4669 split_const (x, &x, &offset);
4670 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4671 }
4672
4673 /* Classify the base of symbolic expression X. */
4674
4675 enum aarch64_symbol_type
4676 aarch64_classify_symbolic_expression (rtx x)
4677 {
4678 rtx offset;
4679
4680 split_const (x, &x, &offset);
4681 return aarch64_classify_symbol (x, offset);
4682 }
4683
4684
4685 /* Return TRUE if X is a legitimate address for accessing memory in
4686 mode MODE. */
4687 static bool
4688 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4689 {
4690 struct aarch64_address_info addr;
4691
4692 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4693 }
4694
4695 /* Return TRUE if X is a legitimate address for accessing memory in
4696 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4697 pair operation. */
4698 bool
4699 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4700 RTX_CODE outer_code, bool strict_p)
4701 {
4702 struct aarch64_address_info addr;
4703
4704 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4705 }
4706
4707 /* Split an out-of-range address displacement into a base and offset.
4708 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4709 to increase opportunities for sharing the base address of different sizes.
4710 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
4711 static bool
4712 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4713 {
4714 HOST_WIDE_INT offset = INTVAL (*disp);
4715 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4716
4717 if (mode == TImode || mode == TFmode
4718 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4719 base = (offset + 0x100) & ~0x1ff;
4720
4721 *off = GEN_INT (base);
4722 *disp = GEN_INT (offset - base);
4723 return true;
4724 }
4725
4726 /* Return TRUE if rtx X is immediate constant 0.0 */
4727 bool
4728 aarch64_float_const_zero_rtx_p (rtx x)
4729 {
4730 if (GET_MODE (x) == VOIDmode)
4731 return false;
4732
4733 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4734 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4735 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4736 }
4737
4738 /* Return the fixed registers used for condition codes. */
4739
4740 static bool
4741 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4742 {
4743 *p1 = CC_REGNUM;
4744 *p2 = INVALID_REGNUM;
4745 return true;
4746 }
4747
4748 /* This function is used by the call expanders of the machine description.
4749 RESULT is the register in which the result is returned. It's NULL for
4750 "call" and "sibcall".
4751 MEM is the location of the function call.
4752 SIBCALL indicates whether this function call is normal call or sibling call.
4753 It will generate different pattern accordingly. */
4754
4755 void
4756 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4757 {
4758 rtx call, callee, tmp;
4759 rtvec vec;
4760 machine_mode mode;
4761
4762 gcc_assert (MEM_P (mem));
4763 callee = XEXP (mem, 0);
4764 mode = GET_MODE (callee);
4765 gcc_assert (mode == Pmode);
4766
4767 /* Decide if we should generate indirect calls by loading the
4768 address of the callee into a register before performing
4769 the branch-and-link. */
4770 if (SYMBOL_REF_P (callee)
4771 ? (aarch64_is_long_call_p (callee)
4772 || aarch64_is_noplt_call_p (callee))
4773 : !REG_P (callee))
4774 XEXP (mem, 0) = force_reg (mode, callee);
4775
4776 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4777
4778 if (result != NULL_RTX)
4779 call = gen_rtx_SET (result, call);
4780
4781 if (sibcall)
4782 tmp = ret_rtx;
4783 else
4784 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4785
4786 vec = gen_rtvec (2, call, tmp);
4787 call = gen_rtx_PARALLEL (VOIDmode, vec);
4788
4789 aarch64_emit_call_insn (call);
4790 }
4791
4792 /* Emit call insn with PAT and do aarch64-specific handling. */
4793
4794 void
4795 aarch64_emit_call_insn (rtx pat)
4796 {
4797 rtx insn = emit_call_insn (pat);
4798
4799 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4800 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4801 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4802 }
4803
4804 machine_mode
4805 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4806 {
4807 /* All floating point compares return CCFP if it is an equality
4808 comparison, and CCFPE otherwise. */
4809 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4810 {
4811 switch (code)
4812 {
4813 case EQ:
4814 case NE:
4815 case UNORDERED:
4816 case ORDERED:
4817 case UNLT:
4818 case UNLE:
4819 case UNGT:
4820 case UNGE:
4821 case UNEQ:
4822 case LTGT:
4823 return CCFPmode;
4824
4825 case LT:
4826 case LE:
4827 case GT:
4828 case GE:
4829 return CCFPEmode;
4830
4831 default:
4832 gcc_unreachable ();
4833 }
4834 }
4835
4836 /* Equality comparisons of short modes against zero can be performed
4837 using the TST instruction with the appropriate bitmask. */
4838 if (y == const0_rtx && REG_P (x)
4839 && (code == EQ || code == NE)
4840 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4841 return CC_NZmode;
4842
4843 /* Similarly, comparisons of zero_extends from shorter modes can
4844 be performed using an ANDS with an immediate mask. */
4845 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4846 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4847 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4848 && (code == EQ || code == NE))
4849 return CC_NZmode;
4850
4851 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4852 && y == const0_rtx
4853 && (code == EQ || code == NE || code == LT || code == GE)
4854 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4855 || GET_CODE (x) == NEG
4856 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4857 && CONST_INT_P (XEXP (x, 2)))))
4858 return CC_NZmode;
4859
4860 /* A compare with a shifted operand. Because of canonicalization,
4861 the comparison will have to be swapped when we emit the assembly
4862 code. */
4863 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4864 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4865 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4866 || GET_CODE (x) == LSHIFTRT
4867 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4868 return CC_SWPmode;
4869
4870 /* Similarly for a negated operand, but we can only do this for
4871 equalities. */
4872 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4873 && (REG_P (y) || GET_CODE (y) == SUBREG)
4874 && (code == EQ || code == NE)
4875 && GET_CODE (x) == NEG)
4876 return CC_Zmode;
4877
4878 /* A test for unsigned overflow. */
4879 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4880 && code == NE
4881 && GET_CODE (x) == PLUS
4882 && GET_CODE (y) == ZERO_EXTEND)
4883 return CC_Cmode;
4884
4885 /* For everything else, return CCmode. */
4886 return CCmode;
4887 }
4888
4889 static int
4890 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4891
4892 int
4893 aarch64_get_condition_code (rtx x)
4894 {
4895 machine_mode mode = GET_MODE (XEXP (x, 0));
4896 enum rtx_code comp_code = GET_CODE (x);
4897
4898 if (GET_MODE_CLASS (mode) != MODE_CC)
4899 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4900 return aarch64_get_condition_code_1 (mode, comp_code);
4901 }
4902
4903 static int
4904 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4905 {
4906 switch (mode)
4907 {
4908 case CCFPmode:
4909 case CCFPEmode:
4910 switch (comp_code)
4911 {
4912 case GE: return AARCH64_GE;
4913 case GT: return AARCH64_GT;
4914 case LE: return AARCH64_LS;
4915 case LT: return AARCH64_MI;
4916 case NE: return AARCH64_NE;
4917 case EQ: return AARCH64_EQ;
4918 case ORDERED: return AARCH64_VC;
4919 case UNORDERED: return AARCH64_VS;
4920 case UNLT: return AARCH64_LT;
4921 case UNLE: return AARCH64_LE;
4922 case UNGT: return AARCH64_HI;
4923 case UNGE: return AARCH64_PL;
4924 default: return -1;
4925 }
4926 break;
4927
4928 case CCmode:
4929 switch (comp_code)
4930 {
4931 case NE: return AARCH64_NE;
4932 case EQ: return AARCH64_EQ;
4933 case GE: return AARCH64_GE;
4934 case GT: return AARCH64_GT;
4935 case LE: return AARCH64_LE;
4936 case LT: return AARCH64_LT;
4937 case GEU: return AARCH64_CS;
4938 case GTU: return AARCH64_HI;
4939 case LEU: return AARCH64_LS;
4940 case LTU: return AARCH64_CC;
4941 default: return -1;
4942 }
4943 break;
4944
4945 case CC_SWPmode:
4946 switch (comp_code)
4947 {
4948 case NE: return AARCH64_NE;
4949 case EQ: return AARCH64_EQ;
4950 case GE: return AARCH64_LE;
4951 case GT: return AARCH64_LT;
4952 case LE: return AARCH64_GE;
4953 case LT: return AARCH64_GT;
4954 case GEU: return AARCH64_LS;
4955 case GTU: return AARCH64_CC;
4956 case LEU: return AARCH64_CS;
4957 case LTU: return AARCH64_HI;
4958 default: return -1;
4959 }
4960 break;
4961
4962 case CC_NZmode:
4963 switch (comp_code)
4964 {
4965 case NE: return AARCH64_NE;
4966 case EQ: return AARCH64_EQ;
4967 case GE: return AARCH64_PL;
4968 case LT: return AARCH64_MI;
4969 default: return -1;
4970 }
4971 break;
4972
4973 case CC_Zmode:
4974 switch (comp_code)
4975 {
4976 case NE: return AARCH64_NE;
4977 case EQ: return AARCH64_EQ;
4978 default: return -1;
4979 }
4980 break;
4981
4982 case CC_Cmode:
4983 switch (comp_code)
4984 {
4985 case NE: return AARCH64_CS;
4986 case EQ: return AARCH64_CC;
4987 default: return -1;
4988 }
4989 break;
4990
4991 default:
4992 return -1;
4993 }
4994
4995 return -1;
4996 }
4997
4998 bool
4999 aarch64_const_vec_all_same_in_range_p (rtx x,
5000 HOST_WIDE_INT minval,
5001 HOST_WIDE_INT maxval)
5002 {
5003 HOST_WIDE_INT firstval;
5004 int count, i;
5005
5006 if (GET_CODE (x) != CONST_VECTOR
5007 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5008 return false;
5009
5010 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5011 if (firstval < minval || firstval > maxval)
5012 return false;
5013
5014 count = CONST_VECTOR_NUNITS (x);
5015 for (i = 1; i < count; i++)
5016 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5017 return false;
5018
5019 return true;
5020 }
5021
5022 bool
5023 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5024 {
5025 return aarch64_const_vec_all_same_in_range_p (x, val, val);
5026 }
5027
5028
5029 /* N Z C V. */
5030 #define AARCH64_CC_V 1
5031 #define AARCH64_CC_C (1 << 1)
5032 #define AARCH64_CC_Z (1 << 2)
5033 #define AARCH64_CC_N (1 << 3)
5034
5035 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
5036 static const int aarch64_nzcv_codes[] =
5037 {
5038 0, /* EQ, Z == 1. */
5039 AARCH64_CC_Z, /* NE, Z == 0. */
5040 0, /* CS, C == 1. */
5041 AARCH64_CC_C, /* CC, C == 0. */
5042 0, /* MI, N == 1. */
5043 AARCH64_CC_N, /* PL, N == 0. */
5044 0, /* VS, V == 1. */
5045 AARCH64_CC_V, /* VC, V == 0. */
5046 0, /* HI, C ==1 && Z == 0. */
5047 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
5048 AARCH64_CC_V, /* GE, N == V. */
5049 0, /* LT, N != V. */
5050 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
5051 0, /* LE, !(Z == 0 && N == V). */
5052 0, /* AL, Any. */
5053 0 /* NV, Any. */
5054 };
5055
5056 static void
5057 aarch64_print_operand (FILE *f, rtx x, int code)
5058 {
5059 switch (code)
5060 {
5061 /* An integer or symbol address without a preceding # sign. */
5062 case 'c':
5063 switch (GET_CODE (x))
5064 {
5065 case CONST_INT:
5066 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5067 break;
5068
5069 case SYMBOL_REF:
5070 output_addr_const (f, x);
5071 break;
5072
5073 case CONST:
5074 if (GET_CODE (XEXP (x, 0)) == PLUS
5075 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5076 {
5077 output_addr_const (f, x);
5078 break;
5079 }
5080 /* Fall through. */
5081
5082 default:
5083 output_operand_lossage ("Unsupported operand for code '%c'", code);
5084 }
5085 break;
5086
5087 case 'e':
5088 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
5089 {
5090 int n;
5091
5092 if (!CONST_INT_P (x)
5093 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5094 {
5095 output_operand_lossage ("invalid operand for '%%%c'", code);
5096 return;
5097 }
5098
5099 switch (n)
5100 {
5101 case 3:
5102 fputc ('b', f);
5103 break;
5104 case 4:
5105 fputc ('h', f);
5106 break;
5107 case 5:
5108 fputc ('w', f);
5109 break;
5110 default:
5111 output_operand_lossage ("invalid operand for '%%%c'", code);
5112 return;
5113 }
5114 }
5115 break;
5116
5117 case 'p':
5118 {
5119 int n;
5120
5121 /* Print N such that 2^N == X. */
5122 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5123 {
5124 output_operand_lossage ("invalid operand for '%%%c'", code);
5125 return;
5126 }
5127
5128 asm_fprintf (f, "%d", n);
5129 }
5130 break;
5131
5132 case 'P':
5133 /* Print the number of non-zero bits in X (a const_int). */
5134 if (!CONST_INT_P (x))
5135 {
5136 output_operand_lossage ("invalid operand for '%%%c'", code);
5137 return;
5138 }
5139
5140 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5141 break;
5142
5143 case 'H':
5144 /* Print the higher numbered register of a pair (TImode) of regs. */
5145 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5146 {
5147 output_operand_lossage ("invalid operand for '%%%c'", code);
5148 return;
5149 }
5150
5151 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5152 break;
5153
5154 case 'M':
5155 case 'm':
5156 {
5157 int cond_code;
5158 /* Print a condition (eq, ne, etc) or its inverse. */
5159
5160 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5161 if (x == const_true_rtx)
5162 {
5163 if (code == 'M')
5164 fputs ("nv", f);
5165 return;
5166 }
5167
5168 if (!COMPARISON_P (x))
5169 {
5170 output_operand_lossage ("invalid operand for '%%%c'", code);
5171 return;
5172 }
5173
5174 cond_code = aarch64_get_condition_code (x);
5175 gcc_assert (cond_code >= 0);
5176 if (code == 'M')
5177 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5178 fputs (aarch64_condition_codes[cond_code], f);
5179 }
5180 break;
5181
5182 case 'b':
5183 case 'h':
5184 case 's':
5185 case 'd':
5186 case 'q':
5187 /* Print a scalar FP/SIMD register name. */
5188 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5189 {
5190 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5191 return;
5192 }
5193 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5194 break;
5195
5196 case 'S':
5197 case 'T':
5198 case 'U':
5199 case 'V':
5200 /* Print the first FP/SIMD register name in a list. */
5201 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5202 {
5203 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5204 return;
5205 }
5206 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5207 break;
5208
5209 case 'R':
5210 /* Print a scalar FP/SIMD register name + 1. */
5211 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5212 {
5213 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5214 return;
5215 }
5216 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5217 break;
5218
5219 case 'X':
5220 /* Print bottom 16 bits of integer constant in hex. */
5221 if (!CONST_INT_P (x))
5222 {
5223 output_operand_lossage ("invalid operand for '%%%c'", code);
5224 return;
5225 }
5226 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5227 break;
5228
5229 case 'w':
5230 case 'x':
5231 /* Print a general register name or the zero register (32-bit or
5232 64-bit). */
5233 if (x == const0_rtx
5234 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5235 {
5236 asm_fprintf (f, "%czr", code);
5237 break;
5238 }
5239
5240 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5241 {
5242 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5243 break;
5244 }
5245
5246 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5247 {
5248 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5249 break;
5250 }
5251
5252 /* Fall through */
5253
5254 case 0:
5255 /* Print a normal operand, if it's a general register, then we
5256 assume DImode. */
5257 if (x == NULL)
5258 {
5259 output_operand_lossage ("missing operand");
5260 return;
5261 }
5262
5263 switch (GET_CODE (x))
5264 {
5265 case REG:
5266 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5267 break;
5268
5269 case MEM:
5270 output_address (GET_MODE (x), XEXP (x, 0));
5271 /* Check all memory references are Pmode - even with ILP32. */
5272 gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5273 break;
5274
5275 case CONST:
5276 case LABEL_REF:
5277 case SYMBOL_REF:
5278 output_addr_const (asm_out_file, x);
5279 break;
5280
5281 case CONST_INT:
5282 asm_fprintf (f, "%wd", INTVAL (x));
5283 break;
5284
5285 case CONST_VECTOR:
5286 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5287 {
5288 gcc_assert (
5289 aarch64_const_vec_all_same_in_range_p (x,
5290 HOST_WIDE_INT_MIN,
5291 HOST_WIDE_INT_MAX));
5292 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5293 }
5294 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5295 {
5296 fputc ('0', f);
5297 }
5298 else
5299 gcc_unreachable ();
5300 break;
5301
5302 case CONST_DOUBLE:
5303 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5304 be getting CONST_DOUBLEs holding integers. */
5305 gcc_assert (GET_MODE (x) != VOIDmode);
5306 if (aarch64_float_const_zero_rtx_p (x))
5307 {
5308 fputc ('0', f);
5309 break;
5310 }
5311 else if (aarch64_float_const_representable_p (x))
5312 {
5313 #define buf_size 20
5314 char float_buf[buf_size] = {'\0'};
5315 real_to_decimal_for_mode (float_buf,
5316 CONST_DOUBLE_REAL_VALUE (x),
5317 buf_size, buf_size,
5318 1, GET_MODE (x));
5319 asm_fprintf (asm_out_file, "%s", float_buf);
5320 break;
5321 #undef buf_size
5322 }
5323 output_operand_lossage ("invalid constant");
5324 return;
5325 default:
5326 output_operand_lossage ("invalid operand");
5327 return;
5328 }
5329 break;
5330
5331 case 'A':
5332 if (GET_CODE (x) == HIGH)
5333 x = XEXP (x, 0);
5334
5335 switch (aarch64_classify_symbolic_expression (x))
5336 {
5337 case SYMBOL_SMALL_GOT_4G:
5338 asm_fprintf (asm_out_file, ":got:");
5339 break;
5340
5341 case SYMBOL_SMALL_TLSGD:
5342 asm_fprintf (asm_out_file, ":tlsgd:");
5343 break;
5344
5345 case SYMBOL_SMALL_TLSDESC:
5346 asm_fprintf (asm_out_file, ":tlsdesc:");
5347 break;
5348
5349 case SYMBOL_SMALL_TLSIE:
5350 asm_fprintf (asm_out_file, ":gottprel:");
5351 break;
5352
5353 case SYMBOL_TLSLE24:
5354 asm_fprintf (asm_out_file, ":tprel:");
5355 break;
5356
5357 case SYMBOL_TINY_GOT:
5358 gcc_unreachable ();
5359 break;
5360
5361 default:
5362 break;
5363 }
5364 output_addr_const (asm_out_file, x);
5365 break;
5366
5367 case 'L':
5368 switch (aarch64_classify_symbolic_expression (x))
5369 {
5370 case SYMBOL_SMALL_GOT_4G:
5371 asm_fprintf (asm_out_file, ":lo12:");
5372 break;
5373
5374 case SYMBOL_SMALL_TLSGD:
5375 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5376 break;
5377
5378 case SYMBOL_SMALL_TLSDESC:
5379 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5380 break;
5381
5382 case SYMBOL_SMALL_TLSIE:
5383 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5384 break;
5385
5386 case SYMBOL_TLSLE12:
5387 asm_fprintf (asm_out_file, ":tprel_lo12:");
5388 break;
5389
5390 case SYMBOL_TLSLE24:
5391 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5392 break;
5393
5394 case SYMBOL_TINY_GOT:
5395 asm_fprintf (asm_out_file, ":got:");
5396 break;
5397
5398 case SYMBOL_TINY_TLSIE:
5399 asm_fprintf (asm_out_file, ":gottprel:");
5400 break;
5401
5402 default:
5403 break;
5404 }
5405 output_addr_const (asm_out_file, x);
5406 break;
5407
5408 case 'G':
5409
5410 switch (aarch64_classify_symbolic_expression (x))
5411 {
5412 case SYMBOL_TLSLE24:
5413 asm_fprintf (asm_out_file, ":tprel_hi12:");
5414 break;
5415 default:
5416 break;
5417 }
5418 output_addr_const (asm_out_file, x);
5419 break;
5420
5421 case 'k':
5422 {
5423 HOST_WIDE_INT cond_code;
5424 /* Print nzcv. */
5425
5426 if (!CONST_INT_P (x))
5427 {
5428 output_operand_lossage ("invalid operand for '%%%c'", code);
5429 return;
5430 }
5431
5432 cond_code = INTVAL (x);
5433 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5434 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5435 }
5436 break;
5437
5438 default:
5439 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5440 return;
5441 }
5442 }
5443
5444 static void
5445 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5446 {
5447 struct aarch64_address_info addr;
5448
5449 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5450 switch (addr.type)
5451 {
5452 case ADDRESS_REG_IMM:
5453 if (addr.offset == const0_rtx)
5454 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5455 else
5456 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5457 INTVAL (addr.offset));
5458 return;
5459
5460 case ADDRESS_REG_REG:
5461 if (addr.shift == 0)
5462 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5463 reg_names [REGNO (addr.offset)]);
5464 else
5465 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5466 reg_names [REGNO (addr.offset)], addr.shift);
5467 return;
5468
5469 case ADDRESS_REG_UXTW:
5470 if (addr.shift == 0)
5471 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5472 REGNO (addr.offset) - R0_REGNUM);
5473 else
5474 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5475 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5476 return;
5477
5478 case ADDRESS_REG_SXTW:
5479 if (addr.shift == 0)
5480 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5481 REGNO (addr.offset) - R0_REGNUM);
5482 else
5483 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5484 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5485 return;
5486
5487 case ADDRESS_REG_WB:
5488 switch (GET_CODE (x))
5489 {
5490 case PRE_INC:
5491 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5492 GET_MODE_SIZE (mode));
5493 return;
5494 case POST_INC:
5495 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5496 GET_MODE_SIZE (mode));
5497 return;
5498 case PRE_DEC:
5499 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5500 GET_MODE_SIZE (mode));
5501 return;
5502 case POST_DEC:
5503 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5504 GET_MODE_SIZE (mode));
5505 return;
5506 case PRE_MODIFY:
5507 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5508 INTVAL (addr.offset));
5509 return;
5510 case POST_MODIFY:
5511 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5512 INTVAL (addr.offset));
5513 return;
5514 default:
5515 break;
5516 }
5517 break;
5518
5519 case ADDRESS_LO_SUM:
5520 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5521 output_addr_const (f, addr.offset);
5522 asm_fprintf (f, "]");
5523 return;
5524
5525 case ADDRESS_SYMBOLIC:
5526 break;
5527 }
5528
5529 output_addr_const (f, x);
5530 }
5531
5532 bool
5533 aarch64_label_mentioned_p (rtx x)
5534 {
5535 const char *fmt;
5536 int i;
5537
5538 if (GET_CODE (x) == LABEL_REF)
5539 return true;
5540
5541 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5542 referencing instruction, but they are constant offsets, not
5543 symbols. */
5544 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5545 return false;
5546
5547 fmt = GET_RTX_FORMAT (GET_CODE (x));
5548 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5549 {
5550 if (fmt[i] == 'E')
5551 {
5552 int j;
5553
5554 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5555 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5556 return 1;
5557 }
5558 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5559 return 1;
5560 }
5561
5562 return 0;
5563 }
5564
5565 /* Implement REGNO_REG_CLASS. */
5566
5567 enum reg_class
5568 aarch64_regno_regclass (unsigned regno)
5569 {
5570 if (GP_REGNUM_P (regno))
5571 return GENERAL_REGS;
5572
5573 if (regno == SP_REGNUM)
5574 return STACK_REG;
5575
5576 if (regno == FRAME_POINTER_REGNUM
5577 || regno == ARG_POINTER_REGNUM)
5578 return POINTER_REGS;
5579
5580 if (FP_REGNUM_P (regno))
5581 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5582
5583 return NO_REGS;
5584 }
5585
5586 static rtx
5587 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5588 {
5589 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5590 where mask is selected by alignment and size of the offset.
5591 We try to pick as large a range for the offset as possible to
5592 maximize the chance of a CSE. However, for aligned addresses
5593 we limit the range to 4k so that structures with different sized
5594 elements are likely to use the same base. We need to be careful
5595 not to split a CONST for some forms of address expression, otherwise
5596 it will generate sub-optimal code. */
5597
5598 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5599 {
5600 rtx base = XEXP (x, 0);
5601 rtx offset_rtx = XEXP (x, 1);
5602 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5603
5604 if (GET_CODE (base) == PLUS)
5605 {
5606 rtx op0 = XEXP (base, 0);
5607 rtx op1 = XEXP (base, 1);
5608
5609 /* Force any scaling into a temp for CSE. */
5610 op0 = force_reg (Pmode, op0);
5611 op1 = force_reg (Pmode, op1);
5612
5613 /* Let the pointer register be in op0. */
5614 if (REG_POINTER (op1))
5615 std::swap (op0, op1);
5616
5617 /* If the pointer is virtual or frame related, then we know that
5618 virtual register instantiation or register elimination is going
5619 to apply a second constant. We want the two constants folded
5620 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5621 if (virt_or_elim_regno_p (REGNO (op0)))
5622 {
5623 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5624 NULL_RTX, true, OPTAB_DIRECT);
5625 return gen_rtx_PLUS (Pmode, base, op1);
5626 }
5627
5628 /* Otherwise, in order to encourage CSE (and thence loop strength
5629 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5630 base = expand_binop (Pmode, add_optab, op0, op1,
5631 NULL_RTX, true, OPTAB_DIRECT);
5632 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5633 }
5634
5635 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5636 HOST_WIDE_INT base_offset;
5637 if (GET_MODE_SIZE (mode) > 16)
5638 base_offset = (offset + 0x400) & ~0x7f0;
5639 /* For offsets aren't a multiple of the access size, the limit is
5640 -256...255. */
5641 else if (offset & (GET_MODE_SIZE (mode) - 1))
5642 {
5643 base_offset = (offset + 0x100) & ~0x1ff;
5644
5645 /* BLKmode typically uses LDP of X-registers. */
5646 if (mode == BLKmode)
5647 base_offset = (offset + 512) & ~0x3ff;
5648 }
5649 /* Small negative offsets are supported. */
5650 else if (IN_RANGE (offset, -256, 0))
5651 base_offset = 0;
5652 else if (mode == TImode || mode == TFmode)
5653 base_offset = (offset + 0x100) & ~0x1ff;
5654 /* Use 12-bit offset by access size. */
5655 else
5656 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5657
5658 if (base_offset != 0)
5659 {
5660 base = plus_constant (Pmode, base, base_offset);
5661 base = force_operand (base, NULL_RTX);
5662 return plus_constant (Pmode, base, offset - base_offset);
5663 }
5664 }
5665
5666 return x;
5667 }
5668
5669 /* Return the reload icode required for a constant pool in mode. */
5670 static enum insn_code
5671 aarch64_constant_pool_reload_icode (machine_mode mode)
5672 {
5673 switch (mode)
5674 {
5675 case SFmode:
5676 return CODE_FOR_aarch64_reload_movcpsfdi;
5677
5678 case DFmode:
5679 return CODE_FOR_aarch64_reload_movcpdfdi;
5680
5681 case TFmode:
5682 return CODE_FOR_aarch64_reload_movcptfdi;
5683
5684 case V8QImode:
5685 return CODE_FOR_aarch64_reload_movcpv8qidi;
5686
5687 case V16QImode:
5688 return CODE_FOR_aarch64_reload_movcpv16qidi;
5689
5690 case V4HImode:
5691 return CODE_FOR_aarch64_reload_movcpv4hidi;
5692
5693 case V8HImode:
5694 return CODE_FOR_aarch64_reload_movcpv8hidi;
5695
5696 case V2SImode:
5697 return CODE_FOR_aarch64_reload_movcpv2sidi;
5698
5699 case V4SImode:
5700 return CODE_FOR_aarch64_reload_movcpv4sidi;
5701
5702 case V2DImode:
5703 return CODE_FOR_aarch64_reload_movcpv2didi;
5704
5705 case V2DFmode:
5706 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5707
5708 default:
5709 gcc_unreachable ();
5710 }
5711
5712 gcc_unreachable ();
5713 }
5714 static reg_class_t
5715 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5716 reg_class_t rclass,
5717 machine_mode mode,
5718 secondary_reload_info *sri)
5719 {
5720
5721 /* If we have to disable direct literal pool loads and stores because the
5722 function is too big, then we need a scratch register. */
5723 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5724 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5725 || targetm.vector_mode_supported_p (GET_MODE (x)))
5726 && !aarch64_pcrelative_literal_loads)
5727 {
5728 sri->icode = aarch64_constant_pool_reload_icode (mode);
5729 return NO_REGS;
5730 }
5731
5732 /* Without the TARGET_SIMD instructions we cannot move a Q register
5733 to a Q register directly. We need a scratch. */
5734 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5735 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5736 && reg_class_subset_p (rclass, FP_REGS))
5737 {
5738 if (mode == TFmode)
5739 sri->icode = CODE_FOR_aarch64_reload_movtf;
5740 else if (mode == TImode)
5741 sri->icode = CODE_FOR_aarch64_reload_movti;
5742 return NO_REGS;
5743 }
5744
5745 /* A TFmode or TImode memory access should be handled via an FP_REGS
5746 because AArch64 has richer addressing modes for LDR/STR instructions
5747 than LDP/STP instructions. */
5748 if (TARGET_FLOAT && rclass == GENERAL_REGS
5749 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5750 return FP_REGS;
5751
5752 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5753 return GENERAL_REGS;
5754
5755 return NO_REGS;
5756 }
5757
5758 static bool
5759 aarch64_can_eliminate (const int from, const int to)
5760 {
5761 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5762 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5763
5764 if (frame_pointer_needed)
5765 {
5766 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5767 return true;
5768 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5769 return false;
5770 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5771 && !cfun->calls_alloca)
5772 return true;
5773 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5774 return true;
5775
5776 return false;
5777 }
5778 else
5779 {
5780 /* If we decided that we didn't need a leaf frame pointer but then used
5781 LR in the function, then we'll want a frame pointer after all, so
5782 prevent this elimination to ensure a frame pointer is used. */
5783 if (to == STACK_POINTER_REGNUM
5784 && flag_omit_leaf_frame_pointer
5785 && df_regs_ever_live_p (LR_REGNUM))
5786 return false;
5787 }
5788
5789 return true;
5790 }
5791
5792 HOST_WIDE_INT
5793 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5794 {
5795 aarch64_layout_frame ();
5796
5797 if (to == HARD_FRAME_POINTER_REGNUM)
5798 {
5799 if (from == ARG_POINTER_REGNUM)
5800 return cfun->machine->frame.hard_fp_offset;
5801
5802 if (from == FRAME_POINTER_REGNUM)
5803 return cfun->machine->frame.hard_fp_offset
5804 - cfun->machine->frame.locals_offset;
5805 }
5806
5807 if (to == STACK_POINTER_REGNUM)
5808 {
5809 if (from == FRAME_POINTER_REGNUM)
5810 return cfun->machine->frame.frame_size
5811 - cfun->machine->frame.locals_offset;
5812 }
5813
5814 return cfun->machine->frame.frame_size;
5815 }
5816
5817 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5818 previous frame. */
5819
5820 rtx
5821 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5822 {
5823 if (count != 0)
5824 return const0_rtx;
5825 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5826 }
5827
5828
5829 static void
5830 aarch64_asm_trampoline_template (FILE *f)
5831 {
5832 if (TARGET_ILP32)
5833 {
5834 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5835 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5836 }
5837 else
5838 {
5839 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5840 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5841 }
5842 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5843 assemble_aligned_integer (4, const0_rtx);
5844 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5845 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5846 }
5847
5848 static void
5849 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5850 {
5851 rtx fnaddr, mem, a_tramp;
5852 const int tramp_code_sz = 16;
5853
5854 /* Don't need to copy the trailing D-words, we fill those in below. */
5855 emit_block_move (m_tramp, assemble_trampoline_template (),
5856 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5857 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5858 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5859 if (GET_MODE (fnaddr) != ptr_mode)
5860 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5861 emit_move_insn (mem, fnaddr);
5862
5863 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5864 emit_move_insn (mem, chain_value);
5865
5866 /* XXX We should really define a "clear_cache" pattern and use
5867 gen_clear_cache(). */
5868 a_tramp = XEXP (m_tramp, 0);
5869 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5870 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5871 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5872 ptr_mode);
5873 }
5874
5875 static unsigned char
5876 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5877 {
5878 switch (regclass)
5879 {
5880 case CALLER_SAVE_REGS:
5881 case POINTER_REGS:
5882 case GENERAL_REGS:
5883 case ALL_REGS:
5884 case FP_REGS:
5885 case FP_LO_REGS:
5886 return
5887 aarch64_vector_mode_p (mode)
5888 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5889 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5890 case STACK_REG:
5891 return 1;
5892
5893 case NO_REGS:
5894 return 0;
5895
5896 default:
5897 break;
5898 }
5899 gcc_unreachable ();
5900 }
5901
5902 static reg_class_t
5903 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5904 {
5905 if (regclass == POINTER_REGS)
5906 return GENERAL_REGS;
5907
5908 if (regclass == STACK_REG)
5909 {
5910 if (REG_P(x)
5911 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5912 return regclass;
5913
5914 return NO_REGS;
5915 }
5916
5917 /* If it's an integer immediate that MOVI can't handle, then
5918 FP_REGS is not an option, so we return NO_REGS instead. */
5919 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5920 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5921 return NO_REGS;
5922
5923 /* Register eliminiation can result in a request for
5924 SP+constant->FP_REGS. We cannot support such operations which
5925 use SP as source and an FP_REG as destination, so reject out
5926 right now. */
5927 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5928 {
5929 rtx lhs = XEXP (x, 0);
5930
5931 /* Look through a possible SUBREG introduced by ILP32. */
5932 if (GET_CODE (lhs) == SUBREG)
5933 lhs = SUBREG_REG (lhs);
5934
5935 gcc_assert (REG_P (lhs));
5936 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5937 POINTER_REGS));
5938 return NO_REGS;
5939 }
5940
5941 return regclass;
5942 }
5943
5944 void
5945 aarch64_asm_output_labelref (FILE* f, const char *name)
5946 {
5947 asm_fprintf (f, "%U%s", name);
5948 }
5949
5950 static void
5951 aarch64_elf_asm_constructor (rtx symbol, int priority)
5952 {
5953 if (priority == DEFAULT_INIT_PRIORITY)
5954 default_ctor_section_asm_out_constructor (symbol, priority);
5955 else
5956 {
5957 section *s;
5958 /* While priority is known to be in range [0, 65535], so 18 bytes
5959 would be enough, the compiler might not know that. To avoid
5960 -Wformat-truncation false positive, use a larger size. */
5961 char buf[23];
5962 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5963 s = get_section (buf, SECTION_WRITE, NULL);
5964 switch_to_section (s);
5965 assemble_align (POINTER_SIZE);
5966 assemble_aligned_integer (POINTER_BYTES, symbol);
5967 }
5968 }
5969
5970 static void
5971 aarch64_elf_asm_destructor (rtx symbol, int priority)
5972 {
5973 if (priority == DEFAULT_INIT_PRIORITY)
5974 default_dtor_section_asm_out_destructor (symbol, priority);
5975 else
5976 {
5977 section *s;
5978 /* While priority is known to be in range [0, 65535], so 18 bytes
5979 would be enough, the compiler might not know that. To avoid
5980 -Wformat-truncation false positive, use a larger size. */
5981 char buf[23];
5982 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5983 s = get_section (buf, SECTION_WRITE, NULL);
5984 switch_to_section (s);
5985 assemble_align (POINTER_SIZE);
5986 assemble_aligned_integer (POINTER_BYTES, symbol);
5987 }
5988 }
5989
5990 const char*
5991 aarch64_output_casesi (rtx *operands)
5992 {
5993 char buf[100];
5994 char label[100];
5995 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5996 int index;
5997 static const char *const patterns[4][2] =
5998 {
5999 {
6000 "ldrb\t%w3, [%0,%w1,uxtw]",
6001 "add\t%3, %4, %w3, sxtb #2"
6002 },
6003 {
6004 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6005 "add\t%3, %4, %w3, sxth #2"
6006 },
6007 {
6008 "ldr\t%w3, [%0,%w1,uxtw #2]",
6009 "add\t%3, %4, %w3, sxtw #2"
6010 },
6011 /* We assume that DImode is only generated when not optimizing and
6012 that we don't really need 64-bit address offsets. That would
6013 imply an object file with 8GB of code in a single function! */
6014 {
6015 "ldr\t%w3, [%0,%w1,uxtw #2]",
6016 "add\t%3, %4, %w3, sxtw #2"
6017 }
6018 };
6019
6020 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6021
6022 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
6023
6024 gcc_assert (index >= 0 && index <= 3);
6025
6026 /* Need to implement table size reduction, by chaning the code below. */
6027 output_asm_insn (patterns[index][0], operands);
6028 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6029 snprintf (buf, sizeof (buf),
6030 "adr\t%%4, %s", targetm.strip_name_encoding (label));
6031 output_asm_insn (buf, operands);
6032 output_asm_insn (patterns[index][1], operands);
6033 output_asm_insn ("br\t%3", operands);
6034 assemble_label (asm_out_file, label);
6035 return "";
6036 }
6037
6038
6039 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6040 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6041 operator. */
6042
6043 int
6044 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6045 {
6046 if (shift >= 0 && shift <= 3)
6047 {
6048 int size;
6049 for (size = 8; size <= 32; size *= 2)
6050 {
6051 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6052 if (mask == bits << shift)
6053 return size;
6054 }
6055 }
6056 return 0;
6057 }
6058
6059 /* Constant pools are per function only when PC relative
6060 literal loads are true or we are in the large memory
6061 model. */
6062
6063 static inline bool
6064 aarch64_can_use_per_function_literal_pools_p (void)
6065 {
6066 return (aarch64_pcrelative_literal_loads
6067 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6068 }
6069
6070 static bool
6071 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6072 {
6073 /* Fixme:: In an ideal world this would work similar
6074 to the logic in aarch64_select_rtx_section but this
6075 breaks bootstrap in gcc go. For now we workaround
6076 this by returning false here. */
6077 return false;
6078 }
6079
6080 /* Select appropriate section for constants depending
6081 on where we place literal pools. */
6082
6083 static section *
6084 aarch64_select_rtx_section (machine_mode mode,
6085 rtx x,
6086 unsigned HOST_WIDE_INT align)
6087 {
6088 if (aarch64_can_use_per_function_literal_pools_p ())
6089 return function_section (current_function_decl);
6090
6091 return default_elf_select_rtx_section (mode, x, align);
6092 }
6093
6094 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6095 void
6096 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6097 HOST_WIDE_INT offset)
6098 {
6099 /* When using per-function literal pools, we must ensure that any code
6100 section is aligned to the minimal instruction length, lest we get
6101 errors from the assembler re "unaligned instructions". */
6102 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6103 ASM_OUTPUT_ALIGN (f, 2);
6104 }
6105
6106 /* Costs. */
6107
6108 /* Helper function for rtx cost calculation. Strip a shift expression
6109 from X. Returns the inner operand if successful, or the original
6110 expression on failure. */
6111 static rtx
6112 aarch64_strip_shift (rtx x)
6113 {
6114 rtx op = x;
6115
6116 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6117 we can convert both to ROR during final output. */
6118 if ((GET_CODE (op) == ASHIFT
6119 || GET_CODE (op) == ASHIFTRT
6120 || GET_CODE (op) == LSHIFTRT
6121 || GET_CODE (op) == ROTATERT
6122 || GET_CODE (op) == ROTATE)
6123 && CONST_INT_P (XEXP (op, 1)))
6124 return XEXP (op, 0);
6125
6126 if (GET_CODE (op) == MULT
6127 && CONST_INT_P (XEXP (op, 1))
6128 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6129 return XEXP (op, 0);
6130
6131 return x;
6132 }
6133
6134 /* Helper function for rtx cost calculation. Strip an extend
6135 expression from X. Returns the inner operand if successful, or the
6136 original expression on failure. We deal with a number of possible
6137 canonicalization variations here. If STRIP_SHIFT is true, then
6138 we can strip off a shift also. */
6139 static rtx
6140 aarch64_strip_extend (rtx x, bool strip_shift)
6141 {
6142 rtx op = x;
6143
6144 /* Zero and sign extraction of a widened value. */
6145 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6146 && XEXP (op, 2) == const0_rtx
6147 && GET_CODE (XEXP (op, 0)) == MULT
6148 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
6149 XEXP (op, 1)))
6150 return XEXP (XEXP (op, 0), 0);
6151
6152 /* It can also be represented (for zero-extend) as an AND with an
6153 immediate. */
6154 if (GET_CODE (op) == AND
6155 && GET_CODE (XEXP (op, 0)) == MULT
6156 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6157 && CONST_INT_P (XEXP (op, 1))
6158 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6159 INTVAL (XEXP (op, 1))) != 0)
6160 return XEXP (XEXP (op, 0), 0);
6161
6162 /* Now handle extended register, as this may also have an optional
6163 left shift by 1..4. */
6164 if (strip_shift
6165 && GET_CODE (op) == ASHIFT
6166 && CONST_INT_P (XEXP (op, 1))
6167 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6168 op = XEXP (op, 0);
6169
6170 if (GET_CODE (op) == ZERO_EXTEND
6171 || GET_CODE (op) == SIGN_EXTEND)
6172 op = XEXP (op, 0);
6173
6174 if (op != x)
6175 return op;
6176
6177 return x;
6178 }
6179
6180 /* Return true iff CODE is a shift supported in combination
6181 with arithmetic instructions. */
6182
6183 static bool
6184 aarch64_shift_p (enum rtx_code code)
6185 {
6186 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6187 }
6188
6189
6190 /* Return true iff X is a cheap shift without a sign extend. */
6191
6192 static bool
6193 aarch64_cheap_mult_shift_p (rtx x)
6194 {
6195 rtx op0, op1;
6196
6197 op0 = XEXP (x, 0);
6198 op1 = XEXP (x, 1);
6199
6200 if (!(aarch64_tune_params.extra_tuning_flags
6201 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6202 return false;
6203
6204 if (GET_CODE (op0) == SIGN_EXTEND)
6205 return false;
6206
6207 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6208 && UINTVAL (op1) <= 4)
6209 return true;
6210
6211 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6212 return false;
6213
6214 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6215
6216 if (l2 > 0 && l2 <= 4)
6217 return true;
6218
6219 return false;
6220 }
6221
6222 /* Helper function for rtx cost calculation. Calculate the cost of
6223 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6224 Return the calculated cost of the expression, recursing manually in to
6225 operands where needed. */
6226
6227 static int
6228 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6229 {
6230 rtx op0, op1;
6231 const struct cpu_cost_table *extra_cost
6232 = aarch64_tune_params.insn_extra_cost;
6233 int cost = 0;
6234 bool compound_p = (outer == PLUS || outer == MINUS);
6235 machine_mode mode = GET_MODE (x);
6236
6237 gcc_checking_assert (code == MULT);
6238
6239 op0 = XEXP (x, 0);
6240 op1 = XEXP (x, 1);
6241
6242 if (VECTOR_MODE_P (mode))
6243 mode = GET_MODE_INNER (mode);
6244
6245 /* Integer multiply/fma. */
6246 if (GET_MODE_CLASS (mode) == MODE_INT)
6247 {
6248 /* The multiply will be canonicalized as a shift, cost it as such. */
6249 if (aarch64_shift_p (GET_CODE (x))
6250 || (CONST_INT_P (op1)
6251 && exact_log2 (INTVAL (op1)) > 0))
6252 {
6253 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6254 || GET_CODE (op0) == SIGN_EXTEND;
6255 if (speed)
6256 {
6257 if (compound_p)
6258 {
6259 /* If the shift is considered cheap,
6260 then don't add any cost. */
6261 if (aarch64_cheap_mult_shift_p (x))
6262 ;
6263 else if (REG_P (op1))
6264 /* ARITH + shift-by-register. */
6265 cost += extra_cost->alu.arith_shift_reg;
6266 else if (is_extend)
6267 /* ARITH + extended register. We don't have a cost field
6268 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6269 cost += extra_cost->alu.extend_arith;
6270 else
6271 /* ARITH + shift-by-immediate. */
6272 cost += extra_cost->alu.arith_shift;
6273 }
6274 else
6275 /* LSL (immediate). */
6276 cost += extra_cost->alu.shift;
6277
6278 }
6279 /* Strip extends as we will have costed them in the case above. */
6280 if (is_extend)
6281 op0 = aarch64_strip_extend (op0, true);
6282
6283 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6284
6285 return cost;
6286 }
6287
6288 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6289 compound and let the below cases handle it. After all, MNEG is a
6290 special-case alias of MSUB. */
6291 if (GET_CODE (op0) == NEG)
6292 {
6293 op0 = XEXP (op0, 0);
6294 compound_p = true;
6295 }
6296
6297 /* Integer multiplies or FMAs have zero/sign extending variants. */
6298 if ((GET_CODE (op0) == ZERO_EXTEND
6299 && GET_CODE (op1) == ZERO_EXTEND)
6300 || (GET_CODE (op0) == SIGN_EXTEND
6301 && GET_CODE (op1) == SIGN_EXTEND))
6302 {
6303 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6304 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6305
6306 if (speed)
6307 {
6308 if (compound_p)
6309 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6310 cost += extra_cost->mult[0].extend_add;
6311 else
6312 /* MUL/SMULL/UMULL. */
6313 cost += extra_cost->mult[0].extend;
6314 }
6315
6316 return cost;
6317 }
6318
6319 /* This is either an integer multiply or a MADD. In both cases
6320 we want to recurse and cost the operands. */
6321 cost += rtx_cost (op0, mode, MULT, 0, speed);
6322 cost += rtx_cost (op1, mode, MULT, 1, speed);
6323
6324 if (speed)
6325 {
6326 if (compound_p)
6327 /* MADD/MSUB. */
6328 cost += extra_cost->mult[mode == DImode].add;
6329 else
6330 /* MUL. */
6331 cost += extra_cost->mult[mode == DImode].simple;
6332 }
6333
6334 return cost;
6335 }
6336 else
6337 {
6338 if (speed)
6339 {
6340 /* Floating-point FMA/FMUL can also support negations of the
6341 operands, unless the rounding mode is upward or downward in
6342 which case FNMUL is different than FMUL with operand negation. */
6343 bool neg0 = GET_CODE (op0) == NEG;
6344 bool neg1 = GET_CODE (op1) == NEG;
6345 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6346 {
6347 if (neg0)
6348 op0 = XEXP (op0, 0);
6349 if (neg1)
6350 op1 = XEXP (op1, 0);
6351 }
6352
6353 if (compound_p)
6354 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6355 cost += extra_cost->fp[mode == DFmode].fma;
6356 else
6357 /* FMUL/FNMUL. */
6358 cost += extra_cost->fp[mode == DFmode].mult;
6359 }
6360
6361 cost += rtx_cost (op0, mode, MULT, 0, speed);
6362 cost += rtx_cost (op1, mode, MULT, 1, speed);
6363 return cost;
6364 }
6365 }
6366
6367 static int
6368 aarch64_address_cost (rtx x,
6369 machine_mode mode,
6370 addr_space_t as ATTRIBUTE_UNUSED,
6371 bool speed)
6372 {
6373 enum rtx_code c = GET_CODE (x);
6374 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6375 struct aarch64_address_info info;
6376 int cost = 0;
6377 info.shift = 0;
6378
6379 if (!aarch64_classify_address (&info, x, mode, c, false))
6380 {
6381 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6382 {
6383 /* This is a CONST or SYMBOL ref which will be split
6384 in a different way depending on the code model in use.
6385 Cost it through the generic infrastructure. */
6386 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6387 /* Divide through by the cost of one instruction to
6388 bring it to the same units as the address costs. */
6389 cost_symbol_ref /= COSTS_N_INSNS (1);
6390 /* The cost is then the cost of preparing the address,
6391 followed by an immediate (possibly 0) offset. */
6392 return cost_symbol_ref + addr_cost->imm_offset;
6393 }
6394 else
6395 {
6396 /* This is most likely a jump table from a case
6397 statement. */
6398 return addr_cost->register_offset;
6399 }
6400 }
6401
6402 switch (info.type)
6403 {
6404 case ADDRESS_LO_SUM:
6405 case ADDRESS_SYMBOLIC:
6406 case ADDRESS_REG_IMM:
6407 cost += addr_cost->imm_offset;
6408 break;
6409
6410 case ADDRESS_REG_WB:
6411 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6412 cost += addr_cost->pre_modify;
6413 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6414 cost += addr_cost->post_modify;
6415 else
6416 gcc_unreachable ();
6417
6418 break;
6419
6420 case ADDRESS_REG_REG:
6421 cost += addr_cost->register_offset;
6422 break;
6423
6424 case ADDRESS_REG_SXTW:
6425 cost += addr_cost->register_sextend;
6426 break;
6427
6428 case ADDRESS_REG_UXTW:
6429 cost += addr_cost->register_zextend;
6430 break;
6431
6432 default:
6433 gcc_unreachable ();
6434 }
6435
6436
6437 if (info.shift > 0)
6438 {
6439 /* For the sake of calculating the cost of the shifted register
6440 component, we can treat same sized modes in the same way. */
6441 switch (GET_MODE_BITSIZE (mode))
6442 {
6443 case 16:
6444 cost += addr_cost->addr_scale_costs.hi;
6445 break;
6446
6447 case 32:
6448 cost += addr_cost->addr_scale_costs.si;
6449 break;
6450
6451 case 64:
6452 cost += addr_cost->addr_scale_costs.di;
6453 break;
6454
6455 /* We can't tell, or this is a 128-bit vector. */
6456 default:
6457 cost += addr_cost->addr_scale_costs.ti;
6458 break;
6459 }
6460 }
6461
6462 return cost;
6463 }
6464
6465 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6466 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6467 to be taken. */
6468
6469 int
6470 aarch64_branch_cost (bool speed_p, bool predictable_p)
6471 {
6472 /* When optimizing for speed, use the cost of unpredictable branches. */
6473 const struct cpu_branch_cost *branch_costs =
6474 aarch64_tune_params.branch_costs;
6475
6476 if (!speed_p || predictable_p)
6477 return branch_costs->predictable;
6478 else
6479 return branch_costs->unpredictable;
6480 }
6481
6482 /* Return true if the RTX X in mode MODE is a zero or sign extract
6483 usable in an ADD or SUB (extended register) instruction. */
6484 static bool
6485 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6486 {
6487 /* Catch add with a sign extract.
6488 This is add_<optab><mode>_multp2. */
6489 if (GET_CODE (x) == SIGN_EXTRACT
6490 || GET_CODE (x) == ZERO_EXTRACT)
6491 {
6492 rtx op0 = XEXP (x, 0);
6493 rtx op1 = XEXP (x, 1);
6494 rtx op2 = XEXP (x, 2);
6495
6496 if (GET_CODE (op0) == MULT
6497 && CONST_INT_P (op1)
6498 && op2 == const0_rtx
6499 && CONST_INT_P (XEXP (op0, 1))
6500 && aarch64_is_extend_from_extract (mode,
6501 XEXP (op0, 1),
6502 op1))
6503 {
6504 return true;
6505 }
6506 }
6507 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6508 No shift. */
6509 else if (GET_CODE (x) == SIGN_EXTEND
6510 || GET_CODE (x) == ZERO_EXTEND)
6511 return REG_P (XEXP (x, 0));
6512
6513 return false;
6514 }
6515
6516 static bool
6517 aarch64_frint_unspec_p (unsigned int u)
6518 {
6519 switch (u)
6520 {
6521 case UNSPEC_FRINTZ:
6522 case UNSPEC_FRINTP:
6523 case UNSPEC_FRINTM:
6524 case UNSPEC_FRINTA:
6525 case UNSPEC_FRINTN:
6526 case UNSPEC_FRINTX:
6527 case UNSPEC_FRINTI:
6528 return true;
6529
6530 default:
6531 return false;
6532 }
6533 }
6534
6535 /* Return true iff X is an rtx that will match an extr instruction
6536 i.e. as described in the *extr<mode>5_insn family of patterns.
6537 OP0 and OP1 will be set to the operands of the shifts involved
6538 on success and will be NULL_RTX otherwise. */
6539
6540 static bool
6541 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6542 {
6543 rtx op0, op1;
6544 machine_mode mode = GET_MODE (x);
6545
6546 *res_op0 = NULL_RTX;
6547 *res_op1 = NULL_RTX;
6548
6549 if (GET_CODE (x) != IOR)
6550 return false;
6551
6552 op0 = XEXP (x, 0);
6553 op1 = XEXP (x, 1);
6554
6555 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6556 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6557 {
6558 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6559 if (GET_CODE (op1) == ASHIFT)
6560 std::swap (op0, op1);
6561
6562 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6563 return false;
6564
6565 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6566 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6567
6568 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6569 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6570 {
6571 *res_op0 = XEXP (op0, 0);
6572 *res_op1 = XEXP (op1, 0);
6573 return true;
6574 }
6575 }
6576
6577 return false;
6578 }
6579
6580 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6581 storing it in *COST. Result is true if the total cost of the operation
6582 has now been calculated. */
6583 static bool
6584 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6585 {
6586 rtx inner;
6587 rtx comparator;
6588 enum rtx_code cmpcode;
6589
6590 if (COMPARISON_P (op0))
6591 {
6592 inner = XEXP (op0, 0);
6593 comparator = XEXP (op0, 1);
6594 cmpcode = GET_CODE (op0);
6595 }
6596 else
6597 {
6598 inner = op0;
6599 comparator = const0_rtx;
6600 cmpcode = NE;
6601 }
6602
6603 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6604 {
6605 /* Conditional branch. */
6606 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6607 return true;
6608 else
6609 {
6610 if (cmpcode == NE || cmpcode == EQ)
6611 {
6612 if (comparator == const0_rtx)
6613 {
6614 /* TBZ/TBNZ/CBZ/CBNZ. */
6615 if (GET_CODE (inner) == ZERO_EXTRACT)
6616 /* TBZ/TBNZ. */
6617 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6618 ZERO_EXTRACT, 0, speed);
6619 else
6620 /* CBZ/CBNZ. */
6621 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6622
6623 return true;
6624 }
6625 }
6626 else if (cmpcode == LT || cmpcode == GE)
6627 {
6628 /* TBZ/TBNZ. */
6629 if (comparator == const0_rtx)
6630 return true;
6631 }
6632 }
6633 }
6634 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6635 {
6636 /* CCMP. */
6637 if (GET_CODE (op1) == COMPARE)
6638 {
6639 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6640 if (XEXP (op1, 1) == const0_rtx)
6641 *cost += 1;
6642 if (speed)
6643 {
6644 machine_mode mode = GET_MODE (XEXP (op1, 0));
6645 const struct cpu_cost_table *extra_cost
6646 = aarch64_tune_params.insn_extra_cost;
6647
6648 if (GET_MODE_CLASS (mode) == MODE_INT)
6649 *cost += extra_cost->alu.arith;
6650 else
6651 *cost += extra_cost->fp[mode == DFmode].compare;
6652 }
6653 return true;
6654 }
6655
6656 /* It's a conditional operation based on the status flags,
6657 so it must be some flavor of CSEL. */
6658
6659 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6660 if (GET_CODE (op1) == NEG
6661 || GET_CODE (op1) == NOT
6662 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6663 op1 = XEXP (op1, 0);
6664 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6665 {
6666 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6667 op1 = XEXP (op1, 0);
6668 op2 = XEXP (op2, 0);
6669 }
6670
6671 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6672 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6673 return true;
6674 }
6675
6676 /* We don't know what this is, cost all operands. */
6677 return false;
6678 }
6679
6680 /* Check whether X is a bitfield operation of the form shift + extend that
6681 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6682 operand to which the bitfield operation is applied. Otherwise return
6683 NULL_RTX. */
6684
6685 static rtx
6686 aarch64_extend_bitfield_pattern_p (rtx x)
6687 {
6688 rtx_code outer_code = GET_CODE (x);
6689 machine_mode outer_mode = GET_MODE (x);
6690
6691 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6692 && outer_mode != SImode && outer_mode != DImode)
6693 return NULL_RTX;
6694
6695 rtx inner = XEXP (x, 0);
6696 rtx_code inner_code = GET_CODE (inner);
6697 machine_mode inner_mode = GET_MODE (inner);
6698 rtx op = NULL_RTX;
6699
6700 switch (inner_code)
6701 {
6702 case ASHIFT:
6703 if (CONST_INT_P (XEXP (inner, 1))
6704 && (inner_mode == QImode || inner_mode == HImode))
6705 op = XEXP (inner, 0);
6706 break;
6707 case LSHIFTRT:
6708 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6709 && (inner_mode == QImode || inner_mode == HImode))
6710 op = XEXP (inner, 0);
6711 break;
6712 case ASHIFTRT:
6713 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6714 && (inner_mode == QImode || inner_mode == HImode))
6715 op = XEXP (inner, 0);
6716 break;
6717 default:
6718 break;
6719 }
6720
6721 return op;
6722 }
6723
6724 /* Return true if the mask and a shift amount from an RTX of the form
6725 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6726 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6727
6728 bool
6729 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6730 {
6731 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6732 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6733 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6734 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6735 }
6736
6737 /* Calculate the cost of calculating X, storing it in *COST. Result
6738 is true if the total cost of the operation has now been calculated. */
6739 static bool
6740 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6741 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6742 {
6743 rtx op0, op1, op2;
6744 const struct cpu_cost_table *extra_cost
6745 = aarch64_tune_params.insn_extra_cost;
6746 int code = GET_CODE (x);
6747
6748 /* By default, assume that everything has equivalent cost to the
6749 cheapest instruction. Any additional costs are applied as a delta
6750 above this default. */
6751 *cost = COSTS_N_INSNS (1);
6752
6753 switch (code)
6754 {
6755 case SET:
6756 /* The cost depends entirely on the operands to SET. */
6757 *cost = 0;
6758 op0 = SET_DEST (x);
6759 op1 = SET_SRC (x);
6760
6761 switch (GET_CODE (op0))
6762 {
6763 case MEM:
6764 if (speed)
6765 {
6766 rtx address = XEXP (op0, 0);
6767 if (VECTOR_MODE_P (mode))
6768 *cost += extra_cost->ldst.storev;
6769 else if (GET_MODE_CLASS (mode) == MODE_INT)
6770 *cost += extra_cost->ldst.store;
6771 else if (mode == SFmode)
6772 *cost += extra_cost->ldst.storef;
6773 else if (mode == DFmode)
6774 *cost += extra_cost->ldst.stored;
6775
6776 *cost +=
6777 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6778 0, speed));
6779 }
6780
6781 *cost += rtx_cost (op1, mode, SET, 1, speed);
6782 return true;
6783
6784 case SUBREG:
6785 if (! REG_P (SUBREG_REG (op0)))
6786 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6787
6788 /* Fall through. */
6789 case REG:
6790 /* The cost is one per vector-register copied. */
6791 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6792 {
6793 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6794 / GET_MODE_SIZE (V4SImode);
6795 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6796 }
6797 /* const0_rtx is in general free, but we will use an
6798 instruction to set a register to 0. */
6799 else if (REG_P (op1) || op1 == const0_rtx)
6800 {
6801 /* The cost is 1 per register copied. */
6802 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6803 / UNITS_PER_WORD;
6804 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6805 }
6806 else
6807 /* Cost is just the cost of the RHS of the set. */
6808 *cost += rtx_cost (op1, mode, SET, 1, speed);
6809 return true;
6810
6811 case ZERO_EXTRACT:
6812 case SIGN_EXTRACT:
6813 /* Bit-field insertion. Strip any redundant widening of
6814 the RHS to meet the width of the target. */
6815 if (GET_CODE (op1) == SUBREG)
6816 op1 = SUBREG_REG (op1);
6817 if ((GET_CODE (op1) == ZERO_EXTEND
6818 || GET_CODE (op1) == SIGN_EXTEND)
6819 && CONST_INT_P (XEXP (op0, 1))
6820 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6821 >= INTVAL (XEXP (op0, 1))))
6822 op1 = XEXP (op1, 0);
6823
6824 if (CONST_INT_P (op1))
6825 {
6826 /* MOV immediate is assumed to always be cheap. */
6827 *cost = COSTS_N_INSNS (1);
6828 }
6829 else
6830 {
6831 /* BFM. */
6832 if (speed)
6833 *cost += extra_cost->alu.bfi;
6834 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6835 }
6836
6837 return true;
6838
6839 default:
6840 /* We can't make sense of this, assume default cost. */
6841 *cost = COSTS_N_INSNS (1);
6842 return false;
6843 }
6844 return false;
6845
6846 case CONST_INT:
6847 /* If an instruction can incorporate a constant within the
6848 instruction, the instruction's expression avoids calling
6849 rtx_cost() on the constant. If rtx_cost() is called on a
6850 constant, then it is usually because the constant must be
6851 moved into a register by one or more instructions.
6852
6853 The exception is constant 0, which can be expressed
6854 as XZR/WZR and is therefore free. The exception to this is
6855 if we have (set (reg) (const0_rtx)) in which case we must cost
6856 the move. However, we can catch that when we cost the SET, so
6857 we don't need to consider that here. */
6858 if (x == const0_rtx)
6859 *cost = 0;
6860 else
6861 {
6862 /* To an approximation, building any other constant is
6863 proportionally expensive to the number of instructions
6864 required to build that constant. This is true whether we
6865 are compiling for SPEED or otherwise. */
6866 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6867 (NULL_RTX, x, false, mode));
6868 }
6869 return true;
6870
6871 case CONST_DOUBLE:
6872 if (speed)
6873 {
6874 /* mov[df,sf]_aarch64. */
6875 if (aarch64_float_const_representable_p (x))
6876 /* FMOV (scalar immediate). */
6877 *cost += extra_cost->fp[mode == DFmode].fpconst;
6878 else if (!aarch64_float_const_zero_rtx_p (x))
6879 {
6880 /* This will be a load from memory. */
6881 if (mode == DFmode)
6882 *cost += extra_cost->ldst.loadd;
6883 else
6884 *cost += extra_cost->ldst.loadf;
6885 }
6886 else
6887 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6888 or MOV v0.s[0], wzr - neither of which are modeled by the
6889 cost tables. Just use the default cost. */
6890 {
6891 }
6892 }
6893
6894 return true;
6895
6896 case MEM:
6897 if (speed)
6898 {
6899 /* For loads we want the base cost of a load, plus an
6900 approximation for the additional cost of the addressing
6901 mode. */
6902 rtx address = XEXP (x, 0);
6903 if (VECTOR_MODE_P (mode))
6904 *cost += extra_cost->ldst.loadv;
6905 else if (GET_MODE_CLASS (mode) == MODE_INT)
6906 *cost += extra_cost->ldst.load;
6907 else if (mode == SFmode)
6908 *cost += extra_cost->ldst.loadf;
6909 else if (mode == DFmode)
6910 *cost += extra_cost->ldst.loadd;
6911
6912 *cost +=
6913 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6914 0, speed));
6915 }
6916
6917 return true;
6918
6919 case NEG:
6920 op0 = XEXP (x, 0);
6921
6922 if (VECTOR_MODE_P (mode))
6923 {
6924 if (speed)
6925 {
6926 /* FNEG. */
6927 *cost += extra_cost->vect.alu;
6928 }
6929 return false;
6930 }
6931
6932 if (GET_MODE_CLASS (mode) == MODE_INT)
6933 {
6934 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6935 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6936 {
6937 /* CSETM. */
6938 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6939 return true;
6940 }
6941
6942 /* Cost this as SUB wzr, X. */
6943 op0 = CONST0_RTX (mode);
6944 op1 = XEXP (x, 0);
6945 goto cost_minus;
6946 }
6947
6948 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6949 {
6950 /* Support (neg(fma...)) as a single instruction only if
6951 sign of zeros is unimportant. This matches the decision
6952 making in aarch64.md. */
6953 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6954 {
6955 /* FNMADD. */
6956 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6957 return true;
6958 }
6959 if (GET_CODE (op0) == MULT)
6960 {
6961 /* FNMUL. */
6962 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6963 return true;
6964 }
6965 if (speed)
6966 /* FNEG. */
6967 *cost += extra_cost->fp[mode == DFmode].neg;
6968 return false;
6969 }
6970
6971 return false;
6972
6973 case CLRSB:
6974 case CLZ:
6975 if (speed)
6976 {
6977 if (VECTOR_MODE_P (mode))
6978 *cost += extra_cost->vect.alu;
6979 else
6980 *cost += extra_cost->alu.clz;
6981 }
6982
6983 return false;
6984
6985 case COMPARE:
6986 op0 = XEXP (x, 0);
6987 op1 = XEXP (x, 1);
6988
6989 if (op1 == const0_rtx
6990 && GET_CODE (op0) == AND)
6991 {
6992 x = op0;
6993 mode = GET_MODE (op0);
6994 goto cost_logic;
6995 }
6996
6997 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6998 {
6999 /* TODO: A write to the CC flags possibly costs extra, this
7000 needs encoding in the cost tables. */
7001
7002 mode = GET_MODE (op0);
7003 /* ANDS. */
7004 if (GET_CODE (op0) == AND)
7005 {
7006 x = op0;
7007 goto cost_logic;
7008 }
7009
7010 if (GET_CODE (op0) == PLUS)
7011 {
7012 /* ADDS (and CMN alias). */
7013 x = op0;
7014 goto cost_plus;
7015 }
7016
7017 if (GET_CODE (op0) == MINUS)
7018 {
7019 /* SUBS. */
7020 x = op0;
7021 goto cost_minus;
7022 }
7023
7024 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7025 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7026 && CONST_INT_P (XEXP (op0, 2)))
7027 {
7028 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7029 Handle it here directly rather than going to cost_logic
7030 since we know the immediate generated for the TST is valid
7031 so we can avoid creating an intermediate rtx for it only
7032 for costing purposes. */
7033 if (speed)
7034 *cost += extra_cost->alu.logical;
7035
7036 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7037 ZERO_EXTRACT, 0, speed);
7038 return true;
7039 }
7040
7041 if (GET_CODE (op1) == NEG)
7042 {
7043 /* CMN. */
7044 if (speed)
7045 *cost += extra_cost->alu.arith;
7046
7047 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7048 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7049 return true;
7050 }
7051
7052 /* CMP.
7053
7054 Compare can freely swap the order of operands, and
7055 canonicalization puts the more complex operation first.
7056 But the integer MINUS logic expects the shift/extend
7057 operation in op1. */
7058 if (! (REG_P (op0)
7059 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7060 {
7061 op0 = XEXP (x, 1);
7062 op1 = XEXP (x, 0);
7063 }
7064 goto cost_minus;
7065 }
7066
7067 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7068 {
7069 /* FCMP. */
7070 if (speed)
7071 *cost += extra_cost->fp[mode == DFmode].compare;
7072
7073 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7074 {
7075 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7076 /* FCMP supports constant 0.0 for no extra cost. */
7077 return true;
7078 }
7079 return false;
7080 }
7081
7082 if (VECTOR_MODE_P (mode))
7083 {
7084 /* Vector compare. */
7085 if (speed)
7086 *cost += extra_cost->vect.alu;
7087
7088 if (aarch64_float_const_zero_rtx_p (op1))
7089 {
7090 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7091 cost. */
7092 return true;
7093 }
7094 return false;
7095 }
7096 return false;
7097
7098 case MINUS:
7099 {
7100 op0 = XEXP (x, 0);
7101 op1 = XEXP (x, 1);
7102
7103 cost_minus:
7104 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7105
7106 /* Detect valid immediates. */
7107 if ((GET_MODE_CLASS (mode) == MODE_INT
7108 || (GET_MODE_CLASS (mode) == MODE_CC
7109 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7110 && CONST_INT_P (op1)
7111 && aarch64_uimm12_shift (INTVAL (op1)))
7112 {
7113 if (speed)
7114 /* SUB(S) (immediate). */
7115 *cost += extra_cost->alu.arith;
7116 return true;
7117 }
7118
7119 /* Look for SUB (extended register). */
7120 if (aarch64_rtx_arith_op_extract_p (op1, mode))
7121 {
7122 if (speed)
7123 *cost += extra_cost->alu.extend_arith;
7124
7125 op1 = aarch64_strip_extend (op1, true);
7126 *cost += rtx_cost (op1, VOIDmode,
7127 (enum rtx_code) GET_CODE (op1), 0, speed);
7128 return true;
7129 }
7130
7131 rtx new_op1 = aarch64_strip_extend (op1, false);
7132
7133 /* Cost this as an FMA-alike operation. */
7134 if ((GET_CODE (new_op1) == MULT
7135 || aarch64_shift_p (GET_CODE (new_op1)))
7136 && code != COMPARE)
7137 {
7138 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7139 (enum rtx_code) code,
7140 speed);
7141 return true;
7142 }
7143
7144 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7145
7146 if (speed)
7147 {
7148 if (VECTOR_MODE_P (mode))
7149 {
7150 /* Vector SUB. */
7151 *cost += extra_cost->vect.alu;
7152 }
7153 else if (GET_MODE_CLASS (mode) == MODE_INT)
7154 {
7155 /* SUB(S). */
7156 *cost += extra_cost->alu.arith;
7157 }
7158 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7159 {
7160 /* FSUB. */
7161 *cost += extra_cost->fp[mode == DFmode].addsub;
7162 }
7163 }
7164 return true;
7165 }
7166
7167 case PLUS:
7168 {
7169 rtx new_op0;
7170
7171 op0 = XEXP (x, 0);
7172 op1 = XEXP (x, 1);
7173
7174 cost_plus:
7175 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7176 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7177 {
7178 /* CSINC. */
7179 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7180 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7181 return true;
7182 }
7183
7184 if (GET_MODE_CLASS (mode) == MODE_INT
7185 && CONST_INT_P (op1)
7186 && aarch64_uimm12_shift (INTVAL (op1)))
7187 {
7188 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7189
7190 if (speed)
7191 /* ADD (immediate). */
7192 *cost += extra_cost->alu.arith;
7193 return true;
7194 }
7195
7196 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7197
7198 /* Look for ADD (extended register). */
7199 if (aarch64_rtx_arith_op_extract_p (op0, mode))
7200 {
7201 if (speed)
7202 *cost += extra_cost->alu.extend_arith;
7203
7204 op0 = aarch64_strip_extend (op0, true);
7205 *cost += rtx_cost (op0, VOIDmode,
7206 (enum rtx_code) GET_CODE (op0), 0, speed);
7207 return true;
7208 }
7209
7210 /* Strip any extend, leave shifts behind as we will
7211 cost them through mult_cost. */
7212 new_op0 = aarch64_strip_extend (op0, false);
7213
7214 if (GET_CODE (new_op0) == MULT
7215 || aarch64_shift_p (GET_CODE (new_op0)))
7216 {
7217 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7218 speed);
7219 return true;
7220 }
7221
7222 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7223
7224 if (speed)
7225 {
7226 if (VECTOR_MODE_P (mode))
7227 {
7228 /* Vector ADD. */
7229 *cost += extra_cost->vect.alu;
7230 }
7231 else if (GET_MODE_CLASS (mode) == MODE_INT)
7232 {
7233 /* ADD. */
7234 *cost += extra_cost->alu.arith;
7235 }
7236 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7237 {
7238 /* FADD. */
7239 *cost += extra_cost->fp[mode == DFmode].addsub;
7240 }
7241 }
7242 return true;
7243 }
7244
7245 case BSWAP:
7246 *cost = COSTS_N_INSNS (1);
7247
7248 if (speed)
7249 {
7250 if (VECTOR_MODE_P (mode))
7251 *cost += extra_cost->vect.alu;
7252 else
7253 *cost += extra_cost->alu.rev;
7254 }
7255 return false;
7256
7257 case IOR:
7258 if (aarch_rev16_p (x))
7259 {
7260 *cost = COSTS_N_INSNS (1);
7261
7262 if (speed)
7263 {
7264 if (VECTOR_MODE_P (mode))
7265 *cost += extra_cost->vect.alu;
7266 else
7267 *cost += extra_cost->alu.rev;
7268 }
7269 return true;
7270 }
7271
7272 if (aarch64_extr_rtx_p (x, &op0, &op1))
7273 {
7274 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7275 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7276 if (speed)
7277 *cost += extra_cost->alu.shift;
7278
7279 return true;
7280 }
7281 /* Fall through. */
7282 case XOR:
7283 case AND:
7284 cost_logic:
7285 op0 = XEXP (x, 0);
7286 op1 = XEXP (x, 1);
7287
7288 if (VECTOR_MODE_P (mode))
7289 {
7290 if (speed)
7291 *cost += extra_cost->vect.alu;
7292 return true;
7293 }
7294
7295 if (code == AND
7296 && GET_CODE (op0) == MULT
7297 && CONST_INT_P (XEXP (op0, 1))
7298 && CONST_INT_P (op1)
7299 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7300 INTVAL (op1)) != 0)
7301 {
7302 /* This is a UBFM/SBFM. */
7303 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7304 if (speed)
7305 *cost += extra_cost->alu.bfx;
7306 return true;
7307 }
7308
7309 if (GET_MODE_CLASS (mode) == MODE_INT)
7310 {
7311 if (CONST_INT_P (op1))
7312 {
7313 /* We have a mask + shift version of a UBFIZ
7314 i.e. the *andim_ashift<mode>_bfiz pattern. */
7315 if (GET_CODE (op0) == ASHIFT
7316 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7317 XEXP (op0, 1)))
7318 {
7319 *cost += rtx_cost (XEXP (op0, 0), mode,
7320 (enum rtx_code) code, 0, speed);
7321 if (speed)
7322 *cost += extra_cost->alu.bfx;
7323
7324 return true;
7325 }
7326 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7327 {
7328 /* We possibly get the immediate for free, this is not
7329 modelled. */
7330 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7331 if (speed)
7332 *cost += extra_cost->alu.logical;
7333
7334 return true;
7335 }
7336 }
7337 else
7338 {
7339 rtx new_op0 = op0;
7340
7341 /* Handle ORN, EON, or BIC. */
7342 if (GET_CODE (op0) == NOT)
7343 op0 = XEXP (op0, 0);
7344
7345 new_op0 = aarch64_strip_shift (op0);
7346
7347 /* If we had a shift on op0 then this is a logical-shift-
7348 by-register/immediate operation. Otherwise, this is just
7349 a logical operation. */
7350 if (speed)
7351 {
7352 if (new_op0 != op0)
7353 {
7354 /* Shift by immediate. */
7355 if (CONST_INT_P (XEXP (op0, 1)))
7356 *cost += extra_cost->alu.log_shift;
7357 else
7358 *cost += extra_cost->alu.log_shift_reg;
7359 }
7360 else
7361 *cost += extra_cost->alu.logical;
7362 }
7363
7364 /* In both cases we want to cost both operands. */
7365 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7366 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7367
7368 return true;
7369 }
7370 }
7371 return false;
7372
7373 case NOT:
7374 x = XEXP (x, 0);
7375 op0 = aarch64_strip_shift (x);
7376
7377 if (VECTOR_MODE_P (mode))
7378 {
7379 /* Vector NOT. */
7380 *cost += extra_cost->vect.alu;
7381 return false;
7382 }
7383
7384 /* MVN-shifted-reg. */
7385 if (op0 != x)
7386 {
7387 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7388
7389 if (speed)
7390 *cost += extra_cost->alu.log_shift;
7391
7392 return true;
7393 }
7394 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7395 Handle the second form here taking care that 'a' in the above can
7396 be a shift. */
7397 else if (GET_CODE (op0) == XOR)
7398 {
7399 rtx newop0 = XEXP (op0, 0);
7400 rtx newop1 = XEXP (op0, 1);
7401 rtx op0_stripped = aarch64_strip_shift (newop0);
7402
7403 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7404 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7405
7406 if (speed)
7407 {
7408 if (op0_stripped != newop0)
7409 *cost += extra_cost->alu.log_shift;
7410 else
7411 *cost += extra_cost->alu.logical;
7412 }
7413
7414 return true;
7415 }
7416 /* MVN. */
7417 if (speed)
7418 *cost += extra_cost->alu.logical;
7419
7420 return false;
7421
7422 case ZERO_EXTEND:
7423
7424 op0 = XEXP (x, 0);
7425 /* If a value is written in SI mode, then zero extended to DI
7426 mode, the operation will in general be free as a write to
7427 a 'w' register implicitly zeroes the upper bits of an 'x'
7428 register. However, if this is
7429
7430 (set (reg) (zero_extend (reg)))
7431
7432 we must cost the explicit register move. */
7433 if (mode == DImode
7434 && GET_MODE (op0) == SImode
7435 && outer == SET)
7436 {
7437 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7438
7439 /* If OP_COST is non-zero, then the cost of the zero extend
7440 is effectively the cost of the inner operation. Otherwise
7441 we have a MOV instruction and we take the cost from the MOV
7442 itself. This is true independently of whether we are
7443 optimizing for space or time. */
7444 if (op_cost)
7445 *cost = op_cost;
7446
7447 return true;
7448 }
7449 else if (MEM_P (op0))
7450 {
7451 /* All loads can zero extend to any size for free. */
7452 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7453 return true;
7454 }
7455
7456 op0 = aarch64_extend_bitfield_pattern_p (x);
7457 if (op0)
7458 {
7459 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7460 if (speed)
7461 *cost += extra_cost->alu.bfx;
7462 return true;
7463 }
7464
7465 if (speed)
7466 {
7467 if (VECTOR_MODE_P (mode))
7468 {
7469 /* UMOV. */
7470 *cost += extra_cost->vect.alu;
7471 }
7472 else
7473 {
7474 /* We generate an AND instead of UXTB/UXTH. */
7475 *cost += extra_cost->alu.logical;
7476 }
7477 }
7478 return false;
7479
7480 case SIGN_EXTEND:
7481 if (MEM_P (XEXP (x, 0)))
7482 {
7483 /* LDRSH. */
7484 if (speed)
7485 {
7486 rtx address = XEXP (XEXP (x, 0), 0);
7487 *cost += extra_cost->ldst.load_sign_extend;
7488
7489 *cost +=
7490 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7491 0, speed));
7492 }
7493 return true;
7494 }
7495
7496 op0 = aarch64_extend_bitfield_pattern_p (x);
7497 if (op0)
7498 {
7499 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7500 if (speed)
7501 *cost += extra_cost->alu.bfx;
7502 return true;
7503 }
7504
7505 if (speed)
7506 {
7507 if (VECTOR_MODE_P (mode))
7508 *cost += extra_cost->vect.alu;
7509 else
7510 *cost += extra_cost->alu.extend;
7511 }
7512 return false;
7513
7514 case ASHIFT:
7515 op0 = XEXP (x, 0);
7516 op1 = XEXP (x, 1);
7517
7518 if (CONST_INT_P (op1))
7519 {
7520 if (speed)
7521 {
7522 if (VECTOR_MODE_P (mode))
7523 {
7524 /* Vector shift (immediate). */
7525 *cost += extra_cost->vect.alu;
7526 }
7527 else
7528 {
7529 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
7530 aliases. */
7531 *cost += extra_cost->alu.shift;
7532 }
7533 }
7534
7535 /* We can incorporate zero/sign extend for free. */
7536 if (GET_CODE (op0) == ZERO_EXTEND
7537 || GET_CODE (op0) == SIGN_EXTEND)
7538 op0 = XEXP (op0, 0);
7539
7540 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7541 return true;
7542 }
7543 else
7544 {
7545 if (VECTOR_MODE_P (mode))
7546 {
7547 if (speed)
7548 /* Vector shift (register). */
7549 *cost += extra_cost->vect.alu;
7550 }
7551 else
7552 {
7553 if (speed)
7554 /* LSLV. */
7555 *cost += extra_cost->alu.shift_reg;
7556
7557 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7558 && CONST_INT_P (XEXP (op1, 1))
7559 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7560 {
7561 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7562 /* We already demanded XEXP (op1, 0) to be REG_P, so
7563 don't recurse into it. */
7564 return true;
7565 }
7566 }
7567 return false; /* All arguments need to be in registers. */
7568 }
7569
7570 case ROTATE:
7571 case ROTATERT:
7572 case LSHIFTRT:
7573 case ASHIFTRT:
7574 op0 = XEXP (x, 0);
7575 op1 = XEXP (x, 1);
7576
7577 if (CONST_INT_P (op1))
7578 {
7579 /* ASR (immediate) and friends. */
7580 if (speed)
7581 {
7582 if (VECTOR_MODE_P (mode))
7583 *cost += extra_cost->vect.alu;
7584 else
7585 *cost += extra_cost->alu.shift;
7586 }
7587
7588 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7589 return true;
7590 }
7591 else
7592 {
7593 if (VECTOR_MODE_P (mode))
7594 {
7595 if (speed)
7596 /* Vector shift (register). */
7597 *cost += extra_cost->vect.alu;
7598 }
7599 else
7600 {
7601 if (speed)
7602 /* ASR (register) and friends. */
7603 *cost += extra_cost->alu.shift_reg;
7604
7605 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7606 && CONST_INT_P (XEXP (op1, 1))
7607 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7608 {
7609 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7610 /* We already demanded XEXP (op1, 0) to be REG_P, so
7611 don't recurse into it. */
7612 return true;
7613 }
7614 }
7615 return false; /* All arguments need to be in registers. */
7616 }
7617
7618 case SYMBOL_REF:
7619
7620 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7621 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7622 {
7623 /* LDR. */
7624 if (speed)
7625 *cost += extra_cost->ldst.load;
7626 }
7627 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7628 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7629 {
7630 /* ADRP, followed by ADD. */
7631 *cost += COSTS_N_INSNS (1);
7632 if (speed)
7633 *cost += 2 * extra_cost->alu.arith;
7634 }
7635 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7636 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7637 {
7638 /* ADR. */
7639 if (speed)
7640 *cost += extra_cost->alu.arith;
7641 }
7642
7643 if (flag_pic)
7644 {
7645 /* One extra load instruction, after accessing the GOT. */
7646 *cost += COSTS_N_INSNS (1);
7647 if (speed)
7648 *cost += extra_cost->ldst.load;
7649 }
7650 return true;
7651
7652 case HIGH:
7653 case LO_SUM:
7654 /* ADRP/ADD (immediate). */
7655 if (speed)
7656 *cost += extra_cost->alu.arith;
7657 return true;
7658
7659 case ZERO_EXTRACT:
7660 case SIGN_EXTRACT:
7661 /* UBFX/SBFX. */
7662 if (speed)
7663 {
7664 if (VECTOR_MODE_P (mode))
7665 *cost += extra_cost->vect.alu;
7666 else
7667 *cost += extra_cost->alu.bfx;
7668 }
7669
7670 /* We can trust that the immediates used will be correct (there
7671 are no by-register forms), so we need only cost op0. */
7672 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7673 return true;
7674
7675 case MULT:
7676 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7677 /* aarch64_rtx_mult_cost always handles recursion to its
7678 operands. */
7679 return true;
7680
7681 case MOD:
7682 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7683 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7684 an unconditional negate. This case should only ever be reached through
7685 the set_smod_pow2_cheap check in expmed.c. */
7686 if (CONST_INT_P (XEXP (x, 1))
7687 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7688 && (mode == SImode || mode == DImode))
7689 {
7690 /* We expand to 4 instructions. Reset the baseline. */
7691 *cost = COSTS_N_INSNS (4);
7692
7693 if (speed)
7694 *cost += 2 * extra_cost->alu.logical
7695 + 2 * extra_cost->alu.arith;
7696
7697 return true;
7698 }
7699
7700 /* Fall-through. */
7701 case UMOD:
7702 if (speed)
7703 {
7704 /* Slighly prefer UMOD over SMOD. */
7705 if (VECTOR_MODE_P (mode))
7706 *cost += extra_cost->vect.alu;
7707 else if (GET_MODE_CLASS (mode) == MODE_INT)
7708 *cost += (extra_cost->mult[mode == DImode].add
7709 + extra_cost->mult[mode == DImode].idiv
7710 + (code == MOD ? 1 : 0));
7711 }
7712 return false; /* All arguments need to be in registers. */
7713
7714 case DIV:
7715 case UDIV:
7716 case SQRT:
7717 if (speed)
7718 {
7719 if (VECTOR_MODE_P (mode))
7720 *cost += extra_cost->vect.alu;
7721 else if (GET_MODE_CLASS (mode) == MODE_INT)
7722 /* There is no integer SQRT, so only DIV and UDIV can get
7723 here. */
7724 *cost += (extra_cost->mult[mode == DImode].idiv
7725 /* Slighly prefer UDIV over SDIV. */
7726 + (code == DIV ? 1 : 0));
7727 else
7728 *cost += extra_cost->fp[mode == DFmode].div;
7729 }
7730 return false; /* All arguments need to be in registers. */
7731
7732 case IF_THEN_ELSE:
7733 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7734 XEXP (x, 2), cost, speed);
7735
7736 case EQ:
7737 case NE:
7738 case GT:
7739 case GTU:
7740 case LT:
7741 case LTU:
7742 case GE:
7743 case GEU:
7744 case LE:
7745 case LEU:
7746
7747 return false; /* All arguments must be in registers. */
7748
7749 case FMA:
7750 op0 = XEXP (x, 0);
7751 op1 = XEXP (x, 1);
7752 op2 = XEXP (x, 2);
7753
7754 if (speed)
7755 {
7756 if (VECTOR_MODE_P (mode))
7757 *cost += extra_cost->vect.alu;
7758 else
7759 *cost += extra_cost->fp[mode == DFmode].fma;
7760 }
7761
7762 /* FMSUB, FNMADD, and FNMSUB are free. */
7763 if (GET_CODE (op0) == NEG)
7764 op0 = XEXP (op0, 0);
7765
7766 if (GET_CODE (op2) == NEG)
7767 op2 = XEXP (op2, 0);
7768
7769 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7770 and the by-element operand as operand 0. */
7771 if (GET_CODE (op1) == NEG)
7772 op1 = XEXP (op1, 0);
7773
7774 /* Catch vector-by-element operations. The by-element operand can
7775 either be (vec_duplicate (vec_select (x))) or just
7776 (vec_select (x)), depending on whether we are multiplying by
7777 a vector or a scalar.
7778
7779 Canonicalization is not very good in these cases, FMA4 will put the
7780 by-element operand as operand 0, FNMA4 will have it as operand 1. */
7781 if (GET_CODE (op0) == VEC_DUPLICATE)
7782 op0 = XEXP (op0, 0);
7783 else if (GET_CODE (op1) == VEC_DUPLICATE)
7784 op1 = XEXP (op1, 0);
7785
7786 if (GET_CODE (op0) == VEC_SELECT)
7787 op0 = XEXP (op0, 0);
7788 else if (GET_CODE (op1) == VEC_SELECT)
7789 op1 = XEXP (op1, 0);
7790
7791 /* If the remaining parameters are not registers,
7792 get the cost to put them into registers. */
7793 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7794 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7795 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7796 return true;
7797
7798 case FLOAT:
7799 case UNSIGNED_FLOAT:
7800 if (speed)
7801 *cost += extra_cost->fp[mode == DFmode].fromint;
7802 return false;
7803
7804 case FLOAT_EXTEND:
7805 if (speed)
7806 {
7807 if (VECTOR_MODE_P (mode))
7808 {
7809 /*Vector truncate. */
7810 *cost += extra_cost->vect.alu;
7811 }
7812 else
7813 *cost += extra_cost->fp[mode == DFmode].widen;
7814 }
7815 return false;
7816
7817 case FLOAT_TRUNCATE:
7818 if (speed)
7819 {
7820 if (VECTOR_MODE_P (mode))
7821 {
7822 /*Vector conversion. */
7823 *cost += extra_cost->vect.alu;
7824 }
7825 else
7826 *cost += extra_cost->fp[mode == DFmode].narrow;
7827 }
7828 return false;
7829
7830 case FIX:
7831 case UNSIGNED_FIX:
7832 x = XEXP (x, 0);
7833 /* Strip the rounding part. They will all be implemented
7834 by the fcvt* family of instructions anyway. */
7835 if (GET_CODE (x) == UNSPEC)
7836 {
7837 unsigned int uns_code = XINT (x, 1);
7838
7839 if (uns_code == UNSPEC_FRINTA
7840 || uns_code == UNSPEC_FRINTM
7841 || uns_code == UNSPEC_FRINTN
7842 || uns_code == UNSPEC_FRINTP
7843 || uns_code == UNSPEC_FRINTZ)
7844 x = XVECEXP (x, 0, 0);
7845 }
7846
7847 if (speed)
7848 {
7849 if (VECTOR_MODE_P (mode))
7850 *cost += extra_cost->vect.alu;
7851 else
7852 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7853 }
7854
7855 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7856 fixed-point fcvt. */
7857 if (GET_CODE (x) == MULT
7858 && ((VECTOR_MODE_P (mode)
7859 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7860 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7861 {
7862 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7863 0, speed);
7864 return true;
7865 }
7866
7867 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7868 return true;
7869
7870 case ABS:
7871 if (VECTOR_MODE_P (mode))
7872 {
7873 /* ABS (vector). */
7874 if (speed)
7875 *cost += extra_cost->vect.alu;
7876 }
7877 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7878 {
7879 op0 = XEXP (x, 0);
7880
7881 /* FABD, which is analogous to FADD. */
7882 if (GET_CODE (op0) == MINUS)
7883 {
7884 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7885 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7886 if (speed)
7887 *cost += extra_cost->fp[mode == DFmode].addsub;
7888
7889 return true;
7890 }
7891 /* Simple FABS is analogous to FNEG. */
7892 if (speed)
7893 *cost += extra_cost->fp[mode == DFmode].neg;
7894 }
7895 else
7896 {
7897 /* Integer ABS will either be split to
7898 two arithmetic instructions, or will be an ABS
7899 (scalar), which we don't model. */
7900 *cost = COSTS_N_INSNS (2);
7901 if (speed)
7902 *cost += 2 * extra_cost->alu.arith;
7903 }
7904 return false;
7905
7906 case SMAX:
7907 case SMIN:
7908 if (speed)
7909 {
7910 if (VECTOR_MODE_P (mode))
7911 *cost += extra_cost->vect.alu;
7912 else
7913 {
7914 /* FMAXNM/FMINNM/FMAX/FMIN.
7915 TODO: This may not be accurate for all implementations, but
7916 we do not model this in the cost tables. */
7917 *cost += extra_cost->fp[mode == DFmode].addsub;
7918 }
7919 }
7920 return false;
7921
7922 case UNSPEC:
7923 /* The floating point round to integer frint* instructions. */
7924 if (aarch64_frint_unspec_p (XINT (x, 1)))
7925 {
7926 if (speed)
7927 *cost += extra_cost->fp[mode == DFmode].roundint;
7928
7929 return false;
7930 }
7931
7932 if (XINT (x, 1) == UNSPEC_RBIT)
7933 {
7934 if (speed)
7935 *cost += extra_cost->alu.rev;
7936
7937 return false;
7938 }
7939 break;
7940
7941 case TRUNCATE:
7942
7943 /* Decompose <su>muldi3_highpart. */
7944 if (/* (truncate:DI */
7945 mode == DImode
7946 /* (lshiftrt:TI */
7947 && GET_MODE (XEXP (x, 0)) == TImode
7948 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7949 /* (mult:TI */
7950 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7951 /* (ANY_EXTEND:TI (reg:DI))
7952 (ANY_EXTEND:TI (reg:DI))) */
7953 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7954 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7955 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7956 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7957 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7958 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7959 /* (const_int 64) */
7960 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7961 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7962 {
7963 /* UMULH/SMULH. */
7964 if (speed)
7965 *cost += extra_cost->mult[mode == DImode].extend;
7966 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7967 mode, MULT, 0, speed);
7968 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7969 mode, MULT, 1, speed);
7970 return true;
7971 }
7972
7973 /* Fall through. */
7974 default:
7975 break;
7976 }
7977
7978 if (dump_file
7979 && flag_aarch64_verbose_cost)
7980 fprintf (dump_file,
7981 "\nFailed to cost RTX. Assuming default cost.\n");
7982
7983 return true;
7984 }
7985
7986 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7987 calculated for X. This cost is stored in *COST. Returns true
7988 if the total cost of X was calculated. */
7989 static bool
7990 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7991 int param, int *cost, bool speed)
7992 {
7993 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7994
7995 if (dump_file
7996 && flag_aarch64_verbose_cost)
7997 {
7998 print_rtl_single (dump_file, x);
7999 fprintf (dump_file, "\n%s cost: %d (%s)\n",
8000 speed ? "Hot" : "Cold",
8001 *cost, result ? "final" : "partial");
8002 }
8003
8004 return result;
8005 }
8006
8007 static int
8008 aarch64_register_move_cost (machine_mode mode,
8009 reg_class_t from_i, reg_class_t to_i)
8010 {
8011 enum reg_class from = (enum reg_class) from_i;
8012 enum reg_class to = (enum reg_class) to_i;
8013 const struct cpu_regmove_cost *regmove_cost
8014 = aarch64_tune_params.regmove_cost;
8015
8016 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8017 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8018 to = GENERAL_REGS;
8019
8020 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8021 from = GENERAL_REGS;
8022
8023 /* Moving between GPR and stack cost is the same as GP2GP. */
8024 if ((from == GENERAL_REGS && to == STACK_REG)
8025 || (to == GENERAL_REGS && from == STACK_REG))
8026 return regmove_cost->GP2GP;
8027
8028 /* To/From the stack register, we move via the gprs. */
8029 if (to == STACK_REG || from == STACK_REG)
8030 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8031 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8032
8033 if (GET_MODE_SIZE (mode) == 16)
8034 {
8035 /* 128-bit operations on general registers require 2 instructions. */
8036 if (from == GENERAL_REGS && to == GENERAL_REGS)
8037 return regmove_cost->GP2GP * 2;
8038 else if (from == GENERAL_REGS)
8039 return regmove_cost->GP2FP * 2;
8040 else if (to == GENERAL_REGS)
8041 return regmove_cost->FP2GP * 2;
8042
8043 /* When AdvSIMD instructions are disabled it is not possible to move
8044 a 128-bit value directly between Q registers. This is handled in
8045 secondary reload. A general register is used as a scratch to move
8046 the upper DI value and the lower DI value is moved directly,
8047 hence the cost is the sum of three moves. */
8048 if (! TARGET_SIMD)
8049 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8050
8051 return regmove_cost->FP2FP;
8052 }
8053
8054 if (from == GENERAL_REGS && to == GENERAL_REGS)
8055 return regmove_cost->GP2GP;
8056 else if (from == GENERAL_REGS)
8057 return regmove_cost->GP2FP;
8058 else if (to == GENERAL_REGS)
8059 return regmove_cost->FP2GP;
8060
8061 return regmove_cost->FP2FP;
8062 }
8063
8064 static int
8065 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8066 reg_class_t rclass ATTRIBUTE_UNUSED,
8067 bool in ATTRIBUTE_UNUSED)
8068 {
8069 return aarch64_tune_params.memmov_cost;
8070 }
8071
8072 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8073 to optimize 1.0/sqrt. */
8074
8075 static bool
8076 use_rsqrt_p (machine_mode mode)
8077 {
8078 return (!flag_trapping_math
8079 && flag_unsafe_math_optimizations
8080 && ((aarch64_tune_params.approx_modes->recip_sqrt
8081 & AARCH64_APPROX_MODE (mode))
8082 || flag_mrecip_low_precision_sqrt));
8083 }
8084
8085 /* Function to decide when to use the approximate reciprocal square root
8086 builtin. */
8087
8088 static tree
8089 aarch64_builtin_reciprocal (tree fndecl)
8090 {
8091 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8092
8093 if (!use_rsqrt_p (mode))
8094 return NULL_TREE;
8095 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8096 }
8097
8098 typedef rtx (*rsqrte_type) (rtx, rtx);
8099
8100 /* Select reciprocal square root initial estimate insn depending on machine
8101 mode. */
8102
8103 static rsqrte_type
8104 get_rsqrte_type (machine_mode mode)
8105 {
8106 switch (mode)
8107 {
8108 case DFmode: return gen_aarch64_rsqrtedf;
8109 case SFmode: return gen_aarch64_rsqrtesf;
8110 case V2DFmode: return gen_aarch64_rsqrtev2df;
8111 case V2SFmode: return gen_aarch64_rsqrtev2sf;
8112 case V4SFmode: return gen_aarch64_rsqrtev4sf;
8113 default: gcc_unreachable ();
8114 }
8115 }
8116
8117 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8118
8119 /* Select reciprocal square root series step insn depending on machine mode. */
8120
8121 static rsqrts_type
8122 get_rsqrts_type (machine_mode mode)
8123 {
8124 switch (mode)
8125 {
8126 case DFmode: return gen_aarch64_rsqrtsdf;
8127 case SFmode: return gen_aarch64_rsqrtssf;
8128 case V2DFmode: return gen_aarch64_rsqrtsv2df;
8129 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
8130 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
8131 default: gcc_unreachable ();
8132 }
8133 }
8134
8135 /* Emit instruction sequence to compute either the approximate square root
8136 or its approximate reciprocal, depending on the flag RECP, and return
8137 whether the sequence was emitted or not. */
8138
8139 bool
8140 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8141 {
8142 machine_mode mode = GET_MODE (dst);
8143
8144 if (GET_MODE_INNER (mode) == HFmode)
8145 {
8146 gcc_assert (!recp);
8147 return false;
8148 }
8149
8150 machine_mode mmsk
8151 = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
8152 GET_MODE_NUNITS (mode));
8153 if (!recp)
8154 {
8155 if (!(flag_mlow_precision_sqrt
8156 || (aarch64_tune_params.approx_modes->sqrt
8157 & AARCH64_APPROX_MODE (mode))))
8158 return false;
8159
8160 if (flag_finite_math_only
8161 || flag_trapping_math
8162 || !flag_unsafe_math_optimizations
8163 || optimize_function_for_size_p (cfun))
8164 return false;
8165 }
8166 else
8167 /* Caller assumes we cannot fail. */
8168 gcc_assert (use_rsqrt_p (mode));
8169
8170
8171 rtx xmsk = gen_reg_rtx (mmsk);
8172 if (!recp)
8173 /* When calculating the approximate square root, compare the
8174 argument with 0.0 and create a mask. */
8175 emit_insn (gen_rtx_SET (xmsk,
8176 gen_rtx_NEG (mmsk,
8177 gen_rtx_EQ (mmsk, src,
8178 CONST0_RTX (mode)))));
8179
8180 /* Estimate the approximate reciprocal square root. */
8181 rtx xdst = gen_reg_rtx (mode);
8182 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8183
8184 /* Iterate over the series twice for SF and thrice for DF. */
8185 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8186
8187 /* Optionally iterate over the series once less for faster performance
8188 while sacrificing the accuracy. */
8189 if ((recp && flag_mrecip_low_precision_sqrt)
8190 || (!recp && flag_mlow_precision_sqrt))
8191 iterations--;
8192
8193 /* Iterate over the series to calculate the approximate reciprocal square
8194 root. */
8195 rtx x1 = gen_reg_rtx (mode);
8196 while (iterations--)
8197 {
8198 rtx x2 = gen_reg_rtx (mode);
8199 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8200
8201 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8202
8203 if (iterations > 0)
8204 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8205 }
8206
8207 if (!recp)
8208 {
8209 /* Qualify the approximate reciprocal square root when the argument is
8210 0.0 by squashing the intermediary result to 0.0. */
8211 rtx xtmp = gen_reg_rtx (mmsk);
8212 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8213 gen_rtx_SUBREG (mmsk, xdst, 0)));
8214 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8215
8216 /* Calculate the approximate square root. */
8217 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8218 }
8219
8220 /* Finalize the approximation. */
8221 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8222
8223 return true;
8224 }
8225
8226 typedef rtx (*recpe_type) (rtx, rtx);
8227
8228 /* Select reciprocal initial estimate insn depending on machine mode. */
8229
8230 static recpe_type
8231 get_recpe_type (machine_mode mode)
8232 {
8233 switch (mode)
8234 {
8235 case SFmode: return (gen_aarch64_frecpesf);
8236 case V2SFmode: return (gen_aarch64_frecpev2sf);
8237 case V4SFmode: return (gen_aarch64_frecpev4sf);
8238 case DFmode: return (gen_aarch64_frecpedf);
8239 case V2DFmode: return (gen_aarch64_frecpev2df);
8240 default: gcc_unreachable ();
8241 }
8242 }
8243
8244 typedef rtx (*recps_type) (rtx, rtx, rtx);
8245
8246 /* Select reciprocal series step insn depending on machine mode. */
8247
8248 static recps_type
8249 get_recps_type (machine_mode mode)
8250 {
8251 switch (mode)
8252 {
8253 case SFmode: return (gen_aarch64_frecpssf);
8254 case V2SFmode: return (gen_aarch64_frecpsv2sf);
8255 case V4SFmode: return (gen_aarch64_frecpsv4sf);
8256 case DFmode: return (gen_aarch64_frecpsdf);
8257 case V2DFmode: return (gen_aarch64_frecpsv2df);
8258 default: gcc_unreachable ();
8259 }
8260 }
8261
8262 /* Emit the instruction sequence to compute the approximation for the division
8263 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8264
8265 bool
8266 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8267 {
8268 machine_mode mode = GET_MODE (quo);
8269
8270 if (GET_MODE_INNER (mode) == HFmode)
8271 return false;
8272
8273 bool use_approx_division_p = (flag_mlow_precision_div
8274 || (aarch64_tune_params.approx_modes->division
8275 & AARCH64_APPROX_MODE (mode)));
8276
8277 if (!flag_finite_math_only
8278 || flag_trapping_math
8279 || !flag_unsafe_math_optimizations
8280 || optimize_function_for_size_p (cfun)
8281 || !use_approx_division_p)
8282 return false;
8283
8284 /* Estimate the approximate reciprocal. */
8285 rtx xrcp = gen_reg_rtx (mode);
8286 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8287
8288 /* Iterate over the series twice for SF and thrice for DF. */
8289 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8290
8291 /* Optionally iterate over the series once less for faster performance,
8292 while sacrificing the accuracy. */
8293 if (flag_mlow_precision_div)
8294 iterations--;
8295
8296 /* Iterate over the series to calculate the approximate reciprocal. */
8297 rtx xtmp = gen_reg_rtx (mode);
8298 while (iterations--)
8299 {
8300 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8301
8302 if (iterations > 0)
8303 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8304 }
8305
8306 if (num != CONST1_RTX (mode))
8307 {
8308 /* As the approximate reciprocal of DEN is already calculated, only
8309 calculate the approximate division when NUM is not 1.0. */
8310 rtx xnum = force_reg (mode, num);
8311 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8312 }
8313
8314 /* Finalize the approximation. */
8315 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8316 return true;
8317 }
8318
8319 /* Return the number of instructions that can be issued per cycle. */
8320 static int
8321 aarch64_sched_issue_rate (void)
8322 {
8323 return aarch64_tune_params.issue_rate;
8324 }
8325
8326 static int
8327 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8328 {
8329 int issue_rate = aarch64_sched_issue_rate ();
8330
8331 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8332 }
8333
8334
8335 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8336 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8337 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8338
8339 static int
8340 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8341 int ready_index)
8342 {
8343 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8344 }
8345
8346
8347 /* Vectorizer cost model target hooks. */
8348
8349 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8350 static int
8351 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8352 tree vectype,
8353 int misalign ATTRIBUTE_UNUSED)
8354 {
8355 unsigned elements;
8356 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8357 bool fp = false;
8358
8359 if (vectype != NULL)
8360 fp = FLOAT_TYPE_P (vectype);
8361
8362 switch (type_of_cost)
8363 {
8364 case scalar_stmt:
8365 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8366
8367 case scalar_load:
8368 return costs->scalar_load_cost;
8369
8370 case scalar_store:
8371 return costs->scalar_store_cost;
8372
8373 case vector_stmt:
8374 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8375
8376 case vector_load:
8377 return costs->vec_align_load_cost;
8378
8379 case vector_store:
8380 return costs->vec_store_cost;
8381
8382 case vec_to_scalar:
8383 return costs->vec_to_scalar_cost;
8384
8385 case scalar_to_vec:
8386 return costs->scalar_to_vec_cost;
8387
8388 case unaligned_load:
8389 return costs->vec_unalign_load_cost;
8390
8391 case unaligned_store:
8392 return costs->vec_unalign_store_cost;
8393
8394 case cond_branch_taken:
8395 return costs->cond_taken_branch_cost;
8396
8397 case cond_branch_not_taken:
8398 return costs->cond_not_taken_branch_cost;
8399
8400 case vec_perm:
8401 return costs->vec_permute_cost;
8402
8403 case vec_promote_demote:
8404 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8405
8406 case vec_construct:
8407 elements = TYPE_VECTOR_SUBPARTS (vectype);
8408 return elements / 2 + 1;
8409
8410 default:
8411 gcc_unreachable ();
8412 }
8413 }
8414
8415 /* Implement targetm.vectorize.add_stmt_cost. */
8416 static unsigned
8417 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8418 struct _stmt_vec_info *stmt_info, int misalign,
8419 enum vect_cost_model_location where)
8420 {
8421 unsigned *cost = (unsigned *) data;
8422 unsigned retval = 0;
8423
8424 if (flag_vect_cost_model)
8425 {
8426 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8427 int stmt_cost =
8428 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8429
8430 /* Statements in an inner loop relative to the loop being
8431 vectorized are weighted more heavily. The value here is
8432 arbitrary and could potentially be improved with analysis. */
8433 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8434 count *= 50; /* FIXME */
8435
8436 retval = (unsigned) (count * stmt_cost);
8437 cost[where] += retval;
8438 }
8439
8440 return retval;
8441 }
8442
8443 static void initialize_aarch64_code_model (struct gcc_options *);
8444
8445 /* Parse the TO_PARSE string and put the architecture struct that it
8446 selects into RES and the architectural features into ISA_FLAGS.
8447 Return an aarch64_parse_opt_result describing the parse result.
8448 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8449
8450 static enum aarch64_parse_opt_result
8451 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8452 unsigned long *isa_flags)
8453 {
8454 char *ext;
8455 const struct processor *arch;
8456 char *str = (char *) alloca (strlen (to_parse) + 1);
8457 size_t len;
8458
8459 strcpy (str, to_parse);
8460
8461 ext = strchr (str, '+');
8462
8463 if (ext != NULL)
8464 len = ext - str;
8465 else
8466 len = strlen (str);
8467
8468 if (len == 0)
8469 return AARCH64_PARSE_MISSING_ARG;
8470
8471
8472 /* Loop through the list of supported ARCHes to find a match. */
8473 for (arch = all_architectures; arch->name != NULL; arch++)
8474 {
8475 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8476 {
8477 unsigned long isa_temp = arch->flags;
8478
8479 if (ext != NULL)
8480 {
8481 /* TO_PARSE string contains at least one extension. */
8482 enum aarch64_parse_opt_result ext_res
8483 = aarch64_parse_extension (ext, &isa_temp);
8484
8485 if (ext_res != AARCH64_PARSE_OK)
8486 return ext_res;
8487 }
8488 /* Extension parsing was successful. Confirm the result
8489 arch and ISA flags. */
8490 *res = arch;
8491 *isa_flags = isa_temp;
8492 return AARCH64_PARSE_OK;
8493 }
8494 }
8495
8496 /* ARCH name not found in list. */
8497 return AARCH64_PARSE_INVALID_ARG;
8498 }
8499
8500 /* Parse the TO_PARSE string and put the result tuning in RES and the
8501 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8502 describing the parse result. If there is an error parsing, RES and
8503 ISA_FLAGS are left unchanged. */
8504
8505 static enum aarch64_parse_opt_result
8506 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8507 unsigned long *isa_flags)
8508 {
8509 char *ext;
8510 const struct processor *cpu;
8511 char *str = (char *) alloca (strlen (to_parse) + 1);
8512 size_t len;
8513
8514 strcpy (str, to_parse);
8515
8516 ext = strchr (str, '+');
8517
8518 if (ext != NULL)
8519 len = ext - str;
8520 else
8521 len = strlen (str);
8522
8523 if (len == 0)
8524 return AARCH64_PARSE_MISSING_ARG;
8525
8526
8527 /* Loop through the list of supported CPUs to find a match. */
8528 for (cpu = all_cores; cpu->name != NULL; cpu++)
8529 {
8530 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8531 {
8532 unsigned long isa_temp = cpu->flags;
8533
8534
8535 if (ext != NULL)
8536 {
8537 /* TO_PARSE string contains at least one extension. */
8538 enum aarch64_parse_opt_result ext_res
8539 = aarch64_parse_extension (ext, &isa_temp);
8540
8541 if (ext_res != AARCH64_PARSE_OK)
8542 return ext_res;
8543 }
8544 /* Extension parsing was successfull. Confirm the result
8545 cpu and ISA flags. */
8546 *res = cpu;
8547 *isa_flags = isa_temp;
8548 return AARCH64_PARSE_OK;
8549 }
8550 }
8551
8552 /* CPU name not found in list. */
8553 return AARCH64_PARSE_INVALID_ARG;
8554 }
8555
8556 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8557 Return an aarch64_parse_opt_result describing the parse result.
8558 If the parsing fails the RES does not change. */
8559
8560 static enum aarch64_parse_opt_result
8561 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8562 {
8563 const struct processor *cpu;
8564 char *str = (char *) alloca (strlen (to_parse) + 1);
8565
8566 strcpy (str, to_parse);
8567
8568 /* Loop through the list of supported CPUs to find a match. */
8569 for (cpu = all_cores; cpu->name != NULL; cpu++)
8570 {
8571 if (strcmp (cpu->name, str) == 0)
8572 {
8573 *res = cpu;
8574 return AARCH64_PARSE_OK;
8575 }
8576 }
8577
8578 /* CPU name not found in list. */
8579 return AARCH64_PARSE_INVALID_ARG;
8580 }
8581
8582 /* Parse TOKEN, which has length LENGTH to see if it is an option
8583 described in FLAG. If it is, return the index bit for that fusion type.
8584 If not, error (printing OPTION_NAME) and return zero. */
8585
8586 static unsigned int
8587 aarch64_parse_one_option_token (const char *token,
8588 size_t length,
8589 const struct aarch64_flag_desc *flag,
8590 const char *option_name)
8591 {
8592 for (; flag->name != NULL; flag++)
8593 {
8594 if (length == strlen (flag->name)
8595 && !strncmp (flag->name, token, length))
8596 return flag->flag;
8597 }
8598
8599 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8600 return 0;
8601 }
8602
8603 /* Parse OPTION which is a comma-separated list of flags to enable.
8604 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8605 default state we inherit from the CPU tuning structures. OPTION_NAME
8606 gives the top-level option we are parsing in the -moverride string,
8607 for use in error messages. */
8608
8609 static unsigned int
8610 aarch64_parse_boolean_options (const char *option,
8611 const struct aarch64_flag_desc *flags,
8612 unsigned int initial_state,
8613 const char *option_name)
8614 {
8615 const char separator = '.';
8616 const char* specs = option;
8617 const char* ntoken = option;
8618 unsigned int found_flags = initial_state;
8619
8620 while ((ntoken = strchr (specs, separator)))
8621 {
8622 size_t token_length = ntoken - specs;
8623 unsigned token_ops = aarch64_parse_one_option_token (specs,
8624 token_length,
8625 flags,
8626 option_name);
8627 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8628 in the token stream, reset the supported operations. So:
8629
8630 adrp+add.cmp+branch.none.adrp+add
8631
8632 would have the result of turning on only adrp+add fusion. */
8633 if (!token_ops)
8634 found_flags = 0;
8635
8636 found_flags |= token_ops;
8637 specs = ++ntoken;
8638 }
8639
8640 /* We ended with a comma, print something. */
8641 if (!(*specs))
8642 {
8643 error ("%s string ill-formed\n", option_name);
8644 return 0;
8645 }
8646
8647 /* We still have one more token to parse. */
8648 size_t token_length = strlen (specs);
8649 unsigned token_ops = aarch64_parse_one_option_token (specs,
8650 token_length,
8651 flags,
8652 option_name);
8653 if (!token_ops)
8654 found_flags = 0;
8655
8656 found_flags |= token_ops;
8657 return found_flags;
8658 }
8659
8660 /* Support for overriding instruction fusion. */
8661
8662 static void
8663 aarch64_parse_fuse_string (const char *fuse_string,
8664 struct tune_params *tune)
8665 {
8666 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8667 aarch64_fusible_pairs,
8668 tune->fusible_ops,
8669 "fuse=");
8670 }
8671
8672 /* Support for overriding other tuning flags. */
8673
8674 static void
8675 aarch64_parse_tune_string (const char *tune_string,
8676 struct tune_params *tune)
8677 {
8678 tune->extra_tuning_flags
8679 = aarch64_parse_boolean_options (tune_string,
8680 aarch64_tuning_flags,
8681 tune->extra_tuning_flags,
8682 "tune=");
8683 }
8684
8685 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8686 we understand. If it is, extract the option string and handoff to
8687 the appropriate function. */
8688
8689 void
8690 aarch64_parse_one_override_token (const char* token,
8691 size_t length,
8692 struct tune_params *tune)
8693 {
8694 const struct aarch64_tuning_override_function *fn
8695 = aarch64_tuning_override_functions;
8696
8697 const char *option_part = strchr (token, '=');
8698 if (!option_part)
8699 {
8700 error ("tuning string missing in option (%s)", token);
8701 return;
8702 }
8703
8704 /* Get the length of the option name. */
8705 length = option_part - token;
8706 /* Skip the '=' to get to the option string. */
8707 option_part++;
8708
8709 for (; fn->name != NULL; fn++)
8710 {
8711 if (!strncmp (fn->name, token, length))
8712 {
8713 fn->parse_override (option_part, tune);
8714 return;
8715 }
8716 }
8717
8718 error ("unknown tuning option (%s)",token);
8719 return;
8720 }
8721
8722 /* A checking mechanism for the implementation of the tls size. */
8723
8724 static void
8725 initialize_aarch64_tls_size (struct gcc_options *opts)
8726 {
8727 if (aarch64_tls_size == 0)
8728 aarch64_tls_size = 24;
8729
8730 switch (opts->x_aarch64_cmodel_var)
8731 {
8732 case AARCH64_CMODEL_TINY:
8733 /* Both the default and maximum TLS size allowed under tiny is 1M which
8734 needs two instructions to address, so we clamp the size to 24. */
8735 if (aarch64_tls_size > 24)
8736 aarch64_tls_size = 24;
8737 break;
8738 case AARCH64_CMODEL_SMALL:
8739 /* The maximum TLS size allowed under small is 4G. */
8740 if (aarch64_tls_size > 32)
8741 aarch64_tls_size = 32;
8742 break;
8743 case AARCH64_CMODEL_LARGE:
8744 /* The maximum TLS size allowed under large is 16E.
8745 FIXME: 16E should be 64bit, we only support 48bit offset now. */
8746 if (aarch64_tls_size > 48)
8747 aarch64_tls_size = 48;
8748 break;
8749 default:
8750 gcc_unreachable ();
8751 }
8752
8753 return;
8754 }
8755
8756 /* Parse STRING looking for options in the format:
8757 string :: option:string
8758 option :: name=substring
8759 name :: {a-z}
8760 substring :: defined by option. */
8761
8762 static void
8763 aarch64_parse_override_string (const char* input_string,
8764 struct tune_params* tune)
8765 {
8766 const char separator = ':';
8767 size_t string_length = strlen (input_string) + 1;
8768 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8769 char *string = string_root;
8770 strncpy (string, input_string, string_length);
8771 string[string_length - 1] = '\0';
8772
8773 char* ntoken = string;
8774
8775 while ((ntoken = strchr (string, separator)))
8776 {
8777 size_t token_length = ntoken - string;
8778 /* Make this substring look like a string. */
8779 *ntoken = '\0';
8780 aarch64_parse_one_override_token (string, token_length, tune);
8781 string = ++ntoken;
8782 }
8783
8784 /* One last option to parse. */
8785 aarch64_parse_one_override_token (string, strlen (string), tune);
8786 free (string_root);
8787 }
8788
8789
8790 static void
8791 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8792 {
8793 /* The logic here is that if we are disabling all frame pointer generation
8794 then we do not need to disable leaf frame pointer generation as a
8795 separate operation. But if we are *only* disabling leaf frame pointer
8796 generation then we set flag_omit_frame_pointer to true, but in
8797 aarch64_frame_pointer_required we return false only for leaf functions.
8798
8799 PR 70044: We have to be careful about being called multiple times for the
8800 same function. Once we have decided to set flag_omit_frame_pointer just
8801 so that we can omit leaf frame pointers, we must then not interpret a
8802 second call as meaning that all frame pointer generation should be
8803 omitted. We do this by setting flag_omit_frame_pointer to a special,
8804 non-zero value. */
8805 if (opts->x_flag_omit_frame_pointer == 2)
8806 opts->x_flag_omit_frame_pointer = 0;
8807
8808 if (opts->x_flag_omit_frame_pointer)
8809 opts->x_flag_omit_leaf_frame_pointer = false;
8810 else if (opts->x_flag_omit_leaf_frame_pointer)
8811 opts->x_flag_omit_frame_pointer = 2;
8812
8813 /* If not optimizing for size, set the default
8814 alignment to what the target wants. */
8815 if (!opts->x_optimize_size)
8816 {
8817 if (opts->x_align_loops <= 0)
8818 opts->x_align_loops = aarch64_tune_params.loop_align;
8819 if (opts->x_align_jumps <= 0)
8820 opts->x_align_jumps = aarch64_tune_params.jump_align;
8821 if (opts->x_align_functions <= 0)
8822 opts->x_align_functions = aarch64_tune_params.function_align;
8823 }
8824
8825 /* We default to no pc-relative literal loads. */
8826
8827 aarch64_pcrelative_literal_loads = false;
8828
8829 /* If -mpc-relative-literal-loads is set on the command line, this
8830 implies that the user asked for PC relative literal loads. */
8831 if (opts->x_pcrelative_literal_loads == 1)
8832 aarch64_pcrelative_literal_loads = true;
8833
8834 /* This is PR70113. When building the Linux kernel with
8835 CONFIG_ARM64_ERRATUM_843419, support for relocations
8836 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8837 removed from the kernel to avoid loading objects with possibly
8838 offending sequences. Without -mpc-relative-literal-loads we would
8839 generate such relocations, preventing the kernel build from
8840 succeeding. */
8841 if (opts->x_pcrelative_literal_loads == 2
8842 && TARGET_FIX_ERR_A53_843419)
8843 aarch64_pcrelative_literal_loads = true;
8844
8845 /* In the tiny memory model it makes no sense to disallow PC relative
8846 literal pool loads. */
8847 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8848 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8849 aarch64_pcrelative_literal_loads = true;
8850
8851 /* When enabling the lower precision Newton series for the square root, also
8852 enable it for the reciprocal square root, since the latter is an
8853 intermediary step for the former. */
8854 if (flag_mlow_precision_sqrt)
8855 flag_mrecip_low_precision_sqrt = true;
8856 }
8857
8858 /* 'Unpack' up the internal tuning structs and update the options
8859 in OPTS. The caller must have set up selected_tune and selected_arch
8860 as all the other target-specific codegen decisions are
8861 derived from them. */
8862
8863 void
8864 aarch64_override_options_internal (struct gcc_options *opts)
8865 {
8866 aarch64_tune_flags = selected_tune->flags;
8867 aarch64_tune = selected_tune->sched_core;
8868 /* Make a copy of the tuning parameters attached to the core, which
8869 we may later overwrite. */
8870 aarch64_tune_params = *(selected_tune->tune);
8871 aarch64_architecture_version = selected_arch->architecture_version;
8872
8873 if (opts->x_aarch64_override_tune_string)
8874 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8875 &aarch64_tune_params);
8876
8877 /* This target defaults to strict volatile bitfields. */
8878 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8879 opts->x_flag_strict_volatile_bitfields = 1;
8880
8881 initialize_aarch64_code_model (opts);
8882 initialize_aarch64_tls_size (opts);
8883
8884 int queue_depth = 0;
8885 switch (aarch64_tune_params.autoprefetcher_model)
8886 {
8887 case tune_params::AUTOPREFETCHER_OFF:
8888 queue_depth = -1;
8889 break;
8890 case tune_params::AUTOPREFETCHER_WEAK:
8891 queue_depth = 0;
8892 break;
8893 case tune_params::AUTOPREFETCHER_STRONG:
8894 queue_depth = max_insn_queue_index + 1;
8895 break;
8896 default:
8897 gcc_unreachable ();
8898 }
8899
8900 /* We don't mind passing in global_options_set here as we don't use
8901 the *options_set structs anyway. */
8902 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8903 queue_depth,
8904 opts->x_param_values,
8905 global_options_set.x_param_values);
8906
8907 /* Set up parameters to be used in prefetching algorithm. Do not
8908 override the defaults unless we are tuning for a core we have
8909 researched values for. */
8910 if (aarch64_tune_params.prefetch->num_slots > 0)
8911 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
8912 aarch64_tune_params.prefetch->num_slots,
8913 opts->x_param_values,
8914 global_options_set.x_param_values);
8915 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
8916 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
8917 aarch64_tune_params.prefetch->l1_cache_size,
8918 opts->x_param_values,
8919 global_options_set.x_param_values);
8920 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
8921 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8922 aarch64_tune_params.prefetch->l1_cache_line_size,
8923 opts->x_param_values,
8924 global_options_set.x_param_values);
8925 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
8926 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
8927 aarch64_tune_params.prefetch->l2_cache_size,
8928 opts->x_param_values,
8929 global_options_set.x_param_values);
8930
8931 /* Enable sw prefetching at specified optimization level for
8932 CPUS that have prefetch. Lower optimization level threshold by 1
8933 when profiling is enabled. */
8934 if (opts->x_flag_prefetch_loop_arrays < 0
8935 && !opts->x_optimize_size
8936 && aarch64_tune_params.prefetch->default_opt_level >= 0
8937 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
8938 opts->x_flag_prefetch_loop_arrays = 1;
8939
8940 aarch64_override_options_after_change_1 (opts);
8941 }
8942
8943 /* Print a hint with a suggestion for a core or architecture name that
8944 most closely resembles what the user passed in STR. ARCH is true if
8945 the user is asking for an architecture name. ARCH is false if the user
8946 is asking for a core name. */
8947
8948 static void
8949 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
8950 {
8951 auto_vec<const char *> candidates;
8952 const struct processor *entry = arch ? all_architectures : all_cores;
8953 for (; entry->name != NULL; entry++)
8954 candidates.safe_push (entry->name);
8955 char *s;
8956 const char *hint = candidates_list_and_hint (str, s, candidates);
8957 if (hint)
8958 inform (input_location, "valid arguments are: %s;"
8959 " did you mean %qs?", s, hint);
8960 XDELETEVEC (s);
8961 }
8962
8963 /* Print a hint with a suggestion for a core name that most closely resembles
8964 what the user passed in STR. */
8965
8966 inline static void
8967 aarch64_print_hint_for_core (const char *str)
8968 {
8969 aarch64_print_hint_for_core_or_arch (str, false);
8970 }
8971
8972 /* Print a hint with a suggestion for an architecture name that most closely
8973 resembles what the user passed in STR. */
8974
8975 inline static void
8976 aarch64_print_hint_for_arch (const char *str)
8977 {
8978 aarch64_print_hint_for_core_or_arch (str, true);
8979 }
8980
8981 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8982 specified in STR and throw errors if appropriate. Put the results if
8983 they are valid in RES and ISA_FLAGS. Return whether the option is
8984 valid. */
8985
8986 static bool
8987 aarch64_validate_mcpu (const char *str, const struct processor **res,
8988 unsigned long *isa_flags)
8989 {
8990 enum aarch64_parse_opt_result parse_res
8991 = aarch64_parse_cpu (str, res, isa_flags);
8992
8993 if (parse_res == AARCH64_PARSE_OK)
8994 return true;
8995
8996 switch (parse_res)
8997 {
8998 case AARCH64_PARSE_MISSING_ARG:
8999 error ("missing cpu name in %<-mcpu=%s%>", str);
9000 break;
9001 case AARCH64_PARSE_INVALID_ARG:
9002 error ("unknown value %qs for -mcpu", str);
9003 aarch64_print_hint_for_core (str);
9004 break;
9005 case AARCH64_PARSE_INVALID_FEATURE:
9006 error ("invalid feature modifier in %<-mcpu=%s%>", str);
9007 break;
9008 default:
9009 gcc_unreachable ();
9010 }
9011
9012 return false;
9013 }
9014
9015 /* Validate a command-line -march option. Parse the arch and extensions
9016 (if any) specified in STR and throw errors if appropriate. Put the
9017 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9018 option is valid. */
9019
9020 static bool
9021 aarch64_validate_march (const char *str, const struct processor **res,
9022 unsigned long *isa_flags)
9023 {
9024 enum aarch64_parse_opt_result parse_res
9025 = aarch64_parse_arch (str, res, isa_flags);
9026
9027 if (parse_res == AARCH64_PARSE_OK)
9028 return true;
9029
9030 switch (parse_res)
9031 {
9032 case AARCH64_PARSE_MISSING_ARG:
9033 error ("missing arch name in %<-march=%s%>", str);
9034 break;
9035 case AARCH64_PARSE_INVALID_ARG:
9036 error ("unknown value %qs for -march", str);
9037 aarch64_print_hint_for_arch (str);
9038 break;
9039 case AARCH64_PARSE_INVALID_FEATURE:
9040 error ("invalid feature modifier in %<-march=%s%>", str);
9041 break;
9042 default:
9043 gcc_unreachable ();
9044 }
9045
9046 return false;
9047 }
9048
9049 /* Validate a command-line -mtune option. Parse the cpu
9050 specified in STR and throw errors if appropriate. Put the
9051 result, if it is valid, in RES. Return whether the option is
9052 valid. */
9053
9054 static bool
9055 aarch64_validate_mtune (const char *str, const struct processor **res)
9056 {
9057 enum aarch64_parse_opt_result parse_res
9058 = aarch64_parse_tune (str, res);
9059
9060 if (parse_res == AARCH64_PARSE_OK)
9061 return true;
9062
9063 switch (parse_res)
9064 {
9065 case AARCH64_PARSE_MISSING_ARG:
9066 error ("missing cpu name in %<-mtune=%s%>", str);
9067 break;
9068 case AARCH64_PARSE_INVALID_ARG:
9069 error ("unknown value %qs for -mtune", str);
9070 aarch64_print_hint_for_core (str);
9071 break;
9072 default:
9073 gcc_unreachable ();
9074 }
9075 return false;
9076 }
9077
9078 /* Return the CPU corresponding to the enum CPU.
9079 If it doesn't specify a cpu, return the default. */
9080
9081 static const struct processor *
9082 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9083 {
9084 if (cpu != aarch64_none)
9085 return &all_cores[cpu];
9086
9087 /* The & 0x3f is to extract the bottom 6 bits that encode the
9088 default cpu as selected by the --with-cpu GCC configure option
9089 in config.gcc.
9090 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9091 flags mechanism should be reworked to make it more sane. */
9092 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9093 }
9094
9095 /* Return the architecture corresponding to the enum ARCH.
9096 If it doesn't specify a valid architecture, return the default. */
9097
9098 static const struct processor *
9099 aarch64_get_arch (enum aarch64_arch arch)
9100 {
9101 if (arch != aarch64_no_arch)
9102 return &all_architectures[arch];
9103
9104 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9105
9106 return &all_architectures[cpu->arch];
9107 }
9108
9109 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9110 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9111 tuning structs. In particular it must set selected_tune and
9112 aarch64_isa_flags that define the available ISA features and tuning
9113 decisions. It must also set selected_arch as this will be used to
9114 output the .arch asm tags for each function. */
9115
9116 static void
9117 aarch64_override_options (void)
9118 {
9119 unsigned long cpu_isa = 0;
9120 unsigned long arch_isa = 0;
9121 aarch64_isa_flags = 0;
9122
9123 bool valid_cpu = true;
9124 bool valid_tune = true;
9125 bool valid_arch = true;
9126
9127 selected_cpu = NULL;
9128 selected_arch = NULL;
9129 selected_tune = NULL;
9130
9131 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9132 If either of -march or -mtune is given, they override their
9133 respective component of -mcpu. */
9134 if (aarch64_cpu_string)
9135 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9136 &cpu_isa);
9137
9138 if (aarch64_arch_string)
9139 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9140 &arch_isa);
9141
9142 if (aarch64_tune_string)
9143 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9144
9145 /* If the user did not specify a processor, choose the default
9146 one for them. This will be the CPU set during configuration using
9147 --with-cpu, otherwise it is "generic". */
9148 if (!selected_cpu)
9149 {
9150 if (selected_arch)
9151 {
9152 selected_cpu = &all_cores[selected_arch->ident];
9153 aarch64_isa_flags = arch_isa;
9154 explicit_arch = selected_arch->arch;
9155 }
9156 else
9157 {
9158 /* Get default configure-time CPU. */
9159 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9160 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9161 }
9162
9163 if (selected_tune)
9164 explicit_tune_core = selected_tune->ident;
9165 }
9166 /* If both -mcpu and -march are specified check that they are architecturally
9167 compatible, warn if they're not and prefer the -march ISA flags. */
9168 else if (selected_arch)
9169 {
9170 if (selected_arch->arch != selected_cpu->arch)
9171 {
9172 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9173 all_architectures[selected_cpu->arch].name,
9174 selected_arch->name);
9175 }
9176 aarch64_isa_flags = arch_isa;
9177 explicit_arch = selected_arch->arch;
9178 explicit_tune_core = selected_tune ? selected_tune->ident
9179 : selected_cpu->ident;
9180 }
9181 else
9182 {
9183 /* -mcpu but no -march. */
9184 aarch64_isa_flags = cpu_isa;
9185 explicit_tune_core = selected_tune ? selected_tune->ident
9186 : selected_cpu->ident;
9187 gcc_assert (selected_cpu);
9188 selected_arch = &all_architectures[selected_cpu->arch];
9189 explicit_arch = selected_arch->arch;
9190 }
9191
9192 /* Set the arch as well as we will need it when outputing
9193 the .arch directive in assembly. */
9194 if (!selected_arch)
9195 {
9196 gcc_assert (selected_cpu);
9197 selected_arch = &all_architectures[selected_cpu->arch];
9198 }
9199
9200 if (!selected_tune)
9201 selected_tune = selected_cpu;
9202
9203 #ifndef HAVE_AS_MABI_OPTION
9204 /* The compiler may have been configured with 2.23.* binutils, which does
9205 not have support for ILP32. */
9206 if (TARGET_ILP32)
9207 error ("Assembler does not support -mabi=ilp32");
9208 #endif
9209
9210 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9211 sorry ("Return address signing is only supported for -mabi=lp64");
9212
9213 /* Make sure we properly set up the explicit options. */
9214 if ((aarch64_cpu_string && valid_cpu)
9215 || (aarch64_tune_string && valid_tune))
9216 gcc_assert (explicit_tune_core != aarch64_none);
9217
9218 if ((aarch64_cpu_string && valid_cpu)
9219 || (aarch64_arch_string && valid_arch))
9220 gcc_assert (explicit_arch != aarch64_no_arch);
9221
9222 aarch64_override_options_internal (&global_options);
9223
9224 /* Save these options as the default ones in case we push and pop them later
9225 while processing functions with potential target attributes. */
9226 target_option_default_node = target_option_current_node
9227 = build_target_option_node (&global_options);
9228 }
9229
9230 /* Implement targetm.override_options_after_change. */
9231
9232 static void
9233 aarch64_override_options_after_change (void)
9234 {
9235 aarch64_override_options_after_change_1 (&global_options);
9236 }
9237
9238 static struct machine_function *
9239 aarch64_init_machine_status (void)
9240 {
9241 struct machine_function *machine;
9242 machine = ggc_cleared_alloc<machine_function> ();
9243 return machine;
9244 }
9245
9246 void
9247 aarch64_init_expanders (void)
9248 {
9249 init_machine_status = aarch64_init_machine_status;
9250 }
9251
9252 /* A checking mechanism for the implementation of the various code models. */
9253 static void
9254 initialize_aarch64_code_model (struct gcc_options *opts)
9255 {
9256 if (opts->x_flag_pic)
9257 {
9258 switch (opts->x_aarch64_cmodel_var)
9259 {
9260 case AARCH64_CMODEL_TINY:
9261 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9262 break;
9263 case AARCH64_CMODEL_SMALL:
9264 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9265 aarch64_cmodel = (flag_pic == 2
9266 ? AARCH64_CMODEL_SMALL_PIC
9267 : AARCH64_CMODEL_SMALL_SPIC);
9268 #else
9269 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9270 #endif
9271 break;
9272 case AARCH64_CMODEL_LARGE:
9273 sorry ("code model %qs with -f%s", "large",
9274 opts->x_flag_pic > 1 ? "PIC" : "pic");
9275 break;
9276 default:
9277 gcc_unreachable ();
9278 }
9279 }
9280 else
9281 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9282 }
9283
9284 /* Implement TARGET_OPTION_SAVE. */
9285
9286 static void
9287 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9288 {
9289 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9290 }
9291
9292 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9293 using the information saved in PTR. */
9294
9295 static void
9296 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9297 {
9298 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9299 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9300 opts->x_explicit_arch = ptr->x_explicit_arch;
9301 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9302 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9303
9304 aarch64_override_options_internal (opts);
9305 }
9306
9307 /* Implement TARGET_OPTION_PRINT. */
9308
9309 static void
9310 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9311 {
9312 const struct processor *cpu
9313 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9314 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9315 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9316 std::string extension
9317 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9318
9319 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9320 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9321 arch->name, extension.c_str ());
9322 }
9323
9324 static GTY(()) tree aarch64_previous_fndecl;
9325
9326 void
9327 aarch64_reset_previous_fndecl (void)
9328 {
9329 aarch64_previous_fndecl = NULL;
9330 }
9331
9332 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9333 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9334 make sure optab availability predicates are recomputed when necessary. */
9335
9336 void
9337 aarch64_save_restore_target_globals (tree new_tree)
9338 {
9339 if (TREE_TARGET_GLOBALS (new_tree))
9340 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9341 else if (new_tree == target_option_default_node)
9342 restore_target_globals (&default_target_globals);
9343 else
9344 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9345 }
9346
9347 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9348 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9349 of the function, if such exists. This function may be called multiple
9350 times on a single function so use aarch64_previous_fndecl to avoid
9351 setting up identical state. */
9352
9353 static void
9354 aarch64_set_current_function (tree fndecl)
9355 {
9356 if (!fndecl || fndecl == aarch64_previous_fndecl)
9357 return;
9358
9359 tree old_tree = (aarch64_previous_fndecl
9360 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9361 : NULL_TREE);
9362
9363 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9364
9365 /* If current function has no attributes but the previous one did,
9366 use the default node. */
9367 if (!new_tree && old_tree)
9368 new_tree = target_option_default_node;
9369
9370 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9371 the default have been handled by aarch64_save_restore_target_globals from
9372 aarch64_pragma_target_parse. */
9373 if (old_tree == new_tree)
9374 return;
9375
9376 aarch64_previous_fndecl = fndecl;
9377
9378 /* First set the target options. */
9379 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9380
9381 aarch64_save_restore_target_globals (new_tree);
9382 }
9383
9384 /* Enum describing the various ways we can handle attributes.
9385 In many cases we can reuse the generic option handling machinery. */
9386
9387 enum aarch64_attr_opt_type
9388 {
9389 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9390 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9391 aarch64_attr_enum, /* Attribute sets an enum variable. */
9392 aarch64_attr_custom /* Attribute requires a custom handling function. */
9393 };
9394
9395 /* All the information needed to handle a target attribute.
9396 NAME is the name of the attribute.
9397 ATTR_TYPE specifies the type of behavior of the attribute as described
9398 in the definition of enum aarch64_attr_opt_type.
9399 ALLOW_NEG is true if the attribute supports a "no-" form.
9400 HANDLER is the function that takes the attribute string and whether
9401 it is a pragma or attribute and handles the option. It is needed only
9402 when the ATTR_TYPE is aarch64_attr_custom.
9403 OPT_NUM is the enum specifying the option that the attribute modifies.
9404 This is needed for attributes that mirror the behavior of a command-line
9405 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9406 aarch64_attr_enum. */
9407
9408 struct aarch64_attribute_info
9409 {
9410 const char *name;
9411 enum aarch64_attr_opt_type attr_type;
9412 bool allow_neg;
9413 bool (*handler) (const char *, const char *);
9414 enum opt_code opt_num;
9415 };
9416
9417 /* Handle the ARCH_STR argument to the arch= target attribute.
9418 PRAGMA_OR_ATTR is used in potential error messages. */
9419
9420 static bool
9421 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9422 {
9423 const struct processor *tmp_arch = NULL;
9424 enum aarch64_parse_opt_result parse_res
9425 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9426
9427 if (parse_res == AARCH64_PARSE_OK)
9428 {
9429 gcc_assert (tmp_arch);
9430 selected_arch = tmp_arch;
9431 explicit_arch = selected_arch->arch;
9432 return true;
9433 }
9434
9435 switch (parse_res)
9436 {
9437 case AARCH64_PARSE_MISSING_ARG:
9438 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9439 break;
9440 case AARCH64_PARSE_INVALID_ARG:
9441 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9442 aarch64_print_hint_for_arch (str);
9443 break;
9444 case AARCH64_PARSE_INVALID_FEATURE:
9445 error ("invalid feature modifier %qs for 'arch' target %s",
9446 str, pragma_or_attr);
9447 break;
9448 default:
9449 gcc_unreachable ();
9450 }
9451
9452 return false;
9453 }
9454
9455 /* Handle the argument CPU_STR to the cpu= target attribute.
9456 PRAGMA_OR_ATTR is used in potential error messages. */
9457
9458 static bool
9459 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9460 {
9461 const struct processor *tmp_cpu = NULL;
9462 enum aarch64_parse_opt_result parse_res
9463 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9464
9465 if (parse_res == AARCH64_PARSE_OK)
9466 {
9467 gcc_assert (tmp_cpu);
9468 selected_tune = tmp_cpu;
9469 explicit_tune_core = selected_tune->ident;
9470
9471 selected_arch = &all_architectures[tmp_cpu->arch];
9472 explicit_arch = selected_arch->arch;
9473 return true;
9474 }
9475
9476 switch (parse_res)
9477 {
9478 case AARCH64_PARSE_MISSING_ARG:
9479 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9480 break;
9481 case AARCH64_PARSE_INVALID_ARG:
9482 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9483 aarch64_print_hint_for_core (str);
9484 break;
9485 case AARCH64_PARSE_INVALID_FEATURE:
9486 error ("invalid feature modifier %qs for 'cpu' target %s",
9487 str, pragma_or_attr);
9488 break;
9489 default:
9490 gcc_unreachable ();
9491 }
9492
9493 return false;
9494 }
9495
9496 /* Handle the argument STR to the tune= target attribute.
9497 PRAGMA_OR_ATTR is used in potential error messages. */
9498
9499 static bool
9500 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9501 {
9502 const struct processor *tmp_tune = NULL;
9503 enum aarch64_parse_opt_result parse_res
9504 = aarch64_parse_tune (str, &tmp_tune);
9505
9506 if (parse_res == AARCH64_PARSE_OK)
9507 {
9508 gcc_assert (tmp_tune);
9509 selected_tune = tmp_tune;
9510 explicit_tune_core = selected_tune->ident;
9511 return true;
9512 }
9513
9514 switch (parse_res)
9515 {
9516 case AARCH64_PARSE_INVALID_ARG:
9517 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9518 aarch64_print_hint_for_core (str);
9519 break;
9520 default:
9521 gcc_unreachable ();
9522 }
9523
9524 return false;
9525 }
9526
9527 /* Parse an architecture extensions target attribute string specified in STR.
9528 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9529 if successful. Update aarch64_isa_flags to reflect the ISA features
9530 modified.
9531 PRAGMA_OR_ATTR is used in potential error messages. */
9532
9533 static bool
9534 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9535 {
9536 enum aarch64_parse_opt_result parse_res;
9537 unsigned long isa_flags = aarch64_isa_flags;
9538
9539 /* We allow "+nothing" in the beginning to clear out all architectural
9540 features if the user wants to handpick specific features. */
9541 if (strncmp ("+nothing", str, 8) == 0)
9542 {
9543 isa_flags = 0;
9544 str += 8;
9545 }
9546
9547 parse_res = aarch64_parse_extension (str, &isa_flags);
9548
9549 if (parse_res == AARCH64_PARSE_OK)
9550 {
9551 aarch64_isa_flags = isa_flags;
9552 return true;
9553 }
9554
9555 switch (parse_res)
9556 {
9557 case AARCH64_PARSE_MISSING_ARG:
9558 error ("missing feature modifier in target %s %qs",
9559 pragma_or_attr, str);
9560 break;
9561
9562 case AARCH64_PARSE_INVALID_FEATURE:
9563 error ("invalid feature modifier in target %s %qs",
9564 pragma_or_attr, str);
9565 break;
9566
9567 default:
9568 gcc_unreachable ();
9569 }
9570
9571 return false;
9572 }
9573
9574 /* The target attributes that we support. On top of these we also support just
9575 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9576 handled explicitly in aarch64_process_one_target_attr. */
9577
9578 static const struct aarch64_attribute_info aarch64_attributes[] =
9579 {
9580 { "general-regs-only", aarch64_attr_mask, false, NULL,
9581 OPT_mgeneral_regs_only },
9582 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9583 OPT_mfix_cortex_a53_835769 },
9584 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9585 OPT_mfix_cortex_a53_843419 },
9586 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9587 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9588 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9589 OPT_momit_leaf_frame_pointer },
9590 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9591 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9592 OPT_march_ },
9593 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9594 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9595 OPT_mtune_ },
9596 { "sign-return-address", aarch64_attr_enum, false, NULL,
9597 OPT_msign_return_address_ },
9598 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9599 };
9600
9601 /* Parse ARG_STR which contains the definition of one target attribute.
9602 Show appropriate errors if any or return true if the attribute is valid.
9603 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9604 we're processing a target attribute or pragma. */
9605
9606 static bool
9607 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9608 {
9609 bool invert = false;
9610
9611 size_t len = strlen (arg_str);
9612
9613 if (len == 0)
9614 {
9615 error ("malformed target %s", pragma_or_attr);
9616 return false;
9617 }
9618
9619 char *str_to_check = (char *) alloca (len + 1);
9620 strcpy (str_to_check, arg_str);
9621
9622 /* Skip leading whitespace. */
9623 while (*str_to_check == ' ' || *str_to_check == '\t')
9624 str_to_check++;
9625
9626 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9627 It is easier to detect and handle it explicitly here rather than going
9628 through the machinery for the rest of the target attributes in this
9629 function. */
9630 if (*str_to_check == '+')
9631 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9632
9633 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9634 {
9635 invert = true;
9636 str_to_check += 3;
9637 }
9638 char *arg = strchr (str_to_check, '=');
9639
9640 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9641 and point ARG to "foo". */
9642 if (arg)
9643 {
9644 *arg = '\0';
9645 arg++;
9646 }
9647 const struct aarch64_attribute_info *p_attr;
9648 bool found = false;
9649 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9650 {
9651 /* If the names don't match up, or the user has given an argument
9652 to an attribute that doesn't accept one, or didn't give an argument
9653 to an attribute that expects one, fail to match. */
9654 if (strcmp (str_to_check, p_attr->name) != 0)
9655 continue;
9656
9657 found = true;
9658 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9659 || p_attr->attr_type == aarch64_attr_enum;
9660
9661 if (attr_need_arg_p ^ (arg != NULL))
9662 {
9663 error ("target %s %qs does not accept an argument",
9664 pragma_or_attr, str_to_check);
9665 return false;
9666 }
9667
9668 /* If the name matches but the attribute does not allow "no-" versions
9669 then we can't match. */
9670 if (invert && !p_attr->allow_neg)
9671 {
9672 error ("target %s %qs does not allow a negated form",
9673 pragma_or_attr, str_to_check);
9674 return false;
9675 }
9676
9677 switch (p_attr->attr_type)
9678 {
9679 /* Has a custom handler registered.
9680 For example, cpu=, arch=, tune=. */
9681 case aarch64_attr_custom:
9682 gcc_assert (p_attr->handler);
9683 if (!p_attr->handler (arg, pragma_or_attr))
9684 return false;
9685 break;
9686
9687 /* Either set or unset a boolean option. */
9688 case aarch64_attr_bool:
9689 {
9690 struct cl_decoded_option decoded;
9691
9692 generate_option (p_attr->opt_num, NULL, !invert,
9693 CL_TARGET, &decoded);
9694 aarch64_handle_option (&global_options, &global_options_set,
9695 &decoded, input_location);
9696 break;
9697 }
9698 /* Set or unset a bit in the target_flags. aarch64_handle_option
9699 should know what mask to apply given the option number. */
9700 case aarch64_attr_mask:
9701 {
9702 struct cl_decoded_option decoded;
9703 /* We only need to specify the option number.
9704 aarch64_handle_option will know which mask to apply. */
9705 decoded.opt_index = p_attr->opt_num;
9706 decoded.value = !invert;
9707 aarch64_handle_option (&global_options, &global_options_set,
9708 &decoded, input_location);
9709 break;
9710 }
9711 /* Use the option setting machinery to set an option to an enum. */
9712 case aarch64_attr_enum:
9713 {
9714 gcc_assert (arg);
9715 bool valid;
9716 int value;
9717 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9718 &value, CL_TARGET);
9719 if (valid)
9720 {
9721 set_option (&global_options, NULL, p_attr->opt_num, value,
9722 NULL, DK_UNSPECIFIED, input_location,
9723 global_dc);
9724 }
9725 else
9726 {
9727 error ("target %s %s=%s is not valid",
9728 pragma_or_attr, str_to_check, arg);
9729 }
9730 break;
9731 }
9732 default:
9733 gcc_unreachable ();
9734 }
9735 }
9736
9737 /* If we reached here we either have found an attribute and validated
9738 it or didn't match any. If we matched an attribute but its arguments
9739 were malformed we will have returned false already. */
9740 return found;
9741 }
9742
9743 /* Count how many times the character C appears in
9744 NULL-terminated string STR. */
9745
9746 static unsigned int
9747 num_occurences_in_str (char c, char *str)
9748 {
9749 unsigned int res = 0;
9750 while (*str != '\0')
9751 {
9752 if (*str == c)
9753 res++;
9754
9755 str++;
9756 }
9757
9758 return res;
9759 }
9760
9761 /* Parse the tree in ARGS that contains the target attribute information
9762 and update the global target options space. PRAGMA_OR_ATTR is a string
9763 to be used in error messages, specifying whether this is processing
9764 a target attribute or a target pragma. */
9765
9766 bool
9767 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9768 {
9769 if (TREE_CODE (args) == TREE_LIST)
9770 {
9771 do
9772 {
9773 tree head = TREE_VALUE (args);
9774 if (head)
9775 {
9776 if (!aarch64_process_target_attr (head, pragma_or_attr))
9777 return false;
9778 }
9779 args = TREE_CHAIN (args);
9780 } while (args);
9781
9782 return true;
9783 }
9784
9785 if (TREE_CODE (args) != STRING_CST)
9786 {
9787 error ("attribute %<target%> argument not a string");
9788 return false;
9789 }
9790
9791 size_t len = strlen (TREE_STRING_POINTER (args));
9792 char *str_to_check = (char *) alloca (len + 1);
9793 strcpy (str_to_check, TREE_STRING_POINTER (args));
9794
9795 if (len == 0)
9796 {
9797 error ("malformed target %s value", pragma_or_attr);
9798 return false;
9799 }
9800
9801 /* Used to catch empty spaces between commas i.e.
9802 attribute ((target ("attr1,,attr2"))). */
9803 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9804
9805 /* Handle multiple target attributes separated by ','. */
9806 char *token = strtok (str_to_check, ",");
9807
9808 unsigned int num_attrs = 0;
9809 while (token)
9810 {
9811 num_attrs++;
9812 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9813 {
9814 error ("target %s %qs is invalid", pragma_or_attr, token);
9815 return false;
9816 }
9817
9818 token = strtok (NULL, ",");
9819 }
9820
9821 if (num_attrs != num_commas + 1)
9822 {
9823 error ("malformed target %s list %qs",
9824 pragma_or_attr, TREE_STRING_POINTER (args));
9825 return false;
9826 }
9827
9828 return true;
9829 }
9830
9831 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9832 process attribute ((target ("..."))). */
9833
9834 static bool
9835 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9836 {
9837 struct cl_target_option cur_target;
9838 bool ret;
9839 tree old_optimize;
9840 tree new_target, new_optimize;
9841 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9842
9843 /* If what we're processing is the current pragma string then the
9844 target option node is already stored in target_option_current_node
9845 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9846 having to re-parse the string. This is especially useful to keep
9847 arm_neon.h compile times down since that header contains a lot
9848 of intrinsics enclosed in pragmas. */
9849 if (!existing_target && args == current_target_pragma)
9850 {
9851 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9852 return true;
9853 }
9854 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9855
9856 old_optimize = build_optimization_node (&global_options);
9857 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9858
9859 /* If the function changed the optimization levels as well as setting
9860 target options, start with the optimizations specified. */
9861 if (func_optimize && func_optimize != old_optimize)
9862 cl_optimization_restore (&global_options,
9863 TREE_OPTIMIZATION (func_optimize));
9864
9865 /* Save the current target options to restore at the end. */
9866 cl_target_option_save (&cur_target, &global_options);
9867
9868 /* If fndecl already has some target attributes applied to it, unpack
9869 them so that we add this attribute on top of them, rather than
9870 overwriting them. */
9871 if (existing_target)
9872 {
9873 struct cl_target_option *existing_options
9874 = TREE_TARGET_OPTION (existing_target);
9875
9876 if (existing_options)
9877 cl_target_option_restore (&global_options, existing_options);
9878 }
9879 else
9880 cl_target_option_restore (&global_options,
9881 TREE_TARGET_OPTION (target_option_current_node));
9882
9883
9884 ret = aarch64_process_target_attr (args, "attribute");
9885
9886 /* Set up any additional state. */
9887 if (ret)
9888 {
9889 aarch64_override_options_internal (&global_options);
9890 /* Initialize SIMD builtins if we haven't already.
9891 Set current_target_pragma to NULL for the duration so that
9892 the builtin initialization code doesn't try to tag the functions
9893 being built with the attributes specified by any current pragma, thus
9894 going into an infinite recursion. */
9895 if (TARGET_SIMD)
9896 {
9897 tree saved_current_target_pragma = current_target_pragma;
9898 current_target_pragma = NULL;
9899 aarch64_init_simd_builtins ();
9900 current_target_pragma = saved_current_target_pragma;
9901 }
9902 new_target = build_target_option_node (&global_options);
9903 }
9904 else
9905 new_target = NULL;
9906
9907 new_optimize = build_optimization_node (&global_options);
9908
9909 if (fndecl && ret)
9910 {
9911 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9912
9913 if (old_optimize != new_optimize)
9914 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9915 }
9916
9917 cl_target_option_restore (&global_options, &cur_target);
9918
9919 if (old_optimize != new_optimize)
9920 cl_optimization_restore (&global_options,
9921 TREE_OPTIMIZATION (old_optimize));
9922 return ret;
9923 }
9924
9925 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9926 tri-bool options (yes, no, don't care) and the default value is
9927 DEF, determine whether to reject inlining. */
9928
9929 static bool
9930 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9931 int dont_care, int def)
9932 {
9933 /* If the callee doesn't care, always allow inlining. */
9934 if (callee == dont_care)
9935 return true;
9936
9937 /* If the caller doesn't care, always allow inlining. */
9938 if (caller == dont_care)
9939 return true;
9940
9941 /* Otherwise, allow inlining if either the callee and caller values
9942 agree, or if the callee is using the default value. */
9943 return (callee == caller || callee == def);
9944 }
9945
9946 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9947 to inline CALLEE into CALLER based on target-specific info.
9948 Make sure that the caller and callee have compatible architectural
9949 features. Then go through the other possible target attributes
9950 and see if they can block inlining. Try not to reject always_inline
9951 callees unless they are incompatible architecturally. */
9952
9953 static bool
9954 aarch64_can_inline_p (tree caller, tree callee)
9955 {
9956 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9957 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9958
9959 /* If callee has no option attributes, then it is ok to inline. */
9960 if (!callee_tree)
9961 return true;
9962
9963 struct cl_target_option *caller_opts
9964 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9965 : target_option_default_node);
9966
9967 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9968
9969
9970 /* Callee's ISA flags should be a subset of the caller's. */
9971 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9972 != callee_opts->x_aarch64_isa_flags)
9973 return false;
9974
9975 /* Allow non-strict aligned functions inlining into strict
9976 aligned ones. */
9977 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9978 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9979 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9980 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9981 return false;
9982
9983 bool always_inline = lookup_attribute ("always_inline",
9984 DECL_ATTRIBUTES (callee));
9985
9986 /* If the architectural features match up and the callee is always_inline
9987 then the other attributes don't matter. */
9988 if (always_inline)
9989 return true;
9990
9991 if (caller_opts->x_aarch64_cmodel_var
9992 != callee_opts->x_aarch64_cmodel_var)
9993 return false;
9994
9995 if (caller_opts->x_aarch64_tls_dialect
9996 != callee_opts->x_aarch64_tls_dialect)
9997 return false;
9998
9999 /* Honour explicit requests to workaround errata. */
10000 if (!aarch64_tribools_ok_for_inlining_p (
10001 caller_opts->x_aarch64_fix_a53_err835769,
10002 callee_opts->x_aarch64_fix_a53_err835769,
10003 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10004 return false;
10005
10006 if (!aarch64_tribools_ok_for_inlining_p (
10007 caller_opts->x_aarch64_fix_a53_err843419,
10008 callee_opts->x_aarch64_fix_a53_err843419,
10009 2, TARGET_FIX_ERR_A53_843419))
10010 return false;
10011
10012 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10013 caller and calle and they don't match up, reject inlining. */
10014 if (!aarch64_tribools_ok_for_inlining_p (
10015 caller_opts->x_flag_omit_leaf_frame_pointer,
10016 callee_opts->x_flag_omit_leaf_frame_pointer,
10017 2, 1))
10018 return false;
10019
10020 /* If the callee has specific tuning overrides, respect them. */
10021 if (callee_opts->x_aarch64_override_tune_string != NULL
10022 && caller_opts->x_aarch64_override_tune_string == NULL)
10023 return false;
10024
10025 /* If the user specified tuning override strings for the
10026 caller and callee and they don't match up, reject inlining.
10027 We just do a string compare here, we don't analyze the meaning
10028 of the string, as it would be too costly for little gain. */
10029 if (callee_opts->x_aarch64_override_tune_string
10030 && caller_opts->x_aarch64_override_tune_string
10031 && (strcmp (callee_opts->x_aarch64_override_tune_string,
10032 caller_opts->x_aarch64_override_tune_string) != 0))
10033 return false;
10034
10035 return true;
10036 }
10037
10038 /* Return true if SYMBOL_REF X binds locally. */
10039
10040 static bool
10041 aarch64_symbol_binds_local_p (const_rtx x)
10042 {
10043 return (SYMBOL_REF_DECL (x)
10044 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10045 : SYMBOL_REF_LOCAL_P (x));
10046 }
10047
10048 /* Return true if SYMBOL_REF X is thread local */
10049 static bool
10050 aarch64_tls_symbol_p (rtx x)
10051 {
10052 if (! TARGET_HAVE_TLS)
10053 return false;
10054
10055 if (GET_CODE (x) != SYMBOL_REF)
10056 return false;
10057
10058 return SYMBOL_REF_TLS_MODEL (x) != 0;
10059 }
10060
10061 /* Classify a TLS symbol into one of the TLS kinds. */
10062 enum aarch64_symbol_type
10063 aarch64_classify_tls_symbol (rtx x)
10064 {
10065 enum tls_model tls_kind = tls_symbolic_operand_type (x);
10066
10067 switch (tls_kind)
10068 {
10069 case TLS_MODEL_GLOBAL_DYNAMIC:
10070 case TLS_MODEL_LOCAL_DYNAMIC:
10071 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10072
10073 case TLS_MODEL_INITIAL_EXEC:
10074 switch (aarch64_cmodel)
10075 {
10076 case AARCH64_CMODEL_TINY:
10077 case AARCH64_CMODEL_TINY_PIC:
10078 return SYMBOL_TINY_TLSIE;
10079 default:
10080 return SYMBOL_SMALL_TLSIE;
10081 }
10082
10083 case TLS_MODEL_LOCAL_EXEC:
10084 if (aarch64_tls_size == 12)
10085 return SYMBOL_TLSLE12;
10086 else if (aarch64_tls_size == 24)
10087 return SYMBOL_TLSLE24;
10088 else if (aarch64_tls_size == 32)
10089 return SYMBOL_TLSLE32;
10090 else if (aarch64_tls_size == 48)
10091 return SYMBOL_TLSLE48;
10092 else
10093 gcc_unreachable ();
10094
10095 case TLS_MODEL_EMULATED:
10096 case TLS_MODEL_NONE:
10097 return SYMBOL_FORCE_TO_MEM;
10098
10099 default:
10100 gcc_unreachable ();
10101 }
10102 }
10103
10104 /* Return the method that should be used to access SYMBOL_REF or
10105 LABEL_REF X. */
10106
10107 enum aarch64_symbol_type
10108 aarch64_classify_symbol (rtx x, rtx offset)
10109 {
10110 if (GET_CODE (x) == LABEL_REF)
10111 {
10112 switch (aarch64_cmodel)
10113 {
10114 case AARCH64_CMODEL_LARGE:
10115 return SYMBOL_FORCE_TO_MEM;
10116
10117 case AARCH64_CMODEL_TINY_PIC:
10118 case AARCH64_CMODEL_TINY:
10119 return SYMBOL_TINY_ABSOLUTE;
10120
10121 case AARCH64_CMODEL_SMALL_SPIC:
10122 case AARCH64_CMODEL_SMALL_PIC:
10123 case AARCH64_CMODEL_SMALL:
10124 return SYMBOL_SMALL_ABSOLUTE;
10125
10126 default:
10127 gcc_unreachable ();
10128 }
10129 }
10130
10131 if (GET_CODE (x) == SYMBOL_REF)
10132 {
10133 if (aarch64_tls_symbol_p (x))
10134 return aarch64_classify_tls_symbol (x);
10135
10136 switch (aarch64_cmodel)
10137 {
10138 case AARCH64_CMODEL_TINY:
10139 /* When we retrieve symbol + offset address, we have to make sure
10140 the offset does not cause overflow of the final address. But
10141 we have no way of knowing the address of symbol at compile time
10142 so we can't accurately say if the distance between the PC and
10143 symbol + offset is outside the addressible range of +/-1M in the
10144 TINY code model. So we rely on images not being greater than
10145 1M and cap the offset at 1M and anything beyond 1M will have to
10146 be loaded using an alternative mechanism. Furthermore if the
10147 symbol is a weak reference to something that isn't known to
10148 resolve to a symbol in this module, then force to memory. */
10149 if ((SYMBOL_REF_WEAK (x)
10150 && !aarch64_symbol_binds_local_p (x))
10151 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10152 return SYMBOL_FORCE_TO_MEM;
10153 return SYMBOL_TINY_ABSOLUTE;
10154
10155 case AARCH64_CMODEL_SMALL:
10156 /* Same reasoning as the tiny code model, but the offset cap here is
10157 4G. */
10158 if ((SYMBOL_REF_WEAK (x)
10159 && !aarch64_symbol_binds_local_p (x))
10160 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10161 HOST_WIDE_INT_C (4294967264)))
10162 return SYMBOL_FORCE_TO_MEM;
10163 return SYMBOL_SMALL_ABSOLUTE;
10164
10165 case AARCH64_CMODEL_TINY_PIC:
10166 if (!aarch64_symbol_binds_local_p (x))
10167 return SYMBOL_TINY_GOT;
10168 return SYMBOL_TINY_ABSOLUTE;
10169
10170 case AARCH64_CMODEL_SMALL_SPIC:
10171 case AARCH64_CMODEL_SMALL_PIC:
10172 if (!aarch64_symbol_binds_local_p (x))
10173 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10174 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10175 return SYMBOL_SMALL_ABSOLUTE;
10176
10177 case AARCH64_CMODEL_LARGE:
10178 /* This is alright even in PIC code as the constant
10179 pool reference is always PC relative and within
10180 the same translation unit. */
10181 if (CONSTANT_POOL_ADDRESS_P (x))
10182 return SYMBOL_SMALL_ABSOLUTE;
10183 else
10184 return SYMBOL_FORCE_TO_MEM;
10185
10186 default:
10187 gcc_unreachable ();
10188 }
10189 }
10190
10191 /* By default push everything into the constant pool. */
10192 return SYMBOL_FORCE_TO_MEM;
10193 }
10194
10195 bool
10196 aarch64_constant_address_p (rtx x)
10197 {
10198 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10199 }
10200
10201 bool
10202 aarch64_legitimate_pic_operand_p (rtx x)
10203 {
10204 if (GET_CODE (x) == SYMBOL_REF
10205 || (GET_CODE (x) == CONST
10206 && GET_CODE (XEXP (x, 0)) == PLUS
10207 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10208 return false;
10209
10210 return true;
10211 }
10212
10213 /* Return true if X holds either a quarter-precision or
10214 floating-point +0.0 constant. */
10215 static bool
10216 aarch64_valid_floating_const (machine_mode mode, rtx x)
10217 {
10218 if (!CONST_DOUBLE_P (x))
10219 return false;
10220
10221 if (aarch64_float_const_zero_rtx_p (x))
10222 return true;
10223
10224 /* We only handle moving 0.0 to a TFmode register. */
10225 if (!(mode == SFmode || mode == DFmode))
10226 return false;
10227
10228 return aarch64_float_const_representable_p (x);
10229 }
10230
10231 static bool
10232 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10233 {
10234 /* Do not allow vector struct mode constants. We could support
10235 0 and -1 easily, but they need support in aarch64-simd.md. */
10236 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10237 return false;
10238
10239 /* This could probably go away because
10240 we now decompose CONST_INTs according to expand_mov_immediate. */
10241 if ((GET_CODE (x) == CONST_VECTOR
10242 && aarch64_simd_valid_immediate (x, mode, false, NULL))
10243 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
10244 return !targetm.cannot_force_const_mem (mode, x);
10245
10246 if (GET_CODE (x) == HIGH
10247 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10248 return true;
10249
10250 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10251 so spilling them is better than rematerialization. */
10252 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10253 return true;
10254
10255 return aarch64_constant_address_p (x);
10256 }
10257
10258 rtx
10259 aarch64_load_tp (rtx target)
10260 {
10261 if (!target
10262 || GET_MODE (target) != Pmode
10263 || !register_operand (target, Pmode))
10264 target = gen_reg_rtx (Pmode);
10265
10266 /* Can return in any reg. */
10267 emit_insn (gen_aarch64_load_tp_hard (target));
10268 return target;
10269 }
10270
10271 /* On AAPCS systems, this is the "struct __va_list". */
10272 static GTY(()) tree va_list_type;
10273
10274 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10275 Return the type to use as __builtin_va_list.
10276
10277 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10278
10279 struct __va_list
10280 {
10281 void *__stack;
10282 void *__gr_top;
10283 void *__vr_top;
10284 int __gr_offs;
10285 int __vr_offs;
10286 }; */
10287
10288 static tree
10289 aarch64_build_builtin_va_list (void)
10290 {
10291 tree va_list_name;
10292 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10293
10294 /* Create the type. */
10295 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10296 /* Give it the required name. */
10297 va_list_name = build_decl (BUILTINS_LOCATION,
10298 TYPE_DECL,
10299 get_identifier ("__va_list"),
10300 va_list_type);
10301 DECL_ARTIFICIAL (va_list_name) = 1;
10302 TYPE_NAME (va_list_type) = va_list_name;
10303 TYPE_STUB_DECL (va_list_type) = va_list_name;
10304
10305 /* Create the fields. */
10306 f_stack = build_decl (BUILTINS_LOCATION,
10307 FIELD_DECL, get_identifier ("__stack"),
10308 ptr_type_node);
10309 f_grtop = build_decl (BUILTINS_LOCATION,
10310 FIELD_DECL, get_identifier ("__gr_top"),
10311 ptr_type_node);
10312 f_vrtop = build_decl (BUILTINS_LOCATION,
10313 FIELD_DECL, get_identifier ("__vr_top"),
10314 ptr_type_node);
10315 f_groff = build_decl (BUILTINS_LOCATION,
10316 FIELD_DECL, get_identifier ("__gr_offs"),
10317 integer_type_node);
10318 f_vroff = build_decl (BUILTINS_LOCATION,
10319 FIELD_DECL, get_identifier ("__vr_offs"),
10320 integer_type_node);
10321
10322 /* Tell tree-stdarg pass about our internal offset fields.
10323 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10324 purpose to identify whether the code is updating va_list internal
10325 offset fields through irregular way. */
10326 va_list_gpr_counter_field = f_groff;
10327 va_list_fpr_counter_field = f_vroff;
10328
10329 DECL_ARTIFICIAL (f_stack) = 1;
10330 DECL_ARTIFICIAL (f_grtop) = 1;
10331 DECL_ARTIFICIAL (f_vrtop) = 1;
10332 DECL_ARTIFICIAL (f_groff) = 1;
10333 DECL_ARTIFICIAL (f_vroff) = 1;
10334
10335 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10336 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10337 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10338 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10339 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10340
10341 TYPE_FIELDS (va_list_type) = f_stack;
10342 DECL_CHAIN (f_stack) = f_grtop;
10343 DECL_CHAIN (f_grtop) = f_vrtop;
10344 DECL_CHAIN (f_vrtop) = f_groff;
10345 DECL_CHAIN (f_groff) = f_vroff;
10346
10347 /* Compute its layout. */
10348 layout_type (va_list_type);
10349
10350 return va_list_type;
10351 }
10352
10353 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10354 static void
10355 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10356 {
10357 const CUMULATIVE_ARGS *cum;
10358 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10359 tree stack, grtop, vrtop, groff, vroff;
10360 tree t;
10361 int gr_save_area_size = cfun->va_list_gpr_size;
10362 int vr_save_area_size = cfun->va_list_fpr_size;
10363 int vr_offset;
10364
10365 cum = &crtl->args.info;
10366 if (cfun->va_list_gpr_size)
10367 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10368 cfun->va_list_gpr_size);
10369 if (cfun->va_list_fpr_size)
10370 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10371 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10372
10373 if (!TARGET_FLOAT)
10374 {
10375 gcc_assert (cum->aapcs_nvrn == 0);
10376 vr_save_area_size = 0;
10377 }
10378
10379 f_stack = TYPE_FIELDS (va_list_type_node);
10380 f_grtop = DECL_CHAIN (f_stack);
10381 f_vrtop = DECL_CHAIN (f_grtop);
10382 f_groff = DECL_CHAIN (f_vrtop);
10383 f_vroff = DECL_CHAIN (f_groff);
10384
10385 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10386 NULL_TREE);
10387 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10388 NULL_TREE);
10389 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10390 NULL_TREE);
10391 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10392 NULL_TREE);
10393 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10394 NULL_TREE);
10395
10396 /* Emit code to initialize STACK, which points to the next varargs stack
10397 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10398 by named arguments. STACK is 8-byte aligned. */
10399 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10400 if (cum->aapcs_stack_size > 0)
10401 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10402 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10403 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10404
10405 /* Emit code to initialize GRTOP, the top of the GR save area.
10406 virtual_incoming_args_rtx should have been 16 byte aligned. */
10407 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10408 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10409 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10410
10411 /* Emit code to initialize VRTOP, the top of the VR save area.
10412 This address is gr_save_area_bytes below GRTOP, rounded
10413 down to the next 16-byte boundary. */
10414 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10415 vr_offset = ROUND_UP (gr_save_area_size,
10416 STACK_BOUNDARY / BITS_PER_UNIT);
10417
10418 if (vr_offset)
10419 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10420 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10421 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10422
10423 /* Emit code to initialize GROFF, the offset from GRTOP of the
10424 next GPR argument. */
10425 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10426 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10427 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10428
10429 /* Likewise emit code to initialize VROFF, the offset from FTOP
10430 of the next VR argument. */
10431 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10432 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10433 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10434 }
10435
10436 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10437
10438 static tree
10439 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10440 gimple_seq *post_p ATTRIBUTE_UNUSED)
10441 {
10442 tree addr;
10443 bool indirect_p;
10444 bool is_ha; /* is HFA or HVA. */
10445 bool dw_align; /* double-word align. */
10446 machine_mode ag_mode = VOIDmode;
10447 int nregs;
10448 machine_mode mode;
10449
10450 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10451 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10452 HOST_WIDE_INT size, rsize, adjust, align;
10453 tree t, u, cond1, cond2;
10454
10455 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10456 if (indirect_p)
10457 type = build_pointer_type (type);
10458
10459 mode = TYPE_MODE (type);
10460
10461 f_stack = TYPE_FIELDS (va_list_type_node);
10462 f_grtop = DECL_CHAIN (f_stack);
10463 f_vrtop = DECL_CHAIN (f_grtop);
10464 f_groff = DECL_CHAIN (f_vrtop);
10465 f_vroff = DECL_CHAIN (f_groff);
10466
10467 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10468 f_stack, NULL_TREE);
10469 size = int_size_in_bytes (type);
10470 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10471
10472 dw_align = false;
10473 adjust = 0;
10474 if (aarch64_vfp_is_call_or_return_candidate (mode,
10475 type,
10476 &ag_mode,
10477 &nregs,
10478 &is_ha))
10479 {
10480 /* TYPE passed in fp/simd registers. */
10481 if (!TARGET_FLOAT)
10482 aarch64_err_no_fpadvsimd (mode, "varargs");
10483
10484 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10485 unshare_expr (valist), f_vrtop, NULL_TREE);
10486 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10487 unshare_expr (valist), f_vroff, NULL_TREE);
10488
10489 rsize = nregs * UNITS_PER_VREG;
10490
10491 if (is_ha)
10492 {
10493 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10494 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10495 }
10496 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10497 && size < UNITS_PER_VREG)
10498 {
10499 adjust = UNITS_PER_VREG - size;
10500 }
10501 }
10502 else
10503 {
10504 /* TYPE passed in general registers. */
10505 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10506 unshare_expr (valist), f_grtop, NULL_TREE);
10507 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10508 unshare_expr (valist), f_groff, NULL_TREE);
10509 rsize = ROUND_UP (size, UNITS_PER_WORD);
10510 nregs = rsize / UNITS_PER_WORD;
10511
10512 if (align > 8)
10513 dw_align = true;
10514
10515 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10516 && size < UNITS_PER_WORD)
10517 {
10518 adjust = UNITS_PER_WORD - size;
10519 }
10520 }
10521
10522 /* Get a local temporary for the field value. */
10523 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10524
10525 /* Emit code to branch if off >= 0. */
10526 t = build2 (GE_EXPR, boolean_type_node, off,
10527 build_int_cst (TREE_TYPE (off), 0));
10528 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10529
10530 if (dw_align)
10531 {
10532 /* Emit: offs = (offs + 15) & -16. */
10533 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10534 build_int_cst (TREE_TYPE (off), 15));
10535 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10536 build_int_cst (TREE_TYPE (off), -16));
10537 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10538 }
10539 else
10540 roundup = NULL;
10541
10542 /* Update ap.__[g|v]r_offs */
10543 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10544 build_int_cst (TREE_TYPE (off), rsize));
10545 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10546
10547 /* String up. */
10548 if (roundup)
10549 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10550
10551 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10552 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10553 build_int_cst (TREE_TYPE (f_off), 0));
10554 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10555
10556 /* String up: make sure the assignment happens before the use. */
10557 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10558 COND_EXPR_ELSE (cond1) = t;
10559
10560 /* Prepare the trees handling the argument that is passed on the stack;
10561 the top level node will store in ON_STACK. */
10562 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10563 if (align > 8)
10564 {
10565 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10566 t = fold_convert (intDI_type_node, arg);
10567 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10568 build_int_cst (TREE_TYPE (t), 15));
10569 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10570 build_int_cst (TREE_TYPE (t), -16));
10571 t = fold_convert (TREE_TYPE (arg), t);
10572 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10573 }
10574 else
10575 roundup = NULL;
10576 /* Advance ap.__stack */
10577 t = fold_convert (intDI_type_node, arg);
10578 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10579 build_int_cst (TREE_TYPE (t), size + 7));
10580 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10581 build_int_cst (TREE_TYPE (t), -8));
10582 t = fold_convert (TREE_TYPE (arg), t);
10583 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10584 /* String up roundup and advance. */
10585 if (roundup)
10586 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10587 /* String up with arg */
10588 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10589 /* Big-endianness related address adjustment. */
10590 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10591 && size < UNITS_PER_WORD)
10592 {
10593 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10594 size_int (UNITS_PER_WORD - size));
10595 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10596 }
10597
10598 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10599 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10600
10601 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10602 t = off;
10603 if (adjust)
10604 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10605 build_int_cst (TREE_TYPE (off), adjust));
10606
10607 t = fold_convert (sizetype, t);
10608 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10609
10610 if (is_ha)
10611 {
10612 /* type ha; // treat as "struct {ftype field[n];}"
10613 ... [computing offs]
10614 for (i = 0; i <nregs; ++i, offs += 16)
10615 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10616 return ha; */
10617 int i;
10618 tree tmp_ha, field_t, field_ptr_t;
10619
10620 /* Declare a local variable. */
10621 tmp_ha = create_tmp_var_raw (type, "ha");
10622 gimple_add_tmp_var (tmp_ha);
10623
10624 /* Establish the base type. */
10625 switch (ag_mode)
10626 {
10627 case SFmode:
10628 field_t = float_type_node;
10629 field_ptr_t = float_ptr_type_node;
10630 break;
10631 case DFmode:
10632 field_t = double_type_node;
10633 field_ptr_t = double_ptr_type_node;
10634 break;
10635 case TFmode:
10636 field_t = long_double_type_node;
10637 field_ptr_t = long_double_ptr_type_node;
10638 break;
10639 case HFmode:
10640 field_t = aarch64_fp16_type_node;
10641 field_ptr_t = aarch64_fp16_ptr_type_node;
10642 break;
10643 case V2SImode:
10644 case V4SImode:
10645 {
10646 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10647 field_t = build_vector_type_for_mode (innertype, ag_mode);
10648 field_ptr_t = build_pointer_type (field_t);
10649 }
10650 break;
10651 default:
10652 gcc_assert (0);
10653 }
10654
10655 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
10656 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10657 addr = t;
10658 t = fold_convert (field_ptr_t, addr);
10659 t = build2 (MODIFY_EXPR, field_t,
10660 build1 (INDIRECT_REF, field_t, tmp_ha),
10661 build1 (INDIRECT_REF, field_t, t));
10662
10663 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10664 for (i = 1; i < nregs; ++i)
10665 {
10666 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10667 u = fold_convert (field_ptr_t, addr);
10668 u = build2 (MODIFY_EXPR, field_t,
10669 build2 (MEM_REF, field_t, tmp_ha,
10670 build_int_cst (field_ptr_t,
10671 (i *
10672 int_size_in_bytes (field_t)))),
10673 build1 (INDIRECT_REF, field_t, u));
10674 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10675 }
10676
10677 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10678 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10679 }
10680
10681 COND_EXPR_ELSE (cond2) = t;
10682 addr = fold_convert (build_pointer_type (type), cond1);
10683 addr = build_va_arg_indirect_ref (addr);
10684
10685 if (indirect_p)
10686 addr = build_va_arg_indirect_ref (addr);
10687
10688 return addr;
10689 }
10690
10691 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10692
10693 static void
10694 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10695 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10696 int no_rtl)
10697 {
10698 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10699 CUMULATIVE_ARGS local_cum;
10700 int gr_saved = cfun->va_list_gpr_size;
10701 int vr_saved = cfun->va_list_fpr_size;
10702
10703 /* The caller has advanced CUM up to, but not beyond, the last named
10704 argument. Advance a local copy of CUM past the last "real" named
10705 argument, to find out how many registers are left over. */
10706 local_cum = *cum;
10707 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10708
10709 /* Found out how many registers we need to save.
10710 Honor tree-stdvar analysis results. */
10711 if (cfun->va_list_gpr_size)
10712 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10713 cfun->va_list_gpr_size / UNITS_PER_WORD);
10714 if (cfun->va_list_fpr_size)
10715 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10716 cfun->va_list_fpr_size / UNITS_PER_VREG);
10717
10718 if (!TARGET_FLOAT)
10719 {
10720 gcc_assert (local_cum.aapcs_nvrn == 0);
10721 vr_saved = 0;
10722 }
10723
10724 if (!no_rtl)
10725 {
10726 if (gr_saved > 0)
10727 {
10728 rtx ptr, mem;
10729
10730 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10731 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10732 - gr_saved * UNITS_PER_WORD);
10733 mem = gen_frame_mem (BLKmode, ptr);
10734 set_mem_alias_set (mem, get_varargs_alias_set ());
10735
10736 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10737 mem, gr_saved);
10738 }
10739 if (vr_saved > 0)
10740 {
10741 /* We can't use move_block_from_reg, because it will use
10742 the wrong mode, storing D regs only. */
10743 machine_mode mode = TImode;
10744 int off, i, vr_start;
10745
10746 /* Set OFF to the offset from virtual_incoming_args_rtx of
10747 the first vector register. The VR save area lies below
10748 the GR one, and is aligned to 16 bytes. */
10749 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10750 STACK_BOUNDARY / BITS_PER_UNIT);
10751 off -= vr_saved * UNITS_PER_VREG;
10752
10753 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10754 for (i = 0; i < vr_saved; ++i)
10755 {
10756 rtx ptr, mem;
10757
10758 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10759 mem = gen_frame_mem (mode, ptr);
10760 set_mem_alias_set (mem, get_varargs_alias_set ());
10761 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10762 off += UNITS_PER_VREG;
10763 }
10764 }
10765 }
10766
10767 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10768 any complication of having crtl->args.pretend_args_size changed. */
10769 cfun->machine->frame.saved_varargs_size
10770 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10771 STACK_BOUNDARY / BITS_PER_UNIT)
10772 + vr_saved * UNITS_PER_VREG);
10773 }
10774
10775 static void
10776 aarch64_conditional_register_usage (void)
10777 {
10778 int i;
10779 if (!TARGET_FLOAT)
10780 {
10781 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10782 {
10783 fixed_regs[i] = 1;
10784 call_used_regs[i] = 1;
10785 }
10786 }
10787 }
10788
10789 /* Walk down the type tree of TYPE counting consecutive base elements.
10790 If *MODEP is VOIDmode, then set it to the first valid floating point
10791 type. If a non-floating point type is found, or if a floating point
10792 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10793 otherwise return the count in the sub-tree. */
10794 static int
10795 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10796 {
10797 machine_mode mode;
10798 HOST_WIDE_INT size;
10799
10800 switch (TREE_CODE (type))
10801 {
10802 case REAL_TYPE:
10803 mode = TYPE_MODE (type);
10804 if (mode != DFmode && mode != SFmode
10805 && mode != TFmode && mode != HFmode)
10806 return -1;
10807
10808 if (*modep == VOIDmode)
10809 *modep = mode;
10810
10811 if (*modep == mode)
10812 return 1;
10813
10814 break;
10815
10816 case COMPLEX_TYPE:
10817 mode = TYPE_MODE (TREE_TYPE (type));
10818 if (mode != DFmode && mode != SFmode
10819 && mode != TFmode && mode != HFmode)
10820 return -1;
10821
10822 if (*modep == VOIDmode)
10823 *modep = mode;
10824
10825 if (*modep == mode)
10826 return 2;
10827
10828 break;
10829
10830 case VECTOR_TYPE:
10831 /* Use V2SImode and V4SImode as representatives of all 64-bit
10832 and 128-bit vector types. */
10833 size = int_size_in_bytes (type);
10834 switch (size)
10835 {
10836 case 8:
10837 mode = V2SImode;
10838 break;
10839 case 16:
10840 mode = V4SImode;
10841 break;
10842 default:
10843 return -1;
10844 }
10845
10846 if (*modep == VOIDmode)
10847 *modep = mode;
10848
10849 /* Vector modes are considered to be opaque: two vectors are
10850 equivalent for the purposes of being homogeneous aggregates
10851 if they are the same size. */
10852 if (*modep == mode)
10853 return 1;
10854
10855 break;
10856
10857 case ARRAY_TYPE:
10858 {
10859 int count;
10860 tree index = TYPE_DOMAIN (type);
10861
10862 /* Can't handle incomplete types nor sizes that are not
10863 fixed. */
10864 if (!COMPLETE_TYPE_P (type)
10865 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10866 return -1;
10867
10868 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10869 if (count == -1
10870 || !index
10871 || !TYPE_MAX_VALUE (index)
10872 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10873 || !TYPE_MIN_VALUE (index)
10874 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10875 || count < 0)
10876 return -1;
10877
10878 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10879 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10880
10881 /* There must be no padding. */
10882 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10883 return -1;
10884
10885 return count;
10886 }
10887
10888 case RECORD_TYPE:
10889 {
10890 int count = 0;
10891 int sub_count;
10892 tree field;
10893
10894 /* Can't handle incomplete types nor sizes that are not
10895 fixed. */
10896 if (!COMPLETE_TYPE_P (type)
10897 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10898 return -1;
10899
10900 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10901 {
10902 if (TREE_CODE (field) != FIELD_DECL)
10903 continue;
10904
10905 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10906 if (sub_count < 0)
10907 return -1;
10908 count += sub_count;
10909 }
10910
10911 /* There must be no padding. */
10912 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10913 return -1;
10914
10915 return count;
10916 }
10917
10918 case UNION_TYPE:
10919 case QUAL_UNION_TYPE:
10920 {
10921 /* These aren't very interesting except in a degenerate case. */
10922 int count = 0;
10923 int sub_count;
10924 tree field;
10925
10926 /* Can't handle incomplete types nor sizes that are not
10927 fixed. */
10928 if (!COMPLETE_TYPE_P (type)
10929 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10930 return -1;
10931
10932 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10933 {
10934 if (TREE_CODE (field) != FIELD_DECL)
10935 continue;
10936
10937 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10938 if (sub_count < 0)
10939 return -1;
10940 count = count > sub_count ? count : sub_count;
10941 }
10942
10943 /* There must be no padding. */
10944 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10945 return -1;
10946
10947 return count;
10948 }
10949
10950 default:
10951 break;
10952 }
10953
10954 return -1;
10955 }
10956
10957 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10958 type as described in AAPCS64 \S 4.1.2.
10959
10960 See the comment above aarch64_composite_type_p for the notes on MODE. */
10961
10962 static bool
10963 aarch64_short_vector_p (const_tree type,
10964 machine_mode mode)
10965 {
10966 HOST_WIDE_INT size = -1;
10967
10968 if (type && TREE_CODE (type) == VECTOR_TYPE)
10969 size = int_size_in_bytes (type);
10970 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10971 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10972 size = GET_MODE_SIZE (mode);
10973
10974 return (size == 8 || size == 16);
10975 }
10976
10977 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10978 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10979 array types. The C99 floating-point complex types are also considered
10980 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10981 types, which are GCC extensions and out of the scope of AAPCS64, are
10982 treated as composite types here as well.
10983
10984 Note that MODE itself is not sufficient in determining whether a type
10985 is such a composite type or not. This is because
10986 stor-layout.c:compute_record_mode may have already changed the MODE
10987 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10988 structure with only one field may have its MODE set to the mode of the
10989 field. Also an integer mode whose size matches the size of the
10990 RECORD_TYPE type may be used to substitute the original mode
10991 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10992 solely relied on. */
10993
10994 static bool
10995 aarch64_composite_type_p (const_tree type,
10996 machine_mode mode)
10997 {
10998 if (aarch64_short_vector_p (type, mode))
10999 return false;
11000
11001 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11002 return true;
11003
11004 if (mode == BLKmode
11005 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11006 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11007 return true;
11008
11009 return false;
11010 }
11011
11012 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11013 shall be passed or returned in simd/fp register(s) (providing these
11014 parameter passing registers are available).
11015
11016 Upon successful return, *COUNT returns the number of needed registers,
11017 *BASE_MODE returns the mode of the individual register and when IS_HAF
11018 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11019 floating-point aggregate or a homogeneous short-vector aggregate. */
11020
11021 static bool
11022 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11023 const_tree type,
11024 machine_mode *base_mode,
11025 int *count,
11026 bool *is_ha)
11027 {
11028 machine_mode new_mode = VOIDmode;
11029 bool composite_p = aarch64_composite_type_p (type, mode);
11030
11031 if (is_ha != NULL) *is_ha = false;
11032
11033 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11034 || aarch64_short_vector_p (type, mode))
11035 {
11036 *count = 1;
11037 new_mode = mode;
11038 }
11039 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11040 {
11041 if (is_ha != NULL) *is_ha = true;
11042 *count = 2;
11043 new_mode = GET_MODE_INNER (mode);
11044 }
11045 else if (type && composite_p)
11046 {
11047 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11048
11049 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11050 {
11051 if (is_ha != NULL) *is_ha = true;
11052 *count = ag_count;
11053 }
11054 else
11055 return false;
11056 }
11057 else
11058 return false;
11059
11060 *base_mode = new_mode;
11061 return true;
11062 }
11063
11064 /* Implement TARGET_STRUCT_VALUE_RTX. */
11065
11066 static rtx
11067 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11068 int incoming ATTRIBUTE_UNUSED)
11069 {
11070 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11071 }
11072
11073 /* Implements target hook vector_mode_supported_p. */
11074 static bool
11075 aarch64_vector_mode_supported_p (machine_mode mode)
11076 {
11077 if (TARGET_SIMD
11078 && (mode == V4SImode || mode == V8HImode
11079 || mode == V16QImode || mode == V2DImode
11080 || mode == V2SImode || mode == V4HImode
11081 || mode == V8QImode || mode == V2SFmode
11082 || mode == V4SFmode || mode == V2DFmode
11083 || mode == V4HFmode || mode == V8HFmode
11084 || mode == V1DFmode))
11085 return true;
11086
11087 return false;
11088 }
11089
11090 /* Return appropriate SIMD container
11091 for MODE within a vector of WIDTH bits. */
11092 static machine_mode
11093 aarch64_simd_container_mode (machine_mode mode, unsigned width)
11094 {
11095 gcc_assert (width == 64 || width == 128);
11096 if (TARGET_SIMD)
11097 {
11098 if (width == 128)
11099 switch (mode)
11100 {
11101 case DFmode:
11102 return V2DFmode;
11103 case SFmode:
11104 return V4SFmode;
11105 case HFmode:
11106 return V8HFmode;
11107 case SImode:
11108 return V4SImode;
11109 case HImode:
11110 return V8HImode;
11111 case QImode:
11112 return V16QImode;
11113 case DImode:
11114 return V2DImode;
11115 default:
11116 break;
11117 }
11118 else
11119 switch (mode)
11120 {
11121 case SFmode:
11122 return V2SFmode;
11123 case HFmode:
11124 return V4HFmode;
11125 case SImode:
11126 return V2SImode;
11127 case HImode:
11128 return V4HImode;
11129 case QImode:
11130 return V8QImode;
11131 default:
11132 break;
11133 }
11134 }
11135 return word_mode;
11136 }
11137
11138 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11139 static machine_mode
11140 aarch64_preferred_simd_mode (machine_mode mode)
11141 {
11142 return aarch64_simd_container_mode (mode, 128);
11143 }
11144
11145 /* Return the bitmask of possible vector sizes for the vectorizer
11146 to iterate over. */
11147 static unsigned int
11148 aarch64_autovectorize_vector_sizes (void)
11149 {
11150 return (16 | 8);
11151 }
11152
11153 /* Implement TARGET_MANGLE_TYPE. */
11154
11155 static const char *
11156 aarch64_mangle_type (const_tree type)
11157 {
11158 /* The AArch64 ABI documents say that "__va_list" has to be
11159 managled as if it is in the "std" namespace. */
11160 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11161 return "St9__va_list";
11162
11163 /* Half-precision float. */
11164 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11165 return "Dh";
11166
11167 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11168 builtin types. */
11169 if (TYPE_NAME (type) != NULL)
11170 return aarch64_mangle_builtin_type (type);
11171
11172 /* Use the default mangling. */
11173 return NULL;
11174 }
11175
11176 /* Find the first rtx_insn before insn that will generate an assembly
11177 instruction. */
11178
11179 static rtx_insn *
11180 aarch64_prev_real_insn (rtx_insn *insn)
11181 {
11182 if (!insn)
11183 return NULL;
11184
11185 do
11186 {
11187 insn = prev_real_insn (insn);
11188 }
11189 while (insn && recog_memoized (insn) < 0);
11190
11191 return insn;
11192 }
11193
11194 static bool
11195 is_madd_op (enum attr_type t1)
11196 {
11197 unsigned int i;
11198 /* A number of these may be AArch32 only. */
11199 enum attr_type mlatypes[] = {
11200 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11201 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11202 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11203 };
11204
11205 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11206 {
11207 if (t1 == mlatypes[i])
11208 return true;
11209 }
11210
11211 return false;
11212 }
11213
11214 /* Check if there is a register dependency between a load and the insn
11215 for which we hold recog_data. */
11216
11217 static bool
11218 dep_between_memop_and_curr (rtx memop)
11219 {
11220 rtx load_reg;
11221 int opno;
11222
11223 gcc_assert (GET_CODE (memop) == SET);
11224
11225 if (!REG_P (SET_DEST (memop)))
11226 return false;
11227
11228 load_reg = SET_DEST (memop);
11229 for (opno = 1; opno < recog_data.n_operands; opno++)
11230 {
11231 rtx operand = recog_data.operand[opno];
11232 if (REG_P (operand)
11233 && reg_overlap_mentioned_p (load_reg, operand))
11234 return true;
11235
11236 }
11237 return false;
11238 }
11239
11240
11241 /* When working around the Cortex-A53 erratum 835769,
11242 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11243 instruction and has a preceding memory instruction such that a NOP
11244 should be inserted between them. */
11245
11246 bool
11247 aarch64_madd_needs_nop (rtx_insn* insn)
11248 {
11249 enum attr_type attr_type;
11250 rtx_insn *prev;
11251 rtx body;
11252
11253 if (!TARGET_FIX_ERR_A53_835769)
11254 return false;
11255
11256 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11257 return false;
11258
11259 attr_type = get_attr_type (insn);
11260 if (!is_madd_op (attr_type))
11261 return false;
11262
11263 prev = aarch64_prev_real_insn (insn);
11264 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11265 Restore recog state to INSN to avoid state corruption. */
11266 extract_constrain_insn_cached (insn);
11267
11268 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11269 return false;
11270
11271 body = single_set (prev);
11272
11273 /* If the previous insn is a memory op and there is no dependency between
11274 it and the DImode madd, emit a NOP between them. If body is NULL then we
11275 have a complex memory operation, probably a load/store pair.
11276 Be conservative for now and emit a NOP. */
11277 if (GET_MODE (recog_data.operand[0]) == DImode
11278 && (!body || !dep_between_memop_and_curr (body)))
11279 return true;
11280
11281 return false;
11282
11283 }
11284
11285
11286 /* Implement FINAL_PRESCAN_INSN. */
11287
11288 void
11289 aarch64_final_prescan_insn (rtx_insn *insn)
11290 {
11291 if (aarch64_madd_needs_nop (insn))
11292 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11293 }
11294
11295
11296 /* Return the equivalent letter for size. */
11297 static char
11298 sizetochar (int size)
11299 {
11300 switch (size)
11301 {
11302 case 64: return 'd';
11303 case 32: return 's';
11304 case 16: return 'h';
11305 case 8 : return 'b';
11306 default: gcc_unreachable ();
11307 }
11308 }
11309
11310 /* Return true iff x is a uniform vector of floating-point
11311 constants, and the constant can be represented in
11312 quarter-precision form. Note, as aarch64_float_const_representable
11313 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11314 static bool
11315 aarch64_vect_float_const_representable_p (rtx x)
11316 {
11317 rtx elt;
11318 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11319 && const_vec_duplicate_p (x, &elt)
11320 && aarch64_float_const_representable_p (elt));
11321 }
11322
11323 /* Return true for valid and false for invalid. */
11324 bool
11325 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11326 struct simd_immediate_info *info)
11327 {
11328 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11329 matches = 1; \
11330 for (i = 0; i < idx; i += (STRIDE)) \
11331 if (!(TEST)) \
11332 matches = 0; \
11333 if (matches) \
11334 { \
11335 immtype = (CLASS); \
11336 elsize = (ELSIZE); \
11337 eshift = (SHIFT); \
11338 emvn = (NEG); \
11339 break; \
11340 }
11341
11342 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11343 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11344 unsigned char bytes[16];
11345 int immtype = -1, matches;
11346 unsigned int invmask = inverse ? 0xff : 0;
11347 int eshift, emvn;
11348
11349 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11350 {
11351 if (! (aarch64_simd_imm_zero_p (op, mode)
11352 || aarch64_vect_float_const_representable_p (op)))
11353 return false;
11354
11355 if (info)
11356 {
11357 info->value = CONST_VECTOR_ELT (op, 0);
11358 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11359 info->mvn = false;
11360 info->shift = 0;
11361 }
11362
11363 return true;
11364 }
11365
11366 /* Splat vector constant out into a byte vector. */
11367 for (i = 0; i < n_elts; i++)
11368 {
11369 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11370 it must be laid out in the vector register in reverse order. */
11371 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11372 unsigned HOST_WIDE_INT elpart;
11373
11374 gcc_assert (CONST_INT_P (el));
11375 elpart = INTVAL (el);
11376
11377 for (unsigned int byte = 0; byte < innersize; byte++)
11378 {
11379 bytes[idx++] = (elpart & 0xff) ^ invmask;
11380 elpart >>= BITS_PER_UNIT;
11381 }
11382
11383 }
11384
11385 /* Sanity check. */
11386 gcc_assert (idx == GET_MODE_SIZE (mode));
11387
11388 do
11389 {
11390 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11391 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11392
11393 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11394 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11395
11396 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11397 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11398
11399 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11400 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11401
11402 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11403
11404 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11405
11406 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11407 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11408
11409 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11410 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11411
11412 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11413 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11414
11415 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11416 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11417
11418 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11419
11420 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11421
11422 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11423 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11424
11425 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11426 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11427
11428 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11429 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11430
11431 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11432 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11433
11434 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11435
11436 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11437 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11438 }
11439 while (0);
11440
11441 if (immtype == -1)
11442 return false;
11443
11444 if (info)
11445 {
11446 info->element_width = elsize;
11447 info->mvn = emvn != 0;
11448 info->shift = eshift;
11449
11450 unsigned HOST_WIDE_INT imm = 0;
11451
11452 if (immtype >= 12 && immtype <= 15)
11453 info->msl = true;
11454
11455 /* Un-invert bytes of recognized vector, if necessary. */
11456 if (invmask != 0)
11457 for (i = 0; i < idx; i++)
11458 bytes[i] ^= invmask;
11459
11460 if (immtype == 17)
11461 {
11462 /* FIXME: Broken on 32-bit H_W_I hosts. */
11463 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11464
11465 for (i = 0; i < 8; i++)
11466 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11467 << (i * BITS_PER_UNIT);
11468
11469
11470 info->value = GEN_INT (imm);
11471 }
11472 else
11473 {
11474 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11475 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11476
11477 /* Construct 'abcdefgh' because the assembler cannot handle
11478 generic constants. */
11479 if (info->mvn)
11480 imm = ~imm;
11481 imm = (imm >> info->shift) & 0xff;
11482 info->value = GEN_INT (imm);
11483 }
11484 }
11485
11486 return true;
11487 #undef CHECK
11488 }
11489
11490 /* Check of immediate shift constants are within range. */
11491 bool
11492 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11493 {
11494 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11495 if (left)
11496 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11497 else
11498 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11499 }
11500
11501 /* Return true if X is a uniform vector where all elements
11502 are either the floating-point constant 0.0 or the
11503 integer constant 0. */
11504 bool
11505 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11506 {
11507 return x == CONST0_RTX (mode);
11508 }
11509
11510
11511 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11512 operation of width WIDTH at bit position POS. */
11513
11514 rtx
11515 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11516 {
11517 gcc_assert (CONST_INT_P (width));
11518 gcc_assert (CONST_INT_P (pos));
11519
11520 unsigned HOST_WIDE_INT mask
11521 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11522 return GEN_INT (mask << UINTVAL (pos));
11523 }
11524
11525 bool
11526 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
11527 {
11528 HOST_WIDE_INT imm = INTVAL (x);
11529 int i;
11530
11531 for (i = 0; i < 8; i++)
11532 {
11533 unsigned int byte = imm & 0xff;
11534 if (byte != 0xff && byte != 0)
11535 return false;
11536 imm >>= 8;
11537 }
11538
11539 return true;
11540 }
11541
11542 bool
11543 aarch64_mov_operand_p (rtx x, machine_mode mode)
11544 {
11545 if (GET_CODE (x) == HIGH
11546 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11547 return true;
11548
11549 if (CONST_INT_P (x))
11550 return true;
11551
11552 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11553 return true;
11554
11555 return aarch64_classify_symbolic_expression (x)
11556 == SYMBOL_TINY_ABSOLUTE;
11557 }
11558
11559 /* Return a const_int vector of VAL. */
11560 rtx
11561 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11562 {
11563 int nunits = GET_MODE_NUNITS (mode);
11564 rtvec v = rtvec_alloc (nunits);
11565 int i;
11566
11567 rtx cache = GEN_INT (val);
11568
11569 for (i=0; i < nunits; i++)
11570 RTVEC_ELT (v, i) = cache;
11571
11572 return gen_rtx_CONST_VECTOR (mode, v);
11573 }
11574
11575 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11576
11577 bool
11578 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11579 {
11580 machine_mode vmode;
11581
11582 gcc_assert (!VECTOR_MODE_P (mode));
11583 vmode = aarch64_preferred_simd_mode (mode);
11584 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11585 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11586 }
11587
11588 /* Construct and return a PARALLEL RTX vector with elements numbering the
11589 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11590 the vector - from the perspective of the architecture. This does not
11591 line up with GCC's perspective on lane numbers, so we end up with
11592 different masks depending on our target endian-ness. The diagram
11593 below may help. We must draw the distinction when building masks
11594 which select one half of the vector. An instruction selecting
11595 architectural low-lanes for a big-endian target, must be described using
11596 a mask selecting GCC high-lanes.
11597
11598 Big-Endian Little-Endian
11599
11600 GCC 0 1 2 3 3 2 1 0
11601 | x | x | x | x | | x | x | x | x |
11602 Architecture 3 2 1 0 3 2 1 0
11603
11604 Low Mask: { 2, 3 } { 0, 1 }
11605 High Mask: { 0, 1 } { 2, 3 }
11606 */
11607
11608 rtx
11609 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11610 {
11611 int nunits = GET_MODE_NUNITS (mode);
11612 rtvec v = rtvec_alloc (nunits / 2);
11613 int high_base = nunits / 2;
11614 int low_base = 0;
11615 int base;
11616 rtx t1;
11617 int i;
11618
11619 if (BYTES_BIG_ENDIAN)
11620 base = high ? low_base : high_base;
11621 else
11622 base = high ? high_base : low_base;
11623
11624 for (i = 0; i < nunits / 2; i++)
11625 RTVEC_ELT (v, i) = GEN_INT (base + i);
11626
11627 t1 = gen_rtx_PARALLEL (mode, v);
11628 return t1;
11629 }
11630
11631 /* Check OP for validity as a PARALLEL RTX vector with elements
11632 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11633 from the perspective of the architecture. See the diagram above
11634 aarch64_simd_vect_par_cnst_half for more details. */
11635
11636 bool
11637 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11638 bool high)
11639 {
11640 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11641 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11642 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11643 int i = 0;
11644
11645 if (!VECTOR_MODE_P (mode))
11646 return false;
11647
11648 if (count_op != count_ideal)
11649 return false;
11650
11651 for (i = 0; i < count_ideal; i++)
11652 {
11653 rtx elt_op = XVECEXP (op, 0, i);
11654 rtx elt_ideal = XVECEXP (ideal, 0, i);
11655
11656 if (!CONST_INT_P (elt_op)
11657 || INTVAL (elt_ideal) != INTVAL (elt_op))
11658 return false;
11659 }
11660 return true;
11661 }
11662
11663 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11664 HIGH (exclusive). */
11665 void
11666 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11667 const_tree exp)
11668 {
11669 HOST_WIDE_INT lane;
11670 gcc_assert (CONST_INT_P (operand));
11671 lane = INTVAL (operand);
11672
11673 if (lane < low || lane >= high)
11674 {
11675 if (exp)
11676 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11677 else
11678 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11679 }
11680 }
11681
11682 /* Return TRUE if OP is a valid vector addressing mode. */
11683 bool
11684 aarch64_simd_mem_operand_p (rtx op)
11685 {
11686 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11687 || REG_P (XEXP (op, 0)));
11688 }
11689
11690 /* Emit a register copy from operand to operand, taking care not to
11691 early-clobber source registers in the process.
11692
11693 COUNT is the number of components into which the copy needs to be
11694 decomposed. */
11695 void
11696 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11697 unsigned int count)
11698 {
11699 unsigned int i;
11700 int rdest = REGNO (operands[0]);
11701 int rsrc = REGNO (operands[1]);
11702
11703 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11704 || rdest < rsrc)
11705 for (i = 0; i < count; i++)
11706 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11707 gen_rtx_REG (mode, rsrc + i));
11708 else
11709 for (i = 0; i < count; i++)
11710 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11711 gen_rtx_REG (mode, rsrc + count - i - 1));
11712 }
11713
11714 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11715 one of VSTRUCT modes: OI, CI, or XI. */
11716 int
11717 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11718 {
11719 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11720 }
11721
11722 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11723 alignment of a vector to 128 bits. */
11724 static HOST_WIDE_INT
11725 aarch64_simd_vector_alignment (const_tree type)
11726 {
11727 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11728 return MIN (align, 128);
11729 }
11730
11731 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11732 static bool
11733 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11734 {
11735 if (is_packed)
11736 return false;
11737
11738 /* We guarantee alignment for vectors up to 128-bits. */
11739 if (tree_int_cst_compare (TYPE_SIZE (type),
11740 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11741 return false;
11742
11743 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11744 return true;
11745 }
11746
11747 /* Return true if the vector misalignment factor is supported by the
11748 target. */
11749 static bool
11750 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11751 const_tree type, int misalignment,
11752 bool is_packed)
11753 {
11754 if (TARGET_SIMD && STRICT_ALIGNMENT)
11755 {
11756 /* Return if movmisalign pattern is not supported for this mode. */
11757 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11758 return false;
11759
11760 if (misalignment == -1)
11761 {
11762 /* Misalignment factor is unknown at compile time but we know
11763 it's word aligned. */
11764 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11765 {
11766 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11767
11768 if (element_size != 64)
11769 return true;
11770 }
11771 return false;
11772 }
11773 }
11774 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11775 is_packed);
11776 }
11777
11778 /* If VALS is a vector constant that can be loaded into a register
11779 using DUP, generate instructions to do so and return an RTX to
11780 assign to the register. Otherwise return NULL_RTX. */
11781 static rtx
11782 aarch64_simd_dup_constant (rtx vals)
11783 {
11784 machine_mode mode = GET_MODE (vals);
11785 machine_mode inner_mode = GET_MODE_INNER (mode);
11786 rtx x;
11787
11788 if (!const_vec_duplicate_p (vals, &x))
11789 return NULL_RTX;
11790
11791 /* We can load this constant by using DUP and a constant in a
11792 single ARM register. This will be cheaper than a vector
11793 load. */
11794 x = copy_to_mode_reg (inner_mode, x);
11795 return gen_rtx_VEC_DUPLICATE (mode, x);
11796 }
11797
11798
11799 /* Generate code to load VALS, which is a PARALLEL containing only
11800 constants (for vec_init) or CONST_VECTOR, efficiently into a
11801 register. Returns an RTX to copy into the register, or NULL_RTX
11802 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11803 static rtx
11804 aarch64_simd_make_constant (rtx vals)
11805 {
11806 machine_mode mode = GET_MODE (vals);
11807 rtx const_dup;
11808 rtx const_vec = NULL_RTX;
11809 int n_elts = GET_MODE_NUNITS (mode);
11810 int n_const = 0;
11811 int i;
11812
11813 if (GET_CODE (vals) == CONST_VECTOR)
11814 const_vec = vals;
11815 else if (GET_CODE (vals) == PARALLEL)
11816 {
11817 /* A CONST_VECTOR must contain only CONST_INTs and
11818 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11819 Only store valid constants in a CONST_VECTOR. */
11820 for (i = 0; i < n_elts; ++i)
11821 {
11822 rtx x = XVECEXP (vals, 0, i);
11823 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11824 n_const++;
11825 }
11826 if (n_const == n_elts)
11827 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11828 }
11829 else
11830 gcc_unreachable ();
11831
11832 if (const_vec != NULL_RTX
11833 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11834 /* Load using MOVI/MVNI. */
11835 return const_vec;
11836 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11837 /* Loaded using DUP. */
11838 return const_dup;
11839 else if (const_vec != NULL_RTX)
11840 /* Load from constant pool. We can not take advantage of single-cycle
11841 LD1 because we need a PC-relative addressing mode. */
11842 return const_vec;
11843 else
11844 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11845 We can not construct an initializer. */
11846 return NULL_RTX;
11847 }
11848
11849 /* Expand a vector initialisation sequence, such that TARGET is
11850 initialised to contain VALS. */
11851
11852 void
11853 aarch64_expand_vector_init (rtx target, rtx vals)
11854 {
11855 machine_mode mode = GET_MODE (target);
11856 machine_mode inner_mode = GET_MODE_INNER (mode);
11857 /* The number of vector elements. */
11858 int n_elts = GET_MODE_NUNITS (mode);
11859 /* The number of vector elements which are not constant. */
11860 int n_var = 0;
11861 rtx any_const = NULL_RTX;
11862 /* The first element of vals. */
11863 rtx v0 = XVECEXP (vals, 0, 0);
11864 bool all_same = true;
11865
11866 /* Count the number of variable elements to initialise. */
11867 for (int i = 0; i < n_elts; ++i)
11868 {
11869 rtx x = XVECEXP (vals, 0, i);
11870 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11871 ++n_var;
11872 else
11873 any_const = x;
11874
11875 all_same &= rtx_equal_p (x, v0);
11876 }
11877
11878 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11879 how best to handle this. */
11880 if (n_var == 0)
11881 {
11882 rtx constant = aarch64_simd_make_constant (vals);
11883 if (constant != NULL_RTX)
11884 {
11885 emit_move_insn (target, constant);
11886 return;
11887 }
11888 }
11889
11890 /* Splat a single non-constant element if we can. */
11891 if (all_same)
11892 {
11893 rtx x = copy_to_mode_reg (inner_mode, v0);
11894 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11895 return;
11896 }
11897
11898 enum insn_code icode = optab_handler (vec_set_optab, mode);
11899 gcc_assert (icode != CODE_FOR_nothing);
11900
11901 /* If there are only variable elements, try to optimize
11902 the insertion using dup for the most common element
11903 followed by insertions. */
11904
11905 /* The algorithm will fill matches[*][0] with the earliest matching element,
11906 and matches[X][1] with the count of duplicate elements (if X is the
11907 earliest element which has duplicates). */
11908
11909 if (n_var == n_elts && n_elts <= 16)
11910 {
11911 int matches[16][2] = {0};
11912 for (int i = 0; i < n_elts; i++)
11913 {
11914 for (int j = 0; j <= i; j++)
11915 {
11916 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
11917 {
11918 matches[i][0] = j;
11919 matches[j][1]++;
11920 break;
11921 }
11922 }
11923 }
11924 int maxelement = 0;
11925 int maxv = 0;
11926 for (int i = 0; i < n_elts; i++)
11927 if (matches[i][1] > maxv)
11928 {
11929 maxelement = i;
11930 maxv = matches[i][1];
11931 }
11932
11933 /* Create a duplicate of the most common element. */
11934 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
11935 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11936
11937 /* Insert the rest. */
11938 for (int i = 0; i < n_elts; i++)
11939 {
11940 rtx x = XVECEXP (vals, 0, i);
11941 if (matches[i][0] == maxelement)
11942 continue;
11943 x = copy_to_mode_reg (inner_mode, x);
11944 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11945 }
11946 return;
11947 }
11948
11949 /* Initialise a vector which is part-variable. We want to first try
11950 to build those lanes which are constant in the most efficient way we
11951 can. */
11952 if (n_var != n_elts)
11953 {
11954 rtx copy = copy_rtx (vals);
11955
11956 /* Load constant part of vector. We really don't care what goes into the
11957 parts we will overwrite, but we're more likely to be able to load the
11958 constant efficiently if it has fewer, larger, repeating parts
11959 (see aarch64_simd_valid_immediate). */
11960 for (int i = 0; i < n_elts; i++)
11961 {
11962 rtx x = XVECEXP (vals, 0, i);
11963 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11964 continue;
11965 rtx subst = any_const;
11966 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11967 {
11968 /* Look in the copied vector, as more elements are const. */
11969 rtx test = XVECEXP (copy, 0, i ^ bit);
11970 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11971 {
11972 subst = test;
11973 break;
11974 }
11975 }
11976 XVECEXP (copy, 0, i) = subst;
11977 }
11978 aarch64_expand_vector_init (target, copy);
11979 }
11980
11981 /* Insert the variable lanes directly. */
11982 for (int i = 0; i < n_elts; i++)
11983 {
11984 rtx x = XVECEXP (vals, 0, i);
11985 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11986 continue;
11987 x = copy_to_mode_reg (inner_mode, x);
11988 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11989 }
11990 }
11991
11992 static unsigned HOST_WIDE_INT
11993 aarch64_shift_truncation_mask (machine_mode mode)
11994 {
11995 return
11996 (!SHIFT_COUNT_TRUNCATED
11997 || aarch64_vector_mode_supported_p (mode)
11998 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11999 }
12000
12001 /* Select a format to encode pointers in exception handling data. */
12002 int
12003 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12004 {
12005 int type;
12006 switch (aarch64_cmodel)
12007 {
12008 case AARCH64_CMODEL_TINY:
12009 case AARCH64_CMODEL_TINY_PIC:
12010 case AARCH64_CMODEL_SMALL:
12011 case AARCH64_CMODEL_SMALL_PIC:
12012 case AARCH64_CMODEL_SMALL_SPIC:
12013 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
12014 for everything. */
12015 type = DW_EH_PE_sdata4;
12016 break;
12017 default:
12018 /* No assumptions here. 8-byte relocs required. */
12019 type = DW_EH_PE_sdata8;
12020 break;
12021 }
12022 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12023 }
12024
12025 /* The last .arch and .tune assembly strings that we printed. */
12026 static std::string aarch64_last_printed_arch_string;
12027 static std::string aarch64_last_printed_tune_string;
12028
12029 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12030 by the function fndecl. */
12031
12032 void
12033 aarch64_declare_function_name (FILE *stream, const char* name,
12034 tree fndecl)
12035 {
12036 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12037
12038 struct cl_target_option *targ_options;
12039 if (target_parts)
12040 targ_options = TREE_TARGET_OPTION (target_parts);
12041 else
12042 targ_options = TREE_TARGET_OPTION (target_option_current_node);
12043 gcc_assert (targ_options);
12044
12045 const struct processor *this_arch
12046 = aarch64_get_arch (targ_options->x_explicit_arch);
12047
12048 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12049 std::string extension
12050 = aarch64_get_extension_string_for_isa_flags (isa_flags,
12051 this_arch->flags);
12052 /* Only update the assembler .arch string if it is distinct from the last
12053 such string we printed. */
12054 std::string to_print = this_arch->name + extension;
12055 if (to_print != aarch64_last_printed_arch_string)
12056 {
12057 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12058 aarch64_last_printed_arch_string = to_print;
12059 }
12060
12061 /* Print the cpu name we're tuning for in the comments, might be
12062 useful to readers of the generated asm. Do it only when it changes
12063 from function to function and verbose assembly is requested. */
12064 const struct processor *this_tune
12065 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12066
12067 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12068 {
12069 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12070 this_tune->name);
12071 aarch64_last_printed_tune_string = this_tune->name;
12072 }
12073
12074 /* Don't forget the type directive for ELF. */
12075 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12076 ASM_OUTPUT_LABEL (stream, name);
12077 }
12078
12079 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12080
12081 static void
12082 aarch64_start_file (void)
12083 {
12084 struct cl_target_option *default_options
12085 = TREE_TARGET_OPTION (target_option_default_node);
12086
12087 const struct processor *default_arch
12088 = aarch64_get_arch (default_options->x_explicit_arch);
12089 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12090 std::string extension
12091 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12092 default_arch->flags);
12093
12094 aarch64_last_printed_arch_string = default_arch->name + extension;
12095 aarch64_last_printed_tune_string = "";
12096 asm_fprintf (asm_out_file, "\t.arch %s\n",
12097 aarch64_last_printed_arch_string.c_str ());
12098
12099 default_file_start ();
12100 }
12101
12102 /* Emit load exclusive. */
12103
12104 static void
12105 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12106 rtx mem, rtx model_rtx)
12107 {
12108 rtx (*gen) (rtx, rtx, rtx);
12109
12110 switch (mode)
12111 {
12112 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
12113 case HImode: gen = gen_aarch64_load_exclusivehi; break;
12114 case SImode: gen = gen_aarch64_load_exclusivesi; break;
12115 case DImode: gen = gen_aarch64_load_exclusivedi; break;
12116 default:
12117 gcc_unreachable ();
12118 }
12119
12120 emit_insn (gen (rval, mem, model_rtx));
12121 }
12122
12123 /* Emit store exclusive. */
12124
12125 static void
12126 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12127 rtx rval, rtx mem, rtx model_rtx)
12128 {
12129 rtx (*gen) (rtx, rtx, rtx, rtx);
12130
12131 switch (mode)
12132 {
12133 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
12134 case HImode: gen = gen_aarch64_store_exclusivehi; break;
12135 case SImode: gen = gen_aarch64_store_exclusivesi; break;
12136 case DImode: gen = gen_aarch64_store_exclusivedi; break;
12137 default:
12138 gcc_unreachable ();
12139 }
12140
12141 emit_insn (gen (bval, rval, mem, model_rtx));
12142 }
12143
12144 /* Mark the previous jump instruction as unlikely. */
12145
12146 static void
12147 aarch64_emit_unlikely_jump (rtx insn)
12148 {
12149 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
12150
12151 rtx_insn *jump = emit_jump_insn (insn);
12152 add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
12153 }
12154
12155 /* Expand a compare and swap pattern. */
12156
12157 void
12158 aarch64_expand_compare_and_swap (rtx operands[])
12159 {
12160 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12161 machine_mode mode, cmp_mode;
12162 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12163 int idx;
12164 gen_cas_fn gen;
12165 const gen_cas_fn split_cas[] =
12166 {
12167 gen_aarch64_compare_and_swapqi,
12168 gen_aarch64_compare_and_swaphi,
12169 gen_aarch64_compare_and_swapsi,
12170 gen_aarch64_compare_and_swapdi
12171 };
12172 const gen_cas_fn atomic_cas[] =
12173 {
12174 gen_aarch64_compare_and_swapqi_lse,
12175 gen_aarch64_compare_and_swaphi_lse,
12176 gen_aarch64_compare_and_swapsi_lse,
12177 gen_aarch64_compare_and_swapdi_lse
12178 };
12179
12180 bval = operands[0];
12181 rval = operands[1];
12182 mem = operands[2];
12183 oldval = operands[3];
12184 newval = operands[4];
12185 is_weak = operands[5];
12186 mod_s = operands[6];
12187 mod_f = operands[7];
12188 mode = GET_MODE (mem);
12189 cmp_mode = mode;
12190
12191 /* Normally the succ memory model must be stronger than fail, but in the
12192 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12193 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12194
12195 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12196 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12197 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12198
12199 switch (mode)
12200 {
12201 case QImode:
12202 case HImode:
12203 /* For short modes, we're going to perform the comparison in SImode,
12204 so do the zero-extension now. */
12205 cmp_mode = SImode;
12206 rval = gen_reg_rtx (SImode);
12207 oldval = convert_modes (SImode, mode, oldval, true);
12208 /* Fall through. */
12209
12210 case SImode:
12211 case DImode:
12212 /* Force the value into a register if needed. */
12213 if (!aarch64_plus_operand (oldval, mode))
12214 oldval = force_reg (cmp_mode, oldval);
12215 break;
12216
12217 default:
12218 gcc_unreachable ();
12219 }
12220
12221 switch (mode)
12222 {
12223 case QImode: idx = 0; break;
12224 case HImode: idx = 1; break;
12225 case SImode: idx = 2; break;
12226 case DImode: idx = 3; break;
12227 default:
12228 gcc_unreachable ();
12229 }
12230 if (TARGET_LSE)
12231 gen = atomic_cas[idx];
12232 else
12233 gen = split_cas[idx];
12234
12235 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12236
12237 if (mode == QImode || mode == HImode)
12238 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12239
12240 x = gen_rtx_REG (CCmode, CC_REGNUM);
12241 x = gen_rtx_EQ (SImode, x, const0_rtx);
12242 emit_insn (gen_rtx_SET (bval, x));
12243 }
12244
12245 /* Test whether the target supports using a atomic load-operate instruction.
12246 CODE is the operation and AFTER is TRUE if the data in memory after the
12247 operation should be returned and FALSE if the data before the operation
12248 should be returned. Returns FALSE if the operation isn't supported by the
12249 architecture. */
12250
12251 bool
12252 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12253 {
12254 if (!TARGET_LSE)
12255 return false;
12256
12257 switch (code)
12258 {
12259 case SET:
12260 case AND:
12261 case IOR:
12262 case XOR:
12263 case MINUS:
12264 case PLUS:
12265 return true;
12266 default:
12267 return false;
12268 }
12269 }
12270
12271 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12272 sequence implementing an atomic operation. */
12273
12274 static void
12275 aarch64_emit_post_barrier (enum memmodel model)
12276 {
12277 const enum memmodel base_model = memmodel_base (model);
12278
12279 if (is_mm_sync (model)
12280 && (base_model == MEMMODEL_ACQUIRE
12281 || base_model == MEMMODEL_ACQ_REL
12282 || base_model == MEMMODEL_SEQ_CST))
12283 {
12284 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12285 }
12286 }
12287
12288 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12289 for the data in memory. EXPECTED is the value expected to be in memory.
12290 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12291 is the memory ordering to use. */
12292
12293 void
12294 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12295 rtx expected, rtx desired,
12296 rtx model)
12297 {
12298 rtx (*gen) (rtx, rtx, rtx, rtx);
12299 machine_mode mode;
12300
12301 mode = GET_MODE (mem);
12302
12303 switch (mode)
12304 {
12305 case QImode: gen = gen_aarch64_atomic_casqi; break;
12306 case HImode: gen = gen_aarch64_atomic_cashi; break;
12307 case SImode: gen = gen_aarch64_atomic_cassi; break;
12308 case DImode: gen = gen_aarch64_atomic_casdi; break;
12309 default:
12310 gcc_unreachable ();
12311 }
12312
12313 /* Move the expected value into the CAS destination register. */
12314 emit_insn (gen_rtx_SET (rval, expected));
12315
12316 /* Emit the CAS. */
12317 emit_insn (gen (rval, mem, desired, model));
12318
12319 /* Compare the expected value with the value loaded by the CAS, to establish
12320 whether the swap was made. */
12321 aarch64_gen_compare_reg (EQ, rval, expected);
12322 }
12323
12324 /* Split a compare and swap pattern. */
12325
12326 void
12327 aarch64_split_compare_and_swap (rtx operands[])
12328 {
12329 rtx rval, mem, oldval, newval, scratch;
12330 machine_mode mode;
12331 bool is_weak;
12332 rtx_code_label *label1, *label2;
12333 rtx x, cond;
12334 enum memmodel model;
12335 rtx model_rtx;
12336
12337 rval = operands[0];
12338 mem = operands[1];
12339 oldval = operands[2];
12340 newval = operands[3];
12341 is_weak = (operands[4] != const0_rtx);
12342 model_rtx = operands[5];
12343 scratch = operands[7];
12344 mode = GET_MODE (mem);
12345 model = memmodel_from_int (INTVAL (model_rtx));
12346
12347 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12348 loop:
12349 .label1:
12350 LD[A]XR rval, [mem]
12351 CBNZ rval, .label2
12352 ST[L]XR scratch, newval, [mem]
12353 CBNZ scratch, .label1
12354 .label2:
12355 CMP rval, 0. */
12356 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12357
12358 label1 = NULL;
12359 if (!is_weak)
12360 {
12361 label1 = gen_label_rtx ();
12362 emit_label (label1);
12363 }
12364 label2 = gen_label_rtx ();
12365
12366 /* The initial load can be relaxed for a __sync operation since a final
12367 barrier will be emitted to stop code hoisting. */
12368 if (is_mm_sync (model))
12369 aarch64_emit_load_exclusive (mode, rval, mem,
12370 GEN_INT (MEMMODEL_RELAXED));
12371 else
12372 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12373
12374 if (strong_zero_p)
12375 {
12376 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12377 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12378 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12379 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12380 }
12381 else
12382 {
12383 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12384 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12385 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12386 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12387 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12388 }
12389
12390 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12391
12392 if (!is_weak)
12393 {
12394 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12395 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12396 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12397 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12398 }
12399 else
12400 {
12401 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12402 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12403 emit_insn (gen_rtx_SET (cond, x));
12404 }
12405
12406 emit_label (label2);
12407 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12408 to set the condition flags. If this is not used it will be removed by
12409 later passes. */
12410 if (strong_zero_p)
12411 {
12412 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12413 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12414 emit_insn (gen_rtx_SET (cond, x));
12415 }
12416 /* Emit any final barrier needed for a __sync operation. */
12417 if (is_mm_sync (model))
12418 aarch64_emit_post_barrier (model);
12419 }
12420
12421 /* Emit a BIC instruction. */
12422
12423 static void
12424 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12425 {
12426 rtx shift_rtx = GEN_INT (shift);
12427 rtx (*gen) (rtx, rtx, rtx, rtx);
12428
12429 switch (mode)
12430 {
12431 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12432 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12433 default:
12434 gcc_unreachable ();
12435 }
12436
12437 emit_insn (gen (dst, s2, shift_rtx, s1));
12438 }
12439
12440 /* Emit an atomic swap. */
12441
12442 static void
12443 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12444 rtx mem, rtx model)
12445 {
12446 rtx (*gen) (rtx, rtx, rtx, rtx);
12447
12448 switch (mode)
12449 {
12450 case QImode: gen = gen_aarch64_atomic_swpqi; break;
12451 case HImode: gen = gen_aarch64_atomic_swphi; break;
12452 case SImode: gen = gen_aarch64_atomic_swpsi; break;
12453 case DImode: gen = gen_aarch64_atomic_swpdi; break;
12454 default:
12455 gcc_unreachable ();
12456 }
12457
12458 emit_insn (gen (dst, mem, value, model));
12459 }
12460
12461 /* Operations supported by aarch64_emit_atomic_load_op. */
12462
12463 enum aarch64_atomic_load_op_code
12464 {
12465 AARCH64_LDOP_PLUS, /* A + B */
12466 AARCH64_LDOP_XOR, /* A ^ B */
12467 AARCH64_LDOP_OR, /* A | B */
12468 AARCH64_LDOP_BIC /* A & ~B */
12469 };
12470
12471 /* Emit an atomic load-operate. */
12472
12473 static void
12474 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12475 machine_mode mode, rtx dst, rtx src,
12476 rtx mem, rtx model)
12477 {
12478 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12479 const aarch64_atomic_load_op_fn plus[] =
12480 {
12481 gen_aarch64_atomic_loadaddqi,
12482 gen_aarch64_atomic_loadaddhi,
12483 gen_aarch64_atomic_loadaddsi,
12484 gen_aarch64_atomic_loadadddi
12485 };
12486 const aarch64_atomic_load_op_fn eor[] =
12487 {
12488 gen_aarch64_atomic_loadeorqi,
12489 gen_aarch64_atomic_loadeorhi,
12490 gen_aarch64_atomic_loadeorsi,
12491 gen_aarch64_atomic_loadeordi
12492 };
12493 const aarch64_atomic_load_op_fn ior[] =
12494 {
12495 gen_aarch64_atomic_loadsetqi,
12496 gen_aarch64_atomic_loadsethi,
12497 gen_aarch64_atomic_loadsetsi,
12498 gen_aarch64_atomic_loadsetdi
12499 };
12500 const aarch64_atomic_load_op_fn bic[] =
12501 {
12502 gen_aarch64_atomic_loadclrqi,
12503 gen_aarch64_atomic_loadclrhi,
12504 gen_aarch64_atomic_loadclrsi,
12505 gen_aarch64_atomic_loadclrdi
12506 };
12507 aarch64_atomic_load_op_fn gen;
12508 int idx = 0;
12509
12510 switch (mode)
12511 {
12512 case QImode: idx = 0; break;
12513 case HImode: idx = 1; break;
12514 case SImode: idx = 2; break;
12515 case DImode: idx = 3; break;
12516 default:
12517 gcc_unreachable ();
12518 }
12519
12520 switch (code)
12521 {
12522 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12523 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12524 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12525 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12526 default:
12527 gcc_unreachable ();
12528 }
12529
12530 emit_insn (gen (dst, mem, src, model));
12531 }
12532
12533 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12534 location to store the data read from memory. OUT_RESULT is the location to
12535 store the result of the operation. MEM is the memory location to read and
12536 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12537 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12538 be NULL. */
12539
12540 void
12541 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12542 rtx mem, rtx value, rtx model_rtx)
12543 {
12544 machine_mode mode = GET_MODE (mem);
12545 machine_mode wmode = (mode == DImode ? DImode : SImode);
12546 const bool short_mode = (mode < SImode);
12547 aarch64_atomic_load_op_code ldop_code;
12548 rtx src;
12549 rtx x;
12550
12551 if (out_data)
12552 out_data = gen_lowpart (mode, out_data);
12553
12554 if (out_result)
12555 out_result = gen_lowpart (mode, out_result);
12556
12557 /* Make sure the value is in a register, putting it into a destination
12558 register if it needs to be manipulated. */
12559 if (!register_operand (value, mode)
12560 || code == AND || code == MINUS)
12561 {
12562 src = out_result ? out_result : out_data;
12563 emit_move_insn (src, gen_lowpart (mode, value));
12564 }
12565 else
12566 src = value;
12567 gcc_assert (register_operand (src, mode));
12568
12569 /* Preprocess the data for the operation as necessary. If the operation is
12570 a SET then emit a swap instruction and finish. */
12571 switch (code)
12572 {
12573 case SET:
12574 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12575 return;
12576
12577 case MINUS:
12578 /* Negate the value and treat it as a PLUS. */
12579 {
12580 rtx neg_src;
12581
12582 /* Resize the value if necessary. */
12583 if (short_mode)
12584 src = gen_lowpart (wmode, src);
12585
12586 neg_src = gen_rtx_NEG (wmode, src);
12587 emit_insn (gen_rtx_SET (src, neg_src));
12588
12589 if (short_mode)
12590 src = gen_lowpart (mode, src);
12591 }
12592 /* Fall-through. */
12593 case PLUS:
12594 ldop_code = AARCH64_LDOP_PLUS;
12595 break;
12596
12597 case IOR:
12598 ldop_code = AARCH64_LDOP_OR;
12599 break;
12600
12601 case XOR:
12602 ldop_code = AARCH64_LDOP_XOR;
12603 break;
12604
12605 case AND:
12606 {
12607 rtx not_src;
12608
12609 /* Resize the value if necessary. */
12610 if (short_mode)
12611 src = gen_lowpart (wmode, src);
12612
12613 not_src = gen_rtx_NOT (wmode, src);
12614 emit_insn (gen_rtx_SET (src, not_src));
12615
12616 if (short_mode)
12617 src = gen_lowpart (mode, src);
12618 }
12619 ldop_code = AARCH64_LDOP_BIC;
12620 break;
12621
12622 default:
12623 /* The operation can't be done with atomic instructions. */
12624 gcc_unreachable ();
12625 }
12626
12627 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12628
12629 /* If necessary, calculate the data in memory after the update by redoing the
12630 operation from values in registers. */
12631 if (!out_result)
12632 return;
12633
12634 if (short_mode)
12635 {
12636 src = gen_lowpart (wmode, src);
12637 out_data = gen_lowpart (wmode, out_data);
12638 out_result = gen_lowpart (wmode, out_result);
12639 }
12640
12641 x = NULL_RTX;
12642
12643 switch (code)
12644 {
12645 case MINUS:
12646 case PLUS:
12647 x = gen_rtx_PLUS (wmode, out_data, src);
12648 break;
12649 case IOR:
12650 x = gen_rtx_IOR (wmode, out_data, src);
12651 break;
12652 case XOR:
12653 x = gen_rtx_XOR (wmode, out_data, src);
12654 break;
12655 case AND:
12656 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12657 return;
12658 default:
12659 gcc_unreachable ();
12660 }
12661
12662 emit_set_insn (out_result, x);
12663
12664 return;
12665 }
12666
12667 /* Split an atomic operation. */
12668
12669 void
12670 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12671 rtx value, rtx model_rtx, rtx cond)
12672 {
12673 machine_mode mode = GET_MODE (mem);
12674 machine_mode wmode = (mode == DImode ? DImode : SImode);
12675 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12676 const bool is_sync = is_mm_sync (model);
12677 rtx_code_label *label;
12678 rtx x;
12679
12680 /* Split the atomic operation into a sequence. */
12681 label = gen_label_rtx ();
12682 emit_label (label);
12683
12684 if (new_out)
12685 new_out = gen_lowpart (wmode, new_out);
12686 if (old_out)
12687 old_out = gen_lowpart (wmode, old_out);
12688 else
12689 old_out = new_out;
12690 value = simplify_gen_subreg (wmode, value, mode, 0);
12691
12692 /* The initial load can be relaxed for a __sync operation since a final
12693 barrier will be emitted to stop code hoisting. */
12694 if (is_sync)
12695 aarch64_emit_load_exclusive (mode, old_out, mem,
12696 GEN_INT (MEMMODEL_RELAXED));
12697 else
12698 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12699
12700 switch (code)
12701 {
12702 case SET:
12703 new_out = value;
12704 break;
12705
12706 case NOT:
12707 x = gen_rtx_AND (wmode, old_out, value);
12708 emit_insn (gen_rtx_SET (new_out, x));
12709 x = gen_rtx_NOT (wmode, new_out);
12710 emit_insn (gen_rtx_SET (new_out, x));
12711 break;
12712
12713 case MINUS:
12714 if (CONST_INT_P (value))
12715 {
12716 value = GEN_INT (-INTVAL (value));
12717 code = PLUS;
12718 }
12719 /* Fall through. */
12720
12721 default:
12722 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12723 emit_insn (gen_rtx_SET (new_out, x));
12724 break;
12725 }
12726
12727 aarch64_emit_store_exclusive (mode, cond, mem,
12728 gen_lowpart (mode, new_out), model_rtx);
12729
12730 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12731 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12732 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12733 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12734
12735 /* Emit any final barrier needed for a __sync operation. */
12736 if (is_sync)
12737 aarch64_emit_post_barrier (model);
12738 }
12739
12740 static void
12741 aarch64_init_libfuncs (void)
12742 {
12743 /* Half-precision float operations. The compiler handles all operations
12744 with NULL libfuncs by converting to SFmode. */
12745
12746 /* Conversions. */
12747 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12748 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12749
12750 /* Arithmetic. */
12751 set_optab_libfunc (add_optab, HFmode, NULL);
12752 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12753 set_optab_libfunc (smul_optab, HFmode, NULL);
12754 set_optab_libfunc (neg_optab, HFmode, NULL);
12755 set_optab_libfunc (sub_optab, HFmode, NULL);
12756
12757 /* Comparisons. */
12758 set_optab_libfunc (eq_optab, HFmode, NULL);
12759 set_optab_libfunc (ne_optab, HFmode, NULL);
12760 set_optab_libfunc (lt_optab, HFmode, NULL);
12761 set_optab_libfunc (le_optab, HFmode, NULL);
12762 set_optab_libfunc (ge_optab, HFmode, NULL);
12763 set_optab_libfunc (gt_optab, HFmode, NULL);
12764 set_optab_libfunc (unord_optab, HFmode, NULL);
12765 }
12766
12767 /* Target hook for c_mode_for_suffix. */
12768 static machine_mode
12769 aarch64_c_mode_for_suffix (char suffix)
12770 {
12771 if (suffix == 'q')
12772 return TFmode;
12773
12774 return VOIDmode;
12775 }
12776
12777 /* We can only represent floating point constants which will fit in
12778 "quarter-precision" values. These values are characterised by
12779 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
12780 by:
12781
12782 (-1)^s * (n/16) * 2^r
12783
12784 Where:
12785 's' is the sign bit.
12786 'n' is an integer in the range 16 <= n <= 31.
12787 'r' is an integer in the range -3 <= r <= 4. */
12788
12789 /* Return true iff X can be represented by a quarter-precision
12790 floating point immediate operand X. Note, we cannot represent 0.0. */
12791 bool
12792 aarch64_float_const_representable_p (rtx x)
12793 {
12794 /* This represents our current view of how many bits
12795 make up the mantissa. */
12796 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12797 int exponent;
12798 unsigned HOST_WIDE_INT mantissa, mask;
12799 REAL_VALUE_TYPE r, m;
12800 bool fail;
12801
12802 if (!CONST_DOUBLE_P (x))
12803 return false;
12804
12805 /* We don't support HFmode constants yet. */
12806 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12807 return false;
12808
12809 r = *CONST_DOUBLE_REAL_VALUE (x);
12810
12811 /* We cannot represent infinities, NaNs or +/-zero. We won't
12812 know if we have +zero until we analyse the mantissa, but we
12813 can reject the other invalid values. */
12814 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12815 || REAL_VALUE_MINUS_ZERO (r))
12816 return false;
12817
12818 /* Extract exponent. */
12819 r = real_value_abs (&r);
12820 exponent = REAL_EXP (&r);
12821
12822 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12823 highest (sign) bit, with a fixed binary point at bit point_pos.
12824 m1 holds the low part of the mantissa, m2 the high part.
12825 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12826 bits for the mantissa, this can fail (low bits will be lost). */
12827 real_ldexp (&m, &r, point_pos - exponent);
12828 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12829
12830 /* If the low part of the mantissa has bits set we cannot represent
12831 the value. */
12832 if (w.ulow () != 0)
12833 return false;
12834 /* We have rejected the lower HOST_WIDE_INT, so update our
12835 understanding of how many bits lie in the mantissa and
12836 look only at the high HOST_WIDE_INT. */
12837 mantissa = w.elt (1);
12838 point_pos -= HOST_BITS_PER_WIDE_INT;
12839
12840 /* We can only represent values with a mantissa of the form 1.xxxx. */
12841 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12842 if ((mantissa & mask) != 0)
12843 return false;
12844
12845 /* Having filtered unrepresentable values, we may now remove all
12846 but the highest 5 bits. */
12847 mantissa >>= point_pos - 5;
12848
12849 /* We cannot represent the value 0.0, so reject it. This is handled
12850 elsewhere. */
12851 if (mantissa == 0)
12852 return false;
12853
12854 /* Then, as bit 4 is always set, we can mask it off, leaving
12855 the mantissa in the range [0, 15]. */
12856 mantissa &= ~(1 << 4);
12857 gcc_assert (mantissa <= 15);
12858
12859 /* GCC internally does not use IEEE754-like encoding (where normalized
12860 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12861 Our mantissa values are shifted 4 places to the left relative to
12862 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12863 by 5 places to correct for GCC's representation. */
12864 exponent = 5 - exponent;
12865
12866 return (exponent >= 0 && exponent <= 7);
12867 }
12868
12869 char*
12870 aarch64_output_simd_mov_immediate (rtx const_vector,
12871 machine_mode mode,
12872 unsigned width)
12873 {
12874 bool is_valid;
12875 static char templ[40];
12876 const char *mnemonic;
12877 const char *shift_op;
12878 unsigned int lane_count = 0;
12879 char element_char;
12880
12881 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12882
12883 /* This will return true to show const_vector is legal for use as either
12884 a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
12885 also update INFO to show how the immediate should be generated. */
12886 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12887 gcc_assert (is_valid);
12888
12889 element_char = sizetochar (info.element_width);
12890 lane_count = width / info.element_width;
12891
12892 mode = GET_MODE_INNER (mode);
12893 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12894 {
12895 gcc_assert (info.shift == 0 && ! info.mvn);
12896 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12897 move immediate path. */
12898 if (aarch64_float_const_zero_rtx_p (info.value))
12899 info.value = GEN_INT (0);
12900 else
12901 {
12902 const unsigned int buf_size = 20;
12903 char float_buf[buf_size] = {'\0'};
12904 real_to_decimal_for_mode (float_buf,
12905 CONST_DOUBLE_REAL_VALUE (info.value),
12906 buf_size, buf_size, 1, mode);
12907
12908 if (lane_count == 1)
12909 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12910 else
12911 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12912 lane_count, element_char, float_buf);
12913 return templ;
12914 }
12915 }
12916
12917 mnemonic = info.mvn ? "mvni" : "movi";
12918 shift_op = info.msl ? "msl" : "lsl";
12919
12920 gcc_assert (CONST_INT_P (info.value));
12921 if (lane_count == 1)
12922 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12923 mnemonic, UINTVAL (info.value));
12924 else if (info.shift)
12925 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12926 ", %s %d", mnemonic, lane_count, element_char,
12927 UINTVAL (info.value), shift_op, info.shift);
12928 else
12929 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12930 mnemonic, lane_count, element_char, UINTVAL (info.value));
12931 return templ;
12932 }
12933
12934 char*
12935 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12936 machine_mode mode)
12937 {
12938 machine_mode vmode;
12939
12940 gcc_assert (!VECTOR_MODE_P (mode));
12941 vmode = aarch64_simd_container_mode (mode, 64);
12942 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12943 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12944 }
12945
12946 /* Split operands into moves from op[1] + op[2] into op[0]. */
12947
12948 void
12949 aarch64_split_combinev16qi (rtx operands[3])
12950 {
12951 unsigned int dest = REGNO (operands[0]);
12952 unsigned int src1 = REGNO (operands[1]);
12953 unsigned int src2 = REGNO (operands[2]);
12954 machine_mode halfmode = GET_MODE (operands[1]);
12955 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12956 rtx destlo, desthi;
12957
12958 gcc_assert (halfmode == V16QImode);
12959
12960 if (src1 == dest && src2 == dest + halfregs)
12961 {
12962 /* No-op move. Can't split to nothing; emit something. */
12963 emit_note (NOTE_INSN_DELETED);
12964 return;
12965 }
12966
12967 /* Preserve register attributes for variable tracking. */
12968 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12969 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12970 GET_MODE_SIZE (halfmode));
12971
12972 /* Special case of reversed high/low parts. */
12973 if (reg_overlap_mentioned_p (operands[2], destlo)
12974 && reg_overlap_mentioned_p (operands[1], desthi))
12975 {
12976 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12977 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12978 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12979 }
12980 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12981 {
12982 /* Try to avoid unnecessary moves if part of the result
12983 is in the right place already. */
12984 if (src1 != dest)
12985 emit_move_insn (destlo, operands[1]);
12986 if (src2 != dest + halfregs)
12987 emit_move_insn (desthi, operands[2]);
12988 }
12989 else
12990 {
12991 if (src2 != dest + halfregs)
12992 emit_move_insn (desthi, operands[2]);
12993 if (src1 != dest)
12994 emit_move_insn (destlo, operands[1]);
12995 }
12996 }
12997
12998 /* vec_perm support. */
12999
13000 #define MAX_VECT_LEN 16
13001
13002 struct expand_vec_perm_d
13003 {
13004 rtx target, op0, op1;
13005 unsigned char perm[MAX_VECT_LEN];
13006 machine_mode vmode;
13007 unsigned char nelt;
13008 bool one_vector_p;
13009 bool testing_p;
13010 };
13011
13012 /* Generate a variable permutation. */
13013
13014 static void
13015 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13016 {
13017 machine_mode vmode = GET_MODE (target);
13018 bool one_vector_p = rtx_equal_p (op0, op1);
13019
13020 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13021 gcc_checking_assert (GET_MODE (op0) == vmode);
13022 gcc_checking_assert (GET_MODE (op1) == vmode);
13023 gcc_checking_assert (GET_MODE (sel) == vmode);
13024 gcc_checking_assert (TARGET_SIMD);
13025
13026 if (one_vector_p)
13027 {
13028 if (vmode == V8QImode)
13029 {
13030 /* Expand the argument to a V16QI mode by duplicating it. */
13031 rtx pair = gen_reg_rtx (V16QImode);
13032 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13033 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13034 }
13035 else
13036 {
13037 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13038 }
13039 }
13040 else
13041 {
13042 rtx pair;
13043
13044 if (vmode == V8QImode)
13045 {
13046 pair = gen_reg_rtx (V16QImode);
13047 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13048 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13049 }
13050 else
13051 {
13052 pair = gen_reg_rtx (OImode);
13053 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13054 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13055 }
13056 }
13057 }
13058
13059 void
13060 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13061 {
13062 machine_mode vmode = GET_MODE (target);
13063 unsigned int nelt = GET_MODE_NUNITS (vmode);
13064 bool one_vector_p = rtx_equal_p (op0, op1);
13065 rtx mask;
13066
13067 /* The TBL instruction does not use a modulo index, so we must take care
13068 of that ourselves. */
13069 mask = aarch64_simd_gen_const_vector_dup (vmode,
13070 one_vector_p ? nelt - 1 : 2 * nelt - 1);
13071 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13072
13073 /* For big-endian, we also need to reverse the index within the vector
13074 (but not which vector). */
13075 if (BYTES_BIG_ENDIAN)
13076 {
13077 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13078 if (!one_vector_p)
13079 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13080 sel = expand_simple_binop (vmode, XOR, sel, mask,
13081 NULL, 0, OPTAB_LIB_WIDEN);
13082 }
13083 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
13084 }
13085
13086 /* Recognize patterns suitable for the TRN instructions. */
13087 static bool
13088 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13089 {
13090 unsigned int i, odd, mask, nelt = d->nelt;
13091 rtx out, in0, in1, x;
13092 rtx (*gen) (rtx, rtx, rtx);
13093 machine_mode vmode = d->vmode;
13094
13095 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13096 return false;
13097
13098 /* Note that these are little-endian tests.
13099 We correct for big-endian later. */
13100 if (d->perm[0] == 0)
13101 odd = 0;
13102 else if (d->perm[0] == 1)
13103 odd = 1;
13104 else
13105 return false;
13106 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13107
13108 for (i = 0; i < nelt; i += 2)
13109 {
13110 if (d->perm[i] != i + odd)
13111 return false;
13112 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13113 return false;
13114 }
13115
13116 /* Success! */
13117 if (d->testing_p)
13118 return true;
13119
13120 in0 = d->op0;
13121 in1 = d->op1;
13122 if (BYTES_BIG_ENDIAN)
13123 {
13124 x = in0, in0 = in1, in1 = x;
13125 odd = !odd;
13126 }
13127 out = d->target;
13128
13129 if (odd)
13130 {
13131 switch (vmode)
13132 {
13133 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
13134 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
13135 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
13136 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
13137 case V4SImode: gen = gen_aarch64_trn2v4si; break;
13138 case V2SImode: gen = gen_aarch64_trn2v2si; break;
13139 case V2DImode: gen = gen_aarch64_trn2v2di; break;
13140 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13141 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13142 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13143 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13144 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
13145 default:
13146 return false;
13147 }
13148 }
13149 else
13150 {
13151 switch (vmode)
13152 {
13153 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
13154 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
13155 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
13156 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
13157 case V4SImode: gen = gen_aarch64_trn1v4si; break;
13158 case V2SImode: gen = gen_aarch64_trn1v2si; break;
13159 case V2DImode: gen = gen_aarch64_trn1v2di; break;
13160 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13161 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13162 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13163 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13164 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
13165 default:
13166 return false;
13167 }
13168 }
13169
13170 emit_insn (gen (out, in0, in1));
13171 return true;
13172 }
13173
13174 /* Recognize patterns suitable for the UZP instructions. */
13175 static bool
13176 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13177 {
13178 unsigned int i, odd, mask, nelt = d->nelt;
13179 rtx out, in0, in1, x;
13180 rtx (*gen) (rtx, rtx, rtx);
13181 machine_mode vmode = d->vmode;
13182
13183 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13184 return false;
13185
13186 /* Note that these are little-endian tests.
13187 We correct for big-endian later. */
13188 if (d->perm[0] == 0)
13189 odd = 0;
13190 else if (d->perm[0] == 1)
13191 odd = 1;
13192 else
13193 return false;
13194 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13195
13196 for (i = 0; i < nelt; i++)
13197 {
13198 unsigned elt = (i * 2 + odd) & mask;
13199 if (d->perm[i] != elt)
13200 return false;
13201 }
13202
13203 /* Success! */
13204 if (d->testing_p)
13205 return true;
13206
13207 in0 = d->op0;
13208 in1 = d->op1;
13209 if (BYTES_BIG_ENDIAN)
13210 {
13211 x = in0, in0 = in1, in1 = x;
13212 odd = !odd;
13213 }
13214 out = d->target;
13215
13216 if (odd)
13217 {
13218 switch (vmode)
13219 {
13220 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13221 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13222 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13223 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13224 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
13225 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
13226 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
13227 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13228 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13229 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13230 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13231 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13232 default:
13233 return false;
13234 }
13235 }
13236 else
13237 {
13238 switch (vmode)
13239 {
13240 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13241 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13242 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13243 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13244 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
13245 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
13246 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
13247 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13248 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13249 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13250 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13251 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13252 default:
13253 return false;
13254 }
13255 }
13256
13257 emit_insn (gen (out, in0, in1));
13258 return true;
13259 }
13260
13261 /* Recognize patterns suitable for the ZIP instructions. */
13262 static bool
13263 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13264 {
13265 unsigned int i, high, mask, nelt = d->nelt;
13266 rtx out, in0, in1, x;
13267 rtx (*gen) (rtx, rtx, rtx);
13268 machine_mode vmode = d->vmode;
13269
13270 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13271 return false;
13272
13273 /* Note that these are little-endian tests.
13274 We correct for big-endian later. */
13275 high = nelt / 2;
13276 if (d->perm[0] == high)
13277 /* Do Nothing. */
13278 ;
13279 else if (d->perm[0] == 0)
13280 high = 0;
13281 else
13282 return false;
13283 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13284
13285 for (i = 0; i < nelt / 2; i++)
13286 {
13287 unsigned elt = (i + high) & mask;
13288 if (d->perm[i * 2] != elt)
13289 return false;
13290 elt = (elt + nelt) & mask;
13291 if (d->perm[i * 2 + 1] != elt)
13292 return false;
13293 }
13294
13295 /* Success! */
13296 if (d->testing_p)
13297 return true;
13298
13299 in0 = d->op0;
13300 in1 = d->op1;
13301 if (BYTES_BIG_ENDIAN)
13302 {
13303 x = in0, in0 = in1, in1 = x;
13304 high = !high;
13305 }
13306 out = d->target;
13307
13308 if (high)
13309 {
13310 switch (vmode)
13311 {
13312 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
13313 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
13314 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
13315 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
13316 case V4SImode: gen = gen_aarch64_zip2v4si; break;
13317 case V2SImode: gen = gen_aarch64_zip2v2si; break;
13318 case V2DImode: gen = gen_aarch64_zip2v2di; break;
13319 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13320 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13321 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13322 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13323 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
13324 default:
13325 return false;
13326 }
13327 }
13328 else
13329 {
13330 switch (vmode)
13331 {
13332 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
13333 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
13334 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
13335 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
13336 case V4SImode: gen = gen_aarch64_zip1v4si; break;
13337 case V2SImode: gen = gen_aarch64_zip1v2si; break;
13338 case V2DImode: gen = gen_aarch64_zip1v2di; break;
13339 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13340 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13341 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13342 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13343 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
13344 default:
13345 return false;
13346 }
13347 }
13348
13349 emit_insn (gen (out, in0, in1));
13350 return true;
13351 }
13352
13353 /* Recognize patterns for the EXT insn. */
13354
13355 static bool
13356 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13357 {
13358 unsigned int i, nelt = d->nelt;
13359 rtx (*gen) (rtx, rtx, rtx, rtx);
13360 rtx offset;
13361
13362 unsigned int location = d->perm[0]; /* Always < nelt. */
13363
13364 /* Check if the extracted indices are increasing by one. */
13365 for (i = 1; i < nelt; i++)
13366 {
13367 unsigned int required = location + i;
13368 if (d->one_vector_p)
13369 {
13370 /* We'll pass the same vector in twice, so allow indices to wrap. */
13371 required &= (nelt - 1);
13372 }
13373 if (d->perm[i] != required)
13374 return false;
13375 }
13376
13377 switch (d->vmode)
13378 {
13379 case V16QImode: gen = gen_aarch64_extv16qi; break;
13380 case V8QImode: gen = gen_aarch64_extv8qi; break;
13381 case V4HImode: gen = gen_aarch64_extv4hi; break;
13382 case V8HImode: gen = gen_aarch64_extv8hi; break;
13383 case V2SImode: gen = gen_aarch64_extv2si; break;
13384 case V4SImode: gen = gen_aarch64_extv4si; break;
13385 case V4HFmode: gen = gen_aarch64_extv4hf; break;
13386 case V8HFmode: gen = gen_aarch64_extv8hf; break;
13387 case V2SFmode: gen = gen_aarch64_extv2sf; break;
13388 case V4SFmode: gen = gen_aarch64_extv4sf; break;
13389 case V2DImode: gen = gen_aarch64_extv2di; break;
13390 case V2DFmode: gen = gen_aarch64_extv2df; break;
13391 default:
13392 return false;
13393 }
13394
13395 /* Success! */
13396 if (d->testing_p)
13397 return true;
13398
13399 /* The case where (location == 0) is a no-op for both big- and little-endian,
13400 and is removed by the mid-end at optimization levels -O1 and higher. */
13401
13402 if (BYTES_BIG_ENDIAN && (location != 0))
13403 {
13404 /* After setup, we want the high elements of the first vector (stored
13405 at the LSB end of the register), and the low elements of the second
13406 vector (stored at the MSB end of the register). So swap. */
13407 std::swap (d->op0, d->op1);
13408 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13409 location = nelt - location;
13410 }
13411
13412 offset = GEN_INT (location);
13413 emit_insn (gen (d->target, d->op0, d->op1, offset));
13414 return true;
13415 }
13416
13417 /* Recognize patterns for the REV insns. */
13418
13419 static bool
13420 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13421 {
13422 unsigned int i, j, diff, nelt = d->nelt;
13423 rtx (*gen) (rtx, rtx);
13424
13425 if (!d->one_vector_p)
13426 return false;
13427
13428 diff = d->perm[0];
13429 switch (diff)
13430 {
13431 case 7:
13432 switch (d->vmode)
13433 {
13434 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
13435 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
13436 default:
13437 return false;
13438 }
13439 break;
13440 case 3:
13441 switch (d->vmode)
13442 {
13443 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
13444 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
13445 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
13446 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
13447 default:
13448 return false;
13449 }
13450 break;
13451 case 1:
13452 switch (d->vmode)
13453 {
13454 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
13455 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
13456 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
13457 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
13458 case V4SImode: gen = gen_aarch64_rev64v4si; break;
13459 case V2SImode: gen = gen_aarch64_rev64v2si; break;
13460 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13461 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13462 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13463 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13464 default:
13465 return false;
13466 }
13467 break;
13468 default:
13469 return false;
13470 }
13471
13472 for (i = 0; i < nelt ; i += diff + 1)
13473 for (j = 0; j <= diff; j += 1)
13474 {
13475 /* This is guaranteed to be true as the value of diff
13476 is 7, 3, 1 and we should have enough elements in the
13477 queue to generate this. Getting a vector mask with a
13478 value of diff other than these values implies that
13479 something is wrong by the time we get here. */
13480 gcc_assert (i + j < nelt);
13481 if (d->perm[i + j] != i + diff - j)
13482 return false;
13483 }
13484
13485 /* Success! */
13486 if (d->testing_p)
13487 return true;
13488
13489 emit_insn (gen (d->target, d->op0));
13490 return true;
13491 }
13492
13493 static bool
13494 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13495 {
13496 rtx (*gen) (rtx, rtx, rtx);
13497 rtx out = d->target;
13498 rtx in0;
13499 machine_mode vmode = d->vmode;
13500 unsigned int i, elt, nelt = d->nelt;
13501 rtx lane;
13502
13503 elt = d->perm[0];
13504 for (i = 1; i < nelt; i++)
13505 {
13506 if (elt != d->perm[i])
13507 return false;
13508 }
13509
13510 /* The generic preparation in aarch64_expand_vec_perm_const_1
13511 swaps the operand order and the permute indices if it finds
13512 d->perm[0] to be in the second operand. Thus, we can always
13513 use d->op0 and need not do any extra arithmetic to get the
13514 correct lane number. */
13515 in0 = d->op0;
13516 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13517
13518 switch (vmode)
13519 {
13520 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13521 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13522 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13523 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13524 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13525 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13526 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13527 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13528 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13529 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13530 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13531 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13532 default:
13533 return false;
13534 }
13535
13536 emit_insn (gen (out, in0, lane));
13537 return true;
13538 }
13539
13540 static bool
13541 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13542 {
13543 rtx rperm[MAX_VECT_LEN], sel;
13544 machine_mode vmode = d->vmode;
13545 unsigned int i, nelt = d->nelt;
13546
13547 if (d->testing_p)
13548 return true;
13549
13550 /* Generic code will try constant permutation twice. Once with the
13551 original mode and again with the elements lowered to QImode.
13552 So wait and don't do the selector expansion ourselves. */
13553 if (vmode != V8QImode && vmode != V16QImode)
13554 return false;
13555
13556 for (i = 0; i < nelt; ++i)
13557 {
13558 int nunits = GET_MODE_NUNITS (vmode);
13559
13560 /* If big-endian and two vectors we end up with a weird mixed-endian
13561 mode on NEON. Reverse the index within each word but not the word
13562 itself. */
13563 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13564 : d->perm[i]);
13565 }
13566 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13567 sel = force_reg (vmode, sel);
13568
13569 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13570 return true;
13571 }
13572
13573 static bool
13574 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13575 {
13576 /* The pattern matching functions above are written to look for a small
13577 number to begin the sequence (0, 1, N/2). If we begin with an index
13578 from the second operand, we can swap the operands. */
13579 if (d->perm[0] >= d->nelt)
13580 {
13581 unsigned i, nelt = d->nelt;
13582
13583 gcc_assert (nelt == (nelt & -nelt));
13584 for (i = 0; i < nelt; ++i)
13585 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13586
13587 std::swap (d->op0, d->op1);
13588 }
13589
13590 if (TARGET_SIMD)
13591 {
13592 if (aarch64_evpc_rev (d))
13593 return true;
13594 else if (aarch64_evpc_ext (d))
13595 return true;
13596 else if (aarch64_evpc_dup (d))
13597 return true;
13598 else if (aarch64_evpc_zip (d))
13599 return true;
13600 else if (aarch64_evpc_uzp (d))
13601 return true;
13602 else if (aarch64_evpc_trn (d))
13603 return true;
13604 return aarch64_evpc_tbl (d);
13605 }
13606 return false;
13607 }
13608
13609 /* Expand a vec_perm_const pattern. */
13610
13611 bool
13612 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13613 {
13614 struct expand_vec_perm_d d;
13615 int i, nelt, which;
13616
13617 d.target = target;
13618 d.op0 = op0;
13619 d.op1 = op1;
13620
13621 d.vmode = GET_MODE (target);
13622 gcc_assert (VECTOR_MODE_P (d.vmode));
13623 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13624 d.testing_p = false;
13625
13626 for (i = which = 0; i < nelt; ++i)
13627 {
13628 rtx e = XVECEXP (sel, 0, i);
13629 int ei = INTVAL (e) & (2 * nelt - 1);
13630 which |= (ei < nelt ? 1 : 2);
13631 d.perm[i] = ei;
13632 }
13633
13634 switch (which)
13635 {
13636 default:
13637 gcc_unreachable ();
13638
13639 case 3:
13640 d.one_vector_p = false;
13641 if (!rtx_equal_p (op0, op1))
13642 break;
13643
13644 /* The elements of PERM do not suggest that only the first operand
13645 is used, but both operands are identical. Allow easier matching
13646 of the permutation by folding the permutation into the single
13647 input vector. */
13648 /* Fall Through. */
13649 case 2:
13650 for (i = 0; i < nelt; ++i)
13651 d.perm[i] &= nelt - 1;
13652 d.op0 = op1;
13653 d.one_vector_p = true;
13654 break;
13655
13656 case 1:
13657 d.op1 = op0;
13658 d.one_vector_p = true;
13659 break;
13660 }
13661
13662 return aarch64_expand_vec_perm_const_1 (&d);
13663 }
13664
13665 static bool
13666 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13667 const unsigned char *sel)
13668 {
13669 struct expand_vec_perm_d d;
13670 unsigned int i, nelt, which;
13671 bool ret;
13672
13673 d.vmode = vmode;
13674 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13675 d.testing_p = true;
13676 memcpy (d.perm, sel, nelt);
13677
13678 /* Calculate whether all elements are in one vector. */
13679 for (i = which = 0; i < nelt; ++i)
13680 {
13681 unsigned char e = d.perm[i];
13682 gcc_assert (e < 2 * nelt);
13683 which |= (e < nelt ? 1 : 2);
13684 }
13685
13686 /* If all elements are from the second vector, reindex as if from the
13687 first vector. */
13688 if (which == 2)
13689 for (i = 0; i < nelt; ++i)
13690 d.perm[i] -= nelt;
13691
13692 /* Check whether the mask can be applied to a single vector. */
13693 d.one_vector_p = (which != 3);
13694
13695 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13696 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13697 if (!d.one_vector_p)
13698 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13699
13700 start_sequence ();
13701 ret = aarch64_expand_vec_perm_const_1 (&d);
13702 end_sequence ();
13703
13704 return ret;
13705 }
13706
13707 rtx
13708 aarch64_reverse_mask (enum machine_mode mode)
13709 {
13710 /* We have to reverse each vector because we dont have
13711 a permuted load that can reverse-load according to ABI rules. */
13712 rtx mask;
13713 rtvec v = rtvec_alloc (16);
13714 int i, j;
13715 int nunits = GET_MODE_NUNITS (mode);
13716 int usize = GET_MODE_UNIT_SIZE (mode);
13717
13718 gcc_assert (BYTES_BIG_ENDIAN);
13719 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13720
13721 for (i = 0; i < nunits; i++)
13722 for (j = 0; j < usize; j++)
13723 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13724 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13725 return force_reg (V16QImode, mask);
13726 }
13727
13728 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13729 However due to issues with register allocation it is preferable to avoid
13730 tieing integer scalar and FP scalar modes. Executing integer operations
13731 in general registers is better than treating them as scalar vector
13732 operations. This reduces latency and avoids redundant int<->FP moves.
13733 So tie modes if they are either the same class, or vector modes with
13734 other vector modes, vector structs or any scalar mode.
13735 */
13736
13737 bool
13738 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13739 {
13740 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13741 return true;
13742
13743 /* We specifically want to allow elements of "structure" modes to
13744 be tieable to the structure. This more general condition allows
13745 other rarer situations too. */
13746 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13747 return true;
13748
13749 /* Also allow any scalar modes with vectors. */
13750 if (aarch64_vector_mode_supported_p (mode1)
13751 || aarch64_vector_mode_supported_p (mode2))
13752 return true;
13753
13754 return false;
13755 }
13756
13757 /* Return a new RTX holding the result of moving POINTER forward by
13758 AMOUNT bytes. */
13759
13760 static rtx
13761 aarch64_move_pointer (rtx pointer, int amount)
13762 {
13763 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13764
13765 return adjust_automodify_address (pointer, GET_MODE (pointer),
13766 next, amount);
13767 }
13768
13769 /* Return a new RTX holding the result of moving POINTER forward by the
13770 size of the mode it points to. */
13771
13772 static rtx
13773 aarch64_progress_pointer (rtx pointer)
13774 {
13775 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13776
13777 return aarch64_move_pointer (pointer, amount);
13778 }
13779
13780 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13781 MODE bytes. */
13782
13783 static void
13784 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13785 machine_mode mode)
13786 {
13787 rtx reg = gen_reg_rtx (mode);
13788
13789 /* "Cast" the pointers to the correct mode. */
13790 *src = adjust_address (*src, mode, 0);
13791 *dst = adjust_address (*dst, mode, 0);
13792 /* Emit the memcpy. */
13793 emit_move_insn (reg, *src);
13794 emit_move_insn (*dst, reg);
13795 /* Move the pointers forward. */
13796 *src = aarch64_progress_pointer (*src);
13797 *dst = aarch64_progress_pointer (*dst);
13798 }
13799
13800 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13801 we succeed, otherwise return false. */
13802
13803 bool
13804 aarch64_expand_movmem (rtx *operands)
13805 {
13806 unsigned int n;
13807 rtx dst = operands[0];
13808 rtx src = operands[1];
13809 rtx base;
13810 bool speed_p = !optimize_function_for_size_p (cfun);
13811
13812 /* When optimizing for size, give a better estimate of the length of a
13813 memcpy call, but use the default otherwise. */
13814 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13815
13816 /* We can't do anything smart if the amount to copy is not constant. */
13817 if (!CONST_INT_P (operands[2]))
13818 return false;
13819
13820 n = UINTVAL (operands[2]);
13821
13822 /* Try to keep the number of instructions low. For cases below 16 bytes we
13823 need to make at most two moves. For cases above 16 bytes it will be one
13824 move for each 16 byte chunk, then at most two additional moves. */
13825 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13826 return false;
13827
13828 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13829 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13830
13831 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13832 src = adjust_automodify_address (src, VOIDmode, base, 0);
13833
13834 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13835 1-byte chunk. */
13836 if (n < 4)
13837 {
13838 if (n >= 2)
13839 {
13840 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13841 n -= 2;
13842 }
13843
13844 if (n == 1)
13845 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13846
13847 return true;
13848 }
13849
13850 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13851 4-byte chunk, partially overlapping with the previously copied chunk. */
13852 if (n < 8)
13853 {
13854 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13855 n -= 4;
13856 if (n > 0)
13857 {
13858 int move = n - 4;
13859
13860 src = aarch64_move_pointer (src, move);
13861 dst = aarch64_move_pointer (dst, move);
13862 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13863 }
13864 return true;
13865 }
13866
13867 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13868 them, then (if applicable) an 8-byte chunk. */
13869 while (n >= 8)
13870 {
13871 if (n / 16)
13872 {
13873 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13874 n -= 16;
13875 }
13876 else
13877 {
13878 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13879 n -= 8;
13880 }
13881 }
13882
13883 /* Finish the final bytes of the copy. We can always do this in one
13884 instruction. We either copy the exact amount we need, or partially
13885 overlap with the previous chunk we copied and copy 8-bytes. */
13886 if (n == 0)
13887 return true;
13888 else if (n == 1)
13889 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13890 else if (n == 2)
13891 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13892 else if (n == 4)
13893 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13894 else
13895 {
13896 if (n == 3)
13897 {
13898 src = aarch64_move_pointer (src, -1);
13899 dst = aarch64_move_pointer (dst, -1);
13900 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13901 }
13902 else
13903 {
13904 int move = n - 8;
13905
13906 src = aarch64_move_pointer (src, move);
13907 dst = aarch64_move_pointer (dst, move);
13908 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13909 }
13910 }
13911
13912 return true;
13913 }
13914
13915 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13916 SImode stores. Handle the case when the constant has identical
13917 bottom and top halves. This is beneficial when the two stores can be
13918 merged into an STP and we avoid synthesising potentially expensive
13919 immediates twice. Return true if such a split is possible. */
13920
13921 bool
13922 aarch64_split_dimode_const_store (rtx dst, rtx src)
13923 {
13924 rtx lo = gen_lowpart (SImode, src);
13925 rtx hi = gen_highpart_mode (SImode, DImode, src);
13926
13927 bool size_p = optimize_function_for_size_p (cfun);
13928
13929 if (!rtx_equal_p (lo, hi))
13930 return false;
13931
13932 unsigned int orig_cost
13933 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13934 unsigned int lo_cost
13935 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13936
13937 /* We want to transform:
13938 MOV x1, 49370
13939 MOVK x1, 0x140, lsl 16
13940 MOVK x1, 0xc0da, lsl 32
13941 MOVK x1, 0x140, lsl 48
13942 STR x1, [x0]
13943 into:
13944 MOV w1, 49370
13945 MOVK w1, 0x140, lsl 16
13946 STP w1, w1, [x0]
13947 So we want to perform this only when we save two instructions
13948 or more. When optimizing for size, however, accept any code size
13949 savings we can. */
13950 if (size_p && orig_cost <= lo_cost)
13951 return false;
13952
13953 if (!size_p
13954 && (orig_cost <= lo_cost + 1))
13955 return false;
13956
13957 rtx mem_lo = adjust_address (dst, SImode, 0);
13958 if (!aarch64_mem_pair_operand (mem_lo, SImode))
13959 return false;
13960
13961 rtx tmp_reg = gen_reg_rtx (SImode);
13962 aarch64_expand_mov_immediate (tmp_reg, lo);
13963 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13964 /* Don't emit an explicit store pair as this may not be always profitable.
13965 Let the sched-fusion logic decide whether to merge them. */
13966 emit_move_insn (mem_lo, tmp_reg);
13967 emit_move_insn (mem_hi, tmp_reg);
13968
13969 return true;
13970 }
13971
13972 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13973
13974 static unsigned HOST_WIDE_INT
13975 aarch64_asan_shadow_offset (void)
13976 {
13977 return (HOST_WIDE_INT_1 << 36);
13978 }
13979
13980 static bool
13981 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13982 unsigned int align,
13983 enum by_pieces_operation op,
13984 bool speed_p)
13985 {
13986 /* STORE_BY_PIECES can be used when copying a constant string, but
13987 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13988 For now we always fail this and let the move_by_pieces code copy
13989 the string from read-only memory. */
13990 if (op == STORE_BY_PIECES)
13991 return false;
13992
13993 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13994 }
13995
13996 static rtx
13997 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13998 int code, tree treeop0, tree treeop1)
13999 {
14000 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14001 rtx op0, op1;
14002 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14003 insn_code icode;
14004 struct expand_operand ops[4];
14005
14006 start_sequence ();
14007 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14008
14009 op_mode = GET_MODE (op0);
14010 if (op_mode == VOIDmode)
14011 op_mode = GET_MODE (op1);
14012
14013 switch (op_mode)
14014 {
14015 case QImode:
14016 case HImode:
14017 case SImode:
14018 cmp_mode = SImode;
14019 icode = CODE_FOR_cmpsi;
14020 break;
14021
14022 case DImode:
14023 cmp_mode = DImode;
14024 icode = CODE_FOR_cmpdi;
14025 break;
14026
14027 case SFmode:
14028 cmp_mode = SFmode;
14029 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14030 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14031 break;
14032
14033 case DFmode:
14034 cmp_mode = DFmode;
14035 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14036 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14037 break;
14038
14039 default:
14040 end_sequence ();
14041 return NULL_RTX;
14042 }
14043
14044 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14045 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14046 if (!op0 || !op1)
14047 {
14048 end_sequence ();
14049 return NULL_RTX;
14050 }
14051 *prep_seq = get_insns ();
14052 end_sequence ();
14053
14054 create_fixed_operand (&ops[0], op0);
14055 create_fixed_operand (&ops[1], op1);
14056
14057 start_sequence ();
14058 if (!maybe_expand_insn (icode, 2, ops))
14059 {
14060 end_sequence ();
14061 return NULL_RTX;
14062 }
14063 *gen_seq = get_insns ();
14064 end_sequence ();
14065
14066 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14067 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14068 }
14069
14070 static rtx
14071 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14072 int cmp_code, tree treeop0, tree treeop1, int bit_code)
14073 {
14074 rtx op0, op1, target;
14075 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14076 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14077 insn_code icode;
14078 struct expand_operand ops[6];
14079 int aarch64_cond;
14080
14081 push_to_sequence (*prep_seq);
14082 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14083
14084 op_mode = GET_MODE (op0);
14085 if (op_mode == VOIDmode)
14086 op_mode = GET_MODE (op1);
14087
14088 switch (op_mode)
14089 {
14090 case QImode:
14091 case HImode:
14092 case SImode:
14093 cmp_mode = SImode;
14094 icode = CODE_FOR_ccmpsi;
14095 break;
14096
14097 case DImode:
14098 cmp_mode = DImode;
14099 icode = CODE_FOR_ccmpdi;
14100 break;
14101
14102 case SFmode:
14103 cmp_mode = SFmode;
14104 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14105 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14106 break;
14107
14108 case DFmode:
14109 cmp_mode = DFmode;
14110 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14111 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14112 break;
14113
14114 default:
14115 end_sequence ();
14116 return NULL_RTX;
14117 }
14118
14119 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14120 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14121 if (!op0 || !op1)
14122 {
14123 end_sequence ();
14124 return NULL_RTX;
14125 }
14126 *prep_seq = get_insns ();
14127 end_sequence ();
14128
14129 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14130 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14131
14132 if (bit_code != AND)
14133 {
14134 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14135 GET_MODE (XEXP (prev, 0))),
14136 VOIDmode, XEXP (prev, 0), const0_rtx);
14137 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14138 }
14139
14140 create_fixed_operand (&ops[0], XEXP (prev, 0));
14141 create_fixed_operand (&ops[1], target);
14142 create_fixed_operand (&ops[2], op0);
14143 create_fixed_operand (&ops[3], op1);
14144 create_fixed_operand (&ops[4], prev);
14145 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14146
14147 push_to_sequence (*gen_seq);
14148 if (!maybe_expand_insn (icode, 6, ops))
14149 {
14150 end_sequence ();
14151 return NULL_RTX;
14152 }
14153
14154 *gen_seq = get_insns ();
14155 end_sequence ();
14156
14157 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14158 }
14159
14160 #undef TARGET_GEN_CCMP_FIRST
14161 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14162
14163 #undef TARGET_GEN_CCMP_NEXT
14164 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14165
14166 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14167 instruction fusion of some sort. */
14168
14169 static bool
14170 aarch64_macro_fusion_p (void)
14171 {
14172 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14173 }
14174
14175
14176 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14177 should be kept together during scheduling. */
14178
14179 static bool
14180 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14181 {
14182 rtx set_dest;
14183 rtx prev_set = single_set (prev);
14184 rtx curr_set = single_set (curr);
14185 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14186 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14187
14188 if (!aarch64_macro_fusion_p ())
14189 return false;
14190
14191 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14192 {
14193 /* We are trying to match:
14194 prev (mov) == (set (reg r0) (const_int imm16))
14195 curr (movk) == (set (zero_extract (reg r0)
14196 (const_int 16)
14197 (const_int 16))
14198 (const_int imm16_1)) */
14199
14200 set_dest = SET_DEST (curr_set);
14201
14202 if (GET_CODE (set_dest) == ZERO_EXTRACT
14203 && CONST_INT_P (SET_SRC (curr_set))
14204 && CONST_INT_P (SET_SRC (prev_set))
14205 && CONST_INT_P (XEXP (set_dest, 2))
14206 && INTVAL (XEXP (set_dest, 2)) == 16
14207 && REG_P (XEXP (set_dest, 0))
14208 && REG_P (SET_DEST (prev_set))
14209 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14210 {
14211 return true;
14212 }
14213 }
14214
14215 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14216 {
14217
14218 /* We're trying to match:
14219 prev (adrp) == (set (reg r1)
14220 (high (symbol_ref ("SYM"))))
14221 curr (add) == (set (reg r0)
14222 (lo_sum (reg r1)
14223 (symbol_ref ("SYM"))))
14224 Note that r0 need not necessarily be the same as r1, especially
14225 during pre-regalloc scheduling. */
14226
14227 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14228 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14229 {
14230 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14231 && REG_P (XEXP (SET_SRC (curr_set), 0))
14232 && REGNO (XEXP (SET_SRC (curr_set), 0))
14233 == REGNO (SET_DEST (prev_set))
14234 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14235 XEXP (SET_SRC (curr_set), 1)))
14236 return true;
14237 }
14238 }
14239
14240 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14241 {
14242
14243 /* We're trying to match:
14244 prev (movk) == (set (zero_extract (reg r0)
14245 (const_int 16)
14246 (const_int 32))
14247 (const_int imm16_1))
14248 curr (movk) == (set (zero_extract (reg r0)
14249 (const_int 16)
14250 (const_int 48))
14251 (const_int imm16_2)) */
14252
14253 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14254 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14255 && REG_P (XEXP (SET_DEST (prev_set), 0))
14256 && REG_P (XEXP (SET_DEST (curr_set), 0))
14257 && REGNO (XEXP (SET_DEST (prev_set), 0))
14258 == REGNO (XEXP (SET_DEST (curr_set), 0))
14259 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14260 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14261 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14262 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14263 && CONST_INT_P (SET_SRC (prev_set))
14264 && CONST_INT_P (SET_SRC (curr_set)))
14265 return true;
14266
14267 }
14268 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14269 {
14270 /* We're trying to match:
14271 prev (adrp) == (set (reg r0)
14272 (high (symbol_ref ("SYM"))))
14273 curr (ldr) == (set (reg r1)
14274 (mem (lo_sum (reg r0)
14275 (symbol_ref ("SYM")))))
14276 or
14277 curr (ldr) == (set (reg r1)
14278 (zero_extend (mem
14279 (lo_sum (reg r0)
14280 (symbol_ref ("SYM")))))) */
14281 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14282 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14283 {
14284 rtx curr_src = SET_SRC (curr_set);
14285
14286 if (GET_CODE (curr_src) == ZERO_EXTEND)
14287 curr_src = XEXP (curr_src, 0);
14288
14289 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14290 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14291 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14292 == REGNO (SET_DEST (prev_set))
14293 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14294 XEXP (SET_SRC (prev_set), 0)))
14295 return true;
14296 }
14297 }
14298
14299 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14300 && aarch_crypto_can_dual_issue (prev, curr))
14301 return true;
14302
14303 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14304 && any_condjump_p (curr))
14305 {
14306 enum attr_type prev_type = get_attr_type (prev);
14307
14308 unsigned int condreg1, condreg2;
14309 rtx cc_reg_1;
14310 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14311 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14312
14313 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14314 && prev
14315 && modified_in_p (cc_reg_1, prev))
14316 {
14317 /* FIXME: this misses some which is considered simple arthematic
14318 instructions for ThunderX. Simple shifts are missed here. */
14319 if (prev_type == TYPE_ALUS_SREG
14320 || prev_type == TYPE_ALUS_IMM
14321 || prev_type == TYPE_LOGICS_REG
14322 || prev_type == TYPE_LOGICS_IMM)
14323 return true;
14324 }
14325 }
14326
14327 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14328 && any_condjump_p (curr))
14329 {
14330 /* We're trying to match:
14331 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14332 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14333 (const_int 0))
14334 (label_ref ("SYM"))
14335 (pc)) */
14336 if (SET_DEST (curr_set) == (pc_rtx)
14337 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14338 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14339 && REG_P (SET_DEST (prev_set))
14340 && REGNO (SET_DEST (prev_set))
14341 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14342 {
14343 /* Fuse ALU operations followed by conditional branch instruction. */
14344 switch (get_attr_type (prev))
14345 {
14346 case TYPE_ALU_IMM:
14347 case TYPE_ALU_SREG:
14348 case TYPE_ADC_REG:
14349 case TYPE_ADC_IMM:
14350 case TYPE_ADCS_REG:
14351 case TYPE_ADCS_IMM:
14352 case TYPE_LOGIC_REG:
14353 case TYPE_LOGIC_IMM:
14354 case TYPE_CSEL:
14355 case TYPE_ADR:
14356 case TYPE_MOV_IMM:
14357 case TYPE_SHIFT_REG:
14358 case TYPE_SHIFT_IMM:
14359 case TYPE_BFM:
14360 case TYPE_RBIT:
14361 case TYPE_REV:
14362 case TYPE_EXTEND:
14363 return true;
14364
14365 default:;
14366 }
14367 }
14368 }
14369
14370 return false;
14371 }
14372
14373 /* Return true iff the instruction fusion described by OP is enabled. */
14374
14375 bool
14376 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14377 {
14378 return (aarch64_tune_params.fusible_ops & op) != 0;
14379 }
14380
14381 /* If MEM is in the form of [base+offset], extract the two parts
14382 of address and set to BASE and OFFSET, otherwise return false
14383 after clearing BASE and OFFSET. */
14384
14385 bool
14386 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14387 {
14388 rtx addr;
14389
14390 gcc_assert (MEM_P (mem));
14391
14392 addr = XEXP (mem, 0);
14393
14394 if (REG_P (addr))
14395 {
14396 *base = addr;
14397 *offset = const0_rtx;
14398 return true;
14399 }
14400
14401 if (GET_CODE (addr) == PLUS
14402 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14403 {
14404 *base = XEXP (addr, 0);
14405 *offset = XEXP (addr, 1);
14406 return true;
14407 }
14408
14409 *base = NULL_RTX;
14410 *offset = NULL_RTX;
14411
14412 return false;
14413 }
14414
14415 /* Types for scheduling fusion. */
14416 enum sched_fusion_type
14417 {
14418 SCHED_FUSION_NONE = 0,
14419 SCHED_FUSION_LD_SIGN_EXTEND,
14420 SCHED_FUSION_LD_ZERO_EXTEND,
14421 SCHED_FUSION_LD,
14422 SCHED_FUSION_ST,
14423 SCHED_FUSION_NUM
14424 };
14425
14426 /* If INSN is a load or store of address in the form of [base+offset],
14427 extract the two parts and set to BASE and OFFSET. Return scheduling
14428 fusion type this INSN is. */
14429
14430 static enum sched_fusion_type
14431 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14432 {
14433 rtx x, dest, src;
14434 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14435
14436 gcc_assert (INSN_P (insn));
14437 x = PATTERN (insn);
14438 if (GET_CODE (x) != SET)
14439 return SCHED_FUSION_NONE;
14440
14441 src = SET_SRC (x);
14442 dest = SET_DEST (x);
14443
14444 machine_mode dest_mode = GET_MODE (dest);
14445
14446 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14447 return SCHED_FUSION_NONE;
14448
14449 if (GET_CODE (src) == SIGN_EXTEND)
14450 {
14451 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14452 src = XEXP (src, 0);
14453 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14454 return SCHED_FUSION_NONE;
14455 }
14456 else if (GET_CODE (src) == ZERO_EXTEND)
14457 {
14458 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14459 src = XEXP (src, 0);
14460 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14461 return SCHED_FUSION_NONE;
14462 }
14463
14464 if (GET_CODE (src) == MEM && REG_P (dest))
14465 extract_base_offset_in_addr (src, base, offset);
14466 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14467 {
14468 fusion = SCHED_FUSION_ST;
14469 extract_base_offset_in_addr (dest, base, offset);
14470 }
14471 else
14472 return SCHED_FUSION_NONE;
14473
14474 if (*base == NULL_RTX || *offset == NULL_RTX)
14475 fusion = SCHED_FUSION_NONE;
14476
14477 return fusion;
14478 }
14479
14480 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14481
14482 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14483 and PRI are only calculated for these instructions. For other instruction,
14484 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14485 type instruction fusion can be added by returning different priorities.
14486
14487 It's important that irrelevant instructions get the largest FUSION_PRI. */
14488
14489 static void
14490 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14491 int *fusion_pri, int *pri)
14492 {
14493 int tmp, off_val;
14494 rtx base, offset;
14495 enum sched_fusion_type fusion;
14496
14497 gcc_assert (INSN_P (insn));
14498
14499 tmp = max_pri - 1;
14500 fusion = fusion_load_store (insn, &base, &offset);
14501 if (fusion == SCHED_FUSION_NONE)
14502 {
14503 *pri = tmp;
14504 *fusion_pri = tmp;
14505 return;
14506 }
14507
14508 /* Set FUSION_PRI according to fusion type and base register. */
14509 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14510
14511 /* Calculate PRI. */
14512 tmp /= 2;
14513
14514 /* INSN with smaller offset goes first. */
14515 off_val = (int)(INTVAL (offset));
14516 if (off_val >= 0)
14517 tmp -= (off_val & 0xfffff);
14518 else
14519 tmp += ((- off_val) & 0xfffff);
14520
14521 *pri = tmp;
14522 return;
14523 }
14524
14525 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14526 Adjust priority of sha1h instructions so they are scheduled before
14527 other SHA1 instructions. */
14528
14529 static int
14530 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14531 {
14532 rtx x = PATTERN (insn);
14533
14534 if (GET_CODE (x) == SET)
14535 {
14536 x = SET_SRC (x);
14537
14538 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14539 return priority + 10;
14540 }
14541
14542 return priority;
14543 }
14544
14545 /* Given OPERANDS of consecutive load/store, check if we can merge
14546 them into ldp/stp. LOAD is true if they are load instructions.
14547 MODE is the mode of memory operands. */
14548
14549 bool
14550 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14551 enum machine_mode mode)
14552 {
14553 HOST_WIDE_INT offval_1, offval_2, msize;
14554 enum reg_class rclass_1, rclass_2;
14555 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14556
14557 if (load)
14558 {
14559 mem_1 = operands[1];
14560 mem_2 = operands[3];
14561 reg_1 = operands[0];
14562 reg_2 = operands[2];
14563 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14564 if (REGNO (reg_1) == REGNO (reg_2))
14565 return false;
14566 }
14567 else
14568 {
14569 mem_1 = operands[0];
14570 mem_2 = operands[2];
14571 reg_1 = operands[1];
14572 reg_2 = operands[3];
14573 }
14574
14575 /* The mems cannot be volatile. */
14576 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14577 return false;
14578
14579 /* If we have SImode and slow unaligned ldp,
14580 check the alignment to be at least 8 byte. */
14581 if (mode == SImode
14582 && (aarch64_tune_params.extra_tuning_flags
14583 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14584 && !optimize_size
14585 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14586 return false;
14587
14588 /* Check if the addresses are in the form of [base+offset]. */
14589 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14590 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14591 return false;
14592 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14593 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14594 return false;
14595
14596 /* Check if the bases are same. */
14597 if (!rtx_equal_p (base_1, base_2))
14598 return false;
14599
14600 offval_1 = INTVAL (offset_1);
14601 offval_2 = INTVAL (offset_2);
14602 msize = GET_MODE_SIZE (mode);
14603 /* Check if the offsets are consecutive. */
14604 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14605 return false;
14606
14607 /* Check if the addresses are clobbered by load. */
14608 if (load)
14609 {
14610 if (reg_mentioned_p (reg_1, mem_1))
14611 return false;
14612
14613 /* In increasing order, the last load can clobber the address. */
14614 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14615 return false;
14616 }
14617
14618 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14619 rclass_1 = FP_REGS;
14620 else
14621 rclass_1 = GENERAL_REGS;
14622
14623 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14624 rclass_2 = FP_REGS;
14625 else
14626 rclass_2 = GENERAL_REGS;
14627
14628 /* Check if the registers are of same class. */
14629 if (rclass_1 != rclass_2)
14630 return false;
14631
14632 return true;
14633 }
14634
14635 /* Given OPERANDS of consecutive load/store, check if we can merge
14636 them into ldp/stp by adjusting the offset. LOAD is true if they
14637 are load instructions. MODE is the mode of memory operands.
14638
14639 Given below consecutive stores:
14640
14641 str w1, [xb, 0x100]
14642 str w1, [xb, 0x104]
14643 str w1, [xb, 0x108]
14644 str w1, [xb, 0x10c]
14645
14646 Though the offsets are out of the range supported by stp, we can
14647 still pair them after adjusting the offset, like:
14648
14649 add scratch, xb, 0x100
14650 stp w1, w1, [scratch]
14651 stp w1, w1, [scratch, 0x8]
14652
14653 The peephole patterns detecting this opportunity should guarantee
14654 the scratch register is avaliable. */
14655
14656 bool
14657 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14658 enum machine_mode mode)
14659 {
14660 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14661 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14662 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14663 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14664
14665 if (load)
14666 {
14667 reg_1 = operands[0];
14668 mem_1 = operands[1];
14669 reg_2 = operands[2];
14670 mem_2 = operands[3];
14671 reg_3 = operands[4];
14672 mem_3 = operands[5];
14673 reg_4 = operands[6];
14674 mem_4 = operands[7];
14675 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14676 && REG_P (reg_3) && REG_P (reg_4));
14677 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14678 return false;
14679 }
14680 else
14681 {
14682 mem_1 = operands[0];
14683 reg_1 = operands[1];
14684 mem_2 = operands[2];
14685 reg_2 = operands[3];
14686 mem_3 = operands[4];
14687 reg_3 = operands[5];
14688 mem_4 = operands[6];
14689 reg_4 = operands[7];
14690 }
14691 /* Skip if memory operand is by itslef valid for ldp/stp. */
14692 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14693 return false;
14694
14695 /* The mems cannot be volatile. */
14696 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14697 || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
14698 return false;
14699
14700 /* Check if the addresses are in the form of [base+offset]. */
14701 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14702 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14703 return false;
14704 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14705 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14706 return false;
14707 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14708 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14709 return false;
14710 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14711 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14712 return false;
14713
14714 /* Check if the bases are same. */
14715 if (!rtx_equal_p (base_1, base_2)
14716 || !rtx_equal_p (base_2, base_3)
14717 || !rtx_equal_p (base_3, base_4))
14718 return false;
14719
14720 offval_1 = INTVAL (offset_1);
14721 offval_2 = INTVAL (offset_2);
14722 offval_3 = INTVAL (offset_3);
14723 offval_4 = INTVAL (offset_4);
14724 msize = GET_MODE_SIZE (mode);
14725 /* Check if the offsets are consecutive. */
14726 if ((offval_1 != (offval_2 + msize)
14727 || offval_1 != (offval_3 + msize * 2)
14728 || offval_1 != (offval_4 + msize * 3))
14729 && (offval_4 != (offval_3 + msize)
14730 || offval_4 != (offval_2 + msize * 2)
14731 || offval_4 != (offval_1 + msize * 3)))
14732 return false;
14733
14734 /* Check if the addresses are clobbered by load. */
14735 if (load)
14736 {
14737 if (reg_mentioned_p (reg_1, mem_1)
14738 || reg_mentioned_p (reg_2, mem_2)
14739 || reg_mentioned_p (reg_3, mem_3))
14740 return false;
14741
14742 /* In increasing order, the last load can clobber the address. */
14743 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14744 return false;
14745 }
14746
14747 /* If we have SImode and slow unaligned ldp,
14748 check the alignment to be at least 8 byte. */
14749 if (mode == SImode
14750 && (aarch64_tune_params.extra_tuning_flags
14751 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14752 && !optimize_size
14753 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14754 return false;
14755
14756 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14757 rclass_1 = FP_REGS;
14758 else
14759 rclass_1 = GENERAL_REGS;
14760
14761 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14762 rclass_2 = FP_REGS;
14763 else
14764 rclass_2 = GENERAL_REGS;
14765
14766 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14767 rclass_3 = FP_REGS;
14768 else
14769 rclass_3 = GENERAL_REGS;
14770
14771 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14772 rclass_4 = FP_REGS;
14773 else
14774 rclass_4 = GENERAL_REGS;
14775
14776 /* Check if the registers are of same class. */
14777 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14778 return false;
14779
14780 return true;
14781 }
14782
14783 /* Given OPERANDS of consecutive load/store, this function pairs them
14784 into ldp/stp after adjusting the offset. It depends on the fact
14785 that addresses of load/store instructions are in increasing order.
14786 MODE is the mode of memory operands. CODE is the rtl operator
14787 which should be applied to all memory operands, it's SIGN_EXTEND,
14788 ZERO_EXTEND or UNKNOWN. */
14789
14790 bool
14791 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14792 enum machine_mode mode, RTX_CODE code)
14793 {
14794 rtx base, offset, t1, t2;
14795 rtx mem_1, mem_2, mem_3, mem_4;
14796 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14797
14798 if (load)
14799 {
14800 mem_1 = operands[1];
14801 mem_2 = operands[3];
14802 mem_3 = operands[5];
14803 mem_4 = operands[7];
14804 }
14805 else
14806 {
14807 mem_1 = operands[0];
14808 mem_2 = operands[2];
14809 mem_3 = operands[4];
14810 mem_4 = operands[6];
14811 gcc_assert (code == UNKNOWN);
14812 }
14813
14814 extract_base_offset_in_addr (mem_1, &base, &offset);
14815 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14816
14817 /* Adjust offset thus it can fit in ldp/stp instruction. */
14818 msize = GET_MODE_SIZE (mode);
14819 stp_off_limit = msize * 0x40;
14820 off_val = INTVAL (offset);
14821 abs_off = (off_val < 0) ? -off_val : off_val;
14822 new_off = abs_off % stp_off_limit;
14823 adj_off = abs_off - new_off;
14824
14825 /* Further adjust to make sure all offsets are OK. */
14826 if ((new_off + msize * 2) >= stp_off_limit)
14827 {
14828 adj_off += stp_off_limit;
14829 new_off -= stp_off_limit;
14830 }
14831
14832 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14833 if (adj_off >= 0x1000)
14834 return false;
14835
14836 if (off_val < 0)
14837 {
14838 adj_off = -adj_off;
14839 new_off = -new_off;
14840 }
14841
14842 /* Create new memory references. */
14843 mem_1 = change_address (mem_1, VOIDmode,
14844 plus_constant (DImode, operands[8], new_off));
14845
14846 /* Check if the adjusted address is OK for ldp/stp. */
14847 if (!aarch64_mem_pair_operand (mem_1, mode))
14848 return false;
14849
14850 msize = GET_MODE_SIZE (mode);
14851 mem_2 = change_address (mem_2, VOIDmode,
14852 plus_constant (DImode,
14853 operands[8],
14854 new_off + msize));
14855 mem_3 = change_address (mem_3, VOIDmode,
14856 plus_constant (DImode,
14857 operands[8],
14858 new_off + msize * 2));
14859 mem_4 = change_address (mem_4, VOIDmode,
14860 plus_constant (DImode,
14861 operands[8],
14862 new_off + msize * 3));
14863
14864 if (code == ZERO_EXTEND)
14865 {
14866 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14867 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14868 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14869 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14870 }
14871 else if (code == SIGN_EXTEND)
14872 {
14873 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14874 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14875 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14876 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14877 }
14878
14879 if (load)
14880 {
14881 operands[1] = mem_1;
14882 operands[3] = mem_2;
14883 operands[5] = mem_3;
14884 operands[7] = mem_4;
14885 }
14886 else
14887 {
14888 operands[0] = mem_1;
14889 operands[2] = mem_2;
14890 operands[4] = mem_3;
14891 operands[6] = mem_4;
14892 }
14893
14894 /* Emit adjusting instruction. */
14895 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14896 /* Emit ldp/stp instructions. */
14897 t1 = gen_rtx_SET (operands[0], operands[1]);
14898 t2 = gen_rtx_SET (operands[2], operands[3]);
14899 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14900 t1 = gen_rtx_SET (operands[4], operands[5]);
14901 t2 = gen_rtx_SET (operands[6], operands[7]);
14902 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14903 return true;
14904 }
14905
14906 /* Return 1 if pseudo register should be created and used to hold
14907 GOT address for PIC code. */
14908
14909 bool
14910 aarch64_use_pseudo_pic_reg (void)
14911 {
14912 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14913 }
14914
14915 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14916
14917 static int
14918 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14919 {
14920 switch (XINT (x, 1))
14921 {
14922 case UNSPEC_GOTSMALLPIC:
14923 case UNSPEC_GOTSMALLPIC28K:
14924 case UNSPEC_GOTTINYPIC:
14925 return 0;
14926 default:
14927 break;
14928 }
14929
14930 return default_unspec_may_trap_p (x, flags);
14931 }
14932
14933
14934 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
14935 return the log2 of that value. Otherwise return -1. */
14936
14937 int
14938 aarch64_fpconst_pow_of_2 (rtx x)
14939 {
14940 const REAL_VALUE_TYPE *r;
14941
14942 if (!CONST_DOUBLE_P (x))
14943 return -1;
14944
14945 r = CONST_DOUBLE_REAL_VALUE (x);
14946
14947 if (REAL_VALUE_NEGATIVE (*r)
14948 || REAL_VALUE_ISNAN (*r)
14949 || REAL_VALUE_ISINF (*r)
14950 || !real_isinteger (r, DFmode))
14951 return -1;
14952
14953 return exact_log2 (real_to_integer (r));
14954 }
14955
14956 /* If X is a vector of equal CONST_DOUBLE values and that value is
14957 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14958
14959 int
14960 aarch64_vec_fpconst_pow_of_2 (rtx x)
14961 {
14962 if (GET_CODE (x) != CONST_VECTOR)
14963 return -1;
14964
14965 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14966 return -1;
14967
14968 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14969 if (firstval <= 0)
14970 return -1;
14971
14972 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14973 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14974 return -1;
14975
14976 return firstval;
14977 }
14978
14979 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
14980 to float.
14981
14982 __fp16 always promotes through this hook.
14983 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
14984 through the generic excess precision logic rather than here. */
14985
14986 static tree
14987 aarch64_promoted_type (const_tree t)
14988 {
14989 if (SCALAR_FLOAT_TYPE_P (t)
14990 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
14991 return float_type_node;
14992
14993 return NULL_TREE;
14994 }
14995
14996 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14997
14998 static bool
14999 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15000 optimization_type opt_type)
15001 {
15002 switch (op)
15003 {
15004 case rsqrt_optab:
15005 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15006
15007 default:
15008 return true;
15009 }
15010 }
15011
15012 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15013 if MODE is HFmode, and punt to the generic implementation otherwise. */
15014
15015 static bool
15016 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
15017 {
15018 return (mode == HFmode
15019 ? true
15020 : default_libgcc_floating_mode_supported_p (mode));
15021 }
15022
15023 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15024 if MODE is HFmode, and punt to the generic implementation otherwise. */
15025
15026 static bool
15027 aarch64_scalar_mode_supported_p (machine_mode mode)
15028 {
15029 return (mode == HFmode
15030 ? true
15031 : default_scalar_mode_supported_p (mode));
15032 }
15033
15034 /* Set the value of FLT_EVAL_METHOD.
15035 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15036
15037 0: evaluate all operations and constants, whose semantic type has at
15038 most the range and precision of type float, to the range and
15039 precision of float; evaluate all other operations and constants to
15040 the range and precision of the semantic type;
15041
15042 N, where _FloatN is a supported interchange floating type
15043 evaluate all operations and constants, whose semantic type has at
15044 most the range and precision of _FloatN type, to the range and
15045 precision of the _FloatN type; evaluate all other operations and
15046 constants to the range and precision of the semantic type;
15047
15048 If we have the ARMv8.2-A extensions then we support _Float16 in native
15049 precision, so we should set this to 16. Otherwise, we support the type,
15050 but want to evaluate expressions in float precision, so set this to
15051 0. */
15052
15053 static enum flt_eval_method
15054 aarch64_excess_precision (enum excess_precision_type type)
15055 {
15056 switch (type)
15057 {
15058 case EXCESS_PRECISION_TYPE_FAST:
15059 case EXCESS_PRECISION_TYPE_STANDARD:
15060 /* We can calculate either in 16-bit range and precision or
15061 32-bit range and precision. Make that decision based on whether
15062 we have native support for the ARMv8.2-A 16-bit floating-point
15063 instructions or not. */
15064 return (TARGET_FP_F16INST
15065 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15066 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15067 case EXCESS_PRECISION_TYPE_IMPLICIT:
15068 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15069 default:
15070 gcc_unreachable ();
15071 }
15072 return FLT_EVAL_METHOD_UNPREDICTABLE;
15073 }
15074
15075 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15076 scheduled for speculative execution. Reject the long-running division
15077 and square-root instructions. */
15078
15079 static bool
15080 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15081 {
15082 switch (get_attr_type (insn))
15083 {
15084 case TYPE_SDIV:
15085 case TYPE_UDIV:
15086 case TYPE_FDIVS:
15087 case TYPE_FDIVD:
15088 case TYPE_FSQRTS:
15089 case TYPE_FSQRTD:
15090 case TYPE_NEON_FP_SQRT_S:
15091 case TYPE_NEON_FP_SQRT_D:
15092 case TYPE_NEON_FP_SQRT_S_Q:
15093 case TYPE_NEON_FP_SQRT_D_Q:
15094 case TYPE_NEON_FP_DIV_S:
15095 case TYPE_NEON_FP_DIV_D:
15096 case TYPE_NEON_FP_DIV_S_Q:
15097 case TYPE_NEON_FP_DIV_D_Q:
15098 return false;
15099 default:
15100 return true;
15101 }
15102 }
15103
15104 /* Target-specific selftests. */
15105
15106 #if CHECKING_P
15107
15108 namespace selftest {
15109
15110 /* Selftest for the RTL loader.
15111 Verify that the RTL loader copes with a dump from
15112 print_rtx_function. This is essentially just a test that class
15113 function_reader can handle a real dump, but it also verifies
15114 that lookup_reg_by_dump_name correctly handles hard regs.
15115 The presence of hard reg names in the dump means that the test is
15116 target-specific, hence it is in this file. */
15117
15118 static void
15119 aarch64_test_loading_full_dump ()
15120 {
15121 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15122
15123 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15124
15125 rtx_insn *insn_1 = get_insn_by_uid (1);
15126 ASSERT_EQ (NOTE, GET_CODE (insn_1));
15127
15128 rtx_insn *insn_15 = get_insn_by_uid (15);
15129 ASSERT_EQ (INSN, GET_CODE (insn_15));
15130 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15131
15132 /* Verify crtl->return_rtx. */
15133 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15134 ASSERT_EQ (0, REGNO (crtl->return_rtx));
15135 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15136 }
15137
15138 /* Run all target-specific selftests. */
15139
15140 static void
15141 aarch64_run_selftests (void)
15142 {
15143 aarch64_test_loading_full_dump ();
15144 }
15145
15146 } // namespace selftest
15147
15148 #endif /* #if CHECKING_P */
15149
15150 #undef TARGET_ADDRESS_COST
15151 #define TARGET_ADDRESS_COST aarch64_address_cost
15152
15153 /* This hook will determines whether unnamed bitfields affect the alignment
15154 of the containing structure. The hook returns true if the structure
15155 should inherit the alignment requirements of an unnamed bitfield's
15156 type. */
15157 #undef TARGET_ALIGN_ANON_BITFIELD
15158 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15159
15160 #undef TARGET_ASM_ALIGNED_DI_OP
15161 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15162
15163 #undef TARGET_ASM_ALIGNED_HI_OP
15164 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15165
15166 #undef TARGET_ASM_ALIGNED_SI_OP
15167 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15168
15169 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15170 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15171 hook_bool_const_tree_hwi_hwi_const_tree_true
15172
15173 #undef TARGET_ASM_FILE_START
15174 #define TARGET_ASM_FILE_START aarch64_start_file
15175
15176 #undef TARGET_ASM_OUTPUT_MI_THUNK
15177 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15178
15179 #undef TARGET_ASM_SELECT_RTX_SECTION
15180 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15181
15182 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15183 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15184
15185 #undef TARGET_BUILD_BUILTIN_VA_LIST
15186 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15187
15188 #undef TARGET_CALLEE_COPIES
15189 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15190
15191 #undef TARGET_CAN_ELIMINATE
15192 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15193
15194 #undef TARGET_CAN_INLINE_P
15195 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15196
15197 #undef TARGET_CANNOT_FORCE_CONST_MEM
15198 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15199
15200 #undef TARGET_CASE_VALUES_THRESHOLD
15201 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15202
15203 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15204 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15205
15206 /* Only the least significant bit is used for initialization guard
15207 variables. */
15208 #undef TARGET_CXX_GUARD_MASK_BIT
15209 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15210
15211 #undef TARGET_C_MODE_FOR_SUFFIX
15212 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15213
15214 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15215 #undef TARGET_DEFAULT_TARGET_FLAGS
15216 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15217 #endif
15218
15219 #undef TARGET_CLASS_MAX_NREGS
15220 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15221
15222 #undef TARGET_BUILTIN_DECL
15223 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15224
15225 #undef TARGET_BUILTIN_RECIPROCAL
15226 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15227
15228 #undef TARGET_C_EXCESS_PRECISION
15229 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15230
15231 #undef TARGET_EXPAND_BUILTIN
15232 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15233
15234 #undef TARGET_EXPAND_BUILTIN_VA_START
15235 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15236
15237 #undef TARGET_FOLD_BUILTIN
15238 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15239
15240 #undef TARGET_FUNCTION_ARG
15241 #define TARGET_FUNCTION_ARG aarch64_function_arg
15242
15243 #undef TARGET_FUNCTION_ARG_ADVANCE
15244 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15245
15246 #undef TARGET_FUNCTION_ARG_BOUNDARY
15247 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15248
15249 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15250 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15251
15252 #undef TARGET_FUNCTION_VALUE
15253 #define TARGET_FUNCTION_VALUE aarch64_function_value
15254
15255 #undef TARGET_FUNCTION_VALUE_REGNO_P
15256 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15257
15258 #undef TARGET_FRAME_POINTER_REQUIRED
15259 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15260
15261 #undef TARGET_GIMPLE_FOLD_BUILTIN
15262 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15263
15264 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15265 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15266
15267 #undef TARGET_INIT_BUILTINS
15268 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15269
15270 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15271 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15272 aarch64_ira_change_pseudo_allocno_class
15273
15274 #undef TARGET_LEGITIMATE_ADDRESS_P
15275 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15276
15277 #undef TARGET_LEGITIMATE_CONSTANT_P
15278 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15279
15280 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15281 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15282 aarch64_legitimize_address_displacement
15283
15284 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15285 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15286
15287 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15288 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15289 aarch64_libgcc_floating_mode_supported_p
15290
15291 #undef TARGET_MANGLE_TYPE
15292 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15293
15294 #undef TARGET_MEMORY_MOVE_COST
15295 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15296
15297 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15298 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15299
15300 #undef TARGET_MUST_PASS_IN_STACK
15301 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15302
15303 /* This target hook should return true if accesses to volatile bitfields
15304 should use the narrowest mode possible. It should return false if these
15305 accesses should use the bitfield container type. */
15306 #undef TARGET_NARROW_VOLATILE_BITFIELD
15307 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15308
15309 #undef TARGET_OPTION_OVERRIDE
15310 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15311
15312 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15313 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15314 aarch64_override_options_after_change
15315
15316 #undef TARGET_OPTION_SAVE
15317 #define TARGET_OPTION_SAVE aarch64_option_save
15318
15319 #undef TARGET_OPTION_RESTORE
15320 #define TARGET_OPTION_RESTORE aarch64_option_restore
15321
15322 #undef TARGET_OPTION_PRINT
15323 #define TARGET_OPTION_PRINT aarch64_option_print
15324
15325 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15326 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15327
15328 #undef TARGET_SET_CURRENT_FUNCTION
15329 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15330
15331 #undef TARGET_PASS_BY_REFERENCE
15332 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15333
15334 #undef TARGET_PREFERRED_RELOAD_CLASS
15335 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15336
15337 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15338 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15339
15340 #undef TARGET_PROMOTED_TYPE
15341 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15342
15343 #undef TARGET_SECONDARY_RELOAD
15344 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15345
15346 #undef TARGET_SHIFT_TRUNCATION_MASK
15347 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15348
15349 #undef TARGET_SETUP_INCOMING_VARARGS
15350 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15351
15352 #undef TARGET_STRUCT_VALUE_RTX
15353 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15354
15355 #undef TARGET_REGISTER_MOVE_COST
15356 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15357
15358 #undef TARGET_RETURN_IN_MEMORY
15359 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15360
15361 #undef TARGET_RETURN_IN_MSB
15362 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15363
15364 #undef TARGET_RTX_COSTS
15365 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15366
15367 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15368 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15369
15370 #undef TARGET_SCHED_ISSUE_RATE
15371 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15372
15373 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15374 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15375 aarch64_sched_first_cycle_multipass_dfa_lookahead
15376
15377 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15378 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15379 aarch64_first_cycle_multipass_dfa_lookahead_guard
15380
15381 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15382 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15383 aarch64_get_separate_components
15384
15385 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15386 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15387 aarch64_components_for_bb
15388
15389 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15390 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15391 aarch64_disqualify_components
15392
15393 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15394 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15395 aarch64_emit_prologue_components
15396
15397 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15398 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15399 aarch64_emit_epilogue_components
15400
15401 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15402 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15403 aarch64_set_handled_components
15404
15405 #undef TARGET_TRAMPOLINE_INIT
15406 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15407
15408 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15409 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15410
15411 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15412 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15413
15414 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15415 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15416 aarch64_builtin_support_vector_misalignment
15417
15418 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15419 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15420
15421 #undef TARGET_VECTORIZE_ADD_STMT_COST
15422 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15423
15424 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15425 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15426 aarch64_builtin_vectorization_cost
15427
15428 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15429 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15430
15431 #undef TARGET_VECTORIZE_BUILTINS
15432 #define TARGET_VECTORIZE_BUILTINS
15433
15434 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15435 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15436 aarch64_builtin_vectorized_function
15437
15438 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15439 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15440 aarch64_autovectorize_vector_sizes
15441
15442 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15443 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15444 aarch64_atomic_assign_expand_fenv
15445
15446 /* Section anchor support. */
15447
15448 #undef TARGET_MIN_ANCHOR_OFFSET
15449 #define TARGET_MIN_ANCHOR_OFFSET -256
15450
15451 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15452 byte offset; we can do much more for larger data types, but have no way
15453 to determine the size of the access. We assume accesses are aligned. */
15454 #undef TARGET_MAX_ANCHOR_OFFSET
15455 #define TARGET_MAX_ANCHOR_OFFSET 4095
15456
15457 #undef TARGET_VECTOR_ALIGNMENT
15458 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15459
15460 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15461 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15462 aarch64_simd_vector_alignment_reachable
15463
15464 /* vec_perm support. */
15465
15466 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15467 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15468 aarch64_vectorize_vec_perm_const_ok
15469
15470 #undef TARGET_INIT_LIBFUNCS
15471 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15472
15473 #undef TARGET_FIXED_CONDITION_CODE_REGS
15474 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15475
15476 #undef TARGET_FLAGS_REGNUM
15477 #define TARGET_FLAGS_REGNUM CC_REGNUM
15478
15479 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15480 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15481
15482 #undef TARGET_ASAN_SHADOW_OFFSET
15483 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15484
15485 #undef TARGET_LEGITIMIZE_ADDRESS
15486 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15487
15488 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15489 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15490 aarch64_use_by_pieces_infrastructure_p
15491
15492 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15493 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15494
15495 #undef TARGET_CAN_USE_DOLOOP_P
15496 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15497
15498 #undef TARGET_SCHED_ADJUST_PRIORITY
15499 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15500
15501 #undef TARGET_SCHED_MACRO_FUSION_P
15502 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15503
15504 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15505 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15506
15507 #undef TARGET_SCHED_FUSION_PRIORITY
15508 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15509
15510 #undef TARGET_UNSPEC_MAY_TRAP_P
15511 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15512
15513 #undef TARGET_USE_PSEUDO_PIC_REG
15514 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15515
15516 #undef TARGET_PRINT_OPERAND
15517 #define TARGET_PRINT_OPERAND aarch64_print_operand
15518
15519 #undef TARGET_PRINT_OPERAND_ADDRESS
15520 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15521
15522 #undef TARGET_OPTAB_SUPPORTED_P
15523 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15524
15525 #undef TARGET_OMIT_STRUCT_RETURN_REG
15526 #define TARGET_OMIT_STRUCT_RETURN_REG true
15527
15528 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15529 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15530 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15531
15532 #if CHECKING_P
15533 #undef TARGET_RUN_TARGET_SELFTESTS
15534 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15535 #endif /* #if CHECKING_P */
15536
15537 struct gcc_target targetm = TARGET_INITIALIZER;
15538
15539 #include "gt-aarch64.h"