]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/config/aarch64/aarch64.c
[AArch64] Tighten address register subreg checks
[thirdparty/gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "attribs.h"
37 #include "optabs.h"
38 #include "regs.h"
39 #include "emit-rtl.h"
40 #include "recog.h"
41 #include "diagnostic.h"
42 #include "insn-attr.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "stor-layout.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "flags.h"
50 #include "explow.h"
51 #include "expr.h"
52 #include "reload.h"
53 #include "langhooks.h"
54 #include "opts.h"
55 #include "params.h"
56 #include "gimplify.h"
57 #include "dwarf2.h"
58 #include "gimple-iterator.h"
59 #include "tree-vectorizer.h"
60 #include "aarch64-cost-tables.h"
61 #include "dumpfile.h"
62 #include "builtins.h"
63 #include "rtl-iter.h"
64 #include "tm-constrs.h"
65 #include "sched-int.h"
66 #include "target-globals.h"
67 #include "common/common-target.h"
68 #include "selftest.h"
69 #include "selftest-rtl.h"
70
71 /* This file should be included last. */
72 #include "target-def.h"
73
74 /* Defined for convenience. */
75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
76
77 /* Classifies an address.
78
79 ADDRESS_REG_IMM
80 A simple base register plus immediate offset.
81
82 ADDRESS_REG_WB
83 A base register indexed by immediate offset with writeback.
84
85 ADDRESS_REG_REG
86 A base register indexed by (optionally scaled) register.
87
88 ADDRESS_REG_UXTW
89 A base register indexed by (optionally scaled) zero-extended register.
90
91 ADDRESS_REG_SXTW
92 A base register indexed by (optionally scaled) sign-extended register.
93
94 ADDRESS_LO_SUM
95 A LO_SUM rtx with a base register and "LO12" symbol relocation.
96
97 ADDRESS_SYMBOLIC:
98 A constant symbolic address, in pc-relative literal pool. */
99
100 enum aarch64_address_type {
101 ADDRESS_REG_IMM,
102 ADDRESS_REG_WB,
103 ADDRESS_REG_REG,
104 ADDRESS_REG_UXTW,
105 ADDRESS_REG_SXTW,
106 ADDRESS_LO_SUM,
107 ADDRESS_SYMBOLIC
108 };
109
110 struct aarch64_address_info {
111 enum aarch64_address_type type;
112 rtx base;
113 rtx offset;
114 int shift;
115 enum aarch64_symbol_type symbol_type;
116 };
117
118 struct simd_immediate_info
119 {
120 rtx value;
121 int shift;
122 int element_width;
123 bool mvn;
124 bool msl;
125 };
126
127 /* The current code model. */
128 enum aarch64_code_model aarch64_cmodel;
129
130 #ifdef HAVE_AS_TLS
131 #undef TARGET_HAVE_TLS
132 #define TARGET_HAVE_TLS 1
133 #endif
134
135 static bool aarch64_composite_type_p (const_tree, machine_mode);
136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
137 const_tree,
138 machine_mode *, int *,
139 bool *);
140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
142 static void aarch64_override_options_after_change (void);
143 static bool aarch64_vector_mode_supported_p (machine_mode);
144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
145 const unsigned char *sel);
146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
148 const_tree type,
149 int misalignment,
150 bool is_packed);
151 static machine_mode
152 aarch64_simd_container_mode (machine_mode mode, unsigned width);
153
154 /* Major revision number of the ARM Architecture implemented by the target. */
155 unsigned aarch64_architecture_version;
156
157 /* The processor for which instructions should be scheduled. */
158 enum aarch64_processor aarch64_tune = cortexa53;
159
160 /* Mask to specify which instruction scheduling options should be used. */
161 unsigned long aarch64_tune_flags = 0;
162
163 /* Global flag for PC relative loads. */
164 bool aarch64_pcrelative_literal_loads;
165
166 /* Support for command line parsing of boolean flags in the tuning
167 structures. */
168 struct aarch64_flag_desc
169 {
170 const char* name;
171 unsigned int flag;
172 };
173
174 #define AARCH64_FUSION_PAIR(name, internal_name) \
175 { name, AARCH64_FUSE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
177 {
178 { "none", AARCH64_FUSE_NOTHING },
179 #include "aarch64-fusion-pairs.def"
180 { "all", AARCH64_FUSE_ALL },
181 { NULL, AARCH64_FUSE_NOTHING }
182 };
183
184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
185 { name, AARCH64_EXTRA_TUNE_##internal_name },
186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
187 {
188 { "none", AARCH64_EXTRA_TUNE_NONE },
189 #include "aarch64-tuning-flags.def"
190 { "all", AARCH64_EXTRA_TUNE_ALL },
191 { NULL, AARCH64_EXTRA_TUNE_NONE }
192 };
193
194 /* Tuning parameters. */
195
196 static const struct cpu_addrcost_table generic_addrcost_table =
197 {
198 {
199 1, /* hi */
200 0, /* si */
201 0, /* di */
202 1, /* ti */
203 },
204 0, /* pre_modify */
205 0, /* post_modify */
206 0, /* register_offset */
207 0, /* register_sextend */
208 0, /* register_zextend */
209 0 /* imm_offset */
210 };
211
212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
213 {
214 {
215 0, /* hi */
216 0, /* si */
217 0, /* di */
218 2, /* ti */
219 },
220 0, /* pre_modify */
221 0, /* post_modify */
222 1, /* register_offset */
223 1, /* register_sextend */
224 2, /* register_zextend */
225 0, /* imm_offset */
226 };
227
228 static const struct cpu_addrcost_table xgene1_addrcost_table =
229 {
230 {
231 1, /* hi */
232 0, /* si */
233 0, /* di */
234 1, /* ti */
235 },
236 1, /* pre_modify */
237 0, /* post_modify */
238 0, /* register_offset */
239 1, /* register_sextend */
240 1, /* register_zextend */
241 0, /* imm_offset */
242 };
243
244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
245 {
246 {
247 1, /* hi */
248 1, /* si */
249 1, /* di */
250 2, /* ti */
251 },
252 0, /* pre_modify */
253 0, /* post_modify */
254 2, /* register_offset */
255 3, /* register_sextend */
256 3, /* register_zextend */
257 0, /* imm_offset */
258 };
259
260 static const struct cpu_regmove_cost generic_regmove_cost =
261 {
262 1, /* GP2GP */
263 /* Avoid the use of slow int<->fp moves for spilling by setting
264 their cost higher than memmov_cost. */
265 5, /* GP2FP */
266 5, /* FP2GP */
267 2 /* FP2FP */
268 };
269
270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
271 {
272 1, /* GP2GP */
273 /* Avoid the use of slow int<->fp moves for spilling by setting
274 their cost higher than memmov_cost. */
275 5, /* GP2FP */
276 5, /* FP2GP */
277 2 /* FP2FP */
278 };
279
280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
281 {
282 1, /* GP2GP */
283 /* Avoid the use of slow int<->fp moves for spilling by setting
284 their cost higher than memmov_cost. */
285 5, /* GP2FP */
286 5, /* FP2GP */
287 2 /* FP2FP */
288 };
289
290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
291 {
292 1, /* GP2GP */
293 /* Avoid the use of slow int<->fp moves for spilling by setting
294 their cost higher than memmov_cost (actual, 4 and 9). */
295 9, /* GP2FP */
296 9, /* FP2GP */
297 1 /* FP2FP */
298 };
299
300 static const struct cpu_regmove_cost thunderx_regmove_cost =
301 {
302 2, /* GP2GP */
303 2, /* GP2FP */
304 6, /* FP2GP */
305 4 /* FP2FP */
306 };
307
308 static const struct cpu_regmove_cost xgene1_regmove_cost =
309 {
310 1, /* GP2GP */
311 /* Avoid the use of slow int<->fp moves for spilling by setting
312 their cost higher than memmov_cost. */
313 8, /* GP2FP */
314 8, /* FP2GP */
315 2 /* FP2FP */
316 };
317
318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
319 {
320 2, /* GP2GP */
321 /* Avoid the use of int<->fp moves for spilling. */
322 6, /* GP2FP */
323 6, /* FP2GP */
324 4 /* FP2FP */
325 };
326
327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
328 {
329 1, /* GP2GP */
330 /* Avoid the use of int<->fp moves for spilling. */
331 8, /* GP2FP */
332 8, /* FP2GP */
333 4 /* FP2FP */
334 };
335
336 /* Generic costs for vector insn classes. */
337 static const struct cpu_vector_cost generic_vector_cost =
338 {
339 1, /* scalar_int_stmt_cost */
340 1, /* scalar_fp_stmt_cost */
341 1, /* scalar_load_cost */
342 1, /* scalar_store_cost */
343 1, /* vec_int_stmt_cost */
344 1, /* vec_fp_stmt_cost */
345 2, /* vec_permute_cost */
346 1, /* vec_to_scalar_cost */
347 1, /* scalar_to_vec_cost */
348 1, /* vec_align_load_cost */
349 1, /* vec_unalign_load_cost */
350 1, /* vec_unalign_store_cost */
351 1, /* vec_store_cost */
352 3, /* cond_taken_branch_cost */
353 1 /* cond_not_taken_branch_cost */
354 };
355
356 /* ThunderX costs for vector insn classes. */
357 static const struct cpu_vector_cost thunderx_vector_cost =
358 {
359 1, /* scalar_int_stmt_cost */
360 1, /* scalar_fp_stmt_cost */
361 3, /* scalar_load_cost */
362 1, /* scalar_store_cost */
363 4, /* vec_int_stmt_cost */
364 1, /* vec_fp_stmt_cost */
365 4, /* vec_permute_cost */
366 2, /* vec_to_scalar_cost */
367 2, /* scalar_to_vec_cost */
368 3, /* vec_align_load_cost */
369 5, /* vec_unalign_load_cost */
370 5, /* vec_unalign_store_cost */
371 1, /* vec_store_cost */
372 3, /* cond_taken_branch_cost */
373 3 /* cond_not_taken_branch_cost */
374 };
375
376 /* Generic costs for vector insn classes. */
377 static const struct cpu_vector_cost cortexa57_vector_cost =
378 {
379 1, /* scalar_int_stmt_cost */
380 1, /* scalar_fp_stmt_cost */
381 4, /* scalar_load_cost */
382 1, /* scalar_store_cost */
383 2, /* vec_int_stmt_cost */
384 2, /* vec_fp_stmt_cost */
385 3, /* vec_permute_cost */
386 8, /* vec_to_scalar_cost */
387 8, /* scalar_to_vec_cost */
388 4, /* vec_align_load_cost */
389 4, /* vec_unalign_load_cost */
390 1, /* vec_unalign_store_cost */
391 1, /* vec_store_cost */
392 1, /* cond_taken_branch_cost */
393 1 /* cond_not_taken_branch_cost */
394 };
395
396 static const struct cpu_vector_cost exynosm1_vector_cost =
397 {
398 1, /* scalar_int_stmt_cost */
399 1, /* scalar_fp_stmt_cost */
400 5, /* scalar_load_cost */
401 1, /* scalar_store_cost */
402 3, /* vec_int_stmt_cost */
403 3, /* vec_fp_stmt_cost */
404 3, /* vec_permute_cost */
405 3, /* vec_to_scalar_cost */
406 3, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
413 };
414
415 /* Generic costs for vector insn classes. */
416 static const struct cpu_vector_cost xgene1_vector_cost =
417 {
418 1, /* scalar_int_stmt_cost */
419 1, /* scalar_fp_stmt_cost */
420 5, /* scalar_load_cost */
421 1, /* scalar_store_cost */
422 2, /* vec_int_stmt_cost */
423 2, /* vec_fp_stmt_cost */
424 2, /* vec_permute_cost */
425 4, /* vec_to_scalar_cost */
426 4, /* scalar_to_vec_cost */
427 10, /* vec_align_load_cost */
428 10, /* vec_unalign_load_cost */
429 2, /* vec_unalign_store_cost */
430 2, /* vec_store_cost */
431 2, /* cond_taken_branch_cost */
432 1 /* cond_not_taken_branch_cost */
433 };
434
435 /* Costs for vector insn classes for Vulcan. */
436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
437 {
438 1, /* scalar_int_stmt_cost */
439 6, /* scalar_fp_stmt_cost */
440 4, /* scalar_load_cost */
441 1, /* scalar_store_cost */
442 5, /* vec_int_stmt_cost */
443 6, /* vec_fp_stmt_cost */
444 3, /* vec_permute_cost */
445 6, /* vec_to_scalar_cost */
446 5, /* scalar_to_vec_cost */
447 8, /* vec_align_load_cost */
448 8, /* vec_unalign_load_cost */
449 4, /* vec_unalign_store_cost */
450 4, /* vec_store_cost */
451 2, /* cond_taken_branch_cost */
452 1 /* cond_not_taken_branch_cost */
453 };
454
455 /* Generic costs for branch instructions. */
456 static const struct cpu_branch_cost generic_branch_cost =
457 {
458 1, /* Predictable. */
459 3 /* Unpredictable. */
460 };
461
462 /* Generic approximation modes. */
463 static const cpu_approx_modes generic_approx_modes =
464 {
465 AARCH64_APPROX_NONE, /* division */
466 AARCH64_APPROX_NONE, /* sqrt */
467 AARCH64_APPROX_NONE /* recip_sqrt */
468 };
469
470 /* Approximation modes for Exynos M1. */
471 static const cpu_approx_modes exynosm1_approx_modes =
472 {
473 AARCH64_APPROX_NONE, /* division */
474 AARCH64_APPROX_ALL, /* sqrt */
475 AARCH64_APPROX_ALL /* recip_sqrt */
476 };
477
478 /* Approximation modes for X-Gene 1. */
479 static const cpu_approx_modes xgene1_approx_modes =
480 {
481 AARCH64_APPROX_NONE, /* division */
482 AARCH64_APPROX_NONE, /* sqrt */
483 AARCH64_APPROX_ALL /* recip_sqrt */
484 };
485
486 /* Generic prefetch settings (which disable prefetch). */
487 static const cpu_prefetch_tune generic_prefetch_tune =
488 {
489 0, /* num_slots */
490 -1, /* l1_cache_size */
491 -1, /* l1_cache_line_size */
492 -1, /* l2_cache_size */
493 -1 /* default_opt_level */
494 };
495
496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
497 {
498 0, /* num_slots */
499 -1, /* l1_cache_size */
500 64, /* l1_cache_line_size */
501 -1, /* l2_cache_size */
502 -1 /* default_opt_level */
503 };
504
505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
506 {
507 4, /* num_slots */
508 32, /* l1_cache_size */
509 64, /* l1_cache_line_size */
510 1024, /* l2_cache_size */
511 3 /* default_opt_level */
512 };
513
514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
515 {
516 8, /* num_slots */
517 32, /* l1_cache_size */
518 128, /* l1_cache_line_size */
519 16*1024, /* l2_cache_size */
520 3 /* default_opt_level */
521 };
522
523 static const cpu_prefetch_tune thunderx_prefetch_tune =
524 {
525 8, /* num_slots */
526 32, /* l1_cache_size */
527 128, /* l1_cache_line_size */
528 -1, /* l2_cache_size */
529 -1 /* default_opt_level */
530 };
531
532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
533 {
534 8, /* num_slots */
535 32, /* l1_cache_size */
536 64, /* l1_cache_line_size */
537 256, /* l2_cache_size */
538 -1 /* default_opt_level */
539 };
540
541 static const struct tune_params generic_tunings =
542 {
543 &cortexa57_extra_costs,
544 &generic_addrcost_table,
545 &generic_regmove_cost,
546 &generic_vector_cost,
547 &generic_branch_cost,
548 &generic_approx_modes,
549 4, /* memmov_cost */
550 2, /* issue_rate */
551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
552 8, /* function_align. */
553 4, /* jump_align. */
554 8, /* loop_align. */
555 2, /* int_reassoc_width. */
556 4, /* fp_reassoc_width. */
557 1, /* vec_reassoc_width. */
558 2, /* min_div_recip_mul_sf. */
559 2, /* min_div_recip_mul_df. */
560 0, /* max_case_values. */
561 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
562 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
563 &generic_prefetch_tune
564 };
565
566 static const struct tune_params cortexa35_tunings =
567 {
568 &cortexa53_extra_costs,
569 &generic_addrcost_table,
570 &cortexa53_regmove_cost,
571 &generic_vector_cost,
572 &generic_branch_cost,
573 &generic_approx_modes,
574 4, /* memmov_cost */
575 1, /* issue_rate */
576 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
577 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
578 16, /* function_align. */
579 4, /* jump_align. */
580 8, /* loop_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
588 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
589 &generic_prefetch_tune
590 };
591
592 static const struct tune_params cortexa53_tunings =
593 {
594 &cortexa53_extra_costs,
595 &generic_addrcost_table,
596 &cortexa53_regmove_cost,
597 &generic_vector_cost,
598 &generic_branch_cost,
599 &generic_approx_modes,
600 4, /* memmov_cost */
601 2, /* issue_rate */
602 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
603 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
604 16, /* function_align. */
605 4, /* jump_align. */
606 8, /* loop_align. */
607 2, /* int_reassoc_width. */
608 4, /* fp_reassoc_width. */
609 1, /* vec_reassoc_width. */
610 2, /* min_div_recip_mul_sf. */
611 2, /* min_div_recip_mul_df. */
612 0, /* max_case_values. */
613 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
614 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
615 &generic_prefetch_tune
616 };
617
618 static const struct tune_params cortexa57_tunings =
619 {
620 &cortexa57_extra_costs,
621 &generic_addrcost_table,
622 &cortexa57_regmove_cost,
623 &cortexa57_vector_cost,
624 &generic_branch_cost,
625 &generic_approx_modes,
626 4, /* memmov_cost */
627 3, /* issue_rate */
628 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
629 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
630 16, /* function_align. */
631 4, /* jump_align. */
632 8, /* loop_align. */
633 2, /* int_reassoc_width. */
634 4, /* fp_reassoc_width. */
635 1, /* vec_reassoc_width. */
636 2, /* min_div_recip_mul_sf. */
637 2, /* min_div_recip_mul_df. */
638 0, /* max_case_values. */
639 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
640 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
641 &generic_prefetch_tune
642 };
643
644 static const struct tune_params cortexa72_tunings =
645 {
646 &cortexa57_extra_costs,
647 &generic_addrcost_table,
648 &cortexa57_regmove_cost,
649 &cortexa57_vector_cost,
650 &generic_branch_cost,
651 &generic_approx_modes,
652 4, /* memmov_cost */
653 3, /* issue_rate */
654 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
655 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
656 16, /* function_align. */
657 4, /* jump_align. */
658 8, /* loop_align. */
659 2, /* int_reassoc_width. */
660 4, /* fp_reassoc_width. */
661 1, /* vec_reassoc_width. */
662 2, /* min_div_recip_mul_sf. */
663 2, /* min_div_recip_mul_df. */
664 0, /* max_case_values. */
665 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
666 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
667 &generic_prefetch_tune
668 };
669
670 static const struct tune_params cortexa73_tunings =
671 {
672 &cortexa57_extra_costs,
673 &generic_addrcost_table,
674 &cortexa57_regmove_cost,
675 &cortexa57_vector_cost,
676 &generic_branch_cost,
677 &generic_approx_modes,
678 4, /* memmov_cost. */
679 2, /* issue_rate. */
680 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
681 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
682 16, /* function_align. */
683 4, /* jump_align. */
684 8, /* loop_align. */
685 2, /* int_reassoc_width. */
686 4, /* fp_reassoc_width. */
687 1, /* vec_reassoc_width. */
688 2, /* min_div_recip_mul_sf. */
689 2, /* min_div_recip_mul_df. */
690 0, /* max_case_values. */
691 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
692 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
693 &generic_prefetch_tune
694 };
695
696
697
698 static const struct tune_params exynosm1_tunings =
699 {
700 &exynosm1_extra_costs,
701 &exynosm1_addrcost_table,
702 &exynosm1_regmove_cost,
703 &exynosm1_vector_cost,
704 &generic_branch_cost,
705 &exynosm1_approx_modes,
706 4, /* memmov_cost */
707 3, /* issue_rate */
708 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
709 4, /* function_align. */
710 4, /* jump_align. */
711 4, /* loop_align. */
712 2, /* int_reassoc_width. */
713 4, /* fp_reassoc_width. */
714 1, /* vec_reassoc_width. */
715 2, /* min_div_recip_mul_sf. */
716 2, /* min_div_recip_mul_df. */
717 48, /* max_case_values. */
718 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
719 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
720 &exynosm1_prefetch_tune
721 };
722
723 static const struct tune_params thunderxt88_tunings =
724 {
725 &thunderx_extra_costs,
726 &generic_addrcost_table,
727 &thunderx_regmove_cost,
728 &thunderx_vector_cost,
729 &generic_branch_cost,
730 &generic_approx_modes,
731 6, /* memmov_cost */
732 2, /* issue_rate */
733 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
734 8, /* function_align. */
735 8, /* jump_align. */
736 8, /* loop_align. */
737 2, /* int_reassoc_width. */
738 4, /* fp_reassoc_width. */
739 1, /* vec_reassoc_width. */
740 2, /* min_div_recip_mul_sf. */
741 2, /* min_div_recip_mul_df. */
742 0, /* max_case_values. */
743 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
745 &thunderxt88_prefetch_tune
746 };
747
748 static const struct tune_params thunderx_tunings =
749 {
750 &thunderx_extra_costs,
751 &generic_addrcost_table,
752 &thunderx_regmove_cost,
753 &thunderx_vector_cost,
754 &generic_branch_cost,
755 &generic_approx_modes,
756 6, /* memmov_cost */
757 2, /* issue_rate */
758 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
759 8, /* function_align. */
760 8, /* jump_align. */
761 8, /* loop_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
769 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
770 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
771 &thunderx_prefetch_tune
772 };
773
774 static const struct tune_params xgene1_tunings =
775 {
776 &xgene1_extra_costs,
777 &xgene1_addrcost_table,
778 &xgene1_regmove_cost,
779 &xgene1_vector_cost,
780 &generic_branch_cost,
781 &xgene1_approx_modes,
782 6, /* memmov_cost */
783 4, /* issue_rate */
784 AARCH64_FUSE_NOTHING, /* fusible_ops */
785 16, /* function_align. */
786 8, /* jump_align. */
787 16, /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
797 };
798
799 static const struct tune_params qdf24xx_tunings =
800 {
801 &qdf24xx_extra_costs,
802 &generic_addrcost_table,
803 &qdf24xx_regmove_cost,
804 &generic_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 4, /* memmov_cost */
808 4, /* issue_rate */
809 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
811 16, /* function_align. */
812 8, /* jump_align. */
813 16, /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
822 &qdf24xx_prefetch_tune
823 };
824
825 static const struct tune_params thunderx2t99_tunings =
826 {
827 &thunderx2t99_extra_costs,
828 &thunderx2t99_addrcost_table,
829 &thunderx2t99_regmove_cost,
830 &thunderx2t99_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 4, /* memmov_cost. */
834 4, /* issue_rate. */
835 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
836 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
837 16, /* function_align. */
838 8, /* jump_align. */
839 16, /* loop_align. */
840 3, /* int_reassoc_width. */
841 2, /* fp_reassoc_width. */
842 2, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &thunderx2t99_prefetch_tune
849 };
850
851 /* Support for fine-grained override of the tuning structures. */
852 struct aarch64_tuning_override_function
853 {
854 const char* name;
855 void (*parse_override)(const char*, struct tune_params*);
856 };
857
858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
860
861 static const struct aarch64_tuning_override_function
862 aarch64_tuning_override_functions[] =
863 {
864 { "fuse", aarch64_parse_fuse_string },
865 { "tune", aarch64_parse_tune_string },
866 { NULL, NULL }
867 };
868
869 /* A processor implementing AArch64. */
870 struct processor
871 {
872 const char *const name;
873 enum aarch64_processor ident;
874 enum aarch64_processor sched_core;
875 enum aarch64_arch arch;
876 unsigned architecture_version;
877 const unsigned long flags;
878 const struct tune_params *const tune;
879 };
880
881 /* Architectures implementing AArch64. */
882 static const struct processor all_architectures[] =
883 {
884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
885 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
886 #include "aarch64-arches.def"
887 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
888 };
889
890 /* Processor cores implementing AArch64. */
891 static const struct processor all_cores[] =
892 {
893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
894 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
895 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
896 FLAGS, &COSTS##_tunings},
897 #include "aarch64-cores.def"
898 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
899 AARCH64_FL_FOR_ARCH8, &generic_tunings},
900 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
901 };
902
903
904 /* Target specification. These are populated by the -march, -mtune, -mcpu
905 handling code or by target attributes. */
906 static const struct processor *selected_arch;
907 static const struct processor *selected_cpu;
908 static const struct processor *selected_tune;
909
910 /* The current tuning set. */
911 struct tune_params aarch64_tune_params = generic_tunings;
912
913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
914
915 /* An ISA extension in the co-processor and main instruction set space. */
916 struct aarch64_option_extension
917 {
918 const char *const name;
919 const unsigned long flags_on;
920 const unsigned long flags_off;
921 };
922
923 typedef enum aarch64_cond_code
924 {
925 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
926 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
927 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
928 }
929 aarch64_cc;
930
931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
932
933 /* The condition codes of the processor, and the inverse function. */
934 static const char * const aarch64_condition_codes[] =
935 {
936 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
937 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
938 };
939
940 /* Generate code to enable conditional branches in functions over 1 MiB. */
941 const char *
942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
943 const char * branch_format)
944 {
945 rtx_code_label * tmp_label = gen_label_rtx ();
946 char label_buf[256];
947 char buffer[128];
948 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
949 CODE_LABEL_NUMBER (tmp_label));
950 const char *label_ptr = targetm.strip_name_encoding (label_buf);
951 rtx dest_label = operands[pos_label];
952 operands[pos_label] = tmp_label;
953
954 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
955 output_asm_insn (buffer, operands);
956
957 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
958 operands[pos_label] = dest_label;
959 output_asm_insn (buffer, operands);
960 return "";
961 }
962
963 void
964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
965 {
966 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
967 if (TARGET_GENERAL_REGS_ONLY)
968 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
969 else
970 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
971 }
972
973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
974 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
975 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
976 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
977 cost (in this case the best class is the lowest cost one). Using ALL_REGS
978 irrespectively of its cost results in bad allocations with many redundant
979 int<->FP moves which are expensive on various cores.
980 To avoid this we don't allow ALL_REGS as the allocno class, but force a
981 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
982 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
983 Otherwise set the allocno class depending on the mode.
984 The result of this is that it is no longer inefficient to have a higher
985 memory move cost than the register move cost.
986 */
987
988 static reg_class_t
989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
990 reg_class_t best_class)
991 {
992 machine_mode mode;
993
994 if (allocno_class != ALL_REGS)
995 return allocno_class;
996
997 if (best_class != ALL_REGS)
998 return best_class;
999
1000 mode = PSEUDO_REGNO_MODE (regno);
1001 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1002 }
1003
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1006 {
1007 if (GET_MODE_UNIT_SIZE (mode) == 4)
1008 return aarch64_tune_params.min_div_recip_mul_sf;
1009 return aarch64_tune_params.min_div_recip_mul_df;
1010 }
1011
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014 machine_mode mode)
1015 {
1016 if (VECTOR_MODE_P (mode))
1017 return aarch64_tune_params.vec_reassoc_width;
1018 if (INTEGRAL_MODE_P (mode))
1019 return aarch64_tune_params.int_reassoc_width;
1020 if (FLOAT_MODE_P (mode))
1021 return aarch64_tune_params.fp_reassoc_width;
1022 return 1;
1023 }
1024
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1028 {
1029 if (GP_REGNUM_P (regno))
1030 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031 else if (regno == SP_REGNUM)
1032 return AARCH64_DWARF_SP;
1033 else if (FP_REGNUM_P (regno))
1034 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1035
1036 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037 equivalent DWARF register. */
1038 return DWARF_FRAME_REGISTERS;
1039 }
1040
1041 /* Return TRUE if MODE is any of the large INT modes. */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1044 {
1045 return mode == OImode || mode == CImode || mode == XImode;
1046 }
1047
1048 /* Return TRUE if MODE is any of the vector modes. */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1051 {
1052 return aarch64_vector_mode_supported_p (mode)
1053 || aarch64_vect_struct_mode_p (mode);
1054 }
1055
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059 unsigned HOST_WIDE_INT nelems)
1060 {
1061 if (TARGET_SIMD
1062 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064 && (nelems >= 2 && nelems <= 4))
1065 return true;
1066
1067 return false;
1068 }
1069
1070 /* Implement HARD_REGNO_NREGS. */
1071
1072 int
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1074 {
1075 switch (aarch64_regno_regclass (regno))
1076 {
1077 case FP_REGS:
1078 case FP_LO_REGS:
1079 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080 default:
1081 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1082 }
1083 gcc_unreachable ();
1084 }
1085
1086 /* Implement HARD_REGNO_MODE_OK. */
1087
1088 int
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1090 {
1091 if (GET_MODE_CLASS (mode) == MODE_CC)
1092 return regno == CC_REGNUM;
1093
1094 if (regno == SP_REGNUM)
1095 /* The purpose of comparing with ptr_mode is to support the
1096 global register variable associated with the stack pointer
1097 register via the syntax of asm ("wsp") in ILP32. */
1098 return mode == Pmode || mode == ptr_mode;
1099
1100 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101 return mode == Pmode;
1102
1103 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104 return 1;
1105
1106 if (FP_REGNUM_P (regno))
1107 {
1108 if (aarch64_vect_struct_mode_p (mode))
1109 return
1110 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1111 else
1112 return 1;
1113 }
1114
1115 return 0;
1116 }
1117
1118 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1119 machine_mode
1120 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1121 machine_mode mode)
1122 {
1123 /* Handle modes that fit within single registers. */
1124 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1125 {
1126 if (GET_MODE_SIZE (mode) >= 4)
1127 return mode;
1128 else
1129 return SImode;
1130 }
1131 /* Fall back to generic for multi-reg and very large modes. */
1132 else
1133 return choose_hard_reg_mode (regno, nregs, false);
1134 }
1135
1136 /* Return true if calls to DECL should be treated as
1137 long-calls (ie called via a register). */
1138 static bool
1139 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1140 {
1141 return false;
1142 }
1143
1144 /* Return true if calls to symbol-ref SYM should be treated as
1145 long-calls (ie called via a register). */
1146 bool
1147 aarch64_is_long_call_p (rtx sym)
1148 {
1149 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1150 }
1151
1152 /* Return true if calls to symbol-ref SYM should not go through
1153 plt stubs. */
1154
1155 bool
1156 aarch64_is_noplt_call_p (rtx sym)
1157 {
1158 const_tree decl = SYMBOL_REF_DECL (sym);
1159
1160 if (flag_pic
1161 && decl
1162 && (!flag_plt
1163 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1164 && !targetm.binds_local_p (decl))
1165 return true;
1166
1167 return false;
1168 }
1169
1170 /* Return true if the offsets to a zero/sign-extract operation
1171 represent an expression that matches an extend operation. The
1172 operands represent the paramters from
1173
1174 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1175 bool
1176 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1177 rtx extract_imm)
1178 {
1179 HOST_WIDE_INT mult_val, extract_val;
1180
1181 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1182 return false;
1183
1184 mult_val = INTVAL (mult_imm);
1185 extract_val = INTVAL (extract_imm);
1186
1187 if (extract_val > 8
1188 && extract_val < GET_MODE_BITSIZE (mode)
1189 && exact_log2 (extract_val & ~7) > 0
1190 && (extract_val & 7) <= 4
1191 && mult_val == (1 << (extract_val & 7)))
1192 return true;
1193
1194 return false;
1195 }
1196
1197 /* Emit an insn that's a simple single-set. Both the operands must be
1198 known to be valid. */
1199 inline static rtx_insn *
1200 emit_set_insn (rtx x, rtx y)
1201 {
1202 return emit_insn (gen_rtx_SET (x, y));
1203 }
1204
1205 /* X and Y are two things to compare using CODE. Emit the compare insn and
1206 return the rtx for register 0 in the proper mode. */
1207 rtx
1208 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1209 {
1210 machine_mode mode = SELECT_CC_MODE (code, x, y);
1211 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1212
1213 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1214 return cc_reg;
1215 }
1216
1217 /* Build the SYMBOL_REF for __tls_get_addr. */
1218
1219 static GTY(()) rtx tls_get_addr_libfunc;
1220
1221 rtx
1222 aarch64_tls_get_addr (void)
1223 {
1224 if (!tls_get_addr_libfunc)
1225 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1226 return tls_get_addr_libfunc;
1227 }
1228
1229 /* Return the TLS model to use for ADDR. */
1230
1231 static enum tls_model
1232 tls_symbolic_operand_type (rtx addr)
1233 {
1234 enum tls_model tls_kind = TLS_MODEL_NONE;
1235 rtx sym, addend;
1236
1237 if (GET_CODE (addr) == CONST)
1238 {
1239 split_const (addr, &sym, &addend);
1240 if (GET_CODE (sym) == SYMBOL_REF)
1241 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1242 }
1243 else if (GET_CODE (addr) == SYMBOL_REF)
1244 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1245
1246 return tls_kind;
1247 }
1248
1249 /* We'll allow lo_sum's in addresses in our legitimate addresses
1250 so that combine would take care of combining addresses where
1251 necessary, but for generation purposes, we'll generate the address
1252 as :
1253 RTL Absolute
1254 tmp = hi (symbol_ref); adrp x1, foo
1255 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1256 nop
1257
1258 PIC TLS
1259 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1260 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1261 bl __tls_get_addr
1262 nop
1263
1264 Load TLS symbol, depending on TLS mechanism and TLS access model.
1265
1266 Global Dynamic - Traditional TLS:
1267 adrp tmp, :tlsgd:imm
1268 add dest, tmp, #:tlsgd_lo12:imm
1269 bl __tls_get_addr
1270
1271 Global Dynamic - TLS Descriptors:
1272 adrp dest, :tlsdesc:imm
1273 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1274 add dest, dest, #:tlsdesc_lo12:imm
1275 blr tmp
1276 mrs tp, tpidr_el0
1277 add dest, dest, tp
1278
1279 Initial Exec:
1280 mrs tp, tpidr_el0
1281 adrp tmp, :gottprel:imm
1282 ldr dest, [tmp, #:gottprel_lo12:imm]
1283 add dest, dest, tp
1284
1285 Local Exec:
1286 mrs tp, tpidr_el0
1287 add t0, tp, #:tprel_hi12:imm, lsl #12
1288 add t0, t0, #:tprel_lo12_nc:imm
1289 */
1290
1291 static void
1292 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1293 enum aarch64_symbol_type type)
1294 {
1295 switch (type)
1296 {
1297 case SYMBOL_SMALL_ABSOLUTE:
1298 {
1299 /* In ILP32, the mode of dest can be either SImode or DImode. */
1300 rtx tmp_reg = dest;
1301 machine_mode mode = GET_MODE (dest);
1302
1303 gcc_assert (mode == Pmode || mode == ptr_mode);
1304
1305 if (can_create_pseudo_p ())
1306 tmp_reg = gen_reg_rtx (mode);
1307
1308 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1309 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1310 return;
1311 }
1312
1313 case SYMBOL_TINY_ABSOLUTE:
1314 emit_insn (gen_rtx_SET (dest, imm));
1315 return;
1316
1317 case SYMBOL_SMALL_GOT_28K:
1318 {
1319 machine_mode mode = GET_MODE (dest);
1320 rtx gp_rtx = pic_offset_table_rtx;
1321 rtx insn;
1322 rtx mem;
1323
1324 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1325 here before rtl expand. Tree IVOPT will generate rtl pattern to
1326 decide rtx costs, in which case pic_offset_table_rtx is not
1327 initialized. For that case no need to generate the first adrp
1328 instruction as the final cost for global variable access is
1329 one instruction. */
1330 if (gp_rtx != NULL)
1331 {
1332 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1333 using the page base as GOT base, the first page may be wasted,
1334 in the worst scenario, there is only 28K space for GOT).
1335
1336 The generate instruction sequence for accessing global variable
1337 is:
1338
1339 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1340
1341 Only one instruction needed. But we must initialize
1342 pic_offset_table_rtx properly. We generate initialize insn for
1343 every global access, and allow CSE to remove all redundant.
1344
1345 The final instruction sequences will look like the following
1346 for multiply global variables access.
1347
1348 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1349
1350 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1351 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1352 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1353 ... */
1354
1355 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1356 crtl->uses_pic_offset_table = 1;
1357 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1358
1359 if (mode != GET_MODE (gp_rtx))
1360 gp_rtx = gen_lowpart (mode, gp_rtx);
1361
1362 }
1363
1364 if (mode == ptr_mode)
1365 {
1366 if (mode == DImode)
1367 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1368 else
1369 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1370
1371 mem = XVECEXP (SET_SRC (insn), 0, 0);
1372 }
1373 else
1374 {
1375 gcc_assert (mode == Pmode);
1376
1377 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1378 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1379 }
1380
1381 /* The operand is expected to be MEM. Whenever the related insn
1382 pattern changed, above code which calculate mem should be
1383 updated. */
1384 gcc_assert (GET_CODE (mem) == MEM);
1385 MEM_READONLY_P (mem) = 1;
1386 MEM_NOTRAP_P (mem) = 1;
1387 emit_insn (insn);
1388 return;
1389 }
1390
1391 case SYMBOL_SMALL_GOT_4G:
1392 {
1393 /* In ILP32, the mode of dest can be either SImode or DImode,
1394 while the got entry is always of SImode size. The mode of
1395 dest depends on how dest is used: if dest is assigned to a
1396 pointer (e.g. in the memory), it has SImode; it may have
1397 DImode if dest is dereferenced to access the memeory.
1398 This is why we have to handle three different ldr_got_small
1399 patterns here (two patterns for ILP32). */
1400
1401 rtx insn;
1402 rtx mem;
1403 rtx tmp_reg = dest;
1404 machine_mode mode = GET_MODE (dest);
1405
1406 if (can_create_pseudo_p ())
1407 tmp_reg = gen_reg_rtx (mode);
1408
1409 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1410 if (mode == ptr_mode)
1411 {
1412 if (mode == DImode)
1413 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1414 else
1415 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1416
1417 mem = XVECEXP (SET_SRC (insn), 0, 0);
1418 }
1419 else
1420 {
1421 gcc_assert (mode == Pmode);
1422
1423 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1424 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1425 }
1426
1427 gcc_assert (GET_CODE (mem) == MEM);
1428 MEM_READONLY_P (mem) = 1;
1429 MEM_NOTRAP_P (mem) = 1;
1430 emit_insn (insn);
1431 return;
1432 }
1433
1434 case SYMBOL_SMALL_TLSGD:
1435 {
1436 rtx_insn *insns;
1437 machine_mode mode = GET_MODE (dest);
1438 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1439
1440 start_sequence ();
1441 if (TARGET_ILP32)
1442 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1443 else
1444 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1445 insns = get_insns ();
1446 end_sequence ();
1447
1448 RTL_CONST_CALL_P (insns) = 1;
1449 emit_libcall_block (insns, dest, result, imm);
1450 return;
1451 }
1452
1453 case SYMBOL_SMALL_TLSDESC:
1454 {
1455 machine_mode mode = GET_MODE (dest);
1456 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1457 rtx tp;
1458
1459 gcc_assert (mode == Pmode || mode == ptr_mode);
1460
1461 /* In ILP32, the got entry is always of SImode size. Unlike
1462 small GOT, the dest is fixed at reg 0. */
1463 if (TARGET_ILP32)
1464 emit_insn (gen_tlsdesc_small_si (imm));
1465 else
1466 emit_insn (gen_tlsdesc_small_di (imm));
1467 tp = aarch64_load_tp (NULL);
1468
1469 if (mode != Pmode)
1470 tp = gen_lowpart (mode, tp);
1471
1472 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1473 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1474 return;
1475 }
1476
1477 case SYMBOL_SMALL_TLSIE:
1478 {
1479 /* In ILP32, the mode of dest can be either SImode or DImode,
1480 while the got entry is always of SImode size. The mode of
1481 dest depends on how dest is used: if dest is assigned to a
1482 pointer (e.g. in the memory), it has SImode; it may have
1483 DImode if dest is dereferenced to access the memeory.
1484 This is why we have to handle three different tlsie_small
1485 patterns here (two patterns for ILP32). */
1486 machine_mode mode = GET_MODE (dest);
1487 rtx tmp_reg = gen_reg_rtx (mode);
1488 rtx tp = aarch64_load_tp (NULL);
1489
1490 if (mode == ptr_mode)
1491 {
1492 if (mode == DImode)
1493 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1494 else
1495 {
1496 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1497 tp = gen_lowpart (mode, tp);
1498 }
1499 }
1500 else
1501 {
1502 gcc_assert (mode == Pmode);
1503 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1504 }
1505
1506 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1507 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1508 return;
1509 }
1510
1511 case SYMBOL_TLSLE12:
1512 case SYMBOL_TLSLE24:
1513 case SYMBOL_TLSLE32:
1514 case SYMBOL_TLSLE48:
1515 {
1516 machine_mode mode = GET_MODE (dest);
1517 rtx tp = aarch64_load_tp (NULL);
1518
1519 if (mode != Pmode)
1520 tp = gen_lowpart (mode, tp);
1521
1522 switch (type)
1523 {
1524 case SYMBOL_TLSLE12:
1525 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1526 (dest, tp, imm));
1527 break;
1528 case SYMBOL_TLSLE24:
1529 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1530 (dest, tp, imm));
1531 break;
1532 case SYMBOL_TLSLE32:
1533 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1534 (dest, imm));
1535 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1536 (dest, dest, tp));
1537 break;
1538 case SYMBOL_TLSLE48:
1539 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1540 (dest, imm));
1541 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1542 (dest, dest, tp));
1543 break;
1544 default:
1545 gcc_unreachable ();
1546 }
1547
1548 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1549 return;
1550 }
1551
1552 case SYMBOL_TINY_GOT:
1553 emit_insn (gen_ldr_got_tiny (dest, imm));
1554 return;
1555
1556 case SYMBOL_TINY_TLSIE:
1557 {
1558 machine_mode mode = GET_MODE (dest);
1559 rtx tp = aarch64_load_tp (NULL);
1560
1561 if (mode == ptr_mode)
1562 {
1563 if (mode == DImode)
1564 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1565 else
1566 {
1567 tp = gen_lowpart (mode, tp);
1568 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1569 }
1570 }
1571 else
1572 {
1573 gcc_assert (mode == Pmode);
1574 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1575 }
1576
1577 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1578 return;
1579 }
1580
1581 default:
1582 gcc_unreachable ();
1583 }
1584 }
1585
1586 /* Emit a move from SRC to DEST. Assume that the move expanders can
1587 handle all moves if !can_create_pseudo_p (). The distinction is
1588 important because, unlike emit_move_insn, the move expanders know
1589 how to force Pmode objects into the constant pool even when the
1590 constant pool address is not itself legitimate. */
1591 static rtx
1592 aarch64_emit_move (rtx dest, rtx src)
1593 {
1594 return (can_create_pseudo_p ()
1595 ? emit_move_insn (dest, src)
1596 : emit_move_insn_1 (dest, src));
1597 }
1598
1599 /* Split a 128-bit move operation into two 64-bit move operations,
1600 taking care to handle partial overlap of register to register
1601 copies. Special cases are needed when moving between GP regs and
1602 FP regs. SRC can be a register, constant or memory; DST a register
1603 or memory. If either operand is memory it must not have any side
1604 effects. */
1605 void
1606 aarch64_split_128bit_move (rtx dst, rtx src)
1607 {
1608 rtx dst_lo, dst_hi;
1609 rtx src_lo, src_hi;
1610
1611 machine_mode mode = GET_MODE (dst);
1612
1613 gcc_assert (mode == TImode || mode == TFmode);
1614 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1615 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1616
1617 if (REG_P (dst) && REG_P (src))
1618 {
1619 int src_regno = REGNO (src);
1620 int dst_regno = REGNO (dst);
1621
1622 /* Handle FP <-> GP regs. */
1623 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1624 {
1625 src_lo = gen_lowpart (word_mode, src);
1626 src_hi = gen_highpart (word_mode, src);
1627
1628 if (mode == TImode)
1629 {
1630 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1631 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1632 }
1633 else
1634 {
1635 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1636 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1637 }
1638 return;
1639 }
1640 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1641 {
1642 dst_lo = gen_lowpart (word_mode, dst);
1643 dst_hi = gen_highpart (word_mode, dst);
1644
1645 if (mode == TImode)
1646 {
1647 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1648 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1649 }
1650 else
1651 {
1652 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1653 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1654 }
1655 return;
1656 }
1657 }
1658
1659 dst_lo = gen_lowpart (word_mode, dst);
1660 dst_hi = gen_highpart (word_mode, dst);
1661 src_lo = gen_lowpart (word_mode, src);
1662 src_hi = gen_highpart_mode (word_mode, mode, src);
1663
1664 /* At most one pairing may overlap. */
1665 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1666 {
1667 aarch64_emit_move (dst_hi, src_hi);
1668 aarch64_emit_move (dst_lo, src_lo);
1669 }
1670 else
1671 {
1672 aarch64_emit_move (dst_lo, src_lo);
1673 aarch64_emit_move (dst_hi, src_hi);
1674 }
1675 }
1676
1677 bool
1678 aarch64_split_128bit_move_p (rtx dst, rtx src)
1679 {
1680 return (! REG_P (src)
1681 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1682 }
1683
1684 /* Split a complex SIMD combine. */
1685
1686 void
1687 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1688 {
1689 machine_mode src_mode = GET_MODE (src1);
1690 machine_mode dst_mode = GET_MODE (dst);
1691
1692 gcc_assert (VECTOR_MODE_P (dst_mode));
1693 gcc_assert (register_operand (dst, dst_mode)
1694 && register_operand (src1, src_mode)
1695 && register_operand (src2, src_mode));
1696
1697 rtx (*gen) (rtx, rtx, rtx);
1698
1699 switch (src_mode)
1700 {
1701 case E_V8QImode:
1702 gen = gen_aarch64_simd_combinev8qi;
1703 break;
1704 case E_V4HImode:
1705 gen = gen_aarch64_simd_combinev4hi;
1706 break;
1707 case E_V2SImode:
1708 gen = gen_aarch64_simd_combinev2si;
1709 break;
1710 case E_V4HFmode:
1711 gen = gen_aarch64_simd_combinev4hf;
1712 break;
1713 case E_V2SFmode:
1714 gen = gen_aarch64_simd_combinev2sf;
1715 break;
1716 case E_DImode:
1717 gen = gen_aarch64_simd_combinedi;
1718 break;
1719 case E_DFmode:
1720 gen = gen_aarch64_simd_combinedf;
1721 break;
1722 default:
1723 gcc_unreachable ();
1724 }
1725
1726 emit_insn (gen (dst, src1, src2));
1727 return;
1728 }
1729
1730 /* Split a complex SIMD move. */
1731
1732 void
1733 aarch64_split_simd_move (rtx dst, rtx src)
1734 {
1735 machine_mode src_mode = GET_MODE (src);
1736 machine_mode dst_mode = GET_MODE (dst);
1737
1738 gcc_assert (VECTOR_MODE_P (dst_mode));
1739
1740 if (REG_P (dst) && REG_P (src))
1741 {
1742 rtx (*gen) (rtx, rtx);
1743
1744 gcc_assert (VECTOR_MODE_P (src_mode));
1745
1746 switch (src_mode)
1747 {
1748 case E_V16QImode:
1749 gen = gen_aarch64_split_simd_movv16qi;
1750 break;
1751 case E_V8HImode:
1752 gen = gen_aarch64_split_simd_movv8hi;
1753 break;
1754 case E_V4SImode:
1755 gen = gen_aarch64_split_simd_movv4si;
1756 break;
1757 case E_V2DImode:
1758 gen = gen_aarch64_split_simd_movv2di;
1759 break;
1760 case E_V8HFmode:
1761 gen = gen_aarch64_split_simd_movv8hf;
1762 break;
1763 case E_V4SFmode:
1764 gen = gen_aarch64_split_simd_movv4sf;
1765 break;
1766 case E_V2DFmode:
1767 gen = gen_aarch64_split_simd_movv2df;
1768 break;
1769 default:
1770 gcc_unreachable ();
1771 }
1772
1773 emit_insn (gen (dst, src));
1774 return;
1775 }
1776 }
1777
1778 bool
1779 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1780 machine_mode ymode, rtx y)
1781 {
1782 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1783 gcc_assert (r != NULL);
1784 return rtx_equal_p (x, r);
1785 }
1786
1787
1788 static rtx
1789 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1790 {
1791 if (can_create_pseudo_p ())
1792 return force_reg (mode, value);
1793 else
1794 {
1795 x = aarch64_emit_move (x, value);
1796 return x;
1797 }
1798 }
1799
1800
1801 static rtx
1802 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1803 {
1804 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1805 {
1806 rtx high;
1807 /* Load the full offset into a register. This
1808 might be improvable in the future. */
1809 high = GEN_INT (offset);
1810 offset = 0;
1811 high = aarch64_force_temporary (mode, temp, high);
1812 reg = aarch64_force_temporary (mode, temp,
1813 gen_rtx_PLUS (mode, high, reg));
1814 }
1815 return plus_constant (mode, reg, offset);
1816 }
1817
1818 static int
1819 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1820 machine_mode mode)
1821 {
1822 int i;
1823 unsigned HOST_WIDE_INT val, val2, mask;
1824 int one_match, zero_match;
1825 int num_insns;
1826
1827 val = INTVAL (imm);
1828
1829 if (aarch64_move_imm (val, mode))
1830 {
1831 if (generate)
1832 emit_insn (gen_rtx_SET (dest, imm));
1833 return 1;
1834 }
1835
1836 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1837 (with XXXX non-zero). In that case check to see if the move can be done in
1838 a smaller mode. */
1839 val2 = val & 0xffffffff;
1840 if (mode == DImode
1841 && aarch64_move_imm (val2, SImode)
1842 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1843 {
1844 if (generate)
1845 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1846
1847 /* Check if we have to emit a second instruction by checking to see
1848 if any of the upper 32 bits of the original DI mode value is set. */
1849 if (val == val2)
1850 return 1;
1851
1852 i = (val >> 48) ? 48 : 32;
1853
1854 if (generate)
1855 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1856 GEN_INT ((val >> i) & 0xffff)));
1857
1858 return 2;
1859 }
1860
1861 if ((val >> 32) == 0 || mode == SImode)
1862 {
1863 if (generate)
1864 {
1865 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1866 if (mode == SImode)
1867 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1868 GEN_INT ((val >> 16) & 0xffff)));
1869 else
1870 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1871 GEN_INT ((val >> 16) & 0xffff)));
1872 }
1873 return 2;
1874 }
1875
1876 /* Remaining cases are all for DImode. */
1877
1878 mask = 0xffff;
1879 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1880 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1881 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1882 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1883
1884 if (zero_match != 2 && one_match != 2)
1885 {
1886 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1887 For a 64-bit bitmask try whether changing 16 bits to all ones or
1888 zeroes creates a valid bitmask. To check any repeated bitmask,
1889 try using 16 bits from the other 32-bit half of val. */
1890
1891 for (i = 0; i < 64; i += 16, mask <<= 16)
1892 {
1893 val2 = val & ~mask;
1894 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1895 break;
1896 val2 = val | mask;
1897 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1898 break;
1899 val2 = val2 & ~mask;
1900 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1901 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1902 break;
1903 }
1904 if (i != 64)
1905 {
1906 if (generate)
1907 {
1908 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1909 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1910 GEN_INT ((val >> i) & 0xffff)));
1911 }
1912 return 2;
1913 }
1914 }
1915
1916 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1917 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1918 otherwise skip zero bits. */
1919
1920 num_insns = 1;
1921 mask = 0xffff;
1922 val2 = one_match > zero_match ? ~val : val;
1923 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1924
1925 if (generate)
1926 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1927 ? (val | ~(mask << i))
1928 : (val & (mask << i)))));
1929 for (i += 16; i < 64; i += 16)
1930 {
1931 if ((val2 & (mask << i)) == 0)
1932 continue;
1933 if (generate)
1934 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1935 GEN_INT ((val >> i) & 0xffff)));
1936 num_insns ++;
1937 }
1938
1939 return num_insns;
1940 }
1941
1942
1943 void
1944 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1945 {
1946 machine_mode mode = GET_MODE (dest);
1947
1948 gcc_assert (mode == SImode || mode == DImode);
1949
1950 /* Check on what type of symbol it is. */
1951 if (GET_CODE (imm) == SYMBOL_REF
1952 || GET_CODE (imm) == LABEL_REF
1953 || GET_CODE (imm) == CONST)
1954 {
1955 rtx mem, base, offset;
1956 enum aarch64_symbol_type sty;
1957
1958 /* If we have (const (plus symbol offset)), separate out the offset
1959 before we start classifying the symbol. */
1960 split_const (imm, &base, &offset);
1961
1962 sty = aarch64_classify_symbol (base, offset);
1963 switch (sty)
1964 {
1965 case SYMBOL_FORCE_TO_MEM:
1966 if (offset != const0_rtx
1967 && targetm.cannot_force_const_mem (mode, imm))
1968 {
1969 gcc_assert (can_create_pseudo_p ());
1970 base = aarch64_force_temporary (mode, dest, base);
1971 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1972 aarch64_emit_move (dest, base);
1973 return;
1974 }
1975
1976 mem = force_const_mem (ptr_mode, imm);
1977 gcc_assert (mem);
1978
1979 /* If we aren't generating PC relative literals, then
1980 we need to expand the literal pool access carefully.
1981 This is something that needs to be done in a number
1982 of places, so could well live as a separate function. */
1983 if (!aarch64_pcrelative_literal_loads)
1984 {
1985 gcc_assert (can_create_pseudo_p ());
1986 base = gen_reg_rtx (ptr_mode);
1987 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1988 if (ptr_mode != Pmode)
1989 base = convert_memory_address (Pmode, base);
1990 mem = gen_rtx_MEM (ptr_mode, base);
1991 }
1992
1993 if (mode != ptr_mode)
1994 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1995
1996 emit_insn (gen_rtx_SET (dest, mem));
1997
1998 return;
1999
2000 case SYMBOL_SMALL_TLSGD:
2001 case SYMBOL_SMALL_TLSDESC:
2002 case SYMBOL_SMALL_TLSIE:
2003 case SYMBOL_SMALL_GOT_28K:
2004 case SYMBOL_SMALL_GOT_4G:
2005 case SYMBOL_TINY_GOT:
2006 case SYMBOL_TINY_TLSIE:
2007 if (offset != const0_rtx)
2008 {
2009 gcc_assert(can_create_pseudo_p ());
2010 base = aarch64_force_temporary (mode, dest, base);
2011 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
2012 aarch64_emit_move (dest, base);
2013 return;
2014 }
2015 /* FALLTHRU */
2016
2017 case SYMBOL_SMALL_ABSOLUTE:
2018 case SYMBOL_TINY_ABSOLUTE:
2019 case SYMBOL_TLSLE12:
2020 case SYMBOL_TLSLE24:
2021 case SYMBOL_TLSLE32:
2022 case SYMBOL_TLSLE48:
2023 aarch64_load_symref_appropriately (dest, imm, sty);
2024 return;
2025
2026 default:
2027 gcc_unreachable ();
2028 }
2029 }
2030
2031 if (!CONST_INT_P (imm))
2032 {
2033 if (GET_CODE (imm) == HIGH)
2034 emit_insn (gen_rtx_SET (dest, imm));
2035 else
2036 {
2037 rtx mem = force_const_mem (mode, imm);
2038 gcc_assert (mem);
2039 emit_insn (gen_rtx_SET (dest, mem));
2040 }
2041
2042 return;
2043 }
2044
2045 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
2046 }
2047
2048 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2049 temporary value if necessary. FRAME_RELATED_P should be true if
2050 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2051 to the generated instructions. If SCRATCHREG is known to hold
2052 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2053 immediate again.
2054
2055 Since this function may be used to adjust the stack pointer, we must
2056 ensure that it cannot cause transient stack deallocation (for example
2057 by first incrementing SP and then decrementing when adjusting by a
2058 large immediate). */
2059
2060 static void
2061 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
2062 HOST_WIDE_INT delta, bool frame_related_p,
2063 bool emit_move_imm)
2064 {
2065 HOST_WIDE_INT mdelta = abs_hwi (delta);
2066 rtx this_rtx = gen_rtx_REG (mode, regnum);
2067 rtx_insn *insn;
2068
2069 if (!mdelta)
2070 return;
2071
2072 /* Single instruction adjustment. */
2073 if (aarch64_uimm12_shift (mdelta))
2074 {
2075 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2076 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2077 return;
2078 }
2079
2080 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2081 Only do this if mdelta is not a 16-bit move as adjusting using a move
2082 is better. */
2083 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2084 {
2085 HOST_WIDE_INT low_off = mdelta & 0xfff;
2086
2087 low_off = delta < 0 ? -low_off : low_off;
2088 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2089 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2090 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2091 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2092 return;
2093 }
2094
2095 /* Emit a move immediate if required and an addition/subtraction. */
2096 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2097 if (emit_move_imm)
2098 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2099 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2100 : gen_add2_insn (this_rtx, scratch_rtx));
2101 if (frame_related_p)
2102 {
2103 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2104 rtx adj = plus_constant (mode, this_rtx, delta);
2105 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2106 }
2107 }
2108
2109 static inline void
2110 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2111 HOST_WIDE_INT delta)
2112 {
2113 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2114 }
2115
2116 static inline void
2117 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2118 {
2119 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2120 true, emit_move_imm);
2121 }
2122
2123 static inline void
2124 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2125 {
2126 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2127 frame_related_p, true);
2128 }
2129
2130 static bool
2131 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2132 tree exp ATTRIBUTE_UNUSED)
2133 {
2134 /* Currently, always true. */
2135 return true;
2136 }
2137
2138 /* Implement TARGET_PASS_BY_REFERENCE. */
2139
2140 static bool
2141 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2142 machine_mode mode,
2143 const_tree type,
2144 bool named ATTRIBUTE_UNUSED)
2145 {
2146 HOST_WIDE_INT size;
2147 machine_mode dummymode;
2148 int nregs;
2149
2150 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2151 size = (mode == BLKmode && type)
2152 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2153
2154 /* Aggregates are passed by reference based on their size. */
2155 if (type && AGGREGATE_TYPE_P (type))
2156 {
2157 size = int_size_in_bytes (type);
2158 }
2159
2160 /* Variable sized arguments are always returned by reference. */
2161 if (size < 0)
2162 return true;
2163
2164 /* Can this be a candidate to be passed in fp/simd register(s)? */
2165 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2166 &dummymode, &nregs,
2167 NULL))
2168 return false;
2169
2170 /* Arguments which are variable sized or larger than 2 registers are
2171 passed by reference unless they are a homogenous floating point
2172 aggregate. */
2173 return size > 2 * UNITS_PER_WORD;
2174 }
2175
2176 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2177 static bool
2178 aarch64_return_in_msb (const_tree valtype)
2179 {
2180 machine_mode dummy_mode;
2181 int dummy_int;
2182
2183 /* Never happens in little-endian mode. */
2184 if (!BYTES_BIG_ENDIAN)
2185 return false;
2186
2187 /* Only composite types smaller than or equal to 16 bytes can
2188 be potentially returned in registers. */
2189 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2190 || int_size_in_bytes (valtype) <= 0
2191 || int_size_in_bytes (valtype) > 16)
2192 return false;
2193
2194 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2195 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2196 is always passed/returned in the least significant bits of fp/simd
2197 register(s). */
2198 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2199 &dummy_mode, &dummy_int, NULL))
2200 return false;
2201
2202 return true;
2203 }
2204
2205 /* Implement TARGET_FUNCTION_VALUE.
2206 Define how to find the value returned by a function. */
2207
2208 static rtx
2209 aarch64_function_value (const_tree type, const_tree func,
2210 bool outgoing ATTRIBUTE_UNUSED)
2211 {
2212 machine_mode mode;
2213 int unsignedp;
2214 int count;
2215 machine_mode ag_mode;
2216
2217 mode = TYPE_MODE (type);
2218 if (INTEGRAL_TYPE_P (type))
2219 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2220
2221 if (aarch64_return_in_msb (type))
2222 {
2223 HOST_WIDE_INT size = int_size_in_bytes (type);
2224
2225 if (size % UNITS_PER_WORD != 0)
2226 {
2227 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2228 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2229 }
2230 }
2231
2232 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2233 &ag_mode, &count, NULL))
2234 {
2235 if (!aarch64_composite_type_p (type, mode))
2236 {
2237 gcc_assert (count == 1 && mode == ag_mode);
2238 return gen_rtx_REG (mode, V0_REGNUM);
2239 }
2240 else
2241 {
2242 int i;
2243 rtx par;
2244
2245 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2246 for (i = 0; i < count; i++)
2247 {
2248 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2249 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2250 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2251 XVECEXP (par, 0, i) = tmp;
2252 }
2253 return par;
2254 }
2255 }
2256 else
2257 return gen_rtx_REG (mode, R0_REGNUM);
2258 }
2259
2260 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2261 Return true if REGNO is the number of a hard register in which the values
2262 of called function may come back. */
2263
2264 static bool
2265 aarch64_function_value_regno_p (const unsigned int regno)
2266 {
2267 /* Maximum of 16 bytes can be returned in the general registers. Examples
2268 of 16-byte return values are: 128-bit integers and 16-byte small
2269 structures (excluding homogeneous floating-point aggregates). */
2270 if (regno == R0_REGNUM || regno == R1_REGNUM)
2271 return true;
2272
2273 /* Up to four fp/simd registers can return a function value, e.g. a
2274 homogeneous floating-point aggregate having four members. */
2275 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2276 return TARGET_FLOAT;
2277
2278 return false;
2279 }
2280
2281 /* Implement TARGET_RETURN_IN_MEMORY.
2282
2283 If the type T of the result of a function is such that
2284 void func (T arg)
2285 would require that arg be passed as a value in a register (or set of
2286 registers) according to the parameter passing rules, then the result
2287 is returned in the same registers as would be used for such an
2288 argument. */
2289
2290 static bool
2291 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2292 {
2293 HOST_WIDE_INT size;
2294 machine_mode ag_mode;
2295 int count;
2296
2297 if (!AGGREGATE_TYPE_P (type)
2298 && TREE_CODE (type) != COMPLEX_TYPE
2299 && TREE_CODE (type) != VECTOR_TYPE)
2300 /* Simple scalar types always returned in registers. */
2301 return false;
2302
2303 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2304 type,
2305 &ag_mode,
2306 &count,
2307 NULL))
2308 return false;
2309
2310 /* Types larger than 2 registers returned in memory. */
2311 size = int_size_in_bytes (type);
2312 return (size < 0 || size > 2 * UNITS_PER_WORD);
2313 }
2314
2315 static bool
2316 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2317 const_tree type, int *nregs)
2318 {
2319 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2320 return aarch64_vfp_is_call_or_return_candidate (mode,
2321 type,
2322 &pcum->aapcs_vfp_rmode,
2323 nregs,
2324 NULL);
2325 }
2326
2327 /* Given MODE and TYPE of a function argument, return the alignment in
2328 bits. The idea is to suppress any stronger alignment requested by
2329 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2330 This is a helper function for local use only. */
2331
2332 static unsigned int
2333 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2334 {
2335 if (!type)
2336 return GET_MODE_ALIGNMENT (mode);
2337
2338 if (integer_zerop (TYPE_SIZE (type)))
2339 return 0;
2340
2341 gcc_assert (TYPE_MODE (type) == mode);
2342
2343 if (!AGGREGATE_TYPE_P (type))
2344 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2345
2346 if (TREE_CODE (type) == ARRAY_TYPE)
2347 return TYPE_ALIGN (TREE_TYPE (type));
2348
2349 unsigned int alignment = 0;
2350 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2351 if (TREE_CODE (field) == FIELD_DECL)
2352 alignment = std::max (alignment, DECL_ALIGN (field));
2353
2354 return alignment;
2355 }
2356
2357 /* Layout a function argument according to the AAPCS64 rules. The rule
2358 numbers refer to the rule numbers in the AAPCS64. */
2359
2360 static void
2361 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2362 const_tree type,
2363 bool named ATTRIBUTE_UNUSED)
2364 {
2365 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2366 int ncrn, nvrn, nregs;
2367 bool allocate_ncrn, allocate_nvrn;
2368 HOST_WIDE_INT size;
2369
2370 /* We need to do this once per argument. */
2371 if (pcum->aapcs_arg_processed)
2372 return;
2373
2374 pcum->aapcs_arg_processed = true;
2375
2376 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2377 size
2378 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2379 UNITS_PER_WORD);
2380
2381 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2382 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2383 mode,
2384 type,
2385 &nregs);
2386
2387 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2388 The following code thus handles passing by SIMD/FP registers first. */
2389
2390 nvrn = pcum->aapcs_nvrn;
2391
2392 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2393 and homogenous short-vector aggregates (HVA). */
2394 if (allocate_nvrn)
2395 {
2396 if (!TARGET_FLOAT)
2397 aarch64_err_no_fpadvsimd (mode, "argument");
2398
2399 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2400 {
2401 pcum->aapcs_nextnvrn = nvrn + nregs;
2402 if (!aarch64_composite_type_p (type, mode))
2403 {
2404 gcc_assert (nregs == 1);
2405 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2406 }
2407 else
2408 {
2409 rtx par;
2410 int i;
2411 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2412 for (i = 0; i < nregs; i++)
2413 {
2414 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2415 V0_REGNUM + nvrn + i);
2416 tmp = gen_rtx_EXPR_LIST
2417 (VOIDmode, tmp,
2418 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2419 XVECEXP (par, 0, i) = tmp;
2420 }
2421 pcum->aapcs_reg = par;
2422 }
2423 return;
2424 }
2425 else
2426 {
2427 /* C.3 NSRN is set to 8. */
2428 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2429 goto on_stack;
2430 }
2431 }
2432
2433 ncrn = pcum->aapcs_ncrn;
2434 nregs = size / UNITS_PER_WORD;
2435
2436 /* C6 - C9. though the sign and zero extension semantics are
2437 handled elsewhere. This is the case where the argument fits
2438 entirely general registers. */
2439 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2440 {
2441
2442 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2443
2444 /* C.8 if the argument has an alignment of 16 then the NGRN is
2445 rounded up to the next even number. */
2446 if (nregs == 2
2447 && ncrn % 2
2448 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2449 comparison is there because for > 16 * BITS_PER_UNIT
2450 alignment nregs should be > 2 and therefore it should be
2451 passed by reference rather than value. */
2452 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2453 {
2454 ++ncrn;
2455 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2456 }
2457
2458 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2459 A reg is still generated for it, but the caller should be smart
2460 enough not to use it. */
2461 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2462 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2463 else
2464 {
2465 rtx par;
2466 int i;
2467
2468 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2469 for (i = 0; i < nregs; i++)
2470 {
2471 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2472 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2473 GEN_INT (i * UNITS_PER_WORD));
2474 XVECEXP (par, 0, i) = tmp;
2475 }
2476 pcum->aapcs_reg = par;
2477 }
2478
2479 pcum->aapcs_nextncrn = ncrn + nregs;
2480 return;
2481 }
2482
2483 /* C.11 */
2484 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2485
2486 /* The argument is passed on stack; record the needed number of words for
2487 this argument and align the total size if necessary. */
2488 on_stack:
2489 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2490
2491 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2492 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2493 16 / UNITS_PER_WORD);
2494 return;
2495 }
2496
2497 /* Implement TARGET_FUNCTION_ARG. */
2498
2499 static rtx
2500 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2501 const_tree type, bool named)
2502 {
2503 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2504 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2505
2506 if (mode == VOIDmode)
2507 return NULL_RTX;
2508
2509 aarch64_layout_arg (pcum_v, mode, type, named);
2510 return pcum->aapcs_reg;
2511 }
2512
2513 void
2514 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2515 const_tree fntype ATTRIBUTE_UNUSED,
2516 rtx libname ATTRIBUTE_UNUSED,
2517 const_tree fndecl ATTRIBUTE_UNUSED,
2518 unsigned n_named ATTRIBUTE_UNUSED)
2519 {
2520 pcum->aapcs_ncrn = 0;
2521 pcum->aapcs_nvrn = 0;
2522 pcum->aapcs_nextncrn = 0;
2523 pcum->aapcs_nextnvrn = 0;
2524 pcum->pcs_variant = ARM_PCS_AAPCS64;
2525 pcum->aapcs_reg = NULL_RTX;
2526 pcum->aapcs_arg_processed = false;
2527 pcum->aapcs_stack_words = 0;
2528 pcum->aapcs_stack_size = 0;
2529
2530 if (!TARGET_FLOAT
2531 && fndecl && TREE_PUBLIC (fndecl)
2532 && fntype && fntype != error_mark_node)
2533 {
2534 const_tree type = TREE_TYPE (fntype);
2535 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2536 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2537 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2538 &mode, &nregs, NULL))
2539 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2540 }
2541 return;
2542 }
2543
2544 static void
2545 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2546 machine_mode mode,
2547 const_tree type,
2548 bool named)
2549 {
2550 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2551 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2552 {
2553 aarch64_layout_arg (pcum_v, mode, type, named);
2554 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2555 != (pcum->aapcs_stack_words != 0));
2556 pcum->aapcs_arg_processed = false;
2557 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2558 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2559 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2560 pcum->aapcs_stack_words = 0;
2561 pcum->aapcs_reg = NULL_RTX;
2562 }
2563 }
2564
2565 bool
2566 aarch64_function_arg_regno_p (unsigned regno)
2567 {
2568 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2569 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2570 }
2571
2572 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2573 PARM_BOUNDARY bits of alignment, but will be given anything up
2574 to STACK_BOUNDARY bits if the type requires it. This makes sure
2575 that both before and after the layout of each argument, the Next
2576 Stacked Argument Address (NSAA) will have a minimum alignment of
2577 8 bytes. */
2578
2579 static unsigned int
2580 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2581 {
2582 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2583 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2584 }
2585
2586 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2587
2588 Return true if an argument passed on the stack should be padded upwards,
2589 i.e. if the least-significant byte of the stack slot has useful data.
2590
2591 Small aggregate types are placed in the lowest memory address.
2592
2593 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2594
2595 bool
2596 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2597 {
2598 /* On little-endian targets, the least significant byte of every stack
2599 argument is passed at the lowest byte address of the stack slot. */
2600 if (!BYTES_BIG_ENDIAN)
2601 return true;
2602
2603 /* Otherwise, integral, floating-point and pointer types are padded downward:
2604 the least significant byte of a stack argument is passed at the highest
2605 byte address of the stack slot. */
2606 if (type
2607 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2608 || POINTER_TYPE_P (type))
2609 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2610 return false;
2611
2612 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2613 return true;
2614 }
2615
2616 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2617
2618 It specifies padding for the last (may also be the only)
2619 element of a block move between registers and memory. If
2620 assuming the block is in the memory, padding upward means that
2621 the last element is padded after its highest significant byte,
2622 while in downward padding, the last element is padded at the
2623 its least significant byte side.
2624
2625 Small aggregates and small complex types are always padded
2626 upwards.
2627
2628 We don't need to worry about homogeneous floating-point or
2629 short-vector aggregates; their move is not affected by the
2630 padding direction determined here. Regardless of endianness,
2631 each element of such an aggregate is put in the least
2632 significant bits of a fp/simd register.
2633
2634 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2635 register has useful data, and return the opposite if the most
2636 significant byte does. */
2637
2638 bool
2639 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2640 bool first ATTRIBUTE_UNUSED)
2641 {
2642
2643 /* Small composite types are always padded upward. */
2644 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2645 {
2646 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2647 : GET_MODE_SIZE (mode));
2648 if (size < 2 * UNITS_PER_WORD)
2649 return true;
2650 }
2651
2652 /* Otherwise, use the default padding. */
2653 return !BYTES_BIG_ENDIAN;
2654 }
2655
2656 static scalar_int_mode
2657 aarch64_libgcc_cmp_return_mode (void)
2658 {
2659 return SImode;
2660 }
2661
2662 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2663
2664 /* We use the 12-bit shifted immediate arithmetic instructions so values
2665 must be multiple of (1 << 12), i.e. 4096. */
2666 #define ARITH_FACTOR 4096
2667
2668 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2669 #error Cannot use simple address calculation for stack probing
2670 #endif
2671
2672 /* The pair of scratch registers used for stack probing. */
2673 #define PROBE_STACK_FIRST_REG 9
2674 #define PROBE_STACK_SECOND_REG 10
2675
2676 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2677 inclusive. These are offsets from the current stack pointer. */
2678
2679 static void
2680 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2681 {
2682 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2683
2684 /* See the same assertion on PROBE_INTERVAL above. */
2685 gcc_assert ((first % ARITH_FACTOR) == 0);
2686
2687 /* See if we have a constant small number of probes to generate. If so,
2688 that's the easy case. */
2689 if (size <= PROBE_INTERVAL)
2690 {
2691 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2692
2693 emit_set_insn (reg1,
2694 plus_constant (Pmode,
2695 stack_pointer_rtx, -(first + base)));
2696 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2697 }
2698
2699 /* The run-time loop is made up of 8 insns in the generic case while the
2700 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
2701 else if (size <= 4 * PROBE_INTERVAL)
2702 {
2703 HOST_WIDE_INT i, rem;
2704
2705 emit_set_insn (reg1,
2706 plus_constant (Pmode,
2707 stack_pointer_rtx,
2708 -(first + PROBE_INTERVAL)));
2709 emit_stack_probe (reg1);
2710
2711 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2712 it exceeds SIZE. If only two probes are needed, this will not
2713 generate any code. Then probe at FIRST + SIZE. */
2714 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2715 {
2716 emit_set_insn (reg1,
2717 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2718 emit_stack_probe (reg1);
2719 }
2720
2721 rem = size - (i - PROBE_INTERVAL);
2722 if (rem > 256)
2723 {
2724 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2725
2726 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2727 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2728 }
2729 else
2730 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2731 }
2732
2733 /* Otherwise, do the same as above, but in a loop. Note that we must be
2734 extra careful with variables wrapping around because we might be at
2735 the very top (or the very bottom) of the address space and we have
2736 to be able to handle this case properly; in particular, we use an
2737 equality test for the loop condition. */
2738 else
2739 {
2740 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2741
2742 /* Step 1: round SIZE to the previous multiple of the interval. */
2743
2744 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2745
2746
2747 /* Step 2: compute initial and final value of the loop counter. */
2748
2749 /* TEST_ADDR = SP + FIRST. */
2750 emit_set_insn (reg1,
2751 plus_constant (Pmode, stack_pointer_rtx, -first));
2752
2753 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2754 HOST_WIDE_INT adjustment = - (first + rounded_size);
2755 if (! aarch64_uimm12_shift (adjustment))
2756 {
2757 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2758 true, Pmode);
2759 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2760 }
2761 else
2762 {
2763 emit_set_insn (reg2,
2764 plus_constant (Pmode, stack_pointer_rtx, adjustment));
2765 }
2766
2767 /* Step 3: the loop
2768
2769 do
2770 {
2771 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2772 probe at TEST_ADDR
2773 }
2774 while (TEST_ADDR != LAST_ADDR)
2775
2776 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2777 until it is equal to ROUNDED_SIZE. */
2778
2779 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2780
2781
2782 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2783 that SIZE is equal to ROUNDED_SIZE. */
2784
2785 if (size != rounded_size)
2786 {
2787 HOST_WIDE_INT rem = size - rounded_size;
2788
2789 if (rem > 256)
2790 {
2791 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2792
2793 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2794 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2795 }
2796 else
2797 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2798 }
2799 }
2800
2801 /* Make sure nothing is scheduled before we are done. */
2802 emit_insn (gen_blockage ());
2803 }
2804
2805 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2806 absolute addresses. */
2807
2808 const char *
2809 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2810 {
2811 static int labelno = 0;
2812 char loop_lab[32];
2813 rtx xops[2];
2814
2815 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2816
2817 /* Loop. */
2818 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2819
2820 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2821 xops[0] = reg1;
2822 xops[1] = GEN_INT (PROBE_INTERVAL);
2823 output_asm_insn ("sub\t%0, %0, %1", xops);
2824
2825 /* Probe at TEST_ADDR. */
2826 output_asm_insn ("str\txzr, [%0]", xops);
2827
2828 /* Test if TEST_ADDR == LAST_ADDR. */
2829 xops[1] = reg2;
2830 output_asm_insn ("cmp\t%0, %1", xops);
2831
2832 /* Branch. */
2833 fputs ("\tb.ne\t", asm_out_file);
2834 assemble_name_raw (asm_out_file, loop_lab);
2835 fputc ('\n', asm_out_file);
2836
2837 return "";
2838 }
2839
2840 static bool
2841 aarch64_frame_pointer_required (void)
2842 {
2843 /* In aarch64_override_options_after_change
2844 flag_omit_leaf_frame_pointer turns off the frame pointer by
2845 default. Turn it back on now if we've not got a leaf
2846 function. */
2847 if (flag_omit_leaf_frame_pointer
2848 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2849 return true;
2850
2851 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2852 if (crtl->calls_eh_return)
2853 return true;
2854
2855 return false;
2856 }
2857
2858 /* Mark the registers that need to be saved by the callee and calculate
2859 the size of the callee-saved registers area and frame record (both FP
2860 and LR may be omitted). */
2861 static void
2862 aarch64_layout_frame (void)
2863 {
2864 HOST_WIDE_INT offset = 0;
2865 int regno, last_fp_reg = INVALID_REGNUM;
2866
2867 if (reload_completed && cfun->machine->frame.laid_out)
2868 return;
2869
2870 #define SLOT_NOT_REQUIRED (-2)
2871 #define SLOT_REQUIRED (-1)
2872
2873 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2874 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2875
2876 /* First mark all the registers that really need to be saved... */
2877 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2878 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2879
2880 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2881 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2882
2883 /* ... that includes the eh data registers (if needed)... */
2884 if (crtl->calls_eh_return)
2885 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2886 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2887 = SLOT_REQUIRED;
2888
2889 /* ... and any callee saved register that dataflow says is live. */
2890 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2891 if (df_regs_ever_live_p (regno)
2892 && (regno == R30_REGNUM
2893 || !call_used_regs[regno]))
2894 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2895
2896 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2897 if (df_regs_ever_live_p (regno)
2898 && !call_used_regs[regno])
2899 {
2900 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2901 last_fp_reg = regno;
2902 }
2903
2904 if (frame_pointer_needed)
2905 {
2906 /* FP and LR are placed in the linkage record. */
2907 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2908 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2909 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2910 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2911 offset += 2 * UNITS_PER_WORD;
2912 }
2913
2914 /* Now assign stack slots for them. */
2915 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2916 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2917 {
2918 cfun->machine->frame.reg_offset[regno] = offset;
2919 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2920 cfun->machine->frame.wb_candidate1 = regno;
2921 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2922 cfun->machine->frame.wb_candidate2 = regno;
2923 offset += UNITS_PER_WORD;
2924 }
2925
2926 HOST_WIDE_INT max_int_offset = offset;
2927 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2928 bool has_align_gap = offset != max_int_offset;
2929
2930 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2931 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2932 {
2933 /* If there is an alignment gap between integer and fp callee-saves,
2934 allocate the last fp register to it if possible. */
2935 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2936 {
2937 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2938 break;
2939 }
2940
2941 cfun->machine->frame.reg_offset[regno] = offset;
2942 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2943 cfun->machine->frame.wb_candidate1 = regno;
2944 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2945 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2946 cfun->machine->frame.wb_candidate2 = regno;
2947 offset += UNITS_PER_WORD;
2948 }
2949
2950 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2951
2952 cfun->machine->frame.saved_regs_size = offset;
2953
2954 HOST_WIDE_INT varargs_and_saved_regs_size
2955 = offset + cfun->machine->frame.saved_varargs_size;
2956
2957 cfun->machine->frame.hard_fp_offset
2958 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2959 STACK_BOUNDARY / BITS_PER_UNIT);
2960
2961 cfun->machine->frame.frame_size
2962 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2963 + crtl->outgoing_args_size,
2964 STACK_BOUNDARY / BITS_PER_UNIT);
2965
2966 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2967
2968 cfun->machine->frame.initial_adjust = 0;
2969 cfun->machine->frame.final_adjust = 0;
2970 cfun->machine->frame.callee_adjust = 0;
2971 cfun->machine->frame.callee_offset = 0;
2972
2973 HOST_WIDE_INT max_push_offset = 0;
2974 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2975 max_push_offset = 512;
2976 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2977 max_push_offset = 256;
2978
2979 if (cfun->machine->frame.frame_size < max_push_offset
2980 && crtl->outgoing_args_size == 0)
2981 {
2982 /* Simple, small frame with no outgoing arguments:
2983 stp reg1, reg2, [sp, -frame_size]!
2984 stp reg3, reg4, [sp, 16] */
2985 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2986 }
2987 else if ((crtl->outgoing_args_size
2988 + cfun->machine->frame.saved_regs_size < 512)
2989 && !(cfun->calls_alloca
2990 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2991 {
2992 /* Frame with small outgoing arguments:
2993 sub sp, sp, frame_size
2994 stp reg1, reg2, [sp, outgoing_args_size]
2995 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2996 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2997 cfun->machine->frame.callee_offset
2998 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2999 }
3000 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3001 {
3002 /* Frame with large outgoing arguments but a small local area:
3003 stp reg1, reg2, [sp, -hard_fp_offset]!
3004 stp reg3, reg4, [sp, 16]
3005 sub sp, sp, outgoing_args_size */
3006 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3007 cfun->machine->frame.final_adjust
3008 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3009 }
3010 else if (!frame_pointer_needed
3011 && varargs_and_saved_regs_size < max_push_offset)
3012 {
3013 /* Frame with large local area and outgoing arguments (this pushes the
3014 callee-saves first, followed by the locals and outgoing area):
3015 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3016 stp reg3, reg4, [sp, 16]
3017 sub sp, sp, frame_size - varargs_and_saved_regs_size */
3018 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3019 cfun->machine->frame.final_adjust
3020 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3021 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3022 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3023 }
3024 else
3025 {
3026 /* Frame with large local area and outgoing arguments using frame pointer:
3027 sub sp, sp, hard_fp_offset
3028 stp x29, x30, [sp, 0]
3029 add x29, sp, 0
3030 stp reg3, reg4, [sp, 16]
3031 sub sp, sp, outgoing_args_size */
3032 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3033 cfun->machine->frame.final_adjust
3034 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3035 }
3036
3037 cfun->machine->frame.laid_out = true;
3038 }
3039
3040 /* Return true if the register REGNO is saved on entry to
3041 the current function. */
3042
3043 static bool
3044 aarch64_register_saved_on_entry (int regno)
3045 {
3046 return cfun->machine->frame.reg_offset[regno] >= 0;
3047 }
3048
3049 /* Return the next register up from REGNO up to LIMIT for the callee
3050 to save. */
3051
3052 static unsigned
3053 aarch64_next_callee_save (unsigned regno, unsigned limit)
3054 {
3055 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3056 regno ++;
3057 return regno;
3058 }
3059
3060 /* Push the register number REGNO of mode MODE to the stack with write-back
3061 adjusting the stack by ADJUSTMENT. */
3062
3063 static void
3064 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3065 HOST_WIDE_INT adjustment)
3066 {
3067 rtx base_rtx = stack_pointer_rtx;
3068 rtx insn, reg, mem;
3069
3070 reg = gen_rtx_REG (mode, regno);
3071 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3072 plus_constant (Pmode, base_rtx, -adjustment));
3073 mem = gen_frame_mem (mode, mem);
3074
3075 insn = emit_move_insn (mem, reg);
3076 RTX_FRAME_RELATED_P (insn) = 1;
3077 }
3078
3079 /* Generate and return an instruction to store the pair of registers
3080 REG and REG2 of mode MODE to location BASE with write-back adjusting
3081 the stack location BASE by ADJUSTMENT. */
3082
3083 static rtx
3084 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3085 HOST_WIDE_INT adjustment)
3086 {
3087 switch (mode)
3088 {
3089 case E_DImode:
3090 return gen_storewb_pairdi_di (base, base, reg, reg2,
3091 GEN_INT (-adjustment),
3092 GEN_INT (UNITS_PER_WORD - adjustment));
3093 case E_DFmode:
3094 return gen_storewb_pairdf_di (base, base, reg, reg2,
3095 GEN_INT (-adjustment),
3096 GEN_INT (UNITS_PER_WORD - adjustment));
3097 default:
3098 gcc_unreachable ();
3099 }
3100 }
3101
3102 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3103 stack pointer by ADJUSTMENT. */
3104
3105 static void
3106 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3107 {
3108 rtx_insn *insn;
3109 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3110
3111 if (regno2 == INVALID_REGNUM)
3112 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3113
3114 rtx reg1 = gen_rtx_REG (mode, regno1);
3115 rtx reg2 = gen_rtx_REG (mode, regno2);
3116
3117 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3118 reg2, adjustment));
3119 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3120 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3121 RTX_FRAME_RELATED_P (insn) = 1;
3122 }
3123
3124 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3125 adjusting it by ADJUSTMENT afterwards. */
3126
3127 static rtx
3128 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3129 HOST_WIDE_INT adjustment)
3130 {
3131 switch (mode)
3132 {
3133 case E_DImode:
3134 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3135 GEN_INT (UNITS_PER_WORD));
3136 case E_DFmode:
3137 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3138 GEN_INT (UNITS_PER_WORD));
3139 default:
3140 gcc_unreachable ();
3141 }
3142 }
3143
3144 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3145 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3146 into CFI_OPS. */
3147
3148 static void
3149 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3150 rtx *cfi_ops)
3151 {
3152 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3153 rtx reg1 = gen_rtx_REG (mode, regno1);
3154
3155 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3156
3157 if (regno2 == INVALID_REGNUM)
3158 {
3159 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3160 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3161 emit_move_insn (reg1, gen_frame_mem (mode, mem));
3162 }
3163 else
3164 {
3165 rtx reg2 = gen_rtx_REG (mode, regno2);
3166 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3167 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3168 reg2, adjustment));
3169 }
3170 }
3171
3172 /* Generate and return a store pair instruction of mode MODE to store
3173 register REG1 to MEM1 and register REG2 to MEM2. */
3174
3175 static rtx
3176 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3177 rtx reg2)
3178 {
3179 switch (mode)
3180 {
3181 case E_DImode:
3182 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3183
3184 case E_DFmode:
3185 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3186
3187 default:
3188 gcc_unreachable ();
3189 }
3190 }
3191
3192 /* Generate and regurn a load pair isntruction of mode MODE to load register
3193 REG1 from MEM1 and register REG2 from MEM2. */
3194
3195 static rtx
3196 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3197 rtx mem2)
3198 {
3199 switch (mode)
3200 {
3201 case E_DImode:
3202 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3203
3204 case E_DFmode:
3205 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3206
3207 default:
3208 gcc_unreachable ();
3209 }
3210 }
3211
3212 /* Return TRUE if return address signing should be enabled for the current
3213 function, otherwise return FALSE. */
3214
3215 bool
3216 aarch64_return_address_signing_enabled (void)
3217 {
3218 /* This function should only be called after frame laid out. */
3219 gcc_assert (cfun->machine->frame.laid_out);
3220
3221 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3222 if it's LR is pushed onto stack. */
3223 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3224 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3225 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3226 }
3227
3228 /* Emit code to save the callee-saved registers from register number START
3229 to LIMIT to the stack at the location starting at offset START_OFFSET,
3230 skipping any write-back candidates if SKIP_WB is true. */
3231
3232 static void
3233 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3234 unsigned start, unsigned limit, bool skip_wb)
3235 {
3236 rtx_insn *insn;
3237 unsigned regno;
3238 unsigned regno2;
3239
3240 for (regno = aarch64_next_callee_save (start, limit);
3241 regno <= limit;
3242 regno = aarch64_next_callee_save (regno + 1, limit))
3243 {
3244 rtx reg, mem;
3245 HOST_WIDE_INT offset;
3246
3247 if (skip_wb
3248 && (regno == cfun->machine->frame.wb_candidate1
3249 || regno == cfun->machine->frame.wb_candidate2))
3250 continue;
3251
3252 if (cfun->machine->reg_is_wrapped_separately[regno])
3253 continue;
3254
3255 reg = gen_rtx_REG (mode, regno);
3256 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3257 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3258 offset));
3259
3260 regno2 = aarch64_next_callee_save (regno + 1, limit);
3261
3262 if (regno2 <= limit
3263 && !cfun->machine->reg_is_wrapped_separately[regno2]
3264 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3265 == cfun->machine->frame.reg_offset[regno2]))
3266
3267 {
3268 rtx reg2 = gen_rtx_REG (mode, regno2);
3269 rtx mem2;
3270
3271 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3272 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3273 offset));
3274 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3275 reg2));
3276
3277 /* The first part of a frame-related parallel insn is
3278 always assumed to be relevant to the frame
3279 calculations; subsequent parts, are only
3280 frame-related if explicitly marked. */
3281 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3282 regno = regno2;
3283 }
3284 else
3285 insn = emit_move_insn (mem, reg);
3286
3287 RTX_FRAME_RELATED_P (insn) = 1;
3288 }
3289 }
3290
3291 /* Emit code to restore the callee registers of mode MODE from register
3292 number START up to and including LIMIT. Restore from the stack offset
3293 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3294 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3295
3296 static void
3297 aarch64_restore_callee_saves (machine_mode mode,
3298 HOST_WIDE_INT start_offset, unsigned start,
3299 unsigned limit, bool skip_wb, rtx *cfi_ops)
3300 {
3301 rtx base_rtx = stack_pointer_rtx;
3302 unsigned regno;
3303 unsigned regno2;
3304 HOST_WIDE_INT offset;
3305
3306 for (regno = aarch64_next_callee_save (start, limit);
3307 regno <= limit;
3308 regno = aarch64_next_callee_save (regno + 1, limit))
3309 {
3310 if (cfun->machine->reg_is_wrapped_separately[regno])
3311 continue;
3312
3313 rtx reg, mem;
3314
3315 if (skip_wb
3316 && (regno == cfun->machine->frame.wb_candidate1
3317 || regno == cfun->machine->frame.wb_candidate2))
3318 continue;
3319
3320 reg = gen_rtx_REG (mode, regno);
3321 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3322 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3323
3324 regno2 = aarch64_next_callee_save (regno + 1, limit);
3325
3326 if (regno2 <= limit
3327 && !cfun->machine->reg_is_wrapped_separately[regno2]
3328 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3329 == cfun->machine->frame.reg_offset[regno2]))
3330 {
3331 rtx reg2 = gen_rtx_REG (mode, regno2);
3332 rtx mem2;
3333
3334 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3335 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3336 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3337
3338 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3339 regno = regno2;
3340 }
3341 else
3342 emit_move_insn (reg, mem);
3343 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3344 }
3345 }
3346
3347 static inline bool
3348 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3349 HOST_WIDE_INT offset)
3350 {
3351 return offset >= -256 && offset < 256;
3352 }
3353
3354 static inline bool
3355 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3356 {
3357 return (offset >= 0
3358 && offset < 4096 * GET_MODE_SIZE (mode)
3359 && offset % GET_MODE_SIZE (mode) == 0);
3360 }
3361
3362 bool
3363 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3364 {
3365 return (offset >= -64 * GET_MODE_SIZE (mode)
3366 && offset < 64 * GET_MODE_SIZE (mode)
3367 && offset % GET_MODE_SIZE (mode) == 0);
3368 }
3369
3370 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3371
3372 static sbitmap
3373 aarch64_get_separate_components (void)
3374 {
3375 aarch64_layout_frame ();
3376
3377 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3378 bitmap_clear (components);
3379
3380 /* The registers we need saved to the frame. */
3381 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3382 if (aarch64_register_saved_on_entry (regno))
3383 {
3384 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3385 if (!frame_pointer_needed)
3386 offset += cfun->machine->frame.frame_size
3387 - cfun->machine->frame.hard_fp_offset;
3388 /* Check that we can access the stack slot of the register with one
3389 direct load with no adjustments needed. */
3390 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3391 bitmap_set_bit (components, regno);
3392 }
3393
3394 /* Don't mess with the hard frame pointer. */
3395 if (frame_pointer_needed)
3396 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3397
3398 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3399 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3400 /* If aarch64_layout_frame has chosen registers to store/restore with
3401 writeback don't interfere with them to avoid having to output explicit
3402 stack adjustment instructions. */
3403 if (reg2 != INVALID_REGNUM)
3404 bitmap_clear_bit (components, reg2);
3405 if (reg1 != INVALID_REGNUM)
3406 bitmap_clear_bit (components, reg1);
3407
3408 bitmap_clear_bit (components, LR_REGNUM);
3409 bitmap_clear_bit (components, SP_REGNUM);
3410
3411 return components;
3412 }
3413
3414 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3415
3416 static sbitmap
3417 aarch64_components_for_bb (basic_block bb)
3418 {
3419 bitmap in = DF_LIVE_IN (bb);
3420 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3421 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3422
3423 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3424 bitmap_clear (components);
3425
3426 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3427 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3428 if ((!call_used_regs[regno])
3429 && (bitmap_bit_p (in, regno)
3430 || bitmap_bit_p (gen, regno)
3431 || bitmap_bit_p (kill, regno)))
3432 bitmap_set_bit (components, regno);
3433
3434 return components;
3435 }
3436
3437 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3438 Nothing to do for aarch64. */
3439
3440 static void
3441 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3442 {
3443 }
3444
3445 /* Return the next set bit in BMP from START onwards. Return the total number
3446 of bits in BMP if no set bit is found at or after START. */
3447
3448 static unsigned int
3449 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3450 {
3451 unsigned int nbits = SBITMAP_SIZE (bmp);
3452 if (start == nbits)
3453 return start;
3454
3455 gcc_assert (start < nbits);
3456 for (unsigned int i = start; i < nbits; i++)
3457 if (bitmap_bit_p (bmp, i))
3458 return i;
3459
3460 return nbits;
3461 }
3462
3463 /* Do the work for aarch64_emit_prologue_components and
3464 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3465 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3466 for these components or the epilogue sequence. That is, it determines
3467 whether we should emit stores or loads and what kind of CFA notes to attach
3468 to the insns. Otherwise the logic for the two sequences is very
3469 similar. */
3470
3471 static void
3472 aarch64_process_components (sbitmap components, bool prologue_p)
3473 {
3474 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3475 ? HARD_FRAME_POINTER_REGNUM
3476 : STACK_POINTER_REGNUM);
3477
3478 unsigned last_regno = SBITMAP_SIZE (components);
3479 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3480 rtx_insn *insn = NULL;
3481
3482 while (regno != last_regno)
3483 {
3484 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3485 so DFmode for the vector registers is enough. */
3486 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3487 rtx reg = gen_rtx_REG (mode, regno);
3488 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3489 if (!frame_pointer_needed)
3490 offset += cfun->machine->frame.frame_size
3491 - cfun->machine->frame.hard_fp_offset;
3492 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3493 rtx mem = gen_frame_mem (mode, addr);
3494
3495 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3496 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3497 /* No more registers to handle after REGNO.
3498 Emit a single save/restore and exit. */
3499 if (regno2 == last_regno)
3500 {
3501 insn = emit_insn (set);
3502 RTX_FRAME_RELATED_P (insn) = 1;
3503 if (prologue_p)
3504 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3505 else
3506 add_reg_note (insn, REG_CFA_RESTORE, reg);
3507 break;
3508 }
3509
3510 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3511 /* The next register is not of the same class or its offset is not
3512 mergeable with the current one into a pair. */
3513 if (!satisfies_constraint_Ump (mem)
3514 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3515 || (offset2 - cfun->machine->frame.reg_offset[regno])
3516 != GET_MODE_SIZE (mode))
3517 {
3518 insn = emit_insn (set);
3519 RTX_FRAME_RELATED_P (insn) = 1;
3520 if (prologue_p)
3521 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3522 else
3523 add_reg_note (insn, REG_CFA_RESTORE, reg);
3524
3525 regno = regno2;
3526 continue;
3527 }
3528
3529 /* REGNO2 can be saved/restored in a pair with REGNO. */
3530 rtx reg2 = gen_rtx_REG (mode, regno2);
3531 if (!frame_pointer_needed)
3532 offset2 += cfun->machine->frame.frame_size
3533 - cfun->machine->frame.hard_fp_offset;
3534 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3535 rtx mem2 = gen_frame_mem (mode, addr2);
3536 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3537 : gen_rtx_SET (reg2, mem2);
3538
3539 if (prologue_p)
3540 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3541 else
3542 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3543
3544 RTX_FRAME_RELATED_P (insn) = 1;
3545 if (prologue_p)
3546 {
3547 add_reg_note (insn, REG_CFA_OFFSET, set);
3548 add_reg_note (insn, REG_CFA_OFFSET, set2);
3549 }
3550 else
3551 {
3552 add_reg_note (insn, REG_CFA_RESTORE, reg);
3553 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3554 }
3555
3556 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3557 }
3558 }
3559
3560 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3561
3562 static void
3563 aarch64_emit_prologue_components (sbitmap components)
3564 {
3565 aarch64_process_components (components, true);
3566 }
3567
3568 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3569
3570 static void
3571 aarch64_emit_epilogue_components (sbitmap components)
3572 {
3573 aarch64_process_components (components, false);
3574 }
3575
3576 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3577
3578 static void
3579 aarch64_set_handled_components (sbitmap components)
3580 {
3581 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3582 if (bitmap_bit_p (components, regno))
3583 cfun->machine->reg_is_wrapped_separately[regno] = true;
3584 }
3585
3586 /* AArch64 stack frames generated by this compiler look like:
3587
3588 +-------------------------------+
3589 | |
3590 | incoming stack arguments |
3591 | |
3592 +-------------------------------+
3593 | | <-- incoming stack pointer (aligned)
3594 | callee-allocated save area |
3595 | for register varargs |
3596 | |
3597 +-------------------------------+
3598 | local variables | <-- frame_pointer_rtx
3599 | |
3600 +-------------------------------+
3601 | padding0 | \
3602 +-------------------------------+ |
3603 | callee-saved registers | | frame.saved_regs_size
3604 +-------------------------------+ |
3605 | LR' | |
3606 +-------------------------------+ |
3607 | FP' | / <- hard_frame_pointer_rtx (aligned)
3608 +-------------------------------+
3609 | dynamic allocation |
3610 +-------------------------------+
3611 | padding |
3612 +-------------------------------+
3613 | outgoing stack arguments | <-- arg_pointer
3614 | |
3615 +-------------------------------+
3616 | | <-- stack_pointer_rtx (aligned)
3617
3618 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3619 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3620 unchanged. */
3621
3622 /* Generate the prologue instructions for entry into a function.
3623 Establish the stack frame by decreasing the stack pointer with a
3624 properly calculated size and, if necessary, create a frame record
3625 filled with the values of LR and previous frame pointer. The
3626 current FP is also set up if it is in use. */
3627
3628 void
3629 aarch64_expand_prologue (void)
3630 {
3631 aarch64_layout_frame ();
3632
3633 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3634 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3635 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3636 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3637 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3638 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3639 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3640 rtx_insn *insn;
3641
3642 /* Sign return address for functions. */
3643 if (aarch64_return_address_signing_enabled ())
3644 {
3645 insn = emit_insn (gen_pacisp ());
3646 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3647 RTX_FRAME_RELATED_P (insn) = 1;
3648 }
3649
3650 if (flag_stack_usage_info)
3651 current_function_static_stack_size = frame_size;
3652
3653 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3654 {
3655 if (crtl->is_leaf && !cfun->calls_alloca)
3656 {
3657 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3658 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3659 frame_size - STACK_CHECK_PROTECT);
3660 }
3661 else if (frame_size > 0)
3662 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3663 }
3664
3665 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3666
3667 if (callee_adjust != 0)
3668 aarch64_push_regs (reg1, reg2, callee_adjust);
3669
3670 if (frame_pointer_needed)
3671 {
3672 if (callee_adjust == 0)
3673 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3674 R30_REGNUM, false);
3675 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3676 stack_pointer_rtx,
3677 GEN_INT (callee_offset)));
3678 RTX_FRAME_RELATED_P (insn) = 1;
3679 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3680 }
3681
3682 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3683 callee_adjust != 0 || frame_pointer_needed);
3684 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3685 callee_adjust != 0 || frame_pointer_needed);
3686 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3687 }
3688
3689 /* Return TRUE if we can use a simple_return insn.
3690
3691 This function checks whether the callee saved stack is empty, which
3692 means no restore actions are need. The pro_and_epilogue will use
3693 this to check whether shrink-wrapping opt is feasible. */
3694
3695 bool
3696 aarch64_use_return_insn_p (void)
3697 {
3698 if (!reload_completed)
3699 return false;
3700
3701 if (crtl->profile)
3702 return false;
3703
3704 aarch64_layout_frame ();
3705
3706 return cfun->machine->frame.frame_size == 0;
3707 }
3708
3709 /* Generate the epilogue instructions for returning from a function.
3710 This is almost exactly the reverse of the prolog sequence, except
3711 that we need to insert barriers to avoid scheduling loads that read
3712 from a deallocated stack, and we optimize the unwind records by
3713 emitting them all together if possible. */
3714 void
3715 aarch64_expand_epilogue (bool for_sibcall)
3716 {
3717 aarch64_layout_frame ();
3718
3719 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3720 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3721 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3722 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3723 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3724 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3725 rtx cfi_ops = NULL;
3726 rtx_insn *insn;
3727
3728 /* We need to add memory barrier to prevent read from deallocated stack. */
3729 bool need_barrier_p = (get_frame_size ()
3730 + cfun->machine->frame.saved_varargs_size) != 0;
3731
3732 /* Emit a barrier to prevent loads from a deallocated stack. */
3733 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3734 || crtl->calls_eh_return)
3735 {
3736 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3737 need_barrier_p = false;
3738 }
3739
3740 /* Restore the stack pointer from the frame pointer if it may not
3741 be the same as the stack pointer. */
3742 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3743 {
3744 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3745 hard_frame_pointer_rtx,
3746 GEN_INT (-callee_offset)));
3747 /* If writeback is used when restoring callee-saves, the CFA
3748 is restored on the instruction doing the writeback. */
3749 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3750 }
3751 else
3752 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3753
3754 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3755 callee_adjust != 0, &cfi_ops);
3756 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3757 callee_adjust != 0, &cfi_ops);
3758
3759 if (need_barrier_p)
3760 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3761
3762 if (callee_adjust != 0)
3763 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3764
3765 if (callee_adjust != 0 || initial_adjust > 65536)
3766 {
3767 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3768 insn = get_last_insn ();
3769 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3770 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3771 RTX_FRAME_RELATED_P (insn) = 1;
3772 cfi_ops = NULL;
3773 }
3774
3775 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3776
3777 if (cfi_ops)
3778 {
3779 /* Emit delayed restores and reset the CFA to be SP. */
3780 insn = get_last_insn ();
3781 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3782 REG_NOTES (insn) = cfi_ops;
3783 RTX_FRAME_RELATED_P (insn) = 1;
3784 }
3785
3786 /* We prefer to emit the combined return/authenticate instruction RETAA,
3787 however there are three cases in which we must instead emit an explicit
3788 authentication instruction.
3789
3790 1) Sibcalls don't return in a normal way, so if we're about to call one
3791 we must authenticate.
3792
3793 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3794 generating code for !TARGET_ARMV8_3 we can't use it and must
3795 explicitly authenticate.
3796
3797 3) On an eh_return path we make extra stack adjustments to update the
3798 canonical frame address to be the exception handler's CFA. We want
3799 to authenticate using the CFA of the function which calls eh_return.
3800 */
3801 if (aarch64_return_address_signing_enabled ()
3802 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3803 {
3804 insn = emit_insn (gen_autisp ());
3805 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3806 RTX_FRAME_RELATED_P (insn) = 1;
3807 }
3808
3809 /* Stack adjustment for exception handler. */
3810 if (crtl->calls_eh_return)
3811 {
3812 /* We need to unwind the stack by the offset computed by
3813 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3814 to be SP; letting the CFA move during this adjustment
3815 is just as correct as retaining the CFA from the body
3816 of the function. Therefore, do nothing special. */
3817 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3818 }
3819
3820 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3821 if (!for_sibcall)
3822 emit_jump_insn (ret_rtx);
3823 }
3824
3825 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3826 normally or return to a previous frame after unwinding.
3827
3828 An EH return uses a single shared return sequence. The epilogue is
3829 exactly like a normal epilogue except that it has an extra input
3830 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3831 that must be applied after the frame has been destroyed. An extra label
3832 is inserted before the epilogue which initializes this register to zero,
3833 and this is the entry point for a normal return.
3834
3835 An actual EH return updates the return address, initializes the stack
3836 adjustment and jumps directly into the epilogue (bypassing the zeroing
3837 of the adjustment). Since the return address is typically saved on the
3838 stack when a function makes a call, the saved LR must be updated outside
3839 the epilogue.
3840
3841 This poses problems as the store is generated well before the epilogue,
3842 so the offset of LR is not known yet. Also optimizations will remove the
3843 store as it appears dead, even after the epilogue is generated (as the
3844 base or offset for loading LR is different in many cases).
3845
3846 To avoid these problems this implementation forces the frame pointer
3847 in eh_return functions so that the location of LR is fixed and known early.
3848 It also marks the store volatile, so no optimization is permitted to
3849 remove the store. */
3850 rtx
3851 aarch64_eh_return_handler_rtx (void)
3852 {
3853 rtx tmp = gen_frame_mem (Pmode,
3854 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3855
3856 /* Mark the store volatile, so no optimization is permitted to remove it. */
3857 MEM_VOLATILE_P (tmp) = true;
3858 return tmp;
3859 }
3860
3861 /* Output code to add DELTA to the first argument, and then jump
3862 to FUNCTION. Used for C++ multiple inheritance. */
3863 static void
3864 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3865 HOST_WIDE_INT delta,
3866 HOST_WIDE_INT vcall_offset,
3867 tree function)
3868 {
3869 /* The this pointer is always in x0. Note that this differs from
3870 Arm where the this pointer maybe bumped to r1 if r0 is required
3871 to return a pointer to an aggregate. On AArch64 a result value
3872 pointer will be in x8. */
3873 int this_regno = R0_REGNUM;
3874 rtx this_rtx, temp0, temp1, addr, funexp;
3875 rtx_insn *insn;
3876
3877 reload_completed = 1;
3878 emit_note (NOTE_INSN_PROLOGUE_END);
3879
3880 if (vcall_offset == 0)
3881 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3882 else
3883 {
3884 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3885
3886 this_rtx = gen_rtx_REG (Pmode, this_regno);
3887 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3888 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3889
3890 addr = this_rtx;
3891 if (delta != 0)
3892 {
3893 if (delta >= -256 && delta < 256)
3894 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3895 plus_constant (Pmode, this_rtx, delta));
3896 else
3897 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3898 }
3899
3900 if (Pmode == ptr_mode)
3901 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3902 else
3903 aarch64_emit_move (temp0,
3904 gen_rtx_ZERO_EXTEND (Pmode,
3905 gen_rtx_MEM (ptr_mode, addr)));
3906
3907 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3908 addr = plus_constant (Pmode, temp0, vcall_offset);
3909 else
3910 {
3911 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3912 Pmode);
3913 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3914 }
3915
3916 if (Pmode == ptr_mode)
3917 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3918 else
3919 aarch64_emit_move (temp1,
3920 gen_rtx_SIGN_EXTEND (Pmode,
3921 gen_rtx_MEM (ptr_mode, addr)));
3922
3923 emit_insn (gen_add2_insn (this_rtx, temp1));
3924 }
3925
3926 /* Generate a tail call to the target function. */
3927 if (!TREE_USED (function))
3928 {
3929 assemble_external (function);
3930 TREE_USED (function) = 1;
3931 }
3932 funexp = XEXP (DECL_RTL (function), 0);
3933 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3934 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3935 SIBLING_CALL_P (insn) = 1;
3936
3937 insn = get_insns ();
3938 shorten_branches (insn);
3939 final_start_function (insn, file, 1);
3940 final (insn, file, 1);
3941 final_end_function ();
3942
3943 /* Stop pretending to be a post-reload pass. */
3944 reload_completed = 0;
3945 }
3946
3947 static bool
3948 aarch64_tls_referenced_p (rtx x)
3949 {
3950 if (!TARGET_HAVE_TLS)
3951 return false;
3952 subrtx_iterator::array_type array;
3953 FOR_EACH_SUBRTX (iter, array, x, ALL)
3954 {
3955 const_rtx x = *iter;
3956 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3957 return true;
3958 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3959 TLS offsets, not real symbol references. */
3960 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3961 iter.skip_subrtxes ();
3962 }
3963 return false;
3964 }
3965
3966
3967 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3968 a left shift of 0 or 12 bits. */
3969 bool
3970 aarch64_uimm12_shift (HOST_WIDE_INT val)
3971 {
3972 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3973 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3974 );
3975 }
3976
3977
3978 /* Return true if val is an immediate that can be loaded into a
3979 register by a MOVZ instruction. */
3980 static bool
3981 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3982 {
3983 if (GET_MODE_SIZE (mode) > 4)
3984 {
3985 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3986 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3987 return 1;
3988 }
3989 else
3990 {
3991 /* Ignore sign extension. */
3992 val &= (HOST_WIDE_INT) 0xffffffff;
3993 }
3994 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3995 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3996 }
3997
3998 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3999
4000 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4001 {
4002 0x0000000100000001ull,
4003 0x0001000100010001ull,
4004 0x0101010101010101ull,
4005 0x1111111111111111ull,
4006 0x5555555555555555ull,
4007 };
4008
4009
4010 /* Return true if val is a valid bitmask immediate. */
4011
4012 bool
4013 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4014 {
4015 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4016 int bits;
4017
4018 /* Check for a single sequence of one bits and return quickly if so.
4019 The special cases of all ones and all zeroes returns false. */
4020 val = (unsigned HOST_WIDE_INT) val_in;
4021 tmp = val + (val & -val);
4022
4023 if (tmp == (tmp & -tmp))
4024 return (val + 1) > 1;
4025
4026 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4027 if (mode == SImode)
4028 val = (val << 32) | (val & 0xffffffff);
4029
4030 /* Invert if the immediate doesn't start with a zero bit - this means we
4031 only need to search for sequences of one bits. */
4032 if (val & 1)
4033 val = ~val;
4034
4035 /* Find the first set bit and set tmp to val with the first sequence of one
4036 bits removed. Return success if there is a single sequence of ones. */
4037 first_one = val & -val;
4038 tmp = val & (val + first_one);
4039
4040 if (tmp == 0)
4041 return true;
4042
4043 /* Find the next set bit and compute the difference in bit position. */
4044 next_one = tmp & -tmp;
4045 bits = clz_hwi (first_one) - clz_hwi (next_one);
4046 mask = val ^ tmp;
4047
4048 /* Check the bit position difference is a power of 2, and that the first
4049 sequence of one bits fits within 'bits' bits. */
4050 if ((mask >> bits) != 0 || bits != (bits & -bits))
4051 return false;
4052
4053 /* Check the sequence of one bits is repeated 64/bits times. */
4054 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
4055 }
4056
4057 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4058 Assumed precondition: VAL_IN Is not zero. */
4059
4060 unsigned HOST_WIDE_INT
4061 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4062 {
4063 int lowest_bit_set = ctz_hwi (val_in);
4064 int highest_bit_set = floor_log2 (val_in);
4065 gcc_assert (val_in != 0);
4066
4067 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4068 (HOST_WIDE_INT_1U << lowest_bit_set));
4069 }
4070
4071 /* Create constant where bits outside of lowest bit set to highest bit set
4072 are set to 1. */
4073
4074 unsigned HOST_WIDE_INT
4075 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4076 {
4077 return val_in | ~aarch64_and_split_imm1 (val_in);
4078 }
4079
4080 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4081
4082 bool
4083 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4084 {
4085 if (aarch64_bitmask_imm (val_in, mode))
4086 return false;
4087
4088 if (aarch64_move_imm (val_in, mode))
4089 return false;
4090
4091 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4092
4093 return aarch64_bitmask_imm (imm2, mode);
4094 }
4095
4096 /* Return true if val is an immediate that can be loaded into a
4097 register in a single instruction. */
4098 bool
4099 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4100 {
4101 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4102 return 1;
4103 return aarch64_bitmask_imm (val, mode);
4104 }
4105
4106 static bool
4107 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4108 {
4109 rtx base, offset;
4110
4111 if (GET_CODE (x) == HIGH)
4112 return true;
4113
4114 split_const (x, &base, &offset);
4115 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4116 {
4117 if (aarch64_classify_symbol (base, offset)
4118 != SYMBOL_FORCE_TO_MEM)
4119 return true;
4120 else
4121 /* Avoid generating a 64-bit relocation in ILP32; leave
4122 to aarch64_expand_mov_immediate to handle it properly. */
4123 return mode != ptr_mode;
4124 }
4125
4126 return aarch64_tls_referenced_p (x);
4127 }
4128
4129 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4130 The expansion for a table switch is quite expensive due to the number
4131 of instructions, the table lookup and hard to predict indirect jump.
4132 When optimizing for speed, and -O3 enabled, use the per-core tuning if
4133 set, otherwise use tables for > 16 cases as a tradeoff between size and
4134 performance. When optimizing for size, use the default setting. */
4135
4136 static unsigned int
4137 aarch64_case_values_threshold (void)
4138 {
4139 /* Use the specified limit for the number of cases before using jump
4140 tables at higher optimization levels. */
4141 if (optimize > 2
4142 && selected_cpu->tune->max_case_values != 0)
4143 return selected_cpu->tune->max_case_values;
4144 else
4145 return optimize_size ? default_case_values_threshold () : 17;
4146 }
4147
4148 /* Return true if register REGNO is a valid index register.
4149 STRICT_P is true if REG_OK_STRICT is in effect. */
4150
4151 bool
4152 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4153 {
4154 if (!HARD_REGISTER_NUM_P (regno))
4155 {
4156 if (!strict_p)
4157 return true;
4158
4159 if (!reg_renumber)
4160 return false;
4161
4162 regno = reg_renumber[regno];
4163 }
4164 return GP_REGNUM_P (regno);
4165 }
4166
4167 /* Return true if register REGNO is a valid base register for mode MODE.
4168 STRICT_P is true if REG_OK_STRICT is in effect. */
4169
4170 bool
4171 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4172 {
4173 if (!HARD_REGISTER_NUM_P (regno))
4174 {
4175 if (!strict_p)
4176 return true;
4177
4178 if (!reg_renumber)
4179 return false;
4180
4181 regno = reg_renumber[regno];
4182 }
4183
4184 /* The fake registers will be eliminated to either the stack or
4185 hard frame pointer, both of which are usually valid base registers.
4186 Reload deals with the cases where the eliminated form isn't valid. */
4187 return (GP_REGNUM_P (regno)
4188 || regno == SP_REGNUM
4189 || regno == FRAME_POINTER_REGNUM
4190 || regno == ARG_POINTER_REGNUM);
4191 }
4192
4193 /* Return true if X is a valid base register for mode MODE.
4194 STRICT_P is true if REG_OK_STRICT is in effect. */
4195
4196 static bool
4197 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4198 {
4199 if (!strict_p
4200 && GET_CODE (x) == SUBREG
4201 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4202 x = SUBREG_REG (x);
4203
4204 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4205 }
4206
4207 /* Return true if address offset is a valid index. If it is, fill in INFO
4208 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4209
4210 static bool
4211 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4212 machine_mode mode, bool strict_p)
4213 {
4214 enum aarch64_address_type type;
4215 rtx index;
4216 int shift;
4217
4218 /* (reg:P) */
4219 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4220 && GET_MODE (x) == Pmode)
4221 {
4222 type = ADDRESS_REG_REG;
4223 index = x;
4224 shift = 0;
4225 }
4226 /* (sign_extend:DI (reg:SI)) */
4227 else if ((GET_CODE (x) == SIGN_EXTEND
4228 || GET_CODE (x) == ZERO_EXTEND)
4229 && GET_MODE (x) == DImode
4230 && GET_MODE (XEXP (x, 0)) == SImode)
4231 {
4232 type = (GET_CODE (x) == SIGN_EXTEND)
4233 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4234 index = XEXP (x, 0);
4235 shift = 0;
4236 }
4237 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4238 else if (GET_CODE (x) == MULT
4239 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4240 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4241 && GET_MODE (XEXP (x, 0)) == DImode
4242 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4243 && CONST_INT_P (XEXP (x, 1)))
4244 {
4245 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4246 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4247 index = XEXP (XEXP (x, 0), 0);
4248 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4249 }
4250 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4251 else if (GET_CODE (x) == ASHIFT
4252 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4253 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4254 && GET_MODE (XEXP (x, 0)) == DImode
4255 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4256 && CONST_INT_P (XEXP (x, 1)))
4257 {
4258 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4259 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4260 index = XEXP (XEXP (x, 0), 0);
4261 shift = INTVAL (XEXP (x, 1));
4262 }
4263 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4264 else if ((GET_CODE (x) == SIGN_EXTRACT
4265 || GET_CODE (x) == ZERO_EXTRACT)
4266 && GET_MODE (x) == DImode
4267 && GET_CODE (XEXP (x, 0)) == MULT
4268 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4269 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4270 {
4271 type = (GET_CODE (x) == SIGN_EXTRACT)
4272 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4273 index = XEXP (XEXP (x, 0), 0);
4274 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4275 if (INTVAL (XEXP (x, 1)) != 32 + shift
4276 || INTVAL (XEXP (x, 2)) != 0)
4277 shift = -1;
4278 }
4279 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4280 (const_int 0xffffffff<<shift)) */
4281 else if (GET_CODE (x) == AND
4282 && GET_MODE (x) == DImode
4283 && GET_CODE (XEXP (x, 0)) == MULT
4284 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4285 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4286 && CONST_INT_P (XEXP (x, 1)))
4287 {
4288 type = ADDRESS_REG_UXTW;
4289 index = XEXP (XEXP (x, 0), 0);
4290 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4291 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4292 shift = -1;
4293 }
4294 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4295 else if ((GET_CODE (x) == SIGN_EXTRACT
4296 || GET_CODE (x) == ZERO_EXTRACT)
4297 && GET_MODE (x) == DImode
4298 && GET_CODE (XEXP (x, 0)) == ASHIFT
4299 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4300 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4301 {
4302 type = (GET_CODE (x) == SIGN_EXTRACT)
4303 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4304 index = XEXP (XEXP (x, 0), 0);
4305 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4306 if (INTVAL (XEXP (x, 1)) != 32 + shift
4307 || INTVAL (XEXP (x, 2)) != 0)
4308 shift = -1;
4309 }
4310 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4311 (const_int 0xffffffff<<shift)) */
4312 else if (GET_CODE (x) == AND
4313 && GET_MODE (x) == DImode
4314 && GET_CODE (XEXP (x, 0)) == ASHIFT
4315 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4316 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4317 && CONST_INT_P (XEXP (x, 1)))
4318 {
4319 type = ADDRESS_REG_UXTW;
4320 index = XEXP (XEXP (x, 0), 0);
4321 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4322 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4323 shift = -1;
4324 }
4325 /* (mult:P (reg:P) (const_int scale)) */
4326 else if (GET_CODE (x) == MULT
4327 && GET_MODE (x) == Pmode
4328 && GET_MODE (XEXP (x, 0)) == Pmode
4329 && CONST_INT_P (XEXP (x, 1)))
4330 {
4331 type = ADDRESS_REG_REG;
4332 index = XEXP (x, 0);
4333 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4334 }
4335 /* (ashift:P (reg:P) (const_int shift)) */
4336 else if (GET_CODE (x) == ASHIFT
4337 && GET_MODE (x) == Pmode
4338 && GET_MODE (XEXP (x, 0)) == Pmode
4339 && CONST_INT_P (XEXP (x, 1)))
4340 {
4341 type = ADDRESS_REG_REG;
4342 index = XEXP (x, 0);
4343 shift = INTVAL (XEXP (x, 1));
4344 }
4345 else
4346 return false;
4347
4348 if (!strict_p
4349 && GET_CODE (index) == SUBREG
4350 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4351 index = SUBREG_REG (index);
4352
4353 if ((shift == 0 ||
4354 (shift > 0 && shift <= 3
4355 && (1 << shift) == GET_MODE_SIZE (mode)))
4356 && REG_P (index)
4357 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4358 {
4359 info->type = type;
4360 info->offset = index;
4361 info->shift = shift;
4362 return true;
4363 }
4364
4365 return false;
4366 }
4367
4368 /* Return true if MODE is one of the modes for which we
4369 support LDP/STP operations. */
4370
4371 static bool
4372 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4373 {
4374 return mode == SImode || mode == DImode
4375 || mode == SFmode || mode == DFmode
4376 || (aarch64_vector_mode_supported_p (mode)
4377 && GET_MODE_SIZE (mode) == 8);
4378 }
4379
4380 /* Return true if REGNO is a virtual pointer register, or an eliminable
4381 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4382 include stack_pointer or hard_frame_pointer. */
4383 static bool
4384 virt_or_elim_regno_p (unsigned regno)
4385 {
4386 return ((regno >= FIRST_VIRTUAL_REGISTER
4387 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4388 || regno == FRAME_POINTER_REGNUM
4389 || regno == ARG_POINTER_REGNUM);
4390 }
4391
4392 /* Return true if X is a valid address for machine mode MODE. If it is,
4393 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4394 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4395
4396 static bool
4397 aarch64_classify_address (struct aarch64_address_info *info,
4398 rtx x, machine_mode mode,
4399 RTX_CODE outer_code, bool strict_p)
4400 {
4401 enum rtx_code code = GET_CODE (x);
4402 rtx op0, op1;
4403
4404 /* On BE, we use load/store pair for all large int mode load/stores.
4405 TI/TFmode may also use a load/store pair. */
4406 bool load_store_pair_p = (outer_code == PARALLEL
4407 || mode == TImode
4408 || mode == TFmode
4409 || (BYTES_BIG_ENDIAN
4410 && aarch64_vect_struct_mode_p (mode)));
4411
4412 bool allow_reg_index_p =
4413 !load_store_pair_p
4414 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4415 && !aarch64_vect_struct_mode_p (mode);
4416
4417 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4418 REG addressing. */
4419 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4420 && (code != POST_INC && code != REG))
4421 return false;
4422
4423 switch (code)
4424 {
4425 case REG:
4426 case SUBREG:
4427 info->type = ADDRESS_REG_IMM;
4428 info->base = x;
4429 info->offset = const0_rtx;
4430 return aarch64_base_register_rtx_p (x, strict_p);
4431
4432 case PLUS:
4433 op0 = XEXP (x, 0);
4434 op1 = XEXP (x, 1);
4435
4436 if (! strict_p
4437 && REG_P (op0)
4438 && virt_or_elim_regno_p (REGNO (op0))
4439 && CONST_INT_P (op1))
4440 {
4441 info->type = ADDRESS_REG_IMM;
4442 info->base = op0;
4443 info->offset = op1;
4444
4445 return true;
4446 }
4447
4448 if (GET_MODE_SIZE (mode) != 0
4449 && CONST_INT_P (op1)
4450 && aarch64_base_register_rtx_p (op0, strict_p))
4451 {
4452 HOST_WIDE_INT offset = INTVAL (op1);
4453
4454 info->type = ADDRESS_REG_IMM;
4455 info->base = op0;
4456 info->offset = op1;
4457
4458 /* TImode and TFmode values are allowed in both pairs of X
4459 registers and individual Q registers. The available
4460 address modes are:
4461 X,X: 7-bit signed scaled offset
4462 Q: 9-bit signed offset
4463 We conservatively require an offset representable in either mode.
4464 When performing the check for pairs of X registers i.e. LDP/STP
4465 pass down DImode since that is the natural size of the LDP/STP
4466 instruction memory accesses. */
4467 if (mode == TImode || mode == TFmode)
4468 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4469 && (offset_9bit_signed_unscaled_p (mode, offset)
4470 || offset_12bit_unsigned_scaled_p (mode, offset)));
4471
4472 /* A 7bit offset check because OImode will emit a ldp/stp
4473 instruction (only big endian will get here).
4474 For ldp/stp instructions, the offset is scaled for the size of a
4475 single element of the pair. */
4476 if (mode == OImode)
4477 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4478
4479 /* Three 9/12 bit offsets checks because CImode will emit three
4480 ldr/str instructions (only big endian will get here). */
4481 if (mode == CImode)
4482 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4483 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4484 || offset_12bit_unsigned_scaled_p (V16QImode,
4485 offset + 32)));
4486
4487 /* Two 7bit offsets checks because XImode will emit two ldp/stp
4488 instructions (only big endian will get here). */
4489 if (mode == XImode)
4490 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4491 && aarch64_offset_7bit_signed_scaled_p (TImode,
4492 offset + 32));
4493
4494 if (load_store_pair_p)
4495 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4496 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4497 else
4498 return (offset_9bit_signed_unscaled_p (mode, offset)
4499 || offset_12bit_unsigned_scaled_p (mode, offset));
4500 }
4501
4502 if (allow_reg_index_p)
4503 {
4504 /* Look for base + (scaled/extended) index register. */
4505 if (aarch64_base_register_rtx_p (op0, strict_p)
4506 && aarch64_classify_index (info, op1, mode, strict_p))
4507 {
4508 info->base = op0;
4509 return true;
4510 }
4511 if (aarch64_base_register_rtx_p (op1, strict_p)
4512 && aarch64_classify_index (info, op0, mode, strict_p))
4513 {
4514 info->base = op1;
4515 return true;
4516 }
4517 }
4518
4519 return false;
4520
4521 case POST_INC:
4522 case POST_DEC:
4523 case PRE_INC:
4524 case PRE_DEC:
4525 info->type = ADDRESS_REG_WB;
4526 info->base = XEXP (x, 0);
4527 info->offset = NULL_RTX;
4528 return aarch64_base_register_rtx_p (info->base, strict_p);
4529
4530 case POST_MODIFY:
4531 case PRE_MODIFY:
4532 info->type = ADDRESS_REG_WB;
4533 info->base = XEXP (x, 0);
4534 if (GET_CODE (XEXP (x, 1)) == PLUS
4535 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4536 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4537 && aarch64_base_register_rtx_p (info->base, strict_p))
4538 {
4539 HOST_WIDE_INT offset;
4540 info->offset = XEXP (XEXP (x, 1), 1);
4541 offset = INTVAL (info->offset);
4542
4543 /* TImode and TFmode values are allowed in both pairs of X
4544 registers and individual Q registers. The available
4545 address modes are:
4546 X,X: 7-bit signed scaled offset
4547 Q: 9-bit signed offset
4548 We conservatively require an offset representable in either mode.
4549 */
4550 if (mode == TImode || mode == TFmode)
4551 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4552 && offset_9bit_signed_unscaled_p (mode, offset));
4553
4554 if (load_store_pair_p)
4555 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4556 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4557 else
4558 return offset_9bit_signed_unscaled_p (mode, offset);
4559 }
4560 return false;
4561
4562 case CONST:
4563 case SYMBOL_REF:
4564 case LABEL_REF:
4565 /* load literal: pc-relative constant pool entry. Only supported
4566 for SI mode or larger. */
4567 info->type = ADDRESS_SYMBOLIC;
4568
4569 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4570 {
4571 rtx sym, addend;
4572
4573 split_const (x, &sym, &addend);
4574 return ((GET_CODE (sym) == LABEL_REF
4575 || (GET_CODE (sym) == SYMBOL_REF
4576 && CONSTANT_POOL_ADDRESS_P (sym)
4577 && aarch64_pcrelative_literal_loads)));
4578 }
4579 return false;
4580
4581 case LO_SUM:
4582 info->type = ADDRESS_LO_SUM;
4583 info->base = XEXP (x, 0);
4584 info->offset = XEXP (x, 1);
4585 if (allow_reg_index_p
4586 && aarch64_base_register_rtx_p (info->base, strict_p))
4587 {
4588 rtx sym, offs;
4589 split_const (info->offset, &sym, &offs);
4590 if (GET_CODE (sym) == SYMBOL_REF
4591 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4592 {
4593 /* The symbol and offset must be aligned to the access size. */
4594 unsigned int align;
4595 unsigned int ref_size;
4596
4597 if (CONSTANT_POOL_ADDRESS_P (sym))
4598 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4599 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4600 {
4601 tree exp = SYMBOL_REF_DECL (sym);
4602 align = TYPE_ALIGN (TREE_TYPE (exp));
4603 align = CONSTANT_ALIGNMENT (exp, align);
4604 }
4605 else if (SYMBOL_REF_DECL (sym))
4606 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4607 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4608 && SYMBOL_REF_BLOCK (sym) != NULL)
4609 align = SYMBOL_REF_BLOCK (sym)->alignment;
4610 else
4611 align = BITS_PER_UNIT;
4612
4613 ref_size = GET_MODE_SIZE (mode);
4614 if (ref_size == 0)
4615 ref_size = GET_MODE_SIZE (DImode);
4616
4617 return ((INTVAL (offs) & (ref_size - 1)) == 0
4618 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4619 }
4620 }
4621 return false;
4622
4623 default:
4624 return false;
4625 }
4626 }
4627
4628 /* Return true if the address X is valid for a PRFM instruction.
4629 STRICT_P is true if we should do strict checking with
4630 aarch64_classify_address. */
4631
4632 bool
4633 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4634 {
4635 struct aarch64_address_info addr;
4636
4637 /* PRFM accepts the same addresses as DImode... */
4638 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4639 if (!res)
4640 return false;
4641
4642 /* ... except writeback forms. */
4643 return addr.type != ADDRESS_REG_WB;
4644 }
4645
4646 bool
4647 aarch64_symbolic_address_p (rtx x)
4648 {
4649 rtx offset;
4650
4651 split_const (x, &x, &offset);
4652 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4653 }
4654
4655 /* Classify the base of symbolic expression X. */
4656
4657 enum aarch64_symbol_type
4658 aarch64_classify_symbolic_expression (rtx x)
4659 {
4660 rtx offset;
4661
4662 split_const (x, &x, &offset);
4663 return aarch64_classify_symbol (x, offset);
4664 }
4665
4666
4667 /* Return TRUE if X is a legitimate address for accessing memory in
4668 mode MODE. */
4669 static bool
4670 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4671 {
4672 struct aarch64_address_info addr;
4673
4674 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4675 }
4676
4677 /* Return TRUE if X is a legitimate address for accessing memory in
4678 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4679 pair operation. */
4680 bool
4681 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4682 RTX_CODE outer_code, bool strict_p)
4683 {
4684 struct aarch64_address_info addr;
4685
4686 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4687 }
4688
4689 /* Split an out-of-range address displacement into a base and offset.
4690 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4691 to increase opportunities for sharing the base address of different sizes.
4692 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
4693 static bool
4694 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4695 {
4696 HOST_WIDE_INT offset = INTVAL (*disp);
4697 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4698
4699 if (mode == TImode || mode == TFmode
4700 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4701 base = (offset + 0x100) & ~0x1ff;
4702
4703 *off = GEN_INT (base);
4704 *disp = GEN_INT (offset - base);
4705 return true;
4706 }
4707
4708 /* Return the binary representation of floating point constant VALUE in INTVAL.
4709 If the value cannot be converted, return false without setting INTVAL.
4710 The conversion is done in the given MODE. */
4711 bool
4712 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4713 {
4714
4715 /* We make a general exception for 0. */
4716 if (aarch64_float_const_zero_rtx_p (value))
4717 {
4718 *intval = 0;
4719 return true;
4720 }
4721
4722 machine_mode mode = GET_MODE (value);
4723 if (GET_CODE (value) != CONST_DOUBLE
4724 || !SCALAR_FLOAT_MODE_P (mode)
4725 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4726 /* Only support up to DF mode. */
4727 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4728 return false;
4729
4730 unsigned HOST_WIDE_INT ival = 0;
4731
4732 long res[2];
4733 real_to_target (res,
4734 CONST_DOUBLE_REAL_VALUE (value),
4735 REAL_MODE_FORMAT (mode));
4736
4737 if (mode == DFmode)
4738 {
4739 int order = BYTES_BIG_ENDIAN ? 1 : 0;
4740 ival = zext_hwi (res[order], 32);
4741 ival |= (zext_hwi (res[1 - order], 32) << 32);
4742 }
4743 else
4744 ival = zext_hwi (res[0], 32);
4745
4746 *intval = ival;
4747 return true;
4748 }
4749
4750 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4751 single MOV(+MOVK) followed by an FMOV. */
4752 bool
4753 aarch64_float_const_rtx_p (rtx x)
4754 {
4755 machine_mode mode = GET_MODE (x);
4756 if (mode == VOIDmode)
4757 return false;
4758
4759 /* Determine whether it's cheaper to write float constants as
4760 mov/movk pairs over ldr/adrp pairs. */
4761 unsigned HOST_WIDE_INT ival;
4762
4763 if (GET_CODE (x) == CONST_DOUBLE
4764 && SCALAR_FLOAT_MODE_P (mode)
4765 && aarch64_reinterpret_float_as_int (x, &ival))
4766 {
4767 machine_mode imode = (mode == HFmode
4768 ? SImode
4769 : int_mode_for_mode (mode).require ());
4770 int num_instr = aarch64_internal_mov_immediate
4771 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4772 return num_instr < 3;
4773 }
4774
4775 return false;
4776 }
4777
4778 /* Return TRUE if rtx X is immediate constant 0.0 */
4779 bool
4780 aarch64_float_const_zero_rtx_p (rtx x)
4781 {
4782 if (GET_MODE (x) == VOIDmode)
4783 return false;
4784
4785 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4786 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4787 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4788 }
4789
4790 /* Return TRUE if rtx X is immediate constant that fits in a single
4791 MOVI immediate operation. */
4792 bool
4793 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4794 {
4795 if (!TARGET_SIMD)
4796 return false;
4797
4798 machine_mode vmode, imode;
4799 unsigned HOST_WIDE_INT ival;
4800
4801 if (GET_CODE (x) == CONST_DOUBLE
4802 && SCALAR_FLOAT_MODE_P (mode))
4803 {
4804 if (!aarch64_reinterpret_float_as_int (x, &ival))
4805 return false;
4806
4807 /* We make a general exception for 0. */
4808 if (aarch64_float_const_zero_rtx_p (x))
4809 return true;
4810
4811 imode = int_mode_for_mode (mode).require ();
4812 }
4813 else if (GET_CODE (x) == CONST_INT
4814 && SCALAR_INT_MODE_P (mode))
4815 {
4816 imode = mode;
4817 ival = INTVAL (x);
4818 }
4819 else
4820 return false;
4821
4822 /* use a 64 bit mode for everything except for DI/DF mode, where we use
4823 a 128 bit vector mode. */
4824 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
4825
4826 vmode = aarch64_simd_container_mode (imode, width);
4827 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4828
4829 return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4830 }
4831
4832
4833 /* Return the fixed registers used for condition codes. */
4834
4835 static bool
4836 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4837 {
4838 *p1 = CC_REGNUM;
4839 *p2 = INVALID_REGNUM;
4840 return true;
4841 }
4842
4843 /* This function is used by the call expanders of the machine description.
4844 RESULT is the register in which the result is returned. It's NULL for
4845 "call" and "sibcall".
4846 MEM is the location of the function call.
4847 SIBCALL indicates whether this function call is normal call or sibling call.
4848 It will generate different pattern accordingly. */
4849
4850 void
4851 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4852 {
4853 rtx call, callee, tmp;
4854 rtvec vec;
4855 machine_mode mode;
4856
4857 gcc_assert (MEM_P (mem));
4858 callee = XEXP (mem, 0);
4859 mode = GET_MODE (callee);
4860 gcc_assert (mode == Pmode);
4861
4862 /* Decide if we should generate indirect calls by loading the
4863 address of the callee into a register before performing
4864 the branch-and-link. */
4865 if (SYMBOL_REF_P (callee)
4866 ? (aarch64_is_long_call_p (callee)
4867 || aarch64_is_noplt_call_p (callee))
4868 : !REG_P (callee))
4869 XEXP (mem, 0) = force_reg (mode, callee);
4870
4871 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4872
4873 if (result != NULL_RTX)
4874 call = gen_rtx_SET (result, call);
4875
4876 if (sibcall)
4877 tmp = ret_rtx;
4878 else
4879 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4880
4881 vec = gen_rtvec (2, call, tmp);
4882 call = gen_rtx_PARALLEL (VOIDmode, vec);
4883
4884 aarch64_emit_call_insn (call);
4885 }
4886
4887 /* Emit call insn with PAT and do aarch64-specific handling. */
4888
4889 void
4890 aarch64_emit_call_insn (rtx pat)
4891 {
4892 rtx insn = emit_call_insn (pat);
4893
4894 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4895 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4896 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4897 }
4898
4899 machine_mode
4900 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4901 {
4902 /* All floating point compares return CCFP if it is an equality
4903 comparison, and CCFPE otherwise. */
4904 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4905 {
4906 switch (code)
4907 {
4908 case EQ:
4909 case NE:
4910 case UNORDERED:
4911 case ORDERED:
4912 case UNLT:
4913 case UNLE:
4914 case UNGT:
4915 case UNGE:
4916 case UNEQ:
4917 case LTGT:
4918 return CCFPmode;
4919
4920 case LT:
4921 case LE:
4922 case GT:
4923 case GE:
4924 return CCFPEmode;
4925
4926 default:
4927 gcc_unreachable ();
4928 }
4929 }
4930
4931 /* Equality comparisons of short modes against zero can be performed
4932 using the TST instruction with the appropriate bitmask. */
4933 if (y == const0_rtx && REG_P (x)
4934 && (code == EQ || code == NE)
4935 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4936 return CC_NZmode;
4937
4938 /* Similarly, comparisons of zero_extends from shorter modes can
4939 be performed using an ANDS with an immediate mask. */
4940 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4941 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4942 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4943 && (code == EQ || code == NE))
4944 return CC_NZmode;
4945
4946 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4947 && y == const0_rtx
4948 && (code == EQ || code == NE || code == LT || code == GE)
4949 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4950 || GET_CODE (x) == NEG
4951 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4952 && CONST_INT_P (XEXP (x, 2)))))
4953 return CC_NZmode;
4954
4955 /* A compare with a shifted operand. Because of canonicalization,
4956 the comparison will have to be swapped when we emit the assembly
4957 code. */
4958 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4959 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4960 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4961 || GET_CODE (x) == LSHIFTRT
4962 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4963 return CC_SWPmode;
4964
4965 /* Similarly for a negated operand, but we can only do this for
4966 equalities. */
4967 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4968 && (REG_P (y) || GET_CODE (y) == SUBREG)
4969 && (code == EQ || code == NE)
4970 && GET_CODE (x) == NEG)
4971 return CC_Zmode;
4972
4973 /* A test for unsigned overflow. */
4974 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4975 && code == NE
4976 && GET_CODE (x) == PLUS
4977 && GET_CODE (y) == ZERO_EXTEND)
4978 return CC_Cmode;
4979
4980 /* For everything else, return CCmode. */
4981 return CCmode;
4982 }
4983
4984 static int
4985 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
4986
4987 int
4988 aarch64_get_condition_code (rtx x)
4989 {
4990 machine_mode mode = GET_MODE (XEXP (x, 0));
4991 enum rtx_code comp_code = GET_CODE (x);
4992
4993 if (GET_MODE_CLASS (mode) != MODE_CC)
4994 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4995 return aarch64_get_condition_code_1 (mode, comp_code);
4996 }
4997
4998 static int
4999 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5000 {
5001 switch (mode)
5002 {
5003 case E_CCFPmode:
5004 case E_CCFPEmode:
5005 switch (comp_code)
5006 {
5007 case GE: return AARCH64_GE;
5008 case GT: return AARCH64_GT;
5009 case LE: return AARCH64_LS;
5010 case LT: return AARCH64_MI;
5011 case NE: return AARCH64_NE;
5012 case EQ: return AARCH64_EQ;
5013 case ORDERED: return AARCH64_VC;
5014 case UNORDERED: return AARCH64_VS;
5015 case UNLT: return AARCH64_LT;
5016 case UNLE: return AARCH64_LE;
5017 case UNGT: return AARCH64_HI;
5018 case UNGE: return AARCH64_PL;
5019 default: return -1;
5020 }
5021 break;
5022
5023 case E_CCmode:
5024 switch (comp_code)
5025 {
5026 case NE: return AARCH64_NE;
5027 case EQ: return AARCH64_EQ;
5028 case GE: return AARCH64_GE;
5029 case GT: return AARCH64_GT;
5030 case LE: return AARCH64_LE;
5031 case LT: return AARCH64_LT;
5032 case GEU: return AARCH64_CS;
5033 case GTU: return AARCH64_HI;
5034 case LEU: return AARCH64_LS;
5035 case LTU: return AARCH64_CC;
5036 default: return -1;
5037 }
5038 break;
5039
5040 case E_CC_SWPmode:
5041 switch (comp_code)
5042 {
5043 case NE: return AARCH64_NE;
5044 case EQ: return AARCH64_EQ;
5045 case GE: return AARCH64_LE;
5046 case GT: return AARCH64_LT;
5047 case LE: return AARCH64_GE;
5048 case LT: return AARCH64_GT;
5049 case GEU: return AARCH64_LS;
5050 case GTU: return AARCH64_CC;
5051 case LEU: return AARCH64_CS;
5052 case LTU: return AARCH64_HI;
5053 default: return -1;
5054 }
5055 break;
5056
5057 case E_CC_NZmode:
5058 switch (comp_code)
5059 {
5060 case NE: return AARCH64_NE;
5061 case EQ: return AARCH64_EQ;
5062 case GE: return AARCH64_PL;
5063 case LT: return AARCH64_MI;
5064 default: return -1;
5065 }
5066 break;
5067
5068 case E_CC_Zmode:
5069 switch (comp_code)
5070 {
5071 case NE: return AARCH64_NE;
5072 case EQ: return AARCH64_EQ;
5073 default: return -1;
5074 }
5075 break;
5076
5077 case E_CC_Cmode:
5078 switch (comp_code)
5079 {
5080 case NE: return AARCH64_CS;
5081 case EQ: return AARCH64_CC;
5082 default: return -1;
5083 }
5084 break;
5085
5086 default:
5087 return -1;
5088 }
5089
5090 return -1;
5091 }
5092
5093 bool
5094 aarch64_const_vec_all_same_in_range_p (rtx x,
5095 HOST_WIDE_INT minval,
5096 HOST_WIDE_INT maxval)
5097 {
5098 HOST_WIDE_INT firstval;
5099 int count, i;
5100
5101 if (GET_CODE (x) != CONST_VECTOR
5102 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5103 return false;
5104
5105 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5106 if (firstval < minval || firstval > maxval)
5107 return false;
5108
5109 count = CONST_VECTOR_NUNITS (x);
5110 for (i = 1; i < count; i++)
5111 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5112 return false;
5113
5114 return true;
5115 }
5116
5117 bool
5118 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5119 {
5120 return aarch64_const_vec_all_same_in_range_p (x, val, val);
5121 }
5122
5123
5124 /* N Z C V. */
5125 #define AARCH64_CC_V 1
5126 #define AARCH64_CC_C (1 << 1)
5127 #define AARCH64_CC_Z (1 << 2)
5128 #define AARCH64_CC_N (1 << 3)
5129
5130 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
5131 static const int aarch64_nzcv_codes[] =
5132 {
5133 0, /* EQ, Z == 1. */
5134 AARCH64_CC_Z, /* NE, Z == 0. */
5135 0, /* CS, C == 1. */
5136 AARCH64_CC_C, /* CC, C == 0. */
5137 0, /* MI, N == 1. */
5138 AARCH64_CC_N, /* PL, N == 0. */
5139 0, /* VS, V == 1. */
5140 AARCH64_CC_V, /* VC, V == 0. */
5141 0, /* HI, C ==1 && Z == 0. */
5142 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
5143 AARCH64_CC_V, /* GE, N == V. */
5144 0, /* LT, N != V. */
5145 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
5146 0, /* LE, !(Z == 0 && N == V). */
5147 0, /* AL, Any. */
5148 0 /* NV, Any. */
5149 };
5150
5151 /* Print operand X to file F in a target specific manner according to CODE.
5152 The acceptable formatting commands given by CODE are:
5153 'c': An integer or symbol address without a preceding #
5154 sign.
5155 'e': Print the sign/zero-extend size as a character 8->b,
5156 16->h, 32->w.
5157 'p': Prints N such that 2^N == X (X must be power of 2 and
5158 const int).
5159 'P': Print the number of non-zero bits in X (a const_int).
5160 'H': Print the higher numbered register of a pair (TImode)
5161 of regs.
5162 'm': Print a condition (eq, ne, etc).
5163 'M': Same as 'm', but invert condition.
5164 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5165 'S/T/U/V': Print a FP/SIMD register name for a register list.
5166 The register printed is the FP/SIMD register name
5167 of X + 0/1/2/3 for S/T/U/V.
5168 'R': Print a scalar FP/SIMD register name + 1.
5169 'X': Print bottom 16 bits of integer constant in hex.
5170 'w/x': Print a general register name or the zero register
5171 (32-bit or 64-bit).
5172 '0': Print a normal operand, if it's a general register,
5173 then we assume DImode.
5174 'k': Print NZCV for conditional compare instructions.
5175 'A': Output address constant representing the first
5176 argument of X, specifying a relocation offset
5177 if appropriate.
5178 'L': Output constant address specified by X
5179 with a relocation offset if appropriate.
5180 'G': Prints address of X, specifying a PC relative
5181 relocation mode if appropriate. */
5182
5183 static void
5184 aarch64_print_operand (FILE *f, rtx x, int code)
5185 {
5186 switch (code)
5187 {
5188 case 'c':
5189 switch (GET_CODE (x))
5190 {
5191 case CONST_INT:
5192 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5193 break;
5194
5195 case SYMBOL_REF:
5196 output_addr_const (f, x);
5197 break;
5198
5199 case CONST:
5200 if (GET_CODE (XEXP (x, 0)) == PLUS
5201 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5202 {
5203 output_addr_const (f, x);
5204 break;
5205 }
5206 /* Fall through. */
5207
5208 default:
5209 output_operand_lossage ("Unsupported operand for code '%c'", code);
5210 }
5211 break;
5212
5213 case 'e':
5214 {
5215 int n;
5216
5217 if (!CONST_INT_P (x)
5218 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5219 {
5220 output_operand_lossage ("invalid operand for '%%%c'", code);
5221 return;
5222 }
5223
5224 switch (n)
5225 {
5226 case 3:
5227 fputc ('b', f);
5228 break;
5229 case 4:
5230 fputc ('h', f);
5231 break;
5232 case 5:
5233 fputc ('w', f);
5234 break;
5235 default:
5236 output_operand_lossage ("invalid operand for '%%%c'", code);
5237 return;
5238 }
5239 }
5240 break;
5241
5242 case 'p':
5243 {
5244 int n;
5245
5246 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5247 {
5248 output_operand_lossage ("invalid operand for '%%%c'", code);
5249 return;
5250 }
5251
5252 asm_fprintf (f, "%d", n);
5253 }
5254 break;
5255
5256 case 'P':
5257 if (!CONST_INT_P (x))
5258 {
5259 output_operand_lossage ("invalid operand for '%%%c'", code);
5260 return;
5261 }
5262
5263 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5264 break;
5265
5266 case 'H':
5267 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5268 {
5269 output_operand_lossage ("invalid operand for '%%%c'", code);
5270 return;
5271 }
5272
5273 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5274 break;
5275
5276 case 'M':
5277 case 'm':
5278 {
5279 int cond_code;
5280 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5281 if (x == const_true_rtx)
5282 {
5283 if (code == 'M')
5284 fputs ("nv", f);
5285 return;
5286 }
5287
5288 if (!COMPARISON_P (x))
5289 {
5290 output_operand_lossage ("invalid operand for '%%%c'", code);
5291 return;
5292 }
5293
5294 cond_code = aarch64_get_condition_code (x);
5295 gcc_assert (cond_code >= 0);
5296 if (code == 'M')
5297 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5298 fputs (aarch64_condition_codes[cond_code], f);
5299 }
5300 break;
5301
5302 case 'b':
5303 case 'h':
5304 case 's':
5305 case 'd':
5306 case 'q':
5307 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5308 {
5309 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5310 return;
5311 }
5312 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5313 break;
5314
5315 case 'S':
5316 case 'T':
5317 case 'U':
5318 case 'V':
5319 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5320 {
5321 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5322 return;
5323 }
5324 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5325 break;
5326
5327 case 'R':
5328 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5329 {
5330 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5331 return;
5332 }
5333 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5334 break;
5335
5336 case 'X':
5337 if (!CONST_INT_P (x))
5338 {
5339 output_operand_lossage ("invalid operand for '%%%c'", code);
5340 return;
5341 }
5342 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5343 break;
5344
5345 case 'w':
5346 case 'x':
5347 if (x == const0_rtx
5348 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5349 {
5350 asm_fprintf (f, "%czr", code);
5351 break;
5352 }
5353
5354 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5355 {
5356 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5357 break;
5358 }
5359
5360 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5361 {
5362 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5363 break;
5364 }
5365
5366 /* Fall through */
5367
5368 case 0:
5369 if (x == NULL)
5370 {
5371 output_operand_lossage ("missing operand");
5372 return;
5373 }
5374
5375 switch (GET_CODE (x))
5376 {
5377 case REG:
5378 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5379 break;
5380
5381 case MEM:
5382 output_address (GET_MODE (x), XEXP (x, 0));
5383 /* Check all memory references are Pmode - even with ILP32. */
5384 gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5385 break;
5386
5387 case CONST:
5388 case LABEL_REF:
5389 case SYMBOL_REF:
5390 output_addr_const (asm_out_file, x);
5391 break;
5392
5393 case CONST_INT:
5394 asm_fprintf (f, "%wd", INTVAL (x));
5395 break;
5396
5397 case CONST_VECTOR:
5398 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5399 {
5400 gcc_assert (
5401 aarch64_const_vec_all_same_in_range_p (x,
5402 HOST_WIDE_INT_MIN,
5403 HOST_WIDE_INT_MAX));
5404 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5405 }
5406 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5407 {
5408 fputc ('0', f);
5409 }
5410 else
5411 gcc_unreachable ();
5412 break;
5413
5414 case CONST_DOUBLE:
5415 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5416 be getting CONST_DOUBLEs holding integers. */
5417 gcc_assert (GET_MODE (x) != VOIDmode);
5418 if (aarch64_float_const_zero_rtx_p (x))
5419 {
5420 fputc ('0', f);
5421 break;
5422 }
5423 else if (aarch64_float_const_representable_p (x))
5424 {
5425 #define buf_size 20
5426 char float_buf[buf_size] = {'\0'};
5427 real_to_decimal_for_mode (float_buf,
5428 CONST_DOUBLE_REAL_VALUE (x),
5429 buf_size, buf_size,
5430 1, GET_MODE (x));
5431 asm_fprintf (asm_out_file, "%s", float_buf);
5432 break;
5433 #undef buf_size
5434 }
5435 output_operand_lossage ("invalid constant");
5436 return;
5437 default:
5438 output_operand_lossage ("invalid operand");
5439 return;
5440 }
5441 break;
5442
5443 case 'A':
5444 if (GET_CODE (x) == HIGH)
5445 x = XEXP (x, 0);
5446
5447 switch (aarch64_classify_symbolic_expression (x))
5448 {
5449 case SYMBOL_SMALL_GOT_4G:
5450 asm_fprintf (asm_out_file, ":got:");
5451 break;
5452
5453 case SYMBOL_SMALL_TLSGD:
5454 asm_fprintf (asm_out_file, ":tlsgd:");
5455 break;
5456
5457 case SYMBOL_SMALL_TLSDESC:
5458 asm_fprintf (asm_out_file, ":tlsdesc:");
5459 break;
5460
5461 case SYMBOL_SMALL_TLSIE:
5462 asm_fprintf (asm_out_file, ":gottprel:");
5463 break;
5464
5465 case SYMBOL_TLSLE24:
5466 asm_fprintf (asm_out_file, ":tprel:");
5467 break;
5468
5469 case SYMBOL_TINY_GOT:
5470 gcc_unreachable ();
5471 break;
5472
5473 default:
5474 break;
5475 }
5476 output_addr_const (asm_out_file, x);
5477 break;
5478
5479 case 'L':
5480 switch (aarch64_classify_symbolic_expression (x))
5481 {
5482 case SYMBOL_SMALL_GOT_4G:
5483 asm_fprintf (asm_out_file, ":lo12:");
5484 break;
5485
5486 case SYMBOL_SMALL_TLSGD:
5487 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5488 break;
5489
5490 case SYMBOL_SMALL_TLSDESC:
5491 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5492 break;
5493
5494 case SYMBOL_SMALL_TLSIE:
5495 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5496 break;
5497
5498 case SYMBOL_TLSLE12:
5499 asm_fprintf (asm_out_file, ":tprel_lo12:");
5500 break;
5501
5502 case SYMBOL_TLSLE24:
5503 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5504 break;
5505
5506 case SYMBOL_TINY_GOT:
5507 asm_fprintf (asm_out_file, ":got:");
5508 break;
5509
5510 case SYMBOL_TINY_TLSIE:
5511 asm_fprintf (asm_out_file, ":gottprel:");
5512 break;
5513
5514 default:
5515 break;
5516 }
5517 output_addr_const (asm_out_file, x);
5518 break;
5519
5520 case 'G':
5521 switch (aarch64_classify_symbolic_expression (x))
5522 {
5523 case SYMBOL_TLSLE24:
5524 asm_fprintf (asm_out_file, ":tprel_hi12:");
5525 break;
5526 default:
5527 break;
5528 }
5529 output_addr_const (asm_out_file, x);
5530 break;
5531
5532 case 'k':
5533 {
5534 HOST_WIDE_INT cond_code;
5535
5536 if (!CONST_INT_P (x))
5537 {
5538 output_operand_lossage ("invalid operand for '%%%c'", code);
5539 return;
5540 }
5541
5542 cond_code = INTVAL (x);
5543 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5544 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5545 }
5546 break;
5547
5548 default:
5549 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5550 return;
5551 }
5552 }
5553
5554 static void
5555 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5556 {
5557 struct aarch64_address_info addr;
5558
5559 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5560 switch (addr.type)
5561 {
5562 case ADDRESS_REG_IMM:
5563 if (addr.offset == const0_rtx)
5564 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5565 else
5566 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5567 INTVAL (addr.offset));
5568 return;
5569
5570 case ADDRESS_REG_REG:
5571 if (addr.shift == 0)
5572 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5573 reg_names [REGNO (addr.offset)]);
5574 else
5575 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5576 reg_names [REGNO (addr.offset)], addr.shift);
5577 return;
5578
5579 case ADDRESS_REG_UXTW:
5580 if (addr.shift == 0)
5581 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5582 REGNO (addr.offset) - R0_REGNUM);
5583 else
5584 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5585 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5586 return;
5587
5588 case ADDRESS_REG_SXTW:
5589 if (addr.shift == 0)
5590 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5591 REGNO (addr.offset) - R0_REGNUM);
5592 else
5593 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5594 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5595 return;
5596
5597 case ADDRESS_REG_WB:
5598 switch (GET_CODE (x))
5599 {
5600 case PRE_INC:
5601 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5602 GET_MODE_SIZE (mode));
5603 return;
5604 case POST_INC:
5605 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5606 GET_MODE_SIZE (mode));
5607 return;
5608 case PRE_DEC:
5609 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5610 GET_MODE_SIZE (mode));
5611 return;
5612 case POST_DEC:
5613 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5614 GET_MODE_SIZE (mode));
5615 return;
5616 case PRE_MODIFY:
5617 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5618 INTVAL (addr.offset));
5619 return;
5620 case POST_MODIFY:
5621 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5622 INTVAL (addr.offset));
5623 return;
5624 default:
5625 break;
5626 }
5627 break;
5628
5629 case ADDRESS_LO_SUM:
5630 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5631 output_addr_const (f, addr.offset);
5632 asm_fprintf (f, "]");
5633 return;
5634
5635 case ADDRESS_SYMBOLIC:
5636 break;
5637 }
5638
5639 output_addr_const (f, x);
5640 }
5641
5642 bool
5643 aarch64_label_mentioned_p (rtx x)
5644 {
5645 const char *fmt;
5646 int i;
5647
5648 if (GET_CODE (x) == LABEL_REF)
5649 return true;
5650
5651 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5652 referencing instruction, but they are constant offsets, not
5653 symbols. */
5654 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5655 return false;
5656
5657 fmt = GET_RTX_FORMAT (GET_CODE (x));
5658 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5659 {
5660 if (fmt[i] == 'E')
5661 {
5662 int j;
5663
5664 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5665 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5666 return 1;
5667 }
5668 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5669 return 1;
5670 }
5671
5672 return 0;
5673 }
5674
5675 /* Implement REGNO_REG_CLASS. */
5676
5677 enum reg_class
5678 aarch64_regno_regclass (unsigned regno)
5679 {
5680 if (GP_REGNUM_P (regno))
5681 return GENERAL_REGS;
5682
5683 if (regno == SP_REGNUM)
5684 return STACK_REG;
5685
5686 if (regno == FRAME_POINTER_REGNUM
5687 || regno == ARG_POINTER_REGNUM)
5688 return POINTER_REGS;
5689
5690 if (FP_REGNUM_P (regno))
5691 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5692
5693 return NO_REGS;
5694 }
5695
5696 static rtx
5697 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5698 {
5699 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5700 where mask is selected by alignment and size of the offset.
5701 We try to pick as large a range for the offset as possible to
5702 maximize the chance of a CSE. However, for aligned addresses
5703 we limit the range to 4k so that structures with different sized
5704 elements are likely to use the same base. We need to be careful
5705 not to split a CONST for some forms of address expression, otherwise
5706 it will generate sub-optimal code. */
5707
5708 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5709 {
5710 rtx base = XEXP (x, 0);
5711 rtx offset_rtx = XEXP (x, 1);
5712 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5713
5714 if (GET_CODE (base) == PLUS)
5715 {
5716 rtx op0 = XEXP (base, 0);
5717 rtx op1 = XEXP (base, 1);
5718
5719 /* Force any scaling into a temp for CSE. */
5720 op0 = force_reg (Pmode, op0);
5721 op1 = force_reg (Pmode, op1);
5722
5723 /* Let the pointer register be in op0. */
5724 if (REG_POINTER (op1))
5725 std::swap (op0, op1);
5726
5727 /* If the pointer is virtual or frame related, then we know that
5728 virtual register instantiation or register elimination is going
5729 to apply a second constant. We want the two constants folded
5730 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5731 if (virt_or_elim_regno_p (REGNO (op0)))
5732 {
5733 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5734 NULL_RTX, true, OPTAB_DIRECT);
5735 return gen_rtx_PLUS (Pmode, base, op1);
5736 }
5737
5738 /* Otherwise, in order to encourage CSE (and thence loop strength
5739 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5740 base = expand_binop (Pmode, add_optab, op0, op1,
5741 NULL_RTX, true, OPTAB_DIRECT);
5742 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5743 }
5744
5745 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5746 HOST_WIDE_INT base_offset;
5747 if (GET_MODE_SIZE (mode) > 16)
5748 base_offset = (offset + 0x400) & ~0x7f0;
5749 /* For offsets aren't a multiple of the access size, the limit is
5750 -256...255. */
5751 else if (offset & (GET_MODE_SIZE (mode) - 1))
5752 {
5753 base_offset = (offset + 0x100) & ~0x1ff;
5754
5755 /* BLKmode typically uses LDP of X-registers. */
5756 if (mode == BLKmode)
5757 base_offset = (offset + 512) & ~0x3ff;
5758 }
5759 /* Small negative offsets are supported. */
5760 else if (IN_RANGE (offset, -256, 0))
5761 base_offset = 0;
5762 else if (mode == TImode || mode == TFmode)
5763 base_offset = (offset + 0x100) & ~0x1ff;
5764 /* Use 12-bit offset by access size. */
5765 else
5766 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5767
5768 if (base_offset != 0)
5769 {
5770 base = plus_constant (Pmode, base, base_offset);
5771 base = force_operand (base, NULL_RTX);
5772 return plus_constant (Pmode, base, offset - base_offset);
5773 }
5774 }
5775
5776 return x;
5777 }
5778
5779 /* Return the reload icode required for a constant pool in mode. */
5780 static enum insn_code
5781 aarch64_constant_pool_reload_icode (machine_mode mode)
5782 {
5783 switch (mode)
5784 {
5785 case E_SFmode:
5786 return CODE_FOR_aarch64_reload_movcpsfdi;
5787
5788 case E_DFmode:
5789 return CODE_FOR_aarch64_reload_movcpdfdi;
5790
5791 case E_TFmode:
5792 return CODE_FOR_aarch64_reload_movcptfdi;
5793
5794 case E_V8QImode:
5795 return CODE_FOR_aarch64_reload_movcpv8qidi;
5796
5797 case E_V16QImode:
5798 return CODE_FOR_aarch64_reload_movcpv16qidi;
5799
5800 case E_V4HImode:
5801 return CODE_FOR_aarch64_reload_movcpv4hidi;
5802
5803 case E_V8HImode:
5804 return CODE_FOR_aarch64_reload_movcpv8hidi;
5805
5806 case E_V2SImode:
5807 return CODE_FOR_aarch64_reload_movcpv2sidi;
5808
5809 case E_V4SImode:
5810 return CODE_FOR_aarch64_reload_movcpv4sidi;
5811
5812 case E_V2DImode:
5813 return CODE_FOR_aarch64_reload_movcpv2didi;
5814
5815 case E_V2DFmode:
5816 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5817
5818 default:
5819 gcc_unreachable ();
5820 }
5821
5822 gcc_unreachable ();
5823 }
5824 static reg_class_t
5825 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5826 reg_class_t rclass,
5827 machine_mode mode,
5828 secondary_reload_info *sri)
5829 {
5830
5831 /* If we have to disable direct literal pool loads and stores because the
5832 function is too big, then we need a scratch register. */
5833 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5834 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5835 || targetm.vector_mode_supported_p (GET_MODE (x)))
5836 && !aarch64_pcrelative_literal_loads)
5837 {
5838 sri->icode = aarch64_constant_pool_reload_icode (mode);
5839 return NO_REGS;
5840 }
5841
5842 /* Without the TARGET_SIMD instructions we cannot move a Q register
5843 to a Q register directly. We need a scratch. */
5844 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5845 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5846 && reg_class_subset_p (rclass, FP_REGS))
5847 {
5848 if (mode == TFmode)
5849 sri->icode = CODE_FOR_aarch64_reload_movtf;
5850 else if (mode == TImode)
5851 sri->icode = CODE_FOR_aarch64_reload_movti;
5852 return NO_REGS;
5853 }
5854
5855 /* A TFmode or TImode memory access should be handled via an FP_REGS
5856 because AArch64 has richer addressing modes for LDR/STR instructions
5857 than LDP/STP instructions. */
5858 if (TARGET_FLOAT && rclass == GENERAL_REGS
5859 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5860 return FP_REGS;
5861
5862 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5863 return GENERAL_REGS;
5864
5865 return NO_REGS;
5866 }
5867
5868 static bool
5869 aarch64_can_eliminate (const int from, const int to)
5870 {
5871 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5872 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5873
5874 if (frame_pointer_needed)
5875 {
5876 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5877 return true;
5878 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5879 return false;
5880 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5881 && !cfun->calls_alloca)
5882 return true;
5883 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5884 return true;
5885
5886 return false;
5887 }
5888 else
5889 {
5890 /* If we decided that we didn't need a leaf frame pointer but then used
5891 LR in the function, then we'll want a frame pointer after all, so
5892 prevent this elimination to ensure a frame pointer is used. */
5893 if (to == STACK_POINTER_REGNUM
5894 && flag_omit_leaf_frame_pointer
5895 && df_regs_ever_live_p (LR_REGNUM))
5896 return false;
5897 }
5898
5899 return true;
5900 }
5901
5902 HOST_WIDE_INT
5903 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5904 {
5905 aarch64_layout_frame ();
5906
5907 if (to == HARD_FRAME_POINTER_REGNUM)
5908 {
5909 if (from == ARG_POINTER_REGNUM)
5910 return cfun->machine->frame.hard_fp_offset;
5911
5912 if (from == FRAME_POINTER_REGNUM)
5913 return cfun->machine->frame.hard_fp_offset
5914 - cfun->machine->frame.locals_offset;
5915 }
5916
5917 if (to == STACK_POINTER_REGNUM)
5918 {
5919 if (from == FRAME_POINTER_REGNUM)
5920 return cfun->machine->frame.frame_size
5921 - cfun->machine->frame.locals_offset;
5922 }
5923
5924 return cfun->machine->frame.frame_size;
5925 }
5926
5927 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5928 previous frame. */
5929
5930 rtx
5931 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5932 {
5933 if (count != 0)
5934 return const0_rtx;
5935 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5936 }
5937
5938
5939 static void
5940 aarch64_asm_trampoline_template (FILE *f)
5941 {
5942 if (TARGET_ILP32)
5943 {
5944 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5945 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5946 }
5947 else
5948 {
5949 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5950 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5951 }
5952 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5953 assemble_aligned_integer (4, const0_rtx);
5954 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5955 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5956 }
5957
5958 static void
5959 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5960 {
5961 rtx fnaddr, mem, a_tramp;
5962 const int tramp_code_sz = 16;
5963
5964 /* Don't need to copy the trailing D-words, we fill those in below. */
5965 emit_block_move (m_tramp, assemble_trampoline_template (),
5966 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5967 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5968 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5969 if (GET_MODE (fnaddr) != ptr_mode)
5970 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5971 emit_move_insn (mem, fnaddr);
5972
5973 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5974 emit_move_insn (mem, chain_value);
5975
5976 /* XXX We should really define a "clear_cache" pattern and use
5977 gen_clear_cache(). */
5978 a_tramp = XEXP (m_tramp, 0);
5979 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5980 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5981 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5982 ptr_mode);
5983 }
5984
5985 static unsigned char
5986 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5987 {
5988 switch (regclass)
5989 {
5990 case CALLER_SAVE_REGS:
5991 case POINTER_REGS:
5992 case GENERAL_REGS:
5993 case ALL_REGS:
5994 case FP_REGS:
5995 case FP_LO_REGS:
5996 return
5997 aarch64_vector_mode_p (mode)
5998 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5999 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6000 case STACK_REG:
6001 return 1;
6002
6003 case NO_REGS:
6004 return 0;
6005
6006 default:
6007 break;
6008 }
6009 gcc_unreachable ();
6010 }
6011
6012 static reg_class_t
6013 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6014 {
6015 if (regclass == POINTER_REGS)
6016 return GENERAL_REGS;
6017
6018 if (regclass == STACK_REG)
6019 {
6020 if (REG_P(x)
6021 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6022 return regclass;
6023
6024 return NO_REGS;
6025 }
6026
6027 /* Register eliminiation can result in a request for
6028 SP+constant->FP_REGS. We cannot support such operations which
6029 use SP as source and an FP_REG as destination, so reject out
6030 right now. */
6031 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6032 {
6033 rtx lhs = XEXP (x, 0);
6034
6035 /* Look through a possible SUBREG introduced by ILP32. */
6036 if (GET_CODE (lhs) == SUBREG)
6037 lhs = SUBREG_REG (lhs);
6038
6039 gcc_assert (REG_P (lhs));
6040 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6041 POINTER_REGS));
6042 return NO_REGS;
6043 }
6044
6045 return regclass;
6046 }
6047
6048 void
6049 aarch64_asm_output_labelref (FILE* f, const char *name)
6050 {
6051 asm_fprintf (f, "%U%s", name);
6052 }
6053
6054 static void
6055 aarch64_elf_asm_constructor (rtx symbol, int priority)
6056 {
6057 if (priority == DEFAULT_INIT_PRIORITY)
6058 default_ctor_section_asm_out_constructor (symbol, priority);
6059 else
6060 {
6061 section *s;
6062 /* While priority is known to be in range [0, 65535], so 18 bytes
6063 would be enough, the compiler might not know that. To avoid
6064 -Wformat-truncation false positive, use a larger size. */
6065 char buf[23];
6066 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6067 s = get_section (buf, SECTION_WRITE, NULL);
6068 switch_to_section (s);
6069 assemble_align (POINTER_SIZE);
6070 assemble_aligned_integer (POINTER_BYTES, symbol);
6071 }
6072 }
6073
6074 static void
6075 aarch64_elf_asm_destructor (rtx symbol, int priority)
6076 {
6077 if (priority == DEFAULT_INIT_PRIORITY)
6078 default_dtor_section_asm_out_destructor (symbol, priority);
6079 else
6080 {
6081 section *s;
6082 /* While priority is known to be in range [0, 65535], so 18 bytes
6083 would be enough, the compiler might not know that. To avoid
6084 -Wformat-truncation false positive, use a larger size. */
6085 char buf[23];
6086 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6087 s = get_section (buf, SECTION_WRITE, NULL);
6088 switch_to_section (s);
6089 assemble_align (POINTER_SIZE);
6090 assemble_aligned_integer (POINTER_BYTES, symbol);
6091 }
6092 }
6093
6094 const char*
6095 aarch64_output_casesi (rtx *operands)
6096 {
6097 char buf[100];
6098 char label[100];
6099 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6100 int index;
6101 static const char *const patterns[4][2] =
6102 {
6103 {
6104 "ldrb\t%w3, [%0,%w1,uxtw]",
6105 "add\t%3, %4, %w3, sxtb #2"
6106 },
6107 {
6108 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6109 "add\t%3, %4, %w3, sxth #2"
6110 },
6111 {
6112 "ldr\t%w3, [%0,%w1,uxtw #2]",
6113 "add\t%3, %4, %w3, sxtw #2"
6114 },
6115 /* We assume that DImode is only generated when not optimizing and
6116 that we don't really need 64-bit address offsets. That would
6117 imply an object file with 8GB of code in a single function! */
6118 {
6119 "ldr\t%w3, [%0,%w1,uxtw #2]",
6120 "add\t%3, %4, %w3, sxtw #2"
6121 }
6122 };
6123
6124 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6125
6126 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
6127
6128 gcc_assert (index >= 0 && index <= 3);
6129
6130 /* Need to implement table size reduction, by chaning the code below. */
6131 output_asm_insn (patterns[index][0], operands);
6132 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6133 snprintf (buf, sizeof (buf),
6134 "adr\t%%4, %s", targetm.strip_name_encoding (label));
6135 output_asm_insn (buf, operands);
6136 output_asm_insn (patterns[index][1], operands);
6137 output_asm_insn ("br\t%3", operands);
6138 assemble_label (asm_out_file, label);
6139 return "";
6140 }
6141
6142
6143 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6144 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6145 operator. */
6146
6147 int
6148 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6149 {
6150 if (shift >= 0 && shift <= 3)
6151 {
6152 int size;
6153 for (size = 8; size <= 32; size *= 2)
6154 {
6155 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6156 if (mask == bits << shift)
6157 return size;
6158 }
6159 }
6160 return 0;
6161 }
6162
6163 /* Constant pools are per function only when PC relative
6164 literal loads are true or we are in the large memory
6165 model. */
6166
6167 static inline bool
6168 aarch64_can_use_per_function_literal_pools_p (void)
6169 {
6170 return (aarch64_pcrelative_literal_loads
6171 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6172 }
6173
6174 static bool
6175 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6176 {
6177 /* Fixme:: In an ideal world this would work similar
6178 to the logic in aarch64_select_rtx_section but this
6179 breaks bootstrap in gcc go. For now we workaround
6180 this by returning false here. */
6181 return false;
6182 }
6183
6184 /* Select appropriate section for constants depending
6185 on where we place literal pools. */
6186
6187 static section *
6188 aarch64_select_rtx_section (machine_mode mode,
6189 rtx x,
6190 unsigned HOST_WIDE_INT align)
6191 {
6192 if (aarch64_can_use_per_function_literal_pools_p ())
6193 return function_section (current_function_decl);
6194
6195 return default_elf_select_rtx_section (mode, x, align);
6196 }
6197
6198 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6199 void
6200 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6201 HOST_WIDE_INT offset)
6202 {
6203 /* When using per-function literal pools, we must ensure that any code
6204 section is aligned to the minimal instruction length, lest we get
6205 errors from the assembler re "unaligned instructions". */
6206 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6207 ASM_OUTPUT_ALIGN (f, 2);
6208 }
6209
6210 /* Costs. */
6211
6212 /* Helper function for rtx cost calculation. Strip a shift expression
6213 from X. Returns the inner operand if successful, or the original
6214 expression on failure. */
6215 static rtx
6216 aarch64_strip_shift (rtx x)
6217 {
6218 rtx op = x;
6219
6220 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6221 we can convert both to ROR during final output. */
6222 if ((GET_CODE (op) == ASHIFT
6223 || GET_CODE (op) == ASHIFTRT
6224 || GET_CODE (op) == LSHIFTRT
6225 || GET_CODE (op) == ROTATERT
6226 || GET_CODE (op) == ROTATE)
6227 && CONST_INT_P (XEXP (op, 1)))
6228 return XEXP (op, 0);
6229
6230 if (GET_CODE (op) == MULT
6231 && CONST_INT_P (XEXP (op, 1))
6232 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6233 return XEXP (op, 0);
6234
6235 return x;
6236 }
6237
6238 /* Helper function for rtx cost calculation. Strip an extend
6239 expression from X. Returns the inner operand if successful, or the
6240 original expression on failure. We deal with a number of possible
6241 canonicalization variations here. If STRIP_SHIFT is true, then
6242 we can strip off a shift also. */
6243 static rtx
6244 aarch64_strip_extend (rtx x, bool strip_shift)
6245 {
6246 rtx op = x;
6247
6248 /* Zero and sign extraction of a widened value. */
6249 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6250 && XEXP (op, 2) == const0_rtx
6251 && GET_CODE (XEXP (op, 0)) == MULT
6252 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
6253 XEXP (op, 1)))
6254 return XEXP (XEXP (op, 0), 0);
6255
6256 /* It can also be represented (for zero-extend) as an AND with an
6257 immediate. */
6258 if (GET_CODE (op) == AND
6259 && GET_CODE (XEXP (op, 0)) == MULT
6260 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6261 && CONST_INT_P (XEXP (op, 1))
6262 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6263 INTVAL (XEXP (op, 1))) != 0)
6264 return XEXP (XEXP (op, 0), 0);
6265
6266 /* Now handle extended register, as this may also have an optional
6267 left shift by 1..4. */
6268 if (strip_shift
6269 && GET_CODE (op) == ASHIFT
6270 && CONST_INT_P (XEXP (op, 1))
6271 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6272 op = XEXP (op, 0);
6273
6274 if (GET_CODE (op) == ZERO_EXTEND
6275 || GET_CODE (op) == SIGN_EXTEND)
6276 op = XEXP (op, 0);
6277
6278 if (op != x)
6279 return op;
6280
6281 return x;
6282 }
6283
6284 /* Return true iff CODE is a shift supported in combination
6285 with arithmetic instructions. */
6286
6287 static bool
6288 aarch64_shift_p (enum rtx_code code)
6289 {
6290 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6291 }
6292
6293
6294 /* Return true iff X is a cheap shift without a sign extend. */
6295
6296 static bool
6297 aarch64_cheap_mult_shift_p (rtx x)
6298 {
6299 rtx op0, op1;
6300
6301 op0 = XEXP (x, 0);
6302 op1 = XEXP (x, 1);
6303
6304 if (!(aarch64_tune_params.extra_tuning_flags
6305 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6306 return false;
6307
6308 if (GET_CODE (op0) == SIGN_EXTEND)
6309 return false;
6310
6311 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6312 && UINTVAL (op1) <= 4)
6313 return true;
6314
6315 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6316 return false;
6317
6318 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6319
6320 if (l2 > 0 && l2 <= 4)
6321 return true;
6322
6323 return false;
6324 }
6325
6326 /* Helper function for rtx cost calculation. Calculate the cost of
6327 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6328 Return the calculated cost of the expression, recursing manually in to
6329 operands where needed. */
6330
6331 static int
6332 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6333 {
6334 rtx op0, op1;
6335 const struct cpu_cost_table *extra_cost
6336 = aarch64_tune_params.insn_extra_cost;
6337 int cost = 0;
6338 bool compound_p = (outer == PLUS || outer == MINUS);
6339 machine_mode mode = GET_MODE (x);
6340
6341 gcc_checking_assert (code == MULT);
6342
6343 op0 = XEXP (x, 0);
6344 op1 = XEXP (x, 1);
6345
6346 if (VECTOR_MODE_P (mode))
6347 mode = GET_MODE_INNER (mode);
6348
6349 /* Integer multiply/fma. */
6350 if (GET_MODE_CLASS (mode) == MODE_INT)
6351 {
6352 /* The multiply will be canonicalized as a shift, cost it as such. */
6353 if (aarch64_shift_p (GET_CODE (x))
6354 || (CONST_INT_P (op1)
6355 && exact_log2 (INTVAL (op1)) > 0))
6356 {
6357 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6358 || GET_CODE (op0) == SIGN_EXTEND;
6359 if (speed)
6360 {
6361 if (compound_p)
6362 {
6363 /* If the shift is considered cheap,
6364 then don't add any cost. */
6365 if (aarch64_cheap_mult_shift_p (x))
6366 ;
6367 else if (REG_P (op1))
6368 /* ARITH + shift-by-register. */
6369 cost += extra_cost->alu.arith_shift_reg;
6370 else if (is_extend)
6371 /* ARITH + extended register. We don't have a cost field
6372 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6373 cost += extra_cost->alu.extend_arith;
6374 else
6375 /* ARITH + shift-by-immediate. */
6376 cost += extra_cost->alu.arith_shift;
6377 }
6378 else
6379 /* LSL (immediate). */
6380 cost += extra_cost->alu.shift;
6381
6382 }
6383 /* Strip extends as we will have costed them in the case above. */
6384 if (is_extend)
6385 op0 = aarch64_strip_extend (op0, true);
6386
6387 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6388
6389 return cost;
6390 }
6391
6392 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6393 compound and let the below cases handle it. After all, MNEG is a
6394 special-case alias of MSUB. */
6395 if (GET_CODE (op0) == NEG)
6396 {
6397 op0 = XEXP (op0, 0);
6398 compound_p = true;
6399 }
6400
6401 /* Integer multiplies or FMAs have zero/sign extending variants. */
6402 if ((GET_CODE (op0) == ZERO_EXTEND
6403 && GET_CODE (op1) == ZERO_EXTEND)
6404 || (GET_CODE (op0) == SIGN_EXTEND
6405 && GET_CODE (op1) == SIGN_EXTEND))
6406 {
6407 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6408 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6409
6410 if (speed)
6411 {
6412 if (compound_p)
6413 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6414 cost += extra_cost->mult[0].extend_add;
6415 else
6416 /* MUL/SMULL/UMULL. */
6417 cost += extra_cost->mult[0].extend;
6418 }
6419
6420 return cost;
6421 }
6422
6423 /* This is either an integer multiply or a MADD. In both cases
6424 we want to recurse and cost the operands. */
6425 cost += rtx_cost (op0, mode, MULT, 0, speed);
6426 cost += rtx_cost (op1, mode, MULT, 1, speed);
6427
6428 if (speed)
6429 {
6430 if (compound_p)
6431 /* MADD/MSUB. */
6432 cost += extra_cost->mult[mode == DImode].add;
6433 else
6434 /* MUL. */
6435 cost += extra_cost->mult[mode == DImode].simple;
6436 }
6437
6438 return cost;
6439 }
6440 else
6441 {
6442 if (speed)
6443 {
6444 /* Floating-point FMA/FMUL can also support negations of the
6445 operands, unless the rounding mode is upward or downward in
6446 which case FNMUL is different than FMUL with operand negation. */
6447 bool neg0 = GET_CODE (op0) == NEG;
6448 bool neg1 = GET_CODE (op1) == NEG;
6449 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6450 {
6451 if (neg0)
6452 op0 = XEXP (op0, 0);
6453 if (neg1)
6454 op1 = XEXP (op1, 0);
6455 }
6456
6457 if (compound_p)
6458 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6459 cost += extra_cost->fp[mode == DFmode].fma;
6460 else
6461 /* FMUL/FNMUL. */
6462 cost += extra_cost->fp[mode == DFmode].mult;
6463 }
6464
6465 cost += rtx_cost (op0, mode, MULT, 0, speed);
6466 cost += rtx_cost (op1, mode, MULT, 1, speed);
6467 return cost;
6468 }
6469 }
6470
6471 static int
6472 aarch64_address_cost (rtx x,
6473 machine_mode mode,
6474 addr_space_t as ATTRIBUTE_UNUSED,
6475 bool speed)
6476 {
6477 enum rtx_code c = GET_CODE (x);
6478 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6479 struct aarch64_address_info info;
6480 int cost = 0;
6481 info.shift = 0;
6482
6483 if (!aarch64_classify_address (&info, x, mode, c, false))
6484 {
6485 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6486 {
6487 /* This is a CONST or SYMBOL ref which will be split
6488 in a different way depending on the code model in use.
6489 Cost it through the generic infrastructure. */
6490 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6491 /* Divide through by the cost of one instruction to
6492 bring it to the same units as the address costs. */
6493 cost_symbol_ref /= COSTS_N_INSNS (1);
6494 /* The cost is then the cost of preparing the address,
6495 followed by an immediate (possibly 0) offset. */
6496 return cost_symbol_ref + addr_cost->imm_offset;
6497 }
6498 else
6499 {
6500 /* This is most likely a jump table from a case
6501 statement. */
6502 return addr_cost->register_offset;
6503 }
6504 }
6505
6506 switch (info.type)
6507 {
6508 case ADDRESS_LO_SUM:
6509 case ADDRESS_SYMBOLIC:
6510 case ADDRESS_REG_IMM:
6511 cost += addr_cost->imm_offset;
6512 break;
6513
6514 case ADDRESS_REG_WB:
6515 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6516 cost += addr_cost->pre_modify;
6517 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6518 cost += addr_cost->post_modify;
6519 else
6520 gcc_unreachable ();
6521
6522 break;
6523
6524 case ADDRESS_REG_REG:
6525 cost += addr_cost->register_offset;
6526 break;
6527
6528 case ADDRESS_REG_SXTW:
6529 cost += addr_cost->register_sextend;
6530 break;
6531
6532 case ADDRESS_REG_UXTW:
6533 cost += addr_cost->register_zextend;
6534 break;
6535
6536 default:
6537 gcc_unreachable ();
6538 }
6539
6540
6541 if (info.shift > 0)
6542 {
6543 /* For the sake of calculating the cost of the shifted register
6544 component, we can treat same sized modes in the same way. */
6545 switch (GET_MODE_BITSIZE (mode))
6546 {
6547 case 16:
6548 cost += addr_cost->addr_scale_costs.hi;
6549 break;
6550
6551 case 32:
6552 cost += addr_cost->addr_scale_costs.si;
6553 break;
6554
6555 case 64:
6556 cost += addr_cost->addr_scale_costs.di;
6557 break;
6558
6559 /* We can't tell, or this is a 128-bit vector. */
6560 default:
6561 cost += addr_cost->addr_scale_costs.ti;
6562 break;
6563 }
6564 }
6565
6566 return cost;
6567 }
6568
6569 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6570 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6571 to be taken. */
6572
6573 int
6574 aarch64_branch_cost (bool speed_p, bool predictable_p)
6575 {
6576 /* When optimizing for speed, use the cost of unpredictable branches. */
6577 const struct cpu_branch_cost *branch_costs =
6578 aarch64_tune_params.branch_costs;
6579
6580 if (!speed_p || predictable_p)
6581 return branch_costs->predictable;
6582 else
6583 return branch_costs->unpredictable;
6584 }
6585
6586 /* Return true if the RTX X in mode MODE is a zero or sign extract
6587 usable in an ADD or SUB (extended register) instruction. */
6588 static bool
6589 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6590 {
6591 /* Catch add with a sign extract.
6592 This is add_<optab><mode>_multp2. */
6593 if (GET_CODE (x) == SIGN_EXTRACT
6594 || GET_CODE (x) == ZERO_EXTRACT)
6595 {
6596 rtx op0 = XEXP (x, 0);
6597 rtx op1 = XEXP (x, 1);
6598 rtx op2 = XEXP (x, 2);
6599
6600 if (GET_CODE (op0) == MULT
6601 && CONST_INT_P (op1)
6602 && op2 == const0_rtx
6603 && CONST_INT_P (XEXP (op0, 1))
6604 && aarch64_is_extend_from_extract (mode,
6605 XEXP (op0, 1),
6606 op1))
6607 {
6608 return true;
6609 }
6610 }
6611 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6612 No shift. */
6613 else if (GET_CODE (x) == SIGN_EXTEND
6614 || GET_CODE (x) == ZERO_EXTEND)
6615 return REG_P (XEXP (x, 0));
6616
6617 return false;
6618 }
6619
6620 static bool
6621 aarch64_frint_unspec_p (unsigned int u)
6622 {
6623 switch (u)
6624 {
6625 case UNSPEC_FRINTZ:
6626 case UNSPEC_FRINTP:
6627 case UNSPEC_FRINTM:
6628 case UNSPEC_FRINTA:
6629 case UNSPEC_FRINTN:
6630 case UNSPEC_FRINTX:
6631 case UNSPEC_FRINTI:
6632 return true;
6633
6634 default:
6635 return false;
6636 }
6637 }
6638
6639 /* Return true iff X is an rtx that will match an extr instruction
6640 i.e. as described in the *extr<mode>5_insn family of patterns.
6641 OP0 and OP1 will be set to the operands of the shifts involved
6642 on success and will be NULL_RTX otherwise. */
6643
6644 static bool
6645 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6646 {
6647 rtx op0, op1;
6648 machine_mode mode = GET_MODE (x);
6649
6650 *res_op0 = NULL_RTX;
6651 *res_op1 = NULL_RTX;
6652
6653 if (GET_CODE (x) != IOR)
6654 return false;
6655
6656 op0 = XEXP (x, 0);
6657 op1 = XEXP (x, 1);
6658
6659 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6660 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6661 {
6662 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6663 if (GET_CODE (op1) == ASHIFT)
6664 std::swap (op0, op1);
6665
6666 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6667 return false;
6668
6669 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6670 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6671
6672 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6673 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6674 {
6675 *res_op0 = XEXP (op0, 0);
6676 *res_op1 = XEXP (op1, 0);
6677 return true;
6678 }
6679 }
6680
6681 return false;
6682 }
6683
6684 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6685 storing it in *COST. Result is true if the total cost of the operation
6686 has now been calculated. */
6687 static bool
6688 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6689 {
6690 rtx inner;
6691 rtx comparator;
6692 enum rtx_code cmpcode;
6693
6694 if (COMPARISON_P (op0))
6695 {
6696 inner = XEXP (op0, 0);
6697 comparator = XEXP (op0, 1);
6698 cmpcode = GET_CODE (op0);
6699 }
6700 else
6701 {
6702 inner = op0;
6703 comparator = const0_rtx;
6704 cmpcode = NE;
6705 }
6706
6707 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6708 {
6709 /* Conditional branch. */
6710 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6711 return true;
6712 else
6713 {
6714 if (cmpcode == NE || cmpcode == EQ)
6715 {
6716 if (comparator == const0_rtx)
6717 {
6718 /* TBZ/TBNZ/CBZ/CBNZ. */
6719 if (GET_CODE (inner) == ZERO_EXTRACT)
6720 /* TBZ/TBNZ. */
6721 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6722 ZERO_EXTRACT, 0, speed);
6723 else
6724 /* CBZ/CBNZ. */
6725 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6726
6727 return true;
6728 }
6729 }
6730 else if (cmpcode == LT || cmpcode == GE)
6731 {
6732 /* TBZ/TBNZ. */
6733 if (comparator == const0_rtx)
6734 return true;
6735 }
6736 }
6737 }
6738 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6739 {
6740 /* CCMP. */
6741 if (GET_CODE (op1) == COMPARE)
6742 {
6743 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6744 if (XEXP (op1, 1) == const0_rtx)
6745 *cost += 1;
6746 if (speed)
6747 {
6748 machine_mode mode = GET_MODE (XEXP (op1, 0));
6749 const struct cpu_cost_table *extra_cost
6750 = aarch64_tune_params.insn_extra_cost;
6751
6752 if (GET_MODE_CLASS (mode) == MODE_INT)
6753 *cost += extra_cost->alu.arith;
6754 else
6755 *cost += extra_cost->fp[mode == DFmode].compare;
6756 }
6757 return true;
6758 }
6759
6760 /* It's a conditional operation based on the status flags,
6761 so it must be some flavor of CSEL. */
6762
6763 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6764 if (GET_CODE (op1) == NEG
6765 || GET_CODE (op1) == NOT
6766 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6767 op1 = XEXP (op1, 0);
6768 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6769 {
6770 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6771 op1 = XEXP (op1, 0);
6772 op2 = XEXP (op2, 0);
6773 }
6774
6775 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6776 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6777 return true;
6778 }
6779
6780 /* We don't know what this is, cost all operands. */
6781 return false;
6782 }
6783
6784 /* Check whether X is a bitfield operation of the form shift + extend that
6785 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6786 operand to which the bitfield operation is applied. Otherwise return
6787 NULL_RTX. */
6788
6789 static rtx
6790 aarch64_extend_bitfield_pattern_p (rtx x)
6791 {
6792 rtx_code outer_code = GET_CODE (x);
6793 machine_mode outer_mode = GET_MODE (x);
6794
6795 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6796 && outer_mode != SImode && outer_mode != DImode)
6797 return NULL_RTX;
6798
6799 rtx inner = XEXP (x, 0);
6800 rtx_code inner_code = GET_CODE (inner);
6801 machine_mode inner_mode = GET_MODE (inner);
6802 rtx op = NULL_RTX;
6803
6804 switch (inner_code)
6805 {
6806 case ASHIFT:
6807 if (CONST_INT_P (XEXP (inner, 1))
6808 && (inner_mode == QImode || inner_mode == HImode))
6809 op = XEXP (inner, 0);
6810 break;
6811 case LSHIFTRT:
6812 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6813 && (inner_mode == QImode || inner_mode == HImode))
6814 op = XEXP (inner, 0);
6815 break;
6816 case ASHIFTRT:
6817 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6818 && (inner_mode == QImode || inner_mode == HImode))
6819 op = XEXP (inner, 0);
6820 break;
6821 default:
6822 break;
6823 }
6824
6825 return op;
6826 }
6827
6828 /* Return true if the mask and a shift amount from an RTX of the form
6829 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6830 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6831
6832 bool
6833 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6834 {
6835 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6836 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6837 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6838 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6839 }
6840
6841 /* Calculate the cost of calculating X, storing it in *COST. Result
6842 is true if the total cost of the operation has now been calculated. */
6843 static bool
6844 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6845 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6846 {
6847 rtx op0, op1, op2;
6848 const struct cpu_cost_table *extra_cost
6849 = aarch64_tune_params.insn_extra_cost;
6850 int code = GET_CODE (x);
6851 scalar_int_mode int_mode;
6852
6853 /* By default, assume that everything has equivalent cost to the
6854 cheapest instruction. Any additional costs are applied as a delta
6855 above this default. */
6856 *cost = COSTS_N_INSNS (1);
6857
6858 switch (code)
6859 {
6860 case SET:
6861 /* The cost depends entirely on the operands to SET. */
6862 *cost = 0;
6863 op0 = SET_DEST (x);
6864 op1 = SET_SRC (x);
6865
6866 switch (GET_CODE (op0))
6867 {
6868 case MEM:
6869 if (speed)
6870 {
6871 rtx address = XEXP (op0, 0);
6872 if (VECTOR_MODE_P (mode))
6873 *cost += extra_cost->ldst.storev;
6874 else if (GET_MODE_CLASS (mode) == MODE_INT)
6875 *cost += extra_cost->ldst.store;
6876 else if (mode == SFmode)
6877 *cost += extra_cost->ldst.storef;
6878 else if (mode == DFmode)
6879 *cost += extra_cost->ldst.stored;
6880
6881 *cost +=
6882 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6883 0, speed));
6884 }
6885
6886 *cost += rtx_cost (op1, mode, SET, 1, speed);
6887 return true;
6888
6889 case SUBREG:
6890 if (! REG_P (SUBREG_REG (op0)))
6891 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6892
6893 /* Fall through. */
6894 case REG:
6895 /* The cost is one per vector-register copied. */
6896 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6897 {
6898 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6899 / GET_MODE_SIZE (V4SImode);
6900 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6901 }
6902 /* const0_rtx is in general free, but we will use an
6903 instruction to set a register to 0. */
6904 else if (REG_P (op1) || op1 == const0_rtx)
6905 {
6906 /* The cost is 1 per register copied. */
6907 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6908 / UNITS_PER_WORD;
6909 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6910 }
6911 else
6912 /* Cost is just the cost of the RHS of the set. */
6913 *cost += rtx_cost (op1, mode, SET, 1, speed);
6914 return true;
6915
6916 case ZERO_EXTRACT:
6917 case SIGN_EXTRACT:
6918 /* Bit-field insertion. Strip any redundant widening of
6919 the RHS to meet the width of the target. */
6920 if (GET_CODE (op1) == SUBREG)
6921 op1 = SUBREG_REG (op1);
6922 if ((GET_CODE (op1) == ZERO_EXTEND
6923 || GET_CODE (op1) == SIGN_EXTEND)
6924 && CONST_INT_P (XEXP (op0, 1))
6925 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6926 >= INTVAL (XEXP (op0, 1))))
6927 op1 = XEXP (op1, 0);
6928
6929 if (CONST_INT_P (op1))
6930 {
6931 /* MOV immediate is assumed to always be cheap. */
6932 *cost = COSTS_N_INSNS (1);
6933 }
6934 else
6935 {
6936 /* BFM. */
6937 if (speed)
6938 *cost += extra_cost->alu.bfi;
6939 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6940 }
6941
6942 return true;
6943
6944 default:
6945 /* We can't make sense of this, assume default cost. */
6946 *cost = COSTS_N_INSNS (1);
6947 return false;
6948 }
6949 return false;
6950
6951 case CONST_INT:
6952 /* If an instruction can incorporate a constant within the
6953 instruction, the instruction's expression avoids calling
6954 rtx_cost() on the constant. If rtx_cost() is called on a
6955 constant, then it is usually because the constant must be
6956 moved into a register by one or more instructions.
6957
6958 The exception is constant 0, which can be expressed
6959 as XZR/WZR and is therefore free. The exception to this is
6960 if we have (set (reg) (const0_rtx)) in which case we must cost
6961 the move. However, we can catch that when we cost the SET, so
6962 we don't need to consider that here. */
6963 if (x == const0_rtx)
6964 *cost = 0;
6965 else
6966 {
6967 /* To an approximation, building any other constant is
6968 proportionally expensive to the number of instructions
6969 required to build that constant. This is true whether we
6970 are compiling for SPEED or otherwise. */
6971 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6972 (NULL_RTX, x, false, mode));
6973 }
6974 return true;
6975
6976 case CONST_DOUBLE:
6977
6978 /* First determine number of instructions to do the move
6979 as an integer constant. */
6980 if (!aarch64_float_const_representable_p (x)
6981 && !aarch64_can_const_movi_rtx_p (x, mode)
6982 && aarch64_float_const_rtx_p (x))
6983 {
6984 unsigned HOST_WIDE_INT ival;
6985 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
6986 gcc_assert (succeed);
6987
6988 machine_mode imode = (mode == HFmode
6989 ? SImode
6990 : int_mode_for_mode (mode).require ());
6991 int ncost = aarch64_internal_mov_immediate
6992 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6993 *cost += COSTS_N_INSNS (ncost);
6994 return true;
6995 }
6996
6997 if (speed)
6998 {
6999 /* mov[df,sf]_aarch64. */
7000 if (aarch64_float_const_representable_p (x))
7001 /* FMOV (scalar immediate). */
7002 *cost += extra_cost->fp[mode == DFmode].fpconst;
7003 else if (!aarch64_float_const_zero_rtx_p (x))
7004 {
7005 /* This will be a load from memory. */
7006 if (mode == DFmode)
7007 *cost += extra_cost->ldst.loadd;
7008 else
7009 *cost += extra_cost->ldst.loadf;
7010 }
7011 else
7012 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7013 or MOV v0.s[0], wzr - neither of which are modeled by the
7014 cost tables. Just use the default cost. */
7015 {
7016 }
7017 }
7018
7019 return true;
7020
7021 case MEM:
7022 if (speed)
7023 {
7024 /* For loads we want the base cost of a load, plus an
7025 approximation for the additional cost of the addressing
7026 mode. */
7027 rtx address = XEXP (x, 0);
7028 if (VECTOR_MODE_P (mode))
7029 *cost += extra_cost->ldst.loadv;
7030 else if (GET_MODE_CLASS (mode) == MODE_INT)
7031 *cost += extra_cost->ldst.load;
7032 else if (mode == SFmode)
7033 *cost += extra_cost->ldst.loadf;
7034 else if (mode == DFmode)
7035 *cost += extra_cost->ldst.loadd;
7036
7037 *cost +=
7038 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7039 0, speed));
7040 }
7041
7042 return true;
7043
7044 case NEG:
7045 op0 = XEXP (x, 0);
7046
7047 if (VECTOR_MODE_P (mode))
7048 {
7049 if (speed)
7050 {
7051 /* FNEG. */
7052 *cost += extra_cost->vect.alu;
7053 }
7054 return false;
7055 }
7056
7057 if (GET_MODE_CLASS (mode) == MODE_INT)
7058 {
7059 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7060 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7061 {
7062 /* CSETM. */
7063 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7064 return true;
7065 }
7066
7067 /* Cost this as SUB wzr, X. */
7068 op0 = CONST0_RTX (mode);
7069 op1 = XEXP (x, 0);
7070 goto cost_minus;
7071 }
7072
7073 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7074 {
7075 /* Support (neg(fma...)) as a single instruction only if
7076 sign of zeros is unimportant. This matches the decision
7077 making in aarch64.md. */
7078 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7079 {
7080 /* FNMADD. */
7081 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7082 return true;
7083 }
7084 if (GET_CODE (op0) == MULT)
7085 {
7086 /* FNMUL. */
7087 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7088 return true;
7089 }
7090 if (speed)
7091 /* FNEG. */
7092 *cost += extra_cost->fp[mode == DFmode].neg;
7093 return false;
7094 }
7095
7096 return false;
7097
7098 case CLRSB:
7099 case CLZ:
7100 if (speed)
7101 {
7102 if (VECTOR_MODE_P (mode))
7103 *cost += extra_cost->vect.alu;
7104 else
7105 *cost += extra_cost->alu.clz;
7106 }
7107
7108 return false;
7109
7110 case COMPARE:
7111 op0 = XEXP (x, 0);
7112 op1 = XEXP (x, 1);
7113
7114 if (op1 == const0_rtx
7115 && GET_CODE (op0) == AND)
7116 {
7117 x = op0;
7118 mode = GET_MODE (op0);
7119 goto cost_logic;
7120 }
7121
7122 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7123 {
7124 /* TODO: A write to the CC flags possibly costs extra, this
7125 needs encoding in the cost tables. */
7126
7127 mode = GET_MODE (op0);
7128 /* ANDS. */
7129 if (GET_CODE (op0) == AND)
7130 {
7131 x = op0;
7132 goto cost_logic;
7133 }
7134
7135 if (GET_CODE (op0) == PLUS)
7136 {
7137 /* ADDS (and CMN alias). */
7138 x = op0;
7139 goto cost_plus;
7140 }
7141
7142 if (GET_CODE (op0) == MINUS)
7143 {
7144 /* SUBS. */
7145 x = op0;
7146 goto cost_minus;
7147 }
7148
7149 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7150 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7151 && CONST_INT_P (XEXP (op0, 2)))
7152 {
7153 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7154 Handle it here directly rather than going to cost_logic
7155 since we know the immediate generated for the TST is valid
7156 so we can avoid creating an intermediate rtx for it only
7157 for costing purposes. */
7158 if (speed)
7159 *cost += extra_cost->alu.logical;
7160
7161 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7162 ZERO_EXTRACT, 0, speed);
7163 return true;
7164 }
7165
7166 if (GET_CODE (op1) == NEG)
7167 {
7168 /* CMN. */
7169 if (speed)
7170 *cost += extra_cost->alu.arith;
7171
7172 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7173 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7174 return true;
7175 }
7176
7177 /* CMP.
7178
7179 Compare can freely swap the order of operands, and
7180 canonicalization puts the more complex operation first.
7181 But the integer MINUS logic expects the shift/extend
7182 operation in op1. */
7183 if (! (REG_P (op0)
7184 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7185 {
7186 op0 = XEXP (x, 1);
7187 op1 = XEXP (x, 0);
7188 }
7189 goto cost_minus;
7190 }
7191
7192 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7193 {
7194 /* FCMP. */
7195 if (speed)
7196 *cost += extra_cost->fp[mode == DFmode].compare;
7197
7198 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7199 {
7200 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7201 /* FCMP supports constant 0.0 for no extra cost. */
7202 return true;
7203 }
7204 return false;
7205 }
7206
7207 if (VECTOR_MODE_P (mode))
7208 {
7209 /* Vector compare. */
7210 if (speed)
7211 *cost += extra_cost->vect.alu;
7212
7213 if (aarch64_float_const_zero_rtx_p (op1))
7214 {
7215 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7216 cost. */
7217 return true;
7218 }
7219 return false;
7220 }
7221 return false;
7222
7223 case MINUS:
7224 {
7225 op0 = XEXP (x, 0);
7226 op1 = XEXP (x, 1);
7227
7228 cost_minus:
7229 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7230
7231 /* Detect valid immediates. */
7232 if ((GET_MODE_CLASS (mode) == MODE_INT
7233 || (GET_MODE_CLASS (mode) == MODE_CC
7234 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7235 && CONST_INT_P (op1)
7236 && aarch64_uimm12_shift (INTVAL (op1)))
7237 {
7238 if (speed)
7239 /* SUB(S) (immediate). */
7240 *cost += extra_cost->alu.arith;
7241 return true;
7242 }
7243
7244 /* Look for SUB (extended register). */
7245 if (aarch64_rtx_arith_op_extract_p (op1, mode))
7246 {
7247 if (speed)
7248 *cost += extra_cost->alu.extend_arith;
7249
7250 op1 = aarch64_strip_extend (op1, true);
7251 *cost += rtx_cost (op1, VOIDmode,
7252 (enum rtx_code) GET_CODE (op1), 0, speed);
7253 return true;
7254 }
7255
7256 rtx new_op1 = aarch64_strip_extend (op1, false);
7257
7258 /* Cost this as an FMA-alike operation. */
7259 if ((GET_CODE (new_op1) == MULT
7260 || aarch64_shift_p (GET_CODE (new_op1)))
7261 && code != COMPARE)
7262 {
7263 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7264 (enum rtx_code) code,
7265 speed);
7266 return true;
7267 }
7268
7269 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7270
7271 if (speed)
7272 {
7273 if (VECTOR_MODE_P (mode))
7274 {
7275 /* Vector SUB. */
7276 *cost += extra_cost->vect.alu;
7277 }
7278 else if (GET_MODE_CLASS (mode) == MODE_INT)
7279 {
7280 /* SUB(S). */
7281 *cost += extra_cost->alu.arith;
7282 }
7283 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7284 {
7285 /* FSUB. */
7286 *cost += extra_cost->fp[mode == DFmode].addsub;
7287 }
7288 }
7289 return true;
7290 }
7291
7292 case PLUS:
7293 {
7294 rtx new_op0;
7295
7296 op0 = XEXP (x, 0);
7297 op1 = XEXP (x, 1);
7298
7299 cost_plus:
7300 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7301 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7302 {
7303 /* CSINC. */
7304 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7305 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7306 return true;
7307 }
7308
7309 if (GET_MODE_CLASS (mode) == MODE_INT
7310 && CONST_INT_P (op1)
7311 && aarch64_uimm12_shift (INTVAL (op1)))
7312 {
7313 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7314
7315 if (speed)
7316 /* ADD (immediate). */
7317 *cost += extra_cost->alu.arith;
7318 return true;
7319 }
7320
7321 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7322
7323 /* Look for ADD (extended register). */
7324 if (aarch64_rtx_arith_op_extract_p (op0, mode))
7325 {
7326 if (speed)
7327 *cost += extra_cost->alu.extend_arith;
7328
7329 op0 = aarch64_strip_extend (op0, true);
7330 *cost += rtx_cost (op0, VOIDmode,
7331 (enum rtx_code) GET_CODE (op0), 0, speed);
7332 return true;
7333 }
7334
7335 /* Strip any extend, leave shifts behind as we will
7336 cost them through mult_cost. */
7337 new_op0 = aarch64_strip_extend (op0, false);
7338
7339 if (GET_CODE (new_op0) == MULT
7340 || aarch64_shift_p (GET_CODE (new_op0)))
7341 {
7342 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7343 speed);
7344 return true;
7345 }
7346
7347 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7348
7349 if (speed)
7350 {
7351 if (VECTOR_MODE_P (mode))
7352 {
7353 /* Vector ADD. */
7354 *cost += extra_cost->vect.alu;
7355 }
7356 else if (GET_MODE_CLASS (mode) == MODE_INT)
7357 {
7358 /* ADD. */
7359 *cost += extra_cost->alu.arith;
7360 }
7361 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7362 {
7363 /* FADD. */
7364 *cost += extra_cost->fp[mode == DFmode].addsub;
7365 }
7366 }
7367 return true;
7368 }
7369
7370 case BSWAP:
7371 *cost = COSTS_N_INSNS (1);
7372
7373 if (speed)
7374 {
7375 if (VECTOR_MODE_P (mode))
7376 *cost += extra_cost->vect.alu;
7377 else
7378 *cost += extra_cost->alu.rev;
7379 }
7380 return false;
7381
7382 case IOR:
7383 if (aarch_rev16_p (x))
7384 {
7385 *cost = COSTS_N_INSNS (1);
7386
7387 if (speed)
7388 {
7389 if (VECTOR_MODE_P (mode))
7390 *cost += extra_cost->vect.alu;
7391 else
7392 *cost += extra_cost->alu.rev;
7393 }
7394 return true;
7395 }
7396
7397 if (aarch64_extr_rtx_p (x, &op0, &op1))
7398 {
7399 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7400 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7401 if (speed)
7402 *cost += extra_cost->alu.shift;
7403
7404 return true;
7405 }
7406 /* Fall through. */
7407 case XOR:
7408 case AND:
7409 cost_logic:
7410 op0 = XEXP (x, 0);
7411 op1 = XEXP (x, 1);
7412
7413 if (VECTOR_MODE_P (mode))
7414 {
7415 if (speed)
7416 *cost += extra_cost->vect.alu;
7417 return true;
7418 }
7419
7420 if (code == AND
7421 && GET_CODE (op0) == MULT
7422 && CONST_INT_P (XEXP (op0, 1))
7423 && CONST_INT_P (op1)
7424 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7425 INTVAL (op1)) != 0)
7426 {
7427 /* This is a UBFM/SBFM. */
7428 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7429 if (speed)
7430 *cost += extra_cost->alu.bfx;
7431 return true;
7432 }
7433
7434 if (is_int_mode (mode, &int_mode))
7435 {
7436 if (CONST_INT_P (op1))
7437 {
7438 /* We have a mask + shift version of a UBFIZ
7439 i.e. the *andim_ashift<mode>_bfiz pattern. */
7440 if (GET_CODE (op0) == ASHIFT
7441 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7442 XEXP (op0, 1)))
7443 {
7444 *cost += rtx_cost (XEXP (op0, 0), int_mode,
7445 (enum rtx_code) code, 0, speed);
7446 if (speed)
7447 *cost += extra_cost->alu.bfx;
7448
7449 return true;
7450 }
7451 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7452 {
7453 /* We possibly get the immediate for free, this is not
7454 modelled. */
7455 *cost += rtx_cost (op0, int_mode,
7456 (enum rtx_code) code, 0, speed);
7457 if (speed)
7458 *cost += extra_cost->alu.logical;
7459
7460 return true;
7461 }
7462 }
7463 else
7464 {
7465 rtx new_op0 = op0;
7466
7467 /* Handle ORN, EON, or BIC. */
7468 if (GET_CODE (op0) == NOT)
7469 op0 = XEXP (op0, 0);
7470
7471 new_op0 = aarch64_strip_shift (op0);
7472
7473 /* If we had a shift on op0 then this is a logical-shift-
7474 by-register/immediate operation. Otherwise, this is just
7475 a logical operation. */
7476 if (speed)
7477 {
7478 if (new_op0 != op0)
7479 {
7480 /* Shift by immediate. */
7481 if (CONST_INT_P (XEXP (op0, 1)))
7482 *cost += extra_cost->alu.log_shift;
7483 else
7484 *cost += extra_cost->alu.log_shift_reg;
7485 }
7486 else
7487 *cost += extra_cost->alu.logical;
7488 }
7489
7490 /* In both cases we want to cost both operands. */
7491 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7492 0, speed);
7493 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7494 1, speed);
7495
7496 return true;
7497 }
7498 }
7499 return false;
7500
7501 case NOT:
7502 x = XEXP (x, 0);
7503 op0 = aarch64_strip_shift (x);
7504
7505 if (VECTOR_MODE_P (mode))
7506 {
7507 /* Vector NOT. */
7508 *cost += extra_cost->vect.alu;
7509 return false;
7510 }
7511
7512 /* MVN-shifted-reg. */
7513 if (op0 != x)
7514 {
7515 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7516
7517 if (speed)
7518 *cost += extra_cost->alu.log_shift;
7519
7520 return true;
7521 }
7522 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7523 Handle the second form here taking care that 'a' in the above can
7524 be a shift. */
7525 else if (GET_CODE (op0) == XOR)
7526 {
7527 rtx newop0 = XEXP (op0, 0);
7528 rtx newop1 = XEXP (op0, 1);
7529 rtx op0_stripped = aarch64_strip_shift (newop0);
7530
7531 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7532 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7533
7534 if (speed)
7535 {
7536 if (op0_stripped != newop0)
7537 *cost += extra_cost->alu.log_shift;
7538 else
7539 *cost += extra_cost->alu.logical;
7540 }
7541
7542 return true;
7543 }
7544 /* MVN. */
7545 if (speed)
7546 *cost += extra_cost->alu.logical;
7547
7548 return false;
7549
7550 case ZERO_EXTEND:
7551
7552 op0 = XEXP (x, 0);
7553 /* If a value is written in SI mode, then zero extended to DI
7554 mode, the operation will in general be free as a write to
7555 a 'w' register implicitly zeroes the upper bits of an 'x'
7556 register. However, if this is
7557
7558 (set (reg) (zero_extend (reg)))
7559
7560 we must cost the explicit register move. */
7561 if (mode == DImode
7562 && GET_MODE (op0) == SImode
7563 && outer == SET)
7564 {
7565 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7566
7567 /* If OP_COST is non-zero, then the cost of the zero extend
7568 is effectively the cost of the inner operation. Otherwise
7569 we have a MOV instruction and we take the cost from the MOV
7570 itself. This is true independently of whether we are
7571 optimizing for space or time. */
7572 if (op_cost)
7573 *cost = op_cost;
7574
7575 return true;
7576 }
7577 else if (MEM_P (op0))
7578 {
7579 /* All loads can zero extend to any size for free. */
7580 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7581 return true;
7582 }
7583
7584 op0 = aarch64_extend_bitfield_pattern_p (x);
7585 if (op0)
7586 {
7587 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7588 if (speed)
7589 *cost += extra_cost->alu.bfx;
7590 return true;
7591 }
7592
7593 if (speed)
7594 {
7595 if (VECTOR_MODE_P (mode))
7596 {
7597 /* UMOV. */
7598 *cost += extra_cost->vect.alu;
7599 }
7600 else
7601 {
7602 /* We generate an AND instead of UXTB/UXTH. */
7603 *cost += extra_cost->alu.logical;
7604 }
7605 }
7606 return false;
7607
7608 case SIGN_EXTEND:
7609 if (MEM_P (XEXP (x, 0)))
7610 {
7611 /* LDRSH. */
7612 if (speed)
7613 {
7614 rtx address = XEXP (XEXP (x, 0), 0);
7615 *cost += extra_cost->ldst.load_sign_extend;
7616
7617 *cost +=
7618 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7619 0, speed));
7620 }
7621 return true;
7622 }
7623
7624 op0 = aarch64_extend_bitfield_pattern_p (x);
7625 if (op0)
7626 {
7627 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7628 if (speed)
7629 *cost += extra_cost->alu.bfx;
7630 return true;
7631 }
7632
7633 if (speed)
7634 {
7635 if (VECTOR_MODE_P (mode))
7636 *cost += extra_cost->vect.alu;
7637 else
7638 *cost += extra_cost->alu.extend;
7639 }
7640 return false;
7641
7642 case ASHIFT:
7643 op0 = XEXP (x, 0);
7644 op1 = XEXP (x, 1);
7645
7646 if (CONST_INT_P (op1))
7647 {
7648 if (speed)
7649 {
7650 if (VECTOR_MODE_P (mode))
7651 {
7652 /* Vector shift (immediate). */
7653 *cost += extra_cost->vect.alu;
7654 }
7655 else
7656 {
7657 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
7658 aliases. */
7659 *cost += extra_cost->alu.shift;
7660 }
7661 }
7662
7663 /* We can incorporate zero/sign extend for free. */
7664 if (GET_CODE (op0) == ZERO_EXTEND
7665 || GET_CODE (op0) == SIGN_EXTEND)
7666 op0 = XEXP (op0, 0);
7667
7668 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7669 return true;
7670 }
7671 else
7672 {
7673 if (VECTOR_MODE_P (mode))
7674 {
7675 if (speed)
7676 /* Vector shift (register). */
7677 *cost += extra_cost->vect.alu;
7678 }
7679 else
7680 {
7681 if (speed)
7682 /* LSLV. */
7683 *cost += extra_cost->alu.shift_reg;
7684
7685 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7686 && CONST_INT_P (XEXP (op1, 1))
7687 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7688 {
7689 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7690 /* We already demanded XEXP (op1, 0) to be REG_P, so
7691 don't recurse into it. */
7692 return true;
7693 }
7694 }
7695 return false; /* All arguments need to be in registers. */
7696 }
7697
7698 case ROTATE:
7699 case ROTATERT:
7700 case LSHIFTRT:
7701 case ASHIFTRT:
7702 op0 = XEXP (x, 0);
7703 op1 = XEXP (x, 1);
7704
7705 if (CONST_INT_P (op1))
7706 {
7707 /* ASR (immediate) and friends. */
7708 if (speed)
7709 {
7710 if (VECTOR_MODE_P (mode))
7711 *cost += extra_cost->vect.alu;
7712 else
7713 *cost += extra_cost->alu.shift;
7714 }
7715
7716 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7717 return true;
7718 }
7719 else
7720 {
7721 if (VECTOR_MODE_P (mode))
7722 {
7723 if (speed)
7724 /* Vector shift (register). */
7725 *cost += extra_cost->vect.alu;
7726 }
7727 else
7728 {
7729 if (speed)
7730 /* ASR (register) and friends. */
7731 *cost += extra_cost->alu.shift_reg;
7732
7733 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7734 && CONST_INT_P (XEXP (op1, 1))
7735 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7736 {
7737 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7738 /* We already demanded XEXP (op1, 0) to be REG_P, so
7739 don't recurse into it. */
7740 return true;
7741 }
7742 }
7743 return false; /* All arguments need to be in registers. */
7744 }
7745
7746 case SYMBOL_REF:
7747
7748 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7749 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7750 {
7751 /* LDR. */
7752 if (speed)
7753 *cost += extra_cost->ldst.load;
7754 }
7755 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7756 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7757 {
7758 /* ADRP, followed by ADD. */
7759 *cost += COSTS_N_INSNS (1);
7760 if (speed)
7761 *cost += 2 * extra_cost->alu.arith;
7762 }
7763 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7764 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7765 {
7766 /* ADR. */
7767 if (speed)
7768 *cost += extra_cost->alu.arith;
7769 }
7770
7771 if (flag_pic)
7772 {
7773 /* One extra load instruction, after accessing the GOT. */
7774 *cost += COSTS_N_INSNS (1);
7775 if (speed)
7776 *cost += extra_cost->ldst.load;
7777 }
7778 return true;
7779
7780 case HIGH:
7781 case LO_SUM:
7782 /* ADRP/ADD (immediate). */
7783 if (speed)
7784 *cost += extra_cost->alu.arith;
7785 return true;
7786
7787 case ZERO_EXTRACT:
7788 case SIGN_EXTRACT:
7789 /* UBFX/SBFX. */
7790 if (speed)
7791 {
7792 if (VECTOR_MODE_P (mode))
7793 *cost += extra_cost->vect.alu;
7794 else
7795 *cost += extra_cost->alu.bfx;
7796 }
7797
7798 /* We can trust that the immediates used will be correct (there
7799 are no by-register forms), so we need only cost op0. */
7800 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7801 return true;
7802
7803 case MULT:
7804 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7805 /* aarch64_rtx_mult_cost always handles recursion to its
7806 operands. */
7807 return true;
7808
7809 case MOD:
7810 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7811 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7812 an unconditional negate. This case should only ever be reached through
7813 the set_smod_pow2_cheap check in expmed.c. */
7814 if (CONST_INT_P (XEXP (x, 1))
7815 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7816 && (mode == SImode || mode == DImode))
7817 {
7818 /* We expand to 4 instructions. Reset the baseline. */
7819 *cost = COSTS_N_INSNS (4);
7820
7821 if (speed)
7822 *cost += 2 * extra_cost->alu.logical
7823 + 2 * extra_cost->alu.arith;
7824
7825 return true;
7826 }
7827
7828 /* Fall-through. */
7829 case UMOD:
7830 if (speed)
7831 {
7832 /* Slighly prefer UMOD over SMOD. */
7833 if (VECTOR_MODE_P (mode))
7834 *cost += extra_cost->vect.alu;
7835 else if (GET_MODE_CLASS (mode) == MODE_INT)
7836 *cost += (extra_cost->mult[mode == DImode].add
7837 + extra_cost->mult[mode == DImode].idiv
7838 + (code == MOD ? 1 : 0));
7839 }
7840 return false; /* All arguments need to be in registers. */
7841
7842 case DIV:
7843 case UDIV:
7844 case SQRT:
7845 if (speed)
7846 {
7847 if (VECTOR_MODE_P (mode))
7848 *cost += extra_cost->vect.alu;
7849 else if (GET_MODE_CLASS (mode) == MODE_INT)
7850 /* There is no integer SQRT, so only DIV and UDIV can get
7851 here. */
7852 *cost += (extra_cost->mult[mode == DImode].idiv
7853 /* Slighly prefer UDIV over SDIV. */
7854 + (code == DIV ? 1 : 0));
7855 else
7856 *cost += extra_cost->fp[mode == DFmode].div;
7857 }
7858 return false; /* All arguments need to be in registers. */
7859
7860 case IF_THEN_ELSE:
7861 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7862 XEXP (x, 2), cost, speed);
7863
7864 case EQ:
7865 case NE:
7866 case GT:
7867 case GTU:
7868 case LT:
7869 case LTU:
7870 case GE:
7871 case GEU:
7872 case LE:
7873 case LEU:
7874
7875 return false; /* All arguments must be in registers. */
7876
7877 case FMA:
7878 op0 = XEXP (x, 0);
7879 op1 = XEXP (x, 1);
7880 op2 = XEXP (x, 2);
7881
7882 if (speed)
7883 {
7884 if (VECTOR_MODE_P (mode))
7885 *cost += extra_cost->vect.alu;
7886 else
7887 *cost += extra_cost->fp[mode == DFmode].fma;
7888 }
7889
7890 /* FMSUB, FNMADD, and FNMSUB are free. */
7891 if (GET_CODE (op0) == NEG)
7892 op0 = XEXP (op0, 0);
7893
7894 if (GET_CODE (op2) == NEG)
7895 op2 = XEXP (op2, 0);
7896
7897 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7898 and the by-element operand as operand 0. */
7899 if (GET_CODE (op1) == NEG)
7900 op1 = XEXP (op1, 0);
7901
7902 /* Catch vector-by-element operations. The by-element operand can
7903 either be (vec_duplicate (vec_select (x))) or just
7904 (vec_select (x)), depending on whether we are multiplying by
7905 a vector or a scalar.
7906
7907 Canonicalization is not very good in these cases, FMA4 will put the
7908 by-element operand as operand 0, FNMA4 will have it as operand 1. */
7909 if (GET_CODE (op0) == VEC_DUPLICATE)
7910 op0 = XEXP (op0, 0);
7911 else if (GET_CODE (op1) == VEC_DUPLICATE)
7912 op1 = XEXP (op1, 0);
7913
7914 if (GET_CODE (op0) == VEC_SELECT)
7915 op0 = XEXP (op0, 0);
7916 else if (GET_CODE (op1) == VEC_SELECT)
7917 op1 = XEXP (op1, 0);
7918
7919 /* If the remaining parameters are not registers,
7920 get the cost to put them into registers. */
7921 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7922 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7923 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7924 return true;
7925
7926 case FLOAT:
7927 case UNSIGNED_FLOAT:
7928 if (speed)
7929 *cost += extra_cost->fp[mode == DFmode].fromint;
7930 return false;
7931
7932 case FLOAT_EXTEND:
7933 if (speed)
7934 {
7935 if (VECTOR_MODE_P (mode))
7936 {
7937 /*Vector truncate. */
7938 *cost += extra_cost->vect.alu;
7939 }
7940 else
7941 *cost += extra_cost->fp[mode == DFmode].widen;
7942 }
7943 return false;
7944
7945 case FLOAT_TRUNCATE:
7946 if (speed)
7947 {
7948 if (VECTOR_MODE_P (mode))
7949 {
7950 /*Vector conversion. */
7951 *cost += extra_cost->vect.alu;
7952 }
7953 else
7954 *cost += extra_cost->fp[mode == DFmode].narrow;
7955 }
7956 return false;
7957
7958 case FIX:
7959 case UNSIGNED_FIX:
7960 x = XEXP (x, 0);
7961 /* Strip the rounding part. They will all be implemented
7962 by the fcvt* family of instructions anyway. */
7963 if (GET_CODE (x) == UNSPEC)
7964 {
7965 unsigned int uns_code = XINT (x, 1);
7966
7967 if (uns_code == UNSPEC_FRINTA
7968 || uns_code == UNSPEC_FRINTM
7969 || uns_code == UNSPEC_FRINTN
7970 || uns_code == UNSPEC_FRINTP
7971 || uns_code == UNSPEC_FRINTZ)
7972 x = XVECEXP (x, 0, 0);
7973 }
7974
7975 if (speed)
7976 {
7977 if (VECTOR_MODE_P (mode))
7978 *cost += extra_cost->vect.alu;
7979 else
7980 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7981 }
7982
7983 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7984 fixed-point fcvt. */
7985 if (GET_CODE (x) == MULT
7986 && ((VECTOR_MODE_P (mode)
7987 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7988 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7989 {
7990 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7991 0, speed);
7992 return true;
7993 }
7994
7995 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7996 return true;
7997
7998 case ABS:
7999 if (VECTOR_MODE_P (mode))
8000 {
8001 /* ABS (vector). */
8002 if (speed)
8003 *cost += extra_cost->vect.alu;
8004 }
8005 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8006 {
8007 op0 = XEXP (x, 0);
8008
8009 /* FABD, which is analogous to FADD. */
8010 if (GET_CODE (op0) == MINUS)
8011 {
8012 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8013 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8014 if (speed)
8015 *cost += extra_cost->fp[mode == DFmode].addsub;
8016
8017 return true;
8018 }
8019 /* Simple FABS is analogous to FNEG. */
8020 if (speed)
8021 *cost += extra_cost->fp[mode == DFmode].neg;
8022 }
8023 else
8024 {
8025 /* Integer ABS will either be split to
8026 two arithmetic instructions, or will be an ABS
8027 (scalar), which we don't model. */
8028 *cost = COSTS_N_INSNS (2);
8029 if (speed)
8030 *cost += 2 * extra_cost->alu.arith;
8031 }
8032 return false;
8033
8034 case SMAX:
8035 case SMIN:
8036 if (speed)
8037 {
8038 if (VECTOR_MODE_P (mode))
8039 *cost += extra_cost->vect.alu;
8040 else
8041 {
8042 /* FMAXNM/FMINNM/FMAX/FMIN.
8043 TODO: This may not be accurate for all implementations, but
8044 we do not model this in the cost tables. */
8045 *cost += extra_cost->fp[mode == DFmode].addsub;
8046 }
8047 }
8048 return false;
8049
8050 case UNSPEC:
8051 /* The floating point round to integer frint* instructions. */
8052 if (aarch64_frint_unspec_p (XINT (x, 1)))
8053 {
8054 if (speed)
8055 *cost += extra_cost->fp[mode == DFmode].roundint;
8056
8057 return false;
8058 }
8059
8060 if (XINT (x, 1) == UNSPEC_RBIT)
8061 {
8062 if (speed)
8063 *cost += extra_cost->alu.rev;
8064
8065 return false;
8066 }
8067 break;
8068
8069 case TRUNCATE:
8070
8071 /* Decompose <su>muldi3_highpart. */
8072 if (/* (truncate:DI */
8073 mode == DImode
8074 /* (lshiftrt:TI */
8075 && GET_MODE (XEXP (x, 0)) == TImode
8076 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8077 /* (mult:TI */
8078 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8079 /* (ANY_EXTEND:TI (reg:DI))
8080 (ANY_EXTEND:TI (reg:DI))) */
8081 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8082 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8083 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8084 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8085 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8086 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8087 /* (const_int 64) */
8088 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8089 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8090 {
8091 /* UMULH/SMULH. */
8092 if (speed)
8093 *cost += extra_cost->mult[mode == DImode].extend;
8094 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8095 mode, MULT, 0, speed);
8096 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8097 mode, MULT, 1, speed);
8098 return true;
8099 }
8100
8101 /* Fall through. */
8102 default:
8103 break;
8104 }
8105
8106 if (dump_file
8107 && flag_aarch64_verbose_cost)
8108 fprintf (dump_file,
8109 "\nFailed to cost RTX. Assuming default cost.\n");
8110
8111 return true;
8112 }
8113
8114 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8115 calculated for X. This cost is stored in *COST. Returns true
8116 if the total cost of X was calculated. */
8117 static bool
8118 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8119 int param, int *cost, bool speed)
8120 {
8121 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8122
8123 if (dump_file
8124 && flag_aarch64_verbose_cost)
8125 {
8126 print_rtl_single (dump_file, x);
8127 fprintf (dump_file, "\n%s cost: %d (%s)\n",
8128 speed ? "Hot" : "Cold",
8129 *cost, result ? "final" : "partial");
8130 }
8131
8132 return result;
8133 }
8134
8135 static int
8136 aarch64_register_move_cost (machine_mode mode,
8137 reg_class_t from_i, reg_class_t to_i)
8138 {
8139 enum reg_class from = (enum reg_class) from_i;
8140 enum reg_class to = (enum reg_class) to_i;
8141 const struct cpu_regmove_cost *regmove_cost
8142 = aarch64_tune_params.regmove_cost;
8143
8144 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8145 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8146 to = GENERAL_REGS;
8147
8148 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8149 from = GENERAL_REGS;
8150
8151 /* Moving between GPR and stack cost is the same as GP2GP. */
8152 if ((from == GENERAL_REGS && to == STACK_REG)
8153 || (to == GENERAL_REGS && from == STACK_REG))
8154 return regmove_cost->GP2GP;
8155
8156 /* To/From the stack register, we move via the gprs. */
8157 if (to == STACK_REG || from == STACK_REG)
8158 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8159 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8160
8161 if (GET_MODE_SIZE (mode) == 16)
8162 {
8163 /* 128-bit operations on general registers require 2 instructions. */
8164 if (from == GENERAL_REGS && to == GENERAL_REGS)
8165 return regmove_cost->GP2GP * 2;
8166 else if (from == GENERAL_REGS)
8167 return regmove_cost->GP2FP * 2;
8168 else if (to == GENERAL_REGS)
8169 return regmove_cost->FP2GP * 2;
8170
8171 /* When AdvSIMD instructions are disabled it is not possible to move
8172 a 128-bit value directly between Q registers. This is handled in
8173 secondary reload. A general register is used as a scratch to move
8174 the upper DI value and the lower DI value is moved directly,
8175 hence the cost is the sum of three moves. */
8176 if (! TARGET_SIMD)
8177 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8178
8179 return regmove_cost->FP2FP;
8180 }
8181
8182 if (from == GENERAL_REGS && to == GENERAL_REGS)
8183 return regmove_cost->GP2GP;
8184 else if (from == GENERAL_REGS)
8185 return regmove_cost->GP2FP;
8186 else if (to == GENERAL_REGS)
8187 return regmove_cost->FP2GP;
8188
8189 return regmove_cost->FP2FP;
8190 }
8191
8192 static int
8193 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8194 reg_class_t rclass ATTRIBUTE_UNUSED,
8195 bool in ATTRIBUTE_UNUSED)
8196 {
8197 return aarch64_tune_params.memmov_cost;
8198 }
8199
8200 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8201 to optimize 1.0/sqrt. */
8202
8203 static bool
8204 use_rsqrt_p (machine_mode mode)
8205 {
8206 return (!flag_trapping_math
8207 && flag_unsafe_math_optimizations
8208 && ((aarch64_tune_params.approx_modes->recip_sqrt
8209 & AARCH64_APPROX_MODE (mode))
8210 || flag_mrecip_low_precision_sqrt));
8211 }
8212
8213 /* Function to decide when to use the approximate reciprocal square root
8214 builtin. */
8215
8216 static tree
8217 aarch64_builtin_reciprocal (tree fndecl)
8218 {
8219 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8220
8221 if (!use_rsqrt_p (mode))
8222 return NULL_TREE;
8223 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8224 }
8225
8226 typedef rtx (*rsqrte_type) (rtx, rtx);
8227
8228 /* Select reciprocal square root initial estimate insn depending on machine
8229 mode. */
8230
8231 static rsqrte_type
8232 get_rsqrte_type (machine_mode mode)
8233 {
8234 switch (mode)
8235 {
8236 case E_DFmode: return gen_aarch64_rsqrtedf;
8237 case E_SFmode: return gen_aarch64_rsqrtesf;
8238 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8239 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8240 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8241 default: gcc_unreachable ();
8242 }
8243 }
8244
8245 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8246
8247 /* Select reciprocal square root series step insn depending on machine mode. */
8248
8249 static rsqrts_type
8250 get_rsqrts_type (machine_mode mode)
8251 {
8252 switch (mode)
8253 {
8254 case E_DFmode: return gen_aarch64_rsqrtsdf;
8255 case E_SFmode: return gen_aarch64_rsqrtssf;
8256 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8257 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8258 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8259 default: gcc_unreachable ();
8260 }
8261 }
8262
8263 /* Emit instruction sequence to compute either the approximate square root
8264 or its approximate reciprocal, depending on the flag RECP, and return
8265 whether the sequence was emitted or not. */
8266
8267 bool
8268 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8269 {
8270 machine_mode mode = GET_MODE (dst);
8271
8272 if (GET_MODE_INNER (mode) == HFmode)
8273 {
8274 gcc_assert (!recp);
8275 return false;
8276 }
8277
8278 machine_mode mmsk
8279 = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)).require (),
8280 GET_MODE_NUNITS (mode));
8281 if (!recp)
8282 {
8283 if (!(flag_mlow_precision_sqrt
8284 || (aarch64_tune_params.approx_modes->sqrt
8285 & AARCH64_APPROX_MODE (mode))))
8286 return false;
8287
8288 if (flag_finite_math_only
8289 || flag_trapping_math
8290 || !flag_unsafe_math_optimizations
8291 || optimize_function_for_size_p (cfun))
8292 return false;
8293 }
8294 else
8295 /* Caller assumes we cannot fail. */
8296 gcc_assert (use_rsqrt_p (mode));
8297
8298
8299 rtx xmsk = gen_reg_rtx (mmsk);
8300 if (!recp)
8301 /* When calculating the approximate square root, compare the
8302 argument with 0.0 and create a mask. */
8303 emit_insn (gen_rtx_SET (xmsk,
8304 gen_rtx_NEG (mmsk,
8305 gen_rtx_EQ (mmsk, src,
8306 CONST0_RTX (mode)))));
8307
8308 /* Estimate the approximate reciprocal square root. */
8309 rtx xdst = gen_reg_rtx (mode);
8310 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8311
8312 /* Iterate over the series twice for SF and thrice for DF. */
8313 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8314
8315 /* Optionally iterate over the series once less for faster performance
8316 while sacrificing the accuracy. */
8317 if ((recp && flag_mrecip_low_precision_sqrt)
8318 || (!recp && flag_mlow_precision_sqrt))
8319 iterations--;
8320
8321 /* Iterate over the series to calculate the approximate reciprocal square
8322 root. */
8323 rtx x1 = gen_reg_rtx (mode);
8324 while (iterations--)
8325 {
8326 rtx x2 = gen_reg_rtx (mode);
8327 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8328
8329 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8330
8331 if (iterations > 0)
8332 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8333 }
8334
8335 if (!recp)
8336 {
8337 /* Qualify the approximate reciprocal square root when the argument is
8338 0.0 by squashing the intermediary result to 0.0. */
8339 rtx xtmp = gen_reg_rtx (mmsk);
8340 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8341 gen_rtx_SUBREG (mmsk, xdst, 0)));
8342 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8343
8344 /* Calculate the approximate square root. */
8345 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8346 }
8347
8348 /* Finalize the approximation. */
8349 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8350
8351 return true;
8352 }
8353
8354 typedef rtx (*recpe_type) (rtx, rtx);
8355
8356 /* Select reciprocal initial estimate insn depending on machine mode. */
8357
8358 static recpe_type
8359 get_recpe_type (machine_mode mode)
8360 {
8361 switch (mode)
8362 {
8363 case E_SFmode: return (gen_aarch64_frecpesf);
8364 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8365 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8366 case E_DFmode: return (gen_aarch64_frecpedf);
8367 case E_V2DFmode: return (gen_aarch64_frecpev2df);
8368 default: gcc_unreachable ();
8369 }
8370 }
8371
8372 typedef rtx (*recps_type) (rtx, rtx, rtx);
8373
8374 /* Select reciprocal series step insn depending on machine mode. */
8375
8376 static recps_type
8377 get_recps_type (machine_mode mode)
8378 {
8379 switch (mode)
8380 {
8381 case E_SFmode: return (gen_aarch64_frecpssf);
8382 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8383 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8384 case E_DFmode: return (gen_aarch64_frecpsdf);
8385 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8386 default: gcc_unreachable ();
8387 }
8388 }
8389
8390 /* Emit the instruction sequence to compute the approximation for the division
8391 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8392
8393 bool
8394 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8395 {
8396 machine_mode mode = GET_MODE (quo);
8397
8398 if (GET_MODE_INNER (mode) == HFmode)
8399 return false;
8400
8401 bool use_approx_division_p = (flag_mlow_precision_div
8402 || (aarch64_tune_params.approx_modes->division
8403 & AARCH64_APPROX_MODE (mode)));
8404
8405 if (!flag_finite_math_only
8406 || flag_trapping_math
8407 || !flag_unsafe_math_optimizations
8408 || optimize_function_for_size_p (cfun)
8409 || !use_approx_division_p)
8410 return false;
8411
8412 /* Estimate the approximate reciprocal. */
8413 rtx xrcp = gen_reg_rtx (mode);
8414 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8415
8416 /* Iterate over the series twice for SF and thrice for DF. */
8417 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8418
8419 /* Optionally iterate over the series once less for faster performance,
8420 while sacrificing the accuracy. */
8421 if (flag_mlow_precision_div)
8422 iterations--;
8423
8424 /* Iterate over the series to calculate the approximate reciprocal. */
8425 rtx xtmp = gen_reg_rtx (mode);
8426 while (iterations--)
8427 {
8428 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8429
8430 if (iterations > 0)
8431 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8432 }
8433
8434 if (num != CONST1_RTX (mode))
8435 {
8436 /* As the approximate reciprocal of DEN is already calculated, only
8437 calculate the approximate division when NUM is not 1.0. */
8438 rtx xnum = force_reg (mode, num);
8439 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8440 }
8441
8442 /* Finalize the approximation. */
8443 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8444 return true;
8445 }
8446
8447 /* Return the number of instructions that can be issued per cycle. */
8448 static int
8449 aarch64_sched_issue_rate (void)
8450 {
8451 return aarch64_tune_params.issue_rate;
8452 }
8453
8454 static int
8455 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8456 {
8457 int issue_rate = aarch64_sched_issue_rate ();
8458
8459 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8460 }
8461
8462
8463 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8464 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8465 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8466
8467 static int
8468 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8469 int ready_index)
8470 {
8471 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8472 }
8473
8474
8475 /* Vectorizer cost model target hooks. */
8476
8477 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8478 static int
8479 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8480 tree vectype,
8481 int misalign ATTRIBUTE_UNUSED)
8482 {
8483 unsigned elements;
8484 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8485 bool fp = false;
8486
8487 if (vectype != NULL)
8488 fp = FLOAT_TYPE_P (vectype);
8489
8490 switch (type_of_cost)
8491 {
8492 case scalar_stmt:
8493 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8494
8495 case scalar_load:
8496 return costs->scalar_load_cost;
8497
8498 case scalar_store:
8499 return costs->scalar_store_cost;
8500
8501 case vector_stmt:
8502 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8503
8504 case vector_load:
8505 return costs->vec_align_load_cost;
8506
8507 case vector_store:
8508 return costs->vec_store_cost;
8509
8510 case vec_to_scalar:
8511 return costs->vec_to_scalar_cost;
8512
8513 case scalar_to_vec:
8514 return costs->scalar_to_vec_cost;
8515
8516 case unaligned_load:
8517 return costs->vec_unalign_load_cost;
8518
8519 case unaligned_store:
8520 return costs->vec_unalign_store_cost;
8521
8522 case cond_branch_taken:
8523 return costs->cond_taken_branch_cost;
8524
8525 case cond_branch_not_taken:
8526 return costs->cond_not_taken_branch_cost;
8527
8528 case vec_perm:
8529 return costs->vec_permute_cost;
8530
8531 case vec_promote_demote:
8532 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8533
8534 case vec_construct:
8535 elements = TYPE_VECTOR_SUBPARTS (vectype);
8536 return elements / 2 + 1;
8537
8538 default:
8539 gcc_unreachable ();
8540 }
8541 }
8542
8543 /* Implement targetm.vectorize.add_stmt_cost. */
8544 static unsigned
8545 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8546 struct _stmt_vec_info *stmt_info, int misalign,
8547 enum vect_cost_model_location where)
8548 {
8549 unsigned *cost = (unsigned *) data;
8550 unsigned retval = 0;
8551
8552 if (flag_vect_cost_model)
8553 {
8554 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8555 int stmt_cost =
8556 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8557
8558 /* Statements in an inner loop relative to the loop being
8559 vectorized are weighted more heavily. The value here is
8560 arbitrary and could potentially be improved with analysis. */
8561 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8562 count *= 50; /* FIXME */
8563
8564 retval = (unsigned) (count * stmt_cost);
8565 cost[where] += retval;
8566 }
8567
8568 return retval;
8569 }
8570
8571 static void initialize_aarch64_code_model (struct gcc_options *);
8572
8573 /* Parse the TO_PARSE string and put the architecture struct that it
8574 selects into RES and the architectural features into ISA_FLAGS.
8575 Return an aarch64_parse_opt_result describing the parse result.
8576 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8577
8578 static enum aarch64_parse_opt_result
8579 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8580 unsigned long *isa_flags)
8581 {
8582 char *ext;
8583 const struct processor *arch;
8584 char *str = (char *) alloca (strlen (to_parse) + 1);
8585 size_t len;
8586
8587 strcpy (str, to_parse);
8588
8589 ext = strchr (str, '+');
8590
8591 if (ext != NULL)
8592 len = ext - str;
8593 else
8594 len = strlen (str);
8595
8596 if (len == 0)
8597 return AARCH64_PARSE_MISSING_ARG;
8598
8599
8600 /* Loop through the list of supported ARCHes to find a match. */
8601 for (arch = all_architectures; arch->name != NULL; arch++)
8602 {
8603 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8604 {
8605 unsigned long isa_temp = arch->flags;
8606
8607 if (ext != NULL)
8608 {
8609 /* TO_PARSE string contains at least one extension. */
8610 enum aarch64_parse_opt_result ext_res
8611 = aarch64_parse_extension (ext, &isa_temp);
8612
8613 if (ext_res != AARCH64_PARSE_OK)
8614 return ext_res;
8615 }
8616 /* Extension parsing was successful. Confirm the result
8617 arch and ISA flags. */
8618 *res = arch;
8619 *isa_flags = isa_temp;
8620 return AARCH64_PARSE_OK;
8621 }
8622 }
8623
8624 /* ARCH name not found in list. */
8625 return AARCH64_PARSE_INVALID_ARG;
8626 }
8627
8628 /* Parse the TO_PARSE string and put the result tuning in RES and the
8629 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8630 describing the parse result. If there is an error parsing, RES and
8631 ISA_FLAGS are left unchanged. */
8632
8633 static enum aarch64_parse_opt_result
8634 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8635 unsigned long *isa_flags)
8636 {
8637 char *ext;
8638 const struct processor *cpu;
8639 char *str = (char *) alloca (strlen (to_parse) + 1);
8640 size_t len;
8641
8642 strcpy (str, to_parse);
8643
8644 ext = strchr (str, '+');
8645
8646 if (ext != NULL)
8647 len = ext - str;
8648 else
8649 len = strlen (str);
8650
8651 if (len == 0)
8652 return AARCH64_PARSE_MISSING_ARG;
8653
8654
8655 /* Loop through the list of supported CPUs to find a match. */
8656 for (cpu = all_cores; cpu->name != NULL; cpu++)
8657 {
8658 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8659 {
8660 unsigned long isa_temp = cpu->flags;
8661
8662
8663 if (ext != NULL)
8664 {
8665 /* TO_PARSE string contains at least one extension. */
8666 enum aarch64_parse_opt_result ext_res
8667 = aarch64_parse_extension (ext, &isa_temp);
8668
8669 if (ext_res != AARCH64_PARSE_OK)
8670 return ext_res;
8671 }
8672 /* Extension parsing was successfull. Confirm the result
8673 cpu and ISA flags. */
8674 *res = cpu;
8675 *isa_flags = isa_temp;
8676 return AARCH64_PARSE_OK;
8677 }
8678 }
8679
8680 /* CPU name not found in list. */
8681 return AARCH64_PARSE_INVALID_ARG;
8682 }
8683
8684 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8685 Return an aarch64_parse_opt_result describing the parse result.
8686 If the parsing fails the RES does not change. */
8687
8688 static enum aarch64_parse_opt_result
8689 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8690 {
8691 const struct processor *cpu;
8692 char *str = (char *) alloca (strlen (to_parse) + 1);
8693
8694 strcpy (str, to_parse);
8695
8696 /* Loop through the list of supported CPUs to find a match. */
8697 for (cpu = all_cores; cpu->name != NULL; cpu++)
8698 {
8699 if (strcmp (cpu->name, str) == 0)
8700 {
8701 *res = cpu;
8702 return AARCH64_PARSE_OK;
8703 }
8704 }
8705
8706 /* CPU name not found in list. */
8707 return AARCH64_PARSE_INVALID_ARG;
8708 }
8709
8710 /* Parse TOKEN, which has length LENGTH to see if it is an option
8711 described in FLAG. If it is, return the index bit for that fusion type.
8712 If not, error (printing OPTION_NAME) and return zero. */
8713
8714 static unsigned int
8715 aarch64_parse_one_option_token (const char *token,
8716 size_t length,
8717 const struct aarch64_flag_desc *flag,
8718 const char *option_name)
8719 {
8720 for (; flag->name != NULL; flag++)
8721 {
8722 if (length == strlen (flag->name)
8723 && !strncmp (flag->name, token, length))
8724 return flag->flag;
8725 }
8726
8727 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8728 return 0;
8729 }
8730
8731 /* Parse OPTION which is a comma-separated list of flags to enable.
8732 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8733 default state we inherit from the CPU tuning structures. OPTION_NAME
8734 gives the top-level option we are parsing in the -moverride string,
8735 for use in error messages. */
8736
8737 static unsigned int
8738 aarch64_parse_boolean_options (const char *option,
8739 const struct aarch64_flag_desc *flags,
8740 unsigned int initial_state,
8741 const char *option_name)
8742 {
8743 const char separator = '.';
8744 const char* specs = option;
8745 const char* ntoken = option;
8746 unsigned int found_flags = initial_state;
8747
8748 while ((ntoken = strchr (specs, separator)))
8749 {
8750 size_t token_length = ntoken - specs;
8751 unsigned token_ops = aarch64_parse_one_option_token (specs,
8752 token_length,
8753 flags,
8754 option_name);
8755 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8756 in the token stream, reset the supported operations. So:
8757
8758 adrp+add.cmp+branch.none.adrp+add
8759
8760 would have the result of turning on only adrp+add fusion. */
8761 if (!token_ops)
8762 found_flags = 0;
8763
8764 found_flags |= token_ops;
8765 specs = ++ntoken;
8766 }
8767
8768 /* We ended with a comma, print something. */
8769 if (!(*specs))
8770 {
8771 error ("%s string ill-formed\n", option_name);
8772 return 0;
8773 }
8774
8775 /* We still have one more token to parse. */
8776 size_t token_length = strlen (specs);
8777 unsigned token_ops = aarch64_parse_one_option_token (specs,
8778 token_length,
8779 flags,
8780 option_name);
8781 if (!token_ops)
8782 found_flags = 0;
8783
8784 found_flags |= token_ops;
8785 return found_flags;
8786 }
8787
8788 /* Support for overriding instruction fusion. */
8789
8790 static void
8791 aarch64_parse_fuse_string (const char *fuse_string,
8792 struct tune_params *tune)
8793 {
8794 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8795 aarch64_fusible_pairs,
8796 tune->fusible_ops,
8797 "fuse=");
8798 }
8799
8800 /* Support for overriding other tuning flags. */
8801
8802 static void
8803 aarch64_parse_tune_string (const char *tune_string,
8804 struct tune_params *tune)
8805 {
8806 tune->extra_tuning_flags
8807 = aarch64_parse_boolean_options (tune_string,
8808 aarch64_tuning_flags,
8809 tune->extra_tuning_flags,
8810 "tune=");
8811 }
8812
8813 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8814 we understand. If it is, extract the option string and handoff to
8815 the appropriate function. */
8816
8817 void
8818 aarch64_parse_one_override_token (const char* token,
8819 size_t length,
8820 struct tune_params *tune)
8821 {
8822 const struct aarch64_tuning_override_function *fn
8823 = aarch64_tuning_override_functions;
8824
8825 const char *option_part = strchr (token, '=');
8826 if (!option_part)
8827 {
8828 error ("tuning string missing in option (%s)", token);
8829 return;
8830 }
8831
8832 /* Get the length of the option name. */
8833 length = option_part - token;
8834 /* Skip the '=' to get to the option string. */
8835 option_part++;
8836
8837 for (; fn->name != NULL; fn++)
8838 {
8839 if (!strncmp (fn->name, token, length))
8840 {
8841 fn->parse_override (option_part, tune);
8842 return;
8843 }
8844 }
8845
8846 error ("unknown tuning option (%s)",token);
8847 return;
8848 }
8849
8850 /* A checking mechanism for the implementation of the tls size. */
8851
8852 static void
8853 initialize_aarch64_tls_size (struct gcc_options *opts)
8854 {
8855 if (aarch64_tls_size == 0)
8856 aarch64_tls_size = 24;
8857
8858 switch (opts->x_aarch64_cmodel_var)
8859 {
8860 case AARCH64_CMODEL_TINY:
8861 /* Both the default and maximum TLS size allowed under tiny is 1M which
8862 needs two instructions to address, so we clamp the size to 24. */
8863 if (aarch64_tls_size > 24)
8864 aarch64_tls_size = 24;
8865 break;
8866 case AARCH64_CMODEL_SMALL:
8867 /* The maximum TLS size allowed under small is 4G. */
8868 if (aarch64_tls_size > 32)
8869 aarch64_tls_size = 32;
8870 break;
8871 case AARCH64_CMODEL_LARGE:
8872 /* The maximum TLS size allowed under large is 16E.
8873 FIXME: 16E should be 64bit, we only support 48bit offset now. */
8874 if (aarch64_tls_size > 48)
8875 aarch64_tls_size = 48;
8876 break;
8877 default:
8878 gcc_unreachable ();
8879 }
8880
8881 return;
8882 }
8883
8884 /* Parse STRING looking for options in the format:
8885 string :: option:string
8886 option :: name=substring
8887 name :: {a-z}
8888 substring :: defined by option. */
8889
8890 static void
8891 aarch64_parse_override_string (const char* input_string,
8892 struct tune_params* tune)
8893 {
8894 const char separator = ':';
8895 size_t string_length = strlen (input_string) + 1;
8896 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8897 char *string = string_root;
8898 strncpy (string, input_string, string_length);
8899 string[string_length - 1] = '\0';
8900
8901 char* ntoken = string;
8902
8903 while ((ntoken = strchr (string, separator)))
8904 {
8905 size_t token_length = ntoken - string;
8906 /* Make this substring look like a string. */
8907 *ntoken = '\0';
8908 aarch64_parse_one_override_token (string, token_length, tune);
8909 string = ++ntoken;
8910 }
8911
8912 /* One last option to parse. */
8913 aarch64_parse_one_override_token (string, strlen (string), tune);
8914 free (string_root);
8915 }
8916
8917
8918 static void
8919 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8920 {
8921 /* The logic here is that if we are disabling all frame pointer generation
8922 then we do not need to disable leaf frame pointer generation as a
8923 separate operation. But if we are *only* disabling leaf frame pointer
8924 generation then we set flag_omit_frame_pointer to true, but in
8925 aarch64_frame_pointer_required we return false only for leaf functions.
8926
8927 PR 70044: We have to be careful about being called multiple times for the
8928 same function. Once we have decided to set flag_omit_frame_pointer just
8929 so that we can omit leaf frame pointers, we must then not interpret a
8930 second call as meaning that all frame pointer generation should be
8931 omitted. We do this by setting flag_omit_frame_pointer to a special,
8932 non-zero value. */
8933 if (opts->x_flag_omit_frame_pointer == 2)
8934 opts->x_flag_omit_frame_pointer = 0;
8935
8936 if (opts->x_flag_omit_frame_pointer)
8937 opts->x_flag_omit_leaf_frame_pointer = false;
8938 else if (opts->x_flag_omit_leaf_frame_pointer)
8939 opts->x_flag_omit_frame_pointer = 2;
8940
8941 /* If not optimizing for size, set the default
8942 alignment to what the target wants. */
8943 if (!opts->x_optimize_size)
8944 {
8945 if (opts->x_align_loops <= 0)
8946 opts->x_align_loops = aarch64_tune_params.loop_align;
8947 if (opts->x_align_jumps <= 0)
8948 opts->x_align_jumps = aarch64_tune_params.jump_align;
8949 if (opts->x_align_functions <= 0)
8950 opts->x_align_functions = aarch64_tune_params.function_align;
8951 }
8952
8953 /* We default to no pc-relative literal loads. */
8954
8955 aarch64_pcrelative_literal_loads = false;
8956
8957 /* If -mpc-relative-literal-loads is set on the command line, this
8958 implies that the user asked for PC relative literal loads. */
8959 if (opts->x_pcrelative_literal_loads == 1)
8960 aarch64_pcrelative_literal_loads = true;
8961
8962 /* This is PR70113. When building the Linux kernel with
8963 CONFIG_ARM64_ERRATUM_843419, support for relocations
8964 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8965 removed from the kernel to avoid loading objects with possibly
8966 offending sequences. Without -mpc-relative-literal-loads we would
8967 generate such relocations, preventing the kernel build from
8968 succeeding. */
8969 if (opts->x_pcrelative_literal_loads == 2
8970 && TARGET_FIX_ERR_A53_843419)
8971 aarch64_pcrelative_literal_loads = true;
8972
8973 /* In the tiny memory model it makes no sense to disallow PC relative
8974 literal pool loads. */
8975 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8976 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8977 aarch64_pcrelative_literal_loads = true;
8978
8979 /* When enabling the lower precision Newton series for the square root, also
8980 enable it for the reciprocal square root, since the latter is an
8981 intermediary step for the former. */
8982 if (flag_mlow_precision_sqrt)
8983 flag_mrecip_low_precision_sqrt = true;
8984 }
8985
8986 /* 'Unpack' up the internal tuning structs and update the options
8987 in OPTS. The caller must have set up selected_tune and selected_arch
8988 as all the other target-specific codegen decisions are
8989 derived from them. */
8990
8991 void
8992 aarch64_override_options_internal (struct gcc_options *opts)
8993 {
8994 aarch64_tune_flags = selected_tune->flags;
8995 aarch64_tune = selected_tune->sched_core;
8996 /* Make a copy of the tuning parameters attached to the core, which
8997 we may later overwrite. */
8998 aarch64_tune_params = *(selected_tune->tune);
8999 aarch64_architecture_version = selected_arch->architecture_version;
9000
9001 if (opts->x_aarch64_override_tune_string)
9002 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9003 &aarch64_tune_params);
9004
9005 /* This target defaults to strict volatile bitfields. */
9006 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9007 opts->x_flag_strict_volatile_bitfields = 1;
9008
9009 initialize_aarch64_code_model (opts);
9010 initialize_aarch64_tls_size (opts);
9011
9012 int queue_depth = 0;
9013 switch (aarch64_tune_params.autoprefetcher_model)
9014 {
9015 case tune_params::AUTOPREFETCHER_OFF:
9016 queue_depth = -1;
9017 break;
9018 case tune_params::AUTOPREFETCHER_WEAK:
9019 queue_depth = 0;
9020 break;
9021 case tune_params::AUTOPREFETCHER_STRONG:
9022 queue_depth = max_insn_queue_index + 1;
9023 break;
9024 default:
9025 gcc_unreachable ();
9026 }
9027
9028 /* We don't mind passing in global_options_set here as we don't use
9029 the *options_set structs anyway. */
9030 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9031 queue_depth,
9032 opts->x_param_values,
9033 global_options_set.x_param_values);
9034
9035 /* Set up parameters to be used in prefetching algorithm. Do not
9036 override the defaults unless we are tuning for a core we have
9037 researched values for. */
9038 if (aarch64_tune_params.prefetch->num_slots > 0)
9039 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9040 aarch64_tune_params.prefetch->num_slots,
9041 opts->x_param_values,
9042 global_options_set.x_param_values);
9043 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9044 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9045 aarch64_tune_params.prefetch->l1_cache_size,
9046 opts->x_param_values,
9047 global_options_set.x_param_values);
9048 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9049 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9050 aarch64_tune_params.prefetch->l1_cache_line_size,
9051 opts->x_param_values,
9052 global_options_set.x_param_values);
9053 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9054 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9055 aarch64_tune_params.prefetch->l2_cache_size,
9056 opts->x_param_values,
9057 global_options_set.x_param_values);
9058
9059 /* Enable sw prefetching at specified optimization level for
9060 CPUS that have prefetch. Lower optimization level threshold by 1
9061 when profiling is enabled. */
9062 if (opts->x_flag_prefetch_loop_arrays < 0
9063 && !opts->x_optimize_size
9064 && aarch64_tune_params.prefetch->default_opt_level >= 0
9065 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9066 opts->x_flag_prefetch_loop_arrays = 1;
9067
9068 aarch64_override_options_after_change_1 (opts);
9069 }
9070
9071 /* Print a hint with a suggestion for a core or architecture name that
9072 most closely resembles what the user passed in STR. ARCH is true if
9073 the user is asking for an architecture name. ARCH is false if the user
9074 is asking for a core name. */
9075
9076 static void
9077 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9078 {
9079 auto_vec<const char *> candidates;
9080 const struct processor *entry = arch ? all_architectures : all_cores;
9081 for (; entry->name != NULL; entry++)
9082 candidates.safe_push (entry->name);
9083 char *s;
9084 const char *hint = candidates_list_and_hint (str, s, candidates);
9085 if (hint)
9086 inform (input_location, "valid arguments are: %s;"
9087 " did you mean %qs?", s, hint);
9088 XDELETEVEC (s);
9089 }
9090
9091 /* Print a hint with a suggestion for a core name that most closely resembles
9092 what the user passed in STR. */
9093
9094 inline static void
9095 aarch64_print_hint_for_core (const char *str)
9096 {
9097 aarch64_print_hint_for_core_or_arch (str, false);
9098 }
9099
9100 /* Print a hint with a suggestion for an architecture name that most closely
9101 resembles what the user passed in STR. */
9102
9103 inline static void
9104 aarch64_print_hint_for_arch (const char *str)
9105 {
9106 aarch64_print_hint_for_core_or_arch (str, true);
9107 }
9108
9109 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9110 specified in STR and throw errors if appropriate. Put the results if
9111 they are valid in RES and ISA_FLAGS. Return whether the option is
9112 valid. */
9113
9114 static bool
9115 aarch64_validate_mcpu (const char *str, const struct processor **res,
9116 unsigned long *isa_flags)
9117 {
9118 enum aarch64_parse_opt_result parse_res
9119 = aarch64_parse_cpu (str, res, isa_flags);
9120
9121 if (parse_res == AARCH64_PARSE_OK)
9122 return true;
9123
9124 switch (parse_res)
9125 {
9126 case AARCH64_PARSE_MISSING_ARG:
9127 error ("missing cpu name in %<-mcpu=%s%>", str);
9128 break;
9129 case AARCH64_PARSE_INVALID_ARG:
9130 error ("unknown value %qs for -mcpu", str);
9131 aarch64_print_hint_for_core (str);
9132 break;
9133 case AARCH64_PARSE_INVALID_FEATURE:
9134 error ("invalid feature modifier in %<-mcpu=%s%>", str);
9135 break;
9136 default:
9137 gcc_unreachable ();
9138 }
9139
9140 return false;
9141 }
9142
9143 /* Validate a command-line -march option. Parse the arch and extensions
9144 (if any) specified in STR and throw errors if appropriate. Put the
9145 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9146 option is valid. */
9147
9148 static bool
9149 aarch64_validate_march (const char *str, const struct processor **res,
9150 unsigned long *isa_flags)
9151 {
9152 enum aarch64_parse_opt_result parse_res
9153 = aarch64_parse_arch (str, res, isa_flags);
9154
9155 if (parse_res == AARCH64_PARSE_OK)
9156 return true;
9157
9158 switch (parse_res)
9159 {
9160 case AARCH64_PARSE_MISSING_ARG:
9161 error ("missing arch name in %<-march=%s%>", str);
9162 break;
9163 case AARCH64_PARSE_INVALID_ARG:
9164 error ("unknown value %qs for -march", str);
9165 aarch64_print_hint_for_arch (str);
9166 break;
9167 case AARCH64_PARSE_INVALID_FEATURE:
9168 error ("invalid feature modifier in %<-march=%s%>", str);
9169 break;
9170 default:
9171 gcc_unreachable ();
9172 }
9173
9174 return false;
9175 }
9176
9177 /* Validate a command-line -mtune option. Parse the cpu
9178 specified in STR and throw errors if appropriate. Put the
9179 result, if it is valid, in RES. Return whether the option is
9180 valid. */
9181
9182 static bool
9183 aarch64_validate_mtune (const char *str, const struct processor **res)
9184 {
9185 enum aarch64_parse_opt_result parse_res
9186 = aarch64_parse_tune (str, res);
9187
9188 if (parse_res == AARCH64_PARSE_OK)
9189 return true;
9190
9191 switch (parse_res)
9192 {
9193 case AARCH64_PARSE_MISSING_ARG:
9194 error ("missing cpu name in %<-mtune=%s%>", str);
9195 break;
9196 case AARCH64_PARSE_INVALID_ARG:
9197 error ("unknown value %qs for -mtune", str);
9198 aarch64_print_hint_for_core (str);
9199 break;
9200 default:
9201 gcc_unreachable ();
9202 }
9203 return false;
9204 }
9205
9206 /* Return the CPU corresponding to the enum CPU.
9207 If it doesn't specify a cpu, return the default. */
9208
9209 static const struct processor *
9210 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9211 {
9212 if (cpu != aarch64_none)
9213 return &all_cores[cpu];
9214
9215 /* The & 0x3f is to extract the bottom 6 bits that encode the
9216 default cpu as selected by the --with-cpu GCC configure option
9217 in config.gcc.
9218 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9219 flags mechanism should be reworked to make it more sane. */
9220 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9221 }
9222
9223 /* Return the architecture corresponding to the enum ARCH.
9224 If it doesn't specify a valid architecture, return the default. */
9225
9226 static const struct processor *
9227 aarch64_get_arch (enum aarch64_arch arch)
9228 {
9229 if (arch != aarch64_no_arch)
9230 return &all_architectures[arch];
9231
9232 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9233
9234 return &all_architectures[cpu->arch];
9235 }
9236
9237 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9238 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9239 tuning structs. In particular it must set selected_tune and
9240 aarch64_isa_flags that define the available ISA features and tuning
9241 decisions. It must also set selected_arch as this will be used to
9242 output the .arch asm tags for each function. */
9243
9244 static void
9245 aarch64_override_options (void)
9246 {
9247 unsigned long cpu_isa = 0;
9248 unsigned long arch_isa = 0;
9249 aarch64_isa_flags = 0;
9250
9251 bool valid_cpu = true;
9252 bool valid_tune = true;
9253 bool valid_arch = true;
9254
9255 selected_cpu = NULL;
9256 selected_arch = NULL;
9257 selected_tune = NULL;
9258
9259 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9260 If either of -march or -mtune is given, they override their
9261 respective component of -mcpu. */
9262 if (aarch64_cpu_string)
9263 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9264 &cpu_isa);
9265
9266 if (aarch64_arch_string)
9267 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9268 &arch_isa);
9269
9270 if (aarch64_tune_string)
9271 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9272
9273 /* If the user did not specify a processor, choose the default
9274 one for them. This will be the CPU set during configuration using
9275 --with-cpu, otherwise it is "generic". */
9276 if (!selected_cpu)
9277 {
9278 if (selected_arch)
9279 {
9280 selected_cpu = &all_cores[selected_arch->ident];
9281 aarch64_isa_flags = arch_isa;
9282 explicit_arch = selected_arch->arch;
9283 }
9284 else
9285 {
9286 /* Get default configure-time CPU. */
9287 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9288 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9289 }
9290
9291 if (selected_tune)
9292 explicit_tune_core = selected_tune->ident;
9293 }
9294 /* If both -mcpu and -march are specified check that they are architecturally
9295 compatible, warn if they're not and prefer the -march ISA flags. */
9296 else if (selected_arch)
9297 {
9298 if (selected_arch->arch != selected_cpu->arch)
9299 {
9300 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9301 all_architectures[selected_cpu->arch].name,
9302 selected_arch->name);
9303 }
9304 aarch64_isa_flags = arch_isa;
9305 explicit_arch = selected_arch->arch;
9306 explicit_tune_core = selected_tune ? selected_tune->ident
9307 : selected_cpu->ident;
9308 }
9309 else
9310 {
9311 /* -mcpu but no -march. */
9312 aarch64_isa_flags = cpu_isa;
9313 explicit_tune_core = selected_tune ? selected_tune->ident
9314 : selected_cpu->ident;
9315 gcc_assert (selected_cpu);
9316 selected_arch = &all_architectures[selected_cpu->arch];
9317 explicit_arch = selected_arch->arch;
9318 }
9319
9320 /* Set the arch as well as we will need it when outputing
9321 the .arch directive in assembly. */
9322 if (!selected_arch)
9323 {
9324 gcc_assert (selected_cpu);
9325 selected_arch = &all_architectures[selected_cpu->arch];
9326 }
9327
9328 if (!selected_tune)
9329 selected_tune = selected_cpu;
9330
9331 #ifndef HAVE_AS_MABI_OPTION
9332 /* The compiler may have been configured with 2.23.* binutils, which does
9333 not have support for ILP32. */
9334 if (TARGET_ILP32)
9335 error ("Assembler does not support -mabi=ilp32");
9336 #endif
9337
9338 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9339 sorry ("Return address signing is only supported for -mabi=lp64");
9340
9341 /* Make sure we properly set up the explicit options. */
9342 if ((aarch64_cpu_string && valid_cpu)
9343 || (aarch64_tune_string && valid_tune))
9344 gcc_assert (explicit_tune_core != aarch64_none);
9345
9346 if ((aarch64_cpu_string && valid_cpu)
9347 || (aarch64_arch_string && valid_arch))
9348 gcc_assert (explicit_arch != aarch64_no_arch);
9349
9350 aarch64_override_options_internal (&global_options);
9351
9352 /* Save these options as the default ones in case we push and pop them later
9353 while processing functions with potential target attributes. */
9354 target_option_default_node = target_option_current_node
9355 = build_target_option_node (&global_options);
9356 }
9357
9358 /* Implement targetm.override_options_after_change. */
9359
9360 static void
9361 aarch64_override_options_after_change (void)
9362 {
9363 aarch64_override_options_after_change_1 (&global_options);
9364 }
9365
9366 static struct machine_function *
9367 aarch64_init_machine_status (void)
9368 {
9369 struct machine_function *machine;
9370 machine = ggc_cleared_alloc<machine_function> ();
9371 return machine;
9372 }
9373
9374 void
9375 aarch64_init_expanders (void)
9376 {
9377 init_machine_status = aarch64_init_machine_status;
9378 }
9379
9380 /* A checking mechanism for the implementation of the various code models. */
9381 static void
9382 initialize_aarch64_code_model (struct gcc_options *opts)
9383 {
9384 if (opts->x_flag_pic)
9385 {
9386 switch (opts->x_aarch64_cmodel_var)
9387 {
9388 case AARCH64_CMODEL_TINY:
9389 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9390 break;
9391 case AARCH64_CMODEL_SMALL:
9392 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9393 aarch64_cmodel = (flag_pic == 2
9394 ? AARCH64_CMODEL_SMALL_PIC
9395 : AARCH64_CMODEL_SMALL_SPIC);
9396 #else
9397 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9398 #endif
9399 break;
9400 case AARCH64_CMODEL_LARGE:
9401 sorry ("code model %qs with -f%s", "large",
9402 opts->x_flag_pic > 1 ? "PIC" : "pic");
9403 break;
9404 default:
9405 gcc_unreachable ();
9406 }
9407 }
9408 else
9409 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9410 }
9411
9412 /* Implement TARGET_OPTION_SAVE. */
9413
9414 static void
9415 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9416 {
9417 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9418 }
9419
9420 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9421 using the information saved in PTR. */
9422
9423 static void
9424 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9425 {
9426 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9427 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9428 opts->x_explicit_arch = ptr->x_explicit_arch;
9429 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9430 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9431
9432 aarch64_override_options_internal (opts);
9433 }
9434
9435 /* Implement TARGET_OPTION_PRINT. */
9436
9437 static void
9438 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9439 {
9440 const struct processor *cpu
9441 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9442 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9443 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9444 std::string extension
9445 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9446
9447 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9448 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9449 arch->name, extension.c_str ());
9450 }
9451
9452 static GTY(()) tree aarch64_previous_fndecl;
9453
9454 void
9455 aarch64_reset_previous_fndecl (void)
9456 {
9457 aarch64_previous_fndecl = NULL;
9458 }
9459
9460 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9461 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9462 make sure optab availability predicates are recomputed when necessary. */
9463
9464 void
9465 aarch64_save_restore_target_globals (tree new_tree)
9466 {
9467 if (TREE_TARGET_GLOBALS (new_tree))
9468 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9469 else if (new_tree == target_option_default_node)
9470 restore_target_globals (&default_target_globals);
9471 else
9472 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9473 }
9474
9475 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9476 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9477 of the function, if such exists. This function may be called multiple
9478 times on a single function so use aarch64_previous_fndecl to avoid
9479 setting up identical state. */
9480
9481 static void
9482 aarch64_set_current_function (tree fndecl)
9483 {
9484 if (!fndecl || fndecl == aarch64_previous_fndecl)
9485 return;
9486
9487 tree old_tree = (aarch64_previous_fndecl
9488 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9489 : NULL_TREE);
9490
9491 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9492
9493 /* If current function has no attributes but the previous one did,
9494 use the default node. */
9495 if (!new_tree && old_tree)
9496 new_tree = target_option_default_node;
9497
9498 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9499 the default have been handled by aarch64_save_restore_target_globals from
9500 aarch64_pragma_target_parse. */
9501 if (old_tree == new_tree)
9502 return;
9503
9504 aarch64_previous_fndecl = fndecl;
9505
9506 /* First set the target options. */
9507 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9508
9509 aarch64_save_restore_target_globals (new_tree);
9510 }
9511
9512 /* Enum describing the various ways we can handle attributes.
9513 In many cases we can reuse the generic option handling machinery. */
9514
9515 enum aarch64_attr_opt_type
9516 {
9517 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9518 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9519 aarch64_attr_enum, /* Attribute sets an enum variable. */
9520 aarch64_attr_custom /* Attribute requires a custom handling function. */
9521 };
9522
9523 /* All the information needed to handle a target attribute.
9524 NAME is the name of the attribute.
9525 ATTR_TYPE specifies the type of behavior of the attribute as described
9526 in the definition of enum aarch64_attr_opt_type.
9527 ALLOW_NEG is true if the attribute supports a "no-" form.
9528 HANDLER is the function that takes the attribute string and whether
9529 it is a pragma or attribute and handles the option. It is needed only
9530 when the ATTR_TYPE is aarch64_attr_custom.
9531 OPT_NUM is the enum specifying the option that the attribute modifies.
9532 This is needed for attributes that mirror the behavior of a command-line
9533 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9534 aarch64_attr_enum. */
9535
9536 struct aarch64_attribute_info
9537 {
9538 const char *name;
9539 enum aarch64_attr_opt_type attr_type;
9540 bool allow_neg;
9541 bool (*handler) (const char *, const char *);
9542 enum opt_code opt_num;
9543 };
9544
9545 /* Handle the ARCH_STR argument to the arch= target attribute.
9546 PRAGMA_OR_ATTR is used in potential error messages. */
9547
9548 static bool
9549 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9550 {
9551 const struct processor *tmp_arch = NULL;
9552 enum aarch64_parse_opt_result parse_res
9553 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9554
9555 if (parse_res == AARCH64_PARSE_OK)
9556 {
9557 gcc_assert (tmp_arch);
9558 selected_arch = tmp_arch;
9559 explicit_arch = selected_arch->arch;
9560 return true;
9561 }
9562
9563 switch (parse_res)
9564 {
9565 case AARCH64_PARSE_MISSING_ARG:
9566 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9567 break;
9568 case AARCH64_PARSE_INVALID_ARG:
9569 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9570 aarch64_print_hint_for_arch (str);
9571 break;
9572 case AARCH64_PARSE_INVALID_FEATURE:
9573 error ("invalid feature modifier %qs for 'arch' target %s",
9574 str, pragma_or_attr);
9575 break;
9576 default:
9577 gcc_unreachable ();
9578 }
9579
9580 return false;
9581 }
9582
9583 /* Handle the argument CPU_STR to the cpu= target attribute.
9584 PRAGMA_OR_ATTR is used in potential error messages. */
9585
9586 static bool
9587 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9588 {
9589 const struct processor *tmp_cpu = NULL;
9590 enum aarch64_parse_opt_result parse_res
9591 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9592
9593 if (parse_res == AARCH64_PARSE_OK)
9594 {
9595 gcc_assert (tmp_cpu);
9596 selected_tune = tmp_cpu;
9597 explicit_tune_core = selected_tune->ident;
9598
9599 selected_arch = &all_architectures[tmp_cpu->arch];
9600 explicit_arch = selected_arch->arch;
9601 return true;
9602 }
9603
9604 switch (parse_res)
9605 {
9606 case AARCH64_PARSE_MISSING_ARG:
9607 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9608 break;
9609 case AARCH64_PARSE_INVALID_ARG:
9610 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9611 aarch64_print_hint_for_core (str);
9612 break;
9613 case AARCH64_PARSE_INVALID_FEATURE:
9614 error ("invalid feature modifier %qs for 'cpu' target %s",
9615 str, pragma_or_attr);
9616 break;
9617 default:
9618 gcc_unreachable ();
9619 }
9620
9621 return false;
9622 }
9623
9624 /* Handle the argument STR to the tune= target attribute.
9625 PRAGMA_OR_ATTR is used in potential error messages. */
9626
9627 static bool
9628 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9629 {
9630 const struct processor *tmp_tune = NULL;
9631 enum aarch64_parse_opt_result parse_res
9632 = aarch64_parse_tune (str, &tmp_tune);
9633
9634 if (parse_res == AARCH64_PARSE_OK)
9635 {
9636 gcc_assert (tmp_tune);
9637 selected_tune = tmp_tune;
9638 explicit_tune_core = selected_tune->ident;
9639 return true;
9640 }
9641
9642 switch (parse_res)
9643 {
9644 case AARCH64_PARSE_INVALID_ARG:
9645 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9646 aarch64_print_hint_for_core (str);
9647 break;
9648 default:
9649 gcc_unreachable ();
9650 }
9651
9652 return false;
9653 }
9654
9655 /* Parse an architecture extensions target attribute string specified in STR.
9656 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9657 if successful. Update aarch64_isa_flags to reflect the ISA features
9658 modified.
9659 PRAGMA_OR_ATTR is used in potential error messages. */
9660
9661 static bool
9662 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9663 {
9664 enum aarch64_parse_opt_result parse_res;
9665 unsigned long isa_flags = aarch64_isa_flags;
9666
9667 /* We allow "+nothing" in the beginning to clear out all architectural
9668 features if the user wants to handpick specific features. */
9669 if (strncmp ("+nothing", str, 8) == 0)
9670 {
9671 isa_flags = 0;
9672 str += 8;
9673 }
9674
9675 parse_res = aarch64_parse_extension (str, &isa_flags);
9676
9677 if (parse_res == AARCH64_PARSE_OK)
9678 {
9679 aarch64_isa_flags = isa_flags;
9680 return true;
9681 }
9682
9683 switch (parse_res)
9684 {
9685 case AARCH64_PARSE_MISSING_ARG:
9686 error ("missing feature modifier in target %s %qs",
9687 pragma_or_attr, str);
9688 break;
9689
9690 case AARCH64_PARSE_INVALID_FEATURE:
9691 error ("invalid feature modifier in target %s %qs",
9692 pragma_or_attr, str);
9693 break;
9694
9695 default:
9696 gcc_unreachable ();
9697 }
9698
9699 return false;
9700 }
9701
9702 /* The target attributes that we support. On top of these we also support just
9703 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9704 handled explicitly in aarch64_process_one_target_attr. */
9705
9706 static const struct aarch64_attribute_info aarch64_attributes[] =
9707 {
9708 { "general-regs-only", aarch64_attr_mask, false, NULL,
9709 OPT_mgeneral_regs_only },
9710 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9711 OPT_mfix_cortex_a53_835769 },
9712 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9713 OPT_mfix_cortex_a53_843419 },
9714 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9715 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9716 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9717 OPT_momit_leaf_frame_pointer },
9718 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9719 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9720 OPT_march_ },
9721 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9722 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9723 OPT_mtune_ },
9724 { "sign-return-address", aarch64_attr_enum, false, NULL,
9725 OPT_msign_return_address_ },
9726 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9727 };
9728
9729 /* Parse ARG_STR which contains the definition of one target attribute.
9730 Show appropriate errors if any or return true if the attribute is valid.
9731 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9732 we're processing a target attribute or pragma. */
9733
9734 static bool
9735 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9736 {
9737 bool invert = false;
9738
9739 size_t len = strlen (arg_str);
9740
9741 if (len == 0)
9742 {
9743 error ("malformed target %s", pragma_or_attr);
9744 return false;
9745 }
9746
9747 char *str_to_check = (char *) alloca (len + 1);
9748 strcpy (str_to_check, arg_str);
9749
9750 /* Skip leading whitespace. */
9751 while (*str_to_check == ' ' || *str_to_check == '\t')
9752 str_to_check++;
9753
9754 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9755 It is easier to detect and handle it explicitly here rather than going
9756 through the machinery for the rest of the target attributes in this
9757 function. */
9758 if (*str_to_check == '+')
9759 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9760
9761 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9762 {
9763 invert = true;
9764 str_to_check += 3;
9765 }
9766 char *arg = strchr (str_to_check, '=');
9767
9768 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9769 and point ARG to "foo". */
9770 if (arg)
9771 {
9772 *arg = '\0';
9773 arg++;
9774 }
9775 const struct aarch64_attribute_info *p_attr;
9776 bool found = false;
9777 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9778 {
9779 /* If the names don't match up, or the user has given an argument
9780 to an attribute that doesn't accept one, or didn't give an argument
9781 to an attribute that expects one, fail to match. */
9782 if (strcmp (str_to_check, p_attr->name) != 0)
9783 continue;
9784
9785 found = true;
9786 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9787 || p_attr->attr_type == aarch64_attr_enum;
9788
9789 if (attr_need_arg_p ^ (arg != NULL))
9790 {
9791 error ("target %s %qs does not accept an argument",
9792 pragma_or_attr, str_to_check);
9793 return false;
9794 }
9795
9796 /* If the name matches but the attribute does not allow "no-" versions
9797 then we can't match. */
9798 if (invert && !p_attr->allow_neg)
9799 {
9800 error ("target %s %qs does not allow a negated form",
9801 pragma_or_attr, str_to_check);
9802 return false;
9803 }
9804
9805 switch (p_attr->attr_type)
9806 {
9807 /* Has a custom handler registered.
9808 For example, cpu=, arch=, tune=. */
9809 case aarch64_attr_custom:
9810 gcc_assert (p_attr->handler);
9811 if (!p_attr->handler (arg, pragma_or_attr))
9812 return false;
9813 break;
9814
9815 /* Either set or unset a boolean option. */
9816 case aarch64_attr_bool:
9817 {
9818 struct cl_decoded_option decoded;
9819
9820 generate_option (p_attr->opt_num, NULL, !invert,
9821 CL_TARGET, &decoded);
9822 aarch64_handle_option (&global_options, &global_options_set,
9823 &decoded, input_location);
9824 break;
9825 }
9826 /* Set or unset a bit in the target_flags. aarch64_handle_option
9827 should know what mask to apply given the option number. */
9828 case aarch64_attr_mask:
9829 {
9830 struct cl_decoded_option decoded;
9831 /* We only need to specify the option number.
9832 aarch64_handle_option will know which mask to apply. */
9833 decoded.opt_index = p_attr->opt_num;
9834 decoded.value = !invert;
9835 aarch64_handle_option (&global_options, &global_options_set,
9836 &decoded, input_location);
9837 break;
9838 }
9839 /* Use the option setting machinery to set an option to an enum. */
9840 case aarch64_attr_enum:
9841 {
9842 gcc_assert (arg);
9843 bool valid;
9844 int value;
9845 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9846 &value, CL_TARGET);
9847 if (valid)
9848 {
9849 set_option (&global_options, NULL, p_attr->opt_num, value,
9850 NULL, DK_UNSPECIFIED, input_location,
9851 global_dc);
9852 }
9853 else
9854 {
9855 error ("target %s %s=%s is not valid",
9856 pragma_or_attr, str_to_check, arg);
9857 }
9858 break;
9859 }
9860 default:
9861 gcc_unreachable ();
9862 }
9863 }
9864
9865 /* If we reached here we either have found an attribute and validated
9866 it or didn't match any. If we matched an attribute but its arguments
9867 were malformed we will have returned false already. */
9868 return found;
9869 }
9870
9871 /* Count how many times the character C appears in
9872 NULL-terminated string STR. */
9873
9874 static unsigned int
9875 num_occurences_in_str (char c, char *str)
9876 {
9877 unsigned int res = 0;
9878 while (*str != '\0')
9879 {
9880 if (*str == c)
9881 res++;
9882
9883 str++;
9884 }
9885
9886 return res;
9887 }
9888
9889 /* Parse the tree in ARGS that contains the target attribute information
9890 and update the global target options space. PRAGMA_OR_ATTR is a string
9891 to be used in error messages, specifying whether this is processing
9892 a target attribute or a target pragma. */
9893
9894 bool
9895 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9896 {
9897 if (TREE_CODE (args) == TREE_LIST)
9898 {
9899 do
9900 {
9901 tree head = TREE_VALUE (args);
9902 if (head)
9903 {
9904 if (!aarch64_process_target_attr (head, pragma_or_attr))
9905 return false;
9906 }
9907 args = TREE_CHAIN (args);
9908 } while (args);
9909
9910 return true;
9911 }
9912
9913 if (TREE_CODE (args) != STRING_CST)
9914 {
9915 error ("attribute %<target%> argument not a string");
9916 return false;
9917 }
9918
9919 size_t len = strlen (TREE_STRING_POINTER (args));
9920 char *str_to_check = (char *) alloca (len + 1);
9921 strcpy (str_to_check, TREE_STRING_POINTER (args));
9922
9923 if (len == 0)
9924 {
9925 error ("malformed target %s value", pragma_or_attr);
9926 return false;
9927 }
9928
9929 /* Used to catch empty spaces between commas i.e.
9930 attribute ((target ("attr1,,attr2"))). */
9931 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9932
9933 /* Handle multiple target attributes separated by ','. */
9934 char *token = strtok (str_to_check, ",");
9935
9936 unsigned int num_attrs = 0;
9937 while (token)
9938 {
9939 num_attrs++;
9940 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9941 {
9942 error ("target %s %qs is invalid", pragma_or_attr, token);
9943 return false;
9944 }
9945
9946 token = strtok (NULL, ",");
9947 }
9948
9949 if (num_attrs != num_commas + 1)
9950 {
9951 error ("malformed target %s list %qs",
9952 pragma_or_attr, TREE_STRING_POINTER (args));
9953 return false;
9954 }
9955
9956 return true;
9957 }
9958
9959 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9960 process attribute ((target ("..."))). */
9961
9962 static bool
9963 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9964 {
9965 struct cl_target_option cur_target;
9966 bool ret;
9967 tree old_optimize;
9968 tree new_target, new_optimize;
9969 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9970
9971 /* If what we're processing is the current pragma string then the
9972 target option node is already stored in target_option_current_node
9973 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9974 having to re-parse the string. This is especially useful to keep
9975 arm_neon.h compile times down since that header contains a lot
9976 of intrinsics enclosed in pragmas. */
9977 if (!existing_target && args == current_target_pragma)
9978 {
9979 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9980 return true;
9981 }
9982 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9983
9984 old_optimize = build_optimization_node (&global_options);
9985 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9986
9987 /* If the function changed the optimization levels as well as setting
9988 target options, start with the optimizations specified. */
9989 if (func_optimize && func_optimize != old_optimize)
9990 cl_optimization_restore (&global_options,
9991 TREE_OPTIMIZATION (func_optimize));
9992
9993 /* Save the current target options to restore at the end. */
9994 cl_target_option_save (&cur_target, &global_options);
9995
9996 /* If fndecl already has some target attributes applied to it, unpack
9997 them so that we add this attribute on top of them, rather than
9998 overwriting them. */
9999 if (existing_target)
10000 {
10001 struct cl_target_option *existing_options
10002 = TREE_TARGET_OPTION (existing_target);
10003
10004 if (existing_options)
10005 cl_target_option_restore (&global_options, existing_options);
10006 }
10007 else
10008 cl_target_option_restore (&global_options,
10009 TREE_TARGET_OPTION (target_option_current_node));
10010
10011
10012 ret = aarch64_process_target_attr (args, "attribute");
10013
10014 /* Set up any additional state. */
10015 if (ret)
10016 {
10017 aarch64_override_options_internal (&global_options);
10018 /* Initialize SIMD builtins if we haven't already.
10019 Set current_target_pragma to NULL for the duration so that
10020 the builtin initialization code doesn't try to tag the functions
10021 being built with the attributes specified by any current pragma, thus
10022 going into an infinite recursion. */
10023 if (TARGET_SIMD)
10024 {
10025 tree saved_current_target_pragma = current_target_pragma;
10026 current_target_pragma = NULL;
10027 aarch64_init_simd_builtins ();
10028 current_target_pragma = saved_current_target_pragma;
10029 }
10030 new_target = build_target_option_node (&global_options);
10031 }
10032 else
10033 new_target = NULL;
10034
10035 new_optimize = build_optimization_node (&global_options);
10036
10037 if (fndecl && ret)
10038 {
10039 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10040
10041 if (old_optimize != new_optimize)
10042 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10043 }
10044
10045 cl_target_option_restore (&global_options, &cur_target);
10046
10047 if (old_optimize != new_optimize)
10048 cl_optimization_restore (&global_options,
10049 TREE_OPTIMIZATION (old_optimize));
10050 return ret;
10051 }
10052
10053 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
10054 tri-bool options (yes, no, don't care) and the default value is
10055 DEF, determine whether to reject inlining. */
10056
10057 static bool
10058 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10059 int dont_care, int def)
10060 {
10061 /* If the callee doesn't care, always allow inlining. */
10062 if (callee == dont_care)
10063 return true;
10064
10065 /* If the caller doesn't care, always allow inlining. */
10066 if (caller == dont_care)
10067 return true;
10068
10069 /* Otherwise, allow inlining if either the callee and caller values
10070 agree, or if the callee is using the default value. */
10071 return (callee == caller || callee == def);
10072 }
10073
10074 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
10075 to inline CALLEE into CALLER based on target-specific info.
10076 Make sure that the caller and callee have compatible architectural
10077 features. Then go through the other possible target attributes
10078 and see if they can block inlining. Try not to reject always_inline
10079 callees unless they are incompatible architecturally. */
10080
10081 static bool
10082 aarch64_can_inline_p (tree caller, tree callee)
10083 {
10084 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10085 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10086
10087 /* If callee has no option attributes, then it is ok to inline. */
10088 if (!callee_tree)
10089 return true;
10090
10091 struct cl_target_option *caller_opts
10092 = TREE_TARGET_OPTION (caller_tree ? caller_tree
10093 : target_option_default_node);
10094
10095 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10096
10097
10098 /* Callee's ISA flags should be a subset of the caller's. */
10099 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10100 != callee_opts->x_aarch64_isa_flags)
10101 return false;
10102
10103 /* Allow non-strict aligned functions inlining into strict
10104 aligned ones. */
10105 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10106 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10107 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10108 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10109 return false;
10110
10111 bool always_inline = lookup_attribute ("always_inline",
10112 DECL_ATTRIBUTES (callee));
10113
10114 /* If the architectural features match up and the callee is always_inline
10115 then the other attributes don't matter. */
10116 if (always_inline)
10117 return true;
10118
10119 if (caller_opts->x_aarch64_cmodel_var
10120 != callee_opts->x_aarch64_cmodel_var)
10121 return false;
10122
10123 if (caller_opts->x_aarch64_tls_dialect
10124 != callee_opts->x_aarch64_tls_dialect)
10125 return false;
10126
10127 /* Honour explicit requests to workaround errata. */
10128 if (!aarch64_tribools_ok_for_inlining_p (
10129 caller_opts->x_aarch64_fix_a53_err835769,
10130 callee_opts->x_aarch64_fix_a53_err835769,
10131 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10132 return false;
10133
10134 if (!aarch64_tribools_ok_for_inlining_p (
10135 caller_opts->x_aarch64_fix_a53_err843419,
10136 callee_opts->x_aarch64_fix_a53_err843419,
10137 2, TARGET_FIX_ERR_A53_843419))
10138 return false;
10139
10140 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10141 caller and calle and they don't match up, reject inlining. */
10142 if (!aarch64_tribools_ok_for_inlining_p (
10143 caller_opts->x_flag_omit_leaf_frame_pointer,
10144 callee_opts->x_flag_omit_leaf_frame_pointer,
10145 2, 1))
10146 return false;
10147
10148 /* If the callee has specific tuning overrides, respect them. */
10149 if (callee_opts->x_aarch64_override_tune_string != NULL
10150 && caller_opts->x_aarch64_override_tune_string == NULL)
10151 return false;
10152
10153 /* If the user specified tuning override strings for the
10154 caller and callee and they don't match up, reject inlining.
10155 We just do a string compare here, we don't analyze the meaning
10156 of the string, as it would be too costly for little gain. */
10157 if (callee_opts->x_aarch64_override_tune_string
10158 && caller_opts->x_aarch64_override_tune_string
10159 && (strcmp (callee_opts->x_aarch64_override_tune_string,
10160 caller_opts->x_aarch64_override_tune_string) != 0))
10161 return false;
10162
10163 return true;
10164 }
10165
10166 /* Return true if SYMBOL_REF X binds locally. */
10167
10168 static bool
10169 aarch64_symbol_binds_local_p (const_rtx x)
10170 {
10171 return (SYMBOL_REF_DECL (x)
10172 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10173 : SYMBOL_REF_LOCAL_P (x));
10174 }
10175
10176 /* Return true if SYMBOL_REF X is thread local */
10177 static bool
10178 aarch64_tls_symbol_p (rtx x)
10179 {
10180 if (! TARGET_HAVE_TLS)
10181 return false;
10182
10183 if (GET_CODE (x) != SYMBOL_REF)
10184 return false;
10185
10186 return SYMBOL_REF_TLS_MODEL (x) != 0;
10187 }
10188
10189 /* Classify a TLS symbol into one of the TLS kinds. */
10190 enum aarch64_symbol_type
10191 aarch64_classify_tls_symbol (rtx x)
10192 {
10193 enum tls_model tls_kind = tls_symbolic_operand_type (x);
10194
10195 switch (tls_kind)
10196 {
10197 case TLS_MODEL_GLOBAL_DYNAMIC:
10198 case TLS_MODEL_LOCAL_DYNAMIC:
10199 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10200
10201 case TLS_MODEL_INITIAL_EXEC:
10202 switch (aarch64_cmodel)
10203 {
10204 case AARCH64_CMODEL_TINY:
10205 case AARCH64_CMODEL_TINY_PIC:
10206 return SYMBOL_TINY_TLSIE;
10207 default:
10208 return SYMBOL_SMALL_TLSIE;
10209 }
10210
10211 case TLS_MODEL_LOCAL_EXEC:
10212 if (aarch64_tls_size == 12)
10213 return SYMBOL_TLSLE12;
10214 else if (aarch64_tls_size == 24)
10215 return SYMBOL_TLSLE24;
10216 else if (aarch64_tls_size == 32)
10217 return SYMBOL_TLSLE32;
10218 else if (aarch64_tls_size == 48)
10219 return SYMBOL_TLSLE48;
10220 else
10221 gcc_unreachable ();
10222
10223 case TLS_MODEL_EMULATED:
10224 case TLS_MODEL_NONE:
10225 return SYMBOL_FORCE_TO_MEM;
10226
10227 default:
10228 gcc_unreachable ();
10229 }
10230 }
10231
10232 /* Return the method that should be used to access SYMBOL_REF or
10233 LABEL_REF X. */
10234
10235 enum aarch64_symbol_type
10236 aarch64_classify_symbol (rtx x, rtx offset)
10237 {
10238 if (GET_CODE (x) == LABEL_REF)
10239 {
10240 switch (aarch64_cmodel)
10241 {
10242 case AARCH64_CMODEL_LARGE:
10243 return SYMBOL_FORCE_TO_MEM;
10244
10245 case AARCH64_CMODEL_TINY_PIC:
10246 case AARCH64_CMODEL_TINY:
10247 return SYMBOL_TINY_ABSOLUTE;
10248
10249 case AARCH64_CMODEL_SMALL_SPIC:
10250 case AARCH64_CMODEL_SMALL_PIC:
10251 case AARCH64_CMODEL_SMALL:
10252 return SYMBOL_SMALL_ABSOLUTE;
10253
10254 default:
10255 gcc_unreachable ();
10256 }
10257 }
10258
10259 if (GET_CODE (x) == SYMBOL_REF)
10260 {
10261 if (aarch64_tls_symbol_p (x))
10262 return aarch64_classify_tls_symbol (x);
10263
10264 switch (aarch64_cmodel)
10265 {
10266 case AARCH64_CMODEL_TINY:
10267 /* When we retrieve symbol + offset address, we have to make sure
10268 the offset does not cause overflow of the final address. But
10269 we have no way of knowing the address of symbol at compile time
10270 so we can't accurately say if the distance between the PC and
10271 symbol + offset is outside the addressible range of +/-1M in the
10272 TINY code model. So we rely on images not being greater than
10273 1M and cap the offset at 1M and anything beyond 1M will have to
10274 be loaded using an alternative mechanism. Furthermore if the
10275 symbol is a weak reference to something that isn't known to
10276 resolve to a symbol in this module, then force to memory. */
10277 if ((SYMBOL_REF_WEAK (x)
10278 && !aarch64_symbol_binds_local_p (x))
10279 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10280 return SYMBOL_FORCE_TO_MEM;
10281 return SYMBOL_TINY_ABSOLUTE;
10282
10283 case AARCH64_CMODEL_SMALL:
10284 /* Same reasoning as the tiny code model, but the offset cap here is
10285 4G. */
10286 if ((SYMBOL_REF_WEAK (x)
10287 && !aarch64_symbol_binds_local_p (x))
10288 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10289 HOST_WIDE_INT_C (4294967264)))
10290 return SYMBOL_FORCE_TO_MEM;
10291 return SYMBOL_SMALL_ABSOLUTE;
10292
10293 case AARCH64_CMODEL_TINY_PIC:
10294 if (!aarch64_symbol_binds_local_p (x))
10295 return SYMBOL_TINY_GOT;
10296 return SYMBOL_TINY_ABSOLUTE;
10297
10298 case AARCH64_CMODEL_SMALL_SPIC:
10299 case AARCH64_CMODEL_SMALL_PIC:
10300 if (!aarch64_symbol_binds_local_p (x))
10301 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10302 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10303 return SYMBOL_SMALL_ABSOLUTE;
10304
10305 case AARCH64_CMODEL_LARGE:
10306 /* This is alright even in PIC code as the constant
10307 pool reference is always PC relative and within
10308 the same translation unit. */
10309 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10310 return SYMBOL_SMALL_ABSOLUTE;
10311 else
10312 return SYMBOL_FORCE_TO_MEM;
10313
10314 default:
10315 gcc_unreachable ();
10316 }
10317 }
10318
10319 /* By default push everything into the constant pool. */
10320 return SYMBOL_FORCE_TO_MEM;
10321 }
10322
10323 bool
10324 aarch64_constant_address_p (rtx x)
10325 {
10326 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10327 }
10328
10329 bool
10330 aarch64_legitimate_pic_operand_p (rtx x)
10331 {
10332 if (GET_CODE (x) == SYMBOL_REF
10333 || (GET_CODE (x) == CONST
10334 && GET_CODE (XEXP (x, 0)) == PLUS
10335 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10336 return false;
10337
10338 return true;
10339 }
10340
10341 /* Return true if X holds either a quarter-precision or
10342 floating-point +0.0 constant. */
10343 static bool
10344 aarch64_valid_floating_const (rtx x)
10345 {
10346 if (!CONST_DOUBLE_P (x))
10347 return false;
10348
10349 /* This call determines which constants can be used in mov<mode>
10350 as integer moves instead of constant loads. */
10351 if (aarch64_float_const_rtx_p (x))
10352 return true;
10353
10354 return aarch64_float_const_representable_p (x);
10355 }
10356
10357 static bool
10358 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10359 {
10360 /* Do not allow vector struct mode constants. We could support
10361 0 and -1 easily, but they need support in aarch64-simd.md. */
10362 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10363 return false;
10364
10365 /* For these cases we never want to use a literal load.
10366 As such we have to prevent the compiler from forcing these
10367 to memory. */
10368 if ((GET_CODE (x) == CONST_VECTOR
10369 && aarch64_simd_valid_immediate (x, mode, false, NULL))
10370 || CONST_INT_P (x)
10371 || aarch64_valid_floating_const (x)
10372 || aarch64_can_const_movi_rtx_p (x, mode)
10373 || aarch64_float_const_rtx_p (x))
10374 return !targetm.cannot_force_const_mem (mode, x);
10375
10376 if (GET_CODE (x) == HIGH
10377 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10378 return true;
10379
10380 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10381 so spilling them is better than rematerialization. */
10382 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10383 return true;
10384
10385 return aarch64_constant_address_p (x);
10386 }
10387
10388 rtx
10389 aarch64_load_tp (rtx target)
10390 {
10391 if (!target
10392 || GET_MODE (target) != Pmode
10393 || !register_operand (target, Pmode))
10394 target = gen_reg_rtx (Pmode);
10395
10396 /* Can return in any reg. */
10397 emit_insn (gen_aarch64_load_tp_hard (target));
10398 return target;
10399 }
10400
10401 /* On AAPCS systems, this is the "struct __va_list". */
10402 static GTY(()) tree va_list_type;
10403
10404 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10405 Return the type to use as __builtin_va_list.
10406
10407 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10408
10409 struct __va_list
10410 {
10411 void *__stack;
10412 void *__gr_top;
10413 void *__vr_top;
10414 int __gr_offs;
10415 int __vr_offs;
10416 }; */
10417
10418 static tree
10419 aarch64_build_builtin_va_list (void)
10420 {
10421 tree va_list_name;
10422 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10423
10424 /* Create the type. */
10425 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10426 /* Give it the required name. */
10427 va_list_name = build_decl (BUILTINS_LOCATION,
10428 TYPE_DECL,
10429 get_identifier ("__va_list"),
10430 va_list_type);
10431 DECL_ARTIFICIAL (va_list_name) = 1;
10432 TYPE_NAME (va_list_type) = va_list_name;
10433 TYPE_STUB_DECL (va_list_type) = va_list_name;
10434
10435 /* Create the fields. */
10436 f_stack = build_decl (BUILTINS_LOCATION,
10437 FIELD_DECL, get_identifier ("__stack"),
10438 ptr_type_node);
10439 f_grtop = build_decl (BUILTINS_LOCATION,
10440 FIELD_DECL, get_identifier ("__gr_top"),
10441 ptr_type_node);
10442 f_vrtop = build_decl (BUILTINS_LOCATION,
10443 FIELD_DECL, get_identifier ("__vr_top"),
10444 ptr_type_node);
10445 f_groff = build_decl (BUILTINS_LOCATION,
10446 FIELD_DECL, get_identifier ("__gr_offs"),
10447 integer_type_node);
10448 f_vroff = build_decl (BUILTINS_LOCATION,
10449 FIELD_DECL, get_identifier ("__vr_offs"),
10450 integer_type_node);
10451
10452 /* Tell tree-stdarg pass about our internal offset fields.
10453 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10454 purpose to identify whether the code is updating va_list internal
10455 offset fields through irregular way. */
10456 va_list_gpr_counter_field = f_groff;
10457 va_list_fpr_counter_field = f_vroff;
10458
10459 DECL_ARTIFICIAL (f_stack) = 1;
10460 DECL_ARTIFICIAL (f_grtop) = 1;
10461 DECL_ARTIFICIAL (f_vrtop) = 1;
10462 DECL_ARTIFICIAL (f_groff) = 1;
10463 DECL_ARTIFICIAL (f_vroff) = 1;
10464
10465 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10466 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10467 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10468 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10469 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10470
10471 TYPE_FIELDS (va_list_type) = f_stack;
10472 DECL_CHAIN (f_stack) = f_grtop;
10473 DECL_CHAIN (f_grtop) = f_vrtop;
10474 DECL_CHAIN (f_vrtop) = f_groff;
10475 DECL_CHAIN (f_groff) = f_vroff;
10476
10477 /* Compute its layout. */
10478 layout_type (va_list_type);
10479
10480 return va_list_type;
10481 }
10482
10483 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10484 static void
10485 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10486 {
10487 const CUMULATIVE_ARGS *cum;
10488 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10489 tree stack, grtop, vrtop, groff, vroff;
10490 tree t;
10491 int gr_save_area_size = cfun->va_list_gpr_size;
10492 int vr_save_area_size = cfun->va_list_fpr_size;
10493 int vr_offset;
10494
10495 cum = &crtl->args.info;
10496 if (cfun->va_list_gpr_size)
10497 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10498 cfun->va_list_gpr_size);
10499 if (cfun->va_list_fpr_size)
10500 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10501 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10502
10503 if (!TARGET_FLOAT)
10504 {
10505 gcc_assert (cum->aapcs_nvrn == 0);
10506 vr_save_area_size = 0;
10507 }
10508
10509 f_stack = TYPE_FIELDS (va_list_type_node);
10510 f_grtop = DECL_CHAIN (f_stack);
10511 f_vrtop = DECL_CHAIN (f_grtop);
10512 f_groff = DECL_CHAIN (f_vrtop);
10513 f_vroff = DECL_CHAIN (f_groff);
10514
10515 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10516 NULL_TREE);
10517 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10518 NULL_TREE);
10519 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10520 NULL_TREE);
10521 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10522 NULL_TREE);
10523 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10524 NULL_TREE);
10525
10526 /* Emit code to initialize STACK, which points to the next varargs stack
10527 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10528 by named arguments. STACK is 8-byte aligned. */
10529 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10530 if (cum->aapcs_stack_size > 0)
10531 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10532 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10533 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10534
10535 /* Emit code to initialize GRTOP, the top of the GR save area.
10536 virtual_incoming_args_rtx should have been 16 byte aligned. */
10537 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10538 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10539 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10540
10541 /* Emit code to initialize VRTOP, the top of the VR save area.
10542 This address is gr_save_area_bytes below GRTOP, rounded
10543 down to the next 16-byte boundary. */
10544 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10545 vr_offset = ROUND_UP (gr_save_area_size,
10546 STACK_BOUNDARY / BITS_PER_UNIT);
10547
10548 if (vr_offset)
10549 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10550 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10551 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10552
10553 /* Emit code to initialize GROFF, the offset from GRTOP of the
10554 next GPR argument. */
10555 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10556 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10557 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10558
10559 /* Likewise emit code to initialize VROFF, the offset from FTOP
10560 of the next VR argument. */
10561 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10562 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10563 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10564 }
10565
10566 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10567
10568 static tree
10569 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10570 gimple_seq *post_p ATTRIBUTE_UNUSED)
10571 {
10572 tree addr;
10573 bool indirect_p;
10574 bool is_ha; /* is HFA or HVA. */
10575 bool dw_align; /* double-word align. */
10576 machine_mode ag_mode = VOIDmode;
10577 int nregs;
10578 machine_mode mode;
10579
10580 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10581 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10582 HOST_WIDE_INT size, rsize, adjust, align;
10583 tree t, u, cond1, cond2;
10584
10585 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10586 if (indirect_p)
10587 type = build_pointer_type (type);
10588
10589 mode = TYPE_MODE (type);
10590
10591 f_stack = TYPE_FIELDS (va_list_type_node);
10592 f_grtop = DECL_CHAIN (f_stack);
10593 f_vrtop = DECL_CHAIN (f_grtop);
10594 f_groff = DECL_CHAIN (f_vrtop);
10595 f_vroff = DECL_CHAIN (f_groff);
10596
10597 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10598 f_stack, NULL_TREE);
10599 size = int_size_in_bytes (type);
10600 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10601
10602 dw_align = false;
10603 adjust = 0;
10604 if (aarch64_vfp_is_call_or_return_candidate (mode,
10605 type,
10606 &ag_mode,
10607 &nregs,
10608 &is_ha))
10609 {
10610 /* TYPE passed in fp/simd registers. */
10611 if (!TARGET_FLOAT)
10612 aarch64_err_no_fpadvsimd (mode, "varargs");
10613
10614 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10615 unshare_expr (valist), f_vrtop, NULL_TREE);
10616 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10617 unshare_expr (valist), f_vroff, NULL_TREE);
10618
10619 rsize = nregs * UNITS_PER_VREG;
10620
10621 if (is_ha)
10622 {
10623 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10624 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10625 }
10626 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10627 && size < UNITS_PER_VREG)
10628 {
10629 adjust = UNITS_PER_VREG - size;
10630 }
10631 }
10632 else
10633 {
10634 /* TYPE passed in general registers. */
10635 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10636 unshare_expr (valist), f_grtop, NULL_TREE);
10637 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10638 unshare_expr (valist), f_groff, NULL_TREE);
10639 rsize = ROUND_UP (size, UNITS_PER_WORD);
10640 nregs = rsize / UNITS_PER_WORD;
10641
10642 if (align > 8)
10643 dw_align = true;
10644
10645 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10646 && size < UNITS_PER_WORD)
10647 {
10648 adjust = UNITS_PER_WORD - size;
10649 }
10650 }
10651
10652 /* Get a local temporary for the field value. */
10653 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10654
10655 /* Emit code to branch if off >= 0. */
10656 t = build2 (GE_EXPR, boolean_type_node, off,
10657 build_int_cst (TREE_TYPE (off), 0));
10658 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10659
10660 if (dw_align)
10661 {
10662 /* Emit: offs = (offs + 15) & -16. */
10663 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10664 build_int_cst (TREE_TYPE (off), 15));
10665 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10666 build_int_cst (TREE_TYPE (off), -16));
10667 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10668 }
10669 else
10670 roundup = NULL;
10671
10672 /* Update ap.__[g|v]r_offs */
10673 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10674 build_int_cst (TREE_TYPE (off), rsize));
10675 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10676
10677 /* String up. */
10678 if (roundup)
10679 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10680
10681 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10682 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10683 build_int_cst (TREE_TYPE (f_off), 0));
10684 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10685
10686 /* String up: make sure the assignment happens before the use. */
10687 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10688 COND_EXPR_ELSE (cond1) = t;
10689
10690 /* Prepare the trees handling the argument that is passed on the stack;
10691 the top level node will store in ON_STACK. */
10692 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10693 if (align > 8)
10694 {
10695 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10696 t = fold_convert (intDI_type_node, arg);
10697 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10698 build_int_cst (TREE_TYPE (t), 15));
10699 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10700 build_int_cst (TREE_TYPE (t), -16));
10701 t = fold_convert (TREE_TYPE (arg), t);
10702 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10703 }
10704 else
10705 roundup = NULL;
10706 /* Advance ap.__stack */
10707 t = fold_convert (intDI_type_node, arg);
10708 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10709 build_int_cst (TREE_TYPE (t), size + 7));
10710 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10711 build_int_cst (TREE_TYPE (t), -8));
10712 t = fold_convert (TREE_TYPE (arg), t);
10713 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10714 /* String up roundup and advance. */
10715 if (roundup)
10716 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10717 /* String up with arg */
10718 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10719 /* Big-endianness related address adjustment. */
10720 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10721 && size < UNITS_PER_WORD)
10722 {
10723 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10724 size_int (UNITS_PER_WORD - size));
10725 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10726 }
10727
10728 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10729 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10730
10731 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10732 t = off;
10733 if (adjust)
10734 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10735 build_int_cst (TREE_TYPE (off), adjust));
10736
10737 t = fold_convert (sizetype, t);
10738 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10739
10740 if (is_ha)
10741 {
10742 /* type ha; // treat as "struct {ftype field[n];}"
10743 ... [computing offs]
10744 for (i = 0; i <nregs; ++i, offs += 16)
10745 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10746 return ha; */
10747 int i;
10748 tree tmp_ha, field_t, field_ptr_t;
10749
10750 /* Declare a local variable. */
10751 tmp_ha = create_tmp_var_raw (type, "ha");
10752 gimple_add_tmp_var (tmp_ha);
10753
10754 /* Establish the base type. */
10755 switch (ag_mode)
10756 {
10757 case E_SFmode:
10758 field_t = float_type_node;
10759 field_ptr_t = float_ptr_type_node;
10760 break;
10761 case E_DFmode:
10762 field_t = double_type_node;
10763 field_ptr_t = double_ptr_type_node;
10764 break;
10765 case E_TFmode:
10766 field_t = long_double_type_node;
10767 field_ptr_t = long_double_ptr_type_node;
10768 break;
10769 case E_HFmode:
10770 field_t = aarch64_fp16_type_node;
10771 field_ptr_t = aarch64_fp16_ptr_type_node;
10772 break;
10773 case E_V2SImode:
10774 case E_V4SImode:
10775 {
10776 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10777 field_t = build_vector_type_for_mode (innertype, ag_mode);
10778 field_ptr_t = build_pointer_type (field_t);
10779 }
10780 break;
10781 default:
10782 gcc_assert (0);
10783 }
10784
10785 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
10786 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10787 addr = t;
10788 t = fold_convert (field_ptr_t, addr);
10789 t = build2 (MODIFY_EXPR, field_t,
10790 build1 (INDIRECT_REF, field_t, tmp_ha),
10791 build1 (INDIRECT_REF, field_t, t));
10792
10793 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10794 for (i = 1; i < nregs; ++i)
10795 {
10796 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10797 u = fold_convert (field_ptr_t, addr);
10798 u = build2 (MODIFY_EXPR, field_t,
10799 build2 (MEM_REF, field_t, tmp_ha,
10800 build_int_cst (field_ptr_t,
10801 (i *
10802 int_size_in_bytes (field_t)))),
10803 build1 (INDIRECT_REF, field_t, u));
10804 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10805 }
10806
10807 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10808 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10809 }
10810
10811 COND_EXPR_ELSE (cond2) = t;
10812 addr = fold_convert (build_pointer_type (type), cond1);
10813 addr = build_va_arg_indirect_ref (addr);
10814
10815 if (indirect_p)
10816 addr = build_va_arg_indirect_ref (addr);
10817
10818 return addr;
10819 }
10820
10821 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10822
10823 static void
10824 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10825 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10826 int no_rtl)
10827 {
10828 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10829 CUMULATIVE_ARGS local_cum;
10830 int gr_saved = cfun->va_list_gpr_size;
10831 int vr_saved = cfun->va_list_fpr_size;
10832
10833 /* The caller has advanced CUM up to, but not beyond, the last named
10834 argument. Advance a local copy of CUM past the last "real" named
10835 argument, to find out how many registers are left over. */
10836 local_cum = *cum;
10837 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10838
10839 /* Found out how many registers we need to save.
10840 Honor tree-stdvar analysis results. */
10841 if (cfun->va_list_gpr_size)
10842 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10843 cfun->va_list_gpr_size / UNITS_PER_WORD);
10844 if (cfun->va_list_fpr_size)
10845 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10846 cfun->va_list_fpr_size / UNITS_PER_VREG);
10847
10848 if (!TARGET_FLOAT)
10849 {
10850 gcc_assert (local_cum.aapcs_nvrn == 0);
10851 vr_saved = 0;
10852 }
10853
10854 if (!no_rtl)
10855 {
10856 if (gr_saved > 0)
10857 {
10858 rtx ptr, mem;
10859
10860 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10861 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10862 - gr_saved * UNITS_PER_WORD);
10863 mem = gen_frame_mem (BLKmode, ptr);
10864 set_mem_alias_set (mem, get_varargs_alias_set ());
10865
10866 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10867 mem, gr_saved);
10868 }
10869 if (vr_saved > 0)
10870 {
10871 /* We can't use move_block_from_reg, because it will use
10872 the wrong mode, storing D regs only. */
10873 machine_mode mode = TImode;
10874 int off, i, vr_start;
10875
10876 /* Set OFF to the offset from virtual_incoming_args_rtx of
10877 the first vector register. The VR save area lies below
10878 the GR one, and is aligned to 16 bytes. */
10879 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10880 STACK_BOUNDARY / BITS_PER_UNIT);
10881 off -= vr_saved * UNITS_PER_VREG;
10882
10883 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10884 for (i = 0; i < vr_saved; ++i)
10885 {
10886 rtx ptr, mem;
10887
10888 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10889 mem = gen_frame_mem (mode, ptr);
10890 set_mem_alias_set (mem, get_varargs_alias_set ());
10891 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10892 off += UNITS_PER_VREG;
10893 }
10894 }
10895 }
10896
10897 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10898 any complication of having crtl->args.pretend_args_size changed. */
10899 cfun->machine->frame.saved_varargs_size
10900 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10901 STACK_BOUNDARY / BITS_PER_UNIT)
10902 + vr_saved * UNITS_PER_VREG);
10903 }
10904
10905 static void
10906 aarch64_conditional_register_usage (void)
10907 {
10908 int i;
10909 if (!TARGET_FLOAT)
10910 {
10911 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10912 {
10913 fixed_regs[i] = 1;
10914 call_used_regs[i] = 1;
10915 }
10916 }
10917 }
10918
10919 /* Walk down the type tree of TYPE counting consecutive base elements.
10920 If *MODEP is VOIDmode, then set it to the first valid floating point
10921 type. If a non-floating point type is found, or if a floating point
10922 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10923 otherwise return the count in the sub-tree. */
10924 static int
10925 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10926 {
10927 machine_mode mode;
10928 HOST_WIDE_INT size;
10929
10930 switch (TREE_CODE (type))
10931 {
10932 case REAL_TYPE:
10933 mode = TYPE_MODE (type);
10934 if (mode != DFmode && mode != SFmode
10935 && mode != TFmode && mode != HFmode)
10936 return -1;
10937
10938 if (*modep == VOIDmode)
10939 *modep = mode;
10940
10941 if (*modep == mode)
10942 return 1;
10943
10944 break;
10945
10946 case COMPLEX_TYPE:
10947 mode = TYPE_MODE (TREE_TYPE (type));
10948 if (mode != DFmode && mode != SFmode
10949 && mode != TFmode && mode != HFmode)
10950 return -1;
10951
10952 if (*modep == VOIDmode)
10953 *modep = mode;
10954
10955 if (*modep == mode)
10956 return 2;
10957
10958 break;
10959
10960 case VECTOR_TYPE:
10961 /* Use V2SImode and V4SImode as representatives of all 64-bit
10962 and 128-bit vector types. */
10963 size = int_size_in_bytes (type);
10964 switch (size)
10965 {
10966 case 8:
10967 mode = V2SImode;
10968 break;
10969 case 16:
10970 mode = V4SImode;
10971 break;
10972 default:
10973 return -1;
10974 }
10975
10976 if (*modep == VOIDmode)
10977 *modep = mode;
10978
10979 /* Vector modes are considered to be opaque: two vectors are
10980 equivalent for the purposes of being homogeneous aggregates
10981 if they are the same size. */
10982 if (*modep == mode)
10983 return 1;
10984
10985 break;
10986
10987 case ARRAY_TYPE:
10988 {
10989 int count;
10990 tree index = TYPE_DOMAIN (type);
10991
10992 /* Can't handle incomplete types nor sizes that are not
10993 fixed. */
10994 if (!COMPLETE_TYPE_P (type)
10995 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10996 return -1;
10997
10998 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10999 if (count == -1
11000 || !index
11001 || !TYPE_MAX_VALUE (index)
11002 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11003 || !TYPE_MIN_VALUE (index)
11004 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11005 || count < 0)
11006 return -1;
11007
11008 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11009 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11010
11011 /* There must be no padding. */
11012 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11013 return -1;
11014
11015 return count;
11016 }
11017
11018 case RECORD_TYPE:
11019 {
11020 int count = 0;
11021 int sub_count;
11022 tree field;
11023
11024 /* Can't handle incomplete types nor sizes that are not
11025 fixed. */
11026 if (!COMPLETE_TYPE_P (type)
11027 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11028 return -1;
11029
11030 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11031 {
11032 if (TREE_CODE (field) != FIELD_DECL)
11033 continue;
11034
11035 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11036 if (sub_count < 0)
11037 return -1;
11038 count += sub_count;
11039 }
11040
11041 /* There must be no padding. */
11042 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11043 return -1;
11044
11045 return count;
11046 }
11047
11048 case UNION_TYPE:
11049 case QUAL_UNION_TYPE:
11050 {
11051 /* These aren't very interesting except in a degenerate case. */
11052 int count = 0;
11053 int sub_count;
11054 tree field;
11055
11056 /* Can't handle incomplete types nor sizes that are not
11057 fixed. */
11058 if (!COMPLETE_TYPE_P (type)
11059 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11060 return -1;
11061
11062 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11063 {
11064 if (TREE_CODE (field) != FIELD_DECL)
11065 continue;
11066
11067 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11068 if (sub_count < 0)
11069 return -1;
11070 count = count > sub_count ? count : sub_count;
11071 }
11072
11073 /* There must be no padding. */
11074 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11075 return -1;
11076
11077 return count;
11078 }
11079
11080 default:
11081 break;
11082 }
11083
11084 return -1;
11085 }
11086
11087 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11088 type as described in AAPCS64 \S 4.1.2.
11089
11090 See the comment above aarch64_composite_type_p for the notes on MODE. */
11091
11092 static bool
11093 aarch64_short_vector_p (const_tree type,
11094 machine_mode mode)
11095 {
11096 HOST_WIDE_INT size = -1;
11097
11098 if (type && TREE_CODE (type) == VECTOR_TYPE)
11099 size = int_size_in_bytes (type);
11100 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11101 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11102 size = GET_MODE_SIZE (mode);
11103
11104 return (size == 8 || size == 16);
11105 }
11106
11107 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11108 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
11109 array types. The C99 floating-point complex types are also considered
11110 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
11111 types, which are GCC extensions and out of the scope of AAPCS64, are
11112 treated as composite types here as well.
11113
11114 Note that MODE itself is not sufficient in determining whether a type
11115 is such a composite type or not. This is because
11116 stor-layout.c:compute_record_mode may have already changed the MODE
11117 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
11118 structure with only one field may have its MODE set to the mode of the
11119 field. Also an integer mode whose size matches the size of the
11120 RECORD_TYPE type may be used to substitute the original mode
11121 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
11122 solely relied on. */
11123
11124 static bool
11125 aarch64_composite_type_p (const_tree type,
11126 machine_mode mode)
11127 {
11128 if (aarch64_short_vector_p (type, mode))
11129 return false;
11130
11131 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11132 return true;
11133
11134 if (mode == BLKmode
11135 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11136 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11137 return true;
11138
11139 return false;
11140 }
11141
11142 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11143 shall be passed or returned in simd/fp register(s) (providing these
11144 parameter passing registers are available).
11145
11146 Upon successful return, *COUNT returns the number of needed registers,
11147 *BASE_MODE returns the mode of the individual register and when IS_HAF
11148 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11149 floating-point aggregate or a homogeneous short-vector aggregate. */
11150
11151 static bool
11152 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11153 const_tree type,
11154 machine_mode *base_mode,
11155 int *count,
11156 bool *is_ha)
11157 {
11158 machine_mode new_mode = VOIDmode;
11159 bool composite_p = aarch64_composite_type_p (type, mode);
11160
11161 if (is_ha != NULL) *is_ha = false;
11162
11163 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11164 || aarch64_short_vector_p (type, mode))
11165 {
11166 *count = 1;
11167 new_mode = mode;
11168 }
11169 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11170 {
11171 if (is_ha != NULL) *is_ha = true;
11172 *count = 2;
11173 new_mode = GET_MODE_INNER (mode);
11174 }
11175 else if (type && composite_p)
11176 {
11177 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11178
11179 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11180 {
11181 if (is_ha != NULL) *is_ha = true;
11182 *count = ag_count;
11183 }
11184 else
11185 return false;
11186 }
11187 else
11188 return false;
11189
11190 *base_mode = new_mode;
11191 return true;
11192 }
11193
11194 /* Implement TARGET_STRUCT_VALUE_RTX. */
11195
11196 static rtx
11197 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11198 int incoming ATTRIBUTE_UNUSED)
11199 {
11200 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11201 }
11202
11203 /* Implements target hook vector_mode_supported_p. */
11204 static bool
11205 aarch64_vector_mode_supported_p (machine_mode mode)
11206 {
11207 if (TARGET_SIMD
11208 && (mode == V4SImode || mode == V8HImode
11209 || mode == V16QImode || mode == V2DImode
11210 || mode == V2SImode || mode == V4HImode
11211 || mode == V8QImode || mode == V2SFmode
11212 || mode == V4SFmode || mode == V2DFmode
11213 || mode == V4HFmode || mode == V8HFmode
11214 || mode == V1DFmode))
11215 return true;
11216
11217 return false;
11218 }
11219
11220 /* Return appropriate SIMD container
11221 for MODE within a vector of WIDTH bits. */
11222 static machine_mode
11223 aarch64_simd_container_mode (machine_mode mode, unsigned width)
11224 {
11225 gcc_assert (width == 64 || width == 128);
11226 if (TARGET_SIMD)
11227 {
11228 if (width == 128)
11229 switch (mode)
11230 {
11231 case E_DFmode:
11232 return V2DFmode;
11233 case E_SFmode:
11234 return V4SFmode;
11235 case E_HFmode:
11236 return V8HFmode;
11237 case E_SImode:
11238 return V4SImode;
11239 case E_HImode:
11240 return V8HImode;
11241 case E_QImode:
11242 return V16QImode;
11243 case E_DImode:
11244 return V2DImode;
11245 default:
11246 break;
11247 }
11248 else
11249 switch (mode)
11250 {
11251 case E_SFmode:
11252 return V2SFmode;
11253 case E_HFmode:
11254 return V4HFmode;
11255 case E_SImode:
11256 return V2SImode;
11257 case E_HImode:
11258 return V4HImode;
11259 case E_QImode:
11260 return V8QImode;
11261 default:
11262 break;
11263 }
11264 }
11265 return word_mode;
11266 }
11267
11268 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11269 static machine_mode
11270 aarch64_preferred_simd_mode (scalar_mode mode)
11271 {
11272 return aarch64_simd_container_mode (mode, 128);
11273 }
11274
11275 /* Return the bitmask of possible vector sizes for the vectorizer
11276 to iterate over. */
11277 static unsigned int
11278 aarch64_autovectorize_vector_sizes (void)
11279 {
11280 return (16 | 8);
11281 }
11282
11283 /* Implement TARGET_MANGLE_TYPE. */
11284
11285 static const char *
11286 aarch64_mangle_type (const_tree type)
11287 {
11288 /* The AArch64 ABI documents say that "__va_list" has to be
11289 managled as if it is in the "std" namespace. */
11290 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11291 return "St9__va_list";
11292
11293 /* Half-precision float. */
11294 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11295 return "Dh";
11296
11297 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11298 builtin types. */
11299 if (TYPE_NAME (type) != NULL)
11300 return aarch64_mangle_builtin_type (type);
11301
11302 /* Use the default mangling. */
11303 return NULL;
11304 }
11305
11306 /* Find the first rtx_insn before insn that will generate an assembly
11307 instruction. */
11308
11309 static rtx_insn *
11310 aarch64_prev_real_insn (rtx_insn *insn)
11311 {
11312 if (!insn)
11313 return NULL;
11314
11315 do
11316 {
11317 insn = prev_real_insn (insn);
11318 }
11319 while (insn && recog_memoized (insn) < 0);
11320
11321 return insn;
11322 }
11323
11324 static bool
11325 is_madd_op (enum attr_type t1)
11326 {
11327 unsigned int i;
11328 /* A number of these may be AArch32 only. */
11329 enum attr_type mlatypes[] = {
11330 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11331 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11332 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11333 };
11334
11335 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11336 {
11337 if (t1 == mlatypes[i])
11338 return true;
11339 }
11340
11341 return false;
11342 }
11343
11344 /* Check if there is a register dependency between a load and the insn
11345 for which we hold recog_data. */
11346
11347 static bool
11348 dep_between_memop_and_curr (rtx memop)
11349 {
11350 rtx load_reg;
11351 int opno;
11352
11353 gcc_assert (GET_CODE (memop) == SET);
11354
11355 if (!REG_P (SET_DEST (memop)))
11356 return false;
11357
11358 load_reg = SET_DEST (memop);
11359 for (opno = 1; opno < recog_data.n_operands; opno++)
11360 {
11361 rtx operand = recog_data.operand[opno];
11362 if (REG_P (operand)
11363 && reg_overlap_mentioned_p (load_reg, operand))
11364 return true;
11365
11366 }
11367 return false;
11368 }
11369
11370
11371 /* When working around the Cortex-A53 erratum 835769,
11372 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11373 instruction and has a preceding memory instruction such that a NOP
11374 should be inserted between them. */
11375
11376 bool
11377 aarch64_madd_needs_nop (rtx_insn* insn)
11378 {
11379 enum attr_type attr_type;
11380 rtx_insn *prev;
11381 rtx body;
11382
11383 if (!TARGET_FIX_ERR_A53_835769)
11384 return false;
11385
11386 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11387 return false;
11388
11389 attr_type = get_attr_type (insn);
11390 if (!is_madd_op (attr_type))
11391 return false;
11392
11393 prev = aarch64_prev_real_insn (insn);
11394 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11395 Restore recog state to INSN to avoid state corruption. */
11396 extract_constrain_insn_cached (insn);
11397
11398 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11399 return false;
11400
11401 body = single_set (prev);
11402
11403 /* If the previous insn is a memory op and there is no dependency between
11404 it and the DImode madd, emit a NOP between them. If body is NULL then we
11405 have a complex memory operation, probably a load/store pair.
11406 Be conservative for now and emit a NOP. */
11407 if (GET_MODE (recog_data.operand[0]) == DImode
11408 && (!body || !dep_between_memop_and_curr (body)))
11409 return true;
11410
11411 return false;
11412
11413 }
11414
11415
11416 /* Implement FINAL_PRESCAN_INSN. */
11417
11418 void
11419 aarch64_final_prescan_insn (rtx_insn *insn)
11420 {
11421 if (aarch64_madd_needs_nop (insn))
11422 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11423 }
11424
11425
11426 /* Return the equivalent letter for size. */
11427 static char
11428 sizetochar (int size)
11429 {
11430 switch (size)
11431 {
11432 case 64: return 'd';
11433 case 32: return 's';
11434 case 16: return 'h';
11435 case 8 : return 'b';
11436 default: gcc_unreachable ();
11437 }
11438 }
11439
11440 /* Return true iff x is a uniform vector of floating-point
11441 constants, and the constant can be represented in
11442 quarter-precision form. Note, as aarch64_float_const_representable
11443 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11444 static bool
11445 aarch64_vect_float_const_representable_p (rtx x)
11446 {
11447 rtx elt;
11448 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11449 && const_vec_duplicate_p (x, &elt)
11450 && aarch64_float_const_representable_p (elt));
11451 }
11452
11453 /* Return true for valid and false for invalid. */
11454 bool
11455 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11456 struct simd_immediate_info *info)
11457 {
11458 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11459 matches = 1; \
11460 for (i = 0; i < idx; i += (STRIDE)) \
11461 if (!(TEST)) \
11462 matches = 0; \
11463 if (matches) \
11464 { \
11465 immtype = (CLASS); \
11466 elsize = (ELSIZE); \
11467 eshift = (SHIFT); \
11468 emvn = (NEG); \
11469 break; \
11470 }
11471
11472 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11473 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11474 unsigned char bytes[16];
11475 int immtype = -1, matches;
11476 unsigned int invmask = inverse ? 0xff : 0;
11477 int eshift, emvn;
11478
11479 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11480 {
11481 if (! (aarch64_simd_imm_zero_p (op, mode)
11482 || aarch64_vect_float_const_representable_p (op)))
11483 return false;
11484
11485 if (info)
11486 {
11487 rtx elt = CONST_VECTOR_ELT (op, 0);
11488 scalar_float_mode elt_mode
11489 = as_a <scalar_float_mode> (GET_MODE (elt));
11490
11491 info->value = elt;
11492 info->element_width = GET_MODE_BITSIZE (elt_mode);
11493 info->mvn = false;
11494 info->shift = 0;
11495 }
11496
11497 return true;
11498 }
11499
11500 /* Splat vector constant out into a byte vector. */
11501 for (i = 0; i < n_elts; i++)
11502 {
11503 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11504 it must be laid out in the vector register in reverse order. */
11505 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11506 unsigned HOST_WIDE_INT elpart;
11507
11508 gcc_assert (CONST_INT_P (el));
11509 elpart = INTVAL (el);
11510
11511 for (unsigned int byte = 0; byte < innersize; byte++)
11512 {
11513 bytes[idx++] = (elpart & 0xff) ^ invmask;
11514 elpart >>= BITS_PER_UNIT;
11515 }
11516
11517 }
11518
11519 /* Sanity check. */
11520 gcc_assert (idx == GET_MODE_SIZE (mode));
11521
11522 do
11523 {
11524 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11525 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11526
11527 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11528 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11529
11530 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11531 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11532
11533 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11534 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11535
11536 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11537
11538 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11539
11540 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11541 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11542
11543 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11544 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11545
11546 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11547 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11548
11549 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11550 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11551
11552 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11553
11554 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11555
11556 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11557 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11558
11559 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11560 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11561
11562 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11563 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11564
11565 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11566 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11567
11568 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11569
11570 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11571 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11572 }
11573 while (0);
11574
11575 if (immtype == -1)
11576 return false;
11577
11578 if (info)
11579 {
11580 info->element_width = elsize;
11581 info->mvn = emvn != 0;
11582 info->shift = eshift;
11583
11584 unsigned HOST_WIDE_INT imm = 0;
11585
11586 if (immtype >= 12 && immtype <= 15)
11587 info->msl = true;
11588
11589 /* Un-invert bytes of recognized vector, if necessary. */
11590 if (invmask != 0)
11591 for (i = 0; i < idx; i++)
11592 bytes[i] ^= invmask;
11593
11594 if (immtype == 17)
11595 {
11596 /* FIXME: Broken on 32-bit H_W_I hosts. */
11597 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11598
11599 for (i = 0; i < 8; i++)
11600 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11601 << (i * BITS_PER_UNIT);
11602
11603
11604 info->value = GEN_INT (imm);
11605 }
11606 else
11607 {
11608 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11609 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11610
11611 /* Construct 'abcdefgh' because the assembler cannot handle
11612 generic constants. */
11613 if (info->mvn)
11614 imm = ~imm;
11615 imm = (imm >> info->shift) & 0xff;
11616 info->value = GEN_INT (imm);
11617 }
11618 }
11619
11620 return true;
11621 #undef CHECK
11622 }
11623
11624 /* Check of immediate shift constants are within range. */
11625 bool
11626 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11627 {
11628 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11629 if (left)
11630 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11631 else
11632 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11633 }
11634
11635 /* Return true if X is a uniform vector where all elements
11636 are either the floating-point constant 0.0 or the
11637 integer constant 0. */
11638 bool
11639 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11640 {
11641 return x == CONST0_RTX (mode);
11642 }
11643
11644
11645 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11646 operation of width WIDTH at bit position POS. */
11647
11648 rtx
11649 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11650 {
11651 gcc_assert (CONST_INT_P (width));
11652 gcc_assert (CONST_INT_P (pos));
11653
11654 unsigned HOST_WIDE_INT mask
11655 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11656 return GEN_INT (mask << UINTVAL (pos));
11657 }
11658
11659 bool
11660 aarch64_mov_operand_p (rtx x, machine_mode mode)
11661 {
11662 if (GET_CODE (x) == HIGH
11663 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11664 return true;
11665
11666 if (CONST_INT_P (x))
11667 return true;
11668
11669 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11670 return true;
11671
11672 return aarch64_classify_symbolic_expression (x)
11673 == SYMBOL_TINY_ABSOLUTE;
11674 }
11675
11676 /* Return a const_int vector of VAL. */
11677 rtx
11678 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11679 {
11680 int nunits = GET_MODE_NUNITS (mode);
11681 rtvec v = rtvec_alloc (nunits);
11682 int i;
11683
11684 rtx cache = GEN_INT (val);
11685
11686 for (i=0; i < nunits; i++)
11687 RTVEC_ELT (v, i) = cache;
11688
11689 return gen_rtx_CONST_VECTOR (mode, v);
11690 }
11691
11692 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11693
11694 bool
11695 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11696 {
11697 machine_mode vmode;
11698
11699 gcc_assert (!VECTOR_MODE_P (mode));
11700 vmode = aarch64_preferred_simd_mode (as_a <scalar_mode> (mode));
11701 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11702 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11703 }
11704
11705 /* Construct and return a PARALLEL RTX vector with elements numbering the
11706 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11707 the vector - from the perspective of the architecture. This does not
11708 line up with GCC's perspective on lane numbers, so we end up with
11709 different masks depending on our target endian-ness. The diagram
11710 below may help. We must draw the distinction when building masks
11711 which select one half of the vector. An instruction selecting
11712 architectural low-lanes for a big-endian target, must be described using
11713 a mask selecting GCC high-lanes.
11714
11715 Big-Endian Little-Endian
11716
11717 GCC 0 1 2 3 3 2 1 0
11718 | x | x | x | x | | x | x | x | x |
11719 Architecture 3 2 1 0 3 2 1 0
11720
11721 Low Mask: { 2, 3 } { 0, 1 }
11722 High Mask: { 0, 1 } { 2, 3 }
11723 */
11724
11725 rtx
11726 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11727 {
11728 int nunits = GET_MODE_NUNITS (mode);
11729 rtvec v = rtvec_alloc (nunits / 2);
11730 int high_base = nunits / 2;
11731 int low_base = 0;
11732 int base;
11733 rtx t1;
11734 int i;
11735
11736 if (BYTES_BIG_ENDIAN)
11737 base = high ? low_base : high_base;
11738 else
11739 base = high ? high_base : low_base;
11740
11741 for (i = 0; i < nunits / 2; i++)
11742 RTVEC_ELT (v, i) = GEN_INT (base + i);
11743
11744 t1 = gen_rtx_PARALLEL (mode, v);
11745 return t1;
11746 }
11747
11748 /* Check OP for validity as a PARALLEL RTX vector with elements
11749 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11750 from the perspective of the architecture. See the diagram above
11751 aarch64_simd_vect_par_cnst_half for more details. */
11752
11753 bool
11754 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11755 bool high)
11756 {
11757 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11758 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11759 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11760 int i = 0;
11761
11762 if (!VECTOR_MODE_P (mode))
11763 return false;
11764
11765 if (count_op != count_ideal)
11766 return false;
11767
11768 for (i = 0; i < count_ideal; i++)
11769 {
11770 rtx elt_op = XVECEXP (op, 0, i);
11771 rtx elt_ideal = XVECEXP (ideal, 0, i);
11772
11773 if (!CONST_INT_P (elt_op)
11774 || INTVAL (elt_ideal) != INTVAL (elt_op))
11775 return false;
11776 }
11777 return true;
11778 }
11779
11780 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11781 HIGH (exclusive). */
11782 void
11783 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11784 const_tree exp)
11785 {
11786 HOST_WIDE_INT lane;
11787 gcc_assert (CONST_INT_P (operand));
11788 lane = INTVAL (operand);
11789
11790 if (lane < low || lane >= high)
11791 {
11792 if (exp)
11793 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11794 else
11795 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11796 }
11797 }
11798
11799 /* Return TRUE if OP is a valid vector addressing mode. */
11800 bool
11801 aarch64_simd_mem_operand_p (rtx op)
11802 {
11803 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11804 || REG_P (XEXP (op, 0)));
11805 }
11806
11807 /* Emit a register copy from operand to operand, taking care not to
11808 early-clobber source registers in the process.
11809
11810 COUNT is the number of components into which the copy needs to be
11811 decomposed. */
11812 void
11813 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11814 unsigned int count)
11815 {
11816 unsigned int i;
11817 int rdest = REGNO (operands[0]);
11818 int rsrc = REGNO (operands[1]);
11819
11820 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11821 || rdest < rsrc)
11822 for (i = 0; i < count; i++)
11823 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11824 gen_rtx_REG (mode, rsrc + i));
11825 else
11826 for (i = 0; i < count; i++)
11827 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11828 gen_rtx_REG (mode, rsrc + count - i - 1));
11829 }
11830
11831 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11832 one of VSTRUCT modes: OI, CI, or XI. */
11833 int
11834 aarch64_simd_attr_length_rglist (machine_mode mode)
11835 {
11836 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11837 }
11838
11839 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11840 alignment of a vector to 128 bits. */
11841 static HOST_WIDE_INT
11842 aarch64_simd_vector_alignment (const_tree type)
11843 {
11844 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11845 return MIN (align, 128);
11846 }
11847
11848 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11849 static bool
11850 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11851 {
11852 if (is_packed)
11853 return false;
11854
11855 /* We guarantee alignment for vectors up to 128-bits. */
11856 if (tree_int_cst_compare (TYPE_SIZE (type),
11857 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11858 return false;
11859
11860 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11861 return true;
11862 }
11863
11864 /* Return true if the vector misalignment factor is supported by the
11865 target. */
11866 static bool
11867 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11868 const_tree type, int misalignment,
11869 bool is_packed)
11870 {
11871 if (TARGET_SIMD && STRICT_ALIGNMENT)
11872 {
11873 /* Return if movmisalign pattern is not supported for this mode. */
11874 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11875 return false;
11876
11877 if (misalignment == -1)
11878 {
11879 /* Misalignment factor is unknown at compile time but we know
11880 it's word aligned. */
11881 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11882 {
11883 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11884
11885 if (element_size != 64)
11886 return true;
11887 }
11888 return false;
11889 }
11890 }
11891 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11892 is_packed);
11893 }
11894
11895 /* If VALS is a vector constant that can be loaded into a register
11896 using DUP, generate instructions to do so and return an RTX to
11897 assign to the register. Otherwise return NULL_RTX. */
11898 static rtx
11899 aarch64_simd_dup_constant (rtx vals)
11900 {
11901 machine_mode mode = GET_MODE (vals);
11902 machine_mode inner_mode = GET_MODE_INNER (mode);
11903 rtx x;
11904
11905 if (!const_vec_duplicate_p (vals, &x))
11906 return NULL_RTX;
11907
11908 /* We can load this constant by using DUP and a constant in a
11909 single ARM register. This will be cheaper than a vector
11910 load. */
11911 x = copy_to_mode_reg (inner_mode, x);
11912 return gen_rtx_VEC_DUPLICATE (mode, x);
11913 }
11914
11915
11916 /* Generate code to load VALS, which is a PARALLEL containing only
11917 constants (for vec_init) or CONST_VECTOR, efficiently into a
11918 register. Returns an RTX to copy into the register, or NULL_RTX
11919 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11920 static rtx
11921 aarch64_simd_make_constant (rtx vals)
11922 {
11923 machine_mode mode = GET_MODE (vals);
11924 rtx const_dup;
11925 rtx const_vec = NULL_RTX;
11926 int n_elts = GET_MODE_NUNITS (mode);
11927 int n_const = 0;
11928 int i;
11929
11930 if (GET_CODE (vals) == CONST_VECTOR)
11931 const_vec = vals;
11932 else if (GET_CODE (vals) == PARALLEL)
11933 {
11934 /* A CONST_VECTOR must contain only CONST_INTs and
11935 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11936 Only store valid constants in a CONST_VECTOR. */
11937 for (i = 0; i < n_elts; ++i)
11938 {
11939 rtx x = XVECEXP (vals, 0, i);
11940 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11941 n_const++;
11942 }
11943 if (n_const == n_elts)
11944 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11945 }
11946 else
11947 gcc_unreachable ();
11948
11949 if (const_vec != NULL_RTX
11950 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11951 /* Load using MOVI/MVNI. */
11952 return const_vec;
11953 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11954 /* Loaded using DUP. */
11955 return const_dup;
11956 else if (const_vec != NULL_RTX)
11957 /* Load from constant pool. We can not take advantage of single-cycle
11958 LD1 because we need a PC-relative addressing mode. */
11959 return const_vec;
11960 else
11961 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11962 We can not construct an initializer. */
11963 return NULL_RTX;
11964 }
11965
11966 /* Expand a vector initialisation sequence, such that TARGET is
11967 initialised to contain VALS. */
11968
11969 void
11970 aarch64_expand_vector_init (rtx target, rtx vals)
11971 {
11972 machine_mode mode = GET_MODE (target);
11973 machine_mode inner_mode = GET_MODE_INNER (mode);
11974 /* The number of vector elements. */
11975 int n_elts = GET_MODE_NUNITS (mode);
11976 /* The number of vector elements which are not constant. */
11977 int n_var = 0;
11978 rtx any_const = NULL_RTX;
11979 /* The first element of vals. */
11980 rtx v0 = XVECEXP (vals, 0, 0);
11981 bool all_same = true;
11982
11983 /* Count the number of variable elements to initialise. */
11984 for (int i = 0; i < n_elts; ++i)
11985 {
11986 rtx x = XVECEXP (vals, 0, i);
11987 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11988 ++n_var;
11989 else
11990 any_const = x;
11991
11992 all_same &= rtx_equal_p (x, v0);
11993 }
11994
11995 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11996 how best to handle this. */
11997 if (n_var == 0)
11998 {
11999 rtx constant = aarch64_simd_make_constant (vals);
12000 if (constant != NULL_RTX)
12001 {
12002 emit_move_insn (target, constant);
12003 return;
12004 }
12005 }
12006
12007 /* Splat a single non-constant element if we can. */
12008 if (all_same)
12009 {
12010 rtx x = copy_to_mode_reg (inner_mode, v0);
12011 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12012 return;
12013 }
12014
12015 enum insn_code icode = optab_handler (vec_set_optab, mode);
12016 gcc_assert (icode != CODE_FOR_nothing);
12017
12018 /* If there are only variable elements, try to optimize
12019 the insertion using dup for the most common element
12020 followed by insertions. */
12021
12022 /* The algorithm will fill matches[*][0] with the earliest matching element,
12023 and matches[X][1] with the count of duplicate elements (if X is the
12024 earliest element which has duplicates). */
12025
12026 if (n_var == n_elts && n_elts <= 16)
12027 {
12028 int matches[16][2] = {0};
12029 for (int i = 0; i < n_elts; i++)
12030 {
12031 for (int j = 0; j <= i; j++)
12032 {
12033 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12034 {
12035 matches[i][0] = j;
12036 matches[j][1]++;
12037 break;
12038 }
12039 }
12040 }
12041 int maxelement = 0;
12042 int maxv = 0;
12043 for (int i = 0; i < n_elts; i++)
12044 if (matches[i][1] > maxv)
12045 {
12046 maxelement = i;
12047 maxv = matches[i][1];
12048 }
12049
12050 /* Create a duplicate of the most common element. */
12051 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12052 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12053
12054 /* Insert the rest. */
12055 for (int i = 0; i < n_elts; i++)
12056 {
12057 rtx x = XVECEXP (vals, 0, i);
12058 if (matches[i][0] == maxelement)
12059 continue;
12060 x = copy_to_mode_reg (inner_mode, x);
12061 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12062 }
12063 return;
12064 }
12065
12066 /* Initialise a vector which is part-variable. We want to first try
12067 to build those lanes which are constant in the most efficient way we
12068 can. */
12069 if (n_var != n_elts)
12070 {
12071 rtx copy = copy_rtx (vals);
12072
12073 /* Load constant part of vector. We really don't care what goes into the
12074 parts we will overwrite, but we're more likely to be able to load the
12075 constant efficiently if it has fewer, larger, repeating parts
12076 (see aarch64_simd_valid_immediate). */
12077 for (int i = 0; i < n_elts; i++)
12078 {
12079 rtx x = XVECEXP (vals, 0, i);
12080 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12081 continue;
12082 rtx subst = any_const;
12083 for (int bit = n_elts / 2; bit > 0; bit /= 2)
12084 {
12085 /* Look in the copied vector, as more elements are const. */
12086 rtx test = XVECEXP (copy, 0, i ^ bit);
12087 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12088 {
12089 subst = test;
12090 break;
12091 }
12092 }
12093 XVECEXP (copy, 0, i) = subst;
12094 }
12095 aarch64_expand_vector_init (target, copy);
12096 }
12097
12098 /* Insert the variable lanes directly. */
12099 for (int i = 0; i < n_elts; i++)
12100 {
12101 rtx x = XVECEXP (vals, 0, i);
12102 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12103 continue;
12104 x = copy_to_mode_reg (inner_mode, x);
12105 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12106 }
12107 }
12108
12109 static unsigned HOST_WIDE_INT
12110 aarch64_shift_truncation_mask (machine_mode mode)
12111 {
12112 return
12113 (!SHIFT_COUNT_TRUNCATED
12114 || aarch64_vector_mode_supported_p (mode)
12115 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
12116 }
12117
12118 /* Select a format to encode pointers in exception handling data. */
12119 int
12120 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12121 {
12122 int type;
12123 switch (aarch64_cmodel)
12124 {
12125 case AARCH64_CMODEL_TINY:
12126 case AARCH64_CMODEL_TINY_PIC:
12127 case AARCH64_CMODEL_SMALL:
12128 case AARCH64_CMODEL_SMALL_PIC:
12129 case AARCH64_CMODEL_SMALL_SPIC:
12130 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
12131 for everything. */
12132 type = DW_EH_PE_sdata4;
12133 break;
12134 default:
12135 /* No assumptions here. 8-byte relocs required. */
12136 type = DW_EH_PE_sdata8;
12137 break;
12138 }
12139 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12140 }
12141
12142 /* The last .arch and .tune assembly strings that we printed. */
12143 static std::string aarch64_last_printed_arch_string;
12144 static std::string aarch64_last_printed_tune_string;
12145
12146 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12147 by the function fndecl. */
12148
12149 void
12150 aarch64_declare_function_name (FILE *stream, const char* name,
12151 tree fndecl)
12152 {
12153 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12154
12155 struct cl_target_option *targ_options;
12156 if (target_parts)
12157 targ_options = TREE_TARGET_OPTION (target_parts);
12158 else
12159 targ_options = TREE_TARGET_OPTION (target_option_current_node);
12160 gcc_assert (targ_options);
12161
12162 const struct processor *this_arch
12163 = aarch64_get_arch (targ_options->x_explicit_arch);
12164
12165 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12166 std::string extension
12167 = aarch64_get_extension_string_for_isa_flags (isa_flags,
12168 this_arch->flags);
12169 /* Only update the assembler .arch string if it is distinct from the last
12170 such string we printed. */
12171 std::string to_print = this_arch->name + extension;
12172 if (to_print != aarch64_last_printed_arch_string)
12173 {
12174 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12175 aarch64_last_printed_arch_string = to_print;
12176 }
12177
12178 /* Print the cpu name we're tuning for in the comments, might be
12179 useful to readers of the generated asm. Do it only when it changes
12180 from function to function and verbose assembly is requested. */
12181 const struct processor *this_tune
12182 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12183
12184 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12185 {
12186 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12187 this_tune->name);
12188 aarch64_last_printed_tune_string = this_tune->name;
12189 }
12190
12191 /* Don't forget the type directive for ELF. */
12192 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12193 ASM_OUTPUT_LABEL (stream, name);
12194 }
12195
12196 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12197
12198 static void
12199 aarch64_start_file (void)
12200 {
12201 struct cl_target_option *default_options
12202 = TREE_TARGET_OPTION (target_option_default_node);
12203
12204 const struct processor *default_arch
12205 = aarch64_get_arch (default_options->x_explicit_arch);
12206 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12207 std::string extension
12208 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12209 default_arch->flags);
12210
12211 aarch64_last_printed_arch_string = default_arch->name + extension;
12212 aarch64_last_printed_tune_string = "";
12213 asm_fprintf (asm_out_file, "\t.arch %s\n",
12214 aarch64_last_printed_arch_string.c_str ());
12215
12216 default_file_start ();
12217 }
12218
12219 /* Emit load exclusive. */
12220
12221 static void
12222 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12223 rtx mem, rtx model_rtx)
12224 {
12225 rtx (*gen) (rtx, rtx, rtx);
12226
12227 switch (mode)
12228 {
12229 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12230 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12231 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12232 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12233 default:
12234 gcc_unreachable ();
12235 }
12236
12237 emit_insn (gen (rval, mem, model_rtx));
12238 }
12239
12240 /* Emit store exclusive. */
12241
12242 static void
12243 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12244 rtx rval, rtx mem, rtx model_rtx)
12245 {
12246 rtx (*gen) (rtx, rtx, rtx, rtx);
12247
12248 switch (mode)
12249 {
12250 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12251 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12252 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12253 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12254 default:
12255 gcc_unreachable ();
12256 }
12257
12258 emit_insn (gen (bval, rval, mem, model_rtx));
12259 }
12260
12261 /* Mark the previous jump instruction as unlikely. */
12262
12263 static void
12264 aarch64_emit_unlikely_jump (rtx insn)
12265 {
12266 rtx_insn *jump = emit_jump_insn (insn);
12267 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12268 }
12269
12270 /* Expand a compare and swap pattern. */
12271
12272 void
12273 aarch64_expand_compare_and_swap (rtx operands[])
12274 {
12275 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12276 machine_mode mode, cmp_mode;
12277 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12278 int idx;
12279 gen_cas_fn gen;
12280 const gen_cas_fn split_cas[] =
12281 {
12282 gen_aarch64_compare_and_swapqi,
12283 gen_aarch64_compare_and_swaphi,
12284 gen_aarch64_compare_and_swapsi,
12285 gen_aarch64_compare_and_swapdi
12286 };
12287 const gen_cas_fn atomic_cas[] =
12288 {
12289 gen_aarch64_compare_and_swapqi_lse,
12290 gen_aarch64_compare_and_swaphi_lse,
12291 gen_aarch64_compare_and_swapsi_lse,
12292 gen_aarch64_compare_and_swapdi_lse
12293 };
12294
12295 bval = operands[0];
12296 rval = operands[1];
12297 mem = operands[2];
12298 oldval = operands[3];
12299 newval = operands[4];
12300 is_weak = operands[5];
12301 mod_s = operands[6];
12302 mod_f = operands[7];
12303 mode = GET_MODE (mem);
12304 cmp_mode = mode;
12305
12306 /* Normally the succ memory model must be stronger than fail, but in the
12307 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12308 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12309
12310 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12311 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12312 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12313
12314 switch (mode)
12315 {
12316 case E_QImode:
12317 case E_HImode:
12318 /* For short modes, we're going to perform the comparison in SImode,
12319 so do the zero-extension now. */
12320 cmp_mode = SImode;
12321 rval = gen_reg_rtx (SImode);
12322 oldval = convert_modes (SImode, mode, oldval, true);
12323 /* Fall through. */
12324
12325 case E_SImode:
12326 case E_DImode:
12327 /* Force the value into a register if needed. */
12328 if (!aarch64_plus_operand (oldval, mode))
12329 oldval = force_reg (cmp_mode, oldval);
12330 break;
12331
12332 default:
12333 gcc_unreachable ();
12334 }
12335
12336 switch (mode)
12337 {
12338 case E_QImode: idx = 0; break;
12339 case E_HImode: idx = 1; break;
12340 case E_SImode: idx = 2; break;
12341 case E_DImode: idx = 3; break;
12342 default:
12343 gcc_unreachable ();
12344 }
12345 if (TARGET_LSE)
12346 gen = atomic_cas[idx];
12347 else
12348 gen = split_cas[idx];
12349
12350 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12351
12352 if (mode == QImode || mode == HImode)
12353 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12354
12355 x = gen_rtx_REG (CCmode, CC_REGNUM);
12356 x = gen_rtx_EQ (SImode, x, const0_rtx);
12357 emit_insn (gen_rtx_SET (bval, x));
12358 }
12359
12360 /* Test whether the target supports using a atomic load-operate instruction.
12361 CODE is the operation and AFTER is TRUE if the data in memory after the
12362 operation should be returned and FALSE if the data before the operation
12363 should be returned. Returns FALSE if the operation isn't supported by the
12364 architecture. */
12365
12366 bool
12367 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12368 {
12369 if (!TARGET_LSE)
12370 return false;
12371
12372 switch (code)
12373 {
12374 case SET:
12375 case AND:
12376 case IOR:
12377 case XOR:
12378 case MINUS:
12379 case PLUS:
12380 return true;
12381 default:
12382 return false;
12383 }
12384 }
12385
12386 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12387 sequence implementing an atomic operation. */
12388
12389 static void
12390 aarch64_emit_post_barrier (enum memmodel model)
12391 {
12392 const enum memmodel base_model = memmodel_base (model);
12393
12394 if (is_mm_sync (model)
12395 && (base_model == MEMMODEL_ACQUIRE
12396 || base_model == MEMMODEL_ACQ_REL
12397 || base_model == MEMMODEL_SEQ_CST))
12398 {
12399 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12400 }
12401 }
12402
12403 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12404 for the data in memory. EXPECTED is the value expected to be in memory.
12405 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12406 is the memory ordering to use. */
12407
12408 void
12409 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12410 rtx expected, rtx desired,
12411 rtx model)
12412 {
12413 rtx (*gen) (rtx, rtx, rtx, rtx);
12414 machine_mode mode;
12415
12416 mode = GET_MODE (mem);
12417
12418 switch (mode)
12419 {
12420 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12421 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12422 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12423 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12424 default:
12425 gcc_unreachable ();
12426 }
12427
12428 /* Move the expected value into the CAS destination register. */
12429 emit_insn (gen_rtx_SET (rval, expected));
12430
12431 /* Emit the CAS. */
12432 emit_insn (gen (rval, mem, desired, model));
12433
12434 /* Compare the expected value with the value loaded by the CAS, to establish
12435 whether the swap was made. */
12436 aarch64_gen_compare_reg (EQ, rval, expected);
12437 }
12438
12439 /* Split a compare and swap pattern. */
12440
12441 void
12442 aarch64_split_compare_and_swap (rtx operands[])
12443 {
12444 rtx rval, mem, oldval, newval, scratch;
12445 machine_mode mode;
12446 bool is_weak;
12447 rtx_code_label *label1, *label2;
12448 rtx x, cond;
12449 enum memmodel model;
12450 rtx model_rtx;
12451
12452 rval = operands[0];
12453 mem = operands[1];
12454 oldval = operands[2];
12455 newval = operands[3];
12456 is_weak = (operands[4] != const0_rtx);
12457 model_rtx = operands[5];
12458 scratch = operands[7];
12459 mode = GET_MODE (mem);
12460 model = memmodel_from_int (INTVAL (model_rtx));
12461
12462 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12463 loop:
12464 .label1:
12465 LD[A]XR rval, [mem]
12466 CBNZ rval, .label2
12467 ST[L]XR scratch, newval, [mem]
12468 CBNZ scratch, .label1
12469 .label2:
12470 CMP rval, 0. */
12471 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12472
12473 label1 = NULL;
12474 if (!is_weak)
12475 {
12476 label1 = gen_label_rtx ();
12477 emit_label (label1);
12478 }
12479 label2 = gen_label_rtx ();
12480
12481 /* The initial load can be relaxed for a __sync operation since a final
12482 barrier will be emitted to stop code hoisting. */
12483 if (is_mm_sync (model))
12484 aarch64_emit_load_exclusive (mode, rval, mem,
12485 GEN_INT (MEMMODEL_RELAXED));
12486 else
12487 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12488
12489 if (strong_zero_p)
12490 {
12491 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12492 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12493 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12494 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12495 }
12496 else
12497 {
12498 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12499 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12500 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12501 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12502 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12503 }
12504
12505 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12506
12507 if (!is_weak)
12508 {
12509 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12510 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12511 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12512 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12513 }
12514 else
12515 {
12516 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12517 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12518 emit_insn (gen_rtx_SET (cond, x));
12519 }
12520
12521 emit_label (label2);
12522 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12523 to set the condition flags. If this is not used it will be removed by
12524 later passes. */
12525 if (strong_zero_p)
12526 {
12527 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12528 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12529 emit_insn (gen_rtx_SET (cond, x));
12530 }
12531 /* Emit any final barrier needed for a __sync operation. */
12532 if (is_mm_sync (model))
12533 aarch64_emit_post_barrier (model);
12534 }
12535
12536 /* Emit a BIC instruction. */
12537
12538 static void
12539 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12540 {
12541 rtx shift_rtx = GEN_INT (shift);
12542 rtx (*gen) (rtx, rtx, rtx, rtx);
12543
12544 switch (mode)
12545 {
12546 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12547 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12548 default:
12549 gcc_unreachable ();
12550 }
12551
12552 emit_insn (gen (dst, s2, shift_rtx, s1));
12553 }
12554
12555 /* Emit an atomic swap. */
12556
12557 static void
12558 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12559 rtx mem, rtx model)
12560 {
12561 rtx (*gen) (rtx, rtx, rtx, rtx);
12562
12563 switch (mode)
12564 {
12565 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12566 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12567 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12568 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12569 default:
12570 gcc_unreachable ();
12571 }
12572
12573 emit_insn (gen (dst, mem, value, model));
12574 }
12575
12576 /* Operations supported by aarch64_emit_atomic_load_op. */
12577
12578 enum aarch64_atomic_load_op_code
12579 {
12580 AARCH64_LDOP_PLUS, /* A + B */
12581 AARCH64_LDOP_XOR, /* A ^ B */
12582 AARCH64_LDOP_OR, /* A | B */
12583 AARCH64_LDOP_BIC /* A & ~B */
12584 };
12585
12586 /* Emit an atomic load-operate. */
12587
12588 static void
12589 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12590 machine_mode mode, rtx dst, rtx src,
12591 rtx mem, rtx model)
12592 {
12593 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12594 const aarch64_atomic_load_op_fn plus[] =
12595 {
12596 gen_aarch64_atomic_loadaddqi,
12597 gen_aarch64_atomic_loadaddhi,
12598 gen_aarch64_atomic_loadaddsi,
12599 gen_aarch64_atomic_loadadddi
12600 };
12601 const aarch64_atomic_load_op_fn eor[] =
12602 {
12603 gen_aarch64_atomic_loadeorqi,
12604 gen_aarch64_atomic_loadeorhi,
12605 gen_aarch64_atomic_loadeorsi,
12606 gen_aarch64_atomic_loadeordi
12607 };
12608 const aarch64_atomic_load_op_fn ior[] =
12609 {
12610 gen_aarch64_atomic_loadsetqi,
12611 gen_aarch64_atomic_loadsethi,
12612 gen_aarch64_atomic_loadsetsi,
12613 gen_aarch64_atomic_loadsetdi
12614 };
12615 const aarch64_atomic_load_op_fn bic[] =
12616 {
12617 gen_aarch64_atomic_loadclrqi,
12618 gen_aarch64_atomic_loadclrhi,
12619 gen_aarch64_atomic_loadclrsi,
12620 gen_aarch64_atomic_loadclrdi
12621 };
12622 aarch64_atomic_load_op_fn gen;
12623 int idx = 0;
12624
12625 switch (mode)
12626 {
12627 case E_QImode: idx = 0; break;
12628 case E_HImode: idx = 1; break;
12629 case E_SImode: idx = 2; break;
12630 case E_DImode: idx = 3; break;
12631 default:
12632 gcc_unreachable ();
12633 }
12634
12635 switch (code)
12636 {
12637 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12638 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12639 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12640 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12641 default:
12642 gcc_unreachable ();
12643 }
12644
12645 emit_insn (gen (dst, mem, src, model));
12646 }
12647
12648 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12649 location to store the data read from memory. OUT_RESULT is the location to
12650 store the result of the operation. MEM is the memory location to read and
12651 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12652 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12653 be NULL. */
12654
12655 void
12656 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12657 rtx mem, rtx value, rtx model_rtx)
12658 {
12659 machine_mode mode = GET_MODE (mem);
12660 machine_mode wmode = (mode == DImode ? DImode : SImode);
12661 const bool short_mode = (mode < SImode);
12662 aarch64_atomic_load_op_code ldop_code;
12663 rtx src;
12664 rtx x;
12665
12666 if (out_data)
12667 out_data = gen_lowpart (mode, out_data);
12668
12669 if (out_result)
12670 out_result = gen_lowpart (mode, out_result);
12671
12672 /* Make sure the value is in a register, putting it into a destination
12673 register if it needs to be manipulated. */
12674 if (!register_operand (value, mode)
12675 || code == AND || code == MINUS)
12676 {
12677 src = out_result ? out_result : out_data;
12678 emit_move_insn (src, gen_lowpart (mode, value));
12679 }
12680 else
12681 src = value;
12682 gcc_assert (register_operand (src, mode));
12683
12684 /* Preprocess the data for the operation as necessary. If the operation is
12685 a SET then emit a swap instruction and finish. */
12686 switch (code)
12687 {
12688 case SET:
12689 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12690 return;
12691
12692 case MINUS:
12693 /* Negate the value and treat it as a PLUS. */
12694 {
12695 rtx neg_src;
12696
12697 /* Resize the value if necessary. */
12698 if (short_mode)
12699 src = gen_lowpart (wmode, src);
12700
12701 neg_src = gen_rtx_NEG (wmode, src);
12702 emit_insn (gen_rtx_SET (src, neg_src));
12703
12704 if (short_mode)
12705 src = gen_lowpart (mode, src);
12706 }
12707 /* Fall-through. */
12708 case PLUS:
12709 ldop_code = AARCH64_LDOP_PLUS;
12710 break;
12711
12712 case IOR:
12713 ldop_code = AARCH64_LDOP_OR;
12714 break;
12715
12716 case XOR:
12717 ldop_code = AARCH64_LDOP_XOR;
12718 break;
12719
12720 case AND:
12721 {
12722 rtx not_src;
12723
12724 /* Resize the value if necessary. */
12725 if (short_mode)
12726 src = gen_lowpart (wmode, src);
12727
12728 not_src = gen_rtx_NOT (wmode, src);
12729 emit_insn (gen_rtx_SET (src, not_src));
12730
12731 if (short_mode)
12732 src = gen_lowpart (mode, src);
12733 }
12734 ldop_code = AARCH64_LDOP_BIC;
12735 break;
12736
12737 default:
12738 /* The operation can't be done with atomic instructions. */
12739 gcc_unreachable ();
12740 }
12741
12742 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12743
12744 /* If necessary, calculate the data in memory after the update by redoing the
12745 operation from values in registers. */
12746 if (!out_result)
12747 return;
12748
12749 if (short_mode)
12750 {
12751 src = gen_lowpart (wmode, src);
12752 out_data = gen_lowpart (wmode, out_data);
12753 out_result = gen_lowpart (wmode, out_result);
12754 }
12755
12756 x = NULL_RTX;
12757
12758 switch (code)
12759 {
12760 case MINUS:
12761 case PLUS:
12762 x = gen_rtx_PLUS (wmode, out_data, src);
12763 break;
12764 case IOR:
12765 x = gen_rtx_IOR (wmode, out_data, src);
12766 break;
12767 case XOR:
12768 x = gen_rtx_XOR (wmode, out_data, src);
12769 break;
12770 case AND:
12771 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12772 return;
12773 default:
12774 gcc_unreachable ();
12775 }
12776
12777 emit_set_insn (out_result, x);
12778
12779 return;
12780 }
12781
12782 /* Split an atomic operation. */
12783
12784 void
12785 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12786 rtx value, rtx model_rtx, rtx cond)
12787 {
12788 machine_mode mode = GET_MODE (mem);
12789 machine_mode wmode = (mode == DImode ? DImode : SImode);
12790 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12791 const bool is_sync = is_mm_sync (model);
12792 rtx_code_label *label;
12793 rtx x;
12794
12795 /* Split the atomic operation into a sequence. */
12796 label = gen_label_rtx ();
12797 emit_label (label);
12798
12799 if (new_out)
12800 new_out = gen_lowpart (wmode, new_out);
12801 if (old_out)
12802 old_out = gen_lowpart (wmode, old_out);
12803 else
12804 old_out = new_out;
12805 value = simplify_gen_subreg (wmode, value, mode, 0);
12806
12807 /* The initial load can be relaxed for a __sync operation since a final
12808 barrier will be emitted to stop code hoisting. */
12809 if (is_sync)
12810 aarch64_emit_load_exclusive (mode, old_out, mem,
12811 GEN_INT (MEMMODEL_RELAXED));
12812 else
12813 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12814
12815 switch (code)
12816 {
12817 case SET:
12818 new_out = value;
12819 break;
12820
12821 case NOT:
12822 x = gen_rtx_AND (wmode, old_out, value);
12823 emit_insn (gen_rtx_SET (new_out, x));
12824 x = gen_rtx_NOT (wmode, new_out);
12825 emit_insn (gen_rtx_SET (new_out, x));
12826 break;
12827
12828 case MINUS:
12829 if (CONST_INT_P (value))
12830 {
12831 value = GEN_INT (-INTVAL (value));
12832 code = PLUS;
12833 }
12834 /* Fall through. */
12835
12836 default:
12837 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12838 emit_insn (gen_rtx_SET (new_out, x));
12839 break;
12840 }
12841
12842 aarch64_emit_store_exclusive (mode, cond, mem,
12843 gen_lowpart (mode, new_out), model_rtx);
12844
12845 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12846 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12847 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12848 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12849
12850 /* Emit any final barrier needed for a __sync operation. */
12851 if (is_sync)
12852 aarch64_emit_post_barrier (model);
12853 }
12854
12855 static void
12856 aarch64_init_libfuncs (void)
12857 {
12858 /* Half-precision float operations. The compiler handles all operations
12859 with NULL libfuncs by converting to SFmode. */
12860
12861 /* Conversions. */
12862 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12863 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12864
12865 /* Arithmetic. */
12866 set_optab_libfunc (add_optab, HFmode, NULL);
12867 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12868 set_optab_libfunc (smul_optab, HFmode, NULL);
12869 set_optab_libfunc (neg_optab, HFmode, NULL);
12870 set_optab_libfunc (sub_optab, HFmode, NULL);
12871
12872 /* Comparisons. */
12873 set_optab_libfunc (eq_optab, HFmode, NULL);
12874 set_optab_libfunc (ne_optab, HFmode, NULL);
12875 set_optab_libfunc (lt_optab, HFmode, NULL);
12876 set_optab_libfunc (le_optab, HFmode, NULL);
12877 set_optab_libfunc (ge_optab, HFmode, NULL);
12878 set_optab_libfunc (gt_optab, HFmode, NULL);
12879 set_optab_libfunc (unord_optab, HFmode, NULL);
12880 }
12881
12882 /* Target hook for c_mode_for_suffix. */
12883 static machine_mode
12884 aarch64_c_mode_for_suffix (char suffix)
12885 {
12886 if (suffix == 'q')
12887 return TFmode;
12888
12889 return VOIDmode;
12890 }
12891
12892 /* We can only represent floating point constants which will fit in
12893 "quarter-precision" values. These values are characterised by
12894 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
12895 by:
12896
12897 (-1)^s * (n/16) * 2^r
12898
12899 Where:
12900 's' is the sign bit.
12901 'n' is an integer in the range 16 <= n <= 31.
12902 'r' is an integer in the range -3 <= r <= 4. */
12903
12904 /* Return true iff X can be represented by a quarter-precision
12905 floating point immediate operand X. Note, we cannot represent 0.0. */
12906 bool
12907 aarch64_float_const_representable_p (rtx x)
12908 {
12909 /* This represents our current view of how many bits
12910 make up the mantissa. */
12911 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12912 int exponent;
12913 unsigned HOST_WIDE_INT mantissa, mask;
12914 REAL_VALUE_TYPE r, m;
12915 bool fail;
12916
12917 if (!CONST_DOUBLE_P (x))
12918 return false;
12919
12920 /* We don't support HFmode constants yet. */
12921 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12922 return false;
12923
12924 r = *CONST_DOUBLE_REAL_VALUE (x);
12925
12926 /* We cannot represent infinities, NaNs or +/-zero. We won't
12927 know if we have +zero until we analyse the mantissa, but we
12928 can reject the other invalid values. */
12929 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12930 || REAL_VALUE_MINUS_ZERO (r))
12931 return false;
12932
12933 /* Extract exponent. */
12934 r = real_value_abs (&r);
12935 exponent = REAL_EXP (&r);
12936
12937 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12938 highest (sign) bit, with a fixed binary point at bit point_pos.
12939 m1 holds the low part of the mantissa, m2 the high part.
12940 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12941 bits for the mantissa, this can fail (low bits will be lost). */
12942 real_ldexp (&m, &r, point_pos - exponent);
12943 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12944
12945 /* If the low part of the mantissa has bits set we cannot represent
12946 the value. */
12947 if (w.ulow () != 0)
12948 return false;
12949 /* We have rejected the lower HOST_WIDE_INT, so update our
12950 understanding of how many bits lie in the mantissa and
12951 look only at the high HOST_WIDE_INT. */
12952 mantissa = w.elt (1);
12953 point_pos -= HOST_BITS_PER_WIDE_INT;
12954
12955 /* We can only represent values with a mantissa of the form 1.xxxx. */
12956 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12957 if ((mantissa & mask) != 0)
12958 return false;
12959
12960 /* Having filtered unrepresentable values, we may now remove all
12961 but the highest 5 bits. */
12962 mantissa >>= point_pos - 5;
12963
12964 /* We cannot represent the value 0.0, so reject it. This is handled
12965 elsewhere. */
12966 if (mantissa == 0)
12967 return false;
12968
12969 /* Then, as bit 4 is always set, we can mask it off, leaving
12970 the mantissa in the range [0, 15]. */
12971 mantissa &= ~(1 << 4);
12972 gcc_assert (mantissa <= 15);
12973
12974 /* GCC internally does not use IEEE754-like encoding (where normalized
12975 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12976 Our mantissa values are shifted 4 places to the left relative to
12977 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12978 by 5 places to correct for GCC's representation. */
12979 exponent = 5 - exponent;
12980
12981 return (exponent >= 0 && exponent <= 7);
12982 }
12983
12984 char*
12985 aarch64_output_simd_mov_immediate (rtx const_vector,
12986 machine_mode mode,
12987 unsigned width)
12988 {
12989 bool is_valid;
12990 static char templ[40];
12991 const char *mnemonic;
12992 const char *shift_op;
12993 unsigned int lane_count = 0;
12994 char element_char;
12995
12996 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12997
12998 /* This will return true to show const_vector is legal for use as either
12999 a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
13000 also update INFO to show how the immediate should be generated. */
13001 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
13002 gcc_assert (is_valid);
13003
13004 element_char = sizetochar (info.element_width);
13005 lane_count = width / info.element_width;
13006
13007 mode = GET_MODE_INNER (mode);
13008 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13009 {
13010 gcc_assert (info.shift == 0 && ! info.mvn);
13011 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13012 move immediate path. */
13013 if (aarch64_float_const_zero_rtx_p (info.value))
13014 info.value = GEN_INT (0);
13015 else
13016 {
13017 const unsigned int buf_size = 20;
13018 char float_buf[buf_size] = {'\0'};
13019 real_to_decimal_for_mode (float_buf,
13020 CONST_DOUBLE_REAL_VALUE (info.value),
13021 buf_size, buf_size, 1, mode);
13022
13023 if (lane_count == 1)
13024 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13025 else
13026 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13027 lane_count, element_char, float_buf);
13028 return templ;
13029 }
13030 }
13031
13032 mnemonic = info.mvn ? "mvni" : "movi";
13033 shift_op = info.msl ? "msl" : "lsl";
13034
13035 gcc_assert (CONST_INT_P (info.value));
13036 if (lane_count == 1)
13037 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13038 mnemonic, UINTVAL (info.value));
13039 else if (info.shift)
13040 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13041 ", %s %d", mnemonic, lane_count, element_char,
13042 UINTVAL (info.value), shift_op, info.shift);
13043 else
13044 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
13045 mnemonic, lane_count, element_char, UINTVAL (info.value));
13046 return templ;
13047 }
13048
13049 char*
13050 aarch64_output_scalar_simd_mov_immediate (rtx immediate, machine_mode mode)
13051 {
13052
13053 /* If a floating point number was passed and we desire to use it in an
13054 integer mode do the conversion to integer. */
13055 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13056 {
13057 unsigned HOST_WIDE_INT ival;
13058 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13059 gcc_unreachable ();
13060 immediate = gen_int_mode (ival, mode);
13061 }
13062
13063 machine_mode vmode;
13064 /* use a 64 bit mode for everything except for DI/DF mode, where we use
13065 a 128 bit vector mode. */
13066 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13067
13068 gcc_assert (!VECTOR_MODE_P (mode));
13069 vmode = aarch64_simd_container_mode (mode, width);
13070 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13071 return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13072 }
13073
13074 /* Split operands into moves from op[1] + op[2] into op[0]. */
13075
13076 void
13077 aarch64_split_combinev16qi (rtx operands[3])
13078 {
13079 unsigned int dest = REGNO (operands[0]);
13080 unsigned int src1 = REGNO (operands[1]);
13081 unsigned int src2 = REGNO (operands[2]);
13082 machine_mode halfmode = GET_MODE (operands[1]);
13083 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
13084 rtx destlo, desthi;
13085
13086 gcc_assert (halfmode == V16QImode);
13087
13088 if (src1 == dest && src2 == dest + halfregs)
13089 {
13090 /* No-op move. Can't split to nothing; emit something. */
13091 emit_note (NOTE_INSN_DELETED);
13092 return;
13093 }
13094
13095 /* Preserve register attributes for variable tracking. */
13096 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13097 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13098 GET_MODE_SIZE (halfmode));
13099
13100 /* Special case of reversed high/low parts. */
13101 if (reg_overlap_mentioned_p (operands[2], destlo)
13102 && reg_overlap_mentioned_p (operands[1], desthi))
13103 {
13104 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13105 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13106 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13107 }
13108 else if (!reg_overlap_mentioned_p (operands[2], destlo))
13109 {
13110 /* Try to avoid unnecessary moves if part of the result
13111 is in the right place already. */
13112 if (src1 != dest)
13113 emit_move_insn (destlo, operands[1]);
13114 if (src2 != dest + halfregs)
13115 emit_move_insn (desthi, operands[2]);
13116 }
13117 else
13118 {
13119 if (src2 != dest + halfregs)
13120 emit_move_insn (desthi, operands[2]);
13121 if (src1 != dest)
13122 emit_move_insn (destlo, operands[1]);
13123 }
13124 }
13125
13126 /* vec_perm support. */
13127
13128 #define MAX_VECT_LEN 16
13129
13130 struct expand_vec_perm_d
13131 {
13132 rtx target, op0, op1;
13133 unsigned char perm[MAX_VECT_LEN];
13134 machine_mode vmode;
13135 unsigned char nelt;
13136 bool one_vector_p;
13137 bool testing_p;
13138 };
13139
13140 /* Generate a variable permutation. */
13141
13142 static void
13143 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13144 {
13145 machine_mode vmode = GET_MODE (target);
13146 bool one_vector_p = rtx_equal_p (op0, op1);
13147
13148 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13149 gcc_checking_assert (GET_MODE (op0) == vmode);
13150 gcc_checking_assert (GET_MODE (op1) == vmode);
13151 gcc_checking_assert (GET_MODE (sel) == vmode);
13152 gcc_checking_assert (TARGET_SIMD);
13153
13154 if (one_vector_p)
13155 {
13156 if (vmode == V8QImode)
13157 {
13158 /* Expand the argument to a V16QI mode by duplicating it. */
13159 rtx pair = gen_reg_rtx (V16QImode);
13160 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13161 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13162 }
13163 else
13164 {
13165 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13166 }
13167 }
13168 else
13169 {
13170 rtx pair;
13171
13172 if (vmode == V8QImode)
13173 {
13174 pair = gen_reg_rtx (V16QImode);
13175 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13176 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13177 }
13178 else
13179 {
13180 pair = gen_reg_rtx (OImode);
13181 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13182 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13183 }
13184 }
13185 }
13186
13187 void
13188 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13189 {
13190 machine_mode vmode = GET_MODE (target);
13191 unsigned int nelt = GET_MODE_NUNITS (vmode);
13192 bool one_vector_p = rtx_equal_p (op0, op1);
13193 rtx mask;
13194
13195 /* The TBL instruction does not use a modulo index, so we must take care
13196 of that ourselves. */
13197 mask = aarch64_simd_gen_const_vector_dup (vmode,
13198 one_vector_p ? nelt - 1 : 2 * nelt - 1);
13199 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13200
13201 /* For big-endian, we also need to reverse the index within the vector
13202 (but not which vector). */
13203 if (BYTES_BIG_ENDIAN)
13204 {
13205 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13206 if (!one_vector_p)
13207 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13208 sel = expand_simple_binop (vmode, XOR, sel, mask,
13209 NULL, 0, OPTAB_LIB_WIDEN);
13210 }
13211 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
13212 }
13213
13214 /* Recognize patterns suitable for the TRN instructions. */
13215 static bool
13216 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13217 {
13218 unsigned int i, odd, mask, nelt = d->nelt;
13219 rtx out, in0, in1, x;
13220 rtx (*gen) (rtx, rtx, rtx);
13221 machine_mode vmode = d->vmode;
13222
13223 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13224 return false;
13225
13226 /* Note that these are little-endian tests.
13227 We correct for big-endian later. */
13228 if (d->perm[0] == 0)
13229 odd = 0;
13230 else if (d->perm[0] == 1)
13231 odd = 1;
13232 else
13233 return false;
13234 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13235
13236 for (i = 0; i < nelt; i += 2)
13237 {
13238 if (d->perm[i] != i + odd)
13239 return false;
13240 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13241 return false;
13242 }
13243
13244 /* Success! */
13245 if (d->testing_p)
13246 return true;
13247
13248 in0 = d->op0;
13249 in1 = d->op1;
13250 if (BYTES_BIG_ENDIAN)
13251 {
13252 x = in0, in0 = in1, in1 = x;
13253 odd = !odd;
13254 }
13255 out = d->target;
13256
13257 if (odd)
13258 {
13259 switch (vmode)
13260 {
13261 case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
13262 case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
13263 case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
13264 case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
13265 case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
13266 case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
13267 case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
13268 case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13269 case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13270 case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13271 case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13272 case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
13273 default:
13274 return false;
13275 }
13276 }
13277 else
13278 {
13279 switch (vmode)
13280 {
13281 case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
13282 case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
13283 case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
13284 case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
13285 case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
13286 case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
13287 case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
13288 case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13289 case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13290 case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13291 case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13292 case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
13293 default:
13294 return false;
13295 }
13296 }
13297
13298 emit_insn (gen (out, in0, in1));
13299 return true;
13300 }
13301
13302 /* Recognize patterns suitable for the UZP instructions. */
13303 static bool
13304 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13305 {
13306 unsigned int i, odd, mask, nelt = d->nelt;
13307 rtx out, in0, in1, x;
13308 rtx (*gen) (rtx, rtx, rtx);
13309 machine_mode vmode = d->vmode;
13310
13311 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13312 return false;
13313
13314 /* Note that these are little-endian tests.
13315 We correct for big-endian later. */
13316 if (d->perm[0] == 0)
13317 odd = 0;
13318 else if (d->perm[0] == 1)
13319 odd = 1;
13320 else
13321 return false;
13322 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13323
13324 for (i = 0; i < nelt; i++)
13325 {
13326 unsigned elt = (i * 2 + odd) & mask;
13327 if (d->perm[i] != elt)
13328 return false;
13329 }
13330
13331 /* Success! */
13332 if (d->testing_p)
13333 return true;
13334
13335 in0 = d->op0;
13336 in1 = d->op1;
13337 if (BYTES_BIG_ENDIAN)
13338 {
13339 x = in0, in0 = in1, in1 = x;
13340 odd = !odd;
13341 }
13342 out = d->target;
13343
13344 if (odd)
13345 {
13346 switch (vmode)
13347 {
13348 case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13349 case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13350 case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13351 case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13352 case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13353 case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13354 case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13355 case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13356 case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13357 case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13358 case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13359 case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13360 default:
13361 return false;
13362 }
13363 }
13364 else
13365 {
13366 switch (vmode)
13367 {
13368 case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13369 case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13370 case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13371 case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13372 case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13373 case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13374 case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13375 case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13376 case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13377 case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13378 case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13379 case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13380 default:
13381 return false;
13382 }
13383 }
13384
13385 emit_insn (gen (out, in0, in1));
13386 return true;
13387 }
13388
13389 /* Recognize patterns suitable for the ZIP instructions. */
13390 static bool
13391 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13392 {
13393 unsigned int i, high, mask, nelt = d->nelt;
13394 rtx out, in0, in1, x;
13395 rtx (*gen) (rtx, rtx, rtx);
13396 machine_mode vmode = d->vmode;
13397
13398 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13399 return false;
13400
13401 /* Note that these are little-endian tests.
13402 We correct for big-endian later. */
13403 high = nelt / 2;
13404 if (d->perm[0] == high)
13405 /* Do Nothing. */
13406 ;
13407 else if (d->perm[0] == 0)
13408 high = 0;
13409 else
13410 return false;
13411 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13412
13413 for (i = 0; i < nelt / 2; i++)
13414 {
13415 unsigned elt = (i + high) & mask;
13416 if (d->perm[i * 2] != elt)
13417 return false;
13418 elt = (elt + nelt) & mask;
13419 if (d->perm[i * 2 + 1] != elt)
13420 return false;
13421 }
13422
13423 /* Success! */
13424 if (d->testing_p)
13425 return true;
13426
13427 in0 = d->op0;
13428 in1 = d->op1;
13429 if (BYTES_BIG_ENDIAN)
13430 {
13431 x = in0, in0 = in1, in1 = x;
13432 high = !high;
13433 }
13434 out = d->target;
13435
13436 if (high)
13437 {
13438 switch (vmode)
13439 {
13440 case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13441 case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13442 case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13443 case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13444 case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13445 case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13446 case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13447 case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13448 case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13449 case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13450 case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13451 case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13452 default:
13453 return false;
13454 }
13455 }
13456 else
13457 {
13458 switch (vmode)
13459 {
13460 case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13461 case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13462 case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13463 case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13464 case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13465 case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13466 case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13467 case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13468 case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13469 case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13470 case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13471 case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13472 default:
13473 return false;
13474 }
13475 }
13476
13477 emit_insn (gen (out, in0, in1));
13478 return true;
13479 }
13480
13481 /* Recognize patterns for the EXT insn. */
13482
13483 static bool
13484 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13485 {
13486 unsigned int i, nelt = d->nelt;
13487 rtx (*gen) (rtx, rtx, rtx, rtx);
13488 rtx offset;
13489
13490 unsigned int location = d->perm[0]; /* Always < nelt. */
13491
13492 /* Check if the extracted indices are increasing by one. */
13493 for (i = 1; i < nelt; i++)
13494 {
13495 unsigned int required = location + i;
13496 if (d->one_vector_p)
13497 {
13498 /* We'll pass the same vector in twice, so allow indices to wrap. */
13499 required &= (nelt - 1);
13500 }
13501 if (d->perm[i] != required)
13502 return false;
13503 }
13504
13505 switch (d->vmode)
13506 {
13507 case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13508 case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13509 case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13510 case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13511 case E_V2SImode: gen = gen_aarch64_extv2si; break;
13512 case E_V4SImode: gen = gen_aarch64_extv4si; break;
13513 case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13514 case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13515 case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13516 case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13517 case E_V2DImode: gen = gen_aarch64_extv2di; break;
13518 case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13519 default:
13520 return false;
13521 }
13522
13523 /* Success! */
13524 if (d->testing_p)
13525 return true;
13526
13527 /* The case where (location == 0) is a no-op for both big- and little-endian,
13528 and is removed by the mid-end at optimization levels -O1 and higher. */
13529
13530 if (BYTES_BIG_ENDIAN && (location != 0))
13531 {
13532 /* After setup, we want the high elements of the first vector (stored
13533 at the LSB end of the register), and the low elements of the second
13534 vector (stored at the MSB end of the register). So swap. */
13535 std::swap (d->op0, d->op1);
13536 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13537 location = nelt - location;
13538 }
13539
13540 offset = GEN_INT (location);
13541 emit_insn (gen (d->target, d->op0, d->op1, offset));
13542 return true;
13543 }
13544
13545 /* Recognize patterns for the REV insns. */
13546
13547 static bool
13548 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13549 {
13550 unsigned int i, j, diff, nelt = d->nelt;
13551 rtx (*gen) (rtx, rtx);
13552
13553 if (!d->one_vector_p)
13554 return false;
13555
13556 diff = d->perm[0];
13557 switch (diff)
13558 {
13559 case 7:
13560 switch (d->vmode)
13561 {
13562 case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13563 case E_V8QImode: gen = gen_aarch64_rev64v8qi; break;
13564 default:
13565 return false;
13566 }
13567 break;
13568 case 3:
13569 switch (d->vmode)
13570 {
13571 case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13572 case E_V8QImode: gen = gen_aarch64_rev32v8qi; break;
13573 case E_V8HImode: gen = gen_aarch64_rev64v8hi; break;
13574 case E_V4HImode: gen = gen_aarch64_rev64v4hi; break;
13575 default:
13576 return false;
13577 }
13578 break;
13579 case 1:
13580 switch (d->vmode)
13581 {
13582 case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13583 case E_V8QImode: gen = gen_aarch64_rev16v8qi; break;
13584 case E_V8HImode: gen = gen_aarch64_rev32v8hi; break;
13585 case E_V4HImode: gen = gen_aarch64_rev32v4hi; break;
13586 case E_V4SImode: gen = gen_aarch64_rev64v4si; break;
13587 case E_V2SImode: gen = gen_aarch64_rev64v2si; break;
13588 case E_V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13589 case E_V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13590 case E_V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13591 case E_V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13592 default:
13593 return false;
13594 }
13595 break;
13596 default:
13597 return false;
13598 }
13599
13600 for (i = 0; i < nelt ; i += diff + 1)
13601 for (j = 0; j <= diff; j += 1)
13602 {
13603 /* This is guaranteed to be true as the value of diff
13604 is 7, 3, 1 and we should have enough elements in the
13605 queue to generate this. Getting a vector mask with a
13606 value of diff other than these values implies that
13607 something is wrong by the time we get here. */
13608 gcc_assert (i + j < nelt);
13609 if (d->perm[i + j] != i + diff - j)
13610 return false;
13611 }
13612
13613 /* Success! */
13614 if (d->testing_p)
13615 return true;
13616
13617 emit_insn (gen (d->target, d->op0));
13618 return true;
13619 }
13620
13621 static bool
13622 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13623 {
13624 rtx (*gen) (rtx, rtx, rtx);
13625 rtx out = d->target;
13626 rtx in0;
13627 machine_mode vmode = d->vmode;
13628 unsigned int i, elt, nelt = d->nelt;
13629 rtx lane;
13630
13631 elt = d->perm[0];
13632 for (i = 1; i < nelt; i++)
13633 {
13634 if (elt != d->perm[i])
13635 return false;
13636 }
13637
13638 /* The generic preparation in aarch64_expand_vec_perm_const_1
13639 swaps the operand order and the permute indices if it finds
13640 d->perm[0] to be in the second operand. Thus, we can always
13641 use d->op0 and need not do any extra arithmetic to get the
13642 correct lane number. */
13643 in0 = d->op0;
13644 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13645
13646 switch (vmode)
13647 {
13648 case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13649 case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13650 case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13651 case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13652 case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13653 case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13654 case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13655 case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13656 case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13657 case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13658 case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13659 case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13660 default:
13661 return false;
13662 }
13663
13664 emit_insn (gen (out, in0, lane));
13665 return true;
13666 }
13667
13668 static bool
13669 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13670 {
13671 rtx rperm[MAX_VECT_LEN], sel;
13672 machine_mode vmode = d->vmode;
13673 unsigned int i, nelt = d->nelt;
13674
13675 if (d->testing_p)
13676 return true;
13677
13678 /* Generic code will try constant permutation twice. Once with the
13679 original mode and again with the elements lowered to QImode.
13680 So wait and don't do the selector expansion ourselves. */
13681 if (vmode != V8QImode && vmode != V16QImode)
13682 return false;
13683
13684 for (i = 0; i < nelt; ++i)
13685 {
13686 int nunits = GET_MODE_NUNITS (vmode);
13687
13688 /* If big-endian and two vectors we end up with a weird mixed-endian
13689 mode on NEON. Reverse the index within each word but not the word
13690 itself. */
13691 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13692 : d->perm[i]);
13693 }
13694 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13695 sel = force_reg (vmode, sel);
13696
13697 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13698 return true;
13699 }
13700
13701 static bool
13702 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13703 {
13704 /* The pattern matching functions above are written to look for a small
13705 number to begin the sequence (0, 1, N/2). If we begin with an index
13706 from the second operand, we can swap the operands. */
13707 if (d->perm[0] >= d->nelt)
13708 {
13709 unsigned i, nelt = d->nelt;
13710
13711 gcc_assert (nelt == (nelt & -nelt));
13712 for (i = 0; i < nelt; ++i)
13713 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13714
13715 std::swap (d->op0, d->op1);
13716 }
13717
13718 if (TARGET_SIMD)
13719 {
13720 if (aarch64_evpc_rev (d))
13721 return true;
13722 else if (aarch64_evpc_ext (d))
13723 return true;
13724 else if (aarch64_evpc_dup (d))
13725 return true;
13726 else if (aarch64_evpc_zip (d))
13727 return true;
13728 else if (aarch64_evpc_uzp (d))
13729 return true;
13730 else if (aarch64_evpc_trn (d))
13731 return true;
13732 return aarch64_evpc_tbl (d);
13733 }
13734 return false;
13735 }
13736
13737 /* Expand a vec_perm_const pattern. */
13738
13739 bool
13740 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13741 {
13742 struct expand_vec_perm_d d;
13743 int i, nelt, which;
13744
13745 d.target = target;
13746 d.op0 = op0;
13747 d.op1 = op1;
13748
13749 d.vmode = GET_MODE (target);
13750 gcc_assert (VECTOR_MODE_P (d.vmode));
13751 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13752 d.testing_p = false;
13753
13754 for (i = which = 0; i < nelt; ++i)
13755 {
13756 rtx e = XVECEXP (sel, 0, i);
13757 int ei = INTVAL (e) & (2 * nelt - 1);
13758 which |= (ei < nelt ? 1 : 2);
13759 d.perm[i] = ei;
13760 }
13761
13762 switch (which)
13763 {
13764 default:
13765 gcc_unreachable ();
13766
13767 case 3:
13768 d.one_vector_p = false;
13769 if (!rtx_equal_p (op0, op1))
13770 break;
13771
13772 /* The elements of PERM do not suggest that only the first operand
13773 is used, but both operands are identical. Allow easier matching
13774 of the permutation by folding the permutation into the single
13775 input vector. */
13776 /* Fall Through. */
13777 case 2:
13778 for (i = 0; i < nelt; ++i)
13779 d.perm[i] &= nelt - 1;
13780 d.op0 = op1;
13781 d.one_vector_p = true;
13782 break;
13783
13784 case 1:
13785 d.op1 = op0;
13786 d.one_vector_p = true;
13787 break;
13788 }
13789
13790 return aarch64_expand_vec_perm_const_1 (&d);
13791 }
13792
13793 static bool
13794 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13795 const unsigned char *sel)
13796 {
13797 struct expand_vec_perm_d d;
13798 unsigned int i, nelt, which;
13799 bool ret;
13800
13801 d.vmode = vmode;
13802 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13803 d.testing_p = true;
13804 memcpy (d.perm, sel, nelt);
13805
13806 /* Calculate whether all elements are in one vector. */
13807 for (i = which = 0; i < nelt; ++i)
13808 {
13809 unsigned char e = d.perm[i];
13810 gcc_assert (e < 2 * nelt);
13811 which |= (e < nelt ? 1 : 2);
13812 }
13813
13814 /* If all elements are from the second vector, reindex as if from the
13815 first vector. */
13816 if (which == 2)
13817 for (i = 0; i < nelt; ++i)
13818 d.perm[i] -= nelt;
13819
13820 /* Check whether the mask can be applied to a single vector. */
13821 d.one_vector_p = (which != 3);
13822
13823 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13824 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13825 if (!d.one_vector_p)
13826 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13827
13828 start_sequence ();
13829 ret = aarch64_expand_vec_perm_const_1 (&d);
13830 end_sequence ();
13831
13832 return ret;
13833 }
13834
13835 rtx
13836 aarch64_reverse_mask (machine_mode mode)
13837 {
13838 /* We have to reverse each vector because we dont have
13839 a permuted load that can reverse-load according to ABI rules. */
13840 rtx mask;
13841 rtvec v = rtvec_alloc (16);
13842 int i, j;
13843 int nunits = GET_MODE_NUNITS (mode);
13844 int usize = GET_MODE_UNIT_SIZE (mode);
13845
13846 gcc_assert (BYTES_BIG_ENDIAN);
13847 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13848
13849 for (i = 0; i < nunits; i++)
13850 for (j = 0; j < usize; j++)
13851 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13852 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13853 return force_reg (V16QImode, mask);
13854 }
13855
13856 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13857 However due to issues with register allocation it is preferable to avoid
13858 tieing integer scalar and FP scalar modes. Executing integer operations
13859 in general registers is better than treating them as scalar vector
13860 operations. This reduces latency and avoids redundant int<->FP moves.
13861 So tie modes if they are either the same class, or vector modes with
13862 other vector modes, vector structs or any scalar mode.
13863 */
13864
13865 bool
13866 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13867 {
13868 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13869 return true;
13870
13871 /* We specifically want to allow elements of "structure" modes to
13872 be tieable to the structure. This more general condition allows
13873 other rarer situations too. */
13874 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13875 return true;
13876
13877 /* Also allow any scalar modes with vectors. */
13878 if (aarch64_vector_mode_supported_p (mode1)
13879 || aarch64_vector_mode_supported_p (mode2))
13880 return true;
13881
13882 return false;
13883 }
13884
13885 /* Return a new RTX holding the result of moving POINTER forward by
13886 AMOUNT bytes. */
13887
13888 static rtx
13889 aarch64_move_pointer (rtx pointer, int amount)
13890 {
13891 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13892
13893 return adjust_automodify_address (pointer, GET_MODE (pointer),
13894 next, amount);
13895 }
13896
13897 /* Return a new RTX holding the result of moving POINTER forward by the
13898 size of the mode it points to. */
13899
13900 static rtx
13901 aarch64_progress_pointer (rtx pointer)
13902 {
13903 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13904
13905 return aarch64_move_pointer (pointer, amount);
13906 }
13907
13908 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13909 MODE bytes. */
13910
13911 static void
13912 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13913 machine_mode mode)
13914 {
13915 rtx reg = gen_reg_rtx (mode);
13916
13917 /* "Cast" the pointers to the correct mode. */
13918 *src = adjust_address (*src, mode, 0);
13919 *dst = adjust_address (*dst, mode, 0);
13920 /* Emit the memcpy. */
13921 emit_move_insn (reg, *src);
13922 emit_move_insn (*dst, reg);
13923 /* Move the pointers forward. */
13924 *src = aarch64_progress_pointer (*src);
13925 *dst = aarch64_progress_pointer (*dst);
13926 }
13927
13928 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13929 we succeed, otherwise return false. */
13930
13931 bool
13932 aarch64_expand_movmem (rtx *operands)
13933 {
13934 unsigned int n;
13935 rtx dst = operands[0];
13936 rtx src = operands[1];
13937 rtx base;
13938 bool speed_p = !optimize_function_for_size_p (cfun);
13939
13940 /* When optimizing for size, give a better estimate of the length of a
13941 memcpy call, but use the default otherwise. */
13942 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13943
13944 /* We can't do anything smart if the amount to copy is not constant. */
13945 if (!CONST_INT_P (operands[2]))
13946 return false;
13947
13948 n = UINTVAL (operands[2]);
13949
13950 /* Try to keep the number of instructions low. For cases below 16 bytes we
13951 need to make at most two moves. For cases above 16 bytes it will be one
13952 move for each 16 byte chunk, then at most two additional moves. */
13953 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13954 return false;
13955
13956 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13957 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13958
13959 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13960 src = adjust_automodify_address (src, VOIDmode, base, 0);
13961
13962 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13963 1-byte chunk. */
13964 if (n < 4)
13965 {
13966 if (n >= 2)
13967 {
13968 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13969 n -= 2;
13970 }
13971
13972 if (n == 1)
13973 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13974
13975 return true;
13976 }
13977
13978 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13979 4-byte chunk, partially overlapping with the previously copied chunk. */
13980 if (n < 8)
13981 {
13982 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13983 n -= 4;
13984 if (n > 0)
13985 {
13986 int move = n - 4;
13987
13988 src = aarch64_move_pointer (src, move);
13989 dst = aarch64_move_pointer (dst, move);
13990 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13991 }
13992 return true;
13993 }
13994
13995 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13996 them, then (if applicable) an 8-byte chunk. */
13997 while (n >= 8)
13998 {
13999 if (n / 16)
14000 {
14001 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
14002 n -= 16;
14003 }
14004 else
14005 {
14006 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14007 n -= 8;
14008 }
14009 }
14010
14011 /* Finish the final bytes of the copy. We can always do this in one
14012 instruction. We either copy the exact amount we need, or partially
14013 overlap with the previous chunk we copied and copy 8-bytes. */
14014 if (n == 0)
14015 return true;
14016 else if (n == 1)
14017 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14018 else if (n == 2)
14019 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14020 else if (n == 4)
14021 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14022 else
14023 {
14024 if (n == 3)
14025 {
14026 src = aarch64_move_pointer (src, -1);
14027 dst = aarch64_move_pointer (dst, -1);
14028 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14029 }
14030 else
14031 {
14032 int move = n - 8;
14033
14034 src = aarch64_move_pointer (src, move);
14035 dst = aarch64_move_pointer (dst, move);
14036 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14037 }
14038 }
14039
14040 return true;
14041 }
14042
14043 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14044 SImode stores. Handle the case when the constant has identical
14045 bottom and top halves. This is beneficial when the two stores can be
14046 merged into an STP and we avoid synthesising potentially expensive
14047 immediates twice. Return true if such a split is possible. */
14048
14049 bool
14050 aarch64_split_dimode_const_store (rtx dst, rtx src)
14051 {
14052 rtx lo = gen_lowpart (SImode, src);
14053 rtx hi = gen_highpart_mode (SImode, DImode, src);
14054
14055 bool size_p = optimize_function_for_size_p (cfun);
14056
14057 if (!rtx_equal_p (lo, hi))
14058 return false;
14059
14060 unsigned int orig_cost
14061 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14062 unsigned int lo_cost
14063 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14064
14065 /* We want to transform:
14066 MOV x1, 49370
14067 MOVK x1, 0x140, lsl 16
14068 MOVK x1, 0xc0da, lsl 32
14069 MOVK x1, 0x140, lsl 48
14070 STR x1, [x0]
14071 into:
14072 MOV w1, 49370
14073 MOVK w1, 0x140, lsl 16
14074 STP w1, w1, [x0]
14075 So we want to perform this only when we save two instructions
14076 or more. When optimizing for size, however, accept any code size
14077 savings we can. */
14078 if (size_p && orig_cost <= lo_cost)
14079 return false;
14080
14081 if (!size_p
14082 && (orig_cost <= lo_cost + 1))
14083 return false;
14084
14085 rtx mem_lo = adjust_address (dst, SImode, 0);
14086 if (!aarch64_mem_pair_operand (mem_lo, SImode))
14087 return false;
14088
14089 rtx tmp_reg = gen_reg_rtx (SImode);
14090 aarch64_expand_mov_immediate (tmp_reg, lo);
14091 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14092 /* Don't emit an explicit store pair as this may not be always profitable.
14093 Let the sched-fusion logic decide whether to merge them. */
14094 emit_move_insn (mem_lo, tmp_reg);
14095 emit_move_insn (mem_hi, tmp_reg);
14096
14097 return true;
14098 }
14099
14100 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
14101
14102 static unsigned HOST_WIDE_INT
14103 aarch64_asan_shadow_offset (void)
14104 {
14105 return (HOST_WIDE_INT_1 << 36);
14106 }
14107
14108 static bool
14109 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14110 unsigned int align,
14111 enum by_pieces_operation op,
14112 bool speed_p)
14113 {
14114 /* STORE_BY_PIECES can be used when copying a constant string, but
14115 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14116 For now we always fail this and let the move_by_pieces code copy
14117 the string from read-only memory. */
14118 if (op == STORE_BY_PIECES)
14119 return false;
14120
14121 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14122 }
14123
14124 static rtx
14125 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14126 int code, tree treeop0, tree treeop1)
14127 {
14128 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14129 rtx op0, op1;
14130 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14131 insn_code icode;
14132 struct expand_operand ops[4];
14133
14134 start_sequence ();
14135 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14136
14137 op_mode = GET_MODE (op0);
14138 if (op_mode == VOIDmode)
14139 op_mode = GET_MODE (op1);
14140
14141 switch (op_mode)
14142 {
14143 case E_QImode:
14144 case E_HImode:
14145 case E_SImode:
14146 cmp_mode = SImode;
14147 icode = CODE_FOR_cmpsi;
14148 break;
14149
14150 case E_DImode:
14151 cmp_mode = DImode;
14152 icode = CODE_FOR_cmpdi;
14153 break;
14154
14155 case E_SFmode:
14156 cmp_mode = SFmode;
14157 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14158 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14159 break;
14160
14161 case E_DFmode:
14162 cmp_mode = DFmode;
14163 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14164 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14165 break;
14166
14167 default:
14168 end_sequence ();
14169 return NULL_RTX;
14170 }
14171
14172 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14173 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14174 if (!op0 || !op1)
14175 {
14176 end_sequence ();
14177 return NULL_RTX;
14178 }
14179 *prep_seq = get_insns ();
14180 end_sequence ();
14181
14182 create_fixed_operand (&ops[0], op0);
14183 create_fixed_operand (&ops[1], op1);
14184
14185 start_sequence ();
14186 if (!maybe_expand_insn (icode, 2, ops))
14187 {
14188 end_sequence ();
14189 return NULL_RTX;
14190 }
14191 *gen_seq = get_insns ();
14192 end_sequence ();
14193
14194 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14195 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14196 }
14197
14198 static rtx
14199 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14200 int cmp_code, tree treeop0, tree treeop1, int bit_code)
14201 {
14202 rtx op0, op1, target;
14203 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14204 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14205 insn_code icode;
14206 struct expand_operand ops[6];
14207 int aarch64_cond;
14208
14209 push_to_sequence (*prep_seq);
14210 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14211
14212 op_mode = GET_MODE (op0);
14213 if (op_mode == VOIDmode)
14214 op_mode = GET_MODE (op1);
14215
14216 switch (op_mode)
14217 {
14218 case E_QImode:
14219 case E_HImode:
14220 case E_SImode:
14221 cmp_mode = SImode;
14222 icode = CODE_FOR_ccmpsi;
14223 break;
14224
14225 case E_DImode:
14226 cmp_mode = DImode;
14227 icode = CODE_FOR_ccmpdi;
14228 break;
14229
14230 case E_SFmode:
14231 cmp_mode = SFmode;
14232 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14233 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14234 break;
14235
14236 case E_DFmode:
14237 cmp_mode = DFmode;
14238 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14239 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14240 break;
14241
14242 default:
14243 end_sequence ();
14244 return NULL_RTX;
14245 }
14246
14247 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14248 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14249 if (!op0 || !op1)
14250 {
14251 end_sequence ();
14252 return NULL_RTX;
14253 }
14254 *prep_seq = get_insns ();
14255 end_sequence ();
14256
14257 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14258 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14259
14260 if (bit_code != AND)
14261 {
14262 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14263 GET_MODE (XEXP (prev, 0))),
14264 VOIDmode, XEXP (prev, 0), const0_rtx);
14265 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14266 }
14267
14268 create_fixed_operand (&ops[0], XEXP (prev, 0));
14269 create_fixed_operand (&ops[1], target);
14270 create_fixed_operand (&ops[2], op0);
14271 create_fixed_operand (&ops[3], op1);
14272 create_fixed_operand (&ops[4], prev);
14273 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14274
14275 push_to_sequence (*gen_seq);
14276 if (!maybe_expand_insn (icode, 6, ops))
14277 {
14278 end_sequence ();
14279 return NULL_RTX;
14280 }
14281
14282 *gen_seq = get_insns ();
14283 end_sequence ();
14284
14285 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14286 }
14287
14288 #undef TARGET_GEN_CCMP_FIRST
14289 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14290
14291 #undef TARGET_GEN_CCMP_NEXT
14292 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14293
14294 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14295 instruction fusion of some sort. */
14296
14297 static bool
14298 aarch64_macro_fusion_p (void)
14299 {
14300 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14301 }
14302
14303
14304 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14305 should be kept together during scheduling. */
14306
14307 static bool
14308 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14309 {
14310 rtx set_dest;
14311 rtx prev_set = single_set (prev);
14312 rtx curr_set = single_set (curr);
14313 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14314 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14315
14316 if (!aarch64_macro_fusion_p ())
14317 return false;
14318
14319 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14320 {
14321 /* We are trying to match:
14322 prev (mov) == (set (reg r0) (const_int imm16))
14323 curr (movk) == (set (zero_extract (reg r0)
14324 (const_int 16)
14325 (const_int 16))
14326 (const_int imm16_1)) */
14327
14328 set_dest = SET_DEST (curr_set);
14329
14330 if (GET_CODE (set_dest) == ZERO_EXTRACT
14331 && CONST_INT_P (SET_SRC (curr_set))
14332 && CONST_INT_P (SET_SRC (prev_set))
14333 && CONST_INT_P (XEXP (set_dest, 2))
14334 && INTVAL (XEXP (set_dest, 2)) == 16
14335 && REG_P (XEXP (set_dest, 0))
14336 && REG_P (SET_DEST (prev_set))
14337 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14338 {
14339 return true;
14340 }
14341 }
14342
14343 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14344 {
14345
14346 /* We're trying to match:
14347 prev (adrp) == (set (reg r1)
14348 (high (symbol_ref ("SYM"))))
14349 curr (add) == (set (reg r0)
14350 (lo_sum (reg r1)
14351 (symbol_ref ("SYM"))))
14352 Note that r0 need not necessarily be the same as r1, especially
14353 during pre-regalloc scheduling. */
14354
14355 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14356 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14357 {
14358 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14359 && REG_P (XEXP (SET_SRC (curr_set), 0))
14360 && REGNO (XEXP (SET_SRC (curr_set), 0))
14361 == REGNO (SET_DEST (prev_set))
14362 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14363 XEXP (SET_SRC (curr_set), 1)))
14364 return true;
14365 }
14366 }
14367
14368 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14369 {
14370
14371 /* We're trying to match:
14372 prev (movk) == (set (zero_extract (reg r0)
14373 (const_int 16)
14374 (const_int 32))
14375 (const_int imm16_1))
14376 curr (movk) == (set (zero_extract (reg r0)
14377 (const_int 16)
14378 (const_int 48))
14379 (const_int imm16_2)) */
14380
14381 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14382 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14383 && REG_P (XEXP (SET_DEST (prev_set), 0))
14384 && REG_P (XEXP (SET_DEST (curr_set), 0))
14385 && REGNO (XEXP (SET_DEST (prev_set), 0))
14386 == REGNO (XEXP (SET_DEST (curr_set), 0))
14387 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14388 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14389 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14390 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14391 && CONST_INT_P (SET_SRC (prev_set))
14392 && CONST_INT_P (SET_SRC (curr_set)))
14393 return true;
14394
14395 }
14396 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14397 {
14398 /* We're trying to match:
14399 prev (adrp) == (set (reg r0)
14400 (high (symbol_ref ("SYM"))))
14401 curr (ldr) == (set (reg r1)
14402 (mem (lo_sum (reg r0)
14403 (symbol_ref ("SYM")))))
14404 or
14405 curr (ldr) == (set (reg r1)
14406 (zero_extend (mem
14407 (lo_sum (reg r0)
14408 (symbol_ref ("SYM")))))) */
14409 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14410 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14411 {
14412 rtx curr_src = SET_SRC (curr_set);
14413
14414 if (GET_CODE (curr_src) == ZERO_EXTEND)
14415 curr_src = XEXP (curr_src, 0);
14416
14417 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14418 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14419 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14420 == REGNO (SET_DEST (prev_set))
14421 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14422 XEXP (SET_SRC (prev_set), 0)))
14423 return true;
14424 }
14425 }
14426
14427 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14428 && aarch_crypto_can_dual_issue (prev, curr))
14429 return true;
14430
14431 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14432 && any_condjump_p (curr))
14433 {
14434 enum attr_type prev_type = get_attr_type (prev);
14435
14436 unsigned int condreg1, condreg2;
14437 rtx cc_reg_1;
14438 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14439 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14440
14441 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14442 && prev
14443 && modified_in_p (cc_reg_1, prev))
14444 {
14445 /* FIXME: this misses some which is considered simple arthematic
14446 instructions for ThunderX. Simple shifts are missed here. */
14447 if (prev_type == TYPE_ALUS_SREG
14448 || prev_type == TYPE_ALUS_IMM
14449 || prev_type == TYPE_LOGICS_REG
14450 || prev_type == TYPE_LOGICS_IMM)
14451 return true;
14452 }
14453 }
14454
14455 if (prev_set
14456 && curr_set
14457 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14458 && any_condjump_p (curr))
14459 {
14460 /* We're trying to match:
14461 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14462 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14463 (const_int 0))
14464 (label_ref ("SYM"))
14465 (pc)) */
14466 if (SET_DEST (curr_set) == (pc_rtx)
14467 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14468 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14469 && REG_P (SET_DEST (prev_set))
14470 && REGNO (SET_DEST (prev_set))
14471 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14472 {
14473 /* Fuse ALU operations followed by conditional branch instruction. */
14474 switch (get_attr_type (prev))
14475 {
14476 case TYPE_ALU_IMM:
14477 case TYPE_ALU_SREG:
14478 case TYPE_ADC_REG:
14479 case TYPE_ADC_IMM:
14480 case TYPE_ADCS_REG:
14481 case TYPE_ADCS_IMM:
14482 case TYPE_LOGIC_REG:
14483 case TYPE_LOGIC_IMM:
14484 case TYPE_CSEL:
14485 case TYPE_ADR:
14486 case TYPE_MOV_IMM:
14487 case TYPE_SHIFT_REG:
14488 case TYPE_SHIFT_IMM:
14489 case TYPE_BFM:
14490 case TYPE_RBIT:
14491 case TYPE_REV:
14492 case TYPE_EXTEND:
14493 return true;
14494
14495 default:;
14496 }
14497 }
14498 }
14499
14500 return false;
14501 }
14502
14503 /* Return true iff the instruction fusion described by OP is enabled. */
14504
14505 bool
14506 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14507 {
14508 return (aarch64_tune_params.fusible_ops & op) != 0;
14509 }
14510
14511 /* If MEM is in the form of [base+offset], extract the two parts
14512 of address and set to BASE and OFFSET, otherwise return false
14513 after clearing BASE and OFFSET. */
14514
14515 bool
14516 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14517 {
14518 rtx addr;
14519
14520 gcc_assert (MEM_P (mem));
14521
14522 addr = XEXP (mem, 0);
14523
14524 if (REG_P (addr))
14525 {
14526 *base = addr;
14527 *offset = const0_rtx;
14528 return true;
14529 }
14530
14531 if (GET_CODE (addr) == PLUS
14532 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14533 {
14534 *base = XEXP (addr, 0);
14535 *offset = XEXP (addr, 1);
14536 return true;
14537 }
14538
14539 *base = NULL_RTX;
14540 *offset = NULL_RTX;
14541
14542 return false;
14543 }
14544
14545 /* Types for scheduling fusion. */
14546 enum sched_fusion_type
14547 {
14548 SCHED_FUSION_NONE = 0,
14549 SCHED_FUSION_LD_SIGN_EXTEND,
14550 SCHED_FUSION_LD_ZERO_EXTEND,
14551 SCHED_FUSION_LD,
14552 SCHED_FUSION_ST,
14553 SCHED_FUSION_NUM
14554 };
14555
14556 /* If INSN is a load or store of address in the form of [base+offset],
14557 extract the two parts and set to BASE and OFFSET. Return scheduling
14558 fusion type this INSN is. */
14559
14560 static enum sched_fusion_type
14561 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14562 {
14563 rtx x, dest, src;
14564 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14565
14566 gcc_assert (INSN_P (insn));
14567 x = PATTERN (insn);
14568 if (GET_CODE (x) != SET)
14569 return SCHED_FUSION_NONE;
14570
14571 src = SET_SRC (x);
14572 dest = SET_DEST (x);
14573
14574 machine_mode dest_mode = GET_MODE (dest);
14575
14576 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14577 return SCHED_FUSION_NONE;
14578
14579 if (GET_CODE (src) == SIGN_EXTEND)
14580 {
14581 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14582 src = XEXP (src, 0);
14583 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14584 return SCHED_FUSION_NONE;
14585 }
14586 else if (GET_CODE (src) == ZERO_EXTEND)
14587 {
14588 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14589 src = XEXP (src, 0);
14590 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14591 return SCHED_FUSION_NONE;
14592 }
14593
14594 if (GET_CODE (src) == MEM && REG_P (dest))
14595 extract_base_offset_in_addr (src, base, offset);
14596 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14597 {
14598 fusion = SCHED_FUSION_ST;
14599 extract_base_offset_in_addr (dest, base, offset);
14600 }
14601 else
14602 return SCHED_FUSION_NONE;
14603
14604 if (*base == NULL_RTX || *offset == NULL_RTX)
14605 fusion = SCHED_FUSION_NONE;
14606
14607 return fusion;
14608 }
14609
14610 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14611
14612 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14613 and PRI are only calculated for these instructions. For other instruction,
14614 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14615 type instruction fusion can be added by returning different priorities.
14616
14617 It's important that irrelevant instructions get the largest FUSION_PRI. */
14618
14619 static void
14620 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14621 int *fusion_pri, int *pri)
14622 {
14623 int tmp, off_val;
14624 rtx base, offset;
14625 enum sched_fusion_type fusion;
14626
14627 gcc_assert (INSN_P (insn));
14628
14629 tmp = max_pri - 1;
14630 fusion = fusion_load_store (insn, &base, &offset);
14631 if (fusion == SCHED_FUSION_NONE)
14632 {
14633 *pri = tmp;
14634 *fusion_pri = tmp;
14635 return;
14636 }
14637
14638 /* Set FUSION_PRI according to fusion type and base register. */
14639 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14640
14641 /* Calculate PRI. */
14642 tmp /= 2;
14643
14644 /* INSN with smaller offset goes first. */
14645 off_val = (int)(INTVAL (offset));
14646 if (off_val >= 0)
14647 tmp -= (off_val & 0xfffff);
14648 else
14649 tmp += ((- off_val) & 0xfffff);
14650
14651 *pri = tmp;
14652 return;
14653 }
14654
14655 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14656 Adjust priority of sha1h instructions so they are scheduled before
14657 other SHA1 instructions. */
14658
14659 static int
14660 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14661 {
14662 rtx x = PATTERN (insn);
14663
14664 if (GET_CODE (x) == SET)
14665 {
14666 x = SET_SRC (x);
14667
14668 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14669 return priority + 10;
14670 }
14671
14672 return priority;
14673 }
14674
14675 /* Given OPERANDS of consecutive load/store, check if we can merge
14676 them into ldp/stp. LOAD is true if they are load instructions.
14677 MODE is the mode of memory operands. */
14678
14679 bool
14680 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14681 machine_mode mode)
14682 {
14683 HOST_WIDE_INT offval_1, offval_2, msize;
14684 enum reg_class rclass_1, rclass_2;
14685 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14686
14687 if (load)
14688 {
14689 mem_1 = operands[1];
14690 mem_2 = operands[3];
14691 reg_1 = operands[0];
14692 reg_2 = operands[2];
14693 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14694 if (REGNO (reg_1) == REGNO (reg_2))
14695 return false;
14696 }
14697 else
14698 {
14699 mem_1 = operands[0];
14700 mem_2 = operands[2];
14701 reg_1 = operands[1];
14702 reg_2 = operands[3];
14703 }
14704
14705 /* The mems cannot be volatile. */
14706 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14707 return false;
14708
14709 /* If we have SImode and slow unaligned ldp,
14710 check the alignment to be at least 8 byte. */
14711 if (mode == SImode
14712 && (aarch64_tune_params.extra_tuning_flags
14713 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14714 && !optimize_size
14715 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14716 return false;
14717
14718 /* Check if the addresses are in the form of [base+offset]. */
14719 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14720 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14721 return false;
14722 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14723 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14724 return false;
14725
14726 /* Check if the bases are same. */
14727 if (!rtx_equal_p (base_1, base_2))
14728 return false;
14729
14730 offval_1 = INTVAL (offset_1);
14731 offval_2 = INTVAL (offset_2);
14732 msize = GET_MODE_SIZE (mode);
14733 /* Check if the offsets are consecutive. */
14734 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14735 return false;
14736
14737 /* Check if the addresses are clobbered by load. */
14738 if (load)
14739 {
14740 if (reg_mentioned_p (reg_1, mem_1))
14741 return false;
14742
14743 /* In increasing order, the last load can clobber the address. */
14744 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14745 return false;
14746 }
14747
14748 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14749 rclass_1 = FP_REGS;
14750 else
14751 rclass_1 = GENERAL_REGS;
14752
14753 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14754 rclass_2 = FP_REGS;
14755 else
14756 rclass_2 = GENERAL_REGS;
14757
14758 /* Check if the registers are of same class. */
14759 if (rclass_1 != rclass_2)
14760 return false;
14761
14762 return true;
14763 }
14764
14765 /* Given OPERANDS of consecutive load/store, check if we can merge
14766 them into ldp/stp by adjusting the offset. LOAD is true if they
14767 are load instructions. MODE is the mode of memory operands.
14768
14769 Given below consecutive stores:
14770
14771 str w1, [xb, 0x100]
14772 str w1, [xb, 0x104]
14773 str w1, [xb, 0x108]
14774 str w1, [xb, 0x10c]
14775
14776 Though the offsets are out of the range supported by stp, we can
14777 still pair them after adjusting the offset, like:
14778
14779 add scratch, xb, 0x100
14780 stp w1, w1, [scratch]
14781 stp w1, w1, [scratch, 0x8]
14782
14783 The peephole patterns detecting this opportunity should guarantee
14784 the scratch register is avaliable. */
14785
14786 bool
14787 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14788 machine_mode mode)
14789 {
14790 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14791 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14792 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14793 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14794
14795 if (load)
14796 {
14797 reg_1 = operands[0];
14798 mem_1 = operands[1];
14799 reg_2 = operands[2];
14800 mem_2 = operands[3];
14801 reg_3 = operands[4];
14802 mem_3 = operands[5];
14803 reg_4 = operands[6];
14804 mem_4 = operands[7];
14805 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14806 && REG_P (reg_3) && REG_P (reg_4));
14807 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14808 return false;
14809 }
14810 else
14811 {
14812 mem_1 = operands[0];
14813 reg_1 = operands[1];
14814 mem_2 = operands[2];
14815 reg_2 = operands[3];
14816 mem_3 = operands[4];
14817 reg_3 = operands[5];
14818 mem_4 = operands[6];
14819 reg_4 = operands[7];
14820 }
14821 /* Skip if memory operand is by itslef valid for ldp/stp. */
14822 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14823 return false;
14824
14825 /* The mems cannot be volatile. */
14826 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14827 || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
14828 return false;
14829
14830 /* Check if the addresses are in the form of [base+offset]. */
14831 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14832 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14833 return false;
14834 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14835 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14836 return false;
14837 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14838 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14839 return false;
14840 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14841 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14842 return false;
14843
14844 /* Check if the bases are same. */
14845 if (!rtx_equal_p (base_1, base_2)
14846 || !rtx_equal_p (base_2, base_3)
14847 || !rtx_equal_p (base_3, base_4))
14848 return false;
14849
14850 offval_1 = INTVAL (offset_1);
14851 offval_2 = INTVAL (offset_2);
14852 offval_3 = INTVAL (offset_3);
14853 offval_4 = INTVAL (offset_4);
14854 msize = GET_MODE_SIZE (mode);
14855 /* Check if the offsets are consecutive. */
14856 if ((offval_1 != (offval_2 + msize)
14857 || offval_1 != (offval_3 + msize * 2)
14858 || offval_1 != (offval_4 + msize * 3))
14859 && (offval_4 != (offval_3 + msize)
14860 || offval_4 != (offval_2 + msize * 2)
14861 || offval_4 != (offval_1 + msize * 3)))
14862 return false;
14863
14864 /* Check if the addresses are clobbered by load. */
14865 if (load)
14866 {
14867 if (reg_mentioned_p (reg_1, mem_1)
14868 || reg_mentioned_p (reg_2, mem_2)
14869 || reg_mentioned_p (reg_3, mem_3))
14870 return false;
14871
14872 /* In increasing order, the last load can clobber the address. */
14873 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14874 return false;
14875 }
14876
14877 /* If we have SImode and slow unaligned ldp,
14878 check the alignment to be at least 8 byte. */
14879 if (mode == SImode
14880 && (aarch64_tune_params.extra_tuning_flags
14881 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14882 && !optimize_size
14883 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14884 return false;
14885
14886 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14887 rclass_1 = FP_REGS;
14888 else
14889 rclass_1 = GENERAL_REGS;
14890
14891 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14892 rclass_2 = FP_REGS;
14893 else
14894 rclass_2 = GENERAL_REGS;
14895
14896 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14897 rclass_3 = FP_REGS;
14898 else
14899 rclass_3 = GENERAL_REGS;
14900
14901 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14902 rclass_4 = FP_REGS;
14903 else
14904 rclass_4 = GENERAL_REGS;
14905
14906 /* Check if the registers are of same class. */
14907 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14908 return false;
14909
14910 return true;
14911 }
14912
14913 /* Given OPERANDS of consecutive load/store, this function pairs them
14914 into ldp/stp after adjusting the offset. It depends on the fact
14915 that addresses of load/store instructions are in increasing order.
14916 MODE is the mode of memory operands. CODE is the rtl operator
14917 which should be applied to all memory operands, it's SIGN_EXTEND,
14918 ZERO_EXTEND or UNKNOWN. */
14919
14920 bool
14921 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14922 machine_mode mode, RTX_CODE code)
14923 {
14924 rtx base, offset, t1, t2;
14925 rtx mem_1, mem_2, mem_3, mem_4;
14926 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14927
14928 if (load)
14929 {
14930 mem_1 = operands[1];
14931 mem_2 = operands[3];
14932 mem_3 = operands[5];
14933 mem_4 = operands[7];
14934 }
14935 else
14936 {
14937 mem_1 = operands[0];
14938 mem_2 = operands[2];
14939 mem_3 = operands[4];
14940 mem_4 = operands[6];
14941 gcc_assert (code == UNKNOWN);
14942 }
14943
14944 extract_base_offset_in_addr (mem_1, &base, &offset);
14945 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14946
14947 /* Adjust offset thus it can fit in ldp/stp instruction. */
14948 msize = GET_MODE_SIZE (mode);
14949 stp_off_limit = msize * 0x40;
14950 off_val = INTVAL (offset);
14951 abs_off = (off_val < 0) ? -off_val : off_val;
14952 new_off = abs_off % stp_off_limit;
14953 adj_off = abs_off - new_off;
14954
14955 /* Further adjust to make sure all offsets are OK. */
14956 if ((new_off + msize * 2) >= stp_off_limit)
14957 {
14958 adj_off += stp_off_limit;
14959 new_off -= stp_off_limit;
14960 }
14961
14962 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14963 if (adj_off >= 0x1000)
14964 return false;
14965
14966 if (off_val < 0)
14967 {
14968 adj_off = -adj_off;
14969 new_off = -new_off;
14970 }
14971
14972 /* Create new memory references. */
14973 mem_1 = change_address (mem_1, VOIDmode,
14974 plus_constant (DImode, operands[8], new_off));
14975
14976 /* Check if the adjusted address is OK for ldp/stp. */
14977 if (!aarch64_mem_pair_operand (mem_1, mode))
14978 return false;
14979
14980 msize = GET_MODE_SIZE (mode);
14981 mem_2 = change_address (mem_2, VOIDmode,
14982 plus_constant (DImode,
14983 operands[8],
14984 new_off + msize));
14985 mem_3 = change_address (mem_3, VOIDmode,
14986 plus_constant (DImode,
14987 operands[8],
14988 new_off + msize * 2));
14989 mem_4 = change_address (mem_4, VOIDmode,
14990 plus_constant (DImode,
14991 operands[8],
14992 new_off + msize * 3));
14993
14994 if (code == ZERO_EXTEND)
14995 {
14996 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14997 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14998 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14999 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
15000 }
15001 else if (code == SIGN_EXTEND)
15002 {
15003 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
15004 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
15005 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
15006 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
15007 }
15008
15009 if (load)
15010 {
15011 operands[1] = mem_1;
15012 operands[3] = mem_2;
15013 operands[5] = mem_3;
15014 operands[7] = mem_4;
15015 }
15016 else
15017 {
15018 operands[0] = mem_1;
15019 operands[2] = mem_2;
15020 operands[4] = mem_3;
15021 operands[6] = mem_4;
15022 }
15023
15024 /* Emit adjusting instruction. */
15025 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15026 /* Emit ldp/stp instructions. */
15027 t1 = gen_rtx_SET (operands[0], operands[1]);
15028 t2 = gen_rtx_SET (operands[2], operands[3]);
15029 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15030 t1 = gen_rtx_SET (operands[4], operands[5]);
15031 t2 = gen_rtx_SET (operands[6], operands[7]);
15032 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15033 return true;
15034 }
15035
15036 /* Return 1 if pseudo register should be created and used to hold
15037 GOT address for PIC code. */
15038
15039 bool
15040 aarch64_use_pseudo_pic_reg (void)
15041 {
15042 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15043 }
15044
15045 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
15046
15047 static int
15048 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15049 {
15050 switch (XINT (x, 1))
15051 {
15052 case UNSPEC_GOTSMALLPIC:
15053 case UNSPEC_GOTSMALLPIC28K:
15054 case UNSPEC_GOTTINYPIC:
15055 return 0;
15056 default:
15057 break;
15058 }
15059
15060 return default_unspec_may_trap_p (x, flags);
15061 }
15062
15063
15064 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15065 return the log2 of that value. Otherwise return -1. */
15066
15067 int
15068 aarch64_fpconst_pow_of_2 (rtx x)
15069 {
15070 const REAL_VALUE_TYPE *r;
15071
15072 if (!CONST_DOUBLE_P (x))
15073 return -1;
15074
15075 r = CONST_DOUBLE_REAL_VALUE (x);
15076
15077 if (REAL_VALUE_NEGATIVE (*r)
15078 || REAL_VALUE_ISNAN (*r)
15079 || REAL_VALUE_ISINF (*r)
15080 || !real_isinteger (r, DFmode))
15081 return -1;
15082
15083 return exact_log2 (real_to_integer (r));
15084 }
15085
15086 /* If X is a vector of equal CONST_DOUBLE values and that value is
15087 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
15088
15089 int
15090 aarch64_vec_fpconst_pow_of_2 (rtx x)
15091 {
15092 if (GET_CODE (x) != CONST_VECTOR)
15093 return -1;
15094
15095 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15096 return -1;
15097
15098 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15099 if (firstval <= 0)
15100 return -1;
15101
15102 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15103 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15104 return -1;
15105
15106 return firstval;
15107 }
15108
15109 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15110 to float.
15111
15112 __fp16 always promotes through this hook.
15113 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15114 through the generic excess precision logic rather than here. */
15115
15116 static tree
15117 aarch64_promoted_type (const_tree t)
15118 {
15119 if (SCALAR_FLOAT_TYPE_P (t)
15120 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15121 return float_type_node;
15122
15123 return NULL_TREE;
15124 }
15125
15126 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
15127
15128 static bool
15129 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15130 optimization_type opt_type)
15131 {
15132 switch (op)
15133 {
15134 case rsqrt_optab:
15135 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15136
15137 default:
15138 return true;
15139 }
15140 }
15141
15142 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15143 if MODE is HFmode, and punt to the generic implementation otherwise. */
15144
15145 static bool
15146 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15147 {
15148 return (mode == HFmode
15149 ? true
15150 : default_libgcc_floating_mode_supported_p (mode));
15151 }
15152
15153 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15154 if MODE is HFmode, and punt to the generic implementation otherwise. */
15155
15156 static bool
15157 aarch64_scalar_mode_supported_p (scalar_mode mode)
15158 {
15159 return (mode == HFmode
15160 ? true
15161 : default_scalar_mode_supported_p (mode));
15162 }
15163
15164 /* Set the value of FLT_EVAL_METHOD.
15165 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15166
15167 0: evaluate all operations and constants, whose semantic type has at
15168 most the range and precision of type float, to the range and
15169 precision of float; evaluate all other operations and constants to
15170 the range and precision of the semantic type;
15171
15172 N, where _FloatN is a supported interchange floating type
15173 evaluate all operations and constants, whose semantic type has at
15174 most the range and precision of _FloatN type, to the range and
15175 precision of the _FloatN type; evaluate all other operations and
15176 constants to the range and precision of the semantic type;
15177
15178 If we have the ARMv8.2-A extensions then we support _Float16 in native
15179 precision, so we should set this to 16. Otherwise, we support the type,
15180 but want to evaluate expressions in float precision, so set this to
15181 0. */
15182
15183 static enum flt_eval_method
15184 aarch64_excess_precision (enum excess_precision_type type)
15185 {
15186 switch (type)
15187 {
15188 case EXCESS_PRECISION_TYPE_FAST:
15189 case EXCESS_PRECISION_TYPE_STANDARD:
15190 /* We can calculate either in 16-bit range and precision or
15191 32-bit range and precision. Make that decision based on whether
15192 we have native support for the ARMv8.2-A 16-bit floating-point
15193 instructions or not. */
15194 return (TARGET_FP_F16INST
15195 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15196 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15197 case EXCESS_PRECISION_TYPE_IMPLICIT:
15198 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15199 default:
15200 gcc_unreachable ();
15201 }
15202 return FLT_EVAL_METHOD_UNPREDICTABLE;
15203 }
15204
15205 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15206 scheduled for speculative execution. Reject the long-running division
15207 and square-root instructions. */
15208
15209 static bool
15210 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15211 {
15212 switch (get_attr_type (insn))
15213 {
15214 case TYPE_SDIV:
15215 case TYPE_UDIV:
15216 case TYPE_FDIVS:
15217 case TYPE_FDIVD:
15218 case TYPE_FSQRTS:
15219 case TYPE_FSQRTD:
15220 case TYPE_NEON_FP_SQRT_S:
15221 case TYPE_NEON_FP_SQRT_D:
15222 case TYPE_NEON_FP_SQRT_S_Q:
15223 case TYPE_NEON_FP_SQRT_D_Q:
15224 case TYPE_NEON_FP_DIV_S:
15225 case TYPE_NEON_FP_DIV_D:
15226 case TYPE_NEON_FP_DIV_S_Q:
15227 case TYPE_NEON_FP_DIV_D_Q:
15228 return false;
15229 default:
15230 return true;
15231 }
15232 }
15233
15234 /* Target-specific selftests. */
15235
15236 #if CHECKING_P
15237
15238 namespace selftest {
15239
15240 /* Selftest for the RTL loader.
15241 Verify that the RTL loader copes with a dump from
15242 print_rtx_function. This is essentially just a test that class
15243 function_reader can handle a real dump, but it also verifies
15244 that lookup_reg_by_dump_name correctly handles hard regs.
15245 The presence of hard reg names in the dump means that the test is
15246 target-specific, hence it is in this file. */
15247
15248 static void
15249 aarch64_test_loading_full_dump ()
15250 {
15251 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15252
15253 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15254
15255 rtx_insn *insn_1 = get_insn_by_uid (1);
15256 ASSERT_EQ (NOTE, GET_CODE (insn_1));
15257
15258 rtx_insn *insn_15 = get_insn_by_uid (15);
15259 ASSERT_EQ (INSN, GET_CODE (insn_15));
15260 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15261
15262 /* Verify crtl->return_rtx. */
15263 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15264 ASSERT_EQ (0, REGNO (crtl->return_rtx));
15265 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15266 }
15267
15268 /* Run all target-specific selftests. */
15269
15270 static void
15271 aarch64_run_selftests (void)
15272 {
15273 aarch64_test_loading_full_dump ();
15274 }
15275
15276 } // namespace selftest
15277
15278 #endif /* #if CHECKING_P */
15279
15280 #undef TARGET_ADDRESS_COST
15281 #define TARGET_ADDRESS_COST aarch64_address_cost
15282
15283 /* This hook will determines whether unnamed bitfields affect the alignment
15284 of the containing structure. The hook returns true if the structure
15285 should inherit the alignment requirements of an unnamed bitfield's
15286 type. */
15287 #undef TARGET_ALIGN_ANON_BITFIELD
15288 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15289
15290 #undef TARGET_ASM_ALIGNED_DI_OP
15291 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15292
15293 #undef TARGET_ASM_ALIGNED_HI_OP
15294 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15295
15296 #undef TARGET_ASM_ALIGNED_SI_OP
15297 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15298
15299 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15300 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15301 hook_bool_const_tree_hwi_hwi_const_tree_true
15302
15303 #undef TARGET_ASM_FILE_START
15304 #define TARGET_ASM_FILE_START aarch64_start_file
15305
15306 #undef TARGET_ASM_OUTPUT_MI_THUNK
15307 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15308
15309 #undef TARGET_ASM_SELECT_RTX_SECTION
15310 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15311
15312 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15313 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15314
15315 #undef TARGET_BUILD_BUILTIN_VA_LIST
15316 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15317
15318 #undef TARGET_CALLEE_COPIES
15319 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15320
15321 #undef TARGET_CAN_ELIMINATE
15322 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15323
15324 #undef TARGET_CAN_INLINE_P
15325 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15326
15327 #undef TARGET_CANNOT_FORCE_CONST_MEM
15328 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15329
15330 #undef TARGET_CASE_VALUES_THRESHOLD
15331 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15332
15333 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15334 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15335
15336 /* Only the least significant bit is used for initialization guard
15337 variables. */
15338 #undef TARGET_CXX_GUARD_MASK_BIT
15339 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15340
15341 #undef TARGET_C_MODE_FOR_SUFFIX
15342 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15343
15344 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15345 #undef TARGET_DEFAULT_TARGET_FLAGS
15346 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15347 #endif
15348
15349 #undef TARGET_CLASS_MAX_NREGS
15350 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15351
15352 #undef TARGET_BUILTIN_DECL
15353 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15354
15355 #undef TARGET_BUILTIN_RECIPROCAL
15356 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15357
15358 #undef TARGET_C_EXCESS_PRECISION
15359 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15360
15361 #undef TARGET_EXPAND_BUILTIN
15362 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15363
15364 #undef TARGET_EXPAND_BUILTIN_VA_START
15365 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15366
15367 #undef TARGET_FOLD_BUILTIN
15368 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15369
15370 #undef TARGET_FUNCTION_ARG
15371 #define TARGET_FUNCTION_ARG aarch64_function_arg
15372
15373 #undef TARGET_FUNCTION_ARG_ADVANCE
15374 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15375
15376 #undef TARGET_FUNCTION_ARG_BOUNDARY
15377 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15378
15379 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15380 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15381
15382 #undef TARGET_FUNCTION_VALUE
15383 #define TARGET_FUNCTION_VALUE aarch64_function_value
15384
15385 #undef TARGET_FUNCTION_VALUE_REGNO_P
15386 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15387
15388 #undef TARGET_FRAME_POINTER_REQUIRED
15389 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15390
15391 #undef TARGET_GIMPLE_FOLD_BUILTIN
15392 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15393
15394 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15395 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15396
15397 #undef TARGET_INIT_BUILTINS
15398 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15399
15400 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15401 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15402 aarch64_ira_change_pseudo_allocno_class
15403
15404 #undef TARGET_LEGITIMATE_ADDRESS_P
15405 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15406
15407 #undef TARGET_LEGITIMATE_CONSTANT_P
15408 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15409
15410 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15411 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15412 aarch64_legitimize_address_displacement
15413
15414 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15415 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15416
15417 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15418 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15419 aarch64_libgcc_floating_mode_supported_p
15420
15421 #undef TARGET_MANGLE_TYPE
15422 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15423
15424 #undef TARGET_MEMORY_MOVE_COST
15425 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15426
15427 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15428 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15429
15430 #undef TARGET_MUST_PASS_IN_STACK
15431 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15432
15433 /* This target hook should return true if accesses to volatile bitfields
15434 should use the narrowest mode possible. It should return false if these
15435 accesses should use the bitfield container type. */
15436 #undef TARGET_NARROW_VOLATILE_BITFIELD
15437 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15438
15439 #undef TARGET_OPTION_OVERRIDE
15440 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15441
15442 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15443 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15444 aarch64_override_options_after_change
15445
15446 #undef TARGET_OPTION_SAVE
15447 #define TARGET_OPTION_SAVE aarch64_option_save
15448
15449 #undef TARGET_OPTION_RESTORE
15450 #define TARGET_OPTION_RESTORE aarch64_option_restore
15451
15452 #undef TARGET_OPTION_PRINT
15453 #define TARGET_OPTION_PRINT aarch64_option_print
15454
15455 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15456 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15457
15458 #undef TARGET_SET_CURRENT_FUNCTION
15459 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15460
15461 #undef TARGET_PASS_BY_REFERENCE
15462 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15463
15464 #undef TARGET_PREFERRED_RELOAD_CLASS
15465 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15466
15467 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15468 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15469
15470 #undef TARGET_PROMOTED_TYPE
15471 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15472
15473 #undef TARGET_SECONDARY_RELOAD
15474 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15475
15476 #undef TARGET_SHIFT_TRUNCATION_MASK
15477 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15478
15479 #undef TARGET_SETUP_INCOMING_VARARGS
15480 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15481
15482 #undef TARGET_STRUCT_VALUE_RTX
15483 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15484
15485 #undef TARGET_REGISTER_MOVE_COST
15486 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15487
15488 #undef TARGET_RETURN_IN_MEMORY
15489 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15490
15491 #undef TARGET_RETURN_IN_MSB
15492 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15493
15494 #undef TARGET_RTX_COSTS
15495 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15496
15497 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15498 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15499
15500 #undef TARGET_SCHED_ISSUE_RATE
15501 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15502
15503 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15504 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15505 aarch64_sched_first_cycle_multipass_dfa_lookahead
15506
15507 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15508 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15509 aarch64_first_cycle_multipass_dfa_lookahead_guard
15510
15511 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15512 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15513 aarch64_get_separate_components
15514
15515 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15516 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15517 aarch64_components_for_bb
15518
15519 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15520 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15521 aarch64_disqualify_components
15522
15523 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15524 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15525 aarch64_emit_prologue_components
15526
15527 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15528 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15529 aarch64_emit_epilogue_components
15530
15531 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15532 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15533 aarch64_set_handled_components
15534
15535 #undef TARGET_TRAMPOLINE_INIT
15536 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15537
15538 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15539 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15540
15541 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15542 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15543
15544 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15545 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15546 aarch64_builtin_support_vector_misalignment
15547
15548 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15549 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15550
15551 #undef TARGET_VECTORIZE_ADD_STMT_COST
15552 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15553
15554 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15555 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15556 aarch64_builtin_vectorization_cost
15557
15558 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15559 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15560
15561 #undef TARGET_VECTORIZE_BUILTINS
15562 #define TARGET_VECTORIZE_BUILTINS
15563
15564 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15565 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15566 aarch64_builtin_vectorized_function
15567
15568 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15569 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15570 aarch64_autovectorize_vector_sizes
15571
15572 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15573 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15574 aarch64_atomic_assign_expand_fenv
15575
15576 /* Section anchor support. */
15577
15578 #undef TARGET_MIN_ANCHOR_OFFSET
15579 #define TARGET_MIN_ANCHOR_OFFSET -256
15580
15581 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15582 byte offset; we can do much more for larger data types, but have no way
15583 to determine the size of the access. We assume accesses are aligned. */
15584 #undef TARGET_MAX_ANCHOR_OFFSET
15585 #define TARGET_MAX_ANCHOR_OFFSET 4095
15586
15587 #undef TARGET_VECTOR_ALIGNMENT
15588 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15589
15590 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15591 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15592 aarch64_simd_vector_alignment_reachable
15593
15594 /* vec_perm support. */
15595
15596 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15597 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15598 aarch64_vectorize_vec_perm_const_ok
15599
15600 #undef TARGET_INIT_LIBFUNCS
15601 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15602
15603 #undef TARGET_FIXED_CONDITION_CODE_REGS
15604 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15605
15606 #undef TARGET_FLAGS_REGNUM
15607 #define TARGET_FLAGS_REGNUM CC_REGNUM
15608
15609 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15610 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15611
15612 #undef TARGET_ASAN_SHADOW_OFFSET
15613 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15614
15615 #undef TARGET_LEGITIMIZE_ADDRESS
15616 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15617
15618 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15619 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15620 aarch64_use_by_pieces_infrastructure_p
15621
15622 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15623 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15624
15625 #undef TARGET_CAN_USE_DOLOOP_P
15626 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15627
15628 #undef TARGET_SCHED_ADJUST_PRIORITY
15629 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15630
15631 #undef TARGET_SCHED_MACRO_FUSION_P
15632 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15633
15634 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15635 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15636
15637 #undef TARGET_SCHED_FUSION_PRIORITY
15638 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15639
15640 #undef TARGET_UNSPEC_MAY_TRAP_P
15641 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15642
15643 #undef TARGET_USE_PSEUDO_PIC_REG
15644 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15645
15646 #undef TARGET_PRINT_OPERAND
15647 #define TARGET_PRINT_OPERAND aarch64_print_operand
15648
15649 #undef TARGET_PRINT_OPERAND_ADDRESS
15650 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15651
15652 #undef TARGET_OPTAB_SUPPORTED_P
15653 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15654
15655 #undef TARGET_OMIT_STRUCT_RETURN_REG
15656 #define TARGET_OMIT_STRUCT_RETURN_REG true
15657
15658 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15659 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15660 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15661
15662 #if CHECKING_P
15663 #undef TARGET_RUN_TARGET_SELFTESTS
15664 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15665 #endif /* #if CHECKING_P */
15666
15667 struct gcc_target targetm = TARGET_INITIALIZER;
15668
15669 #include "gt-aarch64.h"