1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95
96 /* The mode of the elements. */
97 scalar_mode elt_mode;
98
99 /* The value of each element if all elements are the same, or the
100 first value if the constant is a series. */
101 rtx value;
102
103 /* The value of the step if the constant is a series, null otherwise. */
104 rtx step;
105
106 /* The instruction to use to move the immediate into a vector. */
107 insn_type insn;
108
109 /* The kind of shift modifier to use, and the number of bits to shift.
110 This is (LSL, 0) if no shift is needed. */
111 modifier_type modifier;
112 unsigned int shift;
113 };
114
115 /* Construct a floating-point immediate in which each element has mode
116 ELT_MODE_IN and value VALUE_IN. */
117 inline simd_immediate_info
118 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
119 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
120 modifier (LSL), shift (0)
121 {}
122
123 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
124 and value VALUE_IN. The other parameters are as for the structure
125 fields. */
126 inline simd_immediate_info
127 ::simd_immediate_info (scalar_int_mode elt_mode_in,
128 unsigned HOST_WIDE_INT value_in,
129 insn_type insn_in, modifier_type modifier_in,
130 unsigned int shift_in)
131 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
132 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
133 {}
134
135 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
136 and where element I is equal to VALUE_IN + I * STEP_IN. */
137 inline simd_immediate_info
138 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
139 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
140 modifier (LSL), shift (0)
141 {}
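/* For illustration (not part of the original source): a vector whose
   SFmode elements are all 1.0 would use the (SFmode, value) constructor
   above, while a constant whose elements are {4, 5, 6, ...} would use the
   series constructor with value 4 and step 1, leaving insn as MOV and the
   shift modifier as (LSL, 0).  */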
142
143 /* The current code model. */
144 enum aarch64_code_model aarch64_cmodel;
145
146 /* The number of 64-bit elements in an SVE vector. */
147 poly_uint16 aarch64_sve_vg;
148
149 #ifdef HAVE_AS_TLS
150 #undef TARGET_HAVE_TLS
151 #define TARGET_HAVE_TLS 1
152 #endif
153
154 static bool aarch64_composite_type_p (const_tree, machine_mode);
155 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
156 const_tree,
157 machine_mode *, int *,
158 bool *);
159 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
160 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
161 static void aarch64_override_options_after_change (void);
162 static bool aarch64_vector_mode_supported_p (machine_mode);
163 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
164 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
165 const_tree type,
166 int misalignment,
167 bool is_packed);
168 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
169 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
170 aarch64_addr_query_type);
171 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
172
173 /* Major revision number of the ARM Architecture implemented by the target. */
174 unsigned aarch64_architecture_version;
175
176 /* The processor for which instructions should be scheduled. */
177 enum aarch64_processor aarch64_tune = cortexa53;
178
179 /* Mask to specify which instruction scheduling options should be used. */
180 uint64_t aarch64_tune_flags = 0;
181
182 /* Global flag for PC relative loads. */
183 bool aarch64_pcrelative_literal_loads;
184
185 /* Global flag for whether frame pointer is enabled. */
186 bool aarch64_use_frame_pointer;
187
188 #define BRANCH_PROTECT_STR_MAX 255
189 char *accepted_branch_protection_string = NULL;
190
191 static enum aarch64_parse_opt_result
192 aarch64_parse_branch_protection (const char*, char**);
193
194 /* Support for command line parsing of boolean flags in the tuning
195 structures. */
196 struct aarch64_flag_desc
197 {
198 const char* name;
199 unsigned int flag;
200 };
201
202 #define AARCH64_FUSION_PAIR(name, internal_name) \
203 { name, AARCH64_FUSE_##internal_name },
204 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
205 {
206 { "none", AARCH64_FUSE_NOTHING },
207 #include "aarch64-fusion-pairs.def"
208 { "all", AARCH64_FUSE_ALL },
209 { NULL, AARCH64_FUSE_NOTHING }
210 };
211
212 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
213 { name, AARCH64_EXTRA_TUNE_##internal_name },
214 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
215 {
216 { "none", AARCH64_EXTRA_TUNE_NONE },
217 #include "aarch64-tuning-flags.def"
218 { "all", AARCH64_EXTRA_TUNE_ALL },
219 { NULL, AARCH64_EXTRA_TUNE_NONE }
220 };
221
222 /* Tuning parameters. */
223
224 static const struct cpu_addrcost_table generic_addrcost_table =
225 {
226 {
227 1, /* hi */
228 0, /* si */
229 0, /* di */
230 1, /* ti */
231 },
232 0, /* pre_modify */
233 0, /* post_modify */
234 0, /* register_offset */
235 0, /* register_sextend */
236 0, /* register_zextend */
237 0 /* imm_offset */
238 };
239
240 static const struct cpu_addrcost_table exynosm1_addrcost_table =
241 {
242 {
243 0, /* hi */
244 0, /* si */
245 0, /* di */
246 2, /* ti */
247 },
248 0, /* pre_modify */
249 0, /* post_modify */
250 1, /* register_offset */
251 1, /* register_sextend */
252 2, /* register_zextend */
253 0, /* imm_offset */
254 };
255
256 static const struct cpu_addrcost_table xgene1_addrcost_table =
257 {
258 {
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
263 },
264 1, /* pre_modify */
265 1, /* post_modify */
266 0, /* register_offset */
267 1, /* register_sextend */
268 1, /* register_zextend */
269 0, /* imm_offset */
270 };
271
272 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
273 {
274 {
275 1, /* hi */
276 1, /* si */
277 1, /* di */
278 2, /* ti */
279 },
280 0, /* pre_modify */
281 0, /* post_modify */
282 2, /* register_offset */
283 3, /* register_sextend */
284 3, /* register_zextend */
285 0, /* imm_offset */
286 };
287
288 static const struct cpu_addrcost_table tsv110_addrcost_table =
289 {
290 {
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
295 },
296 0, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
302 };
303
304 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
305 {
306 {
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
311 },
312 1, /* pre_modify */
313 1, /* post_modify */
314 3, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 2, /* imm_offset */
318 };
319
320 static const struct cpu_regmove_cost generic_regmove_cost =
321 {
322 1, /* GP2GP */
323 /* Avoid the use of slow int<->fp moves for spilling by setting
324 their cost higher than memmov_cost. */
325 5, /* GP2FP */
326 5, /* FP2GP */
327 2 /* FP2FP */
328 };
329
330 static const struct cpu_regmove_cost cortexa57_regmove_cost =
331 {
332 1, /* GP2GP */
333 /* Avoid the use of slow int<->fp moves for spilling by setting
334 their cost higher than memmov_cost. */
335 5, /* GP2FP */
336 5, /* FP2GP */
337 2 /* FP2FP */
338 };
339
340 static const struct cpu_regmove_cost cortexa53_regmove_cost =
341 {
342 1, /* GP2GP */
343 /* Avoid the use of slow int<->fp moves for spilling by setting
344 their cost higher than memmov_cost. */
345 5, /* GP2FP */
346 5, /* FP2GP */
347 2 /* FP2FP */
348 };
349
350 static const struct cpu_regmove_cost exynosm1_regmove_cost =
351 {
352 1, /* GP2GP */
353 /* Avoid the use of slow int<->fp moves for spilling by setting
 354         their cost higher than memmov_cost (the actual costs are 4 and 9). */
355 9, /* GP2FP */
356 9, /* FP2GP */
357 1 /* FP2FP */
358 };
359
360 static const struct cpu_regmove_cost thunderx_regmove_cost =
361 {
362 2, /* GP2GP */
363 2, /* GP2FP */
364 6, /* FP2GP */
365 4 /* FP2FP */
366 };
367
368 static const struct cpu_regmove_cost xgene1_regmove_cost =
369 {
370 1, /* GP2GP */
371 /* Avoid the use of slow int<->fp moves for spilling by setting
372 their cost higher than memmov_cost. */
373 8, /* GP2FP */
374 8, /* FP2GP */
375 2 /* FP2FP */
376 };
377
378 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
379 {
380 2, /* GP2GP */
381 /* Avoid the use of int<->fp moves for spilling. */
382 6, /* GP2FP */
383 6, /* FP2GP */
384 4 /* FP2FP */
385 };
386
387 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
388 {
389 1, /* GP2GP */
390 /* Avoid the use of int<->fp moves for spilling. */
391 8, /* GP2FP */
392 8, /* FP2GP */
393 4 /* FP2FP */
394 };
395
396 static const struct cpu_regmove_cost tsv110_regmove_cost =
397 {
398 1, /* GP2GP */
399 /* Avoid the use of slow int<->fp moves for spilling by setting
400 their cost higher than memmov_cost. */
401 2, /* GP2FP */
402 3, /* FP2GP */
403 2 /* FP2FP */
404 };
405
406 /* Generic costs for vector insn classes. */
407 static const struct cpu_vector_cost generic_vector_cost =
408 {
409 1, /* scalar_int_stmt_cost */
410 1, /* scalar_fp_stmt_cost */
411 1, /* scalar_load_cost */
412 1, /* scalar_store_cost */
413 1, /* vec_int_stmt_cost */
414 1, /* vec_fp_stmt_cost */
415 2, /* vec_permute_cost */
416 1, /* vec_to_scalar_cost */
417 1, /* scalar_to_vec_cost */
418 1, /* vec_align_load_cost */
419 1, /* vec_unalign_load_cost */
420 1, /* vec_unalign_store_cost */
421 1, /* vec_store_cost */
422 3, /* cond_taken_branch_cost */
423 1 /* cond_not_taken_branch_cost */
424 };
425
426 /* QDF24XX costs for vector insn classes. */
427 static const struct cpu_vector_cost qdf24xx_vector_cost =
428 {
429 1, /* scalar_int_stmt_cost */
430 1, /* scalar_fp_stmt_cost */
431 1, /* scalar_load_cost */
432 1, /* scalar_store_cost */
433 1, /* vec_int_stmt_cost */
434 3, /* vec_fp_stmt_cost */
435 2, /* vec_permute_cost */
436 1, /* vec_to_scalar_cost */
437 1, /* scalar_to_vec_cost */
438 1, /* vec_align_load_cost */
439 1, /* vec_unalign_load_cost */
440 1, /* vec_unalign_store_cost */
441 1, /* vec_store_cost */
442 3, /* cond_taken_branch_cost */
443 1 /* cond_not_taken_branch_cost */
444 };
445
446 /* ThunderX costs for vector insn classes. */
447 static const struct cpu_vector_cost thunderx_vector_cost =
448 {
449 1, /* scalar_int_stmt_cost */
450 1, /* scalar_fp_stmt_cost */
451 3, /* scalar_load_cost */
452 1, /* scalar_store_cost */
453 4, /* vec_int_stmt_cost */
454 1, /* vec_fp_stmt_cost */
455 4, /* vec_permute_cost */
456 2, /* vec_to_scalar_cost */
457 2, /* scalar_to_vec_cost */
458 3, /* vec_align_load_cost */
459 5, /* vec_unalign_load_cost */
460 5, /* vec_unalign_store_cost */
461 1, /* vec_store_cost */
462 3, /* cond_taken_branch_cost */
463 3 /* cond_not_taken_branch_cost */
464 };
465
466 static const struct cpu_vector_cost tsv110_vector_cost =
467 {
468 1, /* scalar_int_stmt_cost */
469 1, /* scalar_fp_stmt_cost */
470 5, /* scalar_load_cost */
471 1, /* scalar_store_cost */
472 2, /* vec_int_stmt_cost */
473 2, /* vec_fp_stmt_cost */
474 2, /* vec_permute_cost */
475 3, /* vec_to_scalar_cost */
476 2, /* scalar_to_vec_cost */
477 5, /* vec_align_load_cost */
478 5, /* vec_unalign_load_cost */
479 1, /* vec_unalign_store_cost */
480 1, /* vec_store_cost */
481 1, /* cond_taken_branch_cost */
482 1 /* cond_not_taken_branch_cost */
483 };
484
 485 /* Costs for vector insn classes for Cortex-A57. */
486 static const struct cpu_vector_cost cortexa57_vector_cost =
487 {
488 1, /* scalar_int_stmt_cost */
489 1, /* scalar_fp_stmt_cost */
490 4, /* scalar_load_cost */
491 1, /* scalar_store_cost */
492 2, /* vec_int_stmt_cost */
493 2, /* vec_fp_stmt_cost */
494 3, /* vec_permute_cost */
495 8, /* vec_to_scalar_cost */
496 8, /* scalar_to_vec_cost */
497 4, /* vec_align_load_cost */
498 4, /* vec_unalign_load_cost */
499 1, /* vec_unalign_store_cost */
500 1, /* vec_store_cost */
501 1, /* cond_taken_branch_cost */
502 1 /* cond_not_taken_branch_cost */
503 };
504
505 static const struct cpu_vector_cost exynosm1_vector_cost =
506 {
507 1, /* scalar_int_stmt_cost */
508 1, /* scalar_fp_stmt_cost */
509 5, /* scalar_load_cost */
510 1, /* scalar_store_cost */
511 3, /* vec_int_stmt_cost */
512 3, /* vec_fp_stmt_cost */
513 3, /* vec_permute_cost */
514 3, /* vec_to_scalar_cost */
515 3, /* scalar_to_vec_cost */
516 5, /* vec_align_load_cost */
517 5, /* vec_unalign_load_cost */
518 1, /* vec_unalign_store_cost */
519 1, /* vec_store_cost */
520 1, /* cond_taken_branch_cost */
521 1 /* cond_not_taken_branch_cost */
522 };
523
 524 /* Costs for vector insn classes for X-Gene 1. */
525 static const struct cpu_vector_cost xgene1_vector_cost =
526 {
527 1, /* scalar_int_stmt_cost */
528 1, /* scalar_fp_stmt_cost */
529 5, /* scalar_load_cost */
530 1, /* scalar_store_cost */
531 2, /* vec_int_stmt_cost */
532 2, /* vec_fp_stmt_cost */
533 2, /* vec_permute_cost */
534 4, /* vec_to_scalar_cost */
535 4, /* scalar_to_vec_cost */
536 10, /* vec_align_load_cost */
537 10, /* vec_unalign_load_cost */
538 2, /* vec_unalign_store_cost */
539 2, /* vec_store_cost */
540 2, /* cond_taken_branch_cost */
541 1 /* cond_not_taken_branch_cost */
542 };
543
 544 /* Costs for vector insn classes for Vulcan (ThunderX2 T99). */
545 static const struct cpu_vector_cost thunderx2t99_vector_cost =
546 {
547 1, /* scalar_int_stmt_cost */
548 6, /* scalar_fp_stmt_cost */
549 4, /* scalar_load_cost */
550 1, /* scalar_store_cost */
551 5, /* vec_int_stmt_cost */
552 6, /* vec_fp_stmt_cost */
553 3, /* vec_permute_cost */
554 6, /* vec_to_scalar_cost */
555 5, /* scalar_to_vec_cost */
556 8, /* vec_align_load_cost */
557 8, /* vec_unalign_load_cost */
558 4, /* vec_unalign_store_cost */
559 4, /* vec_store_cost */
560 2, /* cond_taken_branch_cost */
561 1 /* cond_not_taken_branch_cost */
562 };
563
564 /* Generic costs for branch instructions. */
565 static const struct cpu_branch_cost generic_branch_cost =
566 {
567 1, /* Predictable. */
568 3 /* Unpredictable. */
569 };
570
571 /* Generic approximation modes. */
572 static const cpu_approx_modes generic_approx_modes =
573 {
574 AARCH64_APPROX_NONE, /* division */
575 AARCH64_APPROX_NONE, /* sqrt */
576 AARCH64_APPROX_NONE /* recip_sqrt */
577 };
578
579 /* Approximation modes for Exynos M1. */
580 static const cpu_approx_modes exynosm1_approx_modes =
581 {
582 AARCH64_APPROX_NONE, /* division */
583 AARCH64_APPROX_ALL, /* sqrt */
584 AARCH64_APPROX_ALL /* recip_sqrt */
585 };
586
587 /* Approximation modes for X-Gene 1. */
588 static const cpu_approx_modes xgene1_approx_modes =
589 {
590 AARCH64_APPROX_NONE, /* division */
591 AARCH64_APPROX_NONE, /* sqrt */
592 AARCH64_APPROX_ALL /* recip_sqrt */
593 };
594
595 /* Generic prefetch settings (which disable prefetch). */
596 static const cpu_prefetch_tune generic_prefetch_tune =
597 {
598 0, /* num_slots */
599 -1, /* l1_cache_size */
600 -1, /* l1_cache_line_size */
601 -1, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 -1 /* default_opt_level */
605 };
606
607 static const cpu_prefetch_tune exynosm1_prefetch_tune =
608 {
609 0, /* num_slots */
610 -1, /* l1_cache_size */
611 64, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
616 };
617
618 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
619 {
620 4, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 512, /* l2_cache_size */
624 false, /* prefetch_dynamic_strides */
625 2048, /* minimum_stride */
626 3 /* default_opt_level */
627 };
628
629 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
630 {
631 8, /* num_slots */
632 32, /* l1_cache_size */
633 128, /* l1_cache_line_size */
634 16*1024, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 3 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune thunderx_prefetch_tune =
641 {
642 8, /* num_slots */
643 32, /* l1_cache_size */
644 128, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
652 {
653 8, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 256, /* l2_cache_size */
657 true, /* prefetch_dynamic_strides */
658 -1, /* minimum_stride */
659 -1 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune tsv110_prefetch_tune =
663 {
664 0, /* num_slots */
665 64, /* l1_cache_size */
666 64, /* l1_cache_line_size */
667 512, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 -1 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune xgene1_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 64, /* l1_cache_line_size */
678 256, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const struct tune_params generic_tunings =
685 {
686 &cortexa57_extra_costs,
687 &generic_addrcost_table,
688 &generic_regmove_cost,
689 &generic_vector_cost,
690 &generic_branch_cost,
691 &generic_approx_modes,
692 SVE_NOT_IMPLEMENTED, /* sve_width */
693 4, /* memmov_cost */
694 2, /* issue_rate */
695 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
696 "8", /* function_align. */
697 "4", /* jump_align. */
698 "8", /* loop_align. */
699 2, /* int_reassoc_width. */
700 4, /* fp_reassoc_width. */
701 1, /* vec_reassoc_width. */
702 2, /* min_div_recip_mul_sf. */
703 2, /* min_div_recip_mul_df. */
704 0, /* max_case_values. */
705 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
706 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
707 &generic_prefetch_tune
708 };
709
710 static const struct tune_params cortexa35_tunings =
711 {
712 &cortexa53_extra_costs,
713 &generic_addrcost_table,
714 &cortexa53_regmove_cost,
715 &generic_vector_cost,
716 &generic_branch_cost,
717 &generic_approx_modes,
718 SVE_NOT_IMPLEMENTED, /* sve_width */
719 4, /* memmov_cost */
720 1, /* issue_rate */
721 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
722 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
723 "16", /* function_align. */
724 "4", /* jump_align. */
725 "8", /* loop_align. */
726 2, /* int_reassoc_width. */
727 4, /* fp_reassoc_width. */
728 1, /* vec_reassoc_width. */
729 2, /* min_div_recip_mul_sf. */
730 2, /* min_div_recip_mul_df. */
731 0, /* max_case_values. */
732 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
733 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
734 &generic_prefetch_tune
735 };
736
737 static const struct tune_params cortexa53_tunings =
738 {
739 &cortexa53_extra_costs,
740 &generic_addrcost_table,
741 &cortexa53_regmove_cost,
742 &generic_vector_cost,
743 &generic_branch_cost,
744 &generic_approx_modes,
745 SVE_NOT_IMPLEMENTED, /* sve_width */
746 4, /* memmov_cost */
747 2, /* issue_rate */
748 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
749 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
750 "16", /* function_align. */
751 "4", /* jump_align. */
752 "8", /* loop_align. */
753 2, /* int_reassoc_width. */
754 4, /* fp_reassoc_width. */
755 1, /* vec_reassoc_width. */
756 2, /* min_div_recip_mul_sf. */
757 2, /* min_div_recip_mul_df. */
758 0, /* max_case_values. */
759 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
760 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
761 &generic_prefetch_tune
762 };
763
764 static const struct tune_params cortexa57_tunings =
765 {
766 &cortexa57_extra_costs,
767 &generic_addrcost_table,
768 &cortexa57_regmove_cost,
769 &cortexa57_vector_cost,
770 &generic_branch_cost,
771 &generic_approx_modes,
772 SVE_NOT_IMPLEMENTED, /* sve_width */
773 4, /* memmov_cost */
774 3, /* issue_rate */
775 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
776 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
777 "16", /* function_align. */
778 "4", /* jump_align. */
779 "8", /* loop_align. */
780 2, /* int_reassoc_width. */
781 4, /* fp_reassoc_width. */
782 1, /* vec_reassoc_width. */
783 2, /* min_div_recip_mul_sf. */
784 2, /* min_div_recip_mul_df. */
785 0, /* max_case_values. */
786 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
787 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
788 &generic_prefetch_tune
789 };
790
791 static const struct tune_params cortexa72_tunings =
792 {
793 &cortexa57_extra_costs,
794 &generic_addrcost_table,
795 &cortexa57_regmove_cost,
796 &cortexa57_vector_cost,
797 &generic_branch_cost,
798 &generic_approx_modes,
799 SVE_NOT_IMPLEMENTED, /* sve_width */
800 4, /* memmov_cost */
801 3, /* issue_rate */
802 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
803 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
804 "16", /* function_align. */
805 "4", /* jump_align. */
806 "8", /* loop_align. */
807 2, /* int_reassoc_width. */
808 4, /* fp_reassoc_width. */
809 1, /* vec_reassoc_width. */
810 2, /* min_div_recip_mul_sf. */
811 2, /* min_div_recip_mul_df. */
812 0, /* max_case_values. */
813 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
814 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
815 &generic_prefetch_tune
816 };
817
818 static const struct tune_params cortexa73_tunings =
819 {
820 &cortexa57_extra_costs,
821 &generic_addrcost_table,
822 &cortexa57_regmove_cost,
823 &cortexa57_vector_cost,
824 &generic_branch_cost,
825 &generic_approx_modes,
826 SVE_NOT_IMPLEMENTED, /* sve_width */
827 4, /* memmov_cost. */
828 2, /* issue_rate. */
829 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
830 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
831 "16", /* function_align. */
832 "4", /* jump_align. */
833 "8", /* loop_align. */
834 2, /* int_reassoc_width. */
835 4, /* fp_reassoc_width. */
836 1, /* vec_reassoc_width. */
837 2, /* min_div_recip_mul_sf. */
838 2, /* min_div_recip_mul_df. */
839 0, /* max_case_values. */
840 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
841 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
842 &generic_prefetch_tune
843 };
844
845
846
847 static const struct tune_params exynosm1_tunings =
848 {
849 &exynosm1_extra_costs,
850 &exynosm1_addrcost_table,
851 &exynosm1_regmove_cost,
852 &exynosm1_vector_cost,
853 &generic_branch_cost,
854 &exynosm1_approx_modes,
855 SVE_NOT_IMPLEMENTED, /* sve_width */
856 4, /* memmov_cost */
857 3, /* issue_rate */
858 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
859 "4", /* function_align. */
860 "4", /* jump_align. */
861 "4", /* loop_align. */
862 2, /* int_reassoc_width. */
863 4, /* fp_reassoc_width. */
864 1, /* vec_reassoc_width. */
865 2, /* min_div_recip_mul_sf. */
866 2, /* min_div_recip_mul_df. */
867 48, /* max_case_values. */
868 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
869 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
870 &exynosm1_prefetch_tune
871 };
872
873 static const struct tune_params thunderxt88_tunings =
874 {
875 &thunderx_extra_costs,
876 &generic_addrcost_table,
877 &thunderx_regmove_cost,
878 &thunderx_vector_cost,
879 &generic_branch_cost,
880 &generic_approx_modes,
881 SVE_NOT_IMPLEMENTED, /* sve_width */
882 6, /* memmov_cost */
883 2, /* issue_rate */
884 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
885 "8", /* function_align. */
886 "8", /* jump_align. */
887 "8", /* loop_align. */
888 2, /* int_reassoc_width. */
889 4, /* fp_reassoc_width. */
890 1, /* vec_reassoc_width. */
891 2, /* min_div_recip_mul_sf. */
892 2, /* min_div_recip_mul_df. */
893 0, /* max_case_values. */
894 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
895 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
896 &thunderxt88_prefetch_tune
897 };
898
899 static const struct tune_params thunderx_tunings =
900 {
901 &thunderx_extra_costs,
902 &generic_addrcost_table,
903 &thunderx_regmove_cost,
904 &thunderx_vector_cost,
905 &generic_branch_cost,
906 &generic_approx_modes,
907 SVE_NOT_IMPLEMENTED, /* sve_width */
908 6, /* memmov_cost */
909 2, /* issue_rate */
910 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
911 "8", /* function_align. */
912 "8", /* jump_align. */
913 "8", /* loop_align. */
914 2, /* int_reassoc_width. */
915 4, /* fp_reassoc_width. */
916 1, /* vec_reassoc_width. */
917 2, /* min_div_recip_mul_sf. */
918 2, /* min_div_recip_mul_df. */
919 0, /* max_case_values. */
920 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
921 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
922 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
923 &thunderx_prefetch_tune
924 };
925
926 static const struct tune_params tsv110_tunings =
927 {
928 &tsv110_extra_costs,
929 &tsv110_addrcost_table,
930 &tsv110_regmove_cost,
931 &tsv110_vector_cost,
932 &generic_branch_cost,
933 &generic_approx_modes,
934 SVE_NOT_IMPLEMENTED, /* sve_width */
935 4, /* memmov_cost */
936 4, /* issue_rate */
937 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
938 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
939 "16", /* function_align. */
940 "4", /* jump_align. */
941 "8", /* loop_align. */
942 2, /* int_reassoc_width. */
943 4, /* fp_reassoc_width. */
944 1, /* vec_reassoc_width. */
945 2, /* min_div_recip_mul_sf. */
946 2, /* min_div_recip_mul_df. */
947 0, /* max_case_values. */
948 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
949 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
950 &tsv110_prefetch_tune
951 };
952
953 static const struct tune_params xgene1_tunings =
954 {
955 &xgene1_extra_costs,
956 &xgene1_addrcost_table,
957 &xgene1_regmove_cost,
958 &xgene1_vector_cost,
959 &generic_branch_cost,
960 &xgene1_approx_modes,
961 SVE_NOT_IMPLEMENTED, /* sve_width */
962 6, /* memmov_cost */
963 4, /* issue_rate */
964 AARCH64_FUSE_NOTHING, /* fusible_ops */
965 "16", /* function_align. */
966 "16", /* jump_align. */
967 "16", /* loop_align. */
968 2, /* int_reassoc_width. */
969 4, /* fp_reassoc_width. */
970 1, /* vec_reassoc_width. */
971 2, /* min_div_recip_mul_sf. */
972 2, /* min_div_recip_mul_df. */
973 17, /* max_case_values. */
974 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
975 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
976 &xgene1_prefetch_tune
977 };
978
979 static const struct tune_params emag_tunings =
980 {
981 &xgene1_extra_costs,
982 &xgene1_addrcost_table,
983 &xgene1_regmove_cost,
984 &xgene1_vector_cost,
985 &generic_branch_cost,
986 &xgene1_approx_modes,
987 SVE_NOT_IMPLEMENTED,
988 6, /* memmov_cost */
989 4, /* issue_rate */
990 AARCH64_FUSE_NOTHING, /* fusible_ops */
991 "16", /* function_align. */
992 "16", /* jump_align. */
993 "16", /* loop_align. */
994 2, /* int_reassoc_width. */
995 4, /* fp_reassoc_width. */
996 1, /* vec_reassoc_width. */
997 2, /* min_div_recip_mul_sf. */
998 2, /* min_div_recip_mul_df. */
999 17, /* max_case_values. */
1000 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1001 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1002 &xgene1_prefetch_tune
1003 };
1004
1005 static const struct tune_params qdf24xx_tunings =
1006 {
1007 &qdf24xx_extra_costs,
1008 &qdf24xx_addrcost_table,
1009 &qdf24xx_regmove_cost,
1010 &qdf24xx_vector_cost,
1011 &generic_branch_cost,
1012 &generic_approx_modes,
1013 SVE_NOT_IMPLEMENTED, /* sve_width */
1014 4, /* memmov_cost */
1015 4, /* issue_rate */
1016 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1017      | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1018 "16", /* function_align. */
1019 "8", /* jump_align. */
1020 "16", /* loop_align. */
1021 2, /* int_reassoc_width. */
1022 4, /* fp_reassoc_width. */
1023 1, /* vec_reassoc_width. */
1024 2, /* min_div_recip_mul_sf. */
1025 2, /* min_div_recip_mul_df. */
1026 0, /* max_case_values. */
1027 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1028 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1029 &qdf24xx_prefetch_tune
1030 };
1031
1032 /* Tuning structure for the Qualcomm Saphira core.  Default to generic values
1033    for now. */
1034 static const struct tune_params saphira_tunings =
1035 {
1036 &generic_extra_costs,
1037 &generic_addrcost_table,
1038 &generic_regmove_cost,
1039 &generic_vector_cost,
1040 &generic_branch_cost,
1041 &generic_approx_modes,
1042 SVE_NOT_IMPLEMENTED, /* sve_width */
1043 4, /* memmov_cost */
1044 4, /* issue_rate */
1045 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1046      | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1047 "16", /* function_align. */
1048 "8", /* jump_align. */
1049 "16", /* loop_align. */
1050 2, /* int_reassoc_width. */
1051 4, /* fp_reassoc_width. */
1052 1, /* vec_reassoc_width. */
1053 2, /* min_div_recip_mul_sf. */
1054 2, /* min_div_recip_mul_df. */
1055 0, /* max_case_values. */
1056 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1057 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1058 &generic_prefetch_tune
1059 };
1060
1061 static const struct tune_params thunderx2t99_tunings =
1062 {
1063 &thunderx2t99_extra_costs,
1064 &thunderx2t99_addrcost_table,
1065 &thunderx2t99_regmove_cost,
1066 &thunderx2t99_vector_cost,
1067 &generic_branch_cost,
1068 &generic_approx_modes,
1069 SVE_NOT_IMPLEMENTED, /* sve_width */
1070 4, /* memmov_cost. */
1071 4, /* issue_rate. */
1072 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1073 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1074 "16", /* function_align. */
1075 "8", /* jump_align. */
1076 "16", /* loop_align. */
1077 3, /* int_reassoc_width. */
1078 2, /* fp_reassoc_width. */
1079 2, /* vec_reassoc_width. */
1080 2, /* min_div_recip_mul_sf. */
1081 2, /* min_div_recip_mul_df. */
1082 0, /* max_case_values. */
1083 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1084 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1085 &thunderx2t99_prefetch_tune
1086 };
1087
1088 static const struct tune_params neoversen1_tunings =
1089 {
1090 &cortexa57_extra_costs,
1091 &generic_addrcost_table,
1092 &generic_regmove_cost,
1093 &cortexa57_vector_cost,
1094 &generic_branch_cost,
1095 &generic_approx_modes,
1096 SVE_NOT_IMPLEMENTED, /* sve_width */
1097 4, /* memmov_cost */
1098 3, /* issue_rate */
1099 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1100 "32:16", /* function_align. */
1101 "32:16", /* jump_align. */
1102 "32:16", /* loop_align. */
1103 2, /* int_reassoc_width. */
1104 4, /* fp_reassoc_width. */
1105 2, /* vec_reassoc_width. */
1106 2, /* min_div_recip_mul_sf. */
1107 2, /* min_div_recip_mul_df. */
1108 0, /* max_case_values. */
1109 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1110 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1111 &generic_prefetch_tune
1112 };
1113
1114 /* Support for fine-grained override of the tuning structures. */
1115 struct aarch64_tuning_override_function
1116 {
1117 const char* name;
1118 void (*parse_override)(const char*, struct tune_params*);
1119 };
1120
1121 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1122 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1123 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1124
1125 static const struct aarch64_tuning_override_function
1126 aarch64_tuning_override_functions[] =
1127 {
1128 { "fuse", aarch64_parse_fuse_string },
1129 { "tune", aarch64_parse_tune_string },
1130 { "sve_width", aarch64_parse_sve_width_string },
1131 { NULL, NULL }
1132 };
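/* These entries are matched against the name in each "name=value" field of
   the -moverride option string (for example, something like
   -moverride=tune=...,sve_width=256); the corresponding parse_override
   hook is then invoked on that field to adjust the current tune_params.  */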
1133
1134 /* A processor implementing AArch64. */
1135 struct processor
1136 {
1137 const char *const name;
1138 enum aarch64_processor ident;
1139 enum aarch64_processor sched_core;
1140 enum aarch64_arch arch;
1141 unsigned architecture_version;
1142 const uint64_t flags;
1143 const struct tune_params *const tune;
1144 };
1145
1146 /* Architectures implementing AArch64. */
1147 static const struct processor all_architectures[] =
1148 {
1149 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1150 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1151 #include "aarch64-arches.def"
1152 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1153 };
1154
1155 /* Processor cores implementing AArch64. */
1156 static const struct processor all_cores[] =
1157 {
1158 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1159 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1160 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1161 FLAGS, &COSTS##_tunings},
1162 #include "aarch64-cores.def"
1163 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1164 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1165 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1166 };
1167
1168
1169 /* Target specification. These are populated by the -march, -mtune, -mcpu
1170 handling code or by target attributes. */
1171 static const struct processor *selected_arch;
1172 static const struct processor *selected_cpu;
1173 static const struct processor *selected_tune;
1174
1175 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1176
1177 /* The current tuning set. */
1178 struct tune_params aarch64_tune_params = generic_tunings;
1179
1180 /* Table of machine attributes. */
1181 static const struct attribute_spec aarch64_attribute_table[] =
1182 {
1183 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1184 affects_type_identity, handler, exclude } */
1185 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1186 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1187 };
1188
1189 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1190
1191 /* An ISA extension in the co-processor and main instruction set space. */
1192 struct aarch64_option_extension
1193 {
1194 const char *const name;
1195 const unsigned long flags_on;
1196 const unsigned long flags_off;
1197 };
1198
1199 typedef enum aarch64_cond_code
1200 {
1201 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1202 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1203 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1204 }
1205 aarch64_cc;
1206
1207 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
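/* The aarch64_cond_code values follow the architectural encoding, in which
   each condition sits next to its inverse, so flipping the low bit gives
   the inverse condition: EQ <-> NE, CS <-> CC, GE <-> LT, and so on.  For
   example, AARCH64_INVERSE_CONDITION_CODE (AARCH64_GT) is AARCH64_LE.  */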
1208
1209 struct aarch64_branch_protect_type
1210 {
1211 /* The type's name that the user passes to the branch-protection option
1212 string. */
1213 const char* name;
1214 /* Function to handle the protection type and set global variables.
1215      First argument is the string token corresponding to this type and the
1216      second argument is the next token in the option string.
1217      Return values:
1218      * AARCH64_PARSE_OK: Handling was successful.
1219      * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1220        caller should print an error.
1221      * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1222        prints its own error. */
1223 enum aarch64_parse_opt_result (*handler)(char*, char*);
1224 /* A list of types that can follow this type in the option string. */
1225 const aarch64_branch_protect_type* subtypes;
1226 unsigned int num_subtypes;
1227 };
1228
1229 static enum aarch64_parse_opt_result
1230 aarch64_handle_no_branch_protection (char* str, char* rest)
1231 {
1232 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1233 aarch64_enable_bti = 0;
1234 if (rest)
1235 {
1236 error ("unexpected %<%s%> after %<%s%>", rest, str);
1237 return AARCH64_PARSE_INVALID_FEATURE;
1238 }
1239 return AARCH64_PARSE_OK;
1240 }
1241
1242 static enum aarch64_parse_opt_result
1243 aarch64_handle_standard_branch_protection (char* str, char* rest)
1244 {
1245 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1246 aarch64_ra_sign_key = AARCH64_KEY_A;
1247 aarch64_enable_bti = 1;
1248 if (rest)
1249 {
1250 error ("unexpected %<%s%> after %<%s%>", rest, str);
1251 return AARCH64_PARSE_INVALID_FEATURE;
1252 }
1253 return AARCH64_PARSE_OK;
1254 }
1255
1256 static enum aarch64_parse_opt_result
1257 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1258 char* rest ATTRIBUTE_UNUSED)
1259 {
1260 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1261 aarch64_ra_sign_key = AARCH64_KEY_A;
1262 return AARCH64_PARSE_OK;
1263 }
1264
1265 static enum aarch64_parse_opt_result
1266 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1267 char* rest ATTRIBUTE_UNUSED)
1268 {
1269 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1270 return AARCH64_PARSE_OK;
1271 }
1272
1273 static enum aarch64_parse_opt_result
1274 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1275 char* rest ATTRIBUTE_UNUSED)
1276 {
1277 aarch64_ra_sign_key = AARCH64_KEY_B;
1278 return AARCH64_PARSE_OK;
1279 }
1280
1281 static enum aarch64_parse_opt_result
1282 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1283 char* rest ATTRIBUTE_UNUSED)
1284 {
1285 aarch64_enable_bti = 1;
1286 return AARCH64_PARSE_OK;
1287 }
1288
1289 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1290 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1291 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1292 { NULL, NULL, NULL, 0 }
1293 };
1294
1295 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1296 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1297 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1298 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1299 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1300 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1301 { NULL, NULL, NULL, 0 }
1302 };
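/* Together these tables describe the accepted -mbranch-protection strings,
   which are '+'-separated tokens: the top-level types "none", "standard",
   "bti" and "pac-ret", plus the subtypes that may follow "pac-ret".  For
   example, "pac-ret+leaf+b-key" requests return-address signing of all
   functions (not just non-leaf ones) using the B key.  */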
1303
1304 /* The condition codes of the processor, and the inverse function. */
1305 static const char * const aarch64_condition_codes[] =
1306 {
1307 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1308 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1309 };
1310
1311 /* Generate code for conditional branches whose targets lie beyond the
1312    1 MiB range of a conditional branch instruction. */
1312 const char *
1313 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1314 const char * branch_format)
1315 {
1316 rtx_code_label * tmp_label = gen_label_rtx ();
1317 char label_buf[256];
1318 char buffer[128];
1319 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1320 CODE_LABEL_NUMBER (tmp_label));
1321 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1322 rtx dest_label = operands[pos_label];
1323 operands[pos_label] = tmp_label;
1324
1325 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1326 output_asm_insn (buffer, operands);
1327
1328 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1329 operands[pos_label] = dest_label;
1330 output_asm_insn (buffer, operands);
1331 return "";
1332 }
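/* Roughly, the sequence emitted above for an out-of-range conditional
   branch is (label name illustrative):
	<branch_format>	.Llocal
	b	<original destination>
     .Llocal:
   The caller is expected to pass the inverted condition in BRANCH_FORMAT,
   so the short conditional branch only has to skip the following "b",
   which has a much larger range.  */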
1333
1334 void
1335 aarch64_err_no_fpadvsimd (machine_mode mode)
1336 {
1337 if (TARGET_GENERAL_REGS_ONLY)
1338 if (FLOAT_MODE_P (mode))
1339 error ("%qs is incompatible with the use of floating-point types",
1340 "-mgeneral-regs-only");
1341 else
1342 error ("%qs is incompatible with the use of vector types",
1343 "-mgeneral-regs-only");
1344 else
1345 if (FLOAT_MODE_P (mode))
1346 error ("%qs feature modifier is incompatible with the use of"
1347 " floating-point types", "+nofp");
1348 else
1349 error ("%qs feature modifier is incompatible with the use of"
1350 " vector types", "+nofp");
1351 }
1352
1353 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1354 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1355 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1356 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1357 and GENERAL_REGS is lower than the memory cost (in this case the best class
1358    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
1359 cost results in bad allocations with many redundant int<->FP moves which
1360 are expensive on various cores.
1361 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1362 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1363 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1364 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1365 The result of this is that it is no longer inefficient to have a higher
1366 memory move cost than the register move cost.
1367 */
1368
1369 static reg_class_t
1370 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1371 reg_class_t best_class)
1372 {
1373 machine_mode mode;
1374
1375 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1376 || !reg_class_subset_p (FP_REGS, allocno_class))
1377 return allocno_class;
1378
1379 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1380 || !reg_class_subset_p (FP_REGS, best_class))
1381 return best_class;
1382
1383 mode = PSEUDO_REGNO_MODE (regno);
1384 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1385 }
1386
1387 static unsigned int
1388 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1389 {
1390 if (GET_MODE_UNIT_SIZE (mode) == 4)
1391 return aarch64_tune_params.min_div_recip_mul_sf;
1392 return aarch64_tune_params.min_div_recip_mul_df;
1393 }
1394
1395 /* Return the reassociation width of treeop OPC with mode MODE. */
1396 static int
1397 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1398 {
1399 if (VECTOR_MODE_P (mode))
1400 return aarch64_tune_params.vec_reassoc_width;
1401 if (INTEGRAL_MODE_P (mode))
1402 return aarch64_tune_params.int_reassoc_width;
1403 /* Avoid reassociating floating point addition so we emit more FMAs. */
1404 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1405 return aarch64_tune_params.fp_reassoc_width;
1406 return 1;
1407 }
1408
1409 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1410 unsigned
1411 aarch64_dbx_register_number (unsigned regno)
1412 {
1413 if (GP_REGNUM_P (regno))
1414 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1415 else if (regno == SP_REGNUM)
1416 return AARCH64_DWARF_SP;
1417 else if (FP_REGNUM_P (regno))
1418 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1419 else if (PR_REGNUM_P (regno))
1420 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1421 else if (regno == VG_REGNUM)
1422 return AARCH64_DWARF_VG;
1423
1424 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1425 equivalent DWARF register. */
1426 return DWARF_FRAME_REGISTERS;
1427 }
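/* Under the AArch64 DWARF register numbering this maps x0-x30 to 0-30,
   sp to 31 and v0-v31 to 64-95, with the SVE predicate registers and VG
   taking the AARCH64_DWARF_P0 and AARCH64_DWARF_VG values defined in the
   backend headers.  */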
1428
1429 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1430 static bool
1431 aarch64_advsimd_struct_mode_p (machine_mode mode)
1432 {
1433 return (TARGET_SIMD
1434 && (mode == OImode || mode == CImode || mode == XImode));
1435 }
1436
1437 /* Return true if MODE is an SVE predicate mode. */
1438 static bool
1439 aarch64_sve_pred_mode_p (machine_mode mode)
1440 {
1441 return (TARGET_SVE
1442 && (mode == VNx16BImode
1443 || mode == VNx8BImode
1444 || mode == VNx4BImode
1445 || mode == VNx2BImode));
1446 }
1447
1448 /* Three mutually-exclusive flags describing a vector or predicate type. */
1449 const unsigned int VEC_ADVSIMD = 1;
1450 const unsigned int VEC_SVE_DATA = 2;
1451 const unsigned int VEC_SVE_PRED = 4;
1452 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1453 a structure of 2, 3 or 4 vectors. */
1454 const unsigned int VEC_STRUCT = 8;
1455 /* Useful combinations of the above. */
1456 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1457 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1458
1459 /* Return a set of flags describing the vector properties of mode MODE.
1460 Ignore modes that are not supported by the current target. */
1461 static unsigned int
1462 aarch64_classify_vector_mode (machine_mode mode)
1463 {
1464 if (aarch64_advsimd_struct_mode_p (mode))
1465 return VEC_ADVSIMD | VEC_STRUCT;
1466
1467 if (aarch64_sve_pred_mode_p (mode))
1468 return VEC_SVE_PRED;
1469
1470 scalar_mode inner = GET_MODE_INNER (mode);
1471 if (VECTOR_MODE_P (mode)
1472 && (inner == QImode
1473 || inner == HImode
1474 || inner == HFmode
1475 || inner == SImode
1476 || inner == SFmode
1477 || inner == DImode
1478 || inner == DFmode))
1479 {
1480 if (TARGET_SVE)
1481 {
1482 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1483 return VEC_SVE_DATA;
1484 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1485 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1486 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1487 return VEC_SVE_DATA | VEC_STRUCT;
1488 }
1489
1490 /* This includes V1DF but not V1DI (which doesn't exist). */
1491 if (TARGET_SIMD
1492 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1493 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1494 return VEC_ADVSIMD;
1495 }
1496
1497 return 0;
1498 }
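/* For example, V4SImode (a 128-bit Advanced SIMD vector) classifies as
   VEC_ADVSIMD, the OImode/CImode/XImode structure modes as
   VEC_ADVSIMD | VEC_STRUCT, and a single SVE data vector such as
   VNx4SImode as VEC_SVE_DATA; scalar modes return 0.  */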
1499
1500 /* Return true if MODE is any of the data vector modes, including
1501 structure modes. */
1502 static bool
1503 aarch64_vector_data_mode_p (machine_mode mode)
1504 {
1505 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1506 }
1507
1508 /* Return true if MODE is an SVE data vector mode; either a single vector
1509 or a structure of vectors. */
1510 static bool
1511 aarch64_sve_data_mode_p (machine_mode mode)
1512 {
1513 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1514 }
1515
1516 /* Implement target hook TARGET_ARRAY_MODE. */
1517 static opt_machine_mode
1518 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1519 {
1520 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1521 && IN_RANGE (nelems, 2, 4))
1522 return mode_for_vector (GET_MODE_INNER (mode),
1523 GET_MODE_NUNITS (mode) * nelems);
1524
1525 return opt_machine_mode ();
1526 }
1527
1528 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1529 static bool
1530 aarch64_array_mode_supported_p (machine_mode mode,
1531 unsigned HOST_WIDE_INT nelems)
1532 {
1533 if (TARGET_SIMD
1534 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1535 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1536 && (nelems >= 2 && nelems <= 4))
1537 return true;
1538
1539 return false;
1540 }
1541
1542 /* Return the SVE predicate mode to use for elements that have
1543 ELEM_NBYTES bytes, if such a mode exists. */
1544
1545 opt_machine_mode
1546 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1547 {
1548 if (TARGET_SVE)
1549 {
1550 if (elem_nbytes == 1)
1551 return VNx16BImode;
1552 if (elem_nbytes == 2)
1553 return VNx8BImode;
1554 if (elem_nbytes == 4)
1555 return VNx4BImode;
1556 if (elem_nbytes == 8)
1557 return VNx2BImode;
1558 }
1559 return opt_machine_mode ();
1560 }
1561
1562 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1563
1564 static opt_machine_mode
1565 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1566 {
1567 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1568 {
1569 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1570 machine_mode pred_mode;
1571 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1572 return pred_mode;
1573 }
1574
1575 return default_get_mask_mode (nunits, nbytes);
1576 }
1577
1578 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1579 prefer to use the first arithmetic operand as the else value if
1580 the else value doesn't matter, since that exactly matches the SVE
1581 destructive merging form. For ternary operations we could either
1582 pick the first operand and use FMAD-like instructions or the last
1583 operand and use FMLA-like instructions; the latter seems more
1584 natural. */
1585
1586 static tree
1587 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1588 {
1589 return nops == 3 ? ops[2] : ops[0];
1590 }
1591
1592 /* Implement TARGET_HARD_REGNO_NREGS. */
1593
1594 static unsigned int
1595 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1596 {
1597 /* ??? Logically we should only need to provide a value when
1598 HARD_REGNO_MODE_OK says that the combination is valid,
1599 but at the moment we need to handle all modes. Just ignore
1600 any runtime parts for registers that can't store them. */
1601 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1602 switch (aarch64_regno_regclass (regno))
1603 {
1604 case FP_REGS:
1605 case FP_LO_REGS:
1606 if (aarch64_sve_data_mode_p (mode))
1607 return exact_div (GET_MODE_SIZE (mode),
1608 BYTES_PER_SVE_VECTOR).to_constant ();
1609 return CEIL (lowest_size, UNITS_PER_VREG);
1610 case PR_REGS:
1611 case PR_LO_REGS:
1612 case PR_HI_REGS:
1613 return 1;
1614 default:
1615 return CEIL (lowest_size, UNITS_PER_WORD);
1616 }
1617 gcc_unreachable ();
1618 }
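/* For example, TImode occupies two general registers (CEIL (16, 8)) but a
   single FP/vector register, while an SVE data mode spanning one SVE
   vector, such as VNx4SImode, always counts as one register regardless of
   the runtime vector length.  */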
1619
1620 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1621
1622 static bool
1623 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1624 {
1625 if (GET_MODE_CLASS (mode) == MODE_CC)
1626 return regno == CC_REGNUM;
1627
1628 if (regno == VG_REGNUM)
1629 /* This must have the same size as _Unwind_Word. */
1630 return mode == DImode;
1631
1632 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1633 if (vec_flags & VEC_SVE_PRED)
1634 return PR_REGNUM_P (regno);
1635
1636 if (PR_REGNUM_P (regno))
1637 return 0;
1638
1639 if (regno == SP_REGNUM)
1640 /* The purpose of comparing with ptr_mode is to support the
1641 global register variable associated with the stack pointer
1642 register via the syntax of asm ("wsp") in ILP32. */
1643 return mode == Pmode || mode == ptr_mode;
1644
1645 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1646 return mode == Pmode;
1647
1648 if (GP_REGNUM_P (regno))
1649 {
1650 if (known_le (GET_MODE_SIZE (mode), 8))
1651 return true;
1652 else if (known_le (GET_MODE_SIZE (mode), 16))
1653 return (regno & 1) == 0;
1654 }
1655 else if (FP_REGNUM_P (regno))
1656 {
1657 if (vec_flags & VEC_STRUCT)
1658 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1659 else
1660 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1661 }
1662
1663 return false;
1664 }
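/* Note that a mode wider than 8 but at most 16 bytes (TImode, for
   instance) is accepted in general registers only at an even register
   number; such a value occupies the register pair regno, regno + 1.  */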
1665
1666 /* Return true if this is a definition of a vectorized simd function. */
1667
1668 static bool
1669 aarch64_simd_decl_p (tree fndecl)
1670 {
1671 tree fntype;
1672
1673 if (fndecl == NULL)
1674 return false;
1675 fntype = TREE_TYPE (fndecl);
1676 if (fntype == NULL)
1677 return false;
1678
1679 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1680 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1681 return true;
1682
1683 return false;
1684 }
1685
1686 /* Return the mode a register save/restore should use. DImode for integer
1687 registers, DFmode for FP registers in non-SIMD functions (they only save
1688 the bottom half of a 128 bit register), or TFmode for FP registers in
1689 SIMD functions. */
1690
1691 static machine_mode
1692 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1693 {
1694 return GP_REGNUM_P (regno)
1695 ? E_DImode
1696 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1697 }
1698
1699 /* Return true if the instruction is a call to a SIMD function, false
1700 if it is not a SIMD function or if we do not know anything about
1701 the function. */
1702
1703 static bool
1704 aarch64_simd_call_p (rtx_insn *insn)
1705 {
1706 rtx symbol;
1707 rtx call;
1708 tree fndecl;
1709
1710 gcc_assert (CALL_P (insn));
1711 call = get_call_rtx_from (insn);
1712 symbol = XEXP (XEXP (call, 0), 0);
1713 if (GET_CODE (symbol) != SYMBOL_REF)
1714 return false;
1715 fndecl = SYMBOL_REF_DECL (symbol);
1716 if (!fndecl)
1717 return false;
1718
1719 return aarch64_simd_decl_p (fndecl);
1720 }
1721
1722 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1723 a function that uses the SIMD ABI, take advantage of the extra
1724 call-preserved registers that the ABI provides. */
1725
1726 void
1727 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1728 HARD_REG_SET *return_set)
1729 {
1730 if (aarch64_simd_call_p (insn))
1731 {
1732 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1733 if (FP_SIMD_SAVED_REGNUM_P (regno))
1734 CLEAR_HARD_REG_BIT (*return_set, regno);
1735 }
1736 }
1737
1738 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1739 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1740 clobbers the top 64 bits when restoring the bottom 64 bits. */
1741
1742 static bool
1743 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1744 machine_mode mode)
1745 {
1746 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1747 return FP_REGNUM_P (regno)
1748 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1749 }
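/* For example, a V2DImode value (16 bytes) held in v8 survives a call to a
   function using the aarch64_vector_pcs ABI, which preserves the full
   128-bit registers, but is considered part-clobbered across an ordinary
   call, where only the low 64 bits of v8-v15 are preserved.  */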
1750
1751 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1752
1753 rtx_insn *
1754 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1755 {
1756 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1757
1758 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1759 return call_1;
1760 else
1761 return call_2;
1762 }
1763
1764 /* Implement REGMODE_NATURAL_SIZE. */
1765 poly_uint64
1766 aarch64_regmode_natural_size (machine_mode mode)
1767 {
1768 /* The natural size for SVE data modes is one SVE data vector,
1769 and similarly for predicates. We can't independently modify
1770 anything smaller than that. */
1771 /* ??? For now, only do this for variable-width SVE registers.
1772 Doing it for constant-sized registers breaks lower-subreg.c. */
1773 /* ??? And once that's fixed, we should probably have similar
1774 code for Advanced SIMD. */
1775 if (!aarch64_sve_vg.is_constant ())
1776 {
1777 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1778 if (vec_flags & VEC_SVE_PRED)
1779 return BYTES_PER_SVE_PRED;
1780 if (vec_flags & VEC_SVE_DATA)
1781 return BYTES_PER_SVE_VECTOR;
1782 }
1783 return UNITS_PER_WORD;
1784 }
1785
1786 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1787 machine_mode
1788 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1789 machine_mode mode)
1790 {
1791 /* The predicate mode determines which bits are significant and
1792 which are "don't care". Decreasing the number of lanes would
1793 lose data while increasing the number of lanes would make bits
1794 unnecessarily significant. */
1795 if (PR_REGNUM_P (regno))
1796 return mode;
1797 if (known_ge (GET_MODE_SIZE (mode), 4))
1798 return mode;
1799 else
1800 return SImode;
1801 }
1802
1803 /* Return true if I's bits are consecutive ones from the MSB. */
1804 bool
1805 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1806 {
1807 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1808 }
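/* A worked example of the check above (illustrative only): for
   i = 0xffffffffffff0000 the 48 high bits are consecutive ones, -i is
   0x10000, a power of two, so exact_log2 returns 16 and the function
   returns true.  For i = 0xff00ff0000000000 the ones are not contiguous
   from the MSB, -i is not a power of two, exact_log2 returns -1 and the
   function returns false.  */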
1809
1810 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1811 that strcpy from constants will be faster. */
1812
1813 static HOST_WIDE_INT
1814 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1815 {
1816 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1817 return MAX (align, BITS_PER_WORD);
1818 return align;
1819 }
1820
1821 /* Return true if calls to DECL should be treated as
1822 long-calls (ie called via a register). */
1823 static bool
1824 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1825 {
1826 return false;
1827 }
1828
1829 /* Return true if calls to symbol-ref SYM should be treated as
1830 long-calls (ie called via a register). */
1831 bool
1832 aarch64_is_long_call_p (rtx sym)
1833 {
1834 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1835 }
1836
1837 /* Return true if calls to symbol-ref SYM should not go through
1838 plt stubs. */
1839
1840 bool
1841 aarch64_is_noplt_call_p (rtx sym)
1842 {
1843 const_tree decl = SYMBOL_REF_DECL (sym);
1844
1845 if (flag_pic
1846 && decl
1847 && (!flag_plt
1848 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1849 && !targetm.binds_local_p (decl))
1850 return true;
1851
1852 return false;
1853 }
1854
1855 /* Return true if the offsets to a zero/sign-extract operation
1856 represent an expression that matches an extend operation. The
1857 operands represent the parameters from
1858
1859 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1860 bool
1861 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1862 rtx extract_imm)
1863 {
1864 HOST_WIDE_INT mult_val, extract_val;
1865
1866 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1867 return false;
1868
1869 mult_val = INTVAL (mult_imm);
1870 extract_val = INTVAL (extract_imm);
1871
1872 if (extract_val > 8
1873 && extract_val < GET_MODE_BITSIZE (mode)
1874 && exact_log2 (extract_val & ~7) > 0
1875 && (extract_val & 7) <= 4
1876 && mult_val == (1 << (extract_val & 7)))
1877 return true;
1878
1879 return false;
1880 }
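/* An illustrative case of the conditions above, assuming DImode:
   MULT_IMM == 4 and EXTRACT_IMM == 34 pass every check (34 > 8,
   34 < 64, 34 & ~7 == 32 is a power of two, 34 & 7 == 2 <= 4, and
   4 == 1 << 2), so

     (extract:DI (mult (reg) (const_int 4)) (const_int 34) (const_int 0))

   behaves like a 32-bit extend of the register followed by a left shift
   by 2, i.e. an extended-register operand such as UXTW #2.  */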
1881
1882 /* Emit an insn that's a simple single-set. Both the operands must be
1883 known to be valid. */
1884 inline static rtx_insn *
1885 emit_set_insn (rtx x, rtx y)
1886 {
1887 return emit_insn (gen_rtx_SET (x, y));
1888 }
1889
1890 /* X and Y are two things to compare using CODE. Emit the compare insn and
1891 return the rtx for register 0 in the proper mode. */
1892 rtx
1893 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1894 {
1895 machine_mode mode = SELECT_CC_MODE (code, x, y);
1896 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1897
1898 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1899 return cc_reg;
1900 }
1901
1902 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1903
1904 static rtx
1905 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1906 machine_mode y_mode)
1907 {
1908 if (y_mode == E_QImode || y_mode == E_HImode)
1909 {
1910 if (CONST_INT_P (y))
1911 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1912 else
1913 {
1914 rtx t, cc_reg;
1915 machine_mode cc_mode;
1916
1917 t = gen_rtx_ZERO_EXTEND (SImode, y);
1918 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1919 cc_mode = CC_SWPmode;
1920 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1921 emit_set_insn (cc_reg, t);
1922 return cc_reg;
1923 }
1924 }
1925
1926 return aarch64_gen_compare_reg (code, x, y);
1927 }
1928
1929 /* Build the SYMBOL_REF for __tls_get_addr. */
1930
1931 static GTY(()) rtx tls_get_addr_libfunc;
1932
1933 rtx
1934 aarch64_tls_get_addr (void)
1935 {
1936 if (!tls_get_addr_libfunc)
1937 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1938 return tls_get_addr_libfunc;
1939 }
1940
1941 /* Return the TLS model to use for ADDR. */
1942
1943 static enum tls_model
1944 tls_symbolic_operand_type (rtx addr)
1945 {
1946 enum tls_model tls_kind = TLS_MODEL_NONE;
1947 if (GET_CODE (addr) == CONST)
1948 {
1949 poly_int64 addend;
1950 rtx sym = strip_offset (addr, &addend);
1951 if (GET_CODE (sym) == SYMBOL_REF)
1952 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1953 }
1954 else if (GET_CODE (addr) == SYMBOL_REF)
1955 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1956
1957 return tls_kind;
1958 }
1959
1960 /* We'll allow lo_sum's in our legitimate addresses so that combine
1961 can take care of combining addresses where necessary, but for code
1962 generation purposes we'll generate the address
1963 as:
1964 RTL Absolute
1965 tmp = hi (symbol_ref); adrp x1, foo
1966 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1967 nop
1968
1969 PIC TLS
1970 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1971 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1972 bl __tls_get_addr
1973 nop
1974
1975 Load TLS symbol, depending on TLS mechanism and TLS access model.
1976
1977 Global Dynamic - Traditional TLS:
1978 adrp tmp, :tlsgd:imm
1979 add dest, tmp, #:tlsgd_lo12:imm
1980 bl __tls_get_addr
1981
1982 Global Dynamic - TLS Descriptors:
1983 adrp dest, :tlsdesc:imm
1984 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1985 add dest, dest, #:tlsdesc_lo12:imm
1986 blr tmp
1987 mrs tp, tpidr_el0
1988 add dest, dest, tp
1989
1990 Initial Exec:
1991 mrs tp, tpidr_el0
1992 adrp tmp, :gottprel:imm
1993 ldr dest, [tmp, #:gottprel_lo12:imm]
1994 add dest, dest, tp
1995
1996 Local Exec:
1997 mrs tp, tpidr_el0
1998 add t0, tp, #:tprel_hi12:imm, lsl #12
1999 add t0, t0, #:tprel_lo12_nc:imm
2000 */
2001
2002 static void
2003 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2004 enum aarch64_symbol_type type)
2005 {
2006 switch (type)
2007 {
2008 case SYMBOL_SMALL_ABSOLUTE:
2009 {
2010 /* In ILP32, the mode of dest can be either SImode or DImode. */
2011 rtx tmp_reg = dest;
2012 machine_mode mode = GET_MODE (dest);
2013
2014 gcc_assert (mode == Pmode || mode == ptr_mode);
2015
2016 if (can_create_pseudo_p ())
2017 tmp_reg = gen_reg_rtx (mode);
2018
2019 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2020 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2021 return;
2022 }
2023
2024 case SYMBOL_TINY_ABSOLUTE:
2025 emit_insn (gen_rtx_SET (dest, imm));
2026 return;
2027
2028 case SYMBOL_SMALL_GOT_28K:
2029 {
2030 machine_mode mode = GET_MODE (dest);
2031 rtx gp_rtx = pic_offset_table_rtx;
2032 rtx insn;
2033 rtx mem;
2034
2035 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2036 here before rtl expand. Tree IVOPT will generate rtl pattern to
2037 decide rtx costs, in which case pic_offset_table_rtx is not
2038 initialized. For that case no need to generate the first adrp
2039 instruction as the final cost for global variable access is
2040 one instruction. */
2041 if (gp_rtx != NULL)
2042 {
2043 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since we
2044 use the page base as the GOT base, the first page may be wasted;
2045 in the worst case only 28K of GOT space is left).
2046
2047 The instruction sequence generated to access a global variable
2048 is:
2049
2050 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2051
2052 Only one instruction is needed, but we must initialize
2053 pic_offset_table_rtx properly. We generate the initialization insn
2054 for every global access and allow CSE to remove all redundant copies.
2055
2056 The final instruction sequence for accessing multiple global
2057 variables will look like the following:
2058
2059 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2060
2061 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2062 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2063 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2064 ... */
2065
2066 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2067 crtl->uses_pic_offset_table = 1;
2068 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2069
2070 if (mode != GET_MODE (gp_rtx))
2071 gp_rtx = gen_lowpart (mode, gp_rtx);
2072
2073 }
2074
2075 if (mode == ptr_mode)
2076 {
2077 if (mode == DImode)
2078 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2079 else
2080 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2081
2082 mem = XVECEXP (SET_SRC (insn), 0, 0);
2083 }
2084 else
2085 {
2086 gcc_assert (mode == Pmode);
2087
2088 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2089 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2090 }
2091
2092 /* The operand is expected to be a MEM. Whenever the related insn
2093 pattern is changed, the code above that computes MEM must be
2094 updated. */
2095 gcc_assert (GET_CODE (mem) == MEM);
2096 MEM_READONLY_P (mem) = 1;
2097 MEM_NOTRAP_P (mem) = 1;
2098 emit_insn (insn);
2099 return;
2100 }
2101
2102 case SYMBOL_SMALL_GOT_4G:
2103 {
2104 /* In ILP32, the mode of dest can be either SImode or DImode,
2105 while the got entry is always of SImode size. The mode of
2106 dest depends on how dest is used: if dest is assigned to a
2107 pointer (e.g. in the memory), it has SImode; it may have
2108 DImode if dest is dereferenced to access the memory.
2109 This is why we have to handle three different ldr_got_small
2110 patterns here (two patterns for ILP32). */
2111
2112 rtx insn;
2113 rtx mem;
2114 rtx tmp_reg = dest;
2115 machine_mode mode = GET_MODE (dest);
2116
2117 if (can_create_pseudo_p ())
2118 tmp_reg = gen_reg_rtx (mode);
2119
2120 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2121 if (mode == ptr_mode)
2122 {
2123 if (mode == DImode)
2124 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2125 else
2126 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2127
2128 mem = XVECEXP (SET_SRC (insn), 0, 0);
2129 }
2130 else
2131 {
2132 gcc_assert (mode == Pmode);
2133
2134 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2135 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2136 }
2137
2138 gcc_assert (GET_CODE (mem) == MEM);
2139 MEM_READONLY_P (mem) = 1;
2140 MEM_NOTRAP_P (mem) = 1;
2141 emit_insn (insn);
2142 return;
2143 }
2144
2145 case SYMBOL_SMALL_TLSGD:
2146 {
2147 rtx_insn *insns;
2148 machine_mode mode = GET_MODE (dest);
2149 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2150
2151 start_sequence ();
2152 if (TARGET_ILP32)
2153 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2154 else
2155 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2156 insns = get_insns ();
2157 end_sequence ();
2158
2159 RTL_CONST_CALL_P (insns) = 1;
2160 emit_libcall_block (insns, dest, result, imm);
2161 return;
2162 }
2163
2164 case SYMBOL_SMALL_TLSDESC:
2165 {
2166 machine_mode mode = GET_MODE (dest);
2167 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2168 rtx tp;
2169
2170 gcc_assert (mode == Pmode || mode == ptr_mode);
2171
2172 /* In ILP32, the got entry is always of SImode size. Unlike
2173 small GOT, the dest is fixed at reg 0. */
2174 if (TARGET_ILP32)
2175 emit_insn (gen_tlsdesc_small_si (imm));
2176 else
2177 emit_insn (gen_tlsdesc_small_di (imm));
2178 tp = aarch64_load_tp (NULL);
2179
2180 if (mode != Pmode)
2181 tp = gen_lowpart (mode, tp);
2182
2183 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2184 if (REG_P (dest))
2185 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2186 return;
2187 }
2188
2189 case SYMBOL_SMALL_TLSIE:
2190 {
2191 /* In ILP32, the mode of dest can be either SImode or DImode,
2192 while the got entry is always of SImode size. The mode of
2193 dest depends on how dest is used: if dest is assigned to a
2194 pointer (e.g. in the memory), it has SImode; it may have
2195 DImode if dest is dereferenced to access the memory.
2196 This is why we have to handle three different tlsie_small
2197 patterns here (two patterns for ILP32). */
2198 machine_mode mode = GET_MODE (dest);
2199 rtx tmp_reg = gen_reg_rtx (mode);
2200 rtx tp = aarch64_load_tp (NULL);
2201
2202 if (mode == ptr_mode)
2203 {
2204 if (mode == DImode)
2205 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2206 else
2207 {
2208 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2209 tp = gen_lowpart (mode, tp);
2210 }
2211 }
2212 else
2213 {
2214 gcc_assert (mode == Pmode);
2215 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2216 }
2217
2218 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2219 if (REG_P (dest))
2220 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2221 return;
2222 }
2223
2224 case SYMBOL_TLSLE12:
2225 case SYMBOL_TLSLE24:
2226 case SYMBOL_TLSLE32:
2227 case SYMBOL_TLSLE48:
2228 {
2229 machine_mode mode = GET_MODE (dest);
2230 rtx tp = aarch64_load_tp (NULL);
2231
2232 if (mode != Pmode)
2233 tp = gen_lowpart (mode, tp);
2234
2235 switch (type)
2236 {
2237 case SYMBOL_TLSLE12:
2238 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2239 (dest, tp, imm));
2240 break;
2241 case SYMBOL_TLSLE24:
2242 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2243 (dest, tp, imm));
2244 break;
2245 case SYMBOL_TLSLE32:
2246 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2247 (dest, imm));
2248 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2249 (dest, dest, tp));
2250 break;
2251 case SYMBOL_TLSLE48:
2252 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2253 (dest, imm));
2254 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2255 (dest, dest, tp));
2256 break;
2257 default:
2258 gcc_unreachable ();
2259 }
2260
2261 if (REG_P (dest))
2262 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2263 return;
2264 }
2265
2266 case SYMBOL_TINY_GOT:
2267 emit_insn (gen_ldr_got_tiny (dest, imm));
2268 return;
2269
2270 case SYMBOL_TINY_TLSIE:
2271 {
2272 machine_mode mode = GET_MODE (dest);
2273 rtx tp = aarch64_load_tp (NULL);
2274
2275 if (mode == ptr_mode)
2276 {
2277 if (mode == DImode)
2278 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2279 else
2280 {
2281 tp = gen_lowpart (mode, tp);
2282 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2283 }
2284 }
2285 else
2286 {
2287 gcc_assert (mode == Pmode);
2288 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2289 }
2290
2291 if (REG_P (dest))
2292 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2293 return;
2294 }
2295
2296 default:
2297 gcc_unreachable ();
2298 }
2299 }
2300
2301 /* Emit a move from SRC to DEST. Assume that the move expanders can
2302 handle all moves if !can_create_pseudo_p (). The distinction is
2303 important because, unlike emit_move_insn, the move expanders know
2304 how to force Pmode objects into the constant pool even when the
2305 constant pool address is not itself legitimate. */
2306 static rtx
2307 aarch64_emit_move (rtx dest, rtx src)
2308 {
2309 return (can_create_pseudo_p ()
2310 ? emit_move_insn (dest, src)
2311 : emit_move_insn_1 (dest, src));
2312 }
2313
2314 /* Apply UNOPTAB to OP and store the result in DEST. */
2315
2316 static void
2317 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2318 {
2319 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2320 if (dest != tmp)
2321 emit_move_insn (dest, tmp);
2322 }
2323
2324 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2325
2326 static void
2327 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2328 {
2329 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2330 OPTAB_DIRECT);
2331 if (dest != tmp)
2332 emit_move_insn (dest, tmp);
2333 }
2334
2335 /* Split a 128-bit move operation into two 64-bit move operations,
2336 taking care to handle partial overlap of register to register
2337 copies. Special cases are needed when moving between GP regs and
2338 FP regs. SRC can be a register, constant or memory; DST a register
2339 or memory. If either operand is memory it must not have any side
2340 effects. */
2341 void
2342 aarch64_split_128bit_move (rtx dst, rtx src)
2343 {
2344 rtx dst_lo, dst_hi;
2345 rtx src_lo, src_hi;
2346
2347 machine_mode mode = GET_MODE (dst);
2348
2349 gcc_assert (mode == TImode || mode == TFmode);
2350 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2351 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2352
2353 if (REG_P (dst) && REG_P (src))
2354 {
2355 int src_regno = REGNO (src);
2356 int dst_regno = REGNO (dst);
2357
2358 /* Handle FP <-> GP regs. */
2359 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2360 {
2361 src_lo = gen_lowpart (word_mode, src);
2362 src_hi = gen_highpart (word_mode, src);
2363
2364 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2365 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2366 return;
2367 }
2368 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2369 {
2370 dst_lo = gen_lowpart (word_mode, dst);
2371 dst_hi = gen_highpart (word_mode, dst);
2372
2373 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2374 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2375 return;
2376 }
2377 }
2378
2379 dst_lo = gen_lowpart (word_mode, dst);
2380 dst_hi = gen_highpart (word_mode, dst);
2381 src_lo = gen_lowpart (word_mode, src);
2382 src_hi = gen_highpart_mode (word_mode, mode, src);
2383
2384 /* At most one pairing may overlap. */
2385 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2386 {
2387 aarch64_emit_move (dst_hi, src_hi);
2388 aarch64_emit_move (dst_lo, src_lo);
2389 }
2390 else
2391 {
2392 aarch64_emit_move (dst_lo, src_lo);
2393 aarch64_emit_move (dst_hi, src_hi);
2394 }
2395 }
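/* An illustrative example of the overlap handling above, taking the
   lower-numbered register of each pair as the low half of the TImode
   value: copying {x1,x2} into {x0,x1} finds no overlap between dst_lo
   (x0) and src_hi (x2), so the low halves move first (x0 = x1, then
   x1 = x2).  Copying {x0,x1} into {x1,x2} finds that dst_lo (x1)
   overlaps src_hi (x1), so the high halves move first (x2 = x1, then
   x1 = x0).  */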
2396
2397 bool
2398 aarch64_split_128bit_move_p (rtx dst, rtx src)
2399 {
2400 return (! REG_P (src)
2401 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2402 }
2403
2404 /* Split a complex SIMD combine. */
2405
2406 void
2407 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2408 {
2409 machine_mode src_mode = GET_MODE (src1);
2410 machine_mode dst_mode = GET_MODE (dst);
2411
2412 gcc_assert (VECTOR_MODE_P (dst_mode));
2413 gcc_assert (register_operand (dst, dst_mode)
2414 && register_operand (src1, src_mode)
2415 && register_operand (src2, src_mode));
2416
2417 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2418 return;
2419 }
2420
2421 /* Split a complex SIMD move. */
2422
2423 void
2424 aarch64_split_simd_move (rtx dst, rtx src)
2425 {
2426 machine_mode src_mode = GET_MODE (src);
2427 machine_mode dst_mode = GET_MODE (dst);
2428
2429 gcc_assert (VECTOR_MODE_P (dst_mode));
2430
2431 if (REG_P (dst) && REG_P (src))
2432 {
2433 gcc_assert (VECTOR_MODE_P (src_mode));
2434 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2435 }
2436 }
2437
2438 bool
2439 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2440 machine_mode ymode, rtx y)
2441 {
2442 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2443 gcc_assert (r != NULL);
2444 return rtx_equal_p (x, r);
2445 }
2446
2447
2448 static rtx
2449 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2450 {
2451 if (can_create_pseudo_p ())
2452 return force_reg (mode, value);
2453 else
2454 {
2455 gcc_assert (x);
2456 aarch64_emit_move (x, value);
2457 return x;
2458 }
2459 }
2460
2461 /* Return true if we can move VALUE into a register using a single
2462 CNT[BHWD] instruction. */
2463
2464 static bool
2465 aarch64_sve_cnt_immediate_p (poly_int64 value)
2466 {
2467 HOST_WIDE_INT factor = value.coeffs[0];
2468 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2469 return (value.coeffs[1] == factor
2470 && IN_RANGE (factor, 2, 16 * 16)
2471 && (factor & 1) == 0
2472 && factor <= 16 * (factor & -factor));
2473 }
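/* Illustrative examples of the test above: poly_int64 (8, 8) -- the
   number of 16-bit elements in an SVE vector -- is accepted: both
   coefficients are 8, 8 is even, lies in [2, 256] and satisfies
   8 <= 16 * 8, so it can be loaded with CNTH.  poly_int64 (34, 34) is
   rejected because 34 > 16 * (34 & -34): it would need a multiplier
   of 17.  A plain constant such as 16 is rejected because its two
   coefficients differ, i.e. it is not a VL-dependent value.  */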
2474
2475 /* Likewise for rtx X. */
2476
2477 bool
2478 aarch64_sve_cnt_immediate_p (rtx x)
2479 {
2480 poly_int64 value;
2481 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2482 }
2483
2484 /* Return the asm string for an instruction with a CNT-like vector size
2485 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2486 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2487 first part of the operands template (the part that comes before the
2488 vector size itself). FACTOR is the number of quadwords.
2489 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2490 If it is zero, we can use any element size. */
2491
2492 static char *
2493 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2494 unsigned int factor,
2495 unsigned int nelts_per_vq)
2496 {
2497 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2498
2499 if (nelts_per_vq == 0)
2500 /* There is some overlap in the ranges of the four CNT instructions.
2501 Here we always use the smallest possible element size, so that the
2502 multiplier is 1 wherever possible. */
2503 nelts_per_vq = factor & -factor;
2504 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2505 gcc_assert (IN_RANGE (shift, 1, 4));
2506 char suffix = "dwhb"[shift - 1];
2507
2508 factor >>= shift;
2509 unsigned int written;
2510 if (factor == 1)
2511 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2512 prefix, suffix, operands);
2513 else
2514 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2515 prefix, suffix, operands, factor);
2516 gcc_assert (written < sizeof (buffer));
2517 return buffer;
2518 }
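/* An illustrative trace of the suffix/multiplier selection above,
   assuming PREFIX is "cnt" and OPERANDS is "%x0": FACTOR == 8 with
   NELTS_PER_VQ == 0 picks NELTS_PER_VQ = 8, shift = 3, suffix 'h' and
   a residual factor of 1, giving "cnth\t%x0".  FACTOR == 32 with
   NELTS_PER_VQ == 0 picks NELTS_PER_VQ = 32, which is clamped to a
   shift of 4 and suffix 'b', giving "cntb\t%x0, all, mul #2".  */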
2519
2520 /* Return the asm string for an instruction with a CNT-like vector size
2521 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2522 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2523 first part of the operands template (the part that comes before the
2524 vector size itself). X is the value of the vector size operand,
2525 as a polynomial integer rtx. */
2526
2527 char *
2528 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2529 rtx x)
2530 {
2531 poly_int64 value = rtx_to_poly_int64 (x);
2532 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2533 return aarch64_output_sve_cnt_immediate (prefix, operands,
2534 value.coeffs[1], 0);
2535 }
2536
2537 /* Return true if we can add VALUE to a register using a single ADDVL
2538 or ADDPL instruction. */
2539
2540 static bool
2541 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2542 {
2543 HOST_WIDE_INT factor = value.coeffs[0];
2544 if (factor == 0 || value.coeffs[1] != factor)
2545 return false;
2546 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2547 and a value of 16 is one vector width. */
2548 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2549 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2550 }
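/* Illustrative examples of the test above: poly_int64 (16, 16) is one
   full vector width and can be added with "addvl ..., #1", while
   poly_int64 (6, 6) is three predicate widths and can be added with
   "addpl ..., #3".  poly_int64 (66, 66) is rejected: 66 is not a
   multiple of 16, and although it is even it exceeds the ADDPL range
   of [-64, 62].  */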
2551
2552 /* Likewise for rtx X. */
2553
2554 bool
2555 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2556 {
2557 poly_int64 value;
2558 return (poly_int_rtx_p (x, &value)
2559 && aarch64_sve_addvl_addpl_immediate_p (value));
2560 }
2561
2562 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2563 and storing the result in operand 0. */
2564
2565 char *
2566 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2567 {
2568 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2569 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2570 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2571
2572 /* Use INC or DEC if possible. */
2573 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2574 {
2575 if (aarch64_sve_cnt_immediate_p (offset_value))
2576 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2577 offset_value.coeffs[1], 0);
2578 if (aarch64_sve_cnt_immediate_p (-offset_value))
2579 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2580 -offset_value.coeffs[1], 0);
2581 }
2582
2583 int factor = offset_value.coeffs[1];
2584 if ((factor & 15) == 0)
2585 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2586 else
2587 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2588 return buffer;
2589 }
2590
2591 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2592 instruction. If it is, store the number of elements in each vector
2593 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2594 factor in *FACTOR_OUT (if nonnull). */
2595
2596 bool
2597 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2598 unsigned int *nelts_per_vq_out)
2599 {
2600 rtx elt;
2601 poly_int64 value;
2602
2603 if (!const_vec_duplicate_p (x, &elt)
2604 || !poly_int_rtx_p (elt, &value))
2605 return false;
2606
2607 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2608 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2609 /* There's no vector INCB. */
2610 return false;
2611
2612 HOST_WIDE_INT factor = value.coeffs[0];
2613 if (value.coeffs[1] != factor)
2614 return false;
2615
2616 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2617 if ((factor % nelts_per_vq) != 0
2618 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2619 return false;
2620
2621 if (factor_out)
2622 *factor_out = factor;
2623 if (nelts_per_vq_out)
2624 *nelts_per_vq_out = nelts_per_vq;
2625 return true;
2626 }
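/* An illustrative case of the checks above: a VNx4SI constant whose
   elements are all poly_int64 (8, 8) is accepted, since 32-bit
   elements give NELTS_PER_VQ == 4 and 8 is a multiple of 4 with |8|
   in [4, 64]; it corresponds to "incw ..., all, mul #2".  A duplicate
   in a byte-element mode such as VNx16QI is always rejected, because
   that would need the non-existent vector INCB.  */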
2627
2628 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2629 instruction. */
2630
2631 bool
2632 aarch64_sve_inc_dec_immediate_p (rtx x)
2633 {
2634 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2635 }
2636
2637 /* Return the asm template for an SVE vector INC or DEC instruction.
2638 OPERANDS gives the operands before the vector count and X is the
2639 value of the vector count operand itself. */
2640
2641 char *
2642 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2643 {
2644 int factor;
2645 unsigned int nelts_per_vq;
2646 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2647 gcc_unreachable ();
2648 if (factor < 0)
2649 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2650 nelts_per_vq);
2651 else
2652 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2653 nelts_per_vq);
2654 }
2655
2656 static int
2657 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2658 scalar_int_mode mode)
2659 {
2660 int i;
2661 unsigned HOST_WIDE_INT val, val2, mask;
2662 int one_match, zero_match;
2663 int num_insns;
2664
2665 val = INTVAL (imm);
2666
2667 if (aarch64_move_imm (val, mode))
2668 {
2669 if (generate)
2670 emit_insn (gen_rtx_SET (dest, imm));
2671 return 1;
2672 }
2673
2674 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2675 (with XXXX non-zero). In that case check to see if the move can be done in
2676 a smaller mode. */
2677 val2 = val & 0xffffffff;
2678 if (mode == DImode
2679 && aarch64_move_imm (val2, SImode)
2680 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2681 {
2682 if (generate)
2683 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2684
2685 /* Check if we have to emit a second instruction by checking to see
2686 if any of the upper 32 bits of the original DI mode value is set. */
2687 if (val == val2)
2688 return 1;
2689
2690 i = (val >> 48) ? 48 : 32;
2691
2692 if (generate)
2693 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2694 GEN_INT ((val >> i) & 0xffff)));
2695
2696 return 2;
2697 }
2698
2699 if ((val >> 32) == 0 || mode == SImode)
2700 {
2701 if (generate)
2702 {
2703 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2704 if (mode == SImode)
2705 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2706 GEN_INT ((val >> 16) & 0xffff)));
2707 else
2708 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2709 GEN_INT ((val >> 16) & 0xffff)));
2710 }
2711 return 2;
2712 }
2713
2714 /* Remaining cases are all for DImode. */
2715
2716 mask = 0xffff;
2717 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2718 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2719 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2720 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2721
2722 if (zero_match != 2 && one_match != 2)
2723 {
2724 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2725 For a 64-bit bitmask try whether changing 16 bits to all ones or
2726 zeroes creates a valid bitmask. To check any repeated bitmask,
2727 try using 16 bits from the other 32-bit half of val. */
2728
2729 for (i = 0; i < 64; i += 16, mask <<= 16)
2730 {
2731 val2 = val & ~mask;
2732 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2733 break;
2734 val2 = val | mask;
2735 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2736 break;
2737 val2 = val2 & ~mask;
2738 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2739 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2740 break;
2741 }
2742 if (i != 64)
2743 {
2744 if (generate)
2745 {
2746 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2747 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2748 GEN_INT ((val >> i) & 0xffff)));
2749 }
2750 return 2;
2751 }
2752 }
2753
2754 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2755 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2756 otherwise skip zero bits. */
2757
2758 num_insns = 1;
2759 mask = 0xffff;
2760 val2 = one_match > zero_match ? ~val : val;
2761 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2762
2763 if (generate)
2764 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2765 ? (val | ~(mask << i))
2766 : (val & (mask << i)))));
2767 for (i += 16; i < 64; i += 16)
2768 {
2769 if ((val2 & (mask << i)) == 0)
2770 continue;
2771 if (generate)
2772 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2773 GEN_INT ((val >> i) & 0xffff)));
2774 num_insns ++;
2775 }
2776
2777 return num_insns;
2778 }
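/* A worked example of the logic above, for VAL = 0x1234567800000000 in
   DImode: the value is not a single MOV/MOVN/bitmask immediate, its
   two low 16-bit chunks are zero (so zero_match == 2 and the
   bitmask-plus-MOVK attempt is skipped), and the final loop therefore
   emits the equivalent of

     movz dest, #0x5678, lsl 32
     movk dest, #0x1234, lsl 48

   and returns 2.  */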
2779
2780 /* Return whether imm is a 128-bit immediate which is simple enough to
2781 expand inline. */
2782 bool
2783 aarch64_mov128_immediate (rtx imm)
2784 {
2785 if (GET_CODE (imm) == CONST_INT)
2786 return true;
2787
2788 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2789
2790 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2791 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2792
2793 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2794 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2795 }
2796
2797
2798 /* Return the number of temporary registers that aarch64_add_offset_1
2799 would need to add OFFSET to a register. */
2800
2801 static unsigned int
2802 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2803 {
2804 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2805 }
2806
2807 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2808 a non-polynomial OFFSET. MODE is the mode of the addition.
2809 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2810 be set and CFA adjustments added to the generated instructions.
2811
2812 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2813 temporary if register allocation is already complete. This temporary
2814 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2815 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2816 the immediate again.
2817
2818 Since this function may be used to adjust the stack pointer, we must
2819 ensure that it cannot cause transient stack deallocation (for example
2820 by first incrementing SP and then decrementing when adjusting by a
2821 large immediate). */
2822
2823 static void
2824 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2825 rtx src, HOST_WIDE_INT offset, rtx temp1,
2826 bool frame_related_p, bool emit_move_imm)
2827 {
2828 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2829 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2830
2831 HOST_WIDE_INT moffset = abs_hwi (offset);
2832 rtx_insn *insn;
2833
2834 if (!moffset)
2835 {
2836 if (!rtx_equal_p (dest, src))
2837 {
2838 insn = emit_insn (gen_rtx_SET (dest, src));
2839 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2840 }
2841 return;
2842 }
2843
2844 /* Single instruction adjustment. */
2845 if (aarch64_uimm12_shift (moffset))
2846 {
2847 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2848 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2849 return;
2850 }
2851
2852 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2853 and either:
2854
2855 a) the offset cannot be loaded by a 16-bit move or
2856 b) there is no spare register into which we can move it. */
2857 if (moffset < 0x1000000
2858 && ((!temp1 && !can_create_pseudo_p ())
2859 || !aarch64_move_imm (moffset, mode)))
2860 {
2861 HOST_WIDE_INT low_off = moffset & 0xfff;
2862
2863 low_off = offset < 0 ? -low_off : low_off;
2864 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2865 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2866 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2867 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2868 return;
2869 }
2870
2871 /* Emit a move immediate if required and an addition/subtraction. */
2872 if (emit_move_imm)
2873 {
2874 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2875 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2876 }
2877 insn = emit_insn (offset < 0
2878 ? gen_sub3_insn (dest, src, temp1)
2879 : gen_add3_insn (dest, src, temp1));
2880 if (frame_related_p)
2881 {
2882 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2883 rtx adj = plus_constant (mode, src, offset);
2884 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2885 }
2886 }
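/* Illustrative examples of the cases above: OFFSET = 0x123456 is not a
   (possibly shifted) 12-bit immediate and cannot be loaded with a
   single move-immediate, so it is split into two additions,
   "add dest, src, 0x456" followed by "add dest, dest, 0x123000".
   OFFSET = 0x2000000 is too large for that path, so it is moved into
   a temporary register first and added with a register-register
   addition.  */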
2887
2888 /* Return the number of temporary registers that aarch64_add_offset
2889 would need to move OFFSET into a register or add OFFSET to a register;
2890 ADD_P is true if we want the latter rather than the former. */
2891
2892 static unsigned int
2893 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2894 {
2895 /* This follows the same structure as aarch64_add_offset. */
2896 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2897 return 0;
2898
2899 unsigned int count = 0;
2900 HOST_WIDE_INT factor = offset.coeffs[1];
2901 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2902 poly_int64 poly_offset (factor, factor);
2903 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2904 /* Need one register for the ADDVL/ADDPL result. */
2905 count += 1;
2906 else if (factor != 0)
2907 {
2908 factor = abs (factor);
2909 if (factor > 16 * (factor & -factor))
2910 /* Need one register for the CNT result and one for the multiplication
2911 factor. If necessary, the second temporary can be reused for the
2912 constant part of the offset. */
2913 return 2;
2914 /* Need one register for the CNT result (which might then
2915 be shifted). */
2916 count += 1;
2917 }
2918 return count + aarch64_add_offset_1_temporaries (constant);
2919 }
2920
2921 /* If X can be represented as a poly_int64, return the number
2922 of temporaries that are required to add it to a register.
2923 Return -1 otherwise. */
2924
2925 int
2926 aarch64_add_offset_temporaries (rtx x)
2927 {
2928 poly_int64 offset;
2929 if (!poly_int_rtx_p (x, &offset))
2930 return -1;
2931 return aarch64_offset_temporaries (true, offset);
2932 }
2933
2934 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2935 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2936 be set and CFA adjustments added to the generated instructions.
2937
2938 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2939 temporary if register allocation is already complete. This temporary
2940 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2941 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2942 false to avoid emitting the immediate again.
2943
2944 TEMP2, if nonnull, is a second temporary register that doesn't
2945 overlap either DEST or SRC.
2946
2947 Since this function may be used to adjust the stack pointer, we must
2948 ensure that it cannot cause transient stack deallocation (for example
2949 by first incrementing SP and then decrementing when adjusting by a
2950 large immediate). */
2951
2952 static void
2953 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2954 poly_int64 offset, rtx temp1, rtx temp2,
2955 bool frame_related_p, bool emit_move_imm = true)
2956 {
2957 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2958 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2959 gcc_assert (temp1 == NULL_RTX
2960 || !frame_related_p
2961 || !reg_overlap_mentioned_p (temp1, dest));
2962 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2963
2964 /* Try using ADDVL or ADDPL to add the whole value. */
2965 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2966 {
2967 rtx offset_rtx = gen_int_mode (offset, mode);
2968 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2969 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2970 return;
2971 }
2972
2973 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2974 SVE vector register, over and above the minimum size of 128 bits.
2975 This is equivalent to half the value returned by CNTD with a
2976 vector shape of ALL. */
2977 HOST_WIDE_INT factor = offset.coeffs[1];
2978 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2979
2980 /* Try using ADDVL or ADDPL to add the VG-based part. */
2981 poly_int64 poly_offset (factor, factor);
2982 if (src != const0_rtx
2983 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2984 {
2985 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2986 if (frame_related_p)
2987 {
2988 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2989 RTX_FRAME_RELATED_P (insn) = true;
2990 src = dest;
2991 }
2992 else
2993 {
2994 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2995 src = aarch64_force_temporary (mode, temp1, addr);
2996 temp1 = temp2;
2997 temp2 = NULL_RTX;
2998 }
2999 }
3000 /* Otherwise use a CNT-based sequence. */
3001 else if (factor != 0)
3002 {
3003 /* Use a subtraction if we have a negative factor. */
3004 rtx_code code = PLUS;
3005 if (factor < 0)
3006 {
3007 factor = -factor;
3008 code = MINUS;
3009 }
3010
3011 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3012 into the multiplication. */
3013 rtx val;
3014 int shift = 0;
3015 if (factor & 1)
3016 /* Use a right shift by 1. */
3017 shift = -1;
3018 else
3019 factor /= 2;
3020 HOST_WIDE_INT low_bit = factor & -factor;
3021 if (factor <= 16 * low_bit)
3022 {
3023 if (factor > 16 * 8)
3024 {
3025 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3026 the value with the minimum multiplier and shift it into
3027 position. */
3028 int extra_shift = exact_log2 (low_bit);
3029 shift += extra_shift;
3030 factor >>= extra_shift;
3031 }
3032 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3033 }
3034 else
3035 {
3036 /* Use CNTD, then multiply it by FACTOR. */
3037 val = gen_int_mode (poly_int64 (2, 2), mode);
3038 val = aarch64_force_temporary (mode, temp1, val);
3039
3040 /* Go back to using a negative multiplication factor if we have
3041 no register from which to subtract. */
3042 if (code == MINUS && src == const0_rtx)
3043 {
3044 factor = -factor;
3045 code = PLUS;
3046 }
3047 rtx coeff1 = gen_int_mode (factor, mode);
3048 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3049 val = gen_rtx_MULT (mode, val, coeff1);
3050 }
3051
3052 if (shift > 0)
3053 {
3054 /* Multiply by 1 << SHIFT. */
3055 val = aarch64_force_temporary (mode, temp1, val);
3056 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3057 }
3058 else if (shift == -1)
3059 {
3060 /* Divide by 2. */
3061 val = aarch64_force_temporary (mode, temp1, val);
3062 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3063 }
3064
3065 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3066 if (src != const0_rtx)
3067 {
3068 val = aarch64_force_temporary (mode, temp1, val);
3069 val = gen_rtx_fmt_ee (code, mode, src, val);
3070 }
3071 else if (code == MINUS)
3072 {
3073 val = aarch64_force_temporary (mode, temp1, val);
3074 val = gen_rtx_NEG (mode, val);
3075 }
3076
3077 if (constant == 0 || frame_related_p)
3078 {
3079 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3080 if (frame_related_p)
3081 {
3082 RTX_FRAME_RELATED_P (insn) = true;
3083 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3084 gen_rtx_SET (dest, plus_constant (Pmode, src,
3085 poly_offset)));
3086 }
3087 src = dest;
3088 if (constant == 0)
3089 return;
3090 }
3091 else
3092 {
3093 src = aarch64_force_temporary (mode, temp1, val);
3094 temp1 = temp2;
3095 temp2 = NULL_RTX;
3096 }
3097
3098 emit_move_imm = true;
3099 }
3100
3101 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3102 frame_related_p, emit_move_imm);
3103 }
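/* An illustrative decomposition for the code above: for
   OFFSET = poly_int64 (18, 16), FACTOR is 16 and CONSTANT is 2, so the
   VG-based part (16, 16) -- one full vector width -- is added with
   "addvl dest, src, #1" and aarch64_add_offset_1 then adds the
   remaining constant with "add dest, dest, 2".  Factors outside the
   ADDVL/ADDPL range fall through to the CNT-based sequence, which
   materializes the VG-based part in a temporary register first.  */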
3104
3105 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3106 than a poly_int64. */
3107
3108 void
3109 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3110 rtx offset_rtx, rtx temp1, rtx temp2)
3111 {
3112 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3113 temp1, temp2, false);
3114 }
3115
3116 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3117 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3118 if TEMP1 already contains abs (DELTA). */
3119
3120 static inline void
3121 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3122 {
3123 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3124 temp1, temp2, true, emit_move_imm);
3125 }
3126
3127 /* Subtract DELTA from the stack pointer, marking the instructions
3128 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3129 if nonnull. */
3130
3131 static inline void
3132 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3133 bool emit_move_imm = true)
3134 {
3135 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3136 temp1, temp2, frame_related_p, emit_move_imm);
3137 }
3138
3139 /* Set DEST to (vec_series BASE STEP). */
3140
3141 static void
3142 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3143 {
3144 machine_mode mode = GET_MODE (dest);
3145 scalar_mode inner = GET_MODE_INNER (mode);
3146
3147 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3148 if (!aarch64_sve_index_immediate_p (base))
3149 base = force_reg (inner, base);
3150 if (!aarch64_sve_index_immediate_p (step))
3151 step = force_reg (inner, step);
3152
3153 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3154 }
3155
3156 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3157 integer of mode SRC_MODE. Return true on success. */
3158
3159 static bool
3160 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3161 rtx src)
3162 {
3163 /* If the constant is smaller than 128 bits, we can do the move
3164 using a vector of SRC_MODEs. */
3165 if (src_mode != TImode)
3166 {
3167 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3168 GET_MODE_SIZE (src_mode));
3169 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3170 emit_move_insn (gen_lowpart (dup_mode, dest),
3171 gen_const_vec_duplicate (dup_mode, src));
3172 return true;
3173 }
3174
3175 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3176 src = force_const_mem (src_mode, src);
3177 if (!src)
3178 return false;
3179
3180 /* Make sure that the address is legitimate. */
3181 if (!aarch64_sve_ld1r_operand_p (src))
3182 {
3183 rtx addr = force_reg (Pmode, XEXP (src, 0));
3184 src = replace_equiv_address (src, addr);
3185 }
3186
3187 machine_mode mode = GET_MODE (dest);
3188 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3189 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3190 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3191 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3192 emit_insn (gen_rtx_SET (dest, src));
3193 return true;
3194 }
3195
3196 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3197 isn't a simple duplicate or series. */
3198
3199 static void
3200 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3201 {
3202 machine_mode mode = GET_MODE (src);
3203 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3204 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3205 gcc_assert (npatterns > 1);
3206
3207 if (nelts_per_pattern == 1)
3208 {
3209 /* The constant is a repeating sequence of at least two elements,
3210 where the repeating elements occupy no more than 128 bits.
3211 Get an integer representation of the replicated value. */
3212 scalar_int_mode int_mode;
3213 if (BYTES_BIG_ENDIAN)
3214 /* For now, always use LD1RQ to load the value on big-endian
3215 targets, since the handling of smaller integers includes a
3216 subreg that is semantically an element reverse. */
3217 int_mode = TImode;
3218 else
3219 {
3220 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3221 gcc_assert (int_bits <= 128);
3222 int_mode = int_mode_for_size (int_bits, 0).require ();
3223 }
3224 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3225 if (int_value
3226 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3227 return;
3228 }
3229
3230 /* Expand each pattern individually. */
3231 rtx_vector_builder builder;
3232 auto_vec<rtx, 16> vectors (npatterns);
3233 for (unsigned int i = 0; i < npatterns; ++i)
3234 {
3235 builder.new_vector (mode, 1, nelts_per_pattern);
3236 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3237 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3238 vectors.quick_push (force_reg (mode, builder.build ()));
3239 }
3240
3241 /* Use permutes to interleave the separate vectors. */
3242 while (npatterns > 1)
3243 {
3244 npatterns /= 2;
3245 for (unsigned int i = 0; i < npatterns; ++i)
3246 {
3247 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3248 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3249 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3250 vectors[i] = tmp;
3251 }
3252 }
3253 gcc_assert (vectors[0] == dest);
3254 }
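/* An illustrative example of the interleaving above: a VNx4SI constant
   with NPATTERNS == 2 and NELTS_PER_PATTERN == 3 such as
   { 0, 100, 1, 101, 2, 102, ... } is split into the two single-pattern
   vectors { 0, 1, 2, ... } and { 100, 101, 102, ... }, each of which
   is built separately, and a single ZIP1 then interleaves them into
   DEST in the original element order.  */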
3255
3256 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3257 is a pattern that can be used to set DEST to a replicated scalar
3258 element. */
3259
3260 void
3261 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3262 rtx (*gen_vec_duplicate) (rtx, rtx))
3263 {
3264 machine_mode mode = GET_MODE (dest);
3265
3266 /* Check on what type of symbol it is. */
3267 scalar_int_mode int_mode;
3268 if ((GET_CODE (imm) == SYMBOL_REF
3269 || GET_CODE (imm) == LABEL_REF
3270 || GET_CODE (imm) == CONST
3271 || GET_CODE (imm) == CONST_POLY_INT)
3272 && is_a <scalar_int_mode> (mode, &int_mode))
3273 {
3274 rtx mem;
3275 poly_int64 offset;
3276 HOST_WIDE_INT const_offset;
3277 enum aarch64_symbol_type sty;
3278
3279 /* If we have (const (plus symbol offset)), separate out the offset
3280 before we start classifying the symbol. */
3281 rtx base = strip_offset (imm, &offset);
3282
3283 /* We must always add an offset involving VL separately, rather than
3284 folding it into the relocation. */
3285 if (!offset.is_constant (&const_offset))
3286 {
3287 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3288 emit_insn (gen_rtx_SET (dest, imm));
3289 else
3290 {
3291 /* Do arithmetic on 32-bit values if the result is smaller
3292 than that. */
3293 if (partial_subreg_p (int_mode, SImode))
3294 {
3295 /* It is invalid to do symbol calculations in modes
3296 narrower than SImode. */
3297 gcc_assert (base == const0_rtx);
3298 dest = gen_lowpart (SImode, dest);
3299 int_mode = SImode;
3300 }
3301 if (base != const0_rtx)
3302 {
3303 base = aarch64_force_temporary (int_mode, dest, base);
3304 aarch64_add_offset (int_mode, dest, base, offset,
3305 NULL_RTX, NULL_RTX, false);
3306 }
3307 else
3308 aarch64_add_offset (int_mode, dest, base, offset,
3309 dest, NULL_RTX, false);
3310 }
3311 return;
3312 }
3313
3314 sty = aarch64_classify_symbol (base, const_offset);
3315 switch (sty)
3316 {
3317 case SYMBOL_FORCE_TO_MEM:
3318 if (const_offset != 0
3319 && targetm.cannot_force_const_mem (int_mode, imm))
3320 {
3321 gcc_assert (can_create_pseudo_p ());
3322 base = aarch64_force_temporary (int_mode, dest, base);
3323 aarch64_add_offset (int_mode, dest, base, const_offset,
3324 NULL_RTX, NULL_RTX, false);
3325 return;
3326 }
3327
3328 mem = force_const_mem (ptr_mode, imm);
3329 gcc_assert (mem);
3330
3331 /* If we aren't generating PC relative literals, then
3332 we need to expand the literal pool access carefully.
3333 This is something that needs to be done in a number
3334 of places, so could well live as a separate function. */
3335 if (!aarch64_pcrelative_literal_loads)
3336 {
3337 gcc_assert (can_create_pseudo_p ());
3338 base = gen_reg_rtx (ptr_mode);
3339 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3340 if (ptr_mode != Pmode)
3341 base = convert_memory_address (Pmode, base);
3342 mem = gen_rtx_MEM (ptr_mode, base);
3343 }
3344
3345 if (int_mode != ptr_mode)
3346 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3347
3348 emit_insn (gen_rtx_SET (dest, mem));
3349
3350 return;
3351
3352 case SYMBOL_SMALL_TLSGD:
3353 case SYMBOL_SMALL_TLSDESC:
3354 case SYMBOL_SMALL_TLSIE:
3355 case SYMBOL_SMALL_GOT_28K:
3356 case SYMBOL_SMALL_GOT_4G:
3357 case SYMBOL_TINY_GOT:
3358 case SYMBOL_TINY_TLSIE:
3359 if (const_offset != 0)
3360 {
3361 gcc_assert(can_create_pseudo_p ());
3362 base = aarch64_force_temporary (int_mode, dest, base);
3363 aarch64_add_offset (int_mode, dest, base, const_offset,
3364 NULL_RTX, NULL_RTX, false);
3365 return;
3366 }
3367 /* FALLTHRU */
3368
3369 case SYMBOL_SMALL_ABSOLUTE:
3370 case SYMBOL_TINY_ABSOLUTE:
3371 case SYMBOL_TLSLE12:
3372 case SYMBOL_TLSLE24:
3373 case SYMBOL_TLSLE32:
3374 case SYMBOL_TLSLE48:
3375 aarch64_load_symref_appropriately (dest, imm, sty);
3376 return;
3377
3378 default:
3379 gcc_unreachable ();
3380 }
3381 }
3382
3383 if (!CONST_INT_P (imm))
3384 {
3385 rtx base, step, value;
3386 if (GET_CODE (imm) == HIGH
3387 || aarch64_simd_valid_immediate (imm, NULL))
3388 emit_insn (gen_rtx_SET (dest, imm));
3389 else if (const_vec_series_p (imm, &base, &step))
3390 aarch64_expand_vec_series (dest, base, step);
3391 else if (const_vec_duplicate_p (imm, &value))
3392 {
3393 /* If the constant is out of range of an SVE vector move,
3394 load it from memory if we can, otherwise move it into
3395 a register and use a DUP. */
3396 scalar_mode inner_mode = GET_MODE_INNER (mode);
3397 rtx op = force_const_mem (inner_mode, value);
3398 if (!op)
3399 op = force_reg (inner_mode, value);
3400 else if (!aarch64_sve_ld1r_operand_p (op))
3401 {
3402 rtx addr = force_reg (Pmode, XEXP (op, 0));
3403 op = replace_equiv_address (op, addr);
3404 }
3405 emit_insn (gen_vec_duplicate (dest, op));
3406 }
3407 else if (GET_CODE (imm) == CONST_VECTOR
3408 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3409 aarch64_expand_sve_const_vector (dest, imm);
3410 else
3411 {
3412 rtx mem = force_const_mem (mode, imm);
3413 gcc_assert (mem);
3414 emit_move_insn (dest, mem);
3415 }
3416
3417 return;
3418 }
3419
3420 aarch64_internal_mov_immediate (dest, imm, true,
3421 as_a <scalar_int_mode> (mode));
3422 }
3423
3424 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3425 that is known to contain PTRUE. */
3426
3427 void
3428 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3429 {
3430 expand_operand ops[3];
3431 machine_mode mode = GET_MODE (dest);
3432 create_output_operand (&ops[0], dest, mode);
3433 create_input_operand (&ops[1], pred, GET_MODE(pred));
3434 create_input_operand (&ops[2], src, mode);
3435 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3436 }
3437
3438 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3439 operand is in memory. In this case we need to use the predicated LD1
3440 and ST1 instead of LDR and STR, both for correctness on big-endian
3441 targets and because LD1 and ST1 support a wider range of addressing modes.
3442 PRED_MODE is the mode of the predicate.
3443
3444 See the comment at the head of aarch64-sve.md for details about the
3445 big-endian handling. */
3446
3447 void
3448 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3449 {
3450 machine_mode mode = GET_MODE (dest);
3451 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3452 if (!register_operand (src, mode)
3453 && !register_operand (dest, mode))
3454 {
3455 rtx tmp = gen_reg_rtx (mode);
3456 if (MEM_P (src))
3457 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3458 else
3459 emit_move_insn (tmp, src);
3460 src = tmp;
3461 }
3462 aarch64_emit_sve_pred_move (dest, ptrue, src);
3463 }
3464
3465 /* Called only on big-endian targets. See whether an SVE vector move
3466 from SRC to DEST is effectively a REV[BHW] instruction, because at
3467 least one operand is a subreg of an SVE vector that has wider or
3468 narrower elements. Return true and emit the instruction if so.
3469
3470 For example:
3471
3472 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3473
3474 represents a VIEW_CONVERT between the following vectors, viewed
3475 in memory order:
3476
3477 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3478 R1: { [0], [1], [2], [3], ... }
3479
3480 The high part of lane X in R2 should therefore correspond to lane X*2
3481 of R1, but the register representations are:
3482
3483 msb lsb
3484 R2: ...... [1].high [1].low [0].high [0].low
3485 R1: ...... [3] [2] [1] [0]
3486
3487 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3488 We therefore need a reverse operation to swap the high and low values
3489 around.
3490
3491 This is purely an optimization. Without it we would spill the
3492 subreg operand to the stack in one mode and reload it in the
3493 other mode, which has the same effect as the REV. */
3494
3495 bool
3496 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3497 {
3498 gcc_assert (BYTES_BIG_ENDIAN);
3499 if (GET_CODE (dest) == SUBREG)
3500 dest = SUBREG_REG (dest);
3501 if (GET_CODE (src) == SUBREG)
3502 src = SUBREG_REG (src);
3503
3504 /* The optimization handles two single SVE REGs with different element
3505 sizes. */
3506 if (!REG_P (dest)
3507 || !REG_P (src)
3508 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3509 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3510 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3511 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3512 return false;
3513
3514 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3515 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3516 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3517 UNSPEC_REV_SUBREG);
3518 emit_insn (gen_rtx_SET (dest, unspec));
3519 return true;
3520 }
3521
3522 /* Return a copy of X with mode MODE, without changing its other
3523 attributes. Unlike gen_lowpart, this doesn't care whether the
3524 mode change is valid. */
3525
3526 static rtx
3527 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3528 {
3529 if (GET_MODE (x) == mode)
3530 return x;
3531
3532 x = shallow_copy_rtx (x);
3533 set_mode_and_regno (x, mode, REGNO (x));
3534 return x;
3535 }
3536
3537 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3538 operands. */
3539
3540 void
3541 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3542 {
3543 /* Decide which REV operation we need. The mode with narrower elements
3544 determines the mode of the operands and the mode with the wider
3545 elements determines the reverse width. */
3546 machine_mode mode_with_wider_elts = GET_MODE (dest);
3547 machine_mode mode_with_narrower_elts = GET_MODE (src);
3548 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3549 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3550 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3551
3552 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3553 unsigned int unspec;
3554 if (wider_bytes == 8)
3555 unspec = UNSPEC_REV64;
3556 else if (wider_bytes == 4)
3557 unspec = UNSPEC_REV32;
3558 else if (wider_bytes == 2)
3559 unspec = UNSPEC_REV16;
3560 else
3561 gcc_unreachable ();
3562 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3563
3564 /* Emit:
3565
3566 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3567 UNSPEC_MERGE_PTRUE))
3568
3569 with the appropriate modes. */
3570 ptrue = gen_lowpart (pred_mode, ptrue);
3571 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3572 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3573 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3574 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3575 UNSPEC_MERGE_PTRUE);
3576 emit_insn (gen_rtx_SET (dest, src));
3577 }
3578
3579 static bool
3580 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3581 tree exp ATTRIBUTE_UNUSED)
3582 {
3583 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3584 return false;
3585
3586 return true;
3587 }
3588
3589 /* Implement TARGET_PASS_BY_REFERENCE. */
3590
3591 static bool
3592 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3593 machine_mode mode,
3594 const_tree type,
3595 bool named ATTRIBUTE_UNUSED)
3596 {
3597 HOST_WIDE_INT size;
3598 machine_mode dummymode;
3599 int nregs;
3600
3601 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3602 if (mode == BLKmode && type)
3603 size = int_size_in_bytes (type);
3604 else
3605 /* No frontends can create types with variable-sized modes, so we
3606 shouldn't be asked to pass or return them. */
3607 size = GET_MODE_SIZE (mode).to_constant ();
3608
3609 /* Aggregates are passed by reference based on their size. */
3610 if (type && AGGREGATE_TYPE_P (type))
3611 {
3612 size = int_size_in_bytes (type);
3613 }
3614
3615 /* Variable-sized arguments are always passed by reference. */
3616 if (size < 0)
3617 return true;
3618
3619 /* Can this be a candidate to be passed in fp/simd register(s)? */
3620 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3621 &dummymode, &nregs,
3622 NULL))
3623 return false;
3624
3625 /* Arguments which are variable sized or larger than 2 registers are
3626 passed by reference unless they are a homogeneous floating-point
3627 aggregate. */
3628 return size > 2 * UNITS_PER_WORD;
3629 }
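/* Illustrative examples of the rules above (a sketch, not a restatement of
   the AAPCS64 itself): a struct of three uint64_t members (24 bytes) is
   larger than two GP registers and is passed by reference, whereas a struct
   of three doubles, although also 24 bytes, is a homogeneous floating-point
   aggregate and so is not passed by reference but remains a candidate for
   the SIMD/FP registers.  */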
3630
3631 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3632 static bool
3633 aarch64_return_in_msb (const_tree valtype)
3634 {
3635 machine_mode dummy_mode;
3636 int dummy_int;
3637
3638 /* Never happens in little-endian mode. */
3639 if (!BYTES_BIG_ENDIAN)
3640 return false;
3641
3642 /* Only composite types no larger than 16 bytes can potentially
3643 be returned in registers. */
3644 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3645 || int_size_in_bytes (valtype) <= 0
3646 || int_size_in_bytes (valtype) > 16)
3647 return false;
3648
3649 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3650 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3651 is always passed/returned in the least significant bits of fp/simd
3652 register(s). */
3653 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3654 &dummy_mode, &dummy_int, NULL))
3655 return false;
3656
3657 return true;
3658 }
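/* For illustration (big-endian only, since the function above always
   returns false otherwise): a 6-byte struct of three shorts is a composite
   of at most 16 bytes and not an HFA/HVA, so the hook returns true and the
   value is placed at the most significant end of its return register; a
   struct of two floats is an HFA and stays in the least significant bits of
   the SIMD/FP registers.  */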
3659
3660 /* Implement TARGET_FUNCTION_VALUE.
3661 Define how to find the value returned by a function. */
3662
3663 static rtx
3664 aarch64_function_value (const_tree type, const_tree func,
3665 bool outgoing ATTRIBUTE_UNUSED)
3666 {
3667 machine_mode mode;
3668 int unsignedp;
3669 int count;
3670 machine_mode ag_mode;
3671
3672 mode = TYPE_MODE (type);
3673 if (INTEGRAL_TYPE_P (type))
3674 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3675
3676 if (aarch64_return_in_msb (type))
3677 {
3678 HOST_WIDE_INT size = int_size_in_bytes (type);
3679
3680 if (size % UNITS_PER_WORD != 0)
3681 {
3682 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3683 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3684 }
3685 }
3686
3687 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3688 &ag_mode, &count, NULL))
3689 {
3690 if (!aarch64_composite_type_p (type, mode))
3691 {
3692 gcc_assert (count == 1 && mode == ag_mode);
3693 return gen_rtx_REG (mode, V0_REGNUM);
3694 }
3695 else
3696 {
3697 int i;
3698 rtx par;
3699
3700 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3701 for (i = 0; i < count; i++)
3702 {
3703 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3704 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3705 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3706 XVECEXP (par, 0, i) = tmp;
3707 }
3708 return par;
3709 }
3710 }
3711 else
3712 return gen_rtx_REG (mode, R0_REGNUM);
3713 }
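/* A rough example of the PARALLEL case above: for a return type such as
   struct { double x, y; } (an HFA with two members), COUNT is 2 and AG_MODE
   is DFmode, so the result is a PARALLEL containing (reg:DF V0) at offset 0
   and (reg:DF V1) at offset 8.  A non-composite scalar double would instead
   take the simple gen_rtx_REG (DFmode, V0_REGNUM) path.  */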
3714
3715 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
3716 Return true if REGNO is the number of a hard register in which the values
3717 of a called function may come back. */
3718
3719 static bool
3720 aarch64_function_value_regno_p (const unsigned int regno)
3721 {
3722 /* A maximum of 16 bytes can be returned in the general registers. Examples
3723 of 16-byte return values are: 128-bit integers and 16-byte small
3724 structures (excluding homogeneous floating-point aggregates). */
3725 if (regno == R0_REGNUM || regno == R1_REGNUM)
3726 return true;
3727
3728 /* Up to four fp/simd registers can return a function value, e.g. a
3729 homogeneous floating-point aggregate having four members. */
3730 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3731 return TARGET_FLOAT;
3732
3733 return false;
3734 }
3735
3736 /* Implement TARGET_RETURN_IN_MEMORY.
3737
3738 If the type T of the result of a function is such that
3739 void func (T arg)
3740 would require that arg be passed as a value in a register (or set of
3741 registers) according to the parameter passing rules, then the result
3742 is returned in the same registers as would be used for such an
3743 argument. */
3744
3745 static bool
3746 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3747 {
3748 HOST_WIDE_INT size;
3749 machine_mode ag_mode;
3750 int count;
3751
3752 if (!AGGREGATE_TYPE_P (type)
3753 && TREE_CODE (type) != COMPLEX_TYPE
3754 && TREE_CODE (type) != VECTOR_TYPE)
3755 /* Simple scalar types are always returned in registers. */
3756 return false;
3757
3758 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3759 type,
3760 &ag_mode,
3761 &count,
3762 NULL))
3763 return false;
3764
3765 /* Types larger than 2 registers are returned in memory. */
3766 size = int_size_in_bytes (type);
3767 return (size < 0 || size > 2 * UNITS_PER_WORD);
3768 }
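/* Examples (illustrative only): a struct of four floats (16 bytes) is an
   HFA/HVA candidate and is returned in registers, while a struct of five
   ints (20 bytes) is larger than two GP registers and is returned in
   memory.  */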
3769
3770 static bool
3771 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3772 const_tree type, int *nregs)
3773 {
3774 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3775 return aarch64_vfp_is_call_or_return_candidate (mode,
3776 type,
3777 &pcum->aapcs_vfp_rmode,
3778 nregs,
3779 NULL);
3780 }
3781
3782 /* Given MODE and TYPE of a function argument, return the alignment in
3783 bits. The idea is to suppress any stronger alignment requested by
3784 the user and opt for the natural alignment (specified in AAPCS64 \S
3785 4.1). ABI_BREAK is set to true if the alignment was incorrectly
3786 calculated in versions of GCC prior to GCC-9. This is a helper
3787 function for local use only. */
3788
3789 static unsigned int
3790 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
3791 bool *abi_break)
3792 {
3793 *abi_break = false;
3794 if (!type)
3795 return GET_MODE_ALIGNMENT (mode);
3796
3797 if (integer_zerop (TYPE_SIZE (type)))
3798 return 0;
3799
3800 gcc_assert (TYPE_MODE (type) == mode);
3801
3802 if (!AGGREGATE_TYPE_P (type))
3803 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3804
3805 if (TREE_CODE (type) == ARRAY_TYPE)
3806 return TYPE_ALIGN (TREE_TYPE (type));
3807
3808 unsigned int alignment = 0;
3809 unsigned int bitfield_alignment = 0;
3810 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3811 if (TREE_CODE (field) == FIELD_DECL)
3812 {
3813 alignment = std::max (alignment, DECL_ALIGN (field));
3814 if (DECL_BIT_FIELD_TYPE (field))
3815 bitfield_alignment
3816 = std::max (bitfield_alignment,
3817 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
3818 }
3819
3820 if (bitfield_alignment > alignment)
3821 {
3822 *abi_break = true;
3823 return bitfield_alignment;
3824 }
3825
3826 return alignment;
3827 }
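/* As a worked illustration of the ABI_BREAK logic above: if every
   FIELD_DECL in a struct has, say, 8-bit alignment but one of them is a
   bit-field whose declared type has 64-bit alignment, then ALIGNMENT is 8
   and BITFIELD_ALIGNMENT is 64; the function sets *ABI_BREAK and returns
   64, and callers use *ABI_BREAK to warn that GCC versions before 9.1
   computed this case differently.  */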
3828
3829 /* Lay out a function argument according to the AAPCS64 rules. The rule
3830 numbers refer to the rule numbers in the AAPCS64. */
3831
3832 static void
3833 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3834 const_tree type,
3835 bool named ATTRIBUTE_UNUSED)
3836 {
3837 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3838 int ncrn, nvrn, nregs;
3839 bool allocate_ncrn, allocate_nvrn;
3840 HOST_WIDE_INT size;
3841 bool abi_break;
3842
3843 /* We need to do this once per argument. */
3844 if (pcum->aapcs_arg_processed)
3845 return;
3846
3847 pcum->aapcs_arg_processed = true;
3848
3849 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3850 if (type)
3851 size = int_size_in_bytes (type);
3852 else
3853 /* No frontends can create types with variable-sized modes, so we
3854 shouldn't be asked to pass or return them. */
3855 size = GET_MODE_SIZE (mode).to_constant ();
3856 size = ROUND_UP (size, UNITS_PER_WORD);
3857
3858 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3859 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3860 mode,
3861 type,
3862 &nregs);
3863
3864 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3865 The following code thus handles passing by SIMD/FP registers first. */
3866
3867 nvrn = pcum->aapcs_nvrn;
3868
3869 /* C.1 - C.5 for floating point, homogeneous floating-point aggregates (HFA)
3870 and homogeneous short-vector aggregates (HVA). */
3871 if (allocate_nvrn)
3872 {
3873 if (!TARGET_FLOAT)
3874 aarch64_err_no_fpadvsimd (mode);
3875
3876 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3877 {
3878 pcum->aapcs_nextnvrn = nvrn + nregs;
3879 if (!aarch64_composite_type_p (type, mode))
3880 {
3881 gcc_assert (nregs == 1);
3882 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3883 }
3884 else
3885 {
3886 rtx par;
3887 int i;
3888 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3889 for (i = 0; i < nregs; i++)
3890 {
3891 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3892 V0_REGNUM + nvrn + i);
3893 rtx offset = gen_int_mode
3894 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3895 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3896 XVECEXP (par, 0, i) = tmp;
3897 }
3898 pcum->aapcs_reg = par;
3899 }
3900 return;
3901 }
3902 else
3903 {
3904 /* C.3 NSRN is set to 8. */
3905 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3906 goto on_stack;
3907 }
3908 }
3909
3910 ncrn = pcum->aapcs_ncrn;
3911 nregs = size / UNITS_PER_WORD;
3912
3913 /* C.6 - C.9, though the sign and zero extension semantics are
3914 handled elsewhere. This is the case where the argument fits
3915 entirely in general registers. */
3916 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3917 {
3918 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3919
3920 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
3921 rounded up to the next even number. */
3922 if (nregs == 2
3923 && ncrn % 2
3924 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3925 comparison is there because for > 16 * BITS_PER_UNIT
3926 alignment nregs should be > 2 and therefore it should be
3927 passed by reference rather than value. */
3928 && (aarch64_function_arg_alignment (mode, type, &abi_break)
3929 == 16 * BITS_PER_UNIT))
3930 {
3931 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
3932 inform (input_location, "parameter passing for argument of type "
3933 "%qT changed in GCC 9.1", type);
3934 ++ncrn;
3935 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3936 }
3937
3938 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3939 A reg is still generated for it, but the caller should be smart
3940 enough not to use it. */
3941 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3942 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3943 else
3944 {
3945 rtx par;
3946 int i;
3947
3948 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3949 for (i = 0; i < nregs; i++)
3950 {
3951 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3952 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3953 GEN_INT (i * UNITS_PER_WORD));
3954 XVECEXP (par, 0, i) = tmp;
3955 }
3956 pcum->aapcs_reg = par;
3957 }
3958
3959 pcum->aapcs_nextncrn = ncrn + nregs;
3960 return;
3961 }
3962
3963 /* C.11 */
3964 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3965
3966 /* The argument is passed on the stack; record the needed number of words for
3967 this argument and align the total size if necessary. */
3968 on_stack:
3969 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3970
3971 if (aarch64_function_arg_alignment (mode, type, &abi_break)
3972 == 16 * BITS_PER_UNIT)
3973 {
3974 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
3975 if (pcum->aapcs_stack_size != new_size)
3976 {
3977 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
3978 inform (input_location, "parameter passing for argument of type "
3979 "%qT changed in GCC 9.1", type);
3980 pcum->aapcs_stack_size = new_size;
3981 }
3982 }
3983 return;
3984 }
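/* A small worked example of rule C.8 above (illustrative numbers only):
   suppose one 8-byte integer argument has already been assigned, so NCRN is
   1, and the next argument is a 16-byte type with 16-byte alignment.  NCRN
   is odd, so the ++ncrn above skips X1 and the argument is passed in X2/X3;
   X1 is simply left unused.  */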
3985
3986 /* Implement TARGET_FUNCTION_ARG. */
3987
3988 static rtx
3989 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3990 const_tree type, bool named)
3991 {
3992 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3993 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3994
3995 if (mode == VOIDmode)
3996 return NULL_RTX;
3997
3998 aarch64_layout_arg (pcum_v, mode, type, named);
3999 return pcum->aapcs_reg;
4000 }
4001
4002 void
4003 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4004 const_tree fntype ATTRIBUTE_UNUSED,
4005 rtx libname ATTRIBUTE_UNUSED,
4006 const_tree fndecl ATTRIBUTE_UNUSED,
4007 unsigned n_named ATTRIBUTE_UNUSED)
4008 {
4009 pcum->aapcs_ncrn = 0;
4010 pcum->aapcs_nvrn = 0;
4011 pcum->aapcs_nextncrn = 0;
4012 pcum->aapcs_nextnvrn = 0;
4013 pcum->pcs_variant = ARM_PCS_AAPCS64;
4014 pcum->aapcs_reg = NULL_RTX;
4015 pcum->aapcs_arg_processed = false;
4016 pcum->aapcs_stack_words = 0;
4017 pcum->aapcs_stack_size = 0;
4018
4019 if (!TARGET_FLOAT
4020 && fndecl && TREE_PUBLIC (fndecl)
4021 && fntype && fntype != error_mark_node)
4022 {
4023 const_tree type = TREE_TYPE (fntype);
4024 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4025 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4026 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4027 &mode, &nregs, NULL))
4028 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4029 }
4030 return;
4031 }
4032
4033 static void
4034 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4035 machine_mode mode,
4036 const_tree type,
4037 bool named)
4038 {
4039 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4040 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4041 {
4042 aarch64_layout_arg (pcum_v, mode, type, named);
4043 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4044 != (pcum->aapcs_stack_words != 0));
4045 pcum->aapcs_arg_processed = false;
4046 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4047 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4048 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4049 pcum->aapcs_stack_words = 0;
4050 pcum->aapcs_reg = NULL_RTX;
4051 }
4052 }
4053
4054 bool
4055 aarch64_function_arg_regno_p (unsigned regno)
4056 {
4057 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4058 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4059 }
4060
4061 /* Implement TARGET_FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4062 PARM_BOUNDARY bits of alignment, but will be given anything up
4063 to STACK_BOUNDARY bits if the type requires it. This makes sure
4064 that both before and after the layout of each argument, the Next
4065 Stacked Argument Address (NSAA) will have a minimum alignment of
4066 8 bytes. */
4067
4068 static unsigned int
4069 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4070 {
4071 bool abi_break;
4072 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4073 &abi_break);
4074 if (abi_break && warn_psabi)
4075 inform (input_location, "parameter passing for argument of type "
4076 "%qT changed in GCC 9.1", type);
4077
4078 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4079 }
4080
4081 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4082
4083 static fixed_size_mode
4084 aarch64_get_reg_raw_mode (int regno)
4085 {
4086 if (TARGET_SVE && FP_REGNUM_P (regno))
4087 /* Don't use the SVE part of the register for __builtin_apply and
4088 __builtin_return. The SVE registers aren't used by the normal PCS,
4089 so using them there would be a waste of time. The PCS extensions
4090 for SVE types are fundamentally incompatible with the
4091 __builtin_return/__builtin_apply interface. */
4092 return as_a <fixed_size_mode> (V16QImode);
4093 return default_get_reg_raw_mode (regno);
4094 }
4095
4096 /* Implement TARGET_FUNCTION_ARG_PADDING.
4097
4098 Small aggregate types are placed in the lowest memory address.
4099
4100 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4101
4102 static pad_direction
4103 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4104 {
4105 /* On little-endian targets, the least significant byte of every stack
4106 argument is passed at the lowest byte address of the stack slot. */
4107 if (!BYTES_BIG_ENDIAN)
4108 return PAD_UPWARD;
4109
4110 /* Otherwise, integral, floating-point and pointer types are padded downward:
4111 the least significant byte of a stack argument is passed at the highest
4112 byte address of the stack slot. */
4113 if (type
4114 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4115 || POINTER_TYPE_P (type))
4116 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4117 return PAD_DOWNWARD;
4118
4119 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4120 return PAD_UPWARD;
4121 }
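/* For example (big-endian only, since little-endian always pads upward): a
   char argument passed on the stack is integral and therefore padded
   downward, i.e. the byte lives at the highest address of its slot, whereas
   a 3-byte struct is an aggregate and is padded upward, with its data at
   the lowest addresses of the slot.  */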
4122
4123 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4124
4125 It specifies padding for the last (which may also be the only)
4126 element of a block move between registers and memory. Assuming
4127 the block is in memory, padding upward means that the last
4128 element is padded after its most significant byte, while with
4129 downward padding the last element is padded at its least
4130 significant byte side.
4131
4132 Small aggregates and small complex types are always padded
4133 upwards.
4134
4135 We don't need to worry about homogeneous floating-point or
4136 short-vector aggregates; their move is not affected by the
4137 padding direction determined here. Regardless of endianness,
4138 each element of such an aggregate is put in the least
4139 significant bits of a fp/simd register.
4140
4141 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4142 register has useful data, and return the opposite if the most
4143 significant byte does. */
4144
4145 bool
4146 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4147 bool first ATTRIBUTE_UNUSED)
4148 {
4149
4150 /* Small composite types are always padded upward. */
4151 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4152 {
4153 HOST_WIDE_INT size;
4154 if (type)
4155 size = int_size_in_bytes (type);
4156 else
4157 /* No frontends can create types with variable-sized modes, so we
4158 shouldn't be asked to pass or return them. */
4159 size = GET_MODE_SIZE (mode).to_constant ();
4160 if (size < 2 * UNITS_PER_WORD)
4161 return true;
4162 }
4163
4164 /* Otherwise, use the default padding. */
4165 return !BYTES_BIG_ENDIAN;
4166 }
4167
4168 static scalar_int_mode
4169 aarch64_libgcc_cmp_return_mode (void)
4170 {
4171 return SImode;
4172 }
4173
4174 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4175
4176 /* We use the 12-bit shifted immediate arithmetic instructions so values
4177 must be a multiple of (1 << 12), i.e. 4096. */
4178 #define ARITH_FACTOR 4096
4179
4180 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4181 #error Cannot use simple address calculation for stack probing
4182 #endif
4183
4184 /* The pair of scratch registers used for stack probing. */
4185 #define PROBE_STACK_FIRST_REG R9_REGNUM
4186 #define PROBE_STACK_SECOND_REG R10_REGNUM
4187
4188 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4189 inclusive. These are offsets from the current stack pointer. */
4190
4191 static void
4192 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4193 {
4194 HOST_WIDE_INT size;
4195 if (!poly_size.is_constant (&size))
4196 {
4197 sorry ("stack probes for SVE frames");
4198 return;
4199 }
4200
4201 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4202
4203 /* See the same assertion on PROBE_INTERVAL above. */
4204 gcc_assert ((first % ARITH_FACTOR) == 0);
4205
4206 /* See if we have a constant small number of probes to generate. If so,
4207 that's the easy case. */
4208 if (size <= PROBE_INTERVAL)
4209 {
4210 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4211
4212 emit_set_insn (reg1,
4213 plus_constant (Pmode,
4214 stack_pointer_rtx, -(first + base)));
4215 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4216 }
4217
4218 /* The run-time loop is made up of 8 insns in the generic case while the
4219 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
4220 else if (size <= 4 * PROBE_INTERVAL)
4221 {
4222 HOST_WIDE_INT i, rem;
4223
4224 emit_set_insn (reg1,
4225 plus_constant (Pmode,
4226 stack_pointer_rtx,
4227 -(first + PROBE_INTERVAL)));
4228 emit_stack_probe (reg1);
4229
4230 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4231 it exceeds SIZE. If only two probes are needed, this will not
4232 generate any code. Then probe at FIRST + SIZE. */
4233 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4234 {
4235 emit_set_insn (reg1,
4236 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4237 emit_stack_probe (reg1);
4238 }
4239
4240 rem = size - (i - PROBE_INTERVAL);
4241 if (rem > 256)
4242 {
4243 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4244
4245 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4246 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4247 }
4248 else
4249 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4250 }
4251
4252 /* Otherwise, do the same as above, but in a loop. Note that we must be
4253 extra careful with variables wrapping around because we might be at
4254 the very top (or the very bottom) of the address space and we have
4255 to be able to handle this case properly; in particular, we use an
4256 equality test for the loop condition. */
4257 else
4258 {
4259 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4260
4261 /* Step 1: round SIZE to the previous multiple of the interval. */
4262
4263 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4264
4265
4266 /* Step 2: compute initial and final value of the loop counter. */
4267
4268 /* TEST_ADDR = SP + FIRST. */
4269 emit_set_insn (reg1,
4270 plus_constant (Pmode, stack_pointer_rtx, -first));
4271
4272 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4273 HOST_WIDE_INT adjustment = - (first + rounded_size);
4274 if (! aarch64_uimm12_shift (adjustment))
4275 {
4276 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4277 true, Pmode);
4278 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4279 }
4280 else
4281 emit_set_insn (reg2,
4282 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4283
4284 /* Step 3: the loop
4285
4286 do
4287 {
4288 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4289 probe at TEST_ADDR
4290 }
4291 while (TEST_ADDR != LAST_ADDR)
4292
4293 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4294 until it is equal to ROUNDED_SIZE. */
4295
4296 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4297
4298
4299 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4300 that SIZE is equal to ROUNDED_SIZE. */
4301
4302 if (size != rounded_size)
4303 {
4304 HOST_WIDE_INT rem = size - rounded_size;
4305
4306 if (rem > 256)
4307 {
4308 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4309
4310 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4311 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4312 }
4313 else
4314 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4315 }
4316 }
4317
4318 /* Make sure nothing is scheduled before we are done. */
4319 emit_insn (gen_blockage ());
4320 }
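/* A numeric sketch of the simplest case above: assuming PROBE_INTERVAL is
   4096, with FIRST = 4096 and a constant SIZE of 2000 we have
   BASE = ROUND_UP (2000, 4096) = 4096, so REG1 is set to SP - 8192 and the
   single probe lands at REG1 + 2096, i.e. exactly SP - (FIRST + SIZE).  */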
4321
4322 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4323 absolute addresses. */
4324
4325 const char *
4326 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4327 {
4328 static int labelno = 0;
4329 char loop_lab[32];
4330 rtx xops[2];
4331
4332 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4333
4334 /* Loop. */
4335 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4336
4337 HOST_WIDE_INT stack_clash_probe_interval
4338 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4339
4340 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4341 xops[0] = reg1;
4342 HOST_WIDE_INT interval;
4343 if (flag_stack_clash_protection)
4344 interval = stack_clash_probe_interval;
4345 else
4346 interval = PROBE_INTERVAL;
4347
4348 gcc_assert (aarch64_uimm12_shift (interval));
4349 xops[1] = GEN_INT (interval);
4350
4351 output_asm_insn ("sub\t%0, %0, %1", xops);
4352
4353 /* If doing stack clash protection then we probe up by the ABI-specified
4354 amount. We do this because we're dropping full pages at a time in the
4355 loop. But if we're doing non-stack-clash probing, probe at offset 0 from SP. */
4356 if (flag_stack_clash_protection)
4357 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4358 else
4359 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4360
4361 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4362 by this amount for each iteration. */
4363 output_asm_insn ("str\txzr, [%0, %1]", xops);
4364
4365 /* Test if TEST_ADDR == LAST_ADDR. */
4366 xops[1] = reg2;
4367 output_asm_insn ("cmp\t%0, %1", xops);
4368
4369 /* Branch. */
4370 fputs ("\tb.ne\t", asm_out_file);
4371 assemble_name_raw (asm_out_file, loop_lab);
4372 fputc ('\n', asm_out_file);
4373
4374 return "";
4375 }
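/* The loop above comes out roughly as follows (illustrative: assuming the
   scratch registers are x9/x10, no stack clash protection and a 4096-byte
   PROBE_INTERVAL):

   .LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0  */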
4376
4377 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4378 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4379 of GUARD_SIZE. When a probe is emitted it is done at most
4380 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4381 at most MIN_PROBE_THRESHOLD. By the end of this function
4382 BASE = BASE - ADJUSTMENT. */
4383
4384 const char *
4385 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4386 rtx min_probe_threshold, rtx guard_size)
4387 {
4388 /* This function is not allowed to use any instruction generation function
4389 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4390 so instead emit the code you want using output_asm_insn. */
4391 gcc_assert (flag_stack_clash_protection);
4392 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4393 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4394
4395 /* The minimum required allocation before the residual requires probing. */
4396 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4397
4398 /* Clamp the value down to the nearest value that can be used with a cmp. */
4399 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4400 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4401
4402 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4403 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4404
4405 static int labelno = 0;
4406 char loop_start_lab[32];
4407 char loop_end_lab[32];
4408 rtx xops[2];
4409
4410 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4411 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4412
4413 /* Emit loop start label. */
4414 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4415
4416 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4417 xops[0] = adjustment;
4418 xops[1] = probe_offset_value_rtx;
4419 output_asm_insn ("cmp\t%0, %1", xops);
4420
4421 /* Branch to end if not enough adjustment to probe. */
4422 fputs ("\tb.lt\t", asm_out_file);
4423 assemble_name_raw (asm_out_file, loop_end_lab);
4424 fputc ('\n', asm_out_file);
4425
4426 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4427 xops[0] = base;
4428 xops[1] = probe_offset_value_rtx;
4429 output_asm_insn ("sub\t%0, %0, %1", xops);
4430
4431 /* Probe at BASE. */
4432 xops[1] = const0_rtx;
4433 output_asm_insn ("str\txzr, [%0, %1]", xops);
4434
4435 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4436 xops[0] = adjustment;
4437 xops[1] = probe_offset_value_rtx;
4438 output_asm_insn ("sub\t%0, %0, %1", xops);
4439
4440 /* Branch to start if still more bytes to allocate. */
4441 fputs ("\tb\t", asm_out_file);
4442 assemble_name_raw (asm_out_file, loop_start_lab);
4443 fputc ('\n', asm_out_file);
4444
4445 /* No more probing is needed; emit the loop end label. */
4446 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4447
4448 /* BASE = BASE - ADJUSTMENT. */
4449 xops[0] = base;
4450 xops[1] = adjustment;
4451 output_asm_insn ("sub\t%0, %0, %1", xops);
4452 return "";
4453 }
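/* Sketch of the code emitted above (illustrative register numbers, and
   assuming the clamped RESIDUAL_PROBE_GUARD is 4096):

   .SVLPSPL0:
	cmp	x11, 4096		// enough left to need a probe?
	b.lt	.SVLPEND0
	sub	x10, x10, 4096		// BASE -= RESIDUAL_PROBE_GUARD
	str	xzr, [x10, 0]		// probe at the new BASE
	sub	x11, x11, 4096		// ADJUSTMENT -= RESIDUAL_PROBE_GUARD
	b	.SVLPSPL0
   .SVLPEND0:
	sub	x10, x10, x11		// BASE -= remaining ADJUSTMENT  */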
4454
4455 /* Determine whether a frame chain needs to be generated. */
4456 static bool
4457 aarch64_needs_frame_chain (void)
4458 {
4459 /* Force a frame chain for EH returns so the return address is at FP+8. */
4460 if (frame_pointer_needed || crtl->calls_eh_return)
4461 return true;
4462
4463 /* A leaf function cannot have calls or write LR. */
4464 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4465
4466 /* Don't use a frame chain in leaf functions if leaf frame pointers
4467 are disabled. */
4468 if (flag_omit_leaf_frame_pointer && is_leaf)
4469 return false;
4470
4471 return aarch64_use_frame_pointer;
4472 }
4473
4474 /* Mark the registers that need to be saved by the callee and calculate
4475 the size of the callee-saved registers area and frame record (both FP
4476 and LR may be omitted). */
4477 static void
4478 aarch64_layout_frame (void)
4479 {
4480 HOST_WIDE_INT offset = 0;
4481 int regno, last_fp_reg = INVALID_REGNUM;
4482 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4483
4484 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4485
4486 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4487 the mid-end is doing. */
4488 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4489
4490 #define SLOT_NOT_REQUIRED (-2)
4491 #define SLOT_REQUIRED (-1)
4492
4493 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4494 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4495
4496 /* If this is a non-leaf simd function with calls, we assume that
4497 at least one of those calls is to a non-simd function and thus
4498 we must save V8 to V23 in the prologue. */
4499
4500 if (simd_function && !crtl->is_leaf)
4501 {
4502 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4503 if (FP_SIMD_SAVED_REGNUM_P (regno))
4504 df_set_regs_ever_live (regno, true);
4505 }
4506
4507 /* First mark all the registers that really need to be saved... */
4508 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4509 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4510
4511 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4512 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4513
4514 /* ... that includes the eh data registers (if needed)... */
4515 if (crtl->calls_eh_return)
4516 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4517 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4518 = SLOT_REQUIRED;
4519
4520 /* ... and any callee saved register that dataflow says is live. */
4521 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4522 if (df_regs_ever_live_p (regno)
4523 && (regno == R30_REGNUM
4524 || !call_used_regs[regno]))
4525 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4526
4527 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4528 if (df_regs_ever_live_p (regno)
4529 && (!call_used_regs[regno]
4530 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4531 {
4532 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4533 last_fp_reg = regno;
4534 }
4535
4536 if (cfun->machine->frame.emit_frame_chain)
4537 {
4538 /* FP and LR are placed in the linkage record. */
4539 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4540 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4541 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4542 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4543 offset = 2 * UNITS_PER_WORD;
4544 }
4545
4546 /* With stack-clash, LR must be saved in non-leaf functions. */
4547 gcc_assert (crtl->is_leaf
4548 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4549 != SLOT_NOT_REQUIRED));
4550
4551 /* Now assign stack slots for them. */
4552 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4553 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4554 {
4555 cfun->machine->frame.reg_offset[regno] = offset;
4556 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4557 cfun->machine->frame.wb_candidate1 = regno;
4558 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4559 cfun->machine->frame.wb_candidate2 = regno;
4560 offset += UNITS_PER_WORD;
4561 }
4562
4563 HOST_WIDE_INT max_int_offset = offset;
4564 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4565 bool has_align_gap = offset != max_int_offset;
4566
4567 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4568 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4569 {
4570 /* If there is an alignment gap between integer and fp callee-saves,
4571 allocate the last fp register to it if possible. */
4572 if (regno == last_fp_reg
4573 && has_align_gap
4574 && !simd_function
4575 && (offset & 8) == 0)
4576 {
4577 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4578 break;
4579 }
4580
4581 cfun->machine->frame.reg_offset[regno] = offset;
4582 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4583 cfun->machine->frame.wb_candidate1 = regno;
4584 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4585 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4586 cfun->machine->frame.wb_candidate2 = regno;
4587 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4588 }
4589
4590 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4591
4592 cfun->machine->frame.saved_regs_size = offset;
4593
4594 HOST_WIDE_INT varargs_and_saved_regs_size
4595 = offset + cfun->machine->frame.saved_varargs_size;
4596
4597 cfun->machine->frame.hard_fp_offset
4598 = aligned_upper_bound (varargs_and_saved_regs_size
4599 + get_frame_size (),
4600 STACK_BOUNDARY / BITS_PER_UNIT);
4601
4602 /* Both these values are already aligned. */
4603 gcc_assert (multiple_p (crtl->outgoing_args_size,
4604 STACK_BOUNDARY / BITS_PER_UNIT));
4605 cfun->machine->frame.frame_size
4606 = (cfun->machine->frame.hard_fp_offset
4607 + crtl->outgoing_args_size);
4608
4609 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4610
4611 cfun->machine->frame.initial_adjust = 0;
4612 cfun->machine->frame.final_adjust = 0;
4613 cfun->machine->frame.callee_adjust = 0;
4614 cfun->machine->frame.callee_offset = 0;
4615
4616 HOST_WIDE_INT max_push_offset = 0;
4617 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4618 max_push_offset = 512;
4619 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4620 max_push_offset = 256;
4621
4622 HOST_WIDE_INT const_size, const_fp_offset;
4623 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4624 && const_size < max_push_offset
4625 && known_eq (crtl->outgoing_args_size, 0))
4626 {
4627 /* Simple, small frame with no outgoing arguments:
4628 stp reg1, reg2, [sp, -frame_size]!
4629 stp reg3, reg4, [sp, 16] */
4630 cfun->machine->frame.callee_adjust = const_size;
4631 }
4632 else if (known_lt (crtl->outgoing_args_size
4633 + cfun->machine->frame.saved_regs_size, 512)
4634 && !(cfun->calls_alloca
4635 && known_lt (cfun->machine->frame.hard_fp_offset,
4636 max_push_offset)))
4637 {
4638 /* Frame with small outgoing arguments:
4639 sub sp, sp, frame_size
4640 stp reg1, reg2, [sp, outgoing_args_size]
4641 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4642 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4643 cfun->machine->frame.callee_offset
4644 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4645 }
4646 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4647 && const_fp_offset < max_push_offset)
4648 {
4649 /* Frame with large outgoing arguments but a small local area:
4650 stp reg1, reg2, [sp, -hard_fp_offset]!
4651 stp reg3, reg4, [sp, 16]
4652 sub sp, sp, outgoing_args_size */
4653 cfun->machine->frame.callee_adjust = const_fp_offset;
4654 cfun->machine->frame.final_adjust
4655 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4656 }
4657 else
4658 {
4659 /* Frame with large local area and outgoing arguments using frame pointer:
4660 sub sp, sp, hard_fp_offset
4661 stp x29, x30, [sp, 0]
4662 add x29, sp, 0
4663 stp reg3, reg4, [sp, 16]
4664 sub sp, sp, outgoing_args_size */
4665 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4666 cfun->machine->frame.final_adjust
4667 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4668 }
4669
4670 cfun->machine->frame.laid_out = true;
4671 }
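/* A rough worked example of the layout above (not derived from any specific
   test case): a non-SVE function that needs a frame chain, also saves x19,
   has 32 bytes of locals and no outgoing arguments gets reg_offset values of
   0, 8 and 16 for x29, x30 and x19, saved_regs_size = 32 after rounding,
   hard_fp_offset = 64 and frame_size = 64.  Since 64 < 512 and there are no
   outgoing arguments, the first case applies and callee_adjust = 64, i.e.
   the prologue can start with stp x29, x30, [sp, -64]!.  */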
4672
4673 /* Return true if the register REGNO is saved on entry to
4674 the current function. */
4675
4676 static bool
4677 aarch64_register_saved_on_entry (int regno)
4678 {
4679 return cfun->machine->frame.reg_offset[regno] >= 0;
4680 }
4681
4682 /* Return the next register at or above REGNO, up to LIMIT, that the
4683 callee needs to save. */
4684
4685 static unsigned
4686 aarch64_next_callee_save (unsigned regno, unsigned limit)
4687 {
4688 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4689 regno ++;
4690 return regno;
4691 }
4692
4693 /* Push the register number REGNO of mode MODE to the stack with write-back
4694 adjusting the stack by ADJUSTMENT. */
4695
4696 static void
4697 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4698 HOST_WIDE_INT adjustment)
4699 {
4700 rtx base_rtx = stack_pointer_rtx;
4701 rtx insn, reg, mem;
4702
4703 reg = gen_rtx_REG (mode, regno);
4704 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4705 plus_constant (Pmode, base_rtx, -adjustment));
4706 mem = gen_frame_mem (mode, mem);
4707
4708 insn = emit_move_insn (mem, reg);
4709 RTX_FRAME_RELATED_P (insn) = 1;
4710 }
4711
4712 /* Generate and return an instruction to store the pair of registers
4713 REG and REG2 of mode MODE to location BASE with write-back adjusting
4714 the stack location BASE by ADJUSTMENT. */
4715
4716 static rtx
4717 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4718 HOST_WIDE_INT adjustment)
4719 {
4720 switch (mode)
4721 {
4722 case E_DImode:
4723 return gen_storewb_pairdi_di (base, base, reg, reg2,
4724 GEN_INT (-adjustment),
4725 GEN_INT (UNITS_PER_WORD - adjustment));
4726 case E_DFmode:
4727 return gen_storewb_pairdf_di (base, base, reg, reg2,
4728 GEN_INT (-adjustment),
4729 GEN_INT (UNITS_PER_WORD - adjustment));
4730 case E_TFmode:
4731 return gen_storewb_pairtf_di (base, base, reg, reg2,
4732 GEN_INT (-adjustment),
4733 GEN_INT (UNITS_PER_VREG - adjustment));
4734 default:
4735 gcc_unreachable ();
4736 }
4737 }
4738
4739 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4740 stack pointer by ADJUSTMENT. */
4741
4742 static void
4743 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4744 {
4745 rtx_insn *insn;
4746 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4747
4748 if (regno2 == INVALID_REGNUM)
4749 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4750
4751 rtx reg1 = gen_rtx_REG (mode, regno1);
4752 rtx reg2 = gen_rtx_REG (mode, regno2);
4753
4754 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4755 reg2, adjustment));
4756 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4757 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4758 RTX_FRAME_RELATED_P (insn) = 1;
4759 }
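/* For instance, pushing the frame record with REGNO1 = x29, REGNO2 = x30 and
   ADJUSTMENT = 16 emits the familiar "stp x29, x30, [sp, -16]!", with the
   insn and both register stores marked frame-related (a sketch; the exact
   insn comes from the storewb_pair patterns above).  */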
4760
4761 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4762 adjusting it by ADJUSTMENT afterwards. */
4763
4764 static rtx
4765 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4766 HOST_WIDE_INT adjustment)
4767 {
4768 switch (mode)
4769 {
4770 case E_DImode:
4771 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4772 GEN_INT (UNITS_PER_WORD));
4773 case E_DFmode:
4774 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4775 GEN_INT (UNITS_PER_WORD));
4776 case E_TFmode:
4777 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4778 GEN_INT (UNITS_PER_VREG));
4779 default:
4780 gcc_unreachable ();
4781 }
4782 }
4783
4784 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4785 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4786 into CFI_OPS. */
4787
4788 static void
4789 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4790 rtx *cfi_ops)
4791 {
4792 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4793 rtx reg1 = gen_rtx_REG (mode, regno1);
4794
4795 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4796
4797 if (regno2 == INVALID_REGNUM)
4798 {
4799 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4800 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4801 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4802 }
4803 else
4804 {
4805 rtx reg2 = gen_rtx_REG (mode, regno2);
4806 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4807 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4808 reg2, adjustment));
4809 }
4810 }
4811
4812 /* Generate and return a store pair instruction of mode MODE to store
4813 register REG1 to MEM1 and register REG2 to MEM2. */
4814
4815 static rtx
4816 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4817 rtx reg2)
4818 {
4819 switch (mode)
4820 {
4821 case E_DImode:
4822 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4823
4824 case E_DFmode:
4825 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4826
4827 case E_TFmode:
4828 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4829
4830 default:
4831 gcc_unreachable ();
4832 }
4833 }
4834
4835 /* Generate and return a load pair instruction of mode MODE to load register
4836 REG1 from MEM1 and register REG2 from MEM2. */
4837
4838 static rtx
4839 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4840 rtx mem2)
4841 {
4842 switch (mode)
4843 {
4844 case E_DImode:
4845 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4846
4847 case E_DFmode:
4848 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4849
4850 case E_TFmode:
4851 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4852
4853 default:
4854 gcc_unreachable ();
4855 }
4856 }
4857
4858 /* Return TRUE if return address signing should be enabled for the current
4859 function, otherwise return FALSE. */
4860
4861 bool
4862 aarch64_return_address_signing_enabled (void)
4863 {
4864 /* This function should only be called after the frame is laid out. */
4865 gcc_assert (cfun->machine->frame.laid_out);
4866
4867 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
4868 function if its LR is pushed onto the stack. */
4869 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4870 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4871 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4872 }
4873
4874 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4875 bool
4876 aarch64_bti_enabled (void)
4877 {
4878 return (aarch64_enable_bti == 1);
4879 }
4880
4881 /* Emit code to save the callee-saved registers from register number START
4882 to LIMIT to the stack at the location starting at offset START_OFFSET,
4883 skipping any write-back candidates if SKIP_WB is true. */
4884
4885 static void
4886 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4887 unsigned start, unsigned limit, bool skip_wb)
4888 {
4889 rtx_insn *insn;
4890 unsigned regno;
4891 unsigned regno2;
4892
4893 for (regno = aarch64_next_callee_save (start, limit);
4894 regno <= limit;
4895 regno = aarch64_next_callee_save (regno + 1, limit))
4896 {
4897 rtx reg, mem;
4898 poly_int64 offset;
4899 int offset_diff;
4900
4901 if (skip_wb
4902 && (regno == cfun->machine->frame.wb_candidate1
4903 || regno == cfun->machine->frame.wb_candidate2))
4904 continue;
4905
4906 if (cfun->machine->reg_is_wrapped_separately[regno])
4907 continue;
4908
4909 reg = gen_rtx_REG (mode, regno);
4910 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4911 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4912 offset));
4913
4914 regno2 = aarch64_next_callee_save (regno + 1, limit);
4915 offset_diff = cfun->machine->frame.reg_offset[regno2]
4916 - cfun->machine->frame.reg_offset[regno];
4917
4918 if (regno2 <= limit
4919 && !cfun->machine->reg_is_wrapped_separately[regno2]
4920 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4921 {
4922 rtx reg2 = gen_rtx_REG (mode, regno2);
4923 rtx mem2;
4924
4925 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4926 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4927 offset));
4928 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4929 reg2));
4930
4931 /* The first part of a frame-related parallel insn is
4932 always assumed to be relevant to the frame
4933 calculations; subsequent parts are only
4934 frame-related if explicitly marked. */
4935 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4936 regno = regno2;
4937 }
4938 else
4939 insn = emit_move_insn (mem, reg);
4940
4941 RTX_FRAME_RELATED_P (insn) = 1;
4942 }
4943 }
4944
4945 /* Emit code to restore the callee registers of mode MODE from register
4946 number START up to and including LIMIT. Restore from the stack offset
4947 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4948 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4949
4950 static void
4951 aarch64_restore_callee_saves (machine_mode mode,
4952 poly_int64 start_offset, unsigned start,
4953 unsigned limit, bool skip_wb, rtx *cfi_ops)
4954 {
4955 rtx base_rtx = stack_pointer_rtx;
4956 unsigned regno;
4957 unsigned regno2;
4958 poly_int64 offset;
4959
4960 for (regno = aarch64_next_callee_save (start, limit);
4961 regno <= limit;
4962 regno = aarch64_next_callee_save (regno + 1, limit))
4963 {
4964 if (cfun->machine->reg_is_wrapped_separately[regno])
4965 continue;
4966
4967 rtx reg, mem;
4968 int offset_diff;
4969
4970 if (skip_wb
4971 && (regno == cfun->machine->frame.wb_candidate1
4972 || regno == cfun->machine->frame.wb_candidate2))
4973 continue;
4974
4975 reg = gen_rtx_REG (mode, regno);
4976 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4977 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4978
4979 regno2 = aarch64_next_callee_save (regno + 1, limit);
4980 offset_diff = cfun->machine->frame.reg_offset[regno2]
4981 - cfun->machine->frame.reg_offset[regno];
4982
4983 if (regno2 <= limit
4984 && !cfun->machine->reg_is_wrapped_separately[regno2]
4985 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4986 {
4987 rtx reg2 = gen_rtx_REG (mode, regno2);
4988 rtx mem2;
4989
4990 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4991 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4992 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4993
4994 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4995 regno = regno2;
4996 }
4997 else
4998 emit_move_insn (reg, mem);
4999 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5000 }
5001 }
5002
5003 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5004 of MODE. */
5005
5006 static inline bool
5007 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5008 {
5009 HOST_WIDE_INT multiple;
5010 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5011 && IN_RANGE (multiple, -8, 7));
5012 }
5013
5014 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5015 of MODE. */
5016
5017 static inline bool
5018 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5019 {
5020 HOST_WIDE_INT multiple;
5021 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5022 && IN_RANGE (multiple, 0, 63));
5023 }
5024
5025 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5026 of MODE. */
5027
5028 bool
5029 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5030 {
5031 HOST_WIDE_INT multiple;
5032 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5033 && IN_RANGE (multiple, -64, 63));
5034 }
5035
5036 /* Return true if OFFSET is a signed 9-bit value. */
5037
5038 bool
5039 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5040 poly_int64 offset)
5041 {
5042 HOST_WIDE_INT const_offset;
5043 return (offset.is_constant (&const_offset)
5044 && IN_RANGE (const_offset, -256, 255));
5045 }
5046
5047 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5048 of MODE. */
5049
5050 static inline bool
5051 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5052 {
5053 HOST_WIDE_INT multiple;
5054 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5055 && IN_RANGE (multiple, -256, 255));
5056 }
5057
5058 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5059 of MODE. */
5060
5061 static inline bool
5062 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5063 {
5064 HOST_WIDE_INT multiple;
5065 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5066 && IN_RANGE (multiple, 0, 4095));
5067 }
5068
5069 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5070
5071 static sbitmap
5072 aarch64_get_separate_components (void)
5073 {
5074 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5075 bitmap_clear (components);
5076
5077 /* The registers we need saved to the frame. */
5078 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5079 if (aarch64_register_saved_on_entry (regno))
5080 {
5081 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5082 if (!frame_pointer_needed)
5083 offset += cfun->machine->frame.frame_size
5084 - cfun->machine->frame.hard_fp_offset;
5085 /* Check that we can access the stack slot of the register with one
5086 direct load with no adjustments needed. */
5087 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5088 bitmap_set_bit (components, regno);
5089 }
5090
5091 /* Don't mess with the hard frame pointer. */
5092 if (frame_pointer_needed)
5093 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5094
5095 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5096 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5097 /* If registers have been chosen to be stored/restored with
5098 writeback, don't interfere with them to avoid having to output explicit
5099 stack adjustment instructions. */
5100 if (reg2 != INVALID_REGNUM)
5101 bitmap_clear_bit (components, reg2);
5102 if (reg1 != INVALID_REGNUM)
5103 bitmap_clear_bit (components, reg1);
5104
5105 bitmap_clear_bit (components, LR_REGNUM);
5106 bitmap_clear_bit (components, SP_REGNUM);
5107
5108 return components;
5109 }
5110
5111 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5112
5113 static sbitmap
5114 aarch64_components_for_bb (basic_block bb)
5115 {
5116 bitmap in = DF_LIVE_IN (bb);
5117 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5118 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5119 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5120
5121 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5122 bitmap_clear (components);
5123
5124 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5125 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5126 if ((!call_used_regs[regno]
5127 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5128 && (bitmap_bit_p (in, regno)
5129 || bitmap_bit_p (gen, regno)
5130 || bitmap_bit_p (kill, regno)))
5131 {
5132 unsigned regno2, offset, offset2;
5133 bitmap_set_bit (components, regno);
5134
5135 /* If there is a callee-save at an adjacent offset, add it too
5136 to increase the use of LDP/STP. */
5137 offset = cfun->machine->frame.reg_offset[regno];
5138 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5139
5140 if (regno2 <= LAST_SAVED_REGNUM)
5141 {
5142 offset2 = cfun->machine->frame.reg_offset[regno2];
5143 if ((offset & ~8) == (offset2 & ~8))
5144 bitmap_set_bit (components, regno2);
5145 }
5146 }
5147
5148 return components;
5149 }
5150
5151 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5152 Nothing to do for aarch64. */
5153
5154 static void
5155 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5156 {
5157 }
5158
5159 /* Return the next set bit in BMP from START onwards. Return the total number
5160 of bits in BMP if no set bit is found at or after START. */
5161
5162 static unsigned int
5163 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5164 {
5165 unsigned int nbits = SBITMAP_SIZE (bmp);
5166 if (start == nbits)
5167 return start;
5168
5169 gcc_assert (start < nbits);
5170 for (unsigned int i = start; i < nbits; i++)
5171 if (bitmap_bit_p (bmp, i))
5172 return i;
5173
5174 return nbits;
5175 }
5176
5177 /* Do the work for aarch64_emit_prologue_components and
5178 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5179 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5180 for these components or the epilogue sequence. That is, it determines
5181 whether we should emit stores or loads and what kind of CFA notes to attach
5182 to the insns. Otherwise the logic for the two sequences is very
5183 similar. */
5184
5185 static void
5186 aarch64_process_components (sbitmap components, bool prologue_p)
5187 {
5188 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5189 ? HARD_FRAME_POINTER_REGNUM
5190 : STACK_POINTER_REGNUM);
5191
5192 unsigned last_regno = SBITMAP_SIZE (components);
5193 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5194 rtx_insn *insn = NULL;
5195
5196 while (regno != last_regno)
5197 {
5198 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved,
5199 so DFmode for the vector registers is enough. For simd functions
5200 we want to save the low 128 bits. */
5201 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5202
5203 rtx reg = gen_rtx_REG (mode, regno);
5204 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5205 if (!frame_pointer_needed)
5206 offset += cfun->machine->frame.frame_size
5207 - cfun->machine->frame.hard_fp_offset;
5208 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5209 rtx mem = gen_frame_mem (mode, addr);
5210
5211 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5212 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5213 /* No more registers to handle after REGNO.
5214 Emit a single save/restore and exit. */
5215 if (regno2 == last_regno)
5216 {
5217 insn = emit_insn (set);
5218 RTX_FRAME_RELATED_P (insn) = 1;
5219 if (prologue_p)
5220 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5221 else
5222 add_reg_note (insn, REG_CFA_RESTORE, reg);
5223 break;
5224 }
5225
5226 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5227 /* The next register is not of the same class or its offset is not
5228 mergeable with the current one into a pair. */
5229 if (!satisfies_constraint_Ump (mem)
5230 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5231 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5232 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5233 GET_MODE_SIZE (mode)))
5234 {
5235 insn = emit_insn (set);
5236 RTX_FRAME_RELATED_P (insn) = 1;
5237 if (prologue_p)
5238 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5239 else
5240 add_reg_note (insn, REG_CFA_RESTORE, reg);
5241
5242 regno = regno2;
5243 continue;
5244 }
5245
5246 /* REGNO2 can be saved/restored in a pair with REGNO. */
5247 rtx reg2 = gen_rtx_REG (mode, regno2);
5248 if (!frame_pointer_needed)
5249 offset2 += cfun->machine->frame.frame_size
5250 - cfun->machine->frame.hard_fp_offset;
5251 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5252 rtx mem2 = gen_frame_mem (mode, addr2);
5253 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5254 : gen_rtx_SET (reg2, mem2);
5255
5256 if (prologue_p)
5257 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5258 else
5259 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5260
5261 RTX_FRAME_RELATED_P (insn) = 1;
5262 if (prologue_p)
5263 {
5264 add_reg_note (insn, REG_CFA_OFFSET, set);
5265 add_reg_note (insn, REG_CFA_OFFSET, set2);
5266 }
5267 else
5268 {
5269 add_reg_note (insn, REG_CFA_RESTORE, reg);
5270 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5271 }
5272
5273 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5274 }
5275 }
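
/* As a concrete illustration (the offsets here are made up): if the
   separately shrink-wrapped components are x22 and x23 and their save
   slots are adjacent, the prologue path above collapses them into one
   store pair,

     stp x22, x23, [sp, 32]

   with a REG_CFA_OFFSET note per register, while the epilogue path emits
   the matching

     ldp x22, x23, [sp, 32]

   with REG_CFA_RESTORE notes instead.  */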
5276
5277 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5278
5279 static void
5280 aarch64_emit_prologue_components (sbitmap components)
5281 {
5282 aarch64_process_components (components, true);
5283 }
5284
5285 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5286
5287 static void
5288 aarch64_emit_epilogue_components (sbitmap components)
5289 {
5290 aarch64_process_components (components, false);
5291 }
5292
5293 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5294
5295 static void
5296 aarch64_set_handled_components (sbitmap components)
5297 {
5298 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5299 if (bitmap_bit_p (components, regno))
5300 cfun->machine->reg_is_wrapped_separately[regno] = true;
5301 }
5302
5303 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
5304 determine the probe offset for alloca. */
5305
5306 static HOST_WIDE_INT
5307 aarch64_stack_clash_protection_alloca_probe_range (void)
5308 {
5309 return STACK_CLASH_CALLER_GUARD;
5310 }
5311
5312
5313 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5314 registers. If POLY_SIZE is not large enough to require a probe this function
5315 will only adjust the stack. When allocating the stack space
5316 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5317 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5318 arguments. If we are, we ensure that any allocation larger than the ABI
5319 defined buffer needs a probe so that the invariant of having a 1KB buffer is
5320 maintained.
5321
5322 We emit barriers after each stack adjustment to prevent optimizations from
5323 breaking the invariant that we never drop the stack more than a page. This
5324 invariant is needed to make it easier to correctly handle asynchronous
5325 events: if we were to allow the stack to be dropped by more than a page and
5326 then issue multiple probes to catch up, a signal taken somewhere in between
5327 would leave the signal handler not knowing the state of the stack, unable to
5328 make any assumption about which pages have been probed. */
5329
5330 static void
5331 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5332 poly_int64 poly_size,
5333 bool frame_related_p,
5334 bool final_adjustment_p)
5335 {
5336 HOST_WIDE_INT guard_size
5337 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5338 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5339 /* When doing the final adjustment for the outgoing argument size we can't
5340 assume that LR was saved at position 0. So subtract its offset from the
5341 ABI safe buffer so that we don't accidentally allow an adjustment that
5342 would result in an allocation larger than the ABI buffer without
5343 probing. */
5344 HOST_WIDE_INT min_probe_threshold
5345 = final_adjustment_p
5346 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5347 : guard_size - guard_used_by_caller;
5348
5349 poly_int64 frame_size = cfun->machine->frame.frame_size;
5350
5351 /* We should always have a positive probe threshold. */
5352 gcc_assert (min_probe_threshold > 0);
5353
5354 if (flag_stack_clash_protection && !final_adjustment_p)
5355 {
5356 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5357 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5358
5359 if (known_eq (frame_size, 0))
5360 {
5361 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5362 }
5363 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5364 && known_lt (final_adjust, guard_used_by_caller))
5365 {
5366 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5367 }
5368 }
5369
5370 /* If SIZE is not large enough to require probing, just adjust the stack and
5371 exit. */
5372 if (known_lt (poly_size, min_probe_threshold)
5373 || !flag_stack_clash_protection)
5374 {
5375 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5376 return;
5377 }
5378
5379 HOST_WIDE_INT size;
5380 /* Handle the SVE non-constant case first. */
5381 if (!poly_size.is_constant (&size))
5382 {
5383 if (dump_file)
5384 {
5385 fprintf (dump_file, "Stack clash SVE prologue: ");
5386 print_dec (poly_size, dump_file);
5387 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5388 }
5389
5390 /* First calculate the amount of bytes we're actually spilling. */
5391 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5392 poly_size, temp1, temp2, false, true);
5393
5394 rtx_insn *insn = get_last_insn ();
5395
5396 if (frame_related_p)
5397 {
5398 /* This is done to provide unwinding information for the stack
5399 adjustments we're about to do; however, to prevent the optimizers
5400 from removing the R11 move and leaving the CFA note (which would be
5401 very wrong) we tie the old and new stack pointer together.
5402 The tie will expand to nothing but the optimizers will not touch
5403 the instruction. */
5404 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5405 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5406 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5407
5408 /* We want the CFA independent of the stack pointer for the
5409 duration of the loop. */
5410 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5411 RTX_FRAME_RELATED_P (insn) = 1;
5412 }
5413
5414 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5415 rtx guard_const = gen_int_mode (guard_size, Pmode);
5416
5417 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5418 stack_pointer_rtx, temp1,
5419 probe_const, guard_const));
5420
5421 /* Now reset the CFA register if needed. */
5422 if (frame_related_p)
5423 {
5424 add_reg_note (insn, REG_CFA_DEF_CFA,
5425 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5426 gen_int_mode (poly_size, Pmode)));
5427 RTX_FRAME_RELATED_P (insn) = 1;
5428 }
5429
5430 return;
5431 }
5432
5433 if (dump_file)
5434 fprintf (dump_file,
5435 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5436 " bytes, probing will be required.\n", size);
5437
5438 /* Round size to the nearest multiple of guard_size, and calculate the
5439 residual as the difference between the original size and the rounded
5440 size. */
5441 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5442 HOST_WIDE_INT residual = size - rounded_size;
5443
5444 /* We can handle a small number of allocations/probes inline. Otherwise
5445 punt to a loop. */
5446 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5447 {
5448 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5449 {
5450 aarch64_sub_sp (NULL, temp2, guard_size, true);
5451 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5452 guard_used_by_caller));
5453 emit_insn (gen_blockage ());
5454 }
5455 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5456 }
5457 else
5458 {
5459 /* Compute the ending address. */
5460 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5461 temp1, NULL, false, true);
5462 rtx_insn *insn = get_last_insn ();
5463
5464 /* For the initial allocation, we don't have a frame pointer
5465 set up, so we always need CFI notes. If we're doing the
5466 final allocation, then we may have a frame pointer, in which
5467 case it is the CFA, otherwise we need CFI notes.
5468
5469 We can determine which allocation we are doing by looking at
5470 the value of FRAME_RELATED_P since the final allocations are not
5471 frame related. */
5472 if (frame_related_p)
5473 {
5474 /* We want the CFA independent of the stack pointer for the
5475 duration of the loop. */
5476 add_reg_note (insn, REG_CFA_DEF_CFA,
5477 plus_constant (Pmode, temp1, rounded_size));
5478 RTX_FRAME_RELATED_P (insn) = 1;
5479 }
5480
5481 /* This allocates and probes the stack. Note that this re-uses some of
5482 the existing Ada stack protection code. However we are guaranteed not
5483 to enter the non-loop or residual branches of that code.
5484
5485 The non-loop part won't be entered because if our allocation amount
5486 doesn't require a loop, the case above would handle it.
5487
5488 The residual amount won't be entered because TEMP1 is a multiple of
5489 the allocation size. The residual will always be 0. As such, the only
5490 part we are actually using from that code is the loop setup. The
5491 actual probing is done in aarch64_output_probe_stack_range. */
5492 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5493 stack_pointer_rtx, temp1));
5494
5495 /* Now reset the CFA register if needed. */
5496 if (frame_related_p)
5497 {
5498 add_reg_note (insn, REG_CFA_DEF_CFA,
5499 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5500 RTX_FRAME_RELATED_P (insn) = 1;
5501 }
5502
5503 emit_insn (gen_blockage ());
5504 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5505 }
5506
5507 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5508 be probed. This maintains the requirement that each page is probed at
5509 least once. For initial probing we probe only if the allocation is
5510 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5511 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5512 GUARD_SIZE. This ensures that for any allocation large enough to trigger
5513 a probe here we'll have at least one, and that if an allocation is not large
5514 enough for this code to emit anything for it, the page would have been
5515 probed by the saving of FP/LR either by this function or any callees. If
5516 we don't have any callees then we won't have more stack adjustments and so
5517 are still safe. */
5518 if (residual)
5519 {
5520 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5521 /* If we're doing final adjustments, and we've done any full page
5522 allocations then any residual needs to be probed. */
5523 if (final_adjustment_p && rounded_size != 0)
5524 min_probe_threshold = 0;
5525 /* If doing a small final adjustment, we always probe at offset 0.
5526 This is done to avoid issues when LR is not at position 0 or when
5527 the final adjustment is smaller than the probing offset. */
5528 else if (final_adjustment_p && rounded_size == 0)
5529 residual_probe_offset = 0;
5530
5531 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5532 if (residual >= min_probe_threshold)
5533 {
5534 if (dump_file)
5535 fprintf (dump_file,
5536 "Stack clash AArch64 prologue residuals: "
5537 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5538 "\n", residual);
5539
5540 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5541 residual_probe_offset));
5542 emit_insn (gen_blockage ());
5543 }
5544 }
5545 }
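
/* A worked example (a sketch; the numbers are purely illustrative): with
   the default 64KB guard and the 1KB caller-reserved buffer, an initial
   adjustment of 133072 bytes has MIN_PROBE_THRESHOLD == 64512, so probing
   is needed; ROUNDED_SIZE == 131072 and RESIDUAL == 2000. Assuming two
   pages fall within STACK_CLASH_MAX_UNROLL_PAGES, the inline path emits
   roughly:

     sub sp, sp, 65536
     str xzr, [sp, 1024]    // probe within the caller-reserved buffer
     sub sp, sp, 65536
     str xzr, [sp, 1024]
     sub sp, sp, 2000       // residual; 2000 < 64512, so no extra probe

   with a scheduling barrier after each adjustment, as described above.  */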
5546
5547 /* Return 1 if the register is used by the epilogue. We need to say the
5548 return register is used, but only after epilogue generation is complete.
5549 Note that in the case of sibcalls, the values "used by the epilogue" are
5550 considered live at the start of the called function.
5551
5552 For SIMD functions we need to return 1 for FP registers that are saved and
5553 restored by a function but are not zero in call_used_regs. If we do not do
5554 this, optimizations may remove the restore of the register. */
5555
5556 int
5557 aarch64_epilogue_uses (int regno)
5558 {
5559 if (epilogue_completed)
5560 {
5561 if (regno == LR_REGNUM)
5562 return 1;
5563 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5564 return 1;
5565 }
5566 return 0;
5567 }
5568
5569 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5570 is saved at BASE + OFFSET. */
5571
5572 static void
5573 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5574 rtx base, poly_int64 offset)
5575 {
5576 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5577 add_reg_note (insn, REG_CFA_EXPRESSION,
5578 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5579 }
5580
5581 /* AArch64 stack frames generated by this compiler look like:
5582
5583 +-------------------------------+
5584 | |
5585 | incoming stack arguments |
5586 | |
5587 +-------------------------------+
5588 | | <-- incoming stack pointer (aligned)
5589 | callee-allocated save area |
5590 | for register varargs |
5591 | |
5592 +-------------------------------+
5593 | local variables | <-- frame_pointer_rtx
5594 | |
5595 +-------------------------------+
5596 | padding | \
5597 +-------------------------------+ |
5598 | callee-saved registers | | frame.saved_regs_size
5599 +-------------------------------+ |
5600 | LR' | |
5601 +-------------------------------+ |
5602 | FP' | / <- hard_frame_pointer_rtx (aligned)
5603 +-------------------------------+
5604 | dynamic allocation |
5605 +-------------------------------+
5606 | padding |
5607 +-------------------------------+
5608 | outgoing stack arguments | <-- arg_pointer
5609 | |
5610 +-------------------------------+
5611 | | <-- stack_pointer_rtx (aligned)
5612
5613 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5614 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5615 unchanged.
5616
5617 By default for stack-clash we assume the guard is at least 64KB, but this
5618 value is configurable to either 4KB or 64KB. We also force the guard size to
5619 be the same as the probing interval and both values are kept in sync.
5620
5621 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5622 on the guard size) of stack space without probing.
5623
5624 When probing is needed, we emit a probe at the start of the prologue
5625 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5626
5627 We have to track how much space has been allocated and the only stores
5628 to the stack we track as implicit probes are the FP/LR stores.
5629
5630 For outgoing arguments we probe if the size is larger than 1KB, such that
5631 the ABI specified buffer is maintained for the next callee.
5632
5633 The following registers are reserved during frame layout and should not be
5634 used for any other purpose:
5635
5636 - r11: Used by stack clash protection when SVE is enabled.
5637 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
5638 - r14 and r15: Used for speculation tracking.
5639 - r16(IP0), r17(IP1): Used by indirect tailcalls.
5640 - r30(LR), r29(FP): Used by standard frame layout.
5641
5642 These registers must be avoided in frame layout related code unless the
5643 explicit intention is to interact with one of the features listed above. */
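
/* As a concrete illustration (a sketch only; the real layout depends on the
   options in effect and on what the function uses), a function with 16
   bytes of locals that saves x19 and needs a frame record could end up
   with:

     +-------------------------------+ <-- incoming stack pointer
     | locals (16 bytes + padding) |
     +-------------------------------+
     | x19 |
     +-------------------------------+
     | LR' (x30) |
     +-------------------------------+
     | FP' (x29) | <-- hard_frame_pointer_rtx
     +-------------------------------+ <-- outgoing stack pointer

   with no varargs save area, no dynamic allocation and no outgoing stack
   arguments.  */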
5644
5645 /* Generate the prologue instructions for entry into a function.
5646 Establish the stack frame by decreasing the stack pointer with a
5647 properly calculated size and, if necessary, create a frame record
5648 filled with the values of LR and previous frame pointer. The
5649 current FP is also set up if it is in use. */
5650
5651 void
5652 aarch64_expand_prologue (void)
5653 {
5654 poly_int64 frame_size = cfun->machine->frame.frame_size;
5655 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5656 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5657 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5658 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5659 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5660 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5661 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5662 rtx_insn *insn;
5663
5664 /* Sign return address for functions. */
5665 if (aarch64_return_address_signing_enabled ())
5666 {
5667 switch (aarch64_ra_sign_key)
5668 {
5669 case AARCH64_KEY_A:
5670 insn = emit_insn (gen_paciasp ());
5671 break;
5672 case AARCH64_KEY_B:
5673 insn = emit_insn (gen_pacibsp ());
5674 break;
5675 default:
5676 gcc_unreachable ();
5677 }
5678 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5679 RTX_FRAME_RELATED_P (insn) = 1;
5680 }
5681
5682 if (flag_stack_usage_info)
5683 current_function_static_stack_size = constant_lower_bound (frame_size);
5684
5685 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5686 {
5687 if (crtl->is_leaf && !cfun->calls_alloca)
5688 {
5689 if (maybe_gt (frame_size, PROBE_INTERVAL)
5690 && maybe_gt (frame_size, get_stack_check_protect ()))
5691 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5692 (frame_size
5693 - get_stack_check_protect ()));
5694 }
5695 else if (maybe_gt (frame_size, 0))
5696 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5697 }
5698
5699 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5700 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5701
5702 /* In theory we should never have both an initial adjustment
5703 and a callee save adjustment. Verify that is the case since the
5704 code below does not handle it for -fstack-clash-protection. */
5705 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5706
5707 /* Will only probe if the initial adjustment is larger than the guard
5708 less the amount of the guard reserved for use by the caller's
5709 outgoing args. */
5710 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5711 true, false);
5712
5713 if (callee_adjust != 0)
5714 aarch64_push_regs (reg1, reg2, callee_adjust);
5715
5716 if (emit_frame_chain)
5717 {
5718 poly_int64 reg_offset = callee_adjust;
5719 if (callee_adjust == 0)
5720 {
5721 reg1 = R29_REGNUM;
5722 reg2 = R30_REGNUM;
5723 reg_offset = callee_offset;
5724 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5725 }
5726 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5727 stack_pointer_rtx, callee_offset,
5728 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5729 if (frame_pointer_needed && !frame_size.is_constant ())
5730 {
5731 /* Variable-sized frames need to describe the save slot
5732 address using DW_CFA_expression rather than DW_CFA_offset.
5733 This means that, without taking further action, the
5734 locations of the registers that we've already saved would
5735 remain based on the stack pointer even after we redefine
5736 the CFA based on the frame pointer. We therefore need new
5737 DW_CFA_expressions to re-express the save slots with addresses
5738 based on the frame pointer. */
5739 rtx_insn *insn = get_last_insn ();
5740 gcc_assert (RTX_FRAME_RELATED_P (insn));
5741
5742 /* Add an explicit CFA definition if this was previously
5743 implicit. */
5744 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5745 {
5746 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5747 callee_offset);
5748 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5749 gen_rtx_SET (hard_frame_pointer_rtx, src));
5750 }
5751
5752 /* Change the save slot expressions for the registers that
5753 we've already saved. */
5754 reg_offset -= callee_offset;
5755 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5756 reg_offset + UNITS_PER_WORD);
5757 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5758 reg_offset);
5759 }
5760 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5761 }
5762
5763 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5764 callee_adjust != 0 || emit_frame_chain);
5765 if (aarch64_simd_decl_p (cfun->decl))
5766 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5767 callee_adjust != 0 || emit_frame_chain);
5768 else
5769 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5770 callee_adjust != 0 || emit_frame_chain);
5771
5772 /* We may need to probe the final adjustment if it is larger than the guard
5773 that is assumed by the callee. */
5774 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5775 !frame_pointer_needed, true);
5776 }
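
/* For a simple non-leaf function that takes the store-pair-with-writeback
   path (a sketch; the exact sequence depends on the frame layout computed
   elsewhere), the expansion amounts to something like:

     stp x29, x30, [sp, -16]!   // callee_adjust: push FP/LR, allocate 16 bytes
     mov x29, sp                // emit_frame_chain: establish the frame record
     sub sp, sp, 16             // final_adjust: outgoing argument area

   with the corresponding CFA notes attached as the code above describes.  */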
5777
5778 /* Return TRUE if we can use a simple_return insn.
5779
5780 This function checks whether the callee saved stack is empty, which
5781 means no restore actions are needed. The pro_and_epilogue pass will use
5782 this to check whether the shrink-wrapping optimization is feasible. */
5783
5784 bool
5785 aarch64_use_return_insn_p (void)
5786 {
5787 if (!reload_completed)
5788 return false;
5789
5790 if (crtl->profile)
5791 return false;
5792
5793 return known_eq (cfun->machine->frame.frame_size, 0);
5794 }
5795
5796 /* Return false for non-leaf SIMD functions in order to avoid
5797 shrink-wrapping them. Doing this will lose the necessary
5798 save/restore of FP registers. */
5799
5800 bool
5801 aarch64_use_simple_return_insn_p (void)
5802 {
5803 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5804 return false;
5805
5806 return true;
5807 }
5808
5809 /* Generate the epilogue instructions for returning from a function.
5810 This is almost exactly the reverse of the prologue sequence, except
5811 that we need to insert barriers to avoid scheduling loads that read
5812 from a deallocated stack, and we optimize the unwind records by
5813 emitting them all together if possible. */
5814 void
5815 aarch64_expand_epilogue (bool for_sibcall)
5816 {
5817 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5818 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5819 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5820 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5821 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5822 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5823 rtx cfi_ops = NULL;
5824 rtx_insn *insn;
5825 /* A stack clash protection prologue may not have left EP0_REGNUM or
5826 EP1_REGNUM in a usable state. The same is true for allocations
5827 with an SVE component, since we then need both temporary registers
5828 for each allocation. For stack clash we are in a usable state if
5829 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5830 HOST_WIDE_INT guard_size
5831 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5832 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5833
5834 /* We can re-use the registers when the allocation amount is smaller than
5835 guard_size - guard_used_by_caller because we won't be doing any probes
5836 then. In such situations the register should remain live with the correct
5837 value. */
5838 bool can_inherit_p = (initial_adjust.is_constant ()
5839 && final_adjust.is_constant ())
5840 && (!flag_stack_clash_protection
5841 || known_lt (initial_adjust,
5842 guard_size - guard_used_by_caller));
5843
5844 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
5845 bool need_barrier_p
5846 = maybe_ne (get_frame_size ()
5847 + cfun->machine->frame.saved_varargs_size, 0);
5848
5849 /* Emit a barrier to prevent loads from a deallocated stack. */
5850 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5851 || cfun->calls_alloca
5852 || crtl->calls_eh_return)
5853 {
5854 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5855 need_barrier_p = false;
5856 }
5857
5858 /* Restore the stack pointer from the frame pointer if it may not
5859 be the same as the stack pointer. */
5860 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5861 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5862 if (frame_pointer_needed
5863 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5864 /* If writeback is used when restoring callee-saves, the CFA
5865 is restored on the instruction doing the writeback. */
5866 aarch64_add_offset (Pmode, stack_pointer_rtx,
5867 hard_frame_pointer_rtx, -callee_offset,
5868 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5869 else
5870 /* The case where we need to re-use the register here is very rare, so
5871 avoid the complicated condition and just always emit a move if the
5872 immediate doesn't fit. */
5873 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5874
5875 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5876 callee_adjust != 0, &cfi_ops);
5877 if (aarch64_simd_decl_p (cfun->decl))
5878 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5879 callee_adjust != 0, &cfi_ops);
5880 else
5881 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5882 callee_adjust != 0, &cfi_ops);
5883
5884 if (need_barrier_p)
5885 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5886
5887 if (callee_adjust != 0)
5888 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5889
5890 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5891 {
5892 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5893 insn = get_last_insn ();
5894 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5895 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5896 RTX_FRAME_RELATED_P (insn) = 1;
5897 cfi_ops = NULL;
5898 }
5899
5900 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
5901 restrict the emit_move optimization to leaf functions. */
5902 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5903 (!can_inherit_p || !crtl->is_leaf
5904 || df_regs_ever_live_p (EP0_REGNUM)));
5905
5906 if (cfi_ops)
5907 {
5908 /* Emit delayed restores and reset the CFA to be SP. */
5909 insn = get_last_insn ();
5910 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5911 REG_NOTES (insn) = cfi_ops;
5912 RTX_FRAME_RELATED_P (insn) = 1;
5913 }
5914
5915 /* We prefer to emit the combined return/authenticate instruction RETAA;
5916 however, there are three cases in which we must instead emit an explicit
5917 authentication instruction.
5918
5919 1) Sibcalls don't return in a normal way, so if we're about to call one
5920 we must authenticate.
5921
5922 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5923 generating code for !TARGET_ARMV8_3 we can't use it and must
5924 explicitly authenticate.
5925
5926 3) On an eh_return path we make extra stack adjustments to update the
5927 canonical frame address to be the exception handler's CFA. We want
5928 to authenticate using the CFA of the function which calls eh_return.
5929 */
5930 if (aarch64_return_address_signing_enabled ()
5931 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5932 {
5933 switch (aarch64_ra_sign_key)
5934 {
5935 case AARCH64_KEY_A:
5936 insn = emit_insn (gen_autiasp ());
5937 break;
5938 case AARCH64_KEY_B:
5939 insn = emit_insn (gen_autibsp ());
5940 break;
5941 default:
5942 gcc_unreachable ();
5943 }
5944 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5945 RTX_FRAME_RELATED_P (insn) = 1;
5946 }
5947
5948 /* Stack adjustment for exception handler. */
5949 if (crtl->calls_eh_return && !for_sibcall)
5950 {
5951 /* We need to unwind the stack by the offset computed by
5952 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5953 to be SP; letting the CFA move during this adjustment
5954 is just as correct as retaining the CFA from the body
5955 of the function. Therefore, do nothing special. */
5956 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5957 }
5958
5959 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5960 if (!for_sibcall)
5961 emit_jump_insn (ret_rtx);
5962 }
5963
5964 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5965 normally or return to a previous frame after unwinding.
5966
5967 An EH return uses a single shared return sequence. The epilogue is
5968 exactly like a normal epilogue except that it has an extra input
5969 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5970 that must be applied after the frame has been destroyed. An extra label
5971 is inserted before the epilogue which initializes this register to zero,
5972 and this is the entry point for a normal return.
5973
5974 An actual EH return updates the return address, initializes the stack
5975 adjustment and jumps directly into the epilogue (bypassing the zeroing
5976 of the adjustment). Since the return address is typically saved on the
5977 stack when a function makes a call, the saved LR must be updated outside
5978 the epilogue.
5979
5980 This poses problems as the store is generated well before the epilogue,
5981 so the offset of LR is not known yet. Also optimizations will remove the
5982 store as it appears dead, even after the epilogue is generated (as the
5983 base or offset for loading LR is different in many cases).
5984
5985 To avoid these problems this implementation forces the frame pointer
5986 in eh_return functions so that the location of LR is fixed and known early.
5987 It also marks the store volatile, so no optimization is permitted to
5988 remove the store. */
5989 rtx
5990 aarch64_eh_return_handler_rtx (void)
5991 {
5992 rtx tmp = gen_frame_mem (Pmode,
5993 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5994
5995 /* Mark the store volatile, so no optimization is permitted to remove it. */
5996 MEM_VOLATILE_P (tmp) = true;
5997 return tmp;
5998 }
5999
6000 /* Output code to add DELTA to the first argument, and then jump
6001 to FUNCTION. Used for C++ multiple inheritance. */
6002 static void
6003 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6004 HOST_WIDE_INT delta,
6005 HOST_WIDE_INT vcall_offset,
6006 tree function)
6007 {
6008 /* The this pointer is always in x0. Note that this differs from
6009 Arm where the this pointer may be bumped to r1 if r0 is required
6010 to return a pointer to an aggregate. On AArch64 a result value
6011 pointer will be in x8. */
6012 int this_regno = R0_REGNUM;
6013 rtx this_rtx, temp0, temp1, addr, funexp;
6014 rtx_insn *insn;
6015 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6016
6017 if (aarch64_bti_enabled ())
6018 emit_insn (gen_bti_c ());
6019
6020 reload_completed = 1;
6021 emit_note (NOTE_INSN_PROLOGUE_END);
6022
6023 this_rtx = gen_rtx_REG (Pmode, this_regno);
6024 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6025 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6026
6027 if (vcall_offset == 0)
6028 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6029 else
6030 {
6031 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6032
6033 addr = this_rtx;
6034 if (delta != 0)
6035 {
6036 if (delta >= -256 && delta < 256)
6037 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6038 plus_constant (Pmode, this_rtx, delta));
6039 else
6040 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6041 temp1, temp0, false);
6042 }
6043
6044 if (Pmode == ptr_mode)
6045 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6046 else
6047 aarch64_emit_move (temp0,
6048 gen_rtx_ZERO_EXTEND (Pmode,
6049 gen_rtx_MEM (ptr_mode, addr)));
6050
6051 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6052 addr = plus_constant (Pmode, temp0, vcall_offset);
6053 else
6054 {
6055 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6056 Pmode);
6057 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6058 }
6059
6060 if (Pmode == ptr_mode)
6061 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6062 else
6063 aarch64_emit_move (temp1,
6064 gen_rtx_SIGN_EXTEND (Pmode,
6065 gen_rtx_MEM (ptr_mode, addr)));
6066
6067 emit_insn (gen_add2_insn (this_rtx, temp1));
6068 }
6069
6070 /* Generate a tail call to the target function. */
6071 if (!TREE_USED (function))
6072 {
6073 assemble_external (function);
6074 TREE_USED (function) = 1;
6075 }
6076 funexp = XEXP (DECL_RTL (function), 0);
6077 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6078 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6079 SIBLING_CALL_P (insn) = 1;
6080
6081 insn = get_insns ();
6082 shorten_branches (insn);
6083
6084 assemble_start_function (thunk, fnname);
6085 final_start_function (insn, file, 1);
6086 final (insn, file, 1);
6087 final_end_function ();
6088 assemble_end_function (thunk, fnname);
6089
6090 /* Stop pretending to be a post-reload pass. */
6091 reload_completed = 0;
6092 }
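
/* For the common delta-only case (a sketch; "target_function" stands in
   for the real method symbol), the emitted thunk is simply:

     add x0, x0, 16        // apply DELTA to the this pointer
     b   target_function   // sibcall to FUNCTION

   while a nonzero VCALL_OFFSET additionally loads the vtable pointer
   through the adjusted this pointer and adds the value found at
   VCALL_OFFSET within it before the branch.  */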
6093
6094 static bool
6095 aarch64_tls_referenced_p (rtx x)
6096 {
6097 if (!TARGET_HAVE_TLS)
6098 return false;
6099 subrtx_iterator::array_type array;
6100 FOR_EACH_SUBRTX (iter, array, x, ALL)
6101 {
6102 const_rtx x = *iter;
6103 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6104 return true;
6105 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6106 TLS offsets, not real symbol references. */
6107 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6108 iter.skip_subrtxes ();
6109 }
6110 return false;
6111 }
6112
6113
6114 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6115 a left shift of 0 or 12 bits. */
6116 bool
6117 aarch64_uimm12_shift (HOST_WIDE_INT val)
6118 {
6119 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6120 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6121 );
6122 }
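
/* For example (illustrative values): 0xabc and 0xabc000 both satisfy this
   test, fitting entirely within bits [0,11] or bits [12,23] respectively,
   whereas 0x1001 does not, because it needs bits from both ranges.  */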
6123
6124 /* Return VAL rounded down to the nearest value that will fit as a 12-bit
6125 unsigned immediate created with a left shift of 0 or 12. */
6126 static HOST_WIDE_INT
6127 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6128 {
6129 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6130 handle correctly. */
6131 gcc_assert ((val & 0xffffff) == val);
6132
6133 if (((val & 0xfff) << 0) == val)
6134 return val;
6135
6136 return val & (0xfff << 12);
6137 }
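
/* For example (illustrative values): 0xabc is returned unchanged, while
   0xabcde is clamped to 0xab000, i.e. the low 12 bits are dropped so that
   the result is encodable as a 12-bit immediate shifted left by 12.  */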
6138
6139 /* Return true if val is an immediate that can be loaded into a
6140 register by a MOVZ instruction. */
6141 static bool
6142 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6143 {
6144 if (GET_MODE_SIZE (mode) > 4)
6145 {
6146 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6147 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6148 return 1;
6149 }
6150 else
6151 {
6152 /* Ignore sign extension. */
6153 val &= (HOST_WIDE_INT) 0xffffffff;
6154 }
6155 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6156 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6157 }
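
/* For example (illustrative values): in DImode, 0x12340000 (0x1234 << 16)
   and 0xabcd000000000000 (0xabcd << 48) are single-MOVZ values, whereas
   0x12340001 spans two 16-bit chunks and is rejected.  */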
6158
6159 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6160 64-bit (DImode) integer. */
6161
6162 static unsigned HOST_WIDE_INT
6163 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6164 {
6165 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6166 while (size < 64)
6167 {
6168 val &= (HOST_WIDE_INT_1U << size) - 1;
6169 val |= val << size;
6170 size *= 2;
6171 }
6172 return val;
6173 }
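
/* A standalone analogue of this replication step (a sketch using plain C
   types rather than GCC's machine modes), handy for experimenting with
   the immediate rules outside the compiler:

     #include <stdint.h>

     static uint64_t
     replicate (uint64_t val, unsigned int width)
     {
       while (width < 64)
         {
           val &= ((uint64_t) 1 << width) - 1;
           val |= val << width;
           width *= 2;
         }
       return val;
     }

   For instance replicate (0xab, 8) yields 0xabababababababab, matching
   what this function computes for a QImode element.  */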
6174
6175 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6176
6177 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6178 {
6179 0x0000000100000001ull,
6180 0x0001000100010001ull,
6181 0x0101010101010101ull,
6182 0x1111111111111111ull,
6183 0x5555555555555555ull,
6184 };
6185
6186
6187 /* Return true if val is a valid bitmask immediate. */
6188
6189 bool
6190 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6191 {
6192 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6193 int bits;
6194
6195 /* Check for a single sequence of one bits and return quickly if so.
6196 The special cases of all ones and all zeroes return false.
6197 val = aarch64_replicate_bitmask_imm (val_in, mode);
6198 tmp = val + (val & -val);
6199
6200 if (tmp == (tmp & -tmp))
6201 return (val + 1) > 1;
6202
6203 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6204 if (mode == SImode)
6205 val = (val << 32) | (val & 0xffffffff);
6206
6207 /* Invert if the immediate doesn't start with a zero bit - this means we
6208 only need to search for sequences of one bits. */
6209 if (val & 1)
6210 val = ~val;
6211
6212 /* Find the first set bit and set tmp to val with the first sequence of one
6213 bits removed. Return success if there is a single sequence of ones. */
6214 first_one = val & -val;
6215 tmp = val & (val + first_one);
6216
6217 if (tmp == 0)
6218 return true;
6219
6220 /* Find the next set bit and compute the difference in bit position. */
6221 next_one = tmp & -tmp;
6222 bits = clz_hwi (first_one) - clz_hwi (next_one);
6223 mask = val ^ tmp;
6224
6225 /* Check the bit position difference is a power of 2, and that the first
6226 sequence of one bits fits within 'bits' bits. */
6227 if ((mask >> bits) != 0 || bits != (bits & -bits))
6228 return false;
6229
6230 /* Check the sequence of one bits is repeated 64/bits times. */
6231 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6232 }
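
/* Two illustrative data points for this test (not exhaustive): in DImode,
   0x0000000000ff0000 is accepted by the quick single-run check above,
   since val + (val & -val) == 0x0000000001000000 is a power of two, while
   0x0000ff00000000ff is rejected because its two runs of ones do not form
   a repeating pattern whose period is a power of two.  */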
6233
6234 /* Create a mask of ones covering the range from the lowest to the highest
6235 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
6236
6237 unsigned HOST_WIDE_INT
6238 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6239 {
6240 int lowest_bit_set = ctz_hwi (val_in);
6241 int highest_bit_set = floor_log2 (val_in);
6242 gcc_assert (val_in != 0);
6243
6244 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6245 (HOST_WIDE_INT_1U << lowest_bit_set));
6246 }
6247
6248 /* Create a constant in which the bits outside the range from the lowest to
6249 the highest set bit of VAL_IN are set to 1. */
6250
6251 unsigned HOST_WIDE_INT
6252 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6253 {
6254 return val_in | ~aarch64_and_split_imm1 (val_in);
6255 }
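
/* Worked example (illustrative): VAL_IN == 0x0000ff00000000ff is not itself
   a valid AND bitmask immediate, but aarch64_and_split_imm1 gives
   0x0000ffffffffffff (a solid run from bit 0 to bit 47) and
   aarch64_and_split_imm2 gives 0xffffff00000000ff (a rotated run of 32
   ones). Their bitwise AND reproduces VAL_IN, which is the property the
   two-instruction AND split relies on.  */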
6256
6257 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6258
6259 bool
6260 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6261 {
6262 scalar_int_mode int_mode;
6263 if (!is_a <scalar_int_mode> (mode, &int_mode))
6264 return false;
6265
6266 if (aarch64_bitmask_imm (val_in, int_mode))
6267 return false;
6268
6269 if (aarch64_move_imm (val_in, int_mode))
6270 return false;
6271
6272 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6273
6274 return aarch64_bitmask_imm (imm2, int_mode);
6275 }
6276
6277 /* Return true if val is an immediate that can be loaded into a
6278 register in a single instruction. */
6279 bool
6280 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6281 {
6282 scalar_int_mode int_mode;
6283 if (!is_a <scalar_int_mode> (mode, &int_mode))
6284 return false;
6285
6286 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6287 return 1;
6288 return aarch64_bitmask_imm (val, int_mode);
6289 }
6290
6291 static bool
6292 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6293 {
6294 rtx base, offset;
6295
6296 if (GET_CODE (x) == HIGH)
6297 return true;
6298
6299 /* There's no way to calculate VL-based values using relocations. */
6300 subrtx_iterator::array_type array;
6301 FOR_EACH_SUBRTX (iter, array, x, ALL)
6302 if (GET_CODE (*iter) == CONST_POLY_INT)
6303 return true;
6304
6305 split_const (x, &base, &offset);
6306 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6307 {
6308 if (aarch64_classify_symbol (base, INTVAL (offset))
6309 != SYMBOL_FORCE_TO_MEM)
6310 return true;
6311 else
6312 /* Avoid generating a 64-bit relocation in ILP32; leave
6313 to aarch64_expand_mov_immediate to handle it properly. */
6314 return mode != ptr_mode;
6315 }
6316
6317 return aarch64_tls_referenced_p (x);
6318 }
6319
6320 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6321 The expansion for a table switch is quite expensive due to the number
6322 of instructions, the table lookup and the hard-to-predict indirect jump.
6323 When optimizing for speed with -O3 enabled, use the per-core tuning if
6324 set; otherwise use tables for > 16 cases as a tradeoff between size and
6325 performance. When optimizing for size, use the default setting. */
6326
6327 static unsigned int
6328 aarch64_case_values_threshold (void)
6329 {
6330 /* Use the specified limit for the number of cases before using jump
6331 tables at higher optimization levels. */
6332 if (optimize > 2
6333 && selected_cpu->tune->max_case_values != 0)
6334 return selected_cpu->tune->max_case_values;
6335 else
6336 return optimize_size ? default_case_values_threshold () : 17;
6337 }
6338
6339 /* Return true if register REGNO is a valid index register.
6340 STRICT_P is true if REG_OK_STRICT is in effect. */
6341
6342 bool
6343 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6344 {
6345 if (!HARD_REGISTER_NUM_P (regno))
6346 {
6347 if (!strict_p)
6348 return true;
6349
6350 if (!reg_renumber)
6351 return false;
6352
6353 regno = reg_renumber[regno];
6354 }
6355 return GP_REGNUM_P (regno);
6356 }
6357
6358 /* Return true if register REGNO is a valid base register.
6359 STRICT_P is true if REG_OK_STRICT is in effect. */
6360
6361 bool
6362 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6363 {
6364 if (!HARD_REGISTER_NUM_P (regno))
6365 {
6366 if (!strict_p)
6367 return true;
6368
6369 if (!reg_renumber)
6370 return false;
6371
6372 regno = reg_renumber[regno];
6373 }
6374
6375 /* The fake registers will be eliminated to either the stack or
6376 hard frame pointer, both of which are usually valid base registers.
6377 Reload deals with the cases where the eliminated form isn't valid. */
6378 return (GP_REGNUM_P (regno)
6379 || regno == SP_REGNUM
6380 || regno == FRAME_POINTER_REGNUM
6381 || regno == ARG_POINTER_REGNUM);
6382 }
6383
6384 /* Return true if X is a valid base register.
6385 STRICT_P is true if REG_OK_STRICT is in effect. */
6386
6387 static bool
6388 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6389 {
6390 if (!strict_p
6391 && GET_CODE (x) == SUBREG
6392 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6393 x = SUBREG_REG (x);
6394
6395 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6396 }
6397
6398 /* Return true if address offset is a valid index. If it is, fill in INFO
6399 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6400
6401 static bool
6402 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6403 machine_mode mode, bool strict_p)
6404 {
6405 enum aarch64_address_type type;
6406 rtx index;
6407 int shift;
6408
6409 /* (reg:P) */
6410 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6411 && GET_MODE (x) == Pmode)
6412 {
6413 type = ADDRESS_REG_REG;
6414 index = x;
6415 shift = 0;
6416 }
6417 /* (sign_extend:DI (reg:SI)) */
6418 else if ((GET_CODE (x) == SIGN_EXTEND
6419 || GET_CODE (x) == ZERO_EXTEND)
6420 && GET_MODE (x) == DImode
6421 && GET_MODE (XEXP (x, 0)) == SImode)
6422 {
6423 type = (GET_CODE (x) == SIGN_EXTEND)
6424 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6425 index = XEXP (x, 0);
6426 shift = 0;
6427 }
6428 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6429 else if (GET_CODE (x) == MULT
6430 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6431 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6432 && GET_MODE (XEXP (x, 0)) == DImode
6433 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6434 && CONST_INT_P (XEXP (x, 1)))
6435 {
6436 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6437 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6438 index = XEXP (XEXP (x, 0), 0);
6439 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6440 }
6441 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6442 else if (GET_CODE (x) == ASHIFT
6443 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6444 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6445 && GET_MODE (XEXP (x, 0)) == DImode
6446 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6447 && CONST_INT_P (XEXP (x, 1)))
6448 {
6449 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6450 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6451 index = XEXP (XEXP (x, 0), 0);
6452 shift = INTVAL (XEXP (x, 1));
6453 }
6454 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6455 else if ((GET_CODE (x) == SIGN_EXTRACT
6456 || GET_CODE (x) == ZERO_EXTRACT)
6457 && GET_MODE (x) == DImode
6458 && GET_CODE (XEXP (x, 0)) == MULT
6459 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6460 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6461 {
6462 type = (GET_CODE (x) == SIGN_EXTRACT)
6463 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6464 index = XEXP (XEXP (x, 0), 0);
6465 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6466 if (INTVAL (XEXP (x, 1)) != 32 + shift
6467 || INTVAL (XEXP (x, 2)) != 0)
6468 shift = -1;
6469 }
6470 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6471 (const_int 0xffffffff<<shift)) */
6472 else if (GET_CODE (x) == AND
6473 && GET_MODE (x) == DImode
6474 && GET_CODE (XEXP (x, 0)) == MULT
6475 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6476 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6477 && CONST_INT_P (XEXP (x, 1)))
6478 {
6479 type = ADDRESS_REG_UXTW;
6480 index = XEXP (XEXP (x, 0), 0);
6481 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6482 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6483 shift = -1;
6484 }
6485 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6486 else if ((GET_CODE (x) == SIGN_EXTRACT
6487 || GET_CODE (x) == ZERO_EXTRACT)
6488 && GET_MODE (x) == DImode
6489 && GET_CODE (XEXP (x, 0)) == ASHIFT
6490 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6491 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6492 {
6493 type = (GET_CODE (x) == SIGN_EXTRACT)
6494 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6495 index = XEXP (XEXP (x, 0), 0);
6496 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6497 if (INTVAL (XEXP (x, 1)) != 32 + shift
6498 || INTVAL (XEXP (x, 2)) != 0)
6499 shift = -1;
6500 }
6501 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6502 (const_int 0xffffffff<<shift)) */
6503 else if (GET_CODE (x) == AND
6504 && GET_MODE (x) == DImode
6505 && GET_CODE (XEXP (x, 0)) == ASHIFT
6506 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6507 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6508 && CONST_INT_P (XEXP (x, 1)))
6509 {
6510 type = ADDRESS_REG_UXTW;
6511 index = XEXP (XEXP (x, 0), 0);
6512 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6513 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6514 shift = -1;
6515 }
6516 /* (mult:P (reg:P) (const_int scale)) */
6517 else if (GET_CODE (x) == MULT
6518 && GET_MODE (x) == Pmode
6519 && GET_MODE (XEXP (x, 0)) == Pmode
6520 && CONST_INT_P (XEXP (x, 1)))
6521 {
6522 type = ADDRESS_REG_REG;
6523 index = XEXP (x, 0);
6524 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6525 }
6526 /* (ashift:P (reg:P) (const_int shift)) */
6527 else if (GET_CODE (x) == ASHIFT
6528 && GET_MODE (x) == Pmode
6529 && GET_MODE (XEXP (x, 0)) == Pmode
6530 && CONST_INT_P (XEXP (x, 1)))
6531 {
6532 type = ADDRESS_REG_REG;
6533 index = XEXP (x, 0);
6534 shift = INTVAL (XEXP (x, 1));
6535 }
6536 else
6537 return false;
6538
6539 if (!strict_p
6540 && GET_CODE (index) == SUBREG
6541 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6542 index = SUBREG_REG (index);
6543
6544 if (aarch64_sve_data_mode_p (mode))
6545 {
6546 if (type != ADDRESS_REG_REG
6547 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6548 return false;
6549 }
6550 else
6551 {
6552 if (shift != 0
6553 && !(IN_RANGE (shift, 1, 3)
6554 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6555 return false;
6556 }
6557
6558 if (REG_P (index)
6559 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6560 {
6561 info->type = type;
6562 info->offset = index;
6563 info->shift = shift;
6564 return true;
6565 }
6566
6567 return false;
6568 }
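
/* For instance (a sketch of the RTL shapes involved): with x1 as the base
   register and w2 as a 32-bit index, an access such as
   "ldr x0, [x1, w2, sxtw 3]" reaches the caller as

     (plus:DI (reg:DI x1)
              (ashift:DI (sign_extend:DI (reg:SI w2)) (const_int 3)))

   and the index part handled here is classified as ADDRESS_REG_SXTW with
   a shift of 3, which is accepted for an 8-byte access.  */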
6569
6570 /* Return true if MODE is one of the modes for which we
6571 support LDP/STP operations. */
6572
6573 static bool
6574 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6575 {
6576 return mode == SImode || mode == DImode
6577 || mode == SFmode || mode == DFmode
6578 || (aarch64_vector_mode_supported_p (mode)
6579 && (known_eq (GET_MODE_SIZE (mode), 8)
6580 || (known_eq (GET_MODE_SIZE (mode), 16)
6581 && (aarch64_tune_params.extra_tuning_flags
6582 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6583 }
6584
6585 /* Return true if REGNO is a virtual pointer register, or an eliminable
6586 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6587 include stack_pointer or hard_frame_pointer. */
6588 static bool
6589 virt_or_elim_regno_p (unsigned regno)
6590 {
6591 return ((regno >= FIRST_VIRTUAL_REGISTER
6592 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6593 || regno == FRAME_POINTER_REGNUM
6594 || regno == ARG_POINTER_REGNUM);
6595 }
6596
6597 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6598 If it is, fill in INFO appropriately. STRICT_P is true if
6599 REG_OK_STRICT is in effect. */
6600
6601 bool
6602 aarch64_classify_address (struct aarch64_address_info *info,
6603 rtx x, machine_mode mode, bool strict_p,
6604 aarch64_addr_query_type type)
6605 {
6606 enum rtx_code code = GET_CODE (x);
6607 rtx op0, op1;
6608 poly_int64 offset;
6609
6610 HOST_WIDE_INT const_size;
6611
6612 /* On BE, we use load/store pair for all large int mode load/stores.
6613 TI/TFmode may also use a load/store pair. */
6614 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6615 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6616 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6617 || type == ADDR_QUERY_LDP_STP_N
6618 || mode == TImode
6619 || mode == TFmode
6620 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6621
6622 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
6623 corresponds to the actual size of the memory being loaded/stored and the
6624 mode used for the address calculation is half of that.
6625 if (type == ADDR_QUERY_LDP_STP_N
6626 && known_eq (GET_MODE_SIZE (mode), 16))
6627 mode = DFmode;
6628
6629 bool allow_reg_index_p = (!load_store_pair_p
6630 && (known_lt (GET_MODE_SIZE (mode), 16)
6631 || vec_flags == VEC_ADVSIMD
6632 || vec_flags & VEC_SVE_DATA));
6633
6634 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6635 [Rn, #offset, MUL VL]. */
6636 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6637 && (code != REG && code != PLUS))
6638 return false;
6639
6640 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6641 REG addressing. */
6642 if (advsimd_struct_p
6643 && !BYTES_BIG_ENDIAN
6644 && (code != POST_INC && code != REG))
6645 return false;
6646
6647 gcc_checking_assert (GET_MODE (x) == VOIDmode
6648 || SCALAR_INT_MODE_P (GET_MODE (x)));
6649
6650 switch (code)
6651 {
6652 case REG:
6653 case SUBREG:
6654 info->type = ADDRESS_REG_IMM;
6655 info->base = x;
6656 info->offset = const0_rtx;
6657 info->const_offset = 0;
6658 return aarch64_base_register_rtx_p (x, strict_p);
6659
6660 case PLUS:
6661 op0 = XEXP (x, 0);
6662 op1 = XEXP (x, 1);
6663
6664 if (! strict_p
6665 && REG_P (op0)
6666 && virt_or_elim_regno_p (REGNO (op0))
6667 && poly_int_rtx_p (op1, &offset))
6668 {
6669 info->type = ADDRESS_REG_IMM;
6670 info->base = op0;
6671 info->offset = op1;
6672 info->const_offset = offset;
6673
6674 return true;
6675 }
6676
6677 if (maybe_ne (GET_MODE_SIZE (mode), 0)
6678 && aarch64_base_register_rtx_p (op0, strict_p)
6679 && poly_int_rtx_p (op1, &offset))
6680 {
6681 info->type = ADDRESS_REG_IMM;
6682 info->base = op0;
6683 info->offset = op1;
6684 info->const_offset = offset;
6685
6686 /* TImode and TFmode values are allowed in both pairs of X
6687 registers and individual Q registers. The available
6688 address modes are:
6689 X,X: 7-bit signed scaled offset
6690 Q: 9-bit signed offset
6691 We conservatively require an offset representable in either mode.
6692 When performing the check for pairs of X registers i.e. LDP/STP
6693 pass down DImode since that is the natural size of the LDP/STP
6694 instruction memory accesses. */
6695 if (mode == TImode || mode == TFmode)
6696 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6697 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6698 || offset_12bit_unsigned_scaled_p (mode, offset)));
6699
6700 /* A 7-bit offset check because OImode will emit an ldp/stp
6701 instruction (only big endian will get here).
6702 For ldp/stp instructions, the offset is scaled for the size of a
6703 single element of the pair. */
6704 if (mode == OImode)
6705 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6706
6707 /* Three 9/12-bit offset checks because CImode will emit three
6708 ldr/str instructions (only big endian will get here). */
6709 if (mode == CImode)
6710 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6711 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6712 offset + 32)
6713 || offset_12bit_unsigned_scaled_p (V16QImode,
6714 offset + 32)));
6715
6716 /* Two 7-bit offset checks because XImode will emit two ldp/stp
6717 instructions (only big endian will get here). */
6718 if (mode == XImode)
6719 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6720 && aarch64_offset_7bit_signed_scaled_p (TImode,
6721 offset + 32));
6722
6723 /* Make "m" use the LD1 offset range for SVE data modes, so
6724 that pre-RTL optimizers like ivopts will work to that
6725 instead of the wider LDR/STR range. */
6726 if (vec_flags == VEC_SVE_DATA)
6727 return (type == ADDR_QUERY_M
6728 ? offset_4bit_signed_scaled_p (mode, offset)
6729 : offset_9bit_signed_scaled_p (mode, offset));
6730
6731 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6732 {
6733 poly_int64 end_offset = (offset
6734 + GET_MODE_SIZE (mode)
6735 - BYTES_PER_SVE_VECTOR);
6736 return (type == ADDR_QUERY_M
6737 ? offset_4bit_signed_scaled_p (mode, offset)
6738 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6739 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6740 end_offset)));
6741 }
6742
6743 if (vec_flags == VEC_SVE_PRED)
6744 return offset_9bit_signed_scaled_p (mode, offset);
6745
6746 if (load_store_pair_p)
6747 return ((known_eq (GET_MODE_SIZE (mode), 4)
6748 || known_eq (GET_MODE_SIZE (mode), 8)
6749 || known_eq (GET_MODE_SIZE (mode), 16))
6750 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6751 else
6752 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6753 || offset_12bit_unsigned_scaled_p (mode, offset));
6754 }
6755
6756 if (allow_reg_index_p)
6757 {
6758 /* Look for base + (scaled/extended) index register. */
6759 if (aarch64_base_register_rtx_p (op0, strict_p)
6760 && aarch64_classify_index (info, op1, mode, strict_p))
6761 {
6762 info->base = op0;
6763 return true;
6764 }
6765 if (aarch64_base_register_rtx_p (op1, strict_p)
6766 && aarch64_classify_index (info, op0, mode, strict_p))
6767 {
6768 info->base = op1;
6769 return true;
6770 }
6771 }
6772
6773 return false;
6774
6775 case POST_INC:
6776 case POST_DEC:
6777 case PRE_INC:
6778 case PRE_DEC:
6779 info->type = ADDRESS_REG_WB;
6780 info->base = XEXP (x, 0);
6781 info->offset = NULL_RTX;
6782 return aarch64_base_register_rtx_p (info->base, strict_p);
6783
6784 case POST_MODIFY:
6785 case PRE_MODIFY:
6786 info->type = ADDRESS_REG_WB;
6787 info->base = XEXP (x, 0);
6788 if (GET_CODE (XEXP (x, 1)) == PLUS
6789 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6790 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6791 && aarch64_base_register_rtx_p (info->base, strict_p))
6792 {
6793 info->offset = XEXP (XEXP (x, 1), 1);
6794 info->const_offset = offset;
6795
6796 /* TImode and TFmode values are allowed in both pairs of X
6797 registers and individual Q registers. The available
6798 address modes are:
6799 X,X: 7-bit signed scaled offset
6800 Q: 9-bit signed offset
6801 We conservatively require an offset representable in either mode.
6802 */
6803 if (mode == TImode || mode == TFmode)
6804 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6805 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6806
6807 if (load_store_pair_p)
6808 return ((known_eq (GET_MODE_SIZE (mode), 4)
6809 || known_eq (GET_MODE_SIZE (mode), 8)
6810 || known_eq (GET_MODE_SIZE (mode), 16))
6811 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6812 else
6813 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6814 }
6815 return false;
6816
6817 case CONST:
6818 case SYMBOL_REF:
6819 case LABEL_REF:
6820 /* load literal: pc-relative constant pool entry. Only supported
6821 for SI mode or larger. */
6822 info->type = ADDRESS_SYMBOLIC;
6823
6824 if (!load_store_pair_p
6825 && GET_MODE_SIZE (mode).is_constant (&const_size)
6826 && const_size >= 4)
6827 {
6828 rtx sym, addend;
6829
6830 split_const (x, &sym, &addend);
6831 return ((GET_CODE (sym) == LABEL_REF
6832 || (GET_CODE (sym) == SYMBOL_REF
6833 && CONSTANT_POOL_ADDRESS_P (sym)
6834 && aarch64_pcrelative_literal_loads)));
6835 }
6836 return false;
6837
6838 case LO_SUM:
6839 info->type = ADDRESS_LO_SUM;
6840 info->base = XEXP (x, 0);
6841 info->offset = XEXP (x, 1);
6842 if (allow_reg_index_p
6843 && aarch64_base_register_rtx_p (info->base, strict_p))
6844 {
6845 rtx sym, offs;
6846 split_const (info->offset, &sym, &offs);
6847 if (GET_CODE (sym) == SYMBOL_REF
6848 && (aarch64_classify_symbol (sym, INTVAL (offs))
6849 == SYMBOL_SMALL_ABSOLUTE))
6850 {
6851 /* The symbol and offset must be aligned to the access size. */
6852 unsigned int align;
6853
6854 if (CONSTANT_POOL_ADDRESS_P (sym))
6855 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6856 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6857 {
6858 tree exp = SYMBOL_REF_DECL (sym);
6859 align = TYPE_ALIGN (TREE_TYPE (exp));
6860 align = aarch64_constant_alignment (exp, align);
6861 }
6862 else if (SYMBOL_REF_DECL (sym))
6863 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6864 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6865 && SYMBOL_REF_BLOCK (sym) != NULL)
6866 align = SYMBOL_REF_BLOCK (sym)->alignment;
6867 else
6868 align = BITS_PER_UNIT;
6869
6870 poly_int64 ref_size = GET_MODE_SIZE (mode);
6871 if (known_eq (ref_size, 0))
6872 ref_size = GET_MODE_SIZE (DImode);
6873
6874 return (multiple_p (INTVAL (offs), ref_size)
6875 && multiple_p (align / BITS_PER_UNIT, ref_size));
6876 }
6877 }
6878 return false;
6879
6880 default:
6881 return false;
6882 }
6883 }
6884
6885 /* Return true if the address X is valid for a PRFM instruction.
6886 STRICT_P is true if we should do strict checking with
6887 aarch64_classify_address. */
6888
6889 bool
6890 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6891 {
6892 struct aarch64_address_info addr;
6893
6894 /* PRFM accepts the same addresses as DImode... */
6895 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6896 if (!res)
6897 return false;
6898
6899 /* ... except writeback forms. */
6900 return addr.type != ADDRESS_REG_WB;
6901 }
6902
6903 bool
6904 aarch64_symbolic_address_p (rtx x)
6905 {
6906 rtx offset;
6907
6908 split_const (x, &x, &offset);
6909 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6910 }
6911
6912 /* Classify the base of symbolic expression X. */
6913
6914 enum aarch64_symbol_type
6915 aarch64_classify_symbolic_expression (rtx x)
6916 {
6917 rtx offset;
6918
6919 split_const (x, &x, &offset);
6920 return aarch64_classify_symbol (x, INTVAL (offset));
6921 }
6922
6923
6924 /* Return TRUE if X is a legitimate address for accessing memory in
6925 mode MODE. */
6926 static bool
6927 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6928 {
6929 struct aarch64_address_info addr;
6930
6931 return aarch64_classify_address (&addr, x, mode, strict_p);
6932 }
6933
6934 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6935 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6936 bool
6937 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6938 aarch64_addr_query_type type)
6939 {
6940 struct aarch64_address_info addr;
6941
6942 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6943 }
6944
6945 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6946
6947 static bool
6948 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6949 poly_int64 orig_offset,
6950 machine_mode mode)
6951 {
6952 HOST_WIDE_INT size;
6953 if (GET_MODE_SIZE (mode).is_constant (&size))
6954 {
6955 HOST_WIDE_INT const_offset, second_offset;
6956
6957 /* A general SVE offset is A * VQ + B. Remove the A component from
6958 coefficient 0 in order to get the constant B. */
6959 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6960
6961 /* Split an out-of-range address displacement into a base and
6962 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
6963 range otherwise, to increase opportunities for sharing the base
6964 address between accesses of different sizes. Unaligned accesses use
6965 the signed 9-bit range; TImode/TFmode use the intersection of the
6966 signed scaled 7-bit and signed 9-bit offset ranges. */
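/* Each mask below wraps CONST_OFFSET into a signed window around zero:
e.g. ((const_offset + 0x100) & 0x1ff) - 0x100 maps const_offset to the
value in [-256, 255] that is congruent to it mod 512, so for
const_offset == 300 the split is 512 + (-212). The TImode/TFmode form
additionally clears the low three bits, keeping the result a multiple
of 8. */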
6967 if (mode == TImode || mode == TFmode)
6968 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6969 else if ((const_offset & (size - 1)) != 0)
6970 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6971 else
6972 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6973
6974 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6975 return false;
6976
6977 /* Split the offset into second_offset and the rest. */
6978 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6979 *offset2 = gen_int_mode (second_offset, Pmode);
6980 return true;
6981 }
6982 else
6983 {
6984 /* Get the mode we should use as the basis of the range. For structure
6985 modes this is the mode of one vector. */
6986 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6987 machine_mode step_mode
6988 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6989
6990 /* Get the "mul vl" multiplier we'd like to use. */
6991 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6992 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6993 if (vec_flags & VEC_SVE_DATA)
6994 /* LDR supports a 9-bit range, but the move patterns for
6995 structure modes require all vectors to be in range of the
6996 same base. The simplest way of accommodating that while still
6997 promoting reuse of anchor points between different modes is
6998 to use an 8-bit range unconditionally. */
6999 vnum = ((vnum + 128) & 255) - 128;
7000 else
7001 /* Predicates are only handled singly, so we might as well use
7002 the full range. */
7003 vnum = ((vnum + 256) & 511) - 256;
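/* E.g. an SVE data-mode VNUM of 200 becomes ((200 + 128) & 255) - 128 == -56,
so the anchor absorbs a multiple of 256 vectors (200 - (-56)). */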
7004 if (vnum == 0)
7005 return false;
7006
7007 /* Convert the "mul vl" multiplier into a byte offset. */
7008 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7009 if (known_eq (second_offset, orig_offset))
7010 return false;
7011
7012 /* Split the offset into second_offset and the rest. */
7013 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7014 *offset2 = gen_int_mode (second_offset, Pmode);
7015 return true;
7016 }
7017 }
7018
7019 /* Return the binary representation of floating point constant VALUE in INTVAL.
7020 If the value cannot be converted, return false without setting INTVAL.
7021 The conversion is done in the mode of VALUE. */
7022 bool
7023 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7024 {
7025
7026 /* We make a general exception for 0. */
7027 if (aarch64_float_const_zero_rtx_p (value))
7028 {
7029 *intval = 0;
7030 return true;
7031 }
7032
7033 scalar_float_mode mode;
7034 if (GET_CODE (value) != CONST_DOUBLE
7035 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7036 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7037 /* Only support up to DF mode. */
7038 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7039 return false;
7040
7041 unsigned HOST_WIDE_INT ival = 0;
7042
7043 long res[2];
7044 real_to_target (res,
7045 CONST_DOUBLE_REAL_VALUE (value),
7046 REAL_MODE_FORMAT (mode));
7047
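/* real_to_target returns the image as 32-bit chunks; reassemble them
into a single HOST_WIDE_INT below (e.g. DFmode 1.0 becomes
0x3ff0000000000000). */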
7048 if (mode == DFmode)
7049 {
7050 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7051 ival = zext_hwi (res[order], 32);
7052 ival |= (zext_hwi (res[1 - order], 32) << 32);
7053 }
7054 else
7055 ival = zext_hwi (res[0], 32);
7056
7057 *intval = ival;
7058 return true;
7059 }
7060
7061 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7062 single MOV(+MOVK) followed by an FMOV. */
7063 bool
7064 aarch64_float_const_rtx_p (rtx x)
7065 {
7066 machine_mode mode = GET_MODE (x);
7067 if (mode == VOIDmode)
7068 return false;
7069
7070 /* Determine whether it's cheaper to write float constants as
7071 mov/movk pairs rather than as ldr/adrp pairs. */
7072 unsigned HOST_WIDE_INT ival;
7073
7074 if (GET_CODE (x) == CONST_DOUBLE
7075 && SCALAR_FLOAT_MODE_P (mode)
7076 && aarch64_reinterpret_float_as_int (x, &ival))
7077 {
7078 scalar_int_mode imode = (mode == HFmode
7079 ? SImode
7080 : int_mode_for_mode (mode).require ());
7081 int num_instr = aarch64_internal_mov_immediate
7082 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
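/* Accept the constant when the integer move needs at most a MOV plus
one MOVK (i.e. fewer than three instructions); the FMOV itself is
not counted here. */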
7083 return num_instr < 3;
7084 }
7085
7086 return false;
7087 }
7088
7089 /* Return TRUE if rtx X is the immediate constant 0.0. */
7090 bool
7091 aarch64_float_const_zero_rtx_p (rtx x)
7092 {
7093 if (GET_MODE (x) == VOIDmode)
7094 return false;
7095
7096 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7097 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7098 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7099 }
7100
7101 /* Return TRUE if rtx X is an immediate constant that fits in a single
7102 MOVI immediate operation. */
7103 bool
7104 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7105 {
7106 if (!TARGET_SIMD)
7107 return false;
7108
7109 machine_mode vmode;
7110 scalar_int_mode imode;
7111 unsigned HOST_WIDE_INT ival;
7112
7113 if (GET_CODE (x) == CONST_DOUBLE
7114 && SCALAR_FLOAT_MODE_P (mode))
7115 {
7116 if (!aarch64_reinterpret_float_as_int (x, &ival))
7117 return false;
7118
7119 /* We make a general exception for 0. */
7120 if (aarch64_float_const_zero_rtx_p (x))
7121 return true;
7122
7123 imode = int_mode_for_mode (mode).require ();
7124 }
7125 else if (GET_CODE (x) == CONST_INT
7126 && is_a <scalar_int_mode> (mode, &imode))
7127 ival = INTVAL (x);
7128 else
7129 return false;
7130
7131 /* Use a 64-bit vector container for everything except DI/DF mode, where
7132 we use a 128-bit vector mode. */
7133 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7134
7135 vmode = aarch64_simd_container_mode (imode, width);
7136 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7137
7138 return aarch64_simd_valid_immediate (v_op, NULL);
7139 }
7140
7141
7142 /* Return the fixed registers used for condition codes. */
7143
7144 static bool
7145 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7146 {
7147 *p1 = CC_REGNUM;
7148 *p2 = INVALID_REGNUM;
7149 return true;
7150 }
7151
7152 /* This function is used by the call expanders of the machine description.
7153 RESULT is the register in which the result is returned. It's NULL for
7154 "call" and "sibcall".
7155 MEM is the location of the function call.
7156 SIBCALL indicates whether this function call is a normal call or a
7157 sibling call; we generate a different pattern accordingly. */
7158
7159 void
7160 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7161 {
7162 rtx call, callee, tmp;
7163 rtvec vec;
7164 machine_mode mode;
7165
7166 gcc_assert (MEM_P (mem));
7167 callee = XEXP (mem, 0);
7168 mode = GET_MODE (callee);
7169 gcc_assert (mode == Pmode);
7170
7171 /* Decide if we should generate indirect calls by loading the
7172 address of the callee into a register before performing
7173 the branch-and-link. */
7174 if (SYMBOL_REF_P (callee)
7175 ? (aarch64_is_long_call_p (callee)
7176 || aarch64_is_noplt_call_p (callee))
7177 : !REG_P (callee))
7178 XEXP (mem, 0) = force_reg (mode, callee);
7179
7180 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7181
7182 if (result != NULL_RTX)
7183 call = gen_rtx_SET (result, call);
7184
7185 if (sibcall)
7186 tmp = ret_rtx;
7187 else
7188 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7189
7190 vec = gen_rtvec (2, call, tmp);
7191 call = gen_rtx_PARALLEL (VOIDmode, vec);
7192
7193 aarch64_emit_call_insn (call);
7194 }
7195
7196 /* Emit call insn with PAT and do aarch64-specific handling. */
7197
7198 void
7199 aarch64_emit_call_insn (rtx pat)
7200 {
7201 rtx insn = emit_call_insn (pat);
7202
7203 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7204 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7205 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7206 }
7207
7208 machine_mode
7209 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7210 {
7211 machine_mode mode_x = GET_MODE (x);
7212 rtx_code code_x = GET_CODE (x);
7213
7214 /* All floating point compares return CCFP if it is an equality
7215 comparison, and CCFPE otherwise. */
7216 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7217 {
7218 switch (code)
7219 {
7220 case EQ:
7221 case NE:
7222 case UNORDERED:
7223 case ORDERED:
7224 case UNLT:
7225 case UNLE:
7226 case UNGT:
7227 case UNGE:
7228 case UNEQ:
7229 return CCFPmode;
7230
7231 case LT:
7232 case LE:
7233 case GT:
7234 case GE:
7235 case LTGT:
7236 return CCFPEmode;
7237
7238 default:
7239 gcc_unreachable ();
7240 }
7241 }
7242
7243 /* Equality comparisons of short modes against zero can be performed
7244 using the TST instruction with the appropriate bitmask. */
7245 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7246 && (code == EQ || code == NE)
7247 && (mode_x == HImode || mode_x == QImode))
7248 return CC_NZmode;
7249
7250 /* Similarly, comparisons of zero_extends from shorter modes can
7251 be performed using an ANDS with an immediate mask. */
7252 if (y == const0_rtx && code_x == ZERO_EXTEND
7253 && (mode_x == SImode || mode_x == DImode)
7254 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7255 && (code == EQ || code == NE))
7256 return CC_NZmode;
7257
7258 if ((mode_x == SImode || mode_x == DImode)
7259 && y == const0_rtx
7260 && (code == EQ || code == NE || code == LT || code == GE)
7261 && (code_x == PLUS || code_x == MINUS || code_x == AND
7262 || code_x == NEG
7263 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7264 && CONST_INT_P (XEXP (x, 2)))))
7265 return CC_NZmode;
7266
7267 /* A compare with a shifted operand. Because of canonicalization,
7268 the comparison will have to be swapped when we emit the assembly
7269 code. */
7270 if ((mode_x == SImode || mode_x == DImode)
7271 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7272 && (code_x == ASHIFT || code_x == ASHIFTRT
7273 || code_x == LSHIFTRT
7274 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7275 return CC_SWPmode;
7276
7277 /* Similarly for a negated operand, but we can only do this for
7278 equalities. */
7279 if ((mode_x == SImode || mode_x == DImode)
7280 && (REG_P (y) || GET_CODE (y) == SUBREG)
7281 && (code == EQ || code == NE)
7282 && code_x == NEG)
7283 return CC_Zmode;
7284
7285 /* A test for unsigned overflow from an addition. */
7286 if ((mode_x == DImode || mode_x == TImode)
7287 && (code == LTU || code == GEU)
7288 && code_x == PLUS
7289 && rtx_equal_p (XEXP (x, 0), y))
7290 return CC_Cmode;
7291
7292 /* A test for unsigned overflow from an add with carry. */
7293 if ((mode_x == DImode || mode_x == TImode)
7294 && (code == LTU || code == GEU)
7295 && code_x == PLUS
7296 && CONST_SCALAR_INT_P (y)
7297 && (rtx_mode_t (y, mode_x)
7298 == (wi::shwi (1, mode_x)
7299 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7300 return CC_ADCmode;
7301
7302 /* A test for signed overflow. */
7303 if ((mode_x == DImode || mode_x == TImode)
7304 && code == NE
7305 && code_x == PLUS
7306 && GET_CODE (y) == SIGN_EXTEND)
7307 return CC_Vmode;
7308
7309 /* For everything else, return CCmode. */
7310 return CCmode;
7311 }
7312
7313 static int
7314 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7315
7316 int
7317 aarch64_get_condition_code (rtx x)
7318 {
7319 machine_mode mode = GET_MODE (XEXP (x, 0));
7320 enum rtx_code comp_code = GET_CODE (x);
7321
7322 if (GET_MODE_CLASS (mode) != MODE_CC)
7323 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7324 return aarch64_get_condition_code_1 (mode, comp_code);
7325 }
7326
7327 static int
7328 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7329 {
7330 switch (mode)
7331 {
7332 case E_CCFPmode:
7333 case E_CCFPEmode:
7334 switch (comp_code)
7335 {
7336 case GE: return AARCH64_GE;
7337 case GT: return AARCH64_GT;
7338 case LE: return AARCH64_LS;
7339 case LT: return AARCH64_MI;
7340 case NE: return AARCH64_NE;
7341 case EQ: return AARCH64_EQ;
7342 case ORDERED: return AARCH64_VC;
7343 case UNORDERED: return AARCH64_VS;
7344 case UNLT: return AARCH64_LT;
7345 case UNLE: return AARCH64_LE;
7346 case UNGT: return AARCH64_HI;
7347 case UNGE: return AARCH64_PL;
7348 default: return -1;
7349 }
7350 break;
7351
7352 case E_CCmode:
7353 switch (comp_code)
7354 {
7355 case NE: return AARCH64_NE;
7356 case EQ: return AARCH64_EQ;
7357 case GE: return AARCH64_GE;
7358 case GT: return AARCH64_GT;
7359 case LE: return AARCH64_LE;
7360 case LT: return AARCH64_LT;
7361 case GEU: return AARCH64_CS;
7362 case GTU: return AARCH64_HI;
7363 case LEU: return AARCH64_LS;
7364 case LTU: return AARCH64_CC;
7365 default: return -1;
7366 }
7367 break;
7368
7369 case E_CC_SWPmode:
7370 switch (comp_code)
7371 {
7372 case NE: return AARCH64_NE;
7373 case EQ: return AARCH64_EQ;
7374 case GE: return AARCH64_LE;
7375 case GT: return AARCH64_LT;
7376 case LE: return AARCH64_GE;
7377 case LT: return AARCH64_GT;
7378 case GEU: return AARCH64_LS;
7379 case GTU: return AARCH64_CC;
7380 case LEU: return AARCH64_CS;
7381 case LTU: return AARCH64_HI;
7382 default: return -1;
7383 }
7384 break;
7385
7386 case E_CC_NZmode:
7387 switch (comp_code)
7388 {
7389 case NE: return AARCH64_NE;
7390 case EQ: return AARCH64_EQ;
7391 case GE: return AARCH64_PL;
7392 case LT: return AARCH64_MI;
7393 default: return -1;
7394 }
7395 break;
7396
7397 case E_CC_Zmode:
7398 switch (comp_code)
7399 {
7400 case NE: return AARCH64_NE;
7401 case EQ: return AARCH64_EQ;
7402 default: return -1;
7403 }
7404 break;
7405
7406 case E_CC_Cmode:
7407 switch (comp_code)
7408 {
7409 case LTU: return AARCH64_CS;
7410 case GEU: return AARCH64_CC;
7411 default: return -1;
7412 }
7413 break;
7414
7415 case E_CC_ADCmode:
7416 switch (comp_code)
7417 {
7418 case GEU: return AARCH64_CS;
7419 case LTU: return AARCH64_CC;
7420 default: return -1;
7421 }
7422 break;
7423
7424 case E_CC_Vmode:
7425 switch (comp_code)
7426 {
7427 case NE: return AARCH64_VS;
7428 case EQ: return AARCH64_VC;
7429 default: return -1;
7430 }
7431 break;
7432
7433 default:
7434 return -1;
7435 }
7436
7437 return -1;
7438 }
7439
7440 bool
7441 aarch64_const_vec_all_same_in_range_p (rtx x,
7442 HOST_WIDE_INT minval,
7443 HOST_WIDE_INT maxval)
7444 {
7445 rtx elt;
7446 return (const_vec_duplicate_p (x, &elt)
7447 && CONST_INT_P (elt)
7448 && IN_RANGE (INTVAL (elt), minval, maxval));
7449 }
7450
7451 bool
7452 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7453 {
7454 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7455 }
7456
7457 /* Return true if VEC is a constant in which every element is in the range
7458 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7459
7460 static bool
7461 aarch64_const_vec_all_in_range_p (rtx vec,
7462 HOST_WIDE_INT minval,
7463 HOST_WIDE_INT maxval)
7464 {
7465 if (GET_CODE (vec) != CONST_VECTOR
7466 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7467 return false;
7468
7469 int nunits;
7470 if (!CONST_VECTOR_STEPPED_P (vec))
7471 nunits = const_vector_encoded_nelts (vec);
7472 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7473 return false;
7474
7475 for (int i = 0; i < nunits; i++)
7476 {
7477 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7478 if (!CONST_INT_P (vec_elem)
7479 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7480 return false;
7481 }
7482 return true;
7483 }
7484
7485 /* N Z C V. */
7486 #define AARCH64_CC_V 1
7487 #define AARCH64_CC_C (1 << 1)
7488 #define AARCH64_CC_Z (1 << 2)
7489 #define AARCH64_CC_N (1 << 3)
7490
7491 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7492 static const int aarch64_nzcv_codes[] =
7493 {
7494 0, /* EQ, Z == 1. */
7495 AARCH64_CC_Z, /* NE, Z == 0. */
7496 0, /* CS, C == 1. */
7497 AARCH64_CC_C, /* CC, C == 0. */
7498 0, /* MI, N == 1. */
7499 AARCH64_CC_N, /* PL, N == 0. */
7500 0, /* VS, V == 1. */
7501 AARCH64_CC_V, /* VC, V == 0. */
7502 0, /* HI, C == 1 && Z == 0. */
7503 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7504 AARCH64_CC_V, /* GE, N == V. */
7505 0, /* LT, N != V. */
7506 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7507 0, /* LE, !(Z == 0 && N == V). */
7508 0, /* AL, Any. */
7509 0 /* NV, Any. */
7510 };
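/* Apart from the AL/NV entries, each value above gives an NZCV setting
under which the commented condition is false (e.g. EQ holds when
Z == 1, and its entry leaves Z clear); this is the immediate printed
by the '%k' operand code for conditional compare instructions. */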
7511
7512 /* Print floating-point vector immediate operand X to F, negating it
7513 first if NEGATE is true. Return true on success, false if it isn't
7514 a constant we can handle. */
7515
7516 static bool
7517 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7518 {
7519 rtx elt;
7520
7521 if (!const_vec_duplicate_p (x, &elt))
7522 return false;
7523
7524 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7525 if (negate)
7526 r = real_value_negate (&r);
7527
7528 /* We only handle the SVE single-bit immediates here. */
7529 if (real_equal (&r, &dconst0))
7530 asm_fprintf (f, "0.0");
7531 else if (real_equal (&r, &dconst1))
7532 asm_fprintf (f, "1.0");
7533 else if (real_equal (&r, &dconsthalf))
7534 asm_fprintf (f, "0.5");
7535 else
7536 return false;
7537
7538 return true;
7539 }
7540
7541 /* Return the b/h/s/d register-suffix letter for an element of SIZE bits. */
7542 static char
7543 sizetochar (int size)
7544 {
7545 switch (size)
7546 {
7547 case 64: return 'd';
7548 case 32: return 's';
7549 case 16: return 'h';
7550 case 8 : return 'b';
7551 default: gcc_unreachable ();
7552 }
7553 }
7554
7555 /* Print operand X to file F in a target specific manner according to CODE.
7556 The acceptable formatting commands given by CODE are:
7557 'c': An integer or symbol address without a preceding #
7558 sign.
7559 'C': Take the duplicated element in a vector constant
7560 and print it in hex.
7561 'D': Take the duplicated element in a vector constant
7562 and print it as an unsigned integer, in decimal.
7563 'e': Print the sign/zero-extend size as a character 8->b,
7564 16->h, 32->w.
7565 'p': Prints N such that 2^N == X (X must be a power of 2 and
7566 a const int).
7567 'P': Print the number of non-zero bits in X (a const_int).
7568 'H': Print the higher numbered register of a pair (TImode)
7569 of regs.
7570 'm': Print a condition (eq, ne, etc).
7571 'M': Same as 'm', but invert condition.
7572 'N': Take the duplicated element in a vector constant
7573 and print the negative of it in decimal.
7574 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7575 'S/T/U/V': Print a FP/SIMD register name for a register list.
7576 The register printed is the FP/SIMD register name
7577 of X + 0/1/2/3 for S/T/U/V.
7578 'R': Print a scalar FP/SIMD register name + 1.
7579 'X': Print bottom 16 bits of integer constant in hex.
7580 'w/x': Print a general register name or the zero register
7581 (32-bit or 64-bit).
7582 '0': Print a normal operand; if it's a general register,
7583 we assume DImode.
7584 'k': Print NZCV for conditional compare instructions.
7585 'A': Output address constant representing the first
7586 argument of X, specifying a relocation offset
7587 if appropriate.
7588 'L': Output constant address specified by X
7589 with a relocation offset if appropriate.
7590 'G': Prints address of X, specifying a PC relative
7591 relocation mode if appropriate.
7592 'y': Output address of LDP or STP - this is used for
7593 some LDP/STPs which don't use a PARALLEL in their
7594 pattern (so the mode needs to be adjusted).
7595 'z': Output address of a typical LDP or STP. */
7596
7597 static void
7598 aarch64_print_operand (FILE *f, rtx x, int code)
7599 {
7600 rtx elt;
7601 switch (code)
7602 {
7603 case 'c':
7604 switch (GET_CODE (x))
7605 {
7606 case CONST_INT:
7607 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7608 break;
7609
7610 case SYMBOL_REF:
7611 output_addr_const (f, x);
7612 break;
7613
7614 case CONST:
7615 if (GET_CODE (XEXP (x, 0)) == PLUS
7616 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7617 {
7618 output_addr_const (f, x);
7619 break;
7620 }
7621 /* Fall through. */
7622
7623 default:
7624 output_operand_lossage ("unsupported operand for code '%c'", code);
7625 }
7626 break;
7627
7628 case 'e':
7629 {
7630 int n;
7631
7632 if (!CONST_INT_P (x)
7633 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7634 {
7635 output_operand_lossage ("invalid operand for '%%%c'", code);
7636 return;
7637 }
7638
7639 switch (n)
7640 {
7641 case 3:
7642 fputc ('b', f);
7643 break;
7644 case 4:
7645 fputc ('h', f);
7646 break;
7647 case 5:
7648 fputc ('w', f);
7649 break;
7650 default:
7651 output_operand_lossage ("invalid operand for '%%%c'", code);
7652 return;
7653 }
7654 }
7655 break;
7656
7657 case 'p':
7658 {
7659 int n;
7660
7661 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7662 {
7663 output_operand_lossage ("invalid operand for '%%%c'", code);
7664 return;
7665 }
7666
7667 asm_fprintf (f, "%d", n);
7668 }
7669 break;
7670
7671 case 'P':
7672 if (!CONST_INT_P (x))
7673 {
7674 output_operand_lossage ("invalid operand for '%%%c'", code);
7675 return;
7676 }
7677
7678 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7679 break;
7680
7681 case 'H':
7682 if (x == const0_rtx)
7683 {
7684 asm_fprintf (f, "xzr");
7685 break;
7686 }
7687
7688 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7689 {
7690 output_operand_lossage ("invalid operand for '%%%c'", code);
7691 return;
7692 }
7693
7694 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7695 break;
7696
7697 case 'M':
7698 case 'm':
7699 {
7700 int cond_code;
7701 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7702 if (x == const_true_rtx)
7703 {
7704 if (code == 'M')
7705 fputs ("nv", f);
7706 return;
7707 }
7708
7709 if (!COMPARISON_P (x))
7710 {
7711 output_operand_lossage ("invalid operand for '%%%c'", code);
7712 return;
7713 }
7714
7715 cond_code = aarch64_get_condition_code (x);
7716 gcc_assert (cond_code >= 0);
7717 if (code == 'M')
7718 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7719 fputs (aarch64_condition_codes[cond_code], f);
7720 }
7721 break;
7722
7723 case 'N':
7724 if (!const_vec_duplicate_p (x, &elt))
7725 {
7726 output_operand_lossage ("invalid vector constant");
7727 return;
7728 }
7729
7730 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7731 asm_fprintf (f, "%wd", -INTVAL (elt));
7732 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7733 && aarch64_print_vector_float_operand (f, x, true))
7734 ;
7735 else
7736 {
7737 output_operand_lossage ("invalid vector constant");
7738 return;
7739 }
7740 break;
7741
7742 case 'b':
7743 case 'h':
7744 case 's':
7745 case 'd':
7746 case 'q':
7747 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7748 {
7749 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7750 return;
7751 }
7752 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7753 break;
7754
7755 case 'S':
7756 case 'T':
7757 case 'U':
7758 case 'V':
7759 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7760 {
7761 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7762 return;
7763 }
7764 asm_fprintf (f, "%c%d",
7765 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7766 REGNO (x) - V0_REGNUM + (code - 'S'));
7767 break;
7768
7769 case 'R':
7770 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7771 {
7772 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7773 return;
7774 }
7775 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7776 break;
7777
7778 case 'X':
7779 if (!CONST_INT_P (x))
7780 {
7781 output_operand_lossage ("invalid operand for '%%%c'", code);
7782 return;
7783 }
7784 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7785 break;
7786
7787 case 'C':
7788 {
7789 /* Print a replicated constant in hex. */
7790 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7791 {
7792 output_operand_lossage ("invalid operand for '%%%c'", code);
7793 return;
7794 }
7795 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7796 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7797 }
7798 break;
7799
7800 case 'D':
7801 {
7802 /* Print a replicated constant in decimal, treating it as
7803 unsigned. */
7804 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7805 {
7806 output_operand_lossage ("invalid operand for '%%%c'", code);
7807 return;
7808 }
7809 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7810 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7811 }
7812 break;
7813
7814 case 'w':
7815 case 'x':
7816 if (x == const0_rtx
7817 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7818 {
7819 asm_fprintf (f, "%czr", code);
7820 break;
7821 }
7822
7823 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7824 {
7825 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7826 break;
7827 }
7828
7829 if (REG_P (x) && REGNO (x) == SP_REGNUM)
7830 {
7831 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7832 break;
7833 }
7834
7835 /* Fall through */
7836
7837 case 0:
7838 if (x == NULL)
7839 {
7840 output_operand_lossage ("missing operand");
7841 return;
7842 }
7843
7844 switch (GET_CODE (x))
7845 {
7846 case REG:
7847 if (aarch64_sve_data_mode_p (GET_MODE (x)))
7848 {
7849 if (REG_NREGS (x) == 1)
7850 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7851 else
7852 {
7853 char suffix
7854 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7855 asm_fprintf (f, "{z%d.%c - z%d.%c}",
7856 REGNO (x) - V0_REGNUM, suffix,
7857 END_REGNO (x) - V0_REGNUM - 1, suffix);
7858 }
7859 }
7860 else
7861 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7862 break;
7863
7864 case MEM:
7865 output_address (GET_MODE (x), XEXP (x, 0));
7866 break;
7867
7868 case LABEL_REF:
7869 case SYMBOL_REF:
7870 output_addr_const (asm_out_file, x);
7871 break;
7872
7873 case CONST_INT:
7874 asm_fprintf (f, "%wd", INTVAL (x));
7875 break;
7876
7877 case CONST:
7878 if (!VECTOR_MODE_P (GET_MODE (x)))
7879 {
7880 output_addr_const (asm_out_file, x);
7881 break;
7882 }
7883 /* fall through */
7884
7885 case CONST_VECTOR:
7886 if (!const_vec_duplicate_p (x, &elt))
7887 {
7888 output_operand_lossage ("invalid vector constant");
7889 return;
7890 }
7891
7892 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7893 asm_fprintf (f, "%wd", INTVAL (elt));
7894 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7895 && aarch64_print_vector_float_operand (f, x, false))
7896 ;
7897 else
7898 {
7899 output_operand_lossage ("invalid vector constant");
7900 return;
7901 }
7902 break;
7903
7904 case CONST_DOUBLE:
7905 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7906 be getting CONST_DOUBLEs holding integers. */
7907 gcc_assert (GET_MODE (x) != VOIDmode);
7908 if (aarch64_float_const_zero_rtx_p (x))
7909 {
7910 fputc ('0', f);
7911 break;
7912 }
7913 else if (aarch64_float_const_representable_p (x))
7914 {
7915 #define buf_size 20
7916 char float_buf[buf_size] = {'\0'};
7917 real_to_decimal_for_mode (float_buf,
7918 CONST_DOUBLE_REAL_VALUE (x),
7919 buf_size, buf_size,
7920 1, GET_MODE (x));
7921 asm_fprintf (asm_out_file, "%s", float_buf);
7922 break;
7923 #undef buf_size
7924 }
7925 output_operand_lossage ("invalid constant");
7926 return;
7927 default:
7928 output_operand_lossage ("invalid operand");
7929 return;
7930 }
7931 break;
7932
7933 case 'A':
7934 if (GET_CODE (x) == HIGH)
7935 x = XEXP (x, 0);
7936
7937 switch (aarch64_classify_symbolic_expression (x))
7938 {
7939 case SYMBOL_SMALL_GOT_4G:
7940 asm_fprintf (asm_out_file, ":got:");
7941 break;
7942
7943 case SYMBOL_SMALL_TLSGD:
7944 asm_fprintf (asm_out_file, ":tlsgd:");
7945 break;
7946
7947 case SYMBOL_SMALL_TLSDESC:
7948 asm_fprintf (asm_out_file, ":tlsdesc:");
7949 break;
7950
7951 case SYMBOL_SMALL_TLSIE:
7952 asm_fprintf (asm_out_file, ":gottprel:");
7953 break;
7954
7955 case SYMBOL_TLSLE24:
7956 asm_fprintf (asm_out_file, ":tprel:");
7957 break;
7958
7959 case SYMBOL_TINY_GOT:
7960 gcc_unreachable ();
7961 break;
7962
7963 default:
7964 break;
7965 }
7966 output_addr_const (asm_out_file, x);
7967 break;
7968
7969 case 'L':
7970 switch (aarch64_classify_symbolic_expression (x))
7971 {
7972 case SYMBOL_SMALL_GOT_4G:
7973 asm_fprintf (asm_out_file, ":lo12:");
7974 break;
7975
7976 case SYMBOL_SMALL_TLSGD:
7977 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7978 break;
7979
7980 case SYMBOL_SMALL_TLSDESC:
7981 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7982 break;
7983
7984 case SYMBOL_SMALL_TLSIE:
7985 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7986 break;
7987
7988 case SYMBOL_TLSLE12:
7989 asm_fprintf (asm_out_file, ":tprel_lo12:");
7990 break;
7991
7992 case SYMBOL_TLSLE24:
7993 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7994 break;
7995
7996 case SYMBOL_TINY_GOT:
7997 asm_fprintf (asm_out_file, ":got:");
7998 break;
7999
8000 case SYMBOL_TINY_TLSIE:
8001 asm_fprintf (asm_out_file, ":gottprel:");
8002 break;
8003
8004 default:
8005 break;
8006 }
8007 output_addr_const (asm_out_file, x);
8008 break;
8009
8010 case 'G':
8011 switch (aarch64_classify_symbolic_expression (x))
8012 {
8013 case SYMBOL_TLSLE24:
8014 asm_fprintf (asm_out_file, ":tprel_hi12:");
8015 break;
8016 default:
8017 break;
8018 }
8019 output_addr_const (asm_out_file, x);
8020 break;
8021
8022 case 'k':
8023 {
8024 HOST_WIDE_INT cond_code;
8025
8026 if (!CONST_INT_P (x))
8027 {
8028 output_operand_lossage ("invalid operand for '%%%c'", code);
8029 return;
8030 }
8031
8032 cond_code = INTVAL (x);
8033 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8034 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8035 }
8036 break;
8037
8038 case 'y':
8039 case 'z':
8040 {
8041 machine_mode mode = GET_MODE (x);
8042
8043 if (GET_CODE (x) != MEM
8044 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8045 {
8046 output_operand_lossage ("invalid operand for '%%%c'", code);
8047 return;
8048 }
8049
8050 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8051 code == 'y'
8052 ? ADDR_QUERY_LDP_STP_N
8053 : ADDR_QUERY_LDP_STP))
8054 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8055 }
8056 break;
8057
8058 default:
8059 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8060 return;
8061 }
8062 }
8063
8064 /* Print address 'x' of a memory access with mode 'mode'.
8065 'type' is the aarch64_addr_query_type context required by
8066 aarch64_classify_address (e.g. ADDR_QUERY_LDP_STP for an LDP/STP operand). */
8067 static bool
8068 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8069 aarch64_addr_query_type type)
8070 {
8071 struct aarch64_address_info addr;
8072 unsigned int size;
8073
8074 /* Check all addresses are Pmode - including ILP32. */
8075 if (GET_MODE (x) != Pmode
8076 && (!CONST_INT_P (x)
8077 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8078 {
8079 output_operand_lossage ("invalid address mode");
8080 return false;
8081 }
8082
8083 if (aarch64_classify_address (&addr, x, mode, true, type))
8084 switch (addr.type)
8085 {
8086 case ADDRESS_REG_IMM:
8087 if (known_eq (addr.const_offset, 0))
8088 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8089 else if (aarch64_sve_data_mode_p (mode))
8090 {
8091 HOST_WIDE_INT vnum
8092 = exact_div (addr.const_offset,
8093 BYTES_PER_SVE_VECTOR).to_constant ();
8094 asm_fprintf (f, "[%s, #%wd, mul vl]",
8095 reg_names[REGNO (addr.base)], vnum);
8096 }
8097 else if (aarch64_sve_pred_mode_p (mode))
8098 {
8099 HOST_WIDE_INT vnum
8100 = exact_div (addr.const_offset,
8101 BYTES_PER_SVE_PRED).to_constant ();
8102 asm_fprintf (f, "[%s, #%wd, mul vl]",
8103 reg_names[REGNO (addr.base)], vnum);
8104 }
8105 else
8106 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8107 INTVAL (addr.offset));
8108 return true;
8109
8110 case ADDRESS_REG_REG:
8111 if (addr.shift == 0)
8112 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8113 reg_names [REGNO (addr.offset)]);
8114 else
8115 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8116 reg_names [REGNO (addr.offset)], addr.shift);
8117 return true;
8118
8119 case ADDRESS_REG_UXTW:
8120 if (addr.shift == 0)
8121 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8122 REGNO (addr.offset) - R0_REGNUM);
8123 else
8124 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8125 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8126 return true;
8127
8128 case ADDRESS_REG_SXTW:
8129 if (addr.shift == 0)
8130 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8131 REGNO (addr.offset) - R0_REGNUM);
8132 else
8133 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8134 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8135 return true;
8136
8137 case ADDRESS_REG_WB:
8138 /* Writeback is only supported for fixed-width modes. */
8139 size = GET_MODE_SIZE (mode).to_constant ();
8140 switch (GET_CODE (x))
8141 {
8142 case PRE_INC:
8143 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8144 return true;
8145 case POST_INC:
8146 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8147 return true;
8148 case PRE_DEC:
8149 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8150 return true;
8151 case POST_DEC:
8152 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8153 return true;
8154 case PRE_MODIFY:
8155 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8156 INTVAL (addr.offset));
8157 return true;
8158 case POST_MODIFY:
8159 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8160 INTVAL (addr.offset));
8161 return true;
8162 default:
8163 break;
8164 }
8165 break;
8166
8167 case ADDRESS_LO_SUM:
8168 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8169 output_addr_const (f, addr.offset);
8170 asm_fprintf (f, "]");
8171 return true;
8172
8173 case ADDRESS_SYMBOLIC:
8174 output_addr_const (f, x);
8175 return true;
8176 }
8177
8178 return false;
8179 }
8180
8181 /* Print address 'x' of a memory access with mode 'mode'. */
8182 static void
8183 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8184 {
8185 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8186 output_addr_const (f, x);
8187 }
8188
8189 bool
8190 aarch64_label_mentioned_p (rtx x)
8191 {
8192 const char *fmt;
8193 int i;
8194
8195 if (GET_CODE (x) == LABEL_REF)
8196 return true;
8197
8198 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8199 referencing instruction, but they are constant offsets, not
8200 symbols. */
8201 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8202 return false;
8203
8204 fmt = GET_RTX_FORMAT (GET_CODE (x));
8205 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8206 {
8207 if (fmt[i] == 'E')
8208 {
8209 int j;
8210
8211 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8212 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8213 return 1;
8214 }
8215 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8216 return 1;
8217 }
8218
8219 return 0;
8220 }
8221
8222 /* Implement REGNO_REG_CLASS. */
8223
8224 enum reg_class
8225 aarch64_regno_regclass (unsigned regno)
8226 {
8227 if (GP_REGNUM_P (regno))
8228 return GENERAL_REGS;
8229
8230 if (regno == SP_REGNUM)
8231 return STACK_REG;
8232
8233 if (regno == FRAME_POINTER_REGNUM
8234 || regno == ARG_POINTER_REGNUM)
8235 return POINTER_REGS;
8236
8237 if (FP_REGNUM_P (regno))
8238 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
8239
8240 if (PR_REGNUM_P (regno))
8241 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8242
8243 return NO_REGS;
8244 }
8245
8246 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8247 If OFFSET is out of range, return an offset of an anchor point
8248 that is in range. Return 0 otherwise. */
8249
8250 static HOST_WIDE_INT
8251 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8252 machine_mode mode)
8253 {
8254 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8255 if (size > 16)
8256 return (offset + 0x400) & ~0x7f0;
8257
8258 /* For offsets that aren't a multiple of the access size, the limit is
8259 -256...255. */
8260 if (offset & (size - 1))
8261 {
8262 /* BLKmode typically uses LDP of X-registers. */
8263 if (mode == BLKmode)
8264 return (offset + 512) & ~0x3ff;
8265 return (offset + 0x100) & ~0x1ff;
8266 }
8267
8268 /* Small negative offsets are supported. */
8269 if (IN_RANGE (offset, -256, 0))
8270 return 0;
8271
8272 if (mode == TImode || mode == TFmode)
8273 return (offset + 0x100) & ~0x1ff;
8274
8275 /* Use an unsigned 12-bit offset scaled by the access size. */
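/* For example, a 4-byte access at offset 0x12344 is anchored at 0x10000,
leaving a residual offset of 0x2344, which fits the scaled unsigned
12-bit immediate (maximum 0xfff * 4 == 0x3ffc). */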
8276 return offset & (~0xfff * size);
8277 }
8278
8279 static rtx
8280 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8281 {
8282 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8283 where mask is selected by alignment and size of the offset.
8284 We try to pick as large a range for the offset as possible to
8285 maximize the chance of a CSE. However, for aligned addresses
8286 we limit the range to 4k so that structures with different sized
8287 elements are likely to use the same base. We need to be careful
8288 not to split a CONST for some forms of address expression, otherwise
8289 it will generate sub-optimal code. */
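/* For example, with a 4-byte access the address X + 0x13204 is rewritten
as (X + 0x10000) + 0x3204, so that neighbouring accesses can share the
X + 0x10000 anchor. */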
8290
8291 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8292 {
8293 rtx base = XEXP (x, 0);
8294 rtx offset_rtx = XEXP (x, 1);
8295 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8296
8297 if (GET_CODE (base) == PLUS)
8298 {
8299 rtx op0 = XEXP (base, 0);
8300 rtx op1 = XEXP (base, 1);
8301
8302 /* Force any scaling into a temp for CSE. */
8303 op0 = force_reg (Pmode, op0);
8304 op1 = force_reg (Pmode, op1);
8305
8306 /* Let the pointer register be in op0. */
8307 if (REG_POINTER (op1))
8308 std::swap (op0, op1);
8309
8310 /* If the pointer is virtual or frame related, then we know that
8311 virtual register instantiation or register elimination is going
8312 to apply a second constant. We want the two constants folded
8313 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8314 if (virt_or_elim_regno_p (REGNO (op0)))
8315 {
8316 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8317 NULL_RTX, true, OPTAB_DIRECT);
8318 return gen_rtx_PLUS (Pmode, base, op1);
8319 }
8320
8321 /* Otherwise, in order to encourage CSE (and thence loop strength
8322 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8323 base = expand_binop (Pmode, add_optab, op0, op1,
8324 NULL_RTX, true, OPTAB_DIRECT);
8325 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8326 }
8327
8328 HOST_WIDE_INT size;
8329 if (GET_MODE_SIZE (mode).is_constant (&size))
8330 {
8331 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8332 mode);
8333 if (base_offset != 0)
8334 {
8335 base = plus_constant (Pmode, base, base_offset);
8336 base = force_operand (base, NULL_RTX);
8337 return plus_constant (Pmode, base, offset - base_offset);
8338 }
8339 }
8340 }
8341
8342 return x;
8343 }
8344
8345 static reg_class_t
8346 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8347 reg_class_t rclass,
8348 machine_mode mode,
8349 secondary_reload_info *sri)
8350 {
8351 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8352 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8353 comment at the head of aarch64-sve.md for more details about the
8354 big-endian handling. */
8355 if (BYTES_BIG_ENDIAN
8356 && reg_class_subset_p (rclass, FP_REGS)
8357 && !((REG_P (x) && HARD_REGISTER_P (x))
8358 || aarch64_simd_valid_immediate (x, NULL))
8359 && aarch64_sve_data_mode_p (mode))
8360 {
8361 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8362 return NO_REGS;
8363 }
8364
8365 /* If we have to disable direct literal pool loads and stores because the
8366 function is too big, then we need a scratch register. */
8367 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8368 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8369 || targetm.vector_mode_supported_p (GET_MODE (x)))
8370 && !aarch64_pcrelative_literal_loads)
8371 {
8372 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8373 return NO_REGS;
8374 }
8375
8376 /* Without the TARGET_SIMD instructions we cannot move a Q register
8377 to a Q register directly. We need a scratch. */
8378 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8379 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8380 && reg_class_subset_p (rclass, FP_REGS))
8381 {
8382 sri->icode = code_for_aarch64_reload_mov (mode);
8383 return NO_REGS;
8384 }
8385
8386 /* A TFmode or TImode memory access should be handled via an FP register,
8387 because AArch64 has richer addressing modes for LDR/STR instructions
8388 than for LDP/STP instructions. */
8389 if (TARGET_FLOAT && rclass == GENERAL_REGS
8390 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8391 return FP_REGS;
8392
8393 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8394 return GENERAL_REGS;
8395
8396 return NO_REGS;
8397 }
8398
8399 static bool
8400 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8401 {
8402 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8403
8404 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8405 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8406 if (frame_pointer_needed)
8407 return to == HARD_FRAME_POINTER_REGNUM;
8408 return true;
8409 }
8410
8411 poly_int64
8412 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8413 {
8414 if (to == HARD_FRAME_POINTER_REGNUM)
8415 {
8416 if (from == ARG_POINTER_REGNUM)
8417 return cfun->machine->frame.hard_fp_offset;
8418
8419 if (from == FRAME_POINTER_REGNUM)
8420 return cfun->machine->frame.hard_fp_offset
8421 - cfun->machine->frame.locals_offset;
8422 }
8423
8424 if (to == STACK_POINTER_REGNUM)
8425 {
8426 if (from == FRAME_POINTER_REGNUM)
8427 return cfun->machine->frame.frame_size
8428 - cfun->machine->frame.locals_offset;
8429 }
8430
8431 return cfun->machine->frame.frame_size;
8432 }
8433
8434 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8435 previous frame. */
8436
8437 rtx
8438 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8439 {
8440 if (count != 0)
8441 return const0_rtx;
8442 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8443 }
8444
8445
8446 static void
8447 aarch64_asm_trampoline_template (FILE *f)
8448 {
8449 int offset1 = 16;
8450 int offset2 = 20;
8451
8452 if (aarch64_bti_enabled ())
8453 {
8454 asm_fprintf (f, "\thint\t34 // bti c\n");
8455 offset1 -= 4;
8456 offset2 -= 4;
8457 }
8458
8459 if (TARGET_ILP32)
8460 {
8461 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8462 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8463 offset1);
8464 }
8465 else
8466 {
8467 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8468 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8469 offset2);
8470 }
8471 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8472
8473 /* The trampoline needs an extra padding instruction. If BTI is enabled,
8474 the padding instruction is replaced by the BTI instruction at
8475 the beginning. */
8476 if (!aarch64_bti_enabled ())
8477 assemble_aligned_integer (4, const0_rtx);
8478
8479 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8480 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8481 }
8482
8483 static void
8484 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8485 {
8486 rtx fnaddr, mem, a_tramp;
8487 const int tramp_code_sz = 16;
8488
8489 /* Don't need to copy the trailing D-words; we fill those in below. */
8490 emit_block_move (m_tramp, assemble_trampoline_template (),
8491 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8492 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8493 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8494 if (GET_MODE (fnaddr) != ptr_mode)
8495 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8496 emit_move_insn (mem, fnaddr);
8497
8498 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8499 emit_move_insn (mem, chain_value);
8500
8501 /* XXX We should really define a "clear_cache" pattern and use
8502 gen_clear_cache(). */
8503 a_tramp = XEXP (m_tramp, 0);
8504 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8505 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8506 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8507 ptr_mode);
8508 }
8509
8510 static unsigned char
8511 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8512 {
8513 /* ??? Logically we should only need to provide a value when
8514 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8515 can hold MODE, but at the moment we need to handle all modes.
8516 Just ignore any runtime parts for registers that can't store them. */
8517 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8518 unsigned int nregs;
8519 switch (regclass)
8520 {
8521 case TAILCALL_ADDR_REGS:
8522 case POINTER_REGS:
8523 case GENERAL_REGS:
8524 case ALL_REGS:
8525 case POINTER_AND_FP_REGS:
8526 case FP_REGS:
8527 case FP_LO_REGS:
8528 if (aarch64_sve_data_mode_p (mode)
8529 && constant_multiple_p (GET_MODE_SIZE (mode),
8530 BYTES_PER_SVE_VECTOR, &nregs))
8531 return nregs;
8532 return (aarch64_vector_data_mode_p (mode)
8533 ? CEIL (lowest_size, UNITS_PER_VREG)
8534 : CEIL (lowest_size, UNITS_PER_WORD));
8535 case STACK_REG:
8536 case PR_REGS:
8537 case PR_LO_REGS:
8538 case PR_HI_REGS:
8539 return 1;
8540
8541 case NO_REGS:
8542 return 0;
8543
8544 default:
8545 break;
8546 }
8547 gcc_unreachable ();
8548 }
8549
8550 static reg_class_t
8551 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8552 {
8553 if (regclass == POINTER_REGS)
8554 return GENERAL_REGS;
8555
8556 if (regclass == STACK_REG)
8557 {
8558 if (REG_P(x)
8559 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8560 return regclass;
8561
8562 return NO_REGS;
8563 }
8564
8565 /* Register elimination can result in a request for
8566 SP+constant->FP_REGS. We cannot support such operations, which
8567 use SP as the source and an FP_REG as the destination, so reject
8568 them outright. */
8569 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8570 {
8571 rtx lhs = XEXP (x, 0);
8572
8573 /* Look through a possible SUBREG introduced by ILP32. */
8574 if (GET_CODE (lhs) == SUBREG)
8575 lhs = SUBREG_REG (lhs);
8576
8577 gcc_assert (REG_P (lhs));
8578 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8579 POINTER_REGS));
8580 return NO_REGS;
8581 }
8582
8583 return regclass;
8584 }
8585
8586 void
8587 aarch64_asm_output_labelref (FILE* f, const char *name)
8588 {
8589 asm_fprintf (f, "%U%s", name);
8590 }
8591
8592 static void
8593 aarch64_elf_asm_constructor (rtx symbol, int priority)
8594 {
8595 if (priority == DEFAULT_INIT_PRIORITY)
8596 default_ctor_section_asm_out_constructor (symbol, priority);
8597 else
8598 {
8599 section *s;
8600 /* Priority is known to be in the range [0, 65535], so 18 bytes would
8601 be enough, but the compiler might not know that. To avoid a
8602 -Wformat-truncation false positive, use a larger size. */
8603 char buf[23];
8604 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8605 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8606 switch_to_section (s);
8607 assemble_align (POINTER_SIZE);
8608 assemble_aligned_integer (POINTER_BYTES, symbol);
8609 }
8610 }
8611
8612 static void
8613 aarch64_elf_asm_destructor (rtx symbol, int priority)
8614 {
8615 if (priority == DEFAULT_INIT_PRIORITY)
8616 default_dtor_section_asm_out_destructor (symbol, priority);
8617 else
8618 {
8619 section *s;
8620 /* Priority is known to be in the range [0, 65535], so 18 bytes would
8621 be enough, but the compiler might not know that. To avoid a
8622 -Wformat-truncation false positive, use a larger size. */
8623 char buf[23];
8624 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8625 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8626 switch_to_section (s);
8627 assemble_align (POINTER_SIZE);
8628 assemble_aligned_integer (POINTER_BYTES, symbol);
8629 }
8630 }
8631
8632 const char*
8633 aarch64_output_casesi (rtx *operands)
8634 {
8635 char buf[100];
8636 char label[100];
8637 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8638 int index;
8639 static const char *const patterns[4][2] =
8640 {
8641 {
8642 "ldrb\t%w3, [%0,%w1,uxtw]",
8643 "add\t%3, %4, %w3, sxtb #2"
8644 },
8645 {
8646 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8647 "add\t%3, %4, %w3, sxth #2"
8648 },
8649 {
8650 "ldr\t%w3, [%0,%w1,uxtw #2]",
8651 "add\t%3, %4, %w3, sxtw #2"
8652 },
8653 /* We assume that DImode is only generated when not optimizing and
8654 that we don't really need 64-bit address offsets. That would
8655 imply an object file with 8GB of code in a single function! */
8656 {
8657 "ldr\t%w3, [%0,%w1,uxtw #2]",
8658 "add\t%3, %4, %w3, sxtw #2"
8659 }
8660 };
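/* The code emitted below loads the dispatch-table entry, uses ADR to
take the address of the local label assembled just after the BR,
adds the sign-extended, scaled entry to that address, and then
branches to the result. */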
8661
8662 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8663
8664 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8665 index = exact_log2 (GET_MODE_SIZE (mode));
8666
8667 gcc_assert (index >= 0 && index <= 3);
8668
8669 /* Need to implement table size reduction, by changing the code below. */
8670 output_asm_insn (patterns[index][0], operands);
8671 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8672 snprintf (buf, sizeof (buf),
8673 "adr\t%%4, %s", targetm.strip_name_encoding (label));
8674 output_asm_insn (buf, operands);
8675 output_asm_insn (patterns[index][1], operands);
8676 output_asm_insn ("br\t%3", operands);
8677 assemble_label (asm_out_file, label);
8678 return "";
8679 }
8680
8681
8682 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8683 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8684 operator. */
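/* For example, SHIFT == 1 with MASK == 0x1fe describes a byte value
scaled by 2, so we return 8. */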
8685
8686 int
8687 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8688 {
8689 if (shift >= 0 && shift <= 3)
8690 {
8691 int size;
8692 for (size = 8; size <= 32; size *= 2)
8693 {
8694 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8695 if (mask == bits << shift)
8696 return size;
8697 }
8698 }
8699 return 0;
8700 }
8701
8702 /* Constant pools are per-function only when PC-relative literal
8703 loads are enabled or we are using the large memory
8704 model. */
8705
8706 static inline bool
8707 aarch64_can_use_per_function_literal_pools_p (void)
8708 {
8709 return (aarch64_pcrelative_literal_loads
8710 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8711 }
8712
8713 static bool
8714 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8715 {
8716 /* We can't use blocks for constants when we're using a per-function
8717 constant pool. */
8718 return !aarch64_can_use_per_function_literal_pools_p ();
8719 }
8720
8721 /* Select appropriate section for constants depending
8722 on where we place literal pools. */
8723
8724 static section *
8725 aarch64_select_rtx_section (machine_mode mode,
8726 rtx x,
8727 unsigned HOST_WIDE_INT align)
8728 {
8729 if (aarch64_can_use_per_function_literal_pools_p ())
8730 return function_section (current_function_decl);
8731
8732 return default_elf_select_rtx_section (mode, x, align);
8733 }
8734
8735 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8736 void
8737 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8738 HOST_WIDE_INT offset)
8739 {
8740 /* When using per-function literal pools, we must ensure that any code
8741 section is aligned to the minimal instruction length, lest we get
8742 errors from the assembler about "unaligned instructions". */
8743 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8744 ASM_OUTPUT_ALIGN (f, 2);
8745 }
8746
8747 /* Costs. */
8748
8749 /* Helper function for rtx cost calculation. Strip a shift expression
8750 from X. Returns the inner operand if successful, or the original
8751 expression on failure. */
8752 static rtx
8753 aarch64_strip_shift (rtx x)
8754 {
8755 rtx op = x;
8756
8757 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8758 we can convert both to ROR during final output. */
8759 if ((GET_CODE (op) == ASHIFT
8760 || GET_CODE (op) == ASHIFTRT
8761 || GET_CODE (op) == LSHIFTRT
8762 || GET_CODE (op) == ROTATERT
8763 || GET_CODE (op) == ROTATE)
8764 && CONST_INT_P (XEXP (op, 1)))
8765 return XEXP (op, 0);
8766
8767 if (GET_CODE (op) == MULT
8768 && CONST_INT_P (XEXP (op, 1))
8769 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8770 return XEXP (op, 0);
8771
8772 return x;
8773 }
8774
8775 /* Helper function for rtx cost calculation. Strip an extend
8776 expression from X. Returns the inner operand if successful, or the
8777 original expression on failure. We deal with a number of possible
8778 canonicalization variations here. If STRIP_SHIFT is true, then
8779 we can strip off a shift also. */
8780 static rtx
8781 aarch64_strip_extend (rtx x, bool strip_shift)
8782 {
8783 scalar_int_mode mode;
8784 rtx op = x;
8785
8786 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8787 return op;
8788
8789 /* Zero and sign extraction of a widened value. */
8790 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8791 && XEXP (op, 2) == const0_rtx
8792 && GET_CODE (XEXP (op, 0)) == MULT
8793 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8794 XEXP (op, 1)))
8795 return XEXP (XEXP (op, 0), 0);
8796
8797 /* It can also be represented (for zero-extend) as an AND with an
8798 immediate. */
8799 if (GET_CODE (op) == AND
8800 && GET_CODE (XEXP (op, 0)) == MULT
8801 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8802 && CONST_INT_P (XEXP (op, 1))
8803 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8804 INTVAL (XEXP (op, 1))) != 0)
8805 return XEXP (XEXP (op, 0), 0);
8806
8807 /* Now handle extended register, as this may also have an optional
8808 left shift by 1..4. */
8809 if (strip_shift
8810 && GET_CODE (op) == ASHIFT
8811 && CONST_INT_P (XEXP (op, 1))
8812 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8813 op = XEXP (op, 0);
8814
8815 if (GET_CODE (op) == ZERO_EXTEND
8816 || GET_CODE (op) == SIGN_EXTEND)
8817 op = XEXP (op, 0);
8818
8819 if (op != x)
8820 return op;
8821
8822 return x;
8823 }
8824
8825 /* Return true iff CODE is a shift supported in combination
8826 with arithmetic instructions. */
8827
8828 static bool
8829 aarch64_shift_p (enum rtx_code code)
8830 {
8831 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8832 }
8833
8834
8835 /* Return true iff X is a cheap shift without a sign extend. */
8836
8837 static bool
8838 aarch64_cheap_mult_shift_p (rtx x)
8839 {
8840 rtx op0, op1;
8841
8842 op0 = XEXP (x, 0);
8843 op1 = XEXP (x, 1);
8844
8845 if (!(aarch64_tune_params.extra_tuning_flags
8846 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8847 return false;
8848
8849 if (GET_CODE (op0) == SIGN_EXTEND)
8850 return false;
8851
8852 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8853 && UINTVAL (op1) <= 4)
8854 return true;
8855
8856 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8857 return false;
8858
8859 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8860
8861 if (l2 > 0 && l2 <= 4)
8862 return true;
8863
8864 return false;
8865 }
8866
8867 /* Helper function for rtx cost calculation. Calculate the cost of
8868 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8869 Return the calculated cost of the expression, recursing manually into
8870 operands where needed. */
8871
8872 static int
8873 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8874 {
8875 rtx op0, op1;
8876 const struct cpu_cost_table *extra_cost
8877 = aarch64_tune_params.insn_extra_cost;
8878 int cost = 0;
8879 bool compound_p = (outer == PLUS || outer == MINUS);
8880 machine_mode mode = GET_MODE (x);
8881
8882 gcc_checking_assert (code == MULT);
8883
8884 op0 = XEXP (x, 0);
8885 op1 = XEXP (x, 1);
8886
8887 if (VECTOR_MODE_P (mode))
8888 mode = GET_MODE_INNER (mode);
8889
8890 /* Integer multiply/fma. */
8891 if (GET_MODE_CLASS (mode) == MODE_INT)
8892 {
8893 /* The multiply will be canonicalized as a shift, cost it as such. */
8894 if (aarch64_shift_p (GET_CODE (x))
8895 || (CONST_INT_P (op1)
8896 && exact_log2 (INTVAL (op1)) > 0))
8897 {
8898 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8899 || GET_CODE (op0) == SIGN_EXTEND;
8900 if (speed)
8901 {
8902 if (compound_p)
8903 {
8904 /* If the shift is considered cheap,
8905 then don't add any cost. */
8906 if (aarch64_cheap_mult_shift_p (x))
8907 ;
8908 else if (REG_P (op1))
8909 /* ARITH + shift-by-register. */
8910 cost += extra_cost->alu.arith_shift_reg;
8911 else if (is_extend)
8912 /* ARITH + extended register. We don't have a cost field
8913 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8914 cost += extra_cost->alu.extend_arith;
8915 else
8916 /* ARITH + shift-by-immediate. */
8917 cost += extra_cost->alu.arith_shift;
8918 }
8919 else
8920 /* LSL (immediate). */
8921 cost += extra_cost->alu.shift;
8922
8923 }
8924 /* Strip extends as we will have costed them in the case above. */
8925 if (is_extend)
8926 op0 = aarch64_strip_extend (op0, true);
8927
8928 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8929
8930 return cost;
8931 }
8932
8933 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8934 compound operation and let the cases below handle it. After all, MNEG is a
8935 special-case alias of MSUB. */
8936 if (GET_CODE (op0) == NEG)
8937 {
8938 op0 = XEXP (op0, 0);
8939 compound_p = true;
8940 }
8941
8942 /* Integer multiplies or FMAs have zero/sign extending variants. */
8943 if ((GET_CODE (op0) == ZERO_EXTEND
8944 && GET_CODE (op1) == ZERO_EXTEND)
8945 || (GET_CODE (op0) == SIGN_EXTEND
8946 && GET_CODE (op1) == SIGN_EXTEND))
8947 {
8948 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8949 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8950
8951 if (speed)
8952 {
8953 if (compound_p)
8954 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8955 cost += extra_cost->mult[0].extend_add;
8956 else
8957 /* MUL/SMULL/UMULL. */
8958 cost += extra_cost->mult[0].extend;
8959 }
8960
8961 return cost;
8962 }
8963
8964 /* This is either an integer multiply or a MADD. In both cases
8965 we want to recurse and cost the operands. */
8966 cost += rtx_cost (op0, mode, MULT, 0, speed);
8967 cost += rtx_cost (op1, mode, MULT, 1, speed);
8968
8969 if (speed)
8970 {
8971 if (compound_p)
8972 /* MADD/MSUB. */
8973 cost += extra_cost->mult[mode == DImode].add;
8974 else
8975 /* MUL. */
8976 cost += extra_cost->mult[mode == DImode].simple;
8977 }
8978
8979 return cost;
8980 }
8981 else
8982 {
8983 if (speed)
8984 {
8985 /* Floating-point FMA/FMUL can also support negations of the
8986 operands, unless the rounding mode is upward or downward in
8987 which case FNMUL differs from FMUL with operand negation. */
8988 bool neg0 = GET_CODE (op0) == NEG;
8989 bool neg1 = GET_CODE (op1) == NEG;
8990 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8991 {
8992 if (neg0)
8993 op0 = XEXP (op0, 0);
8994 if (neg1)
8995 op1 = XEXP (op1, 0);
8996 }
8997
8998 if (compound_p)
8999 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9000 cost += extra_cost->fp[mode == DFmode].fma;
9001 else
9002 /* FMUL/FNMUL. */
9003 cost += extra_cost->fp[mode == DFmode].mult;
9004 }
9005
9006 cost += rtx_cost (op0, mode, MULT, 0, speed);
9007 cost += rtx_cost (op1, mode, MULT, 1, speed);
9008 return cost;
9009 }
9010 }
9011
9012 static int
9013 aarch64_address_cost (rtx x,
9014 machine_mode mode,
9015 addr_space_t as ATTRIBUTE_UNUSED,
9016 bool speed)
9017 {
9018 enum rtx_code c = GET_CODE (x);
9019 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9020 struct aarch64_address_info info;
9021 int cost = 0;
9022 info.shift = 0;
9023
9024 if (!aarch64_classify_address (&info, x, mode, false))
9025 {
9026 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9027 {
9028 /* This is a CONST or SYMBOL ref which will be split
9029 in a different way depending on the code model in use.
9030 Cost it through the generic infrastructure. */
9031 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9032 /* Divide through by the cost of one instruction to
9033 bring it to the same units as the address costs. */
9034 cost_symbol_ref /= COSTS_N_INSNS (1);
9035 /* The cost is then the cost of preparing the address,
9036 followed by an immediate (possibly 0) offset. */
9037 return cost_symbol_ref + addr_cost->imm_offset;
9038 }
9039 else
9040 {
9041 /* This is most likely a jump table from a case
9042 statement. */
9043 return addr_cost->register_offset;
9044 }
9045 }
9046
9047 switch (info.type)
9048 {
9049 case ADDRESS_LO_SUM:
9050 case ADDRESS_SYMBOLIC:
9051 case ADDRESS_REG_IMM:
9052 cost += addr_cost->imm_offset;
9053 break;
9054
9055 case ADDRESS_REG_WB:
9056 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9057 cost += addr_cost->pre_modify;
9058 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9059 cost += addr_cost->post_modify;
9060 else
9061 gcc_unreachable ();
9062
9063 break;
9064
9065 case ADDRESS_REG_REG:
9066 cost += addr_cost->register_offset;
9067 break;
9068
9069 case ADDRESS_REG_SXTW:
9070 cost += addr_cost->register_sextend;
9071 break;
9072
9073 case ADDRESS_REG_UXTW:
9074 cost += addr_cost->register_zextend;
9075 break;
9076
9077 default:
9078 gcc_unreachable ();
9079 }
9080
9081
9082 if (info.shift > 0)
9083 {
9084 /* For the sake of calculating the cost of the shifted register
9085 component, we can treat same sized modes in the same way. */
9086 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9087 cost += addr_cost->addr_scale_costs.hi;
9088 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9089 cost += addr_cost->addr_scale_costs.si;
9090 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9091 cost += addr_cost->addr_scale_costs.di;
9092 else
9093 /* We can't tell, or this is a 128-bit vector. */
9094 cost += addr_cost->addr_scale_costs.ti;
9095 }
9096
9097 return cost;
9098 }
9099
9100 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9101 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9102 to be taken. */
9103
9104 int
9105 aarch64_branch_cost (bool speed_p, bool predictable_p)
9106 {
9107 /* When optimizing for speed, use the cost of unpredictable branches. */
9108 const struct cpu_branch_cost *branch_costs =
9109 aarch64_tune_params.branch_costs;
9110
9111 if (!speed_p || predictable_p)
9112 return branch_costs->predictable;
9113 else
9114 return branch_costs->unpredictable;
9115 }
9116
9117 /* Return true if the RTX X in mode MODE is a zero or sign extract
9118 usable in an ADD or SUB (extended register) instruction. */
9119 static bool
9120 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9121 {
9122 /* Catch add with a sign extract.
9123 This is add_<optab><mode>_multp2. */
9124 if (GET_CODE (x) == SIGN_EXTRACT
9125 || GET_CODE (x) == ZERO_EXTRACT)
9126 {
9127 rtx op0 = XEXP (x, 0);
9128 rtx op1 = XEXP (x, 1);
9129 rtx op2 = XEXP (x, 2);
9130
9131 if (GET_CODE (op0) == MULT
9132 && CONST_INT_P (op1)
9133 && op2 == const0_rtx
9134 && CONST_INT_P (XEXP (op0, 1))
9135 && aarch64_is_extend_from_extract (mode,
9136 XEXP (op0, 1),
9137 op1))
9138 {
9139 return true;
9140 }
9141 }
9142 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9143 No shift. */
9144 else if (GET_CODE (x) == SIGN_EXTEND
9145 || GET_CODE (x) == ZERO_EXTEND)
9146 return REG_P (XEXP (x, 0));
9147
9148 return false;
9149 }
9150
9151 static bool
9152 aarch64_frint_unspec_p (unsigned int u)
9153 {
9154 switch (u)
9155 {
9156 case UNSPEC_FRINTZ:
9157 case UNSPEC_FRINTP:
9158 case UNSPEC_FRINTM:
9159 case UNSPEC_FRINTA:
9160 case UNSPEC_FRINTN:
9161 case UNSPEC_FRINTX:
9162 case UNSPEC_FRINTI:
9163 return true;
9164
9165 default:
9166 return false;
9167 }
9168 }
9169
9170 /* Return true iff X is an rtx that will match an extr instruction
9171 i.e. as described in the *extr<mode>5_insn family of patterns.
9172 OP0 and OP1 will be set to the operands of the shifts involved
9173 on success and will be NULL_RTX otherwise. */
9174
9175 static bool
9176 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9177 {
9178 rtx op0, op1;
9179 scalar_int_mode mode;
9180 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9181 return false;
9182
9183 *res_op0 = NULL_RTX;
9184 *res_op1 = NULL_RTX;
9185
9186 if (GET_CODE (x) != IOR)
9187 return false;
9188
9189 op0 = XEXP (x, 0);
9190 op1 = XEXP (x, 1);
9191
9192 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9193 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9194 {
9195 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9196 if (GET_CODE (op1) == ASHIFT)
9197 std::swap (op0, op1);
9198
9199 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9200 return false;
9201
9202 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9203 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9204
9205 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9206 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9207 {
9208 *res_op0 = XEXP (op0, 0);
9209 *res_op1 = XEXP (op1, 0);
9210 return true;
9211 }
9212 }
9213
9214 return false;
9215 }
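
/* Editor's illustrative aside -- not part of the GCC sources.  The
   IOR-of-opposite-shifts shape accepted above is a "funnel shift", which a
   single EXTR instruction implements: it returns the register-width bits
   of the concatenation HI:LO starting at bit LSB.  A minimal standalone
   64-bit sketch of that identity (extr_sketch is a hypothetical name; LSB
   must lie in 1..63 here to avoid undefined shifts):  */

#include <assert.h>
#include <stdint.h>

static uint64_t
extr_sketch (uint64_t hi, uint64_t lo, unsigned lsb)
{
  /* Matches (ior (ashift hi (64 - lsb)) (lshiftrt lo lsb)); the two shift
     amounts sum to the mode bitsize, exactly as required above.  */
  return (hi << (64 - lsb)) | (lo >> lsb);
}

int
main (void)
{
  uint64_t hi = 0x1122334455667788ull;
  uint64_t lo = 0x99aabbccddeeff00ull;

  /* Bits 8..71 of the 128-bit value HI:LO.  */
  assert (extr_sketch (hi, lo, 8) == 0x8899aabbccddeeffull);
  return 0;
}
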
9216
9217 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9218 storing it in *COST. Result is true if the total cost of the operation
9219 has now been calculated. */
9220 static bool
9221 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9222 {
9223 rtx inner;
9224 rtx comparator;
9225 enum rtx_code cmpcode;
9226
9227 if (COMPARISON_P (op0))
9228 {
9229 inner = XEXP (op0, 0);
9230 comparator = XEXP (op0, 1);
9231 cmpcode = GET_CODE (op0);
9232 }
9233 else
9234 {
9235 inner = op0;
9236 comparator = const0_rtx;
9237 cmpcode = NE;
9238 }
9239
9240 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9241 {
9242 /* Conditional branch. */
9243 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9244 return true;
9245 else
9246 {
9247 if (cmpcode == NE || cmpcode == EQ)
9248 {
9249 if (comparator == const0_rtx)
9250 {
9251 /* TBZ/TBNZ/CBZ/CBNZ. */
9252 if (GET_CODE (inner) == ZERO_EXTRACT)
9253 /* TBZ/TBNZ. */
9254 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9255 ZERO_EXTRACT, 0, speed);
9256 else
9257 /* CBZ/CBNZ. */
9258 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9259
9260 return true;
9261 }
9262 }
9263 else if (cmpcode == LT || cmpcode == GE)
9264 {
9265 /* TBZ/TBNZ. */
9266 if (comparator == const0_rtx)
9267 return true;
9268 }
9269 }
9270 }
9271 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9272 {
9273 /* CCMP. */
9274 if (GET_CODE (op1) == COMPARE)
9275 {
9276 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9277 if (XEXP (op1, 1) == const0_rtx)
9278 *cost += 1;
9279 if (speed)
9280 {
9281 machine_mode mode = GET_MODE (XEXP (op1, 0));
9282 const struct cpu_cost_table *extra_cost
9283 = aarch64_tune_params.insn_extra_cost;
9284
9285 if (GET_MODE_CLASS (mode) == MODE_INT)
9286 *cost += extra_cost->alu.arith;
9287 else
9288 *cost += extra_cost->fp[mode == DFmode].compare;
9289 }
9290 return true;
9291 }
9292
9293 /* It's a conditional operation based on the status flags,
9294 so it must be some flavor of CSEL. */
9295
9296 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9297 if (GET_CODE (op1) == NEG
9298 || GET_CODE (op1) == NOT
9299 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9300 op1 = XEXP (op1, 0);
9301 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9302 {
9303 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9304 op1 = XEXP (op1, 0);
9305 op2 = XEXP (op2, 0);
9306 }
9307
9308 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9309 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9310 return true;
9311 }
9312
9313 /* We don't know what this is, cost all operands. */
9314 return false;
9315 }
9316
9317 /* Check whether X is a bitfield operation of the form shift + extend that
9318 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9319 operand to which the bitfield operation is applied. Otherwise return
9320 NULL_RTX. */
9321
9322 static rtx
9323 aarch64_extend_bitfield_pattern_p (rtx x)
9324 {
9325 rtx_code outer_code = GET_CODE (x);
9326 machine_mode outer_mode = GET_MODE (x);
9327
9328 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9329 && outer_mode != SImode && outer_mode != DImode)
9330 return NULL_RTX;
9331
9332 rtx inner = XEXP (x, 0);
9333 rtx_code inner_code = GET_CODE (inner);
9334 machine_mode inner_mode = GET_MODE (inner);
9335 rtx op = NULL_RTX;
9336
9337 switch (inner_code)
9338 {
9339 case ASHIFT:
9340 if (CONST_INT_P (XEXP (inner, 1))
9341 && (inner_mode == QImode || inner_mode == HImode))
9342 op = XEXP (inner, 0);
9343 break;
9344 case LSHIFTRT:
9345 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9346 && (inner_mode == QImode || inner_mode == HImode))
9347 op = XEXP (inner, 0);
9348 break;
9349 case ASHIFTRT:
9350 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9351 && (inner_mode == QImode || inner_mode == HImode))
9352 op = XEXP (inner, 0);
9353 break;
9354 default:
9355 break;
9356 }
9357
9358 return op;
9359 }
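
/* Editor's illustrative aside -- not part of the GCC sources.  The
   shift-plus-extend shapes matched above fold into single bitfield
   instructions.  As a rough guide to what the extract forms compute, here
   is a minimal standalone sketch of reading a WIDTH-bit field at bit LSB,
   unsigned (UBFX-like) and signed (SBFX-like).  The helper names are
   hypothetical, and the signed variant assumes the usual arithmetic right
   shift of signed values (true of GCC, implementation-defined in ISO C):  */

#include <assert.h>
#include <stdint.h>

static uint64_t
ubfx_sketch (uint64_t x, unsigned lsb, unsigned width)
{
  return (x >> lsb) & (((uint64_t) 1 << width) - 1);
}

static int64_t
sbfx_sketch (uint64_t x, unsigned lsb, unsigned width)
{
  /* Move the field to the top, then arithmetic-shift it back down so the
     field's top bit becomes the sign bit.  */
  return (int64_t) (x << (64 - lsb - width)) >> (64 - width);
}

int
main (void)
{
  assert (ubfx_sketch (0xf0f0, 4, 8) == 0x0f);
  assert (sbfx_sketch (0xf0f0, 4, 8) == 0x0f);
  assert (sbfx_sketch (0x0f00, 8, 4) == -1);  /* All-ones field is negative.  */
  return 0;
}
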
9360
9361 /* Return true if the mask and a shift amount from an RTX of the form
9362 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9363 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
9364
9365 bool
9366 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9367 rtx shft_amnt)
9368 {
9369 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9370 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9371 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9372 && (INTVAL (mask)
9373 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9374 }
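
/* Editor's illustrative aside -- not part of the GCC sources.  A minimal
   standalone sketch of the test above: (x << SHIFT) & MASK collapses to a
   single UBFIZ when MASK, read from bit SHIFT upwards, is one contiguous
   run of ones and has no bits set below SHIFT.  ubfiz_mask_ok is a
   hypothetical name and the sketch ignores the CONST_INT plumbing:  */

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool
ubfiz_mask_ok (unsigned bitsize, uint64_t mask, unsigned shift)
{
  if (shift >= bitsize)
    return false;
  uint64_t field = (mask >> shift) + 1;
  return (field & (field - 1)) == 0			/* Contiguous run.  */
	 && (mask & (((uint64_t) 1 << shift) - 1)) == 0; /* Nothing below SHIFT.  */
}

int
main (void)
{
  assert (ubfiz_mask_ok (32, 0x1f0, 4));   /* 5-bit field placed at bit 4.  */
  assert (!ubfiz_mask_ok (32, 0x1f4, 4));  /* Stray bit below the shift.  */
  assert (!ubfiz_mask_ok (32, 0x1d0, 4));  /* Field is not contiguous.  */
  return 0;
}
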
9375
9376 /* Return true if the masks and a shift amount from an RTX of the form
9377 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
9378 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
9379
9380 bool
9381 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
9382 unsigned HOST_WIDE_INT mask1,
9383 unsigned HOST_WIDE_INT shft_amnt,
9384 unsigned HOST_WIDE_INT mask2)
9385 {
9386 unsigned HOST_WIDE_INT t;
9387
9388 /* Verify that MASK1 and MASK2 are complementary: every bit must be set in exactly one of them. */
9389 if (mask1 != ~mask2)
9390 return false;
9391
9392 /* Verify that mask2 is not all zeros or ones. */
9393 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
9394 return false;
9395
9396 /* The shift amount should always be less than the mode size. */
9397 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
9398
9399 /* Verify that the mask being shifted is contiguous and would be in the
9400 least significant bits after shifting by shft_amnt. */
9401 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
9402 return (t == (t & -t));
9403 }
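
/* Editor's illustrative aside -- not part of the GCC sources.  The final
   test above relies on a classic bit trick: if MASK2 is one contiguous run
   of ones whose lowest set bit sits at SHFT_AMNT, then adding
   (1 << SHFT_AMNT) carries through the whole run, leaving a value that is
   zero or a single power of two, hence t == (t & -t).  A minimal
   standalone sketch (contiguous_from_bit is a hypothetical name):  */

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool
contiguous_from_bit (uint64_t mask, unsigned lsb)
{
  uint64_t t = mask + ((uint64_t) 1 << lsb);
  return t == (t & -t);
}

int
main (void)
{
  assert (contiguous_from_bit (0x0ff0, 4));   /* Ones in bits 4..11.  */
  assert (!contiguous_from_bit (0x0ff0, 2));  /* Run does not start at bit 2.  */
  assert (!contiguous_from_bit (0x0f30, 4));  /* Hole inside the run.  */
  return 0;
}
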
9404
9405 /* Calculate the cost of calculating X, storing it in *COST. Result
9406 is true if the total cost of the operation has now been calculated. */
9407 static bool
9408 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9409 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9410 {
9411 rtx op0, op1, op2;
9412 const struct cpu_cost_table *extra_cost
9413 = aarch64_tune_params.insn_extra_cost;
9414 int code = GET_CODE (x);
9415 scalar_int_mode int_mode;
9416
9417 /* By default, assume that everything has equivalent cost to the
9418 cheapest instruction. Any additional costs are applied as a delta
9419 above this default. */
9420 *cost = COSTS_N_INSNS (1);
9421
9422 switch (code)
9423 {
9424 case SET:
9425 /* The cost depends entirely on the operands to SET. */
9426 *cost = 0;
9427 op0 = SET_DEST (x);
9428 op1 = SET_SRC (x);
9429
9430 switch (GET_CODE (op0))
9431 {
9432 case MEM:
9433 if (speed)
9434 {
9435 rtx address = XEXP (op0, 0);
9436 if (VECTOR_MODE_P (mode))
9437 *cost += extra_cost->ldst.storev;
9438 else if (GET_MODE_CLASS (mode) == MODE_INT)
9439 *cost += extra_cost->ldst.store;
9440 else if (mode == SFmode)
9441 *cost += extra_cost->ldst.storef;
9442 else if (mode == DFmode)
9443 *cost += extra_cost->ldst.stored;
9444
9445 *cost +=
9446 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9447 0, speed));
9448 }
9449
9450 *cost += rtx_cost (op1, mode, SET, 1, speed);
9451 return true;
9452
9453 case SUBREG:
9454 if (! REG_P (SUBREG_REG (op0)))
9455 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9456
9457 /* Fall through. */
9458 case REG:
9459 /* The cost is one per vector-register copied. */
9460 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9461 {
9462 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9463 *cost = COSTS_N_INSNS (nregs);
9464 }
9465 /* const0_rtx is in general free, but we will use an
9466 instruction to set a register to 0. */
9467 else if (REG_P (op1) || op1 == const0_rtx)
9468 {
9469 /* The cost is 1 per register copied. */
9470 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9471 *cost = COSTS_N_INSNS (nregs);
9472 }
9473 else
9474 /* Cost is just the cost of the RHS of the set. */
9475 *cost += rtx_cost (op1, mode, SET, 1, speed);
9476 return true;
9477
9478 case ZERO_EXTRACT:
9479 case SIGN_EXTRACT:
9480 /* Bit-field insertion. Strip any redundant widening of
9481 the RHS to meet the width of the target. */
9482 if (GET_CODE (op1) == SUBREG)
9483 op1 = SUBREG_REG (op1);
9484 if ((GET_CODE (op1) == ZERO_EXTEND
9485 || GET_CODE (op1) == SIGN_EXTEND)
9486 && CONST_INT_P (XEXP (op0, 1))
9487 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9488 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9489 op1 = XEXP (op1, 0);
9490
9491 if (CONST_INT_P (op1))
9492 {
9493 /* MOV immediate is assumed to always be cheap. */
9494 *cost = COSTS_N_INSNS (1);
9495 }
9496 else
9497 {
9498 /* BFM. */
9499 if (speed)
9500 *cost += extra_cost->alu.bfi;
9501 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9502 }
9503
9504 return true;
9505
9506 default:
9507 /* We can't make sense of this, assume default cost. */
9508 *cost = COSTS_N_INSNS (1);
9509 return false;
9510 }
9511 return false;
9512
9513 case CONST_INT:
9514 /* If an instruction can incorporate a constant within the
9515 instruction, the instruction's expression avoids calling
9516 rtx_cost() on the constant. If rtx_cost() is called on a
9517 constant, then it is usually because the constant must be
9518 moved into a register by one or more instructions.
9519
9520 The exception is constant 0, which can be expressed
9521 as XZR/WZR and is therefore free. The exception to this is
9522 if we have (set (reg) (const0_rtx)) in which case we must cost
9523 the move. However, we can catch that when we cost the SET, so
9524 we don't need to consider that here. */
9525 if (x == const0_rtx)
9526 *cost = 0;
9527 else
9528 {
9529 /* To an approximation, the cost of building any other constant
9530 is proportional to the number of instructions required to
9531 build that constant. This is true whether we
9532 are compiling for SPEED or otherwise. */
9533 if (!is_a <scalar_int_mode> (mode, &int_mode))
9534 int_mode = word_mode;
9535 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9536 (NULL_RTX, x, false, int_mode));
9537 }
9538 return true;
9539
9540 case CONST_DOUBLE:
9541
9542 /* First determine number of instructions to do the move
9543 as an integer constant. */
9544 if (!aarch64_float_const_representable_p (x)
9545 && !aarch64_can_const_movi_rtx_p (x, mode)
9546 && aarch64_float_const_rtx_p (x))
9547 {
9548 unsigned HOST_WIDE_INT ival;
9549 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9550 gcc_assert (succeed);
9551
9552 scalar_int_mode imode = (mode == HFmode
9553 ? SImode
9554 : int_mode_for_mode (mode).require ());
9555 int ncost = aarch64_internal_mov_immediate
9556 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9557 *cost += COSTS_N_INSNS (ncost);
9558 return true;
9559 }
9560
9561 if (speed)
9562 {
9563 /* mov[df,sf]_aarch64. */
9564 if (aarch64_float_const_representable_p (x))
9565 /* FMOV (scalar immediate). */
9566 *cost += extra_cost->fp[mode == DFmode].fpconst;
9567 else if (!aarch64_float_const_zero_rtx_p (x))
9568 {
9569 /* This will be a load from memory. */
9570 if (mode == DFmode)
9571 *cost += extra_cost->ldst.loadd;
9572 else
9573 *cost += extra_cost->ldst.loadf;
9574 }
9575 else
9576 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9577 or MOV v0.s[0], wzr - neither of which is modeled by the
9578 cost tables. Just use the default cost. */
9579 {
9580 }
9581 }
9582
9583 return true;
9584
9585 case MEM:
9586 if (speed)
9587 {
9588 /* For loads we want the base cost of a load, plus an
9589 approximation for the additional cost of the addressing
9590 mode. */
9591 rtx address = XEXP (x, 0);
9592 if (VECTOR_MODE_P (mode))
9593 *cost += extra_cost->ldst.loadv;
9594 else if (GET_MODE_CLASS (mode) == MODE_INT)
9595 *cost += extra_cost->ldst.load;
9596 else if (mode == SFmode)
9597 *cost += extra_cost->ldst.loadf;
9598 else if (mode == DFmode)
9599 *cost += extra_cost->ldst.loadd;
9600
9601 *cost +=
9602 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9603 0, speed));
9604 }
9605
9606 return true;
9607
9608 case NEG:
9609 op0 = XEXP (x, 0);
9610
9611 if (VECTOR_MODE_P (mode))
9612 {
9613 if (speed)
9614 {
9615 /* FNEG. */
9616 *cost += extra_cost->vect.alu;
9617 }
9618 return false;
9619 }
9620
9621 if (GET_MODE_CLASS (mode) == MODE_INT)
9622 {
9623 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9624 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9625 {
9626 /* CSETM. */
9627 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9628 return true;
9629 }
9630
9631 /* Cost this as SUB wzr, X. */
9632 op0 = CONST0_RTX (mode);
9633 op1 = XEXP (x, 0);
9634 goto cost_minus;
9635 }
9636
9637 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9638 {
9639 /* Support (neg(fma...)) as a single instruction only if
9640 sign of zeros is unimportant. This matches the decision
9641 making in aarch64.md. */
9642 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9643 {
9644 /* FNMADD. */
9645 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9646 return true;
9647 }
9648 if (GET_CODE (op0) == MULT)
9649 {
9650 /* FNMUL. */
9651 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9652 return true;
9653 }
9654 if (speed)
9655 /* FNEG. */
9656 *cost += extra_cost->fp[mode == DFmode].neg;
9657 return false;
9658 }
9659
9660 return false;
9661
9662 case CLRSB:
9663 case CLZ:
9664 if (speed)
9665 {
9666 if (VECTOR_MODE_P (mode))
9667 *cost += extra_cost->vect.alu;
9668 else
9669 *cost += extra_cost->alu.clz;
9670 }
9671
9672 return false;
9673
9674 case COMPARE:
9675 op0 = XEXP (x, 0);
9676 op1 = XEXP (x, 1);
9677
9678 if (op1 == const0_rtx
9679 && GET_CODE (op0) == AND)
9680 {
9681 x = op0;
9682 mode = GET_MODE (op0);
9683 goto cost_logic;
9684 }
9685
9686 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9687 {
9688 /* TODO: A write to the CC flags possibly costs extra; this
9689 needs encoding in the cost tables. */
9690
9691 mode = GET_MODE (op0);
9692 /* ANDS. */
9693 if (GET_CODE (op0) == AND)
9694 {
9695 x = op0;
9696 goto cost_logic;
9697 }
9698
9699 if (GET_CODE (op0) == PLUS)
9700 {
9701 /* ADDS (and CMN alias). */
9702 x = op0;
9703 goto cost_plus;
9704 }
9705
9706 if (GET_CODE (op0) == MINUS)
9707 {
9708 /* SUBS. */
9709 x = op0;
9710 goto cost_minus;
9711 }
9712
9713 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9714 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9715 && CONST_INT_P (XEXP (op0, 2)))
9716 {
9717 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9718 Handle it here directly rather than going to cost_logic
9719 since we know the immediate generated for the TST is valid,
9720 so we can avoid creating an intermediate rtx for it only
9721 for costing purposes. */
9722 if (speed)
9723 *cost += extra_cost->alu.logical;
9724
9725 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9726 ZERO_EXTRACT, 0, speed);
9727 return true;
9728 }
9729
9730 if (GET_CODE (op1) == NEG)
9731 {
9732 /* CMN. */
9733 if (speed)
9734 *cost += extra_cost->alu.arith;
9735
9736 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9737 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9738 return true;
9739 }
9740
9741 /* CMP.
9742
9743 Compare can freely swap the order of operands, and
9744 canonicalization puts the more complex operation first.
9745 But the integer MINUS logic expects the shift/extend
9746 operation in op1. */
9747 if (! (REG_P (op0)
9748 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9749 {
9750 op0 = XEXP (x, 1);
9751 op1 = XEXP (x, 0);
9752 }
9753 goto cost_minus;
9754 }
9755
9756 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9757 {
9758 /* FCMP. */
9759 if (speed)
9760 *cost += extra_cost->fp[mode == DFmode].compare;
9761
9762 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9763 {
9764 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9765 /* FCMP supports constant 0.0 for no extra cost. */
9766 return true;
9767 }
9768 return false;
9769 }
9770
9771 if (VECTOR_MODE_P (mode))
9772 {
9773 /* Vector compare. */
9774 if (speed)
9775 *cost += extra_cost->vect.alu;
9776
9777 if (aarch64_float_const_zero_rtx_p (op1))
9778 {
9779 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9780 cost. */
9781 return true;
9782 }
9783 return false;
9784 }
9785 return false;
9786
9787 case MINUS:
9788 {
9789 op0 = XEXP (x, 0);
9790 op1 = XEXP (x, 1);
9791
9792 cost_minus:
9793 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9794
9795 /* Detect valid immediates. */
9796 if ((GET_MODE_CLASS (mode) == MODE_INT
9797 || (GET_MODE_CLASS (mode) == MODE_CC
9798 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9799 && CONST_INT_P (op1)
9800 && aarch64_uimm12_shift (INTVAL (op1)))
9801 {
9802 if (speed)
9803 /* SUB(S) (immediate). */
9804 *cost += extra_cost->alu.arith;
9805 return true;
9806 }
9807
9808 /* Look for SUB (extended register). */
9809 if (is_a <scalar_int_mode> (mode, &int_mode)
9810 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9811 {
9812 if (speed)
9813 *cost += extra_cost->alu.extend_arith;
9814
9815 op1 = aarch64_strip_extend (op1, true);
9816 *cost += rtx_cost (op1, VOIDmode,
9817 (enum rtx_code) GET_CODE (op1), 0, speed);
9818 return true;
9819 }
9820
9821 rtx new_op1 = aarch64_strip_extend (op1, false);
9822
9823 /* Cost this as an FMA-alike operation. */
9824 if ((GET_CODE (new_op1) == MULT
9825 || aarch64_shift_p (GET_CODE (new_op1)))
9826 && code != COMPARE)
9827 {
9828 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9829 (enum rtx_code) code,
9830 speed);
9831 return true;
9832 }
9833
9834 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9835
9836 if (speed)
9837 {
9838 if (VECTOR_MODE_P (mode))
9839 {
9840 /* Vector SUB. */
9841 *cost += extra_cost->vect.alu;
9842 }
9843 else if (GET_MODE_CLASS (mode) == MODE_INT)
9844 {
9845 /* SUB(S). */
9846 *cost += extra_cost->alu.arith;
9847 }
9848 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9849 {
9850 /* FSUB. */
9851 *cost += extra_cost->fp[mode == DFmode].addsub;
9852 }
9853 }
9854 return true;
9855 }
9856
9857 case PLUS:
9858 {
9859 rtx new_op0;
9860
9861 op0 = XEXP (x, 0);
9862 op1 = XEXP (x, 1);
9863
9864 cost_plus:
9865 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9866 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9867 {
9868 /* CSINC. */
9869 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9870 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9871 return true;
9872 }
9873
9874 if (GET_MODE_CLASS (mode) == MODE_INT
9875 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9876 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9877 {
9878 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9879
9880 if (speed)
9881 /* ADD (immediate). */
9882 *cost += extra_cost->alu.arith;
9883 return true;
9884 }
9885
9886 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9887
9888 /* Look for ADD (extended register). */
9889 if (is_a <scalar_int_mode> (mode, &int_mode)
9890 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9891 {
9892 if (speed)
9893 *cost += extra_cost->alu.extend_arith;
9894
9895 op0 = aarch64_strip_extend (op0, true);
9896 *cost += rtx_cost (op0, VOIDmode,
9897 (enum rtx_code) GET_CODE (op0), 0, speed);
9898 return true;
9899 }
9900
9901 /* Strip any extend, leave shifts behind as we will
9902 cost them through mult_cost. */
9903 new_op0 = aarch64_strip_extend (op0, false);
9904
9905 if (GET_CODE (new_op0) == MULT
9906 || aarch64_shift_p (GET_CODE (new_op0)))
9907 {
9908 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
9909 speed);
9910 return true;
9911 }
9912
9913 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
9914
9915 if (speed)
9916 {
9917 if (VECTOR_MODE_P (mode))
9918 {
9919 /* Vector ADD. */
9920 *cost += extra_cost->vect.alu;
9921 }
9922 else if (GET_MODE_CLASS (mode) == MODE_INT)
9923 {
9924 /* ADD. */
9925 *cost += extra_cost->alu.arith;
9926 }
9927 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9928 {
9929 /* FADD. */
9930 *cost += extra_cost->fp[mode == DFmode].addsub;
9931 }
9932 }
9933 return true;
9934 }
9935
9936 case BSWAP:
9937 *cost = COSTS_N_INSNS (1);
9938
9939 if (speed)
9940 {
9941 if (VECTOR_MODE_P (mode))
9942 *cost += extra_cost->vect.alu;
9943 else
9944 *cost += extra_cost->alu.rev;
9945 }
9946 return false;
9947
9948 case IOR:
9949 if (aarch_rev16_p (x))
9950 {
9951 *cost = COSTS_N_INSNS (1);
9952
9953 if (speed)
9954 {
9955 if (VECTOR_MODE_P (mode))
9956 *cost += extra_cost->vect.alu;
9957 else
9958 *cost += extra_cost->alu.rev;
9959 }
9960 return true;
9961 }
9962
9963 if (aarch64_extr_rtx_p (x, &op0, &op1))
9964 {
9965 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9966 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9967 if (speed)
9968 *cost += extra_cost->alu.shift;
9969
9970 return true;
9971 }
9972 /* Fall through. */
9973 case XOR:
9974 case AND:
9975 cost_logic:
9976 op0 = XEXP (x, 0);
9977 op1 = XEXP (x, 1);
9978
9979 if (VECTOR_MODE_P (mode))
9980 {
9981 if (speed)
9982 *cost += extra_cost->vect.alu;
9983 return true;
9984 }
9985
9986 if (code == AND
9987 && GET_CODE (op0) == MULT
9988 && CONST_INT_P (XEXP (op0, 1))
9989 && CONST_INT_P (op1)
9990 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9991 INTVAL (op1)) != 0)
9992 {
9993 /* This is a UBFM/SBFM. */
9994 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9995 if (speed)
9996 *cost += extra_cost->alu.bfx;
9997 return true;
9998 }
9999
10000 if (is_int_mode (mode, &int_mode))
10001 {
10002 if (CONST_INT_P (op1))
10003 {
10004 /* We have a mask + shift version of a UBFIZ
10005 i.e. the *andim_ashift<mode>_bfiz pattern. */
10006 if (GET_CODE (op0) == ASHIFT
10007 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10008 XEXP (op0, 1)))
10009 {
10010 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10011 (enum rtx_code) code, 0, speed);
10012 if (speed)
10013 *cost += extra_cost->alu.bfx;
10014
10015 return true;
10016 }
10017 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10018 {
10019 /* We possibly get the immediate for free; this is not
10020 modelled. */
10021 *cost += rtx_cost (op0, int_mode,
10022 (enum rtx_code) code, 0, speed);
10023 if (speed)
10024 *cost += extra_cost->alu.logical;
10025
10026 return true;
10027 }
10028 }
10029 else
10030 {
10031 rtx new_op0 = op0;
10032
10033 /* Handle ORN, EON, or BIC. */
10034 if (GET_CODE (op0) == NOT)
10035 op0 = XEXP (op0, 0);
10036
10037 new_op0 = aarch64_strip_shift (op0);
10038
10039 /* If we had a shift on op0 then this is a logical-shift-
10040 by-register/immediate operation. Otherwise, this is just
10041 a logical operation. */
10042 if (speed)
10043 {
10044 if (new_op0 != op0)
10045 {
10046 /* Shift by immediate. */
10047 if (CONST_INT_P (XEXP (op0, 1)))
10048 *cost += extra_cost->alu.log_shift;
10049 else
10050 *cost += extra_cost->alu.log_shift_reg;
10051 }
10052 else
10053 *cost += extra_cost->alu.logical;
10054 }
10055
10056 /* In both cases we want to cost both operands. */
10057 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10058 0, speed);
10059 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10060 1, speed);
10061
10062 return true;
10063 }
10064 }
10065 return false;
10066
10067 case NOT:
10068 x = XEXP (x, 0);
10069 op0 = aarch64_strip_shift (x);
10070
10071 if (VECTOR_MODE_P (mode))
10072 {
10073 /* Vector NOT. */
10074 *cost += extra_cost->vect.alu;
10075 return false;
10076 }
10077
10078 /* MVN-shifted-reg. */
10079 if (op0 != x)
10080 {
10081 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10082
10083 if (speed)
10084 *cost += extra_cost->alu.log_shift;
10085
10086 return true;
10087 }
10088 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10089 Handle the second form here taking care that 'a' in the above can
10090 be a shift. */
10091 else if (GET_CODE (op0) == XOR)
10092 {
10093 rtx newop0 = XEXP (op0, 0);
10094 rtx newop1 = XEXP (op0, 1);
10095 rtx op0_stripped = aarch64_strip_shift (newop0);
10096
10097 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10098 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10099
10100 if (speed)
10101 {
10102 if (op0_stripped != newop0)
10103 *cost += extra_cost->alu.log_shift;
10104 else
10105 *cost += extra_cost->alu.logical;
10106 }
10107
10108 return true;
10109 }
10110 /* MVN. */
10111 if (speed)
10112 *cost += extra_cost->alu.logical;
10113
10114 return false;
10115
10116 case ZERO_EXTEND:
10117
10118 op0 = XEXP (x, 0);
10119 /* If a value is written in SI mode, then zero extended to DI
10120 mode, the operation will in general be free as a write to
10121 a 'w' register implicitly zeroes the upper bits of an 'x'
10122 register. However, if this is
10123
10124 (set (reg) (zero_extend (reg)))
10125
10126 we must cost the explicit register move. */
10127 if (mode == DImode
10128 && GET_MODE (op0) == SImode
10129 && outer == SET)
10130 {
10131 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10132
10133 /* If OP_COST is non-zero, then the cost of the zero extend
10134 is effectively the cost of the inner operation. Otherwise
10135 we have a MOV instruction and we take the cost from the MOV
10136 itself. This is true independently of whether we are
10137 optimizing for space or time. */
10138 if (op_cost)
10139 *cost = op_cost;
10140
10141 return true;
10142 }
10143 else if (MEM_P (op0))
10144 {
10145 /* All loads can zero extend to any size for free. */
10146 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10147 return true;
10148 }
10149
10150 op0 = aarch64_extend_bitfield_pattern_p (x);
10151 if (op0)
10152 {
10153 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10154 if (speed)
10155 *cost += extra_cost->alu.bfx;
10156 return true;
10157 }
10158
10159 if (speed)
10160 {
10161 if (VECTOR_MODE_P (mode))
10162 {
10163 /* UMOV. */
10164 *cost += extra_cost->vect.alu;
10165 }
10166 else
10167 {
10168 /* We generate an AND instead of UXTB/UXTH. */
10169 *cost += extra_cost->alu.logical;
10170 }
10171 }
10172 return false;
10173
10174 case SIGN_EXTEND:
10175 if (MEM_P (XEXP (x, 0)))
10176 {
10177 /* LDRSH. */
10178 if (speed)
10179 {
10180 rtx address = XEXP (XEXP (x, 0), 0);
10181 *cost += extra_cost->ldst.load_sign_extend;
10182
10183 *cost +=
10184 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10185 0, speed));
10186 }
10187 return true;
10188 }
10189
10190 op0 = aarch64_extend_bitfield_pattern_p (x);
10191 if (op0)
10192 {
10193 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10194 if (speed)
10195 *cost += extra_cost->alu.bfx;
10196 return true;
10197 }
10198
10199 if (speed)
10200 {
10201 if (VECTOR_MODE_P (mode))
10202 *cost += extra_cost->vect.alu;
10203 else
10204 *cost += extra_cost->alu.extend;
10205 }
10206 return false;
10207
10208 case ASHIFT:
10209 op0 = XEXP (x, 0);
10210 op1 = XEXP (x, 1);
10211
10212 if (CONST_INT_P (op1))
10213 {
10214 if (speed)
10215 {
10216 if (VECTOR_MODE_P (mode))
10217 {
10218 /* Vector shift (immediate). */
10219 *cost += extra_cost->vect.alu;
10220 }
10221 else
10222 {
10223 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10224 aliases. */
10225 *cost += extra_cost->alu.shift;
10226 }
10227 }
10228
10229 /* We can incorporate zero/sign extend for free. */
10230 if (GET_CODE (op0) == ZERO_EXTEND
10231 || GET_CODE (op0) == SIGN_EXTEND)
10232 op0 = XEXP (op0, 0);
10233
10234 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10235 return true;
10236 }
10237 else
10238 {
10239 if (VECTOR_MODE_P (mode))
10240 {
10241 if (speed)
10242 /* Vector shift (register). */
10243 *cost += extra_cost->vect.alu;
10244 }
10245 else
10246 {
10247 if (speed)
10248 /* LSLV. */
10249 *cost += extra_cost->alu.shift_reg;
10250
10251 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10252 && CONST_INT_P (XEXP (op1, 1))
10253 && known_eq (INTVAL (XEXP (op1, 1)),
10254 GET_MODE_BITSIZE (mode) - 1))
10255 {
10256 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10257 /* We already demanded XEXP (op1, 0) to be REG_P, so
10258 don't recurse into it. */
10259 return true;
10260 }
10261 }
10262 return false; /* All arguments need to be in registers. */
10263 }
10264
10265 case ROTATE:
10266 case ROTATERT:
10267 case LSHIFTRT:
10268 case ASHIFTRT:
10269 op0 = XEXP (x, 0);
10270 op1 = XEXP (x, 1);
10271
10272 if (CONST_INT_P (op1))
10273 {
10274 /* ASR (immediate) and friends. */
10275 if (speed)
10276 {
10277 if (VECTOR_MODE_P (mode))
10278 *cost += extra_cost->vect.alu;
10279 else
10280 *cost += extra_cost->alu.shift;
10281 }
10282
10283 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10284 return true;
10285 }
10286 else
10287 {
10288 if (VECTOR_MODE_P (mode))
10289 {
10290 if (speed)
10291 /* Vector shift (register). */
10292 *cost += extra_cost->vect.alu;
10293 }
10294 else
10295 {
10296 if (speed)
10297 /* ASR (register) and friends. */
10298 *cost += extra_cost->alu.shift_reg;
10299
10300 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10301 && CONST_INT_P (XEXP (op1, 1))
10302 && known_eq (INTVAL (XEXP (op1, 1)),
10303 GET_MODE_BITSIZE (mode) - 1))
10304 {
10305 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10306 /* We already demanded XEXP (op1, 0) to be REG_P, so
10307 don't recurse into it. */
10308 return true;
10309 }
10310 }
10311 return false; /* All arguments need to be in registers. */
10312 }
10313
10314 case SYMBOL_REF:
10315
10316 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10317 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10318 {
10319 /* LDR. */
10320 if (speed)
10321 *cost += extra_cost->ldst.load;
10322 }
10323 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10324 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10325 {
10326 /* ADRP, followed by ADD. */
10327 *cost += COSTS_N_INSNS (1);
10328 if (speed)
10329 *cost += 2 * extra_cost->alu.arith;
10330 }
10331 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10332 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10333 {
10334 /* ADR. */
10335 if (speed)
10336 *cost += extra_cost->alu.arith;
10337 }
10338
10339 if (flag_pic)
10340 {
10341 /* One extra load instruction, after accessing the GOT. */
10342 *cost += COSTS_N_INSNS (1);
10343 if (speed)
10344 *cost += extra_cost->ldst.load;
10345 }
10346 return true;
10347
10348 case HIGH:
10349 case LO_SUM:
10350 /* ADRP/ADD (immediate). */
10351 if (speed)
10352 *cost += extra_cost->alu.arith;
10353 return true;
10354
10355 case ZERO_EXTRACT:
10356 case SIGN_EXTRACT:
10357 /* UBFX/SBFX. */
10358 if (speed)
10359 {
10360 if (VECTOR_MODE_P (mode))
10361 *cost += extra_cost->vect.alu;
10362 else
10363 *cost += extra_cost->alu.bfx;
10364 }
10365
10366 /* We can trust that the immediates used will be correct (there
10367 are no by-register forms), so we need only cost op0. */
10368 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10369 return true;
10370
10371 case MULT:
10372 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10373 /* aarch64_rtx_mult_cost always handles recursion to its
10374 operands. */
10375 return true;
10376
10377 case MOD:
10378 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10379 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
10380 an unconditional negate. This case should only ever be reached through
10381 the set_smod_pow2_cheap check in expmed.c. */
10382 if (CONST_INT_P (XEXP (x, 1))
10383 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10384 && (mode == SImode || mode == DImode))
10385 {
10386 /* We expand to 4 instructions. Reset the baseline. */
10387 *cost = COSTS_N_INSNS (4);
10388
10389 if (speed)
10390 *cost += 2 * extra_cost->alu.logical
10391 + 2 * extra_cost->alu.arith;
10392
10393 return true;
10394 }
10395
10396 /* Fall-through. */
10397 case UMOD:
10398 if (speed)
10399 {
10400 /* Slightly prefer UMOD over SMOD. */
10401 if (VECTOR_MODE_P (mode))
10402 *cost += extra_cost->vect.alu;
10403 else if (GET_MODE_CLASS (mode) == MODE_INT)
10404 *cost += (extra_cost->mult[mode == DImode].add
10405 + extra_cost->mult[mode == DImode].idiv
10406 + (code == MOD ? 1 : 0));
10407 }
10408 return false; /* All arguments need to be in registers. */
10409
10410 case DIV:
10411 case UDIV:
10412 case SQRT:
10413 if (speed)
10414 {
10415 if (VECTOR_MODE_P (mode))
10416 *cost += extra_cost->vect.alu;
10417 else if (GET_MODE_CLASS (mode) == MODE_INT)
10418 /* There is no integer SQRT, so only DIV and UDIV can get
10419 here. */
10420 *cost += (extra_cost->mult[mode == DImode].idiv
10421 /* Slightly prefer UDIV over SDIV. */
10422 + (code == DIV ? 1 : 0));
10423 else
10424 *cost += extra_cost->fp[mode == DFmode].div;
10425 }
10426 return false; /* All arguments need to be in registers. */
10427
10428 case IF_THEN_ELSE:
10429 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10430 XEXP (x, 2), cost, speed);
10431
10432 case EQ:
10433 case NE:
10434 case GT:
10435 case GTU:
10436 case LT:
10437 case LTU:
10438 case GE:
10439 case GEU:
10440 case LE:
10441 case LEU:
10442
10443 return false; /* All arguments must be in registers. */
10444
10445 case FMA:
10446 op0 = XEXP (x, 0);
10447 op1 = XEXP (x, 1);
10448 op2 = XEXP (x, 2);
10449
10450 if (speed)
10451 {
10452 if (VECTOR_MODE_P (mode))
10453 *cost += extra_cost->vect.alu;
10454 else
10455 *cost += extra_cost->fp[mode == DFmode].fma;
10456 }
10457
10458 /* FMSUB, FNMADD, and FNMSUB are free. */
10459 if (GET_CODE (op0) == NEG)
10460 op0 = XEXP (op0, 0);
10461
10462 if (GET_CODE (op2) == NEG)
10463 op2 = XEXP (op2, 0);
10464
10465 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10466 and the by-element operand as operand 0. */
10467 if (GET_CODE (op1) == NEG)
10468 op1 = XEXP (op1, 0);
10469
10470 /* Catch vector-by-element operations. The by-element operand can
10471 either be (vec_duplicate (vec_select (x))) or just
10472 (vec_select (x)), depending on whether we are multiplying by
10473 a vector or a scalar.
10474
10475 Canonicalization is not very good in these cases: FMA4 will put the
10476 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
10477 if (GET_CODE (op0) == VEC_DUPLICATE)
10478 op0 = XEXP (op0, 0);
10479 else if (GET_CODE (op1) == VEC_DUPLICATE)
10480 op1 = XEXP (op1, 0);
10481
10482 if (GET_CODE (op0) == VEC_SELECT)
10483 op0 = XEXP (op0, 0);
10484 else if (GET_CODE (op1) == VEC_SELECT)
10485 op1 = XEXP (op1, 0);
10486
10487 /* If the remaining parameters are not registers,
10488 get the cost to put them into registers. */
10489 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10490 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10491 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10492 return true;
10493
10494 case FLOAT:
10495 case UNSIGNED_FLOAT:
10496 if (speed)
10497 *cost += extra_cost->fp[mode == DFmode].fromint;
10498 return false;
10499
10500 case FLOAT_EXTEND:
10501 if (speed)
10502 {
10503 if (VECTOR_MODE_P (mode))
10504 {
10505 /* Vector widening conversion. */
10506 *cost += extra_cost->vect.alu;
10507 }
10508 else
10509 *cost += extra_cost->fp[mode == DFmode].widen;
10510 }
10511 return false;
10512
10513 case FLOAT_TRUNCATE:
10514 if (speed)
10515 {
10516 if (VECTOR_MODE_P (mode))
10517 {
10518 /* Vector narrowing conversion. */
10519 *cost += extra_cost->vect.alu;
10520 }
10521 else
10522 *cost += extra_cost->fp[mode == DFmode].narrow;
10523 }
10524 return false;
10525
10526 case FIX:
10527 case UNSIGNED_FIX:
10528 x = XEXP (x, 0);
10529 /* Strip the rounding part. They will all be implemented
10530 by the fcvt* family of instructions anyway. */
10531 if (GET_CODE (x) == UNSPEC)
10532 {
10533 unsigned int uns_code = XINT (x, 1);
10534
10535 if (uns_code == UNSPEC_FRINTA
10536 || uns_code == UNSPEC_FRINTM
10537 || uns_code == UNSPEC_FRINTN
10538 || uns_code == UNSPEC_FRINTP
10539 || uns_code == UNSPEC_FRINTZ)
10540 x = XVECEXP (x, 0, 0);
10541 }
10542
10543 if (speed)
10544 {
10545 if (VECTOR_MODE_P (mode))
10546 *cost += extra_cost->vect.alu;
10547 else
10548 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10549 }
10550
10551 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10552 fixed-point fcvt. */
10553 if (GET_CODE (x) == MULT
10554 && ((VECTOR_MODE_P (mode)
10555 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10556 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10557 {
10558 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10559 0, speed);
10560 return true;
10561 }
10562
10563 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10564 return true;
10565
10566 case ABS:
10567 if (VECTOR_MODE_P (mode))
10568 {
10569 /* ABS (vector). */
10570 if (speed)
10571 *cost += extra_cost->vect.alu;
10572 }
10573 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10574 {
10575 op0 = XEXP (x, 0);
10576
10577 /* FABD, which is analogous to FADD. */
10578 if (GET_CODE (op0) == MINUS)
10579 {
10580 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10581 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10582 if (speed)
10583 *cost += extra_cost->fp[mode == DFmode].addsub;
10584
10585 return true;
10586 }
10587 /* Simple FABS is analogous to FNEG. */
10588 if (speed)
10589 *cost += extra_cost->fp[mode == DFmode].neg;
10590 }
10591 else
10592 {
10593 /* Integer ABS will either be split into
10594 two arithmetic instructions, or will be an ABS
10595 (scalar), which we don't model. */
10596 *cost = COSTS_N_INSNS (2);
10597 if (speed)
10598 *cost += 2 * extra_cost->alu.arith;
10599 }
10600 return false;
10601
10602 case SMAX:
10603 case SMIN:
10604 if (speed)
10605 {
10606 if (VECTOR_MODE_P (mode))
10607 *cost += extra_cost->vect.alu;
10608 else
10609 {
10610 /* FMAXNM/FMINNM/FMAX/FMIN.
10611 TODO: This may not be accurate for all implementations, but
10612 we do not model this in the cost tables. */
10613 *cost += extra_cost->fp[mode == DFmode].addsub;
10614 }
10615 }
10616 return false;
10617
10618 case UNSPEC:
10619 /* The floating point round to integer frint* instructions. */
10620 if (aarch64_frint_unspec_p (XINT (x, 1)))
10621 {
10622 if (speed)
10623 *cost += extra_cost->fp[mode == DFmode].roundint;
10624
10625 return false;
10626 }
10627
10628 if (XINT (x, 1) == UNSPEC_RBIT)
10629 {
10630 if (speed)
10631 *cost += extra_cost->alu.rev;
10632
10633 return false;
10634 }
10635 break;
10636
10637 case TRUNCATE:
10638
10639 /* Decompose <su>muldi3_highpart. */
10640 if (/* (truncate:DI */
10641 mode == DImode
10642 /* (lshiftrt:TI */
10643 && GET_MODE (XEXP (x, 0)) == TImode
10644 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10645 /* (mult:TI */
10646 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10647 /* (ANY_EXTEND:TI (reg:DI))
10648 (ANY_EXTEND:TI (reg:DI))) */
10649 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10650 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10651 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10652 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10653 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10654 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10655 /* (const_int 64) */
10656 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10657 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10658 {
10659 /* UMULH/SMULH. */
10660 if (speed)
10661 *cost += extra_cost->mult[mode == DImode].extend;
10662 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10663 mode, MULT, 0, speed);
10664 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10665 mode, MULT, 1, speed);
10666 return true;
10667 }
10668
10669 /* Fall through. */
10670 default:
10671 break;
10672 }
10673
10674 if (dump_file
10675 && flag_aarch64_verbose_cost)
10676 fprintf (dump_file,
10677 "\nFailed to cost RTX. Assuming default cost.\n");
10678
10679 return true;
10680 }
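
/* Editor's illustrative aside -- not part of the GCC sources.  The MOD
   case in the cost function above assumes that signed modulo by a power of
   two is expanded branch-free (NEGS, two ANDs and a CSNEG) instead of a
   division.  A minimal standalone sketch of the value that expansion
   computes, with C's truncate-toward-zero semantics (smod_pow2_sketch is a
   hypothetical name; N must be a power of two and the INT64_MIN corner
   case is ignored):  */

#include <assert.h>
#include <stdint.h>

static int64_t
smod_pow2_sketch (int64_t x, int64_t n)
{
  int64_t mask = n - 1;
  int64_t pos = x & mask;	  /* Remainder if X is non-negative (AND).  */
  int64_t neg = -((-x) & mask);	  /* Remainder if X is negative (NEGS/AND).  */
  return x < 0 ? neg : pos;	  /* Select, as CSNEG does.  */
}

int
main (void)
{
  assert (smod_pow2_sketch (7, 4) == 3);
  assert (smod_pow2_sketch (-7, 4) == -3);
  assert (smod_pow2_sketch (-8, 4) == 0);
  return 0;
}
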
10681
10682 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
10683 calculated for X. This cost is stored in *COST. Returns true
10684 if the total cost of X was calculated. */
10685 static bool
10686 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10687 int param, int *cost, bool speed)
10688 {
10689 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10690
10691 if (dump_file
10692 && flag_aarch64_verbose_cost)
10693 {
10694 print_rtl_single (dump_file, x);
10695 fprintf (dump_file, "\n%s cost: %d (%s)\n",
10696 speed ? "Hot" : "Cold",
10697 *cost, result ? "final" : "partial");
10698 }
10699
10700 return result;
10701 }
10702
10703 static int
10704 aarch64_register_move_cost (machine_mode mode,
10705 reg_class_t from_i, reg_class_t to_i)
10706 {
10707 enum reg_class from = (enum reg_class) from_i;
10708 enum reg_class to = (enum reg_class) to_i;
10709 const struct cpu_regmove_cost *regmove_cost
10710 = aarch64_tune_params.regmove_cost;
10711
10712 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10713 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10714 to = GENERAL_REGS;
10715
10716 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10717 from = GENERAL_REGS;
10718
10719 /* Moving between GPR and stack cost is the same as GP2GP. */
10720 if ((from == GENERAL_REGS && to == STACK_REG)
10721 || (to == GENERAL_REGS && from == STACK_REG))
10722 return regmove_cost->GP2GP;
10723
10724 /* To/From the stack register, we move via the gprs. */
10725 if (to == STACK_REG || from == STACK_REG)
10726 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10727 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10728
10729 if (known_eq (GET_MODE_SIZE (mode), 16))
10730 {
10731 /* 128-bit operations on general registers require 2 instructions. */
10732 if (from == GENERAL_REGS && to == GENERAL_REGS)
10733 return regmove_cost->GP2GP * 2;
10734 else if (from == GENERAL_REGS)
10735 return regmove_cost->GP2FP * 2;
10736 else if (to == GENERAL_REGS)
10737 return regmove_cost->FP2GP * 2;
10738
10739 /* When AdvSIMD instructions are disabled it is not possible to move
10740 a 128-bit value directly between Q registers. This is handled in
10741 secondary reload. A general register is used as a scratch to move
10742 the upper DI value and the lower DI value is moved directly,
10743 hence the cost is the sum of three moves. */
10744 if (! TARGET_SIMD)
10745 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10746
10747 return regmove_cost->FP2FP;
10748 }
10749
10750 if (from == GENERAL_REGS && to == GENERAL_REGS)
10751 return regmove_cost->GP2GP;
10752 else if (from == GENERAL_REGS)
10753 return regmove_cost->GP2FP;
10754 else if (to == GENERAL_REGS)
10755 return regmove_cost->FP2GP;
10756
10757 return regmove_cost->FP2FP;
10758 }
10759
10760 static int
10761 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10762 reg_class_t rclass ATTRIBUTE_UNUSED,
10763 bool in ATTRIBUTE_UNUSED)
10764 {
10765 return aarch64_tune_params.memmov_cost;
10766 }
10767
10768 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10769 to optimize 1.0/sqrt. */
10770
10771 static bool
10772 use_rsqrt_p (machine_mode mode)
10773 {
10774 return (!flag_trapping_math
10775 && flag_unsafe_math_optimizations
10776 && ((aarch64_tune_params.approx_modes->recip_sqrt
10777 & AARCH64_APPROX_MODE (mode))
10778 || flag_mrecip_low_precision_sqrt));
10779 }
10780
10781 /* Function to decide when to use the approximate reciprocal square root
10782 builtin. */
10783
10784 static tree
10785 aarch64_builtin_reciprocal (tree fndecl)
10786 {
10787 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10788
10789 if (!use_rsqrt_p (mode))
10790 return NULL_TREE;
10791 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10792 }
10793
10794 /* Emit instruction sequence to compute either the approximate square root
10795 or its approximate reciprocal, depending on the flag RECP, and return
10796 whether the sequence was emitted or not. */
10797
10798 bool
10799 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10800 {
10801 machine_mode mode = GET_MODE (dst);
10802
10803 if (GET_MODE_INNER (mode) == HFmode)
10804 {
10805 gcc_assert (!recp);
10806 return false;
10807 }
10808
10809 if (!recp)
10810 {
10811 if (!(flag_mlow_precision_sqrt
10812 || (aarch64_tune_params.approx_modes->sqrt
10813 & AARCH64_APPROX_MODE (mode))))
10814 return false;
10815
10816 if (flag_finite_math_only
10817 || flag_trapping_math
10818 || !flag_unsafe_math_optimizations
10819 || optimize_function_for_size_p (cfun))
10820 return false;
10821 }
10822 else
10823 /* Caller assumes we cannot fail. */
10824 gcc_assert (use_rsqrt_p (mode));
10825
10826 machine_mode mmsk = mode_for_int_vector (mode).require ();
10827 rtx xmsk = gen_reg_rtx (mmsk);
10828 if (!recp)
10829 /* When calculating the approximate square root, compare the
10830 argument with 0.0 and create a mask. */
10831 emit_insn (gen_rtx_SET (xmsk,
10832 gen_rtx_NEG (mmsk,
10833 gen_rtx_EQ (mmsk, src,
10834 CONST0_RTX (mode)))));
10835
10836 /* Estimate the approximate reciprocal square root. */
10837 rtx xdst = gen_reg_rtx (mode);
10838 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10839
10840 /* Iterate over the series twice for SF and thrice for DF. */
10841 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10842
10843 /* Optionally iterate over the series once less for faster performance
10844 at the cost of some accuracy. */
10845 if ((recp && flag_mrecip_low_precision_sqrt)
10846 || (!recp && flag_mlow_precision_sqrt))
10847 iterations--;
10848
10849 /* Iterate over the series to calculate the approximate reciprocal square
10850 root. */
10851 rtx x1 = gen_reg_rtx (mode);
10852 while (iterations--)
10853 {
10854 rtx x2 = gen_reg_rtx (mode);
10855 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10856
10857 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10858
10859 if (iterations > 0)
10860 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10861 }
10862
10863 if (!recp)
10864 {
10865 /* Qualify the approximate reciprocal square root when the argument is
10866 0.0 by squashing the intermediary result to 0.0. */
10867 rtx xtmp = gen_reg_rtx (mmsk);
10868 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10869 gen_rtx_SUBREG (mmsk, xdst, 0)));
10870 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10871
10872 /* Calculate the approximate square root. */
10873 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10874 }
10875
10876 /* Finalize the approximation. */
10877 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10878
10879 return true;
10880 }
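/* For reference, a scalar sketch of the sequence emitted above, assuming
   the architectural semantics of FRSQRTE (initial estimate of 1/sqrt (d))
   and FRSQRTS (which computes (3 - a * b) / 2):

       x = frsqrte (d);
       repeat:  x = x * frsqrts (d, x * x);   i.e.  x *= (3 - d * x * x) / 2

   Each Newton-Raphson step roughly doubles the number of correct bits,
   hence the two iterations for SF and three for DF above.  For the
   non-reciprocal case the zero mask squashes the estimate to 0.0 when
   d == 0.0 (avoiding 0 * Inf) and the result is finally multiplied by d,
   since sqrt (d) = d / sqrt (d).  */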
10881
10882 /* Emit the instruction sequence to compute the approximation for the division
10883 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10884
10885 bool
10886 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10887 {
10888 machine_mode mode = GET_MODE (quo);
10889
10890 if (GET_MODE_INNER (mode) == HFmode)
10891 return false;
10892
10893 bool use_approx_division_p = (flag_mlow_precision_div
10894 || (aarch64_tune_params.approx_modes->division
10895 & AARCH64_APPROX_MODE (mode)));
10896
10897 if (!flag_finite_math_only
10898 || flag_trapping_math
10899 || !flag_unsafe_math_optimizations
10900 || optimize_function_for_size_p (cfun)
10901 || !use_approx_division_p)
10902 return false;
10903
10904 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10905 return false;
10906
10907 /* Estimate the approximate reciprocal. */
10908 rtx xrcp = gen_reg_rtx (mode);
10909 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
10910
10911 /* Iterate over the series twice for SF and thrice for DF. */
10912 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10913
10914 /* Optionally do one fewer iteration for faster performance
10915 at the expense of accuracy. */
10916 if (flag_mlow_precision_div)
10917 iterations--;
10918
10919 /* Iterate over the series to calculate the approximate reciprocal. */
10920 rtx xtmp = gen_reg_rtx (mode);
10921 while (iterations--)
10922 {
10923 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
10924
10925 if (iterations > 0)
10926 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10927 }
10928
10929 if (num != CONST1_RTX (mode))
10930 {
10931 /* As the approximate reciprocal of DEN is already calculated, only
10932 calculate the approximate division when NUM is not 1.0. */
10933 rtx xnum = force_reg (mode, num);
10934 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10935 }
10936
10937 /* Finalize the approximation. */
10938 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10939 return true;
10940 }
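/* For reference, a scalar sketch of the sequence emitted above, assuming
   the architectural semantics of FRECPE (initial estimate of 1/den) and
   FRECPS (which computes 2 - a * b):

       x = frecpe (den);
       repeat:  x = x * frecps (x, den);   i.e.  x *= (2 - den * x)
       quo = num * x;                      (multiply skipped when NUM is 1.0)

   This is the standard Newton-Raphson iteration for the reciprocal; the
   code above folds the final refinement step into the final multiply.  */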
10941
10942 /* Return the number of instructions that can be issued per cycle. */
10943 static int
10944 aarch64_sched_issue_rate (void)
10945 {
10946 return aarch64_tune_params.issue_rate;
10947 }
10948
10949 static int
10950 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10951 {
10952 int issue_rate = aarch64_sched_issue_rate ();
10953
10954 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10955 }
10956
10957
10958 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10959 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10960 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10961
10962 static int
10963 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10964 int ready_index)
10965 {
10966 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10967 }
10968
10969
10970 /* Vectorizer cost model target hooks. */
10971
10972 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10973 static int
10974 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10975 tree vectype,
10976 int misalign ATTRIBUTE_UNUSED)
10977 {
10978 unsigned elements;
10979 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10980 bool fp = false;
10981
10982 if (vectype != NULL)
10983 fp = FLOAT_TYPE_P (vectype);
10984
10985 switch (type_of_cost)
10986 {
10987 case scalar_stmt:
10988 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10989
10990 case scalar_load:
10991 return costs->scalar_load_cost;
10992
10993 case scalar_store:
10994 return costs->scalar_store_cost;
10995
10996 case vector_stmt:
10997 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10998
10999 case vector_load:
11000 return costs->vec_align_load_cost;
11001
11002 case vector_store:
11003 return costs->vec_store_cost;
11004
11005 case vec_to_scalar:
11006 return costs->vec_to_scalar_cost;
11007
11008 case scalar_to_vec:
11009 return costs->scalar_to_vec_cost;
11010
11011 case unaligned_load:
11012 case vector_gather_load:
11013 return costs->vec_unalign_load_cost;
11014
11015 case unaligned_store:
11016 case vector_scatter_store:
11017 return costs->vec_unalign_store_cost;
11018
11019 case cond_branch_taken:
11020 return costs->cond_taken_branch_cost;
11021
11022 case cond_branch_not_taken:
11023 return costs->cond_not_taken_branch_cost;
11024
11025 case vec_perm:
11026 return costs->vec_permute_cost;
11027
11028 case vec_promote_demote:
11029 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11030
11031 case vec_construct:
11032 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11033 return elements / 2 + 1;
11034
11035 default:
11036 gcc_unreachable ();
11037 }
11038 }
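/* For example, under the formula above a vec_construct of a 4-element
   vector costs 4 / 2 + 1 = 3 units; estimated_poly_value is used so that
   variable-length SVE vectors are costed from a sensible constant
   estimate of their element count.  */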
11039
11040 /* Implement targetm.vectorize.add_stmt_cost. */
11041 static unsigned
11042 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11043 struct _stmt_vec_info *stmt_info, int misalign,
11044 enum vect_cost_model_location where)
11045 {
11046 unsigned *cost = (unsigned *) data;
11047 unsigned retval = 0;
11048
11049 if (flag_vect_cost_model)
11050 {
11051 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11052 int stmt_cost =
11053 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11054
11055 /* Statements in an inner loop relative to the loop being
11056 vectorized are weighted more heavily. The value here is
11057 arbitrary and could potentially be improved with analysis. */
11058 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11059 count *= 50; /* FIXME */
11060
11061 retval = (unsigned) (count * stmt_cost);
11062 cost[where] += retval;
11063 }
11064
11065 return retval;
11066 }
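/* For example, recording COUNT = 2 vector_load statements against the loop
   body adds 2 * vec_align_load_cost to cost[vect_body]; statements in an
   inner loop relative to the loop being vectorized are first scaled by the
   factor of 50 above.  */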
11067
11068 static void initialize_aarch64_code_model (struct gcc_options *);
11069
11070 /* Parse the TO_PARSE string and put the architecture struct that it
11071 selects into RES and the architectural features into ISA_FLAGS.
11072 Return an aarch64_parse_opt_result describing the parse result.
11073 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11074 When the TO_PARSE string contains an invalid extension,
11075 a copy of the string is created and stored to INVALID_EXTENSION. */
11076
11077 static enum aarch64_parse_opt_result
11078 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11079 uint64_t *isa_flags, std::string *invalid_extension)
11080 {
11081 const char *ext;
11082 const struct processor *arch;
11083 size_t len;
11084
11085 ext = strchr (to_parse, '+');
11086
11087 if (ext != NULL)
11088 len = ext - to_parse;
11089 else
11090 len = strlen (to_parse);
11091
11092 if (len == 0)
11093 return AARCH64_PARSE_MISSING_ARG;
11094
11095
11096 /* Loop through the list of supported ARCHes to find a match. */
11097 for (arch = all_architectures; arch->name != NULL; arch++)
11098 {
11099 if (strlen (arch->name) == len
11100 && strncmp (arch->name, to_parse, len) == 0)
11101 {
11102 uint64_t isa_temp = arch->flags;
11103
11104 if (ext != NULL)
11105 {
11106 /* TO_PARSE string contains at least one extension. */
11107 enum aarch64_parse_opt_result ext_res
11108 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11109
11110 if (ext_res != AARCH64_PARSE_OK)
11111 return ext_res;
11112 }
11113 /* Extension parsing was successful. Confirm the result
11114 arch and ISA flags. */
11115 *res = arch;
11116 *isa_flags = isa_temp;
11117 return AARCH64_PARSE_OK;
11118 }
11119 }
11120
11121 /* ARCH name not found in list. */
11122 return AARCH64_PARSE_INVALID_ARG;
11123 }
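/* For example, parsing "armv8.2-a+crypto" splits the string at the first
   '+': the prefix "armv8.2-a" is looked up in all_architectures and the
   remainder "+crypto" is handed to aarch64_parse_extension, which ORs the
   corresponding feature bits into ISA_FLAGS.  (Illustrative names; the
   accepted architectures and extensions are those listed in
   aarch64-arches.def and aarch64-option-extensions.def.)  */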
11124
11125 /* Parse the TO_PARSE string and put the result tuning in RES and the
11126 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11127 describing the parse result. If there is an error parsing, RES and
11128 ISA_FLAGS are left unchanged.
11129 When the TO_PARSE string contains an invalid extension,
11130 a copy of the string is created and stored to INVALID_EXTENSION. */
11131
11132 static enum aarch64_parse_opt_result
11133 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11134 uint64_t *isa_flags, std::string *invalid_extension)
11135 {
11136 const char *ext;
11137 const struct processor *cpu;
11138 size_t len;
11139
11140 ext = strchr (to_parse, '+');
11141
11142 if (ext != NULL)
11143 len = ext - to_parse;
11144 else
11145 len = strlen (to_parse);
11146
11147 if (len == 0)
11148 return AARCH64_PARSE_MISSING_ARG;
11149
11150
11151 /* Loop through the list of supported CPUs to find a match. */
11152 for (cpu = all_cores; cpu->name != NULL; cpu++)
11153 {
11154 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11155 {
11156 uint64_t isa_temp = cpu->flags;
11157
11158
11159 if (ext != NULL)
11160 {
11161 /* TO_PARSE string contains at least one extension. */
11162 enum aarch64_parse_opt_result ext_res
11163 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11164
11165 if (ext_res != AARCH64_PARSE_OK)
11166 return ext_res;
11167 }
11168 /* Extension parsing was successful. Confirm the result
11169 cpu and ISA flags. */
11170 *res = cpu;
11171 *isa_flags = isa_temp;
11172 return AARCH64_PARSE_OK;
11173 }
11174 }
11175
11176 /* CPU name not found in list. */
11177 return AARCH64_PARSE_INVALID_ARG;
11178 }
11179
11180 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11181 Return an aarch64_parse_opt_result describing the parse result.
11182 If the parsing fails, RES does not change. */
11183
11184 static enum aarch64_parse_opt_result
11185 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11186 {
11187 const struct processor *cpu;
11188
11189 /* Loop through the list of supported CPUs to find a match. */
11190 for (cpu = all_cores; cpu->name != NULL; cpu++)
11191 {
11192 if (strcmp (cpu->name, to_parse) == 0)
11193 {
11194 *res = cpu;
11195 return AARCH64_PARSE_OK;
11196 }
11197 }
11198
11199 /* CPU name not found in list. */
11200 return AARCH64_PARSE_INVALID_ARG;
11201 }
11202
11203 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11204 described in FLAG. If it is, return the index bit for that fusion type.
11205 If not, error (printing OPTION_NAME) and return zero. */
11206
11207 static unsigned int
11208 aarch64_parse_one_option_token (const char *token,
11209 size_t length,
11210 const struct aarch64_flag_desc *flag,
11211 const char *option_name)
11212 {
11213 for (; flag->name != NULL; flag++)
11214 {
11215 if (length == strlen (flag->name)
11216 && !strncmp (flag->name, token, length))
11217 return flag->flag;
11218 }
11219
11220 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11221 return 0;
11222 }
11223
11224 /* Parse OPTION which is a comma-separated list of flags to enable.
11225 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11226 default state we inherit from the CPU tuning structures. OPTION_NAME
11227 gives the top-level option we are parsing in the -moverride string,
11228 for use in error messages. */
11229
11230 static unsigned int
11231 aarch64_parse_boolean_options (const char *option,
11232 const struct aarch64_flag_desc *flags,
11233 unsigned int initial_state,
11234 const char *option_name)
11235 {
11236 const char separator = '.';
11237 const char* specs = option;
11238 const char* ntoken = option;
11239 unsigned int found_flags = initial_state;
11240
11241 while ((ntoken = strchr (specs, separator)))
11242 {
11243 size_t token_length = ntoken - specs;
11244 unsigned token_ops = aarch64_parse_one_option_token (specs,
11245 token_length,
11246 flags,
11247 option_name);
11248 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11249 in the token stream, reset the supported operations. So:
11250
11251 adrp+add.cmp+branch.none.adrp+add
11252
11253 would have the result of turning on only adrp+add fusion. */
11254 if (!token_ops)
11255 found_flags = 0;
11256
11257 found_flags |= token_ops;
11258 specs = ++ntoken;
11259 }
11260
11261 /* The string ended with a trailing separator; report an error. */
11262 if (!(*specs))
11263 {
11264 error ("%s string ill-formed\n", option_name);
11265 return 0;
11266 }
11267
11268 /* We still have one more token to parse. */
11269 size_t token_length = strlen (specs);
11270 unsigned token_ops = aarch64_parse_one_option_token (specs,
11271 token_length,
11272 flags,
11273 option_name);
11274 if (!token_ops)
11275 found_flags = 0;
11276
11277 found_flags |= token_ops;
11278 return found_flags;
11279 }
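/* For example, parsing "adrp+add.cmp+branch" for -moverride=fuse= ORs the
   flags for those two fusion pairs (nominally AARCH64_FUSE_ADRP_ADD and
   AARCH64_FUSE_CMP_BRANCH from aarch64-fusion-pairs.def) into the initial
   state inherited from the CPU tuning struct, while a "none" token anywhere
   in the list first clears the accumulated set, as in the
   adrp+add.cmp+branch.none.adrp+add example above.  */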
11280
11281 /* Support for overriding instruction fusion. */
11282
11283 static void
11284 aarch64_parse_fuse_string (const char *fuse_string,
11285 struct tune_params *tune)
11286 {
11287 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11288 aarch64_fusible_pairs,
11289 tune->fusible_ops,
11290 "fuse=");
11291 }
11292
11293 /* Support for overriding other tuning flags. */
11294
11295 static void
11296 aarch64_parse_tune_string (const char *tune_string,
11297 struct tune_params *tune)
11298 {
11299 tune->extra_tuning_flags
11300 = aarch64_parse_boolean_options (tune_string,
11301 aarch64_tuning_flags,
11302 tune->extra_tuning_flags,
11303 "tune=");
11304 }
11305
11306 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11307 Accept the valid SVE vector widths allowed by
11308 aarch64_sve_vector_bits_enum and use it to override sve_width
11309 in TUNE. */
11310
11311 static void
11312 aarch64_parse_sve_width_string (const char *tune_string,
11313 struct tune_params *tune)
11314 {
11315 int width = -1;
11316
11317 int n = sscanf (tune_string, "%d", &width);
11318 if (n == EOF)
11319 {
11320 error ("invalid format for sve_width");
11321 return;
11322 }
11323 switch (width)
11324 {
11325 case SVE_128:
11326 case SVE_256:
11327 case SVE_512:
11328 case SVE_1024:
11329 case SVE_2048:
11330 break;
11331 default:
11332 error ("invalid sve_width value: %d", width);
11333 }
11334 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11335 }
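/* For example, -moverride=sve_width=256 sets tune->sve_width to SVE_256;
   widths other than 128, 256, 512, 1024 or 2048 are diagnosed as invalid
   above.  */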
11336
11337 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
11338 we understand. If it is, extract the option string and hand it off to
11339 the appropriate function. */
11340
11341 void
11342 aarch64_parse_one_override_token (const char* token,
11343 size_t length,
11344 struct tune_params *tune)
11345 {
11346 const struct aarch64_tuning_override_function *fn
11347 = aarch64_tuning_override_functions;
11348
11349 const char *option_part = strchr (token, '=');
11350 if (!option_part)
11351 {
11352 error ("tuning string missing in option (%s)", token);
11353 return;
11354 }
11355
11356 /* Get the length of the option name. */
11357 length = option_part - token;
11358 /* Skip the '=' to get to the option string. */
11359 option_part++;
11360
11361 for (; fn->name != NULL; fn++)
11362 {
11363 if (!strncmp (fn->name, token, length))
11364 {
11365 fn->parse_override (option_part, tune);
11366 return;
11367 }
11368 }
11369
11370 error ("unknown tuning option (%s)",token);
11371 return;
11372 }
11373
11374 /* Validate and clamp the TLS size according to the code model in OPTS. */
11375
11376 static void
11377 initialize_aarch64_tls_size (struct gcc_options *opts)
11378 {
11379 if (aarch64_tls_size == 0)
11380 aarch64_tls_size = 24;
11381
11382 switch (opts->x_aarch64_cmodel_var)
11383 {
11384 case AARCH64_CMODEL_TINY:
11385 /* Both the default and maximum TLS size allowed under tiny are 1M, which
11386 needs two instructions to address, so we clamp the size to 24. */
11387 if (aarch64_tls_size > 24)
11388 aarch64_tls_size = 24;
11389 break;
11390 case AARCH64_CMODEL_SMALL:
11391 /* The maximum TLS size allowed under small is 4G. */
11392 if (aarch64_tls_size > 32)
11393 aarch64_tls_size = 32;
11394 break;
11395 case AARCH64_CMODEL_LARGE:
11396 /* The maximum TLS size allowed under large is 16E.
11397 FIXME: 16E requires a 64-bit offset, but we only support a 48-bit offset for now. */
11398 if (aarch64_tls_size > 48)
11399 aarch64_tls_size = 48;
11400 break;
11401 default:
11402 gcc_unreachable ();
11403 }
11404
11405 return;
11406 }
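/* For example, with -mcmodel=tiny an explicit -mtls-size=32 is clamped to
   24 bits (1M of TLS addressable with two instructions), while the small
   model allows up to 32 bits (4G) and the large model up to 48 bits.  */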
11407
11408 /* Parse STRING looking for options in the format:
11409 string :: option:string
11410 option :: name=substring
11411 name :: {a-z}
11412 substring :: defined by option. */
11413
11414 static void
11415 aarch64_parse_override_string (const char* input_string,
11416 struct tune_params* tune)
11417 {
11418 const char separator = ':';
11419 size_t string_length = strlen (input_string) + 1;
11420 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11421 char *string = string_root;
11422 strncpy (string, input_string, string_length);
11423 string[string_length - 1] = '\0';
11424
11425 char* ntoken = string;
11426
11427 while ((ntoken = strchr (string, separator)))
11428 {
11429 size_t token_length = ntoken - string;
11430 /* Make this substring look like a string. */
11431 *ntoken = '\0';
11432 aarch64_parse_one_override_token (string, token_length, tune);
11433 string = ++ntoken;
11434 }
11435
11436 /* One last option to parse. */
11437 aarch64_parse_one_override_token (string, strlen (string), tune);
11438 free (string_root);
11439 }
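/* For example, -moverride=fuse=adrp+add:sve_width=256 is split at each ':'
   into the tokens "fuse=adrp+add" and "sve_width=256", each of which is
   dispatched through aarch64_tuning_override_functions to the matching
   parser above.  */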
11440
11441
11442 static void
11443 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11444 {
11445 if (accepted_branch_protection_string)
11446 {
11447 opts->x_aarch64_branch_protection_string
11448 = xstrdup (accepted_branch_protection_string);
11449 }
11450
11451 /* PR 70044: We have to be careful about being called multiple times for the
11452 same function. This means all changes should be repeatable. */
11453
11454 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11455 Disable the frame pointer flag so the mid-end will not use a frame
11456 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11457 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11458 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11459 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11460 if (opts->x_flag_omit_frame_pointer == 0)
11461 opts->x_flag_omit_frame_pointer = 2;
11462
11463 /* If not optimizing for size, set the default
11464 alignment to what the target wants. */
11465 if (!opts->x_optimize_size)
11466 {
11467 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11468 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11469 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11470 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11471 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11472 opts->x_str_align_functions = aarch64_tune_params.function_align;
11473 }
11474
11475 /* We default to no pc-relative literal loads. */
11476
11477 aarch64_pcrelative_literal_loads = false;
11478
11479 /* If -mpc-relative-literal-loads is set on the command line, this
11480 implies that the user asked for PC relative literal loads. */
11481 if (opts->x_pcrelative_literal_loads == 1)
11482 aarch64_pcrelative_literal_loads = true;
11483
11484 /* In the tiny memory model it makes no sense to disallow PC relative
11485 literal pool loads. */
11486 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11487 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11488 aarch64_pcrelative_literal_loads = true;
11489
11490 /* When enabling the lower precision Newton series for the square root, also
11491 enable it for the reciprocal square root, since the latter is an
11492 intermediary step for the former. */
11493 if (flag_mlow_precision_sqrt)
11494 flag_mrecip_low_precision_sqrt = true;
11495 }
11496
11497 /* 'Unpack' the internal tuning structs and update the options
11498 in OPTS. The caller must have set up selected_tune and selected_arch
11499 as all the other target-specific codegen decisions are
11500 derived from them. */
11501
11502 void
11503 aarch64_override_options_internal (struct gcc_options *opts)
11504 {
11505 aarch64_tune_flags = selected_tune->flags;
11506 aarch64_tune = selected_tune->sched_core;
11507 /* Make a copy of the tuning parameters attached to the core, which
11508 we may later overwrite. */
11509 aarch64_tune_params = *(selected_tune->tune);
11510 aarch64_architecture_version = selected_arch->architecture_version;
11511
11512 if (opts->x_aarch64_override_tune_string)
11513 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11514 &aarch64_tune_params);
11515
11516 /* This target defaults to strict volatile bitfields. */
11517 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11518 opts->x_flag_strict_volatile_bitfields = 1;
11519
11520 if (aarch64_stack_protector_guard == SSP_GLOBAL
11521 && opts->x_aarch64_stack_protector_guard_offset_str)
11522 {
11523 error ("incompatible options %<-mstack-protector-guard=global%> and "
11524 "%<-mstack-protector-guard-offset=%s%>",
11525 aarch64_stack_protector_guard_offset_str);
11526 }
11527
11528 if (aarch64_stack_protector_guard == SSP_SYSREG
11529 && !(opts->x_aarch64_stack_protector_guard_offset_str
11530 && opts->x_aarch64_stack_protector_guard_reg_str))
11531 {
11532 error ("both %<-mstack-protector-guard-offset%> and "
11533 "%<-mstack-protector-guard-reg%> must be used "
11534 "with %<-mstack-protector-guard=sysreg%>");
11535 }
11536
11537 if (opts->x_aarch64_stack_protector_guard_reg_str)
11538 {
11539 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11540 error ("specify a system register with a small string length.");
11541 }
11542
11543 if (opts->x_aarch64_stack_protector_guard_offset_str)
11544 {
11545 char *end;
11546 const char *str = aarch64_stack_protector_guard_offset_str;
11547 errno = 0;
11548 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11549 if (!*str || *end || errno)
11550 error ("%qs is not a valid offset in %qs", str,
11551 "-mstack-protector-guard-offset=");
11552 aarch64_stack_protector_guard_offset = offs;
11553 }
11554
11555 initialize_aarch64_code_model (opts);
11556 initialize_aarch64_tls_size (opts);
11557
11558 int queue_depth = 0;
11559 switch (aarch64_tune_params.autoprefetcher_model)
11560 {
11561 case tune_params::AUTOPREFETCHER_OFF:
11562 queue_depth = -1;
11563 break;
11564 case tune_params::AUTOPREFETCHER_WEAK:
11565 queue_depth = 0;
11566 break;
11567 case tune_params::AUTOPREFETCHER_STRONG:
11568 queue_depth = max_insn_queue_index + 1;
11569 break;
11570 default:
11571 gcc_unreachable ();
11572 }
11573
11574 /* We don't mind passing in global_options_set here as we don't use
11575 the *options_set structs anyway. */
11576 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11577 queue_depth,
11578 opts->x_param_values,
11579 global_options_set.x_param_values);
11580
11581 /* Set up parameters to be used in prefetching algorithm. Do not
11582 override the defaults unless we are tuning for a core we have
11583 researched values for. */
11584 if (aarch64_tune_params.prefetch->num_slots > 0)
11585 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11586 aarch64_tune_params.prefetch->num_slots,
11587 opts->x_param_values,
11588 global_options_set.x_param_values);
11589 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11590 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11591 aarch64_tune_params.prefetch->l1_cache_size,
11592 opts->x_param_values,
11593 global_options_set.x_param_values);
11594 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11595 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11596 aarch64_tune_params.prefetch->l1_cache_line_size,
11597 opts->x_param_values,
11598 global_options_set.x_param_values);
11599 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11600 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11601 aarch64_tune_params.prefetch->l2_cache_size,
11602 opts->x_param_values,
11603 global_options_set.x_param_values);
11604 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11605 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11606 0,
11607 opts->x_param_values,
11608 global_options_set.x_param_values);
11609 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11610 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11611 aarch64_tune_params.prefetch->minimum_stride,
11612 opts->x_param_values,
11613 global_options_set.x_param_values);
11614
11615 /* Use the alternative scheduling-pressure algorithm by default. */
11616 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11617 opts->x_param_values,
11618 global_options_set.x_param_values);
11619
11620 /* If the user hasn't changed it via configure then set the default to 64 KB
11621 for the backend. */
11622 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11623 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11624 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11625 opts->x_param_values,
11626 global_options_set.x_param_values);
11627
11628 /* Validate the guard size. */
11629 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11630
11631 /* Enforce that interval is the same size as size so the mid-end does the
11632 right thing. */
11633 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11634 guard_size,
11635 opts->x_param_values,
11636 global_options_set.x_param_values);
11637
11638 /* The maybe_set calls won't update the value if the user has explicitly set
11639 one. Which means we need to validate that probing interval and guard size
11640 are equal. */
11641 int probe_interval
11642 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11643 if (guard_size != probe_interval)
11644 error ("stack clash guard size %<%d%> must be equal to probing interval "
11645 "%<%d%>", guard_size, probe_interval);
11646
11647 /* Enable sw prefetching at specified optimization level for
11648 CPUS that have prefetch. Lower optimization level threshold by 1
11649 when profiling is enabled. */
11650 if (opts->x_flag_prefetch_loop_arrays < 0
11651 && !opts->x_optimize_size
11652 && aarch64_tune_params.prefetch->default_opt_level >= 0
11653 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11654 opts->x_flag_prefetch_loop_arrays = 1;
11655
11656 if (opts->x_aarch64_arch_string == NULL)
11657 opts->x_aarch64_arch_string = selected_arch->name;
11658 if (opts->x_aarch64_cpu_string == NULL)
11659 opts->x_aarch64_cpu_string = selected_cpu->name;
11660 if (opts->x_aarch64_tune_string == NULL)
11661 opts->x_aarch64_tune_string = selected_tune->name;
11662
11663 aarch64_override_options_after_change_1 (opts);
11664 }
11665
11666 /* Print a hint with a suggestion for a core or architecture name that
11667 most closely resembles what the user passed in STR. ARCH is true if
11668 the user is asking for an architecture name. ARCH is false if the user
11669 is asking for a core name. */
11670
11671 static void
11672 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11673 {
11674 auto_vec<const char *> candidates;
11675 const struct processor *entry = arch ? all_architectures : all_cores;
11676 for (; entry->name != NULL; entry++)
11677 candidates.safe_push (entry->name);
11678
11679 #ifdef HAVE_LOCAL_CPU_DETECT
11680 /* Add also "native" as possible value. */
11681 if (arch)
11682 candidates.safe_push ("native");
11683 #endif
11684
11685 char *s;
11686 const char *hint = candidates_list_and_hint (str, s, candidates);
11687 if (hint)
11688 inform (input_location, "valid arguments are: %s;"
11689 " did you mean %qs?", s, hint);
11690 else
11691 inform (input_location, "valid arguments are: %s", s);
11692
11693 XDELETEVEC (s);
11694 }
11695
11696 /* Print a hint with a suggestion for a core name that most closely resembles
11697 what the user passed in STR. */
11698
11699 inline static void
11700 aarch64_print_hint_for_core (const char *str)
11701 {
11702 aarch64_print_hint_for_core_or_arch (str, false);
11703 }
11704
11705 /* Print a hint with a suggestion for an architecture name that most closely
11706 resembles what the user passed in STR. */
11707
11708 inline static void
11709 aarch64_print_hint_for_arch (const char *str)
11710 {
11711 aarch64_print_hint_for_core_or_arch (str, true);
11712 }
11713
11714
11715 /* Print a hint with a suggestion for an extension name
11716 that most closely resembles what the user passed in STR. */
11717
11718 void
11719 aarch64_print_hint_for_extensions (const std::string &str)
11720 {
11721 auto_vec<const char *> candidates;
11722 aarch64_get_all_extension_candidates (&candidates);
11723 char *s;
11724 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11725 if (hint)
11726 inform (input_location, "valid arguments are: %s;"
11727 " did you mean %qs?", s, hint);
11728 else
11729 inform (input_location, "valid arguments are: %s", s);
11730
11731 XDELETEVEC (s);
11732 }
11733
11734 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11735 specified in STR and throw errors if appropriate. Put the results if
11736 they are valid in RES and ISA_FLAGS. Return whether the option is
11737 valid. */
11738
11739 static bool
11740 aarch64_validate_mcpu (const char *str, const struct processor **res,
11741 uint64_t *isa_flags)
11742 {
11743 std::string invalid_extension;
11744 enum aarch64_parse_opt_result parse_res
11745 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11746
11747 if (parse_res == AARCH64_PARSE_OK)
11748 return true;
11749
11750 switch (parse_res)
11751 {
11752 case AARCH64_PARSE_MISSING_ARG:
11753 error ("missing cpu name in %<-mcpu=%s%>", str);
11754 break;
11755 case AARCH64_PARSE_INVALID_ARG:
11756 error ("unknown value %qs for %<-mcpu%>", str);
11757 aarch64_print_hint_for_core (str);
11758 break;
11759 case AARCH64_PARSE_INVALID_FEATURE:
11760 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11761 invalid_extension.c_str (), str);
11762 aarch64_print_hint_for_extensions (invalid_extension);
11763 break;
11764 default:
11765 gcc_unreachable ();
11766 }
11767
11768 return false;
11769 }
11770
11771 /* Parses CONST_STR for branch protection features specified in
11772 aarch64_branch_protect_types, and sets any global variables required. Returns
11773 the parsing result and assigns LAST_STR to the last processed token from
11774 CONST_STR so that it can be used for error reporting. */
11775
11776 static enum
11777 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
11778 char** last_str)
11779 {
11780 char *str_root = xstrdup (const_str);
11781 char* token_save = NULL;
11782 char *str = strtok_r (str_root, "+", &token_save);
11783 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11784 if (!str)
11785 res = AARCH64_PARSE_MISSING_ARG;
11786 else
11787 {
11788 char *next_str = strtok_r (NULL, "+", &token_save);
11789 /* Reset the branch protection features to their defaults. */
11790 aarch64_handle_no_branch_protection (NULL, NULL);
11791
11792 while (str && res == AARCH64_PARSE_OK)
11793 {
11794 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11795 bool found = false;
11796 /* Search for this type. */
11797 while (type && type->name && !found && res == AARCH64_PARSE_OK)
11798 {
11799 if (strcmp (str, type->name) == 0)
11800 {
11801 found = true;
11802 res = type->handler (str, next_str);
11803 str = next_str;
11804 next_str = strtok_r (NULL, "+", &token_save);
11805 }
11806 else
11807 type++;
11808 }
11809 if (found && res == AARCH64_PARSE_OK)
11810 {
11811 bool found_subtype = true;
11812 /* Loop through each token until we find one that isn't a
11813 subtype. */
11814 while (found_subtype)
11815 {
11816 found_subtype = false;
11817 const aarch64_branch_protect_type *subtype = type->subtypes;
11818 /* Search for the subtype. */
11819 while (str && subtype && subtype->name && !found_subtype
11820 && res == AARCH64_PARSE_OK)
11821 {
11822 if (strcmp (str, subtype->name) == 0)
11823 {
11824 found_subtype = true;
11825 res = subtype->handler (str, next_str);
11826 str = next_str;
11827 next_str = strtok_r (NULL, "+", &token_save);
11828 }
11829 else
11830 subtype++;
11831 }
11832 }
11833 }
11834 else if (!found)
11835 res = AARCH64_PARSE_INVALID_ARG;
11836 }
11837 }
11838 /* Copy the last processed token into the argument to pass it back.
11839 Used by option and attribute validation to print the offending token. */
11840 if (last_str)
11841 {
11842 if (str) strcpy (*last_str, str);
11843 else *last_str = NULL;
11844 }
11845 if (res == AARCH64_PARSE_OK)
11846 {
11847 /* If needed, alloc the accepted string then copy in const_str.
11848 Used by override_option_after_change_1. */
11849 if (!accepted_branch_protection_string)
11850 accepted_branch_protection_string = (char *) xmalloc (
11851 BRANCH_PROTECT_STR_MAX
11852 + 1);
11853 strncpy (accepted_branch_protection_string, const_str,
11854 BRANCH_PROTECT_STR_MAX + 1);
11855 /* Forcibly null-terminate. */
11856 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11857 }
11858 return res;
11859 }
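/* For example, "pac-ret+leaf+b-key" is tokenized at each '+': "pac-ret"
   matches a top-level entry in aarch64_branch_protect_types, after which
   "leaf" and "b-key" are consumed as its subtypes; a subsequent "+bti"
   token would then match the next top-level entry.  (Illustrative token
   names; the accepted set is whatever aarch64_branch_protect_types
   defines.)  */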
11860
11861 static bool
11862 aarch64_validate_mbranch_protection (const char *const_str)
11863 {
11864 char *str = (char *) xmalloc (strlen (const_str) + 1);
11865 enum aarch64_parse_opt_result res =
11866 aarch64_parse_branch_protection (const_str, &str);
11867 if (res == AARCH64_PARSE_INVALID_ARG)
11868 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
11869 else if (res == AARCH64_PARSE_MISSING_ARG)
11870 error ("missing argument for %<-mbranch-protection=%>");
11871 free (str);
11872 return res == AARCH64_PARSE_OK;
11873 }
11874
11875 /* Validate a command-line -march option. Parse the arch and extensions
11876 (if any) specified in STR and throw errors if appropriate. Put the
11877 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11878 option is valid. */
11879
11880 static bool
11881 aarch64_validate_march (const char *str, const struct processor **res,
11882 uint64_t *isa_flags)
11883 {
11884 std::string invalid_extension;
11885 enum aarch64_parse_opt_result parse_res
11886 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11887
11888 if (parse_res == AARCH64_PARSE_OK)
11889 return true;
11890
11891 switch (parse_res)
11892 {
11893 case AARCH64_PARSE_MISSING_ARG:
11894 error ("missing arch name in %<-march=%s%>", str);
11895 break;
11896 case AARCH64_PARSE_INVALID_ARG:
11897 error ("unknown value %qs for %<-march%>", str);
11898 aarch64_print_hint_for_arch (str);
11899 break;
11900 case AARCH64_PARSE_INVALID_FEATURE:
11901 error ("invalid feature modifier %qs in %<-march=%s%>",
11902 invalid_extension.c_str (), str);
11903 aarch64_print_hint_for_extensions (invalid_extension);
11904 break;
11905 default:
11906 gcc_unreachable ();
11907 }
11908
11909 return false;
11910 }
11911
11912 /* Validate a command-line -mtune option. Parse the cpu
11913 specified in STR and throw errors if appropriate. Put the
11914 result, if it is valid, in RES. Return whether the option is
11915 valid. */
11916
11917 static bool
11918 aarch64_validate_mtune (const char *str, const struct processor **res)
11919 {
11920 enum aarch64_parse_opt_result parse_res
11921 = aarch64_parse_tune (str, res);
11922
11923 if (parse_res == AARCH64_PARSE_OK)
11924 return true;
11925
11926 switch (parse_res)
11927 {
11928 case AARCH64_PARSE_MISSING_ARG:
11929 error ("missing cpu name in %<-mtune=%s%>", str);
11930 break;
11931 case AARCH64_PARSE_INVALID_ARG:
11932 error ("unknown value %qs for %<-mtune%>", str);
11933 aarch64_print_hint_for_core (str);
11934 break;
11935 default:
11936 gcc_unreachable ();
11937 }
11938 return false;
11939 }
11940
11941 /* Return the CPU corresponding to the enum CPU.
11942 If it doesn't specify a cpu, return the default. */
11943
11944 static const struct processor *
11945 aarch64_get_tune_cpu (enum aarch64_processor cpu)
11946 {
11947 if (cpu != aarch64_none)
11948 return &all_cores[cpu];
11949
11950 /* The & 0x3f is to extract the bottom 6 bits that encode the
11951 default cpu as selected by the --with-cpu GCC configure option
11952 in config.gcc.
11953 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11954 flags mechanism should be reworked to make it more sane. */
11955 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11956 }
11957
11958 /* Return the architecture corresponding to the enum ARCH.
11959 If it doesn't specify a valid architecture, return the default. */
11960
11961 static const struct processor *
11962 aarch64_get_arch (enum aarch64_arch arch)
11963 {
11964 if (arch != aarch64_no_arch)
11965 return &all_architectures[arch];
11966
11967 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11968
11969 return &all_architectures[cpu->arch];
11970 }
11971
11972 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
11973
11974 static poly_uint16
11975 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
11976 {
11977 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11978 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11979 deciding which .md file patterns to use and when deciding whether
11980 something is a legitimate address or constant. */
11981 if (value == SVE_SCALABLE || value == SVE_128)
11982 return poly_uint16 (2, 2);
11983 else
11984 return (int) value / 64;
11985 }
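/* For example, -msve-vector-bits=512 yields a constant VG of 512 / 64 = 8,
   while both SVE_SCALABLE and SVE_128 produce the poly_uint16 (2, 2),
   i.e. VG = 2 + 2 * x where the indeterminate x counts the 128-bit blocks
   the runtime vector length has beyond the 128-bit minimum.  */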
11986
11987 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
11988 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
11989 tuning structs. In particular it must set selected_tune and
11990 aarch64_isa_flags that define the available ISA features and tuning
11991 decisions. It must also set selected_arch as this will be used to
11992 output the .arch asm tags for each function. */
11993
11994 static void
11995 aarch64_override_options (void)
11996 {
11997 uint64_t cpu_isa = 0;
11998 uint64_t arch_isa = 0;
11999 aarch64_isa_flags = 0;
12000
12001 bool valid_cpu = true;
12002 bool valid_tune = true;
12003 bool valid_arch = true;
12004
12005 selected_cpu = NULL;
12006 selected_arch = NULL;
12007 selected_tune = NULL;
12008
12009 if (aarch64_branch_protection_string)
12010 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12011
12012 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12013 If either of -march or -mtune is given, they override their
12014 respective component of -mcpu. */
12015 if (aarch64_cpu_string)
12016 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12017 &cpu_isa);
12018
12019 if (aarch64_arch_string)
12020 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12021 &arch_isa);
12022
12023 if (aarch64_tune_string)
12024 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12025
12026 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12027 SUBTARGET_OVERRIDE_OPTIONS;
12028 #endif
12029
12030 /* If the user did not specify a processor, choose the default
12031 one for them. This will be the CPU set during configuration using
12032 --with-cpu, otherwise it is "generic". */
12033 if (!selected_cpu)
12034 {
12035 if (selected_arch)
12036 {
12037 selected_cpu = &all_cores[selected_arch->ident];
12038 aarch64_isa_flags = arch_isa;
12039 explicit_arch = selected_arch->arch;
12040 }
12041 else
12042 {
12043 /* Get default configure-time CPU. */
12044 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12045 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12046 }
12047
12048 if (selected_tune)
12049 explicit_tune_core = selected_tune->ident;
12050 }
12051 /* If both -mcpu and -march are specified check that they are architecturally
12052 compatible, warn if they're not and prefer the -march ISA flags. */
12053 else if (selected_arch)
12054 {
12055 if (selected_arch->arch != selected_cpu->arch)
12056 {
12057 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12058 all_architectures[selected_cpu->arch].name,
12059 selected_arch->name);
12060 }
12061 aarch64_isa_flags = arch_isa;
12062 explicit_arch = selected_arch->arch;
12063 explicit_tune_core = selected_tune ? selected_tune->ident
12064 : selected_cpu->ident;
12065 }
12066 else
12067 {
12068 /* -mcpu but no -march. */
12069 aarch64_isa_flags = cpu_isa;
12070 explicit_tune_core = selected_tune ? selected_tune->ident
12071 : selected_cpu->ident;
12072 gcc_assert (selected_cpu);
12073 selected_arch = &all_architectures[selected_cpu->arch];
12074 explicit_arch = selected_arch->arch;
12075 }
12076
12077 /* Set the arch as well, as we will need it when outputting
12078 the .arch directive in assembly. */
12079 if (!selected_arch)
12080 {
12081 gcc_assert (selected_cpu);
12082 selected_arch = &all_architectures[selected_cpu->arch];
12083 }
12084
12085 if (!selected_tune)
12086 selected_tune = selected_cpu;
12087
12088 if (aarch64_enable_bti == 2)
12089 {
12090 #ifdef TARGET_ENABLE_BTI
12091 aarch64_enable_bti = 1;
12092 #else
12093 aarch64_enable_bti = 0;
12094 #endif
12095 }
12096
12097 /* Return address signing is currently not supported for ILP32 targets. For
12098 LP64 targets use the configured option in the absence of a command-line
12099 option for -mbranch-protection. */
12100 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12101 {
12102 #ifdef TARGET_ENABLE_PAC_RET
12103 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12104 #else
12105 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12106 #endif
12107 }
12108
12109 #ifndef HAVE_AS_MABI_OPTION
12110 /* The compiler may have been configured with 2.23.* binutils, which does
12111 not have support for ILP32. */
12112 if (TARGET_ILP32)
12113 error ("assembler does not support %<-mabi=ilp32%>");
12114 #endif
12115
12116 /* Convert -msve-vector-bits to a VG count. */
12117 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12118
12119 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12120 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12121
12122 /* Make sure we properly set up the explicit options. */
12123 if ((aarch64_cpu_string && valid_cpu)
12124 || (aarch64_tune_string && valid_tune))
12125 gcc_assert (explicit_tune_core != aarch64_none);
12126
12127 if ((aarch64_cpu_string && valid_cpu)
12128 || (aarch64_arch_string && valid_arch))
12129 gcc_assert (explicit_arch != aarch64_no_arch);
12130
12131 /* The pass to insert speculation tracking runs before
12132 shrink-wrapping and the latter does not know how to update the
12133 tracking status. So disable it in this case. */
12134 if (aarch64_track_speculation)
12135 flag_shrink_wrap = 0;
12136
12137 aarch64_override_options_internal (&global_options);
12138
12139 /* Save these options as the default ones in case we push and pop them later
12140 while processing functions with potential target attributes. */
12141 target_option_default_node = target_option_current_node
12142 = build_target_option_node (&global_options);
12143 }
12144
12145 /* Implement targetm.override_options_after_change. */
12146
12147 static void
12148 aarch64_override_options_after_change (void)
12149 {
12150 aarch64_override_options_after_change_1 (&global_options);
12151 }
12152
12153 static struct machine_function *
12154 aarch64_init_machine_status (void)
12155 {
12156 struct machine_function *machine;
12157 machine = ggc_cleared_alloc<machine_function> ();
12158 return machine;
12159 }
12160
12161 void
12162 aarch64_init_expanders (void)
12163 {
12164 init_machine_status = aarch64_init_machine_status;
12165 }
12166
12167 /* Resolve the code model selected in OPTS, taking -fpic/-fPIC into account, and set aarch64_cmodel. */
12168 static void
12169 initialize_aarch64_code_model (struct gcc_options *opts)
12170 {
12171 if (opts->x_flag_pic)
12172 {
12173 switch (opts->x_aarch64_cmodel_var)
12174 {
12175 case AARCH64_CMODEL_TINY:
12176 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12177 break;
12178 case AARCH64_CMODEL_SMALL:
12179 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12180 aarch64_cmodel = (flag_pic == 2
12181 ? AARCH64_CMODEL_SMALL_PIC
12182 : AARCH64_CMODEL_SMALL_SPIC);
12183 #else
12184 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12185 #endif
12186 break;
12187 case AARCH64_CMODEL_LARGE:
12188 sorry ("code model %qs with %<-f%s%>", "large",
12189 opts->x_flag_pic > 1 ? "PIC" : "pic");
12190 break;
12191 default:
12192 gcc_unreachable ();
12193 }
12194 }
12195 else
12196 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12197 }
12198
12199 /* Implement TARGET_OPTION_SAVE. */
12200
12201 static void
12202 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12203 {
12204 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12205 ptr->x_aarch64_branch_protection_string
12206 = opts->x_aarch64_branch_protection_string;
12207 }
12208
12209 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12210 using the information saved in PTR. */
12211
12212 static void
12213 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12214 {
12215 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12216 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12217 opts->x_explicit_arch = ptr->x_explicit_arch;
12218 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12219 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12220 opts->x_aarch64_branch_protection_string
12221 = ptr->x_aarch64_branch_protection_string;
12222 if (opts->x_aarch64_branch_protection_string)
12223 {
12224 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12225 NULL);
12226 }
12227
12228 aarch64_override_options_internal (opts);
12229 }
12230
12231 /* Implement TARGET_OPTION_PRINT. */
12232
12233 static void
12234 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12235 {
12236 const struct processor *cpu
12237 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12238 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
12239 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12240 std::string extension
12241 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12242
12243 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12244 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12245 arch->name, extension.c_str ());
12246 }
12247
12248 static GTY(()) tree aarch64_previous_fndecl;
12249
12250 void
12251 aarch64_reset_previous_fndecl (void)
12252 {
12253 aarch64_previous_fndecl = NULL;
12254 }
12255
12256 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12257 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12258 make sure optab availability predicates are recomputed when necessary. */
12259
12260 void
12261 aarch64_save_restore_target_globals (tree new_tree)
12262 {
12263 if (TREE_TARGET_GLOBALS (new_tree))
12264 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12265 else if (new_tree == target_option_default_node)
12266 restore_target_globals (&default_target_globals);
12267 else
12268 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12269 }
12270
12271 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12272 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12273 of the function, if such exists. This function may be called multiple
12274 times on a single function so use aarch64_previous_fndecl to avoid
12275 setting up identical state. */
12276
12277 static void
12278 aarch64_set_current_function (tree fndecl)
12279 {
12280 if (!fndecl || fndecl == aarch64_previous_fndecl)
12281 return;
12282
12283 tree old_tree = (aarch64_previous_fndecl
12284 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12285 : NULL_TREE);
12286
12287 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12288
12289 /* If current function has no attributes but the previous one did,
12290 use the default node. */
12291 if (!new_tree && old_tree)
12292 new_tree = target_option_default_node;
12293
12294 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12295 the default have been handled by aarch64_save_restore_target_globals from
12296 aarch64_pragma_target_parse. */
12297 if (old_tree == new_tree)
12298 return;
12299
12300 aarch64_previous_fndecl = fndecl;
12301
12302 /* First set the target options. */
12303 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12304
12305 aarch64_save_restore_target_globals (new_tree);
12306 }
12307
12308 /* Enum describing the various ways we can handle attributes.
12309 In many cases we can reuse the generic option handling machinery. */
12310
12311 enum aarch64_attr_opt_type
12312 {
12313 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12314 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12315 aarch64_attr_enum, /* Attribute sets an enum variable. */
12316 aarch64_attr_custom /* Attribute requires a custom handling function. */
12317 };
12318
12319 /* All the information needed to handle a target attribute.
12320 NAME is the name of the attribute.
12321 ATTR_TYPE specifies the type of behavior of the attribute as described
12322 in the definition of enum aarch64_attr_opt_type.
12323 ALLOW_NEG is true if the attribute supports a "no-" form.
12324 HANDLER is the function that takes the attribute string as an argument
12325 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12326 OPT_NUM is the enum specifying the option that the attribute modifies.
12327 This is needed for attributes that mirror the behavior of a command-line
12328 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12329 aarch64_attr_enum. */
12330
12331 struct aarch64_attribute_info
12332 {
12333 const char *name;
12334 enum aarch64_attr_opt_type attr_type;
12335 bool allow_neg;
12336 bool (*handler) (const char *);
12337 enum opt_code opt_num;
12338 };
12339
12340 /* Handle the ARCH_STR argument to the arch= target attribute. */
12341
12342 static bool
12343 aarch64_handle_attr_arch (const char *str)
12344 {
12345 const struct processor *tmp_arch = NULL;
12346 std::string invalid_extension;
12347 enum aarch64_parse_opt_result parse_res
12348 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12349
12350 if (parse_res == AARCH64_PARSE_OK)
12351 {
12352 gcc_assert (tmp_arch);
12353 selected_arch = tmp_arch;
12354 explicit_arch = selected_arch->arch;
12355 return true;
12356 }
12357
12358 switch (parse_res)
12359 {
12360 case AARCH64_PARSE_MISSING_ARG:
12361 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12362 break;
12363 case AARCH64_PARSE_INVALID_ARG:
12364 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12365 aarch64_print_hint_for_arch (str);
12366 break;
12367 case AARCH64_PARSE_INVALID_FEATURE:
12368 error ("invalid feature modifier %s of value (\"%s\") in "
12369 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12370 aarch64_print_hint_for_extensions (invalid_extension);
12371 break;
12372 default:
12373 gcc_unreachable ();
12374 }
12375
12376 return false;
12377 }
12378
12379 /* Handle the argument CPU_STR to the cpu= target attribute. */
12380
12381 static bool
12382 aarch64_handle_attr_cpu (const char *str)
12383 {
12384 const struct processor *tmp_cpu = NULL;
12385 std::string invalid_extension;
12386 enum aarch64_parse_opt_result parse_res
12387 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12388
12389 if (parse_res == AARCH64_PARSE_OK)
12390 {
12391 gcc_assert (tmp_cpu);
12392 selected_tune = tmp_cpu;
12393 explicit_tune_core = selected_tune->ident;
12394
12395 selected_arch = &all_architectures[tmp_cpu->arch];
12396 explicit_arch = selected_arch->arch;
12397 return true;
12398 }
12399
12400 switch (parse_res)
12401 {
12402 case AARCH64_PARSE_MISSING_ARG:
12403 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12404 break;
12405 case AARCH64_PARSE_INVALID_ARG:
12406 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12407 aarch64_print_hint_for_core (str);
12408 break;
12409 case AARCH64_PARSE_INVALID_FEATURE:
12410 error ("invalid feature modifier %s of value (\"%s\") in "
12411 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12412 aarch64_print_hint_for_extensions (invalid_extension);
12413 break;
12414 default:
12415 gcc_unreachable ();
12416 }
12417
12418 return false;
12419 }
12420
12421 /* Handle the argument STR to the branch-protection= attribute. */
12422
12423 static bool
12424 aarch64_handle_attr_branch_protection (const char* str)
12425 {
12426 char *err_str = (char *) xmalloc (strlen (str) + 1);
12427 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12428 &err_str);
12429 bool success = false;
12430 switch (res)
12431 {
12432 case AARCH64_PARSE_MISSING_ARG:
12433 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12434 " attribute");
12435 break;
12436 case AARCH64_PARSE_INVALID_ARG:
12437 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12438 "=\")%> pragma or attribute", err_str);
12439 break;
12440 case AARCH64_PARSE_OK:
12441 success = true;
12442 /* Fall through. */
12443 case AARCH64_PARSE_INVALID_FEATURE:
12444 break;
12445 default:
12446 gcc_unreachable ();
12447 }
12448 free (err_str);
12449 return success;
12450 }
12451
12452 /* Handle the argument STR to the tune= target attribute. */
12453
12454 static bool
12455 aarch64_handle_attr_tune (const char *str)
12456 {
12457 const struct processor *tmp_tune = NULL;
12458 enum aarch64_parse_opt_result parse_res
12459 = aarch64_parse_tune (str, &tmp_tune);
12460
12461 if (parse_res == AARCH64_PARSE_OK)
12462 {
12463 gcc_assert (tmp_tune);
12464 selected_tune = tmp_tune;
12465 explicit_tune_core = selected_tune->ident;
12466 return true;
12467 }
12468
12469 switch (parse_res)
12470 {
12471 case AARCH64_PARSE_INVALID_ARG:
12472 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12473 aarch64_print_hint_for_core (str);
12474 break;
12475 default:
12476 gcc_unreachable ();
12477 }
12478
12479 return false;
12480 }
12481
12482 /* Parse an architecture extensions target attribute string specified in STR.
12483 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12484 if successful. Update aarch64_isa_flags to reflect the ISA features
12485 modified. */
12486
12487 static bool
12488 aarch64_handle_attr_isa_flags (char *str)
12489 {
12490 enum aarch64_parse_opt_result parse_res;
12491 uint64_t isa_flags = aarch64_isa_flags;
12492
12493 /* We allow "+nothing" in the beginning to clear out all architectural
12494 features if the user wants to handpick specific features. */
12495 if (strncmp ("+nothing", str, 8) == 0)
12496 {
12497 isa_flags = 0;
12498 str += 8;
12499 }
12500
12501 std::string invalid_extension;
12502 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12503
12504 if (parse_res == AARCH64_PARSE_OK)
12505 {
12506 aarch64_isa_flags = isa_flags;
12507 return true;
12508 }
12509
12510 switch (parse_res)
12511 {
12512 case AARCH64_PARSE_MISSING_ARG:
12513 error ("missing value in %<target()%> pragma or attribute");
12514 break;
12515
12516 case AARCH64_PARSE_INVALID_FEATURE:
12517 error ("invalid feature modifier %s of value (\"%s\") in "
12518 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12519 break;
12520
12521 default:
12522 gcc_unreachable ();
12523 }
12524
12525 return false;
12526 }
12527
12528 /* The target attributes that we support. On top of these we also support just
12529 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12530 handled explicitly in aarch64_process_one_target_attr. */
12531
12532 static const struct aarch64_attribute_info aarch64_attributes[] =
12533 {
12534 { "general-regs-only", aarch64_attr_mask, false, NULL,
12535 OPT_mgeneral_regs_only },
12536 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12537 OPT_mfix_cortex_a53_835769 },
12538 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12539 OPT_mfix_cortex_a53_843419 },
12540 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12541 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12542 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12543 OPT_momit_leaf_frame_pointer },
12544 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12545 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12546 OPT_march_ },
12547 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12548 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12549 OPT_mtune_ },
12550 { "branch-protection", aarch64_attr_custom, false,
12551 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12552 { "sign-return-address", aarch64_attr_enum, false, NULL,
12553 OPT_msign_return_address_ },
12554 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12555 };
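/* For illustration (the example function name is arbitrary), several of the
   entries above can be combined in a single attribute, separated by commas:
     __attribute__ ((target ("arch=armv8.2-a+crc,strict-align,no-omit-leaf-frame-pointer")))
     void f (void);
   Each comma-separated token is handled individually by
   aarch64_process_one_target_attr below.  */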
12556
12557 /* Parse ARG_STR which contains the definition of one target attribute.
12558 Show appropriate errors if any or return true if the attribute is valid. */
12559
12560 static bool
12561 aarch64_process_one_target_attr (char *arg_str)
12562 {
12563 bool invert = false;
12564
12565 size_t len = strlen (arg_str);
12566
12567 if (len == 0)
12568 {
12569 error ("malformed %<target()%> pragma or attribute");
12570 return false;
12571 }
12572
12573 char *str_to_check = (char *) alloca (len + 1);
12574 strcpy (str_to_check, arg_str);
12575
12576 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12577 It is easier to detect and handle it explicitly here rather than going
12578 through the machinery for the rest of the target attributes in this
12579 function. */
12580 if (*str_to_check == '+')
12581 return aarch64_handle_attr_isa_flags (str_to_check);
12582
12583 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12584 {
12585 invert = true;
12586 str_to_check += 3;
12587 }
12588 char *arg = strchr (str_to_check, '=');
12589
12590 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12591 and point ARG to "foo". */
12592 if (arg)
12593 {
12594 *arg = '\0';
12595 arg++;
12596 }
12597 const struct aarch64_attribute_info *p_attr;
12598 bool found = false;
12599 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12600 {
12601 /* If the names don't match up, or the user has given an argument
12602 to an attribute that doesn't accept one, or didn't give an argument
12603 to an attribute that expects one, fail to match. */
12604 if (strcmp (str_to_check, p_attr->name) != 0)
12605 continue;
12606
12607 found = true;
12608 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12609 || p_attr->attr_type == aarch64_attr_enum;
12610
12611 if (attr_need_arg_p ^ (arg != NULL))
12612 {
12613 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12614 return false;
12615 }
12616
12617 /* If the name matches but the attribute does not allow "no-" versions
12618 then we can't match. */
12619 if (invert && !p_attr->allow_neg)
12620 {
12621 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12622 return false;
12623 }
12624
12625 switch (p_attr->attr_type)
12626 {
12627 /* Has a custom handler registered.
12628 For example, cpu=, arch=, tune=. */
12629 case aarch64_attr_custom:
12630 gcc_assert (p_attr->handler);
12631 if (!p_attr->handler (arg))
12632 return false;
12633 break;
12634
12635 /* Either set or unset a boolean option. */
12636 case aarch64_attr_bool:
12637 {
12638 struct cl_decoded_option decoded;
12639
12640 generate_option (p_attr->opt_num, NULL, !invert,
12641 CL_TARGET, &decoded);
12642 aarch64_handle_option (&global_options, &global_options_set,
12643 &decoded, input_location);
12644 break;
12645 }
12646 /* Set or unset a bit in the target_flags. aarch64_handle_option
12647 should know what mask to apply given the option number. */
12648 case aarch64_attr_mask:
12649 {
12650 struct cl_decoded_option decoded;
12651 /* We only need to specify the option number.
12652 aarch64_handle_option will know which mask to apply. */
12653 decoded.opt_index = p_attr->opt_num;
12654 decoded.value = !invert;
12655 aarch64_handle_option (&global_options, &global_options_set,
12656 &decoded, input_location);
12657 break;
12658 }
12659 /* Use the option setting machinery to set an option to an enum. */
12660 case aarch64_attr_enum:
12661 {
12662 gcc_assert (arg);
12663 bool valid;
12664 int value;
12665 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12666 &value, CL_TARGET);
12667 if (valid)
12668 {
12669 set_option (&global_options, NULL, p_attr->opt_num, value,
12670 NULL, DK_UNSPECIFIED, input_location,
12671 global_dc);
12672 }
12673 else
12674 {
12675 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12676 }
12677 break;
12678 }
12679 default:
12680 gcc_unreachable ();
12681 }
12682 }
12683
12684 /* If we reached here we either have found an attribute and validated
12685 it or didn't match any. If we matched an attribute but its arguments
12686 were malformed we will have returned false already. */
12687 return found;
12688 }
12689
12690 /* Count how many times the character C appears in
12691 NULL-terminated string STR. */
12692
12693 static unsigned int
12694 num_occurences_in_str (char c, char *str)
12695 {
12696 unsigned int res = 0;
12697 while (*str != '\0')
12698 {
12699 if (*str == c)
12700 res++;
12701
12702 str++;
12703 }
12704
12705 return res;
12706 }
12707
12708 /* Parse the tree in ARGS that contains the target attribute information
12709 and update the global target options space. */
12710
12711 bool
12712 aarch64_process_target_attr (tree args)
12713 {
12714 if (TREE_CODE (args) == TREE_LIST)
12715 {
12716 do
12717 {
12718 tree head = TREE_VALUE (args);
12719 if (head)
12720 {
12721 if (!aarch64_process_target_attr (head))
12722 return false;
12723 }
12724 args = TREE_CHAIN (args);
12725 } while (args);
12726
12727 return true;
12728 }
12729
12730 if (TREE_CODE (args) != STRING_CST)
12731 {
12732 error ("attribute %<target%> argument not a string");
12733 return false;
12734 }
12735
12736 size_t len = strlen (TREE_STRING_POINTER (args));
12737 char *str_to_check = (char *) alloca (len + 1);
12738 strcpy (str_to_check, TREE_STRING_POINTER (args));
12739
12740 if (len == 0)
12741 {
12742 error ("malformed %<target()%> pragma or attribute");
12743 return false;
12744 }
12745
12746 /* Used to catch empty strings between commas, i.e.
12747 attribute ((target ("attr1,,attr2"))). */
12748 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12749
12750 /* Handle multiple target attributes separated by ','. */
12751 char *token = strtok_r (str_to_check, ",", &str_to_check);
12752
12753 unsigned int num_attrs = 0;
12754 while (token)
12755 {
12756 num_attrs++;
12757 if (!aarch64_process_one_target_attr (token))
12758 {
12759 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12760 return false;
12761 }
12762
12763 token = strtok_r (NULL, ",", &str_to_check);
12764 }
12765
12766 if (num_attrs != num_commas + 1)
12767 {
12768 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12769 return false;
12770 }
12771
12772 return true;
12773 }
12774
12775 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12776 process attribute ((target ("..."))). */
12777
12778 static bool
12779 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12780 {
12781 struct cl_target_option cur_target;
12782 bool ret;
12783 tree old_optimize;
12784 tree new_target, new_optimize;
12785 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12786
12787 /* If what we're processing is the current pragma string then the
12788 target option node is already stored in target_option_current_node
12789 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12790 having to re-parse the string. This is especially useful to keep
12791 arm_neon.h compile times down since that header contains a lot
12792 of intrinsics enclosed in pragmas. */
12793 if (!existing_target && args == current_target_pragma)
12794 {
12795 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12796 return true;
12797 }
12798 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12799
12800 old_optimize = build_optimization_node (&global_options);
12801 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12802
12803 /* If the function changed the optimization levels as well as setting
12804 target options, start with the optimizations specified. */
12805 if (func_optimize && func_optimize != old_optimize)
12806 cl_optimization_restore (&global_options,
12807 TREE_OPTIMIZATION (func_optimize));
12808
12809 /* Save the current target options to restore at the end. */
12810 cl_target_option_save (&cur_target, &global_options);
12811
12812 /* If fndecl already has some target attributes applied to it, unpack
12813 them so that we add this attribute on top of them, rather than
12814 overwriting them. */
12815 if (existing_target)
12816 {
12817 struct cl_target_option *existing_options
12818 = TREE_TARGET_OPTION (existing_target);
12819
12820 if (existing_options)
12821 cl_target_option_restore (&global_options, existing_options);
12822 }
12823 else
12824 cl_target_option_restore (&global_options,
12825 TREE_TARGET_OPTION (target_option_current_node));
12826
12827 ret = aarch64_process_target_attr (args);
12828
12829 /* Set up any additional state. */
12830 if (ret)
12831 {
12832 aarch64_override_options_internal (&global_options);
12833 /* Initialize SIMD builtins if we haven't already.
12834 Set current_target_pragma to NULL for the duration so that
12835 the builtin initialization code doesn't try to tag the functions
12836 being built with the attributes specified by any current pragma, thus
12837 going into an infinite recursion. */
12838 if (TARGET_SIMD)
12839 {
12840 tree saved_current_target_pragma = current_target_pragma;
12841 current_target_pragma = NULL;
12842 aarch64_init_simd_builtins ();
12843 current_target_pragma = saved_current_target_pragma;
12844 }
12845 new_target = build_target_option_node (&global_options);
12846 }
12847 else
12848 new_target = NULL;
12849
12850 new_optimize = build_optimization_node (&global_options);
12851
12852 if (fndecl && ret)
12853 {
12854 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12855
12856 if (old_optimize != new_optimize)
12857 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12858 }
12859
12860 cl_target_option_restore (&global_options, &cur_target);
12861
12862 if (old_optimize != new_optimize)
12863 cl_optimization_restore (&global_options,
12864 TREE_OPTIMIZATION (old_optimize));
12865 return ret;
12866 }
12867
12868 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
12869 tri-bool options (yes, no, don't care) and the default value is
12870 DEF, determine whether to reject inlining. */
12871
12872 static bool
12873 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12874 int dont_care, int def)
12875 {
12876 /* If the callee doesn't care, always allow inlining. */
12877 if (callee == dont_care)
12878 return true;
12879
12880 /* If the caller doesn't care, always allow inlining. */
12881 if (caller == dont_care)
12882 return true;
12883
12884 /* Otherwise, allow inlining if either the callee and caller values
12885 agree, or if the callee is using the default value. */
12886 return (callee == caller || callee == def);
12887 }
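/* For illustration, with DONT_CARE == 2 and DEF == 1 (the values the
   -momit-leaf-frame-pointer check below passes in):
     caller 0, callee 2 -> inlining allowed (callee does not care)
     caller 0, callee 1 -> inlining allowed (callee uses the default)
     caller 1, callee 0 -> inlining rejected (explicit mismatch).  */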
12888
12889 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12890 to inline CALLEE into CALLER based on target-specific info.
12891 Make sure that the caller and callee have compatible architectural
12892 features. Then go through the other possible target attributes
12893 and see if they can block inlining. Try not to reject always_inline
12894 callees unless they are incompatible architecturally. */
12895
12896 static bool
12897 aarch64_can_inline_p (tree caller, tree callee)
12898 {
12899 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
12900 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
12901
12902 struct cl_target_option *caller_opts
12903 = TREE_TARGET_OPTION (caller_tree ? caller_tree
12904 : target_option_default_node);
12905
12906 struct cl_target_option *callee_opts
12907 = TREE_TARGET_OPTION (callee_tree ? callee_tree
12908 : target_option_default_node);
12909
12910 /* Callee's ISA flags should be a subset of the caller's. */
12911 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
12912 != callee_opts->x_aarch64_isa_flags)
12913 return false;
12914
12915 /* Allow non-strict aligned functions inlining into strict
12916 aligned ones. */
12917 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
12918 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
12919 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
12920 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
12921 return false;
12922
12923 bool always_inline = lookup_attribute ("always_inline",
12924 DECL_ATTRIBUTES (callee));
12925
12926 /* If the architectural features match up and the callee is always_inline
12927 then the other attributes don't matter. */
12928 if (always_inline)
12929 return true;
12930
12931 if (caller_opts->x_aarch64_cmodel_var
12932 != callee_opts->x_aarch64_cmodel_var)
12933 return false;
12934
12935 if (caller_opts->x_aarch64_tls_dialect
12936 != callee_opts->x_aarch64_tls_dialect)
12937 return false;
12938
12939 /* Honour explicit requests to workaround errata. */
12940 if (!aarch64_tribools_ok_for_inlining_p (
12941 caller_opts->x_aarch64_fix_a53_err835769,
12942 callee_opts->x_aarch64_fix_a53_err835769,
12943 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
12944 return false;
12945
12946 if (!aarch64_tribools_ok_for_inlining_p (
12947 caller_opts->x_aarch64_fix_a53_err843419,
12948 callee_opts->x_aarch64_fix_a53_err843419,
12949 2, TARGET_FIX_ERR_A53_843419))
12950 return false;
12951
12952 /* If the user explicitly specified -momit-leaf-frame-pointer for the
12953 caller and callee and they don't match up, reject inlining. */
12954 if (!aarch64_tribools_ok_for_inlining_p (
12955 caller_opts->x_flag_omit_leaf_frame_pointer,
12956 callee_opts->x_flag_omit_leaf_frame_pointer,
12957 2, 1))
12958 return false;
12959
12960 /* If the callee has specific tuning overrides, respect them. */
12961 if (callee_opts->x_aarch64_override_tune_string != NULL
12962 && caller_opts->x_aarch64_override_tune_string == NULL)
12963 return false;
12964
12965 /* If the user specified tuning override strings for the
12966 caller and callee and they don't match up, reject inlining.
12967 We just do a string compare here, we don't analyze the meaning
12968 of the string, as it would be too costly for little gain. */
12969 if (callee_opts->x_aarch64_override_tune_string
12970 && caller_opts->x_aarch64_override_tune_string
12971 && (strcmp (callee_opts->x_aarch64_override_tune_string,
12972 caller_opts->x_aarch64_override_tune_string) != 0))
12973 return false;
12974
12975 return true;
12976 }
12977
12978 /* Return true if SYMBOL_REF X binds locally. */
12979
12980 static bool
12981 aarch64_symbol_binds_local_p (const_rtx x)
12982 {
12983 return (SYMBOL_REF_DECL (x)
12984 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
12985 : SYMBOL_REF_LOCAL_P (x));
12986 }
12987
12988 /* Return true if SYMBOL_REF X is thread local. */
12989 static bool
12990 aarch64_tls_symbol_p (rtx x)
12991 {
12992 if (! TARGET_HAVE_TLS)
12993 return false;
12994
12995 if (GET_CODE (x) != SYMBOL_REF)
12996 return false;
12997
12998 return SYMBOL_REF_TLS_MODEL (x) != 0;
12999 }
13000
13001 /* Classify a TLS symbol into one of the TLS kinds. */
13002 enum aarch64_symbol_type
13003 aarch64_classify_tls_symbol (rtx x)
13004 {
13005 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13006
13007 switch (tls_kind)
13008 {
13009 case TLS_MODEL_GLOBAL_DYNAMIC:
13010 case TLS_MODEL_LOCAL_DYNAMIC:
13011 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13012
13013 case TLS_MODEL_INITIAL_EXEC:
13014 switch (aarch64_cmodel)
13015 {
13016 case AARCH64_CMODEL_TINY:
13017 case AARCH64_CMODEL_TINY_PIC:
13018 return SYMBOL_TINY_TLSIE;
13019 default:
13020 return SYMBOL_SMALL_TLSIE;
13021 }
13022
13023 case TLS_MODEL_LOCAL_EXEC:
13024 if (aarch64_tls_size == 12)
13025 return SYMBOL_TLSLE12;
13026 else if (aarch64_tls_size == 24)
13027 return SYMBOL_TLSLE24;
13028 else if (aarch64_tls_size == 32)
13029 return SYMBOL_TLSLE32;
13030 else if (aarch64_tls_size == 48)
13031 return SYMBOL_TLSLE48;
13032 else
13033 gcc_unreachable ();
13034
13035 case TLS_MODEL_EMULATED:
13036 case TLS_MODEL_NONE:
13037 return SYMBOL_FORCE_TO_MEM;
13038
13039 default:
13040 gcc_unreachable ();
13041 }
13042 }
13043
13044 /* Return the correct method for accessing X + OFFSET, where X is either
13045 a SYMBOL_REF or LABEL_REF. */
13046
13047 enum aarch64_symbol_type
13048 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13049 {
13050 if (GET_CODE (x) == LABEL_REF)
13051 {
13052 switch (aarch64_cmodel)
13053 {
13054 case AARCH64_CMODEL_LARGE:
13055 return SYMBOL_FORCE_TO_MEM;
13056
13057 case AARCH64_CMODEL_TINY_PIC:
13058 case AARCH64_CMODEL_TINY:
13059 return SYMBOL_TINY_ABSOLUTE;
13060
13061 case AARCH64_CMODEL_SMALL_SPIC:
13062 case AARCH64_CMODEL_SMALL_PIC:
13063 case AARCH64_CMODEL_SMALL:
13064 return SYMBOL_SMALL_ABSOLUTE;
13065
13066 default:
13067 gcc_unreachable ();
13068 }
13069 }
13070
13071 if (GET_CODE (x) == SYMBOL_REF)
13072 {
13073 if (aarch64_tls_symbol_p (x))
13074 return aarch64_classify_tls_symbol (x);
13075
13076 switch (aarch64_cmodel)
13077 {
13078 case AARCH64_CMODEL_TINY:
13079 /* When we retrieve symbol + offset address, we have to make sure
13080 the offset does not cause overflow of the final address. But
13081 we have no way of knowing the address of symbol at compile time
13082 so we can't accurately say if the distance between the PC and
13083 symbol + offset is outside the addressable range of +/-1M in the
13084 TINY code model. So we rely on images not being greater than
13085 1M, cap the offset at 1M, and require anything beyond that to be
13086 loaded through an alternative mechanism. Furthermore, if the
13087 symbol is a weak reference to something that isn't known to
13088 resolve to a symbol in this module, then force to memory. */
13089 if ((SYMBOL_REF_WEAK (x)
13090 && !aarch64_symbol_binds_local_p (x))
13091 || !IN_RANGE (offset, -1048575, 1048575))
13092 return SYMBOL_FORCE_TO_MEM;
13093 return SYMBOL_TINY_ABSOLUTE;
13094
13095 case AARCH64_CMODEL_SMALL:
13096 /* Same reasoning as the tiny code model, but the offset cap here is
13097 4G. */
13098 if ((SYMBOL_REF_WEAK (x)
13099 && !aarch64_symbol_binds_local_p (x))
13100 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13101 HOST_WIDE_INT_C (4294967264)))
13102 return SYMBOL_FORCE_TO_MEM;
13103 return SYMBOL_SMALL_ABSOLUTE;
13104
13105 case AARCH64_CMODEL_TINY_PIC:
13106 if (!aarch64_symbol_binds_local_p (x))
13107 return SYMBOL_TINY_GOT;
13108 return SYMBOL_TINY_ABSOLUTE;
13109
13110 case AARCH64_CMODEL_SMALL_SPIC:
13111 case AARCH64_CMODEL_SMALL_PIC:
13112 if (!aarch64_symbol_binds_local_p (x))
13113 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13114 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13115 return SYMBOL_SMALL_ABSOLUTE;
13116
13117 case AARCH64_CMODEL_LARGE:
13118 /* This is alright even in PIC code as the constant
13119 pool reference is always PC relative and within
13120 the same translation unit. */
13121 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13122 return SYMBOL_SMALL_ABSOLUTE;
13123 else
13124 return SYMBOL_FORCE_TO_MEM;
13125
13126 default:
13127 gcc_unreachable ();
13128 }
13129 }
13130
13131 /* By default push everything into the constant pool. */
13132 return SYMBOL_FORCE_TO_MEM;
13133 }
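/* For illustration, under -fPIC with the default small code model a
   SYMBOL_REF that does not bind locally is classified as
   SYMBOL_SMALL_GOT_4G and accessed through the GOT, whereas a locally
   bound symbol stays SYMBOL_SMALL_ABSOLUTE.  */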
13134
13135 bool
13136 aarch64_constant_address_p (rtx x)
13137 {
13138 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13139 }
13140
13141 bool
13142 aarch64_legitimate_pic_operand_p (rtx x)
13143 {
13144 if (GET_CODE (x) == SYMBOL_REF
13145 || (GET_CODE (x) == CONST
13146 && GET_CODE (XEXP (x, 0)) == PLUS
13147 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13148 return false;
13149
13150 return true;
13151 }
13152
13153 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13154 that should be rematerialized rather than spilled. */
13155
13156 static bool
13157 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13158 {
13159 /* Support CSE and rematerialization of common constants. */
13160 if (CONST_INT_P (x)
13161 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13162 || GET_CODE (x) == CONST_VECTOR)
13163 return true;
13164
13165 /* Do not allow vector struct mode constants for Advanced SIMD.
13166 We could support 0 and -1 easily, but they need support in
13167 aarch64-simd.md. */
13168 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13169 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13170 return false;
13171
13172 /* Only accept variable-length vector constants if they can be
13173 handled directly.
13174
13175 ??? It would be possible to handle rematerialization of other
13176 constants via secondary reloads. */
13177 if (vec_flags & VEC_ANY_SVE)
13178 return aarch64_simd_valid_immediate (x, NULL);
13179
13180 if (GET_CODE (x) == HIGH)
13181 x = XEXP (x, 0);
13182
13183 /* Accept polynomial constants that can be calculated by using the
13184 destination of a move as the sole temporary. Constants that
13185 require a second temporary cannot be rematerialized (they can't be
13186 forced to memory and also aren't legitimate constants). */
13187 poly_int64 offset;
13188 if (poly_int_rtx_p (x, &offset))
13189 return aarch64_offset_temporaries (false, offset) <= 1;
13190
13191 /* If an offset is being added to something else, we need to allow the
13192 base to be moved into the destination register, meaning that there
13193 are no free temporaries for the offset. */
13194 x = strip_offset (x, &offset);
13195 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13196 return false;
13197
13198 /* Do not allow const (plus (anchor_symbol, const_int)). */
13199 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13200 return false;
13201
13202 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13203 so spilling them is better than rematerialization. */
13204 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13205 return true;
13206
13207 /* Label references are always constant. */
13208 if (GET_CODE (x) == LABEL_REF)
13209 return true;
13210
13211 return false;
13212 }
13213
13214 rtx
13215 aarch64_load_tp (rtx target)
13216 {
13217 if (!target
13218 || GET_MODE (target) != Pmode
13219 || !register_operand (target, Pmode))
13220 target = gen_reg_rtx (Pmode);
13221
13222 /* Can return in any reg. */
13223 emit_insn (gen_aarch64_load_tp_hard (target));
13224 return target;
13225 }
13226
13227 /* On AAPCS systems, this is the "struct __va_list". */
13228 static GTY(()) tree va_list_type;
13229
13230 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13231 Return the type to use as __builtin_va_list.
13232
13233 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13234
13235 struct __va_list
13236 {
13237 void *__stack;
13238 void *__gr_top;
13239 void *__vr_top;
13240 int __gr_offs;
13241 int __vr_offs;
13242 }; */
13243
13244 static tree
13245 aarch64_build_builtin_va_list (void)
13246 {
13247 tree va_list_name;
13248 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13249
13250 /* Create the type. */
13251 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13252 /* Give it the required name. */
13253 va_list_name = build_decl (BUILTINS_LOCATION,
13254 TYPE_DECL,
13255 get_identifier ("__va_list"),
13256 va_list_type);
13257 DECL_ARTIFICIAL (va_list_name) = 1;
13258 TYPE_NAME (va_list_type) = va_list_name;
13259 TYPE_STUB_DECL (va_list_type) = va_list_name;
13260
13261 /* Create the fields. */
13262 f_stack = build_decl (BUILTINS_LOCATION,
13263 FIELD_DECL, get_identifier ("__stack"),
13264 ptr_type_node);
13265 f_grtop = build_decl (BUILTINS_LOCATION,
13266 FIELD_DECL, get_identifier ("__gr_top"),
13267 ptr_type_node);
13268 f_vrtop = build_decl (BUILTINS_LOCATION,
13269 FIELD_DECL, get_identifier ("__vr_top"),
13270 ptr_type_node);
13271 f_groff = build_decl (BUILTINS_LOCATION,
13272 FIELD_DECL, get_identifier ("__gr_offs"),
13273 integer_type_node);
13274 f_vroff = build_decl (BUILTINS_LOCATION,
13275 FIELD_DECL, get_identifier ("__vr_offs"),
13276 integer_type_node);
13277
13278 /* Tell tree-stdarg pass about our internal offset fields.
13279 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13280 purposes, to identify whether the code is updating the va_list internal
13281 offset fields in an irregular way. */
13282 va_list_gpr_counter_field = f_groff;
13283 va_list_fpr_counter_field = f_vroff;
13284
13285 DECL_ARTIFICIAL (f_stack) = 1;
13286 DECL_ARTIFICIAL (f_grtop) = 1;
13287 DECL_ARTIFICIAL (f_vrtop) = 1;
13288 DECL_ARTIFICIAL (f_groff) = 1;
13289 DECL_ARTIFICIAL (f_vroff) = 1;
13290
13291 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13292 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13293 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13294 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13295 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13296
13297 TYPE_FIELDS (va_list_type) = f_stack;
13298 DECL_CHAIN (f_stack) = f_grtop;
13299 DECL_CHAIN (f_grtop) = f_vrtop;
13300 DECL_CHAIN (f_vrtop) = f_groff;
13301 DECL_CHAIN (f_groff) = f_vroff;
13302
13303 /* Compute its layout. */
13304 layout_type (va_list_type);
13305
13306 return va_list_type;
13307 }
13308
13309 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13310 static void
13311 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13312 {
13313 const CUMULATIVE_ARGS *cum;
13314 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13315 tree stack, grtop, vrtop, groff, vroff;
13316 tree t;
13317 int gr_save_area_size = cfun->va_list_gpr_size;
13318 int vr_save_area_size = cfun->va_list_fpr_size;
13319 int vr_offset;
13320
13321 cum = &crtl->args.info;
13322 if (cfun->va_list_gpr_size)
13323 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13324 cfun->va_list_gpr_size);
13325 if (cfun->va_list_fpr_size)
13326 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13327 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13328
13329 if (!TARGET_FLOAT)
13330 {
13331 gcc_assert (cum->aapcs_nvrn == 0);
13332 vr_save_area_size = 0;
13333 }
13334
13335 f_stack = TYPE_FIELDS (va_list_type_node);
13336 f_grtop = DECL_CHAIN (f_stack);
13337 f_vrtop = DECL_CHAIN (f_grtop);
13338 f_groff = DECL_CHAIN (f_vrtop);
13339 f_vroff = DECL_CHAIN (f_groff);
13340
13341 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13342 NULL_TREE);
13343 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13344 NULL_TREE);
13345 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13346 NULL_TREE);
13347 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13348 NULL_TREE);
13349 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13350 NULL_TREE);
13351
13352 /* Emit code to initialize STACK, which points to the next varargs stack
13353 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13354 by named arguments. STACK is 8-byte aligned. */
13355 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13356 if (cum->aapcs_stack_size > 0)
13357 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13358 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13359 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13360
13361 /* Emit code to initialize GRTOP, the top of the GR save area.
13362 virtual_incoming_args_rtx should have been 16 byte aligned. */
13363 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13364 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13365 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13366
13367 /* Emit code to initialize VRTOP, the top of the VR save area.
13368 This address is gr_save_area_bytes below GRTOP, rounded
13369 down to the next 16-byte boundary. */
13370 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13371 vr_offset = ROUND_UP (gr_save_area_size,
13372 STACK_BOUNDARY / BITS_PER_UNIT);
13373
13374 if (vr_offset)
13375 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13376 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13377 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13378
13379 /* Emit code to initialize GROFF, the offset from GRTOP of the
13380 next GPR argument. */
13381 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13382 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13383 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13384
13385 /* Likewise emit code to initialize VROFF, the offset from VRTOP
13386 of the next VR argument. */
13387 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13388 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13389 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13390 }
13391
13392 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13393
13394 static tree
13395 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13396 gimple_seq *post_p ATTRIBUTE_UNUSED)
13397 {
13398 tree addr;
13399 bool indirect_p;
13400 bool is_ha; /* is HFA or HVA. */
13401 bool dw_align; /* double-word align. */
13402 machine_mode ag_mode = VOIDmode;
13403 int nregs;
13404 machine_mode mode;
13405
13406 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13407 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13408 HOST_WIDE_INT size, rsize, adjust, align;
13409 tree t, u, cond1, cond2;
13410
13411 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13412 if (indirect_p)
13413 type = build_pointer_type (type);
13414
13415 mode = TYPE_MODE (type);
13416
13417 f_stack = TYPE_FIELDS (va_list_type_node);
13418 f_grtop = DECL_CHAIN (f_stack);
13419 f_vrtop = DECL_CHAIN (f_grtop);
13420 f_groff = DECL_CHAIN (f_vrtop);
13421 f_vroff = DECL_CHAIN (f_groff);
13422
13423 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13424 f_stack, NULL_TREE);
13425 size = int_size_in_bytes (type);
13426
13427 bool abi_break;
13428 align
13429 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13430
13431 dw_align = false;
13432 adjust = 0;
13433 if (aarch64_vfp_is_call_or_return_candidate (mode,
13434 type,
13435 &ag_mode,
13436 &nregs,
13437 &is_ha))
13438 {
13439 /* No frontends can create types with variable-sized modes, so we
13440 shouldn't be asked to pass or return them. */
13441 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13442
13443 /* TYPE passed in fp/simd registers. */
13444 if (!TARGET_FLOAT)
13445 aarch64_err_no_fpadvsimd (mode);
13446
13447 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13448 unshare_expr (valist), f_vrtop, NULL_TREE);
13449 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13450 unshare_expr (valist), f_vroff, NULL_TREE);
13451
13452 rsize = nregs * UNITS_PER_VREG;
13453
13454 if (is_ha)
13455 {
13456 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13457 adjust = UNITS_PER_VREG - ag_size;
13458 }
13459 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13460 && size < UNITS_PER_VREG)
13461 {
13462 adjust = UNITS_PER_VREG - size;
13463 }
13464 }
13465 else
13466 {
13467 /* TYPE passed in general registers. */
13468 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13469 unshare_expr (valist), f_grtop, NULL_TREE);
13470 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13471 unshare_expr (valist), f_groff, NULL_TREE);
13472 rsize = ROUND_UP (size, UNITS_PER_WORD);
13473 nregs = rsize / UNITS_PER_WORD;
13474
13475 if (align > 8)
13476 {
13477 if (abi_break && warn_psabi)
13478 inform (input_location, "parameter passing for argument of type "
13479 "%qT changed in GCC 9.1", type);
13480 dw_align = true;
13481 }
13482
13483 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13484 && size < UNITS_PER_WORD)
13485 {
13486 adjust = UNITS_PER_WORD - size;
13487 }
13488 }
13489
13490 /* Get a local temporary for the field value. */
13491 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13492
13493 /* Emit code to branch if off >= 0. */
13494 t = build2 (GE_EXPR, boolean_type_node, off,
13495 build_int_cst (TREE_TYPE (off), 0));
13496 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13497
13498 if (dw_align)
13499 {
13500 /* Emit: offs = (offs + 15) & -16. */
13501 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13502 build_int_cst (TREE_TYPE (off), 15));
13503 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13504 build_int_cst (TREE_TYPE (off), -16));
13505 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13506 }
13507 else
13508 roundup = NULL;
13509
13510 /* Update ap.__[g|v]r_offs */
13511 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13512 build_int_cst (TREE_TYPE (off), rsize));
13513 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13514
13515 /* String up. */
13516 if (roundup)
13517 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13518
13519 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13520 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13521 build_int_cst (TREE_TYPE (f_off), 0));
13522 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13523
13524 /* String up: make sure the assignment happens before the use. */
13525 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13526 COND_EXPR_ELSE (cond1) = t;
13527
13528 /* Prepare the trees handling the argument that is passed on the stack;
13529 the top-level node will be stored in ON_STACK. */
13530 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13531 if (align > 8)
13532 {
13533 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13534 t = fold_build_pointer_plus_hwi (arg, 15);
13535 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13536 build_int_cst (TREE_TYPE (t), -16));
13537 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13538 }
13539 else
13540 roundup = NULL;
13541 /* Advance ap.__stack */
13542 t = fold_build_pointer_plus_hwi (arg, size + 7);
13543 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13544 build_int_cst (TREE_TYPE (t), -8));
13545 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13546 /* String up roundup and advance. */
13547 if (roundup)
13548 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13549 /* String up with arg */
13550 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13551 /* Big-endianness related address adjustment. */
13552 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13553 && size < UNITS_PER_WORD)
13554 {
13555 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13556 size_int (UNITS_PER_WORD - size));
13557 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13558 }
13559
13560 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13561 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13562
13563 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13564 t = off;
13565 if (adjust)
13566 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13567 build_int_cst (TREE_TYPE (off), adjust));
13568
13569 t = fold_convert (sizetype, t);
13570 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13571
13572 if (is_ha)
13573 {
13574 /* type ha; // treat as "struct {ftype field[n];}"
13575 ... [computing offs]
13576 for (i = 0; i <nregs; ++i, offs += 16)
13577 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13578 return ha; */
13579 int i;
13580 tree tmp_ha, field_t, field_ptr_t;
13581
13582 /* Declare a local variable. */
13583 tmp_ha = create_tmp_var_raw (type, "ha");
13584 gimple_add_tmp_var (tmp_ha);
13585
13586 /* Establish the base type. */
13587 switch (ag_mode)
13588 {
13589 case E_SFmode:
13590 field_t = float_type_node;
13591 field_ptr_t = float_ptr_type_node;
13592 break;
13593 case E_DFmode:
13594 field_t = double_type_node;
13595 field_ptr_t = double_ptr_type_node;
13596 break;
13597 case E_TFmode:
13598 field_t = long_double_type_node;
13599 field_ptr_t = long_double_ptr_type_node;
13600 break;
13601 case E_HFmode:
13602 field_t = aarch64_fp16_type_node;
13603 field_ptr_t = aarch64_fp16_ptr_type_node;
13604 break;
13605 case E_V2SImode:
13606 case E_V4SImode:
13607 {
13608 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13609 field_t = build_vector_type_for_mode (innertype, ag_mode);
13610 field_ptr_t = build_pointer_type (field_t);
13611 }
13612 break;
13613 default:
13614 gcc_assert (0);
13615 }
13616
13617 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
13618 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13619 addr = t;
13620 t = fold_convert (field_ptr_t, addr);
13621 t = build2 (MODIFY_EXPR, field_t,
13622 build1 (INDIRECT_REF, field_t, tmp_ha),
13623 build1 (INDIRECT_REF, field_t, t));
13624
13625 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13626 for (i = 1; i < nregs; ++i)
13627 {
13628 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13629 u = fold_convert (field_ptr_t, addr);
13630 u = build2 (MODIFY_EXPR, field_t,
13631 build2 (MEM_REF, field_t, tmp_ha,
13632 build_int_cst (field_ptr_t,
13633 (i *
13634 int_size_in_bytes (field_t)))),
13635 build1 (INDIRECT_REF, field_t, u));
13636 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13637 }
13638
13639 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13640 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13641 }
13642
13643 COND_EXPR_ELSE (cond2) = t;
13644 addr = fold_convert (build_pointer_type (type), cond1);
13645 addr = build_va_arg_indirect_ref (addr);
13646
13647 if (indirect_p)
13648 addr = build_va_arg_indirect_ref (addr);
13649
13650 return addr;
13651 }
13652
13653 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13654
13655 static void
13656 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13657 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13658 int no_rtl)
13659 {
13660 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13661 CUMULATIVE_ARGS local_cum;
13662 int gr_saved = cfun->va_list_gpr_size;
13663 int vr_saved = cfun->va_list_fpr_size;
13664
13665 /* The caller has advanced CUM up to, but not beyond, the last named
13666 argument. Advance a local copy of CUM past the last "real" named
13667 argument, to find out how many registers are left over. */
13668 local_cum = *cum;
13669 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
13670
13671 /* Find out how many registers we need to save.
13672 Honor tree-stdarg analysis results. */
13673 if (cfun->va_list_gpr_size)
13674 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13675 cfun->va_list_gpr_size / UNITS_PER_WORD);
13676 if (cfun->va_list_fpr_size)
13677 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13678 cfun->va_list_fpr_size / UNITS_PER_VREG);
13679
13680 if (!TARGET_FLOAT)
13681 {
13682 gcc_assert (local_cum.aapcs_nvrn == 0);
13683 vr_saved = 0;
13684 }
13685
13686 if (!no_rtl)
13687 {
13688 if (gr_saved > 0)
13689 {
13690 rtx ptr, mem;
13691
13692 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13693 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13694 - gr_saved * UNITS_PER_WORD);
13695 mem = gen_frame_mem (BLKmode, ptr);
13696 set_mem_alias_set (mem, get_varargs_alias_set ());
13697
13698 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13699 mem, gr_saved);
13700 }
13701 if (vr_saved > 0)
13702 {
13703 /* We can't use move_block_from_reg, because it will use
13704 the wrong mode, storing D regs only. */
13705 machine_mode mode = TImode;
13706 int off, i, vr_start;
13707
13708 /* Set OFF to the offset from virtual_incoming_args_rtx of
13709 the first vector register. The VR save area lies below
13710 the GR one, and is aligned to 16 bytes. */
13711 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13712 STACK_BOUNDARY / BITS_PER_UNIT);
13713 off -= vr_saved * UNITS_PER_VREG;
13714
13715 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13716 for (i = 0; i < vr_saved; ++i)
13717 {
13718 rtx ptr, mem;
13719
13720 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13721 mem = gen_frame_mem (mode, ptr);
13722 set_mem_alias_set (mem, get_varargs_alias_set ());
13723 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13724 off += UNITS_PER_VREG;
13725 }
13726 }
13727 }
13728
13729 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13730 any complication of having crtl->args.pretend_args_size changed. */
13731 cfun->machine->frame.saved_varargs_size
13732 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13733 STACK_BOUNDARY / BITS_PER_UNIT)
13734 + vr_saved * UNITS_PER_VREG);
13735 }
13736
13737 static void
13738 aarch64_conditional_register_usage (void)
13739 {
13740 int i;
13741 if (!TARGET_FLOAT)
13742 {
13743 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13744 {
13745 fixed_regs[i] = 1;
13746 call_used_regs[i] = 1;
13747 }
13748 }
13749 if (!TARGET_SVE)
13750 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13751 {
13752 fixed_regs[i] = 1;
13753 call_used_regs[i] = 1;
13754 }
13755
13756 /* When tracking speculation, we need a couple of call-clobbered registers
13757 to track the speculation state. It would be nice to just use
13758 IP0 and IP1, but currently there are numerous places that just
13759 assume these registers are free for other uses (eg pointer
13760 authentication). */
13761 if (aarch64_track_speculation)
13762 {
13763 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13764 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13765 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13766 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13767 }
13768 }
13769
13770 /* Walk down the type tree of TYPE counting consecutive base elements.
13771 If *MODEP is VOIDmode, then set it to the first valid floating point
13772 type. If a non-floating point type is found, or if a floating point
13773 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13774 otherwise return the count in the sub-tree. */
13775 static int
13776 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13777 {
13778 machine_mode mode;
13779 HOST_WIDE_INT size;
13780
13781 switch (TREE_CODE (type))
13782 {
13783 case REAL_TYPE:
13784 mode = TYPE_MODE (type);
13785 if (mode != DFmode && mode != SFmode
13786 && mode != TFmode && mode != HFmode)
13787 return -1;
13788
13789 if (*modep == VOIDmode)
13790 *modep = mode;
13791
13792 if (*modep == mode)
13793 return 1;
13794
13795 break;
13796
13797 case COMPLEX_TYPE:
13798 mode = TYPE_MODE (TREE_TYPE (type));
13799 if (mode != DFmode && mode != SFmode
13800 && mode != TFmode && mode != HFmode)
13801 return -1;
13802
13803 if (*modep == VOIDmode)
13804 *modep = mode;
13805
13806 if (*modep == mode)
13807 return 2;
13808
13809 break;
13810
13811 case VECTOR_TYPE:
13812 /* Use V2SImode and V4SImode as representatives of all 64-bit
13813 and 128-bit vector types. */
13814 size = int_size_in_bytes (type);
13815 switch (size)
13816 {
13817 case 8:
13818 mode = V2SImode;
13819 break;
13820 case 16:
13821 mode = V4SImode;
13822 break;
13823 default:
13824 return -1;
13825 }
13826
13827 if (*modep == VOIDmode)
13828 *modep = mode;
13829
13830 /* Vector modes are considered to be opaque: two vectors are
13831 equivalent for the purposes of being homogeneous aggregates
13832 if they are the same size. */
13833 if (*modep == mode)
13834 return 1;
13835
13836 break;
13837
13838 case ARRAY_TYPE:
13839 {
13840 int count;
13841 tree index = TYPE_DOMAIN (type);
13842
13843 /* Can't handle incomplete types nor sizes that are not
13844 fixed. */
13845 if (!COMPLETE_TYPE_P (type)
13846 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13847 return -1;
13848
13849 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13850 if (count == -1
13851 || !index
13852 || !TYPE_MAX_VALUE (index)
13853 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13854 || !TYPE_MIN_VALUE (index)
13855 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13856 || count < 0)
13857 return -1;
13858
13859 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13860 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13861
13862 /* There must be no padding. */
13863 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13864 count * GET_MODE_BITSIZE (*modep)))
13865 return -1;
13866
13867 return count;
13868 }
13869
13870 case RECORD_TYPE:
13871 {
13872 int count = 0;
13873 int sub_count;
13874 tree field;
13875
13876 /* Can't handle incomplete types nor sizes that are not
13877 fixed. */
13878 if (!COMPLETE_TYPE_P (type)
13879 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13880 return -1;
13881
13882 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13883 {
13884 if (TREE_CODE (field) != FIELD_DECL)
13885 continue;
13886
13887 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13888 if (sub_count < 0)
13889 return -1;
13890 count += sub_count;
13891 }
13892
13893 /* There must be no padding. */
13894 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13895 count * GET_MODE_BITSIZE (*modep)))
13896 return -1;
13897
13898 return count;
13899 }
13900
13901 case UNION_TYPE:
13902 case QUAL_UNION_TYPE:
13903 {
13904 /* These aren't very interesting except in a degenerate case. */
13905 int count = 0;
13906 int sub_count;
13907 tree field;
13908
13909 /* Can't handle incomplete types nor sizes that are not
13910 fixed. */
13911 if (!COMPLETE_TYPE_P (type)
13912 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13913 return -1;
13914
13915 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13916 {
13917 if (TREE_CODE (field) != FIELD_DECL)
13918 continue;
13919
13920 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13921 if (sub_count < 0)
13922 return -1;
13923 count = count > sub_count ? count : sub_count;
13924 }
13925
13926 /* There must be no padding. */
13927 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13928 count * GET_MODE_BITSIZE (*modep)))
13929 return -1;
13930
13931 return count;
13932 }
13933
13934 default:
13935 break;
13936 }
13937
13938 return -1;
13939 }
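/* For illustration, for the type "double[3]" aapcs_vfp_sub_candidate
   returns 3 with *MODEP set to DFmode, while for
   "struct { float f; double d; }" the DFmode field does not match the
   SFmode already recorded in *MODEP, so the result is -1.  */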
13940
13941 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13942 type as described in AAPCS64 \S 4.1.2.
13943
13944 See the comment above aarch64_composite_type_p for the notes on MODE. */
13945
13946 static bool
13947 aarch64_short_vector_p (const_tree type,
13948 machine_mode mode)
13949 {
13950 poly_int64 size = -1;
13951
13952 if (type && TREE_CODE (type) == VECTOR_TYPE)
13953 size = int_size_in_bytes (type);
13954 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
13955 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
13956 size = GET_MODE_SIZE (mode);
13957
13958 return known_eq (size, 8) || known_eq (size, 16);
13959 }
13960
13961 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13962 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
13963 array types. The C99 floating-point complex types are also considered
13964 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
13965 types, which are GCC extensions and out of the scope of AAPCS64, are
13966 treated as composite types here as well.
13967
13968 Note that MODE itself is not sufficient in determining whether a type
13969 is such a composite type or not. This is because
13970 stor-layout.c:compute_record_mode may have already changed the MODE
13971 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
13972 structure with only one field may have its MODE set to the mode of the
13973 field. Also an integer mode whose size matches the size of the
13974 RECORD_TYPE type may be used to substitute the original mode
13975 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
13976 solely relied on. */
13977
13978 static bool
13979 aarch64_composite_type_p (const_tree type,
13980 machine_mode mode)
13981 {
13982 if (aarch64_short_vector_p (type, mode))
13983 return false;
13984
13985 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
13986 return true;
13987
13988 if (mode == BLKmode
13989 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
13990 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
13991 return true;
13992
13993 return false;
13994 }
13995
13996 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13997 shall be passed or returned in simd/fp register(s) (providing these
13998 parameter passing registers are available).
13999
14000 Upon successful return, *COUNT returns the number of needed registers,
14001 *BASE_MODE returns the mode of the individual register and when IS_HA
14002 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14003 floating-point aggregate or a homogeneous short-vector aggregate. */
14004
14005 static bool
14006 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14007 const_tree type,
14008 machine_mode *base_mode,
14009 int *count,
14010 bool *is_ha)
14011 {
14012 machine_mode new_mode = VOIDmode;
14013 bool composite_p = aarch64_composite_type_p (type, mode);
14014
14015 if (is_ha != NULL) *is_ha = false;
14016
14017 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14018 || aarch64_short_vector_p (type, mode))
14019 {
14020 *count = 1;
14021 new_mode = mode;
14022 }
14023 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14024 {
14025 if (is_ha != NULL) *is_ha = true;
14026 *count = 2;
14027 new_mode = GET_MODE_INNER (mode);
14028 }
14029 else if (type && composite_p)
14030 {
14031 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14032
14033 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14034 {
14035 if (is_ha != NULL) *is_ha = true;
14036 *count = ag_count;
14037 }
14038 else
14039 return false;
14040 }
14041 else
14042 return false;
14043
14044 *base_mode = new_mode;
14045 return true;
14046 }
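/* For illustration, "struct { float x, y, z; }" is a homogeneous
   floating-point aggregate: the hook above returns true with *COUNT == 3,
   *BASE_MODE == SFmode and *IS_HA set.  A 16-byte short vector such as
   float32x4_t instead takes the first branch and returns *COUNT == 1 with
   *IS_HA left false.  */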
14047
14048 /* Implement TARGET_STRUCT_VALUE_RTX. */
14049
14050 static rtx
14051 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14052 int incoming ATTRIBUTE_UNUSED)
14053 {
14054 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14055 }
14056
14057 /* Implements target hook vector_mode_supported_p. */
14058 static bool
14059 aarch64_vector_mode_supported_p (machine_mode mode)
14060 {
14061 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14062 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14063 }
14064
14065 /* Return appropriate SIMD container
14066 for MODE within a vector of WIDTH bits. */
14067 static machine_mode
14068 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14069 {
14070 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14071 switch (mode)
14072 {
14073 case E_DFmode:
14074 return VNx2DFmode;
14075 case E_SFmode:
14076 return VNx4SFmode;
14077 case E_HFmode:
14078 return VNx8HFmode;
14079 case E_DImode:
14080 return VNx2DImode;
14081 case E_SImode:
14082 return VNx4SImode;
14083 case E_HImode:
14084 return VNx8HImode;
14085 case E_QImode:
14086 return VNx16QImode;
14087 default:
14088 return word_mode;
14089 }
14090
14091 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14092 if (TARGET_SIMD)
14093 {
14094 if (known_eq (width, 128))
14095 switch (mode)
14096 {
14097 case E_DFmode:
14098 return V2DFmode;
14099 case E_SFmode:
14100 return V4SFmode;
14101 case E_HFmode:
14102 return V8HFmode;
14103 case E_SImode:
14104 return V4SImode;
14105 case E_HImode:
14106 return V8HImode;
14107 case E_QImode:
14108 return V16QImode;
14109 case E_DImode:
14110 return V2DImode;
14111 default:
14112 break;
14113 }
14114 else
14115 switch (mode)
14116 {
14117 case E_SFmode:
14118 return V2SFmode;
14119 case E_HFmode:
14120 return V4HFmode;
14121 case E_SImode:
14122 return V2SImode;
14123 case E_HImode:
14124 return V4HImode;
14125 case E_QImode:
14126 return V8QImode;
14127 default:
14128 break;
14129 }
14130 }
14131 return word_mode;
14132 }
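/* For illustration, SImode with WIDTH == 128 maps to V4SImode and with
   WIDTH == 64 to V2SImode; when SVE is enabled and WIDTH equals
   BITS_PER_SVE_VECTOR it maps to VNx4SImode.  Unsupported combinations
   fall back to word_mode.  */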
14133
14134 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14135 static machine_mode
14136 aarch64_preferred_simd_mode (scalar_mode mode)
14137 {
14138 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14139 return aarch64_simd_container_mode (mode, bits);
14140 }
14141
14142 /* Return a list of possible vector sizes for the vectorizer
14143 to iterate over. */
14144 static void
14145 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14146 {
14147 if (TARGET_SVE)
14148 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14149 sizes->safe_push (16);
14150 sizes->safe_push (8);
14151 }
14152
14153 /* Implement TARGET_MANGLE_TYPE. */
14154
14155 static const char *
14156 aarch64_mangle_type (const_tree type)
14157 {
14158 /* The AArch64 ABI documents say that "__va_list" has to be
14159 mangled as if it is in the "std" namespace. */
14160 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14161 return "St9__va_list";
14162
14163 /* Half-precision float. */
14164 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14165 return "Dh";
14166
14167 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14168 builtin types. */
14169 if (TYPE_NAME (type) != NULL)
14170 return aarch64_mangle_builtin_type (type);
14171
14172 /* Use the default mangling. */
14173 return NULL;
14174 }
14175
14176 /* Find the first rtx_insn before insn that will generate an assembly
14177 instruction. */
14178
14179 static rtx_insn *
14180 aarch64_prev_real_insn (rtx_insn *insn)
14181 {
14182 if (!insn)
14183 return NULL;
14184
14185 do
14186 {
14187 insn = prev_real_insn (insn);
14188 }
14189 while (insn && recog_memoized (insn) < 0);
14190
14191 return insn;
14192 }
14193
14194 static bool
14195 is_madd_op (enum attr_type t1)
14196 {
14197 unsigned int i;
14198 /* A number of these may be AArch32 only. */
14199 enum attr_type mlatypes[] = {
14200 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14201 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14202 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14203 };
14204
14205 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14206 {
14207 if (t1 == mlatypes[i])
14208 return true;
14209 }
14210
14211 return false;
14212 }
14213
14214 /* Check if there is a register dependency between a load and the insn
14215 for which we hold recog_data. */
14216
14217 static bool
14218 dep_between_memop_and_curr (rtx memop)
14219 {
14220 rtx load_reg;
14221 int opno;
14222
14223 gcc_assert (GET_CODE (memop) == SET);
14224
14225 if (!REG_P (SET_DEST (memop)))
14226 return false;
14227
14228 load_reg = SET_DEST (memop);
14229 for (opno = 1; opno < recog_data.n_operands; opno++)
14230 {
14231 rtx operand = recog_data.operand[opno];
14232 if (REG_P (operand)
14233 && reg_overlap_mentioned_p (load_reg, operand))
14234 return true;
14235
14236 }
14237 return false;
14238 }
14239
14240
14241 /* When working around the Cortex-A53 erratum 835769,
14242 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14243 instruction and has a preceding memory instruction such that a NOP
14244 should be inserted between them. */
14245
14246 bool
14247 aarch64_madd_needs_nop (rtx_insn* insn)
14248 {
14249 enum attr_type attr_type;
14250 rtx_insn *prev;
14251 rtx body;
14252
14253 if (!TARGET_FIX_ERR_A53_835769)
14254 return false;
14255
14256 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14257 return false;
14258
14259 attr_type = get_attr_type (insn);
14260 if (!is_madd_op (attr_type))
14261 return false;
14262
14263 prev = aarch64_prev_real_insn (insn);
14264 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14265 Restore recog state to INSN to avoid state corruption. */
14266 extract_constrain_insn_cached (insn);
14267
14268 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14269 return false;
14270
14271 body = single_set (prev);
14272
14273 /* If the previous insn is a memory op and there is no dependency between
14274 it and the DImode madd, emit a NOP between them. If body is NULL then we
14275 have a complex memory operation, probably a load/store pair.
14276 Be conservative for now and emit a NOP. */
14277 if (GET_MODE (recog_data.operand[0]) == DImode
14278 && (!body || !dep_between_memop_and_curr (body)))
14279 return true;
14280
14281 return false;
14282
14283 }
14284
14285
14286 /* Implement FINAL_PRESCAN_INSN. */
14287
14288 void
14289 aarch64_final_prescan_insn (rtx_insn *insn)
14290 {
14291 if (aarch64_madd_needs_nop (insn))
14292 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14293 }
14294
14295
14296 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14297 instruction. */
14298
14299 bool
14300 aarch64_sve_index_immediate_p (rtx base_or_step)
14301 {
14302 return (CONST_INT_P (base_or_step)
14303 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14304 }
14305
14306 /* Return true if X is a valid immediate for the SVE ADD and SUB
14307 instructions. Negate X first if NEGATE_P is true. */
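/* For example, a vector duplicating 0x7f or 0x1200 (0x12 << 8) is accepted,
   whereas one duplicating 0x101 is not: the immediate must fit in an
   unsigned 8-bit field, optionally shifted left by 8 bits.  */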
14308
14309 bool
14310 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14311 {
14312 rtx elt;
14313
14314 if (!const_vec_duplicate_p (x, &elt)
14315 || !CONST_INT_P (elt))
14316 return false;
14317
14318 HOST_WIDE_INT val = INTVAL (elt);
14319 if (negate_p)
14320 val = -val;
14321 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14322
14323 if (val & 0xff)
14324 return IN_RANGE (val, 0, 0xff);
14325 return IN_RANGE (val, 0, 0xff00);
14326 }
14327
14328 /* Return true if X is a valid immediate operand for an SVE logical
14329 instruction such as AND. */
14330
14331 bool
14332 aarch64_sve_bitmask_immediate_p (rtx x)
14333 {
14334 rtx elt;
14335
14336 return (const_vec_duplicate_p (x, &elt)
14337 && CONST_INT_P (elt)
14338 && aarch64_bitmask_imm (INTVAL (elt),
14339 GET_MODE_INNER (GET_MODE (x))));
14340 }
14341
14342 /* Return true if X is a valid immediate for the SVE DUP and CPY
14343 instructions. */
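/* Illustrative values: -0x80 and 0x7f are accepted directly, 0x7f00 is
   accepted as 0x7f shifted left by 8, while 0x180 is rejected because its
   low byte is nonzero and the value does not fit in a signed 8-bit field.  */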
14344
14345 bool
14346 aarch64_sve_dup_immediate_p (rtx x)
14347 {
14348 rtx elt;
14349
14350 if (!const_vec_duplicate_p (x, &elt)
14351 || !CONST_INT_P (elt))
14352 return false;
14353
14354 HOST_WIDE_INT val = INTVAL (elt);
14355 if (val & 0xff)
14356 return IN_RANGE (val, -0x80, 0x7f);
14357 return IN_RANGE (val, -0x8000, 0x7f00);
14358 }
14359
14360 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14361 SIGNED_P says whether the operand is signed rather than unsigned. */
14362
14363 bool
14364 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14365 {
14366 rtx elt;
14367
14368 return (const_vec_duplicate_p (x, &elt)
14369 && CONST_INT_P (elt)
14370 && (signed_p
14371 ? IN_RANGE (INTVAL (elt), -16, 15)
14372 : IN_RANGE (INTVAL (elt), 0, 127)));
14373 }
14374
14375 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14376 instruction. Negate X first if NEGATE_P is true. */
14377
14378 bool
14379 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14380 {
14381 rtx elt;
14382 REAL_VALUE_TYPE r;
14383
14384 if (!const_vec_duplicate_p (x, &elt)
14385 || GET_CODE (elt) != CONST_DOUBLE)
14386 return false;
14387
14388 r = *CONST_DOUBLE_REAL_VALUE (elt);
14389
14390 if (negate_p)
14391 r = real_value_negate (&r);
14392
14393 if (real_equal (&r, &dconst1))
14394 return true;
14395 if (real_equal (&r, &dconsthalf))
14396 return true;
14397 return false;
14398 }
14399
14400 /* Return true if X is a valid immediate operand for an SVE FMUL
14401 instruction. */
14402
14403 bool
14404 aarch64_sve_float_mul_immediate_p (rtx x)
14405 {
14406 rtx elt;
14407
14408 /* GCC will never generate a multiply with an immediate of 2, so there is no
14409 point testing for it (even though it is a valid constant). */
14410 return (const_vec_duplicate_p (x, &elt)
14411 && GET_CODE (elt) == CONST_DOUBLE
14412 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14413 }
14414
14415 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14416 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14417 is nonnull, use it to describe valid immediates. */
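/* For example (purely illustrative), a replicated 32-bit value of
   0x00ab0000 can be encoded as 0xab with LSL #16, and 0x0000abff can be
   encoded as 0xab with MSL #8, where the low bits are filled with ones.  */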
14418 static bool
14419 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14420 simd_immediate_info *info,
14421 enum simd_immediate_check which,
14422 simd_immediate_info::insn_type insn)
14423 {
14424 /* Try a 4-byte immediate with LSL. */
14425 for (unsigned int shift = 0; shift < 32; shift += 8)
14426 if ((val32 & (0xff << shift)) == val32)
14427 {
14428 if (info)
14429 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14430 simd_immediate_info::LSL, shift);
14431 return true;
14432 }
14433
14434 /* Try a 2-byte immediate with LSL. */
14435 unsigned int imm16 = val32 & 0xffff;
14436 if (imm16 == (val32 >> 16))
14437 for (unsigned int shift = 0; shift < 16; shift += 8)
14438 if ((imm16 & (0xff << shift)) == imm16)
14439 {
14440 if (info)
14441 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14442 simd_immediate_info::LSL, shift);
14443 return true;
14444 }
14445
14446 /* Try a 4-byte immediate with MSL, except for cases that MVN
14447 can handle. */
14448 if (which == AARCH64_CHECK_MOV)
14449 for (unsigned int shift = 8; shift < 24; shift += 8)
14450 {
14451 unsigned int low = (1 << shift) - 1;
14452 if (((val32 & (0xff << shift)) | low) == val32)
14453 {
14454 if (info)
14455 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14456 simd_immediate_info::MSL, shift);
14457 return true;
14458 }
14459 }
14460
14461 return false;
14462 }
14463
14464 /* Return true if replicating VAL64 is a valid immediate for the
14465 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14466 use it to describe valid immediates. */
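/* For instance, 0xff0000ffffff0000 is not a replicated 32-bit value, but
   every byte is either 0x00 or 0xff, so it is accepted below as a 64-bit
   "bit-to-bytemask" immediate.  */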
14467 static bool
14468 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14469 simd_immediate_info *info,
14470 enum simd_immediate_check which)
14471 {
14472 unsigned int val32 = val64 & 0xffffffff;
14473 unsigned int val16 = val64 & 0xffff;
14474 unsigned int val8 = val64 & 0xff;
14475
14476 if (val32 == (val64 >> 32))
14477 {
14478 if ((which & AARCH64_CHECK_ORR) != 0
14479 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14480 simd_immediate_info::MOV))
14481 return true;
14482
14483 if ((which & AARCH64_CHECK_BIC) != 0
14484 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14485 simd_immediate_info::MVN))
14486 return true;
14487
14488 /* Try using a replicated byte. */
14489 if (which == AARCH64_CHECK_MOV
14490 && val16 == (val32 >> 16)
14491 && val8 == (val16 >> 8))
14492 {
14493 if (info)
14494 *info = simd_immediate_info (QImode, val8);
14495 return true;
14496 }
14497 }
14498
14499 /* Try using a bit-to-bytemask. */
14500 if (which == AARCH64_CHECK_MOV)
14501 {
14502 unsigned int i;
14503 for (i = 0; i < 64; i += 8)
14504 {
14505 unsigned char byte = (val64 >> i) & 0xff;
14506 if (byte != 0 && byte != 0xff)
14507 break;
14508 }
14509 if (i == 64)
14510 {
14511 if (info)
14512 *info = simd_immediate_info (DImode, val64);
14513 return true;
14514 }
14515 }
14516 return false;
14517 }
14518
14519 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14520 instruction. If INFO is nonnull, use it to describe valid immediates. */
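/* For example, 0x0101010101010101 is handled as a QImode DUP of #1,
   0x1200120012001200 as an HImode DUP of #0x12 with LSL #8, and
   0x00ff00ff00ff00ff only via a DUPM bitmask immediate.  */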
14521
14522 static bool
14523 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14524 simd_immediate_info *info)
14525 {
14526 scalar_int_mode mode = DImode;
14527 unsigned int val32 = val64 & 0xffffffff;
14528 if (val32 == (val64 >> 32))
14529 {
14530 mode = SImode;
14531 unsigned int val16 = val32 & 0xffff;
14532 if (val16 == (val32 >> 16))
14533 {
14534 mode = HImode;
14535 unsigned int val8 = val16 & 0xff;
14536 if (val8 == (val16 >> 8))
14537 mode = QImode;
14538 }
14539 }
14540 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14541 if (IN_RANGE (val, -0x80, 0x7f))
14542 {
14543 /* DUP with no shift. */
14544 if (info)
14545 *info = simd_immediate_info (mode, val);
14546 return true;
14547 }
14548 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14549 {
14550 /* DUP with LSL #8. */
14551 if (info)
14552 *info = simd_immediate_info (mode, val);
14553 return true;
14554 }
14555 if (aarch64_bitmask_imm (val64, mode))
14556 {
14557 /* DUPM. */
14558 if (info)
14559 *info = simd_immediate_info (mode, val);
14560 return true;
14561 }
14562 return false;
14563 }
14564
14565 /* Return true if OP is a valid SIMD immediate for the operation
14566 described by WHICH. If INFO is nonnull, use it to describe valid
14567 immediates. */
14568 bool
14569 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14570 enum simd_immediate_check which)
14571 {
14572 machine_mode mode = GET_MODE (op);
14573 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14574 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14575 return false;
14576
14577 scalar_mode elt_mode = GET_MODE_INNER (mode);
14578 rtx base, step;
14579 unsigned int n_elts;
14580 if (GET_CODE (op) == CONST_VECTOR
14581 && CONST_VECTOR_DUPLICATE_P (op))
14582 n_elts = CONST_VECTOR_NPATTERNS (op);
14583 else if ((vec_flags & VEC_SVE_DATA)
14584 && const_vec_series_p (op, &base, &step))
14585 {
14586 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14587 if (!aarch64_sve_index_immediate_p (base)
14588 || !aarch64_sve_index_immediate_p (step))
14589 return false;
14590
14591 if (info)
14592 *info = simd_immediate_info (elt_mode, base, step);
14593 return true;
14594 }
14595 else if (GET_CODE (op) == CONST_VECTOR
14596 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14597 /* N_ELTS set above. */;
14598 else
14599 return false;
14600
14601 /* Handle PFALSE and PTRUE. */
14602 if (vec_flags & VEC_SVE_PRED)
14603 return (op == CONST0_RTX (mode)
14604 || op == CONSTM1_RTX (mode));
14605
14606 scalar_float_mode elt_float_mode;
14607 if (n_elts == 1
14608 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14609 {
14610 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14611 if (aarch64_float_const_zero_rtx_p (elt)
14612 || aarch64_float_const_representable_p (elt))
14613 {
14614 if (info)
14615 *info = simd_immediate_info (elt_float_mode, elt);
14616 return true;
14617 }
14618 }
14619
14620 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14621 if (elt_size > 8)
14622 return false;
14623
14624 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14625
14626 /* Expand the vector constant out into a byte vector, with the least
14627 significant byte of the register first. */
14628 auto_vec<unsigned char, 16> bytes;
14629 bytes.reserve (n_elts * elt_size);
14630 for (unsigned int i = 0; i < n_elts; i++)
14631 {
14632 /* The vector is provided in gcc endian-neutral fashion.
14633 For aarch64_be Advanced SIMD, it must be laid out in the vector
14634 register in reverse order. */
14635 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14636 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14637
14638 if (elt_mode != elt_int_mode)
14639 elt = gen_lowpart (elt_int_mode, elt);
14640
14641 if (!CONST_INT_P (elt))
14642 return false;
14643
14644 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14645 for (unsigned int byte = 0; byte < elt_size; byte++)
14646 {
14647 bytes.quick_push (elt_val & 0xff);
14648 elt_val >>= BITS_PER_UNIT;
14649 }
14650 }
14651
14652 /* The immediate must repeat every eight bytes. */
14653 unsigned int nbytes = bytes.length ();
14654 for (unsigned i = 8; i < nbytes; ++i)
14655 if (bytes[i] != bytes[i - 8])
14656 return false;
14657
14658 /* Get the repeating 8-byte value as an integer. No endian correction
14659 is needed here because bytes is already in lsb-first order. */
14660 unsigned HOST_WIDE_INT val64 = 0;
14661 for (unsigned int i = 0; i < 8; i++)
14662 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14663 << (i * BITS_PER_UNIT));
14664
14665 if (vec_flags & VEC_SVE_DATA)
14666 return aarch64_sve_valid_immediate (val64, info);
14667 else
14668 return aarch64_advsimd_valid_immediate (val64, info, which);
14669 }
14670
14671 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14672 has a step in the range of INDEX. Return the index expression if so,
14673 otherwise return null. */
14674 rtx
14675 aarch64_check_zero_based_sve_index_immediate (rtx x)
14676 {
14677 rtx base, step;
14678 if (const_vec_series_p (x, &base, &step)
14679 && base == const0_rtx
14680 && aarch64_sve_index_immediate_p (step))
14681 return step;
14682 return NULL_RTX;
14683 }
14684
14685 /* Check whether immediate shift constants are within range. */
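/* For example, for a vector of 32-bit elements, immediate left shifts of
   0-31 and immediate right shifts of 1-32 are in range.  */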
14686 bool
14687 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14688 {
14689 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14690 if (left)
14691 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14692 else
14693 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14694 }
14695
14696 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14697 operation of width WIDTH at bit position POS. */
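/* Illustrative call:
     aarch64_mask_from_zextract_ops (GEN_INT (8), GEN_INT (16))
   returns a CONST_INT with the value 0xff0000.  */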
14698
14699 rtx
14700 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14701 {
14702 gcc_assert (CONST_INT_P (width));
14703 gcc_assert (CONST_INT_P (pos));
14704
14705 unsigned HOST_WIDE_INT mask
14706 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14707 return GEN_INT (mask << UINTVAL (pos));
14708 }
14709
14710 bool
14711 aarch64_mov_operand_p (rtx x, machine_mode mode)
14712 {
14713 if (GET_CODE (x) == HIGH
14714 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14715 return true;
14716
14717 if (CONST_INT_P (x))
14718 return true;
14719
14720 if (VECTOR_MODE_P (GET_MODE (x)))
14721 return aarch64_simd_valid_immediate (x, NULL);
14722
14723 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14724 return true;
14725
14726 if (aarch64_sve_cnt_immediate_p (x))
14727 return true;
14728
14729 return aarch64_classify_symbolic_expression (x)
14730 == SYMBOL_TINY_ABSOLUTE;
14731 }
14732
14733 /* Return a const_int vector of VAL. */
14734 rtx
14735 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14736 {
14737 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14738 return gen_const_vec_duplicate (mode, c);
14739 }
14740
14741 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14742
14743 bool
14744 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14745 {
14746 machine_mode vmode;
14747
14748 vmode = aarch64_simd_container_mode (mode, 64);
14749 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14750 return aarch64_simd_valid_immediate (op_v, NULL);
14751 }
14752
14753 /* Construct and return a PARALLEL RTX vector with elements numbering the
14754 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14755 the vector - from the perspective of the architecture. This does not
14756 line up with GCC's perspective on lane numbers, so we end up with
14757 different masks depending on our target endian-ness. The diagram
14758 below may help. We must draw the distinction when building masks
14759 which select one half of the vector. An instruction selecting
14760 architectural low-lanes for a big-endian target must be described using
14761 a mask selecting GCC high-lanes.
14762
14763 Big-Endian Little-Endian
14764
14765 GCC 0 1 2 3 3 2 1 0
14766 | x | x | x | x | | x | x | x | x |
14767 Architecture 3 2 1 0 3 2 1 0
14768
14769 Low Mask: { 2, 3 } { 0, 1 }
14770 High Mask: { 0, 1 } { 2, 3 }
14771
14772 MODE Is the mode of the vector and NUNITS is the number of units in it. */
14773
14774 rtx
14775 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14776 {
14777 rtvec v = rtvec_alloc (nunits / 2);
14778 int high_base = nunits / 2;
14779 int low_base = 0;
14780 int base;
14781 rtx t1;
14782 int i;
14783
14784 if (BYTES_BIG_ENDIAN)
14785 base = high ? low_base : high_base;
14786 else
14787 base = high ? high_base : low_base;
14788
14789 for (i = 0; i < nunits / 2; i++)
14790 RTVEC_ELT (v, i) = GEN_INT (base + i);
14791
14792 t1 = gen_rtx_PARALLEL (mode, v);
14793 return t1;
14794 }
14795
14796 /* Check OP for validity as a PARALLEL RTX vector with elements
14797 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14798 from the perspective of the architecture. See the diagram above
14799 aarch64_simd_vect_par_cnst_half for more details. */
14800
14801 bool
14802 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14803 bool high)
14804 {
14805 int nelts;
14806 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14807 return false;
14808
14809 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14810 HOST_WIDE_INT count_op = XVECLEN (op, 0);
14811 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14812 int i = 0;
14813
14814 if (count_op != count_ideal)
14815 return false;
14816
14817 for (i = 0; i < count_ideal; i++)
14818 {
14819 rtx elt_op = XVECEXP (op, 0, i);
14820 rtx elt_ideal = XVECEXP (ideal, 0, i);
14821
14822 if (!CONST_INT_P (elt_op)
14823 || INTVAL (elt_ideal) != INTVAL (elt_op))
14824 return false;
14825 }
14826 return true;
14827 }
14828
14829 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14830 HIGH (exclusive). */
14831 void
14832 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14833 const_tree exp)
14834 {
14835 HOST_WIDE_INT lane;
14836 gcc_assert (CONST_INT_P (operand));
14837 lane = INTVAL (operand);
14838
14839 if (lane < low || lane >= high)
14840 {
14841 if (exp)
14842 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14843 else
14844 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14845 }
14846 }
14847
14848 /* Perform endian correction on lane number N, which indexes a vector
14849 of mode MODE, and return the result as an SImode rtx. */
14850
14851 rtx
14852 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14853 {
14854 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14855 }
14856
14857 /* Return TRUE if OP is a valid vector addressing mode. */
14858
14859 bool
14860 aarch64_simd_mem_operand_p (rtx op)
14861 {
14862 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14863 || REG_P (XEXP (op, 0)));
14864 }
14865
14866 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
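/* The accepted addresses are a base register plus an unsigned immediate
   that is a multiple of the element size, in the range 0 to 63 elements
   (for example 0-252 bytes for 4-byte elements).  */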
14867
14868 bool
14869 aarch64_sve_ld1r_operand_p (rtx op)
14870 {
14871 struct aarch64_address_info addr;
14872 scalar_mode mode;
14873
14874 return (MEM_P (op)
14875 && is_a <scalar_mode> (GET_MODE (op), &mode)
14876 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14877 && addr.type == ADDRESS_REG_IMM
14878 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14879 }
14880
14881 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14882 The conditions for STR are the same. */
14883 bool
14884 aarch64_sve_ldr_operand_p (rtx op)
14885 {
14886 struct aarch64_address_info addr;
14887
14888 return (MEM_P (op)
14889 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14890 false, ADDR_QUERY_ANY)
14891 && addr.type == ADDRESS_REG_IMM);
14892 }
14893
14894 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14895 We need to be able to access the individual pieces, so the range
14896 is different from LD[234] and ST[234]. */
14897 bool
14898 aarch64_sve_struct_memory_operand_p (rtx op)
14899 {
14900 if (!MEM_P (op))
14901 return false;
14902
14903 machine_mode mode = GET_MODE (op);
14904 struct aarch64_address_info addr;
14905 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
14906 ADDR_QUERY_ANY)
14907 || addr.type != ADDRESS_REG_IMM)
14908 return false;
14909
14910 poly_int64 first = addr.const_offset;
14911 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
14912 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
14913 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
14914 }
14915
14916 /* Emit a register copy from operand to operand, taking care not to
14917 early-clobber source registers in the process.
14918
14919 COUNT is the number of components into which the copy needs to be
14920 decomposed. */
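/* For example, copying a two-register group from V1-V2 to V2-V3 must move
   V2 to V3 before moving V1 to V2, whereas copying from V2-V3 to V1-V2 can
   proceed in ascending order; the overlap and direction checks below pick
   the safe order.  */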
14921 void
14922 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
14923 unsigned int count)
14924 {
14925 unsigned int i;
14926 int rdest = REGNO (operands[0]);
14927 int rsrc = REGNO (operands[1]);
14928
14929 if (!reg_overlap_mentioned_p (operands[0], operands[1])
14930 || rdest < rsrc)
14931 for (i = 0; i < count; i++)
14932 emit_move_insn (gen_rtx_REG (mode, rdest + i),
14933 gen_rtx_REG (mode, rsrc + i));
14934 else
14935 for (i = 0; i < count; i++)
14936 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
14937 gen_rtx_REG (mode, rsrc + count - i - 1));
14938 }
14939
14940 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14941 one of VSTRUCT modes: OI, CI, or XI. */
14942 int
14943 aarch64_simd_attr_length_rglist (machine_mode mode)
14944 {
14945 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
14946 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
14947 }
14948
14949 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
14950 alignment of a vector to 128 bits. SVE predicates have an alignment of
14951 16 bits. */
14952 static HOST_WIDE_INT
14953 aarch64_simd_vector_alignment (const_tree type)
14954 {
14955 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14956 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14957 be set for non-predicate vectors of booleans. Modes are the most
14958 direct way we have of identifying real SVE predicate types. */
14959 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
14960 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
14961 }
14962
14963 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
14964 static poly_uint64
14965 aarch64_vectorize_preferred_vector_alignment (const_tree type)
14966 {
14967 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
14968 {
14969 /* If the length of the vector is fixed, try to align to that length,
14970 otherwise don't try to align at all. */
14971 HOST_WIDE_INT result;
14972 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
14973 result = TYPE_ALIGN (TREE_TYPE (type));
14974 return result;
14975 }
14976 return TYPE_ALIGN (type);
14977 }
14978
14979 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
14980 static bool
14981 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
14982 {
14983 if (is_packed)
14984 return false;
14985
14986 /* For fixed-length vectors, check that the vectorizer will aim for
14987 full-vector alignment. This isn't true for generic GCC vectors
14988 that are wider than the ABI maximum of 128 bits. */
14989 poly_uint64 preferred_alignment =
14990 aarch64_vectorize_preferred_vector_alignment (type);
14991 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
14992 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
14993 preferred_alignment))
14994 return false;
14995
14996 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
14997 return true;
14998 }
14999
15000 /* Return true if the vector misalignment factor is supported by the
15001 target. */
15002 static bool
15003 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15004 const_tree type, int misalignment,
15005 bool is_packed)
15006 {
15007 if (TARGET_SIMD && STRICT_ALIGNMENT)
15008 {
15009 /* Return false if the movmisalign pattern is not supported for this mode. */
15010 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15011 return false;
15012
15013 /* Misalignment factor is unknown at compile time. */
15014 if (misalignment == -1)
15015 return false;
15016 }
15017 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15018 is_packed);
15019 }
15020
15021 /* If VALS is a vector constant that can be loaded into a register
15022 using DUP, generate instructions to do so and return an RTX to
15023 assign to the register. Otherwise return NULL_RTX. */
15024 static rtx
15025 aarch64_simd_dup_constant (rtx vals)
15026 {
15027 machine_mode mode = GET_MODE (vals);
15028 machine_mode inner_mode = GET_MODE_INNER (mode);
15029 rtx x;
15030
15031 if (!const_vec_duplicate_p (vals, &x))
15032 return NULL_RTX;
15033
15034 /* We can load this constant by using DUP and a constant in a
15035 single ARM register. This will be cheaper than a vector
15036 load. */
15037 x = copy_to_mode_reg (inner_mode, x);
15038 return gen_vec_duplicate (mode, x);
15039 }
15040
15041
15042 /* Generate code to load VALS, which is a PARALLEL containing only
15043 constants (for vec_init) or CONST_VECTOR, efficiently into a
15044 register. Returns an RTX to copy into the register, or NULL_RTX
15045 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15046 static rtx
15047 aarch64_simd_make_constant (rtx vals)
15048 {
15049 machine_mode mode = GET_MODE (vals);
15050 rtx const_dup;
15051 rtx const_vec = NULL_RTX;
15052 int n_const = 0;
15053 int i;
15054
15055 if (GET_CODE (vals) == CONST_VECTOR)
15056 const_vec = vals;
15057 else if (GET_CODE (vals) == PARALLEL)
15058 {
15059 /* A CONST_VECTOR must contain only CONST_INTs and
15060 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15061 Only store valid constants in a CONST_VECTOR. */
15062 int n_elts = XVECLEN (vals, 0);
15063 for (i = 0; i < n_elts; ++i)
15064 {
15065 rtx x = XVECEXP (vals, 0, i);
15066 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15067 n_const++;
15068 }
15069 if (n_const == n_elts)
15070 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15071 }
15072 else
15073 gcc_unreachable ();
15074
15075 if (const_vec != NULL_RTX
15076 && aarch64_simd_valid_immediate (const_vec, NULL))
15077 /* Load using MOVI/MVNI. */
15078 return const_vec;
15079 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15080 /* Loaded using DUP. */
15081 return const_dup;
15082 else if (const_vec != NULL_RTX)
15083 /* Load from constant pool. We cannot take advantage of single-cycle
15084 LD1 because we need a PC-relative addressing mode. */
15085 return const_vec;
15086 else
15087 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15088 We cannot construct an initializer. */
15089 return NULL_RTX;
15090 }
15091
15092 /* Expand a vector initialisation sequence, such that TARGET is
15093 initialised to contain VALS. */
15094
15095 void
15096 aarch64_expand_vector_init (rtx target, rtx vals)
15097 {
15098 machine_mode mode = GET_MODE (target);
15099 scalar_mode inner_mode = GET_MODE_INNER (mode);
15100 /* The number of vector elements. */
15101 int n_elts = XVECLEN (vals, 0);
15102 /* The number of vector elements which are not constant. */
15103 int n_var = 0;
15104 rtx any_const = NULL_RTX;
15105 /* The first element of vals. */
15106 rtx v0 = XVECEXP (vals, 0, 0);
15107 bool all_same = true;
15108
15109 /* This is a special vec_init<M><N> where N is not an element mode but a
15110 vector mode with half the elements of M. We expect to find two entries
15111 of mode N in VALS and we must put their concatenation into TARGET. */
15112 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
15113 {
15114 gcc_assert (known_eq (GET_MODE_SIZE (mode),
15115 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
15116 rtx lo = XVECEXP (vals, 0, 0);
15117 rtx hi = XVECEXP (vals, 0, 1);
15118 machine_mode narrow_mode = GET_MODE (lo);
15119 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
15120 gcc_assert (narrow_mode == GET_MODE (hi));
15121
15122 /* When we want to concatenate a half-width vector with zeroes we can
15123 use the aarch64_combinez[_be] patterns. Just make sure that the
15124 zeroes are in the right half. */
15125 if (BYTES_BIG_ENDIAN
15126 && aarch64_simd_imm_zero (lo, narrow_mode)
15127 && general_operand (hi, narrow_mode))
15128 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
15129 else if (!BYTES_BIG_ENDIAN
15130 && aarch64_simd_imm_zero (hi, narrow_mode)
15131 && general_operand (lo, narrow_mode))
15132 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
15133 else
15134 {
15135 /* Else create the two half-width registers and combine them. */
15136 if (!REG_P (lo))
15137 lo = force_reg (GET_MODE (lo), lo);
15138 if (!REG_P (hi))
15139 hi = force_reg (GET_MODE (hi), hi);
15140
15141 if (BYTES_BIG_ENDIAN)
15142 std::swap (lo, hi);
15143 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
15144 }
15145 return;
15146 }
15147
15148 /* Count the number of variable elements to initialise. */
15149 for (int i = 0; i < n_elts; ++i)
15150 {
15151 rtx x = XVECEXP (vals, 0, i);
15152 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15153 ++n_var;
15154 else
15155 any_const = x;
15156
15157 all_same &= rtx_equal_p (x, v0);
15158 }
15159
15160 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15161 how best to handle this. */
15162 if (n_var == 0)
15163 {
15164 rtx constant = aarch64_simd_make_constant (vals);
15165 if (constant != NULL_RTX)
15166 {
15167 emit_move_insn (target, constant);
15168 return;
15169 }
15170 }
15171
15172 /* Splat a single non-constant element if we can. */
15173 if (all_same)
15174 {
15175 rtx x = copy_to_mode_reg (inner_mode, v0);
15176 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15177 return;
15178 }
15179
15180 enum insn_code icode = optab_handler (vec_set_optab, mode);
15181 gcc_assert (icode != CODE_FOR_nothing);
15182
15183 /* If there are only variable elements, try to optimize
15184 the insertion using dup for the most common element
15185 followed by insertions. */
15186
15187 /* The algorithm will fill matches[*][0] with the earliest matching element,
15188 and matches[X][1] with the count of duplicate elements (if X is the
15189 earliest element which has duplicates). */
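/* For example, for the all-variable vector {a, b, a, a} the code below
   DUPs a (the most frequent element) into TARGET and then inserts b into
   lane 1 via the vec_set pattern.  */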
15190
15191 if (n_var == n_elts && n_elts <= 16)
15192 {
15193 int matches[16][2] = {0};
15194 for (int i = 0; i < n_elts; i++)
15195 {
15196 for (int j = 0; j <= i; j++)
15197 {
15198 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15199 {
15200 matches[i][0] = j;
15201 matches[j][1]++;
15202 break;
15203 }
15204 }
15205 }
15206 int maxelement = 0;
15207 int maxv = 0;
15208 for (int i = 0; i < n_elts; i++)
15209 if (matches[i][1] > maxv)
15210 {
15211 maxelement = i;
15212 maxv = matches[i][1];
15213 }
15214
15215 /* Create a duplicate of the most common element, unless all elements
15216 are equally useless to us, in which case just immediately set the
15217 vector register using the first element. */
15218
15219 if (maxv == 1)
15220 {
15221 /* For vectors of two 64-bit elements, we can do even better. */
15222 if (n_elts == 2
15223 && (inner_mode == E_DImode
15224 || inner_mode == E_DFmode))
15225
15226 {
15227 rtx x0 = XVECEXP (vals, 0, 0);
15228 rtx x1 = XVECEXP (vals, 0, 1);
15229 /* Combine can pick up this case, but handling it directly
15230 here leaves clearer RTL.
15231
15232 This is load_pair_lanes<mode>, and also gives us a clean-up
15233 for store_pair_lanes<mode>. */
15234 if (memory_operand (x0, inner_mode)
15235 && memory_operand (x1, inner_mode)
15236 && !STRICT_ALIGNMENT
15237 && rtx_equal_p (XEXP (x1, 0),
15238 plus_constant (Pmode,
15239 XEXP (x0, 0),
15240 GET_MODE_SIZE (inner_mode))))
15241 {
15242 rtx t;
15243 if (inner_mode == DFmode)
15244 t = gen_load_pair_lanesdf (target, x0, x1);
15245 else
15246 t = gen_load_pair_lanesdi (target, x0, x1);
15247 emit_insn (t);
15248 return;
15249 }
15250 }
15251 /* The subreg-move sequence below will move into lane zero of the
15252 vector register. For big-endian we want that position to hold
15253 the last element of VALS. */
15254 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15255 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15256 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15257 }
15258 else
15259 {
15260 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15261 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15262 }
15263
15264 /* Insert the rest. */
15265 for (int i = 0; i < n_elts; i++)
15266 {
15267 rtx x = XVECEXP (vals, 0, i);
15268 if (matches[i][0] == maxelement)
15269 continue;
15270 x = copy_to_mode_reg (inner_mode, x);
15271 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15272 }
15273 return;
15274 }
15275
15276 /* Initialise a vector which is part-variable. We want to first try
15277 to build those lanes which are constant in the most efficient way we
15278 can. */
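/* Illustration: for {1, x, 3, y} the constant copy built below becomes
   {1, 1, 3, 1}; that is loaded into TARGET first, and the variable
   elements x and y are then inserted into lanes 1 and 3.  */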
15279 if (n_var != n_elts)
15280 {
15281 rtx copy = copy_rtx (vals);
15282
15283 /* Load constant part of vector. We really don't care what goes into the
15284 parts we will overwrite, but we're more likely to be able to load the
15285 constant efficiently if it has fewer, larger, repeating parts
15286 (see aarch64_simd_valid_immediate). */
15287 for (int i = 0; i < n_elts; i++)
15288 {
15289 rtx x = XVECEXP (vals, 0, i);
15290 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15291 continue;
15292 rtx subst = any_const;
15293 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15294 {
15295 /* Look in the copied vector, as more elements are const. */
15296 rtx test = XVECEXP (copy, 0, i ^ bit);
15297 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15298 {
15299 subst = test;
15300 break;
15301 }
15302 }
15303 XVECEXP (copy, 0, i) = subst;
15304 }
15305 aarch64_expand_vector_init (target, copy);
15306 }
15307
15308 /* Insert the variable lanes directly. */
15309 for (int i = 0; i < n_elts; i++)
15310 {
15311 rtx x = XVECEXP (vals, 0, i);
15312 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15313 continue;
15314 x = copy_to_mode_reg (inner_mode, x);
15315 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15316 }
15317 }
15318
15319 /* Emit RTL corresponding to:
15320 insr TARGET, ELEM. */
15321
15322 static void
15323 emit_insr (rtx target, rtx elem)
15324 {
15325 machine_mode mode = GET_MODE (target);
15326 scalar_mode elem_mode = GET_MODE_INNER (mode);
15327 elem = force_reg (elem_mode, elem);
15328
15329 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
15330 gcc_assert (icode != CODE_FOR_nothing);
15331 emit_insn (GEN_FCN (icode) (target, target, elem));
15332 }
15333
15334 /* Subroutine of aarch64_sve_expand_vector_init for handling
15335 trailing constants.
15336 This function works as follows:
15337 (a) Create a new vector consisting of trailing constants.
15338 (b) Initialize TARGET with the constant vector using emit_move_insn.
15339 (c) Insert remaining elements in TARGET using insr.
15340 NELTS is the total number of elements in the original vector, while
15341 NELTS_REQD is the number of elements that are actually
15342 significant.
15343
15344 ??? The heuristic used is to do the above only if the number of constants
15345 is at least half the total number of elements. May need fine tuning. */
15346
15347 static bool
15348 aarch64_sve_expand_vector_init_handle_trailing_constants
15349 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
15350 {
15351 machine_mode mode = GET_MODE (target);
15352 scalar_mode elem_mode = GET_MODE_INNER (mode);
15353 int n_trailing_constants = 0;
15354
15355 for (int i = nelts_reqd - 1;
15356 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
15357 i--)
15358 n_trailing_constants++;
15359
15360 if (n_trailing_constants >= nelts_reqd / 2)
15361 {
15362 rtx_vector_builder v (mode, 1, nelts);
15363 for (int i = 0; i < nelts; i++)
15364 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
15365 rtx const_vec = v.build ();
15366 emit_move_insn (target, const_vec);
15367
15368 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
15369 emit_insr (target, builder.elt (i));
15370
15371 return true;
15372 }
15373
15374 return false;
15375 }
15376
15377 /* Subroutine of aarch64_sve_expand_vector_init.
15378 Works as follows:
15379 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
15380 (b) Skip trailing elements from BUILDER, which are the same as
15381 element NELTS_REQD - 1.
15382 (c) Insert earlier elements in reverse order in TARGET using insr. */
15383
15384 static void
15385 aarch64_sve_expand_vector_init_insert_elems (rtx target,
15386 const rtx_vector_builder &builder,
15387 int nelts_reqd)
15388 {
15389 machine_mode mode = GET_MODE (target);
15390 scalar_mode elem_mode = GET_MODE_INNER (mode);
15391
15392 struct expand_operand ops[2];
15393 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
15394 gcc_assert (icode != CODE_FOR_nothing);
15395
15396 create_output_operand (&ops[0], target, mode);
15397 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
15398 expand_insn (icode, 2, ops);
15399
15400 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
15401 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
15402 emit_insr (target, builder.elt (i));
15403 }
15404
15405 /* Subroutine of aarch64_sve_expand_vector_init to handle case
15406 when all trailing elements of builder are same.
15407 This works as follows:
15408 (a) Use expand_insn interface to broadcast last vector element in TARGET.
15409 (b) Insert remaining elements in TARGET using insr.
15410
15411 ??? The heuristic used is to do the above if the number of identical
15412 trailing elements is at least 3/4 of the total number of elements,
15413 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
15414
15415 static bool
15416 aarch64_sve_expand_vector_init_handle_trailing_same_elem
15417 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
15418 {
15419 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
15420 if (ndups >= (3 * nelts_reqd) / 4)
15421 {
15422 aarch64_sve_expand_vector_init_insert_elems (target, builder,
15423 nelts_reqd - ndups + 1);
15424 return true;
15425 }
15426
15427 return false;
15428 }
15429
15430 /* Initialize register TARGET from BUILDER. NELTS is the constant number
15431 of elements in BUILDER.
15432
15433 The function tries to initialize TARGET from BUILDER if it fits one
15434 of the special cases outlined below.
15435
15436 Failing that, the function divides BUILDER into two sub-vectors:
15437 v_even = even elements of BUILDER;
15438 v_odd = odd elements of BUILDER;
15439
15440 and recursively calls itself with v_even and v_odd.
15441
15442 if (recursive call succeeded for v_even or v_odd)
15443 TARGET = zip (v_even, v_odd)
15444
15445 The function returns true if it managed to build TARGET from BUILDER
15446 with one of the special cases, false otherwise.
15447
15448 Example: {a, 1, b, 2, c, 3, d, 4}
15449
15450 The vector gets divided into:
15451 v_even = {a, b, c, d}
15452 v_odd = {1, 2, 3, 4}
15453
15454 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
15455 initializes tmp2 from the constant vector v_odd using emit_move_insn.
15456
15457 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
15458 4 elements, so we construct tmp1 from v_even using insr:
15459 tmp1 = dup(d)
15460 insr tmp1, c
15461 insr tmp1, b
15462 insr tmp1, a
15463
15464 And finally:
15465 TARGET = zip (tmp1, tmp2)
15466 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
15467
15468 static bool
15469 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
15470 int nelts, int nelts_reqd)
15471 {
15472 machine_mode mode = GET_MODE (target);
15473
15474 /* Case 1: Vector contains trailing constants. */
15475
15476 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15477 (target, builder, nelts, nelts_reqd))
15478 return true;
15479
15480 /* Case 2: Vector contains leading constants. */
15481
15482 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
15483 for (int i = 0; i < nelts_reqd; i++)
15484 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
15485 rev_builder.finalize ();
15486
15487 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15488 (target, rev_builder, nelts, nelts_reqd))
15489 {
15490 emit_insn (gen_aarch64_sve_rev (mode, target, target));
15491 return true;
15492 }
15493
15494 /* Case 3: Vector contains trailing same element. */
15495
15496 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
15497 (target, builder, nelts_reqd))
15498 return true;
15499
15500 /* Case 4: Vector contains leading same element. */
15501
15502 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
15503 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
15504 {
15505 emit_insn (gen_aarch64_sve_rev (mode, target, target));
15506 return true;
15507 }
15508
15509 /* Avoid recursing below 4 elements.
15510 ??? The threshold 4 may need fine-tuning. */
15511
15512 if (nelts_reqd <= 4)
15513 return false;
15514
15515 rtx_vector_builder v_even (mode, 1, nelts);
15516 rtx_vector_builder v_odd (mode, 1, nelts);
15517
15518 for (int i = 0; i < nelts * 2; i += 2)
15519 {
15520 v_even.quick_push (builder.elt (i));
15521 v_odd.quick_push (builder.elt (i + 1));
15522 }
15523
15524 v_even.finalize ();
15525 v_odd.finalize ();
15526
15527 rtx tmp1 = gen_reg_rtx (mode);
15528 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
15529 nelts, nelts_reqd / 2);
15530
15531 rtx tmp2 = gen_reg_rtx (mode);
15532 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
15533 nelts, nelts_reqd / 2);
15534
15535 if (!did_even_p && !did_odd_p)
15536 return false;
15537
15538 /* Initialize whichever of v_even and v_odd did not match a special case
15539 using INSR, then zip v_even and v_odd together. */
15540
15541 if (!did_even_p)
15542 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
15543
15544 if (!did_odd_p)
15545 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
15546
15547 rtvec v = gen_rtvec (2, tmp1, tmp2);
15548 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
15549 return true;
15550 }
15551
15552 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
15553
15554 void
15555 aarch64_sve_expand_vector_init (rtx target, rtx vals)
15556 {
15557 machine_mode mode = GET_MODE (target);
15558 int nelts = XVECLEN (vals, 0);
15559
15560 rtx_vector_builder v (mode, 1, nelts);
15561 for (int i = 0; i < nelts; i++)
15562 v.quick_push (XVECEXP (vals, 0, i));
15563 v.finalize ();
15564
15565 /* If neither sub-vector of v could be initialized specially,
15566 then use INSR to insert all elements from v into TARGET.
15567 ??? This might not be optimal for vectors with large
15568 initializers like 16-element or above.
15569 For nelts < 4, it probably isn't useful to handle specially. */
15570
15571 if (nelts < 4
15572 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
15573 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
15574 }
15575
15576 static unsigned HOST_WIDE_INT
15577 aarch64_shift_truncation_mask (machine_mode mode)
15578 {
15579 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15580 return 0;
15581 return GET_MODE_UNIT_BITSIZE (mode) - 1;
15582 }
15583
15584 /* Select a format to encode pointers in exception handling data. */
15585 int
15586 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15587 {
15588 int type;
15589 switch (aarch64_cmodel)
15590 {
15591 case AARCH64_CMODEL_TINY:
15592 case AARCH64_CMODEL_TINY_PIC:
15593 case AARCH64_CMODEL_SMALL:
15594 case AARCH64_CMODEL_SMALL_PIC:
15595 case AARCH64_CMODEL_SMALL_SPIC:
15596 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15597 for everything. */
15598 type = DW_EH_PE_sdata4;
15599 break;
15600 default:
15601 /* No assumptions here. 8-byte relocs required. */
15602 type = DW_EH_PE_sdata8;
15603 break;
15604 }
15605 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15606 }
15607
15608 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
15609
15610 static void
15611 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
15612 {
15613 if (aarch64_simd_decl_p (decl))
15614 {
15615 fprintf (stream, "\t.variant_pcs\t");
15616 assemble_name (stream, name);
15617 fprintf (stream, "\n");
15618 }
15619 }
15620
15621 /* The last .arch and .tune assembly strings that we printed. */
15622 static std::string aarch64_last_printed_arch_string;
15623 static std::string aarch64_last_printed_tune_string;
15624
15625 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15626 by the function fndecl. */
15627
15628 void
15629 aarch64_declare_function_name (FILE *stream, const char* name,
15630 tree fndecl)
15631 {
15632 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15633
15634 struct cl_target_option *targ_options;
15635 if (target_parts)
15636 targ_options = TREE_TARGET_OPTION (target_parts);
15637 else
15638 targ_options = TREE_TARGET_OPTION (target_option_current_node);
15639 gcc_assert (targ_options);
15640
15641 const struct processor *this_arch
15642 = aarch64_get_arch (targ_options->x_explicit_arch);
15643
15644 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
15645 std::string extension
15646 = aarch64_get_extension_string_for_isa_flags (isa_flags,
15647 this_arch->flags);
15648 /* Only update the assembler .arch string if it is distinct from the last
15649 such string we printed. */
15650 std::string to_print = this_arch->name + extension;
15651 if (to_print != aarch64_last_printed_arch_string)
15652 {
15653 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15654 aarch64_last_printed_arch_string = to_print;
15655 }
15656
15657 /* Print the cpu name we're tuning for in the comments; this might be
15658 useful to readers of the generated asm. Do it only when it changes
15659 from function to function and verbose assembly is requested. */
15660 const struct processor *this_tune
15661 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15662
15663 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15664 {
15665 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15666 this_tune->name);
15667 aarch64_last_printed_tune_string = this_tune->name;
15668 }
15669
15670 aarch64_asm_output_variant_pcs (stream, fndecl, name);
15671
15672 /* Don't forget the type directive for ELF. */
15673 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15674 ASM_OUTPUT_LABEL (stream, name);
15675 }
15676
15677 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
15678
15679 void
15680 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
15681 {
15682 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
15683 const char *value = IDENTIFIER_POINTER (target);
15684 aarch64_asm_output_variant_pcs (stream, decl, name);
15685 ASM_OUTPUT_DEF (stream, name, value);
15686 }
15687
15688 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
15689 function symbol references. */
15690
15691 void
15692 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
15693 {
15694 default_elf_asm_output_external (stream, decl, name);
15695 aarch64_asm_output_variant_pcs (stream, decl, name);
15696 }
15697
15698 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
15699 Used to output the .cfi_b_key_frame directive when signing the current
15700 function with the B key. */
15701
15702 void
15703 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
15704 {
15705 if (!cfun->is_thunk && aarch64_return_address_signing_enabled ()
15706 && aarch64_ra_sign_key == AARCH64_KEY_B)
15707 asm_fprintf (f, "\t.cfi_b_key_frame\n");
15708 }
15709
15710 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15711
15712 static void
15713 aarch64_start_file (void)
15714 {
15715 struct cl_target_option *default_options
15716 = TREE_TARGET_OPTION (target_option_default_node);
15717
15718 const struct processor *default_arch
15719 = aarch64_get_arch (default_options->x_explicit_arch);
15720 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
15721 std::string extension
15722 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15723 default_arch->flags);
15724
15725 aarch64_last_printed_arch_string = default_arch->name + extension;
15726 aarch64_last_printed_tune_string = "";
15727 asm_fprintf (asm_out_file, "\t.arch %s\n",
15728 aarch64_last_printed_arch_string.c_str ());
15729
15730 default_file_start ();
15731 }
15732
15733 /* Emit load exclusive. */
15734
15735 static void
15736 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15737 rtx mem, rtx model_rtx)
15738 {
15739 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15740 }
15741
15742 /* Emit store exclusive. */
15743
15744 static void
15745 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15746 rtx rval, rtx mem, rtx model_rtx)
15747 {
15748 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
15749 }
15750
15751 /* Emit the jump pattern INSN and mark it as unlikely to be taken. */
15752
15753 static void
15754 aarch64_emit_unlikely_jump (rtx insn)
15755 {
15756 rtx_insn *jump = emit_jump_insn (insn);
15757 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15758 }
15759
15760 /* Expand a compare and swap pattern. */
15761
15762 void
15763 aarch64_expand_compare_and_swap (rtx operands[])
15764 {
15765 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15766 machine_mode mode, r_mode;
15767
15768 bval = operands[0];
15769 rval = operands[1];
15770 mem = operands[2];
15771 oldval = operands[3];
15772 newval = operands[4];
15773 is_weak = operands[5];
15774 mod_s = operands[6];
15775 mod_f = operands[7];
15776 mode = GET_MODE (mem);
15777
15778 /* Normally the succ memory model must be stronger than fail, but in the
15779 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15780 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15781 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15782 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15783 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15784
15785 r_mode = mode;
15786 if (mode == QImode || mode == HImode)
15787 {
15788 r_mode = SImode;
15789 rval = gen_reg_rtx (r_mode);
15790 }
15791
15792 if (TARGET_LSE)
15793 {
15794 /* The CAS insn requires oldval and rval overlap, but we need to
15795 have a copy of oldval saved across the operation to tell if
15796 the operation is successful. */
15797 if (reg_overlap_mentioned_p (rval, oldval))
15798 rval = copy_to_mode_reg (r_mode, oldval);
15799 else
15800 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15801
15802 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15803 newval, mod_s));
15804 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15805 }
15806 else
15807 {
15808 /* The oldval predicate varies by mode. Test it and force to reg. */
15809 insn_code code = code_for_aarch64_compare_and_swap (mode);
15810 if (!insn_data[code].operand[2].predicate (oldval, mode))
15811 oldval = force_reg (mode, oldval);
15812
15813 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15814 is_weak, mod_s, mod_f));
15815 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15816 }
15817
15818 if (r_mode != mode)
15819 rval = gen_lowpart (mode, rval);
15820 emit_move_insn (operands[1], rval);
15821
15822 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15823 emit_insn (gen_rtx_SET (bval, x));
15824 }
15825
15826 /* Emit a barrier appropriate for memory model MODEL at the end of a
15827 sequence implementing an atomic operation. */
15828
15829 static void
15830 aarch64_emit_post_barrier (enum memmodel model)
15831 {
15832 const enum memmodel base_model = memmodel_base (model);
15833
15834 if (is_mm_sync (model)
15835 && (base_model == MEMMODEL_ACQUIRE
15836 || base_model == MEMMODEL_ACQ_REL
15837 || base_model == MEMMODEL_SEQ_CST))
15838 {
15839 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15840 }
15841 }
15842
15843 /* Split a compare and swap pattern. */
15844
15845 void
15846 aarch64_split_compare_and_swap (rtx operands[])
15847 {
15848 rtx rval, mem, oldval, newval, scratch;
15849 machine_mode mode;
15850 bool is_weak;
15851 rtx_code_label *label1, *label2;
15852 rtx x, cond;
15853 enum memmodel model;
15854 rtx model_rtx;
15855
15856 rval = operands[0];
15857 mem = operands[1];
15858 oldval = operands[2];
15859 newval = operands[3];
15860 is_weak = (operands[4] != const0_rtx);
15861 model_rtx = operands[5];
15862 scratch = operands[7];
15863 mode = GET_MODE (mem);
15864 model = memmodel_from_int (INTVAL (model_rtx));
15865
15866 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15867 loop:
15868 .label1:
15869 LD[A]XR rval, [mem]
15870 CBNZ rval, .label2
15871 ST[L]XR scratch, newval, [mem]
15872 CBNZ scratch, .label1
15873 .label2:
15874 CMP rval, 0. */
15875 bool strong_zero_p = !is_weak && oldval == const0_rtx;
15876
15877 label1 = NULL;
15878 if (!is_weak)
15879 {
15880 label1 = gen_label_rtx ();
15881 emit_label (label1);
15882 }
15883 label2 = gen_label_rtx ();
15884
15885 /* The initial load can be relaxed for a __sync operation since a final
15886 barrier will be emitted to stop code hoisting. */
15887 if (is_mm_sync (model))
15888 aarch64_emit_load_exclusive (mode, rval, mem,
15889 GEN_INT (MEMMODEL_RELAXED));
15890 else
15891 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15892
15893 if (strong_zero_p)
15894 {
15895 if (aarch64_track_speculation)
15896 {
15897 /* Emit an explicit compare instruction, so that we can correctly
15898 track the condition codes. */
15899 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
15900 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15901 }
15902 else
15903 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15904
15905 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15906 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15907 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15908 }
15909 else
15910 {
15911 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15912 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15913 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15914 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15915 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15916 }
15917
15918 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
15919
15920 if (!is_weak)
15921 {
15922 if (aarch64_track_speculation)
15923 {
15924 /* Emit an explicit compare instruction, so that we can correctly
15925 track the condition codes. */
15926 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
15927 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15928 }
15929 else
15930 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
15931
15932 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15933 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
15934 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15935 }
15936 else
15937 {
15938 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15939 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
15940 emit_insn (gen_rtx_SET (cond, x));
15941 }
15942
15943 emit_label (label2);
15944 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
15945 to set the condition flags. If this is not used it will be removed by
15946 later passes. */
15947 if (strong_zero_p)
15948 {
15949 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15950 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
15951 emit_insn (gen_rtx_SET (cond, x));
15952 }
15953 /* Emit any final barrier needed for a __sync operation. */
15954 if (is_mm_sync (model))
15955 aarch64_emit_post_barrier (model);
15956 }
15957
15958 /* Split an atomic operation. */
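/* Roughly, for an atomic "mem = mem <op> value" this expands to:
     .label:
       LD[A]XR  old, [mem]
       <op>     new, old, value
       ST[L]XR  status, new, [mem]
       CBNZ     status, .label
   followed by any final barrier needed for a __sync operation.  */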
15959
15960 void
15961 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
15962 rtx value, rtx model_rtx, rtx cond)
15963 {
15964 machine_mode mode = GET_MODE (mem);
15965 machine_mode wmode = (mode == DImode ? DImode : SImode);
15966 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
15967 const bool is_sync = is_mm_sync (model);
15968 rtx_code_label *label;
15969 rtx x;
15970
15971 /* Split the atomic operation into a sequence. */
15972 label = gen_label_rtx ();
15973 emit_label (label);
15974
15975 if (new_out)
15976 new_out = gen_lowpart (wmode, new_out);
15977 if (old_out)
15978 old_out = gen_lowpart (wmode, old_out);
15979 else
15980 old_out = new_out;
15981 value = simplify_gen_subreg (wmode, value, mode, 0);
15982
15983 /* The initial load can be relaxed for a __sync operation since a final
15984 barrier will be emitted to stop code hoisting. */
15985 if (is_sync)
15986 aarch64_emit_load_exclusive (mode, old_out, mem,
15987 GEN_INT (MEMMODEL_RELAXED));
15988 else
15989 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
15990
15991 switch (code)
15992 {
15993 case SET:
15994 new_out = value;
15995 break;
15996
15997 case NOT:
15998 x = gen_rtx_AND (wmode, old_out, value);
15999 emit_insn (gen_rtx_SET (new_out, x));
16000 x = gen_rtx_NOT (wmode, new_out);
16001 emit_insn (gen_rtx_SET (new_out, x));
16002 break;
16003
16004 case MINUS:
16005 if (CONST_INT_P (value))
16006 {
16007 value = GEN_INT (-INTVAL (value));
16008 code = PLUS;
16009 }
16010 /* Fall through. */
16011
16012 default:
16013 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16014 emit_insn (gen_rtx_SET (new_out, x));
16015 break;
16016 }
16017
16018 aarch64_emit_store_exclusive (mode, cond, mem,
16019 gen_lowpart (mode, new_out), model_rtx);
16020
16021 if (aarch64_track_speculation)
16022 {
16023 /* Emit an explicit compare instruction, so that we can correctly
16024 track the condition codes. */
16025 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16026 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16027 }
16028 else
16029 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16030
16031 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16032 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16033 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16034
16035 /* Emit any final barrier needed for a __sync operation. */
16036 if (is_sync)
16037 aarch64_emit_post_barrier (model);
16038 }
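/* For example, a relaxed __atomic_fetch_add on a DImode location splits into
   a loop of roughly the following shape (register names are illustrative and
   the acquire/release forms of the exclusives depend on MODEL_RTX):

	.L1:	ldxr	x0, [x2]	// OLD_OUT
		add	x1, x0, x3	// NEW_OUT = OLD_OUT + VALUE
		stxr	w4, x1, [x2]	// COND records success/failure
		cbnz	w4, .L1		// retry until the exclusive store succeeds
*/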
16039
16040 static void
16041 aarch64_init_libfuncs (void)
16042 {
16043 /* Half-precision float operations. The compiler handles all operations
16044 with NULL libfuncs by converting to SFmode. */
16045
16046 /* Conversions. */
16047 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16048 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16049
16050 /* Arithmetic. */
16051 set_optab_libfunc (add_optab, HFmode, NULL);
16052 set_optab_libfunc (sdiv_optab, HFmode, NULL);
16053 set_optab_libfunc (smul_optab, HFmode, NULL);
16054 set_optab_libfunc (neg_optab, HFmode, NULL);
16055 set_optab_libfunc (sub_optab, HFmode, NULL);
16056
16057 /* Comparisons. */
16058 set_optab_libfunc (eq_optab, HFmode, NULL);
16059 set_optab_libfunc (ne_optab, HFmode, NULL);
16060 set_optab_libfunc (lt_optab, HFmode, NULL);
16061 set_optab_libfunc (le_optab, HFmode, NULL);
16062 set_optab_libfunc (ge_optab, HFmode, NULL);
16063 set_optab_libfunc (gt_optab, HFmode, NULL);
16064 set_optab_libfunc (unord_optab, HFmode, NULL);
16065 }
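/* With the arithmetic and comparison libfuncs left as NULL, a half-precision
   operation such as adding two __fp16 values is, roughly speaking, widened
   through __gnu_h2f_ieee, performed in SFmode, and narrowed back with
   __gnu_f2h_ieee when no native FP16 arithmetic instructions are available.
   This is an illustrative description rather than a guaranteed code
   sequence. */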
16066
16067 /* Target hook for c_mode_for_suffix. */
16068 static machine_mode
16069 aarch64_c_mode_for_suffix (char suffix)
16070 {
16071 if (suffix == 'q')
16072 return TFmode;
16073
16074 return VOIDmode;
16075 }
16076
16077 /* We can only represent floating point constants which will fit in
16078 "quarter-precision" values. These values are characterised by
16079 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
16080 by:
16081
16082 (-1)^s * (n/16) * 2^r
16083
16084 Where:
16085 's' is the sign bit.
16086 'n' is an integer in the range 16 <= n <= 31.
16087 'r' is an integer in the range -3 <= r <= 4. */
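/* For example, 1.5 is representable as (-1)^0 * (24/16) * 2^0 and 0.125 as
   (-1)^0 * (16/16) * 2^-3, while the largest representable magnitude is
   (31/16) * 2^4 = 31.0; a value such as 0.1 falls outside this set. */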
16088
16089 /* Return true iff X can be represented by a quarter-precision
16090 floating point immediate operand. Note, we cannot represent 0.0. */
16091 bool
16092 aarch64_float_const_representable_p (rtx x)
16093 {
16094 /* This represents our current view of how many bits
16095 make up the mantissa. */
16096 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16097 int exponent;
16098 unsigned HOST_WIDE_INT mantissa, mask;
16099 REAL_VALUE_TYPE r, m;
16100 bool fail;
16101
16102 if (!CONST_DOUBLE_P (x))
16103 return false;
16104
16105 if (GET_MODE (x) == VOIDmode
16106 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
16107 return false;
16108
16109 r = *CONST_DOUBLE_REAL_VALUE (x);
16110
16111 /* We cannot represent infinities, NaNs or +/-zero. We won't
16112 know if we have +zero until we analyse the mantissa, but we
16113 can reject the other invalid values. */
16114 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
16115 || REAL_VALUE_MINUS_ZERO (r))
16116 return false;
16117
16118 /* Extract exponent. */
16119 r = real_value_abs (&r);
16120 exponent = REAL_EXP (&r);
16121
16122 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
16123 highest (sign) bit, with a fixed binary point at bit point_pos.
16124 W holds the low part of the mantissa in its low HOST_WIDE_INT and the high part in its high HOST_WIDE_INT.
16125 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16126 bits for the mantissa, this can fail (low bits will be lost). */
16127 real_ldexp (&m, &r, point_pos - exponent);
16128 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
16129
16130 /* If the low part of the mantissa has bits set we cannot represent
16131 the value. */
16132 if (w.ulow () != 0)
16133 return false;
16134 /* We have rejected the lower HOST_WIDE_INT, so update our
16135 understanding of how many bits lie in the mantissa and
16136 look only at the high HOST_WIDE_INT. */
16137 mantissa = w.elt (1);
16138 point_pos -= HOST_BITS_PER_WIDE_INT;
16139
16140 /* We can only represent values with a mantissa of the form 1.xxxx. */
16141 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
16142 if ((mantissa & mask) != 0)
16143 return false;
16144
16145 /* Having filtered unrepresentable values, we may now remove all
16146 but the highest 5 bits. */
16147 mantissa >>= point_pos - 5;
16148
16149 /* We cannot represent the value 0.0, so reject it. This is handled
16150 elsewhere. */
16151 if (mantissa == 0)
16152 return false;
16153
16154 /* Then, as bit 4 is always set, we can mask it off, leaving
16155 the mantissa in the range [0, 15]. */
16156 mantissa &= ~(1 << 4);
16157 gcc_assert (mantissa <= 15);
16158
16159 /* GCC internally does not use IEEE754-like encoding (where normalized
16160 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
16161 Our mantissa values are shifted 4 places to the left relative to
16162 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
16163 by 5 places to correct for GCC's representation. */
16164 exponent = 5 - exponent;
16165
16166 return (exponent >= 0 && exponent <= 7);
16167 }
16168
16169 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
16170 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
16171 output MOVI/MVNI, ORR or BIC immediate. */
16172 char*
16173 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
16174 enum simd_immediate_check which)
16175 {
16176 bool is_valid;
16177 static char templ[40];
16178 const char *mnemonic;
16179 const char *shift_op;
16180 unsigned int lane_count = 0;
16181 char element_char;
16182
16183 struct simd_immediate_info info;
16184
16185 /* This will return true to show const_vector is legal for use as either
16186 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
16187 It will also update INFO to show how the immediate should be generated.
16188 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
16189 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
16190 gcc_assert (is_valid);
16191
16192 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16193 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
16194
16195 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16196 {
16197 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
16198 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
16199 move immediate path. */
16200 if (aarch64_float_const_zero_rtx_p (info.value))
16201 info.value = GEN_INT (0);
16202 else
16203 {
16204 const unsigned int buf_size = 20;
16205 char float_buf[buf_size] = {'\0'};
16206 real_to_decimal_for_mode (float_buf,
16207 CONST_DOUBLE_REAL_VALUE (info.value),
16208 buf_size, buf_size, 1, info.elt_mode);
16209
16210 if (lane_count == 1)
16211 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
16212 else
16213 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
16214 lane_count, element_char, float_buf);
16215 return templ;
16216 }
16217 }
16218
16219 gcc_assert (CONST_INT_P (info.value));
16220
16221 if (which == AARCH64_CHECK_MOV)
16222 {
16223 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
16224 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
16225 if (lane_count == 1)
16226 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
16227 mnemonic, UINTVAL (info.value));
16228 else if (info.shift)
16229 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16230 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
16231 element_char, UINTVAL (info.value), shift_op, info.shift);
16232 else
16233 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16234 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
16235 element_char, UINTVAL (info.value));
16236 }
16237 else
16238 {
16239 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
16240 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
16241 if (info.shift)
16242 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16243 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
16244 element_char, UINTVAL (info.value), "lsl", info.shift);
16245 else
16246 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16247 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
16248 element_char, UINTVAL (info.value));
16249 }
16250 return templ;
16251 }
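/* Illustrative outputs of the routine above (the register number, lane count
   and hex formatting depend on the operands): a V4SI constant with every lane
   equal to 0x100 can print as "movi v0.4s, 0x1, lsl 8", its bitwise inverse
   as "mvni v0.4s, 0x1, lsl 8", and an all-lanes 1.0 float constant uses the
   "fmov %0.4s, ..." form with the decimal string produced by
   real_to_decimal_for_mode. */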
16252
16253 char*
16254 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
16255 {
16256
16257 /* If a floating point number was passed and we desire to use it in an
16258 integer mode do the conversion to integer. */
16259 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
16260 {
16261 unsigned HOST_WIDE_INT ival;
16262 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
16263 gcc_unreachable ();
16264 immediate = gen_int_mode (ival, mode);
16265 }
16266
16267 machine_mode vmode;
16268 /* Use a 64 bit mode for everything except for DI/DF mode, where we use
16269 a 128 bit vector mode. */
16270 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
16271
16272 vmode = aarch64_simd_container_mode (mode, width);
16273 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
16274 return aarch64_output_simd_mov_immediate (v_op, width);
16275 }
16276
16277 /* Return the output string to use for moving immediate CONST_VECTOR
16278 into an SVE register. */
16279
16280 char *
16281 aarch64_output_sve_mov_immediate (rtx const_vector)
16282 {
16283 static char templ[40];
16284 struct simd_immediate_info info;
16285 char element_char;
16286
16287 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
16288 gcc_assert (is_valid);
16289
16290 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16291
16292 if (info.step)
16293 {
16294 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
16295 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
16296 element_char, INTVAL (info.value), INTVAL (info.step));
16297 return templ;
16298 }
16299
16300 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16301 {
16302 if (aarch64_float_const_zero_rtx_p (info.value))
16303 info.value = GEN_INT (0);
16304 else
16305 {
16306 const int buf_size = 20;
16307 char float_buf[buf_size] = {};
16308 real_to_decimal_for_mode (float_buf,
16309 CONST_DOUBLE_REAL_VALUE (info.value),
16310 buf_size, buf_size, 1, info.elt_mode);
16311
16312 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
16313 element_char, float_buf);
16314 return templ;
16315 }
16316 }
16317
16318 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
16319 element_char, INTVAL (info.value));
16320 return templ;
16321 }
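/* For instance, an SVE constant that is a linear series starting at 0 with
   step 1 prints as "index z0.s, #0, #1", an all-lanes integer 1 as
   "mov z0.b, #1", and an all-lanes floating-point 1.0 uses the "fmov" form
   with the decimal string produced above; the z0 register name is only
   illustrative. */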
16322
16323 /* Return the asm format for a PTRUE instruction whose destination has
16324 mode MODE. SUFFIX is the element size suffix. */
16325
16326 char *
16327 aarch64_output_ptrue (machine_mode mode, char suffix)
16328 {
16329 unsigned int nunits;
16330 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
16331 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
16332 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
16333 else
16334 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
16335 return buf;
16336 }
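/* For example, with 16 byte elements known at compile time this prints
   "ptrue p0.b, vl16", whereas for a variable-length vector it prints
   "ptrue p0.b, all" (the predicate register number is illustrative). */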
16337
16338 /* Split operands into moves from op[1] + op[2] into op[0]. */
16339
16340 void
16341 aarch64_split_combinev16qi (rtx operands[3])
16342 {
16343 unsigned int dest = REGNO (operands[0]);
16344 unsigned int src1 = REGNO (operands[1]);
16345 unsigned int src2 = REGNO (operands[2]);
16346 machine_mode halfmode = GET_MODE (operands[1]);
16347 unsigned int halfregs = REG_NREGS (operands[1]);
16348 rtx destlo, desthi;
16349
16350 gcc_assert (halfmode == V16QImode);
16351
16352 if (src1 == dest && src2 == dest + halfregs)
16353 {
16354 /* No-op move. Can't split to nothing; emit something. */
16355 emit_note (NOTE_INSN_DELETED);
16356 return;
16357 }
16358
16359 /* Preserve register attributes for variable tracking. */
16360 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
16361 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
16362 GET_MODE_SIZE (halfmode));
16363
16364 /* Special case of reversed high/low parts. */
16365 if (reg_overlap_mentioned_p (operands[2], destlo)
16366 && reg_overlap_mentioned_p (operands[1], desthi))
16367 {
16368 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16369 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
16370 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16371 }
16372 else if (!reg_overlap_mentioned_p (operands[2], destlo))
16373 {
16374 /* Try to avoid unnecessary moves if part of the result
16375 is in the right place already. */
16376 if (src1 != dest)
16377 emit_move_insn (destlo, operands[1]);
16378 if (src2 != dest + halfregs)
16379 emit_move_insn (desthi, operands[2]);
16380 }
16381 else
16382 {
16383 if (src2 != dest + halfregs)
16384 emit_move_insn (desthi, operands[2]);
16385 if (src1 != dest)
16386 emit_move_insn (destlo, operands[1]);
16387 }
16388 }
16389
16390 /* vec_perm support. */
16391
16392 struct expand_vec_perm_d
16393 {
16394 rtx target, op0, op1;
16395 vec_perm_indices perm;
16396 machine_mode vmode;
16397 unsigned int vec_flags;
16398 bool one_vector_p;
16399 bool testing_p;
16400 };
16401
16402 /* Generate a variable permutation. */
16403
16404 static void
16405 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
16406 {
16407 machine_mode vmode = GET_MODE (target);
16408 bool one_vector_p = rtx_equal_p (op0, op1);
16409
16410 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
16411 gcc_checking_assert (GET_MODE (op0) == vmode);
16412 gcc_checking_assert (GET_MODE (op1) == vmode);
16413 gcc_checking_assert (GET_MODE (sel) == vmode);
16414 gcc_checking_assert (TARGET_SIMD);
16415
16416 if (one_vector_p)
16417 {
16418 if (vmode == V8QImode)
16419 {
16420 /* Expand the argument to a V16QI mode by duplicating it. */
16421 rtx pair = gen_reg_rtx (V16QImode);
16422 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
16423 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16424 }
16425 else
16426 {
16427 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
16428 }
16429 }
16430 else
16431 {
16432 rtx pair;
16433
16434 if (vmode == V8QImode)
16435 {
16436 pair = gen_reg_rtx (V16QImode);
16437 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
16438 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16439 }
16440 else
16441 {
16442 pair = gen_reg_rtx (OImode);
16443 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
16444 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
16445 }
16446 }
16447 }
16448
16449 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16450 NELT is the number of elements in the vector. */
16451
16452 void
16453 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16454 unsigned int nelt)
16455 {
16456 machine_mode vmode = GET_MODE (target);
16457 bool one_vector_p = rtx_equal_p (op0, op1);
16458 rtx mask;
16459
16460 /* The TBL instruction does not use a modulo index, so we must take care
16461 of that ourselves. */
16462 mask = aarch64_simd_gen_const_vector_dup (vmode,
16463 one_vector_p ? nelt - 1 : 2 * nelt - 1);
16464 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16465
16466 /* For big-endian, we also need to reverse the index within the vector
16467 (but not which vector). */
16468 if (BYTES_BIG_ENDIAN)
16469 {
16470 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16471 if (!one_vector_p)
16472 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16473 sel = expand_simple_binop (vmode, XOR, sel, mask,
16474 NULL, 0, OPTAB_LIB_WIDEN);
16475 }
16476 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
16477 }
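/* As a worked example for a single V8QI input (NELT = 8): the mask is a
   vector of 7s, so a selector lane holding 10 becomes 10 & 7 = 2 and selects
   element 2, giving the modulo behaviour vec_perm expects. On big-endian the
   subsequent XOR with 7 flips the index within the vector, e.g. 2 becomes 5,
   without changing which input vector is selected. */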
16478
16479 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
16480
16481 static void
16482 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
16483 {
16484 emit_insn (gen_rtx_SET (target,
16485 gen_rtx_UNSPEC (GET_MODE (target),
16486 gen_rtvec (2, op0, op1), code)));
16487 }
16488
16489 /* Expand an SVE vec_perm with the given operands. */
16490
16491 void
16492 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
16493 {
16494 machine_mode data_mode = GET_MODE (target);
16495 machine_mode sel_mode = GET_MODE (sel);
16496 /* Enforced by the pattern condition. */
16497 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
16498
16499 /* Note: vec_perm indices are supposed to wrap when they go beyond the
16500 size of the two value vectors, i.e. the upper bits of the indices
16501 are effectively ignored. SVE TBL instead produces 0 for any
16502 out-of-range indices, so we need to modulo all the vec_perm indices
16503 to ensure they are all in range. */
16504 rtx sel_reg = force_reg (sel_mode, sel);
16505
16506 /* Check if the sel only references the first values vector. */
16507 if (GET_CODE (sel) == CONST_VECTOR
16508 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
16509 {
16510 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
16511 return;
16512 }
16513
16514 /* Check if the two values vectors are the same. */
16515 if (rtx_equal_p (op0, op1))
16516 {
16517 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
16518 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16519 NULL, 0, OPTAB_DIRECT);
16520 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
16521 return;
16522 }
16523
16524 /* Run TBL on each value vector and combine the results. */
16525
16526 rtx res0 = gen_reg_rtx (data_mode);
16527 rtx res1 = gen_reg_rtx (data_mode);
16528 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
16529 if (GET_CODE (sel) != CONST_VECTOR
16530 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
16531 {
16532 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
16533 2 * nunits - 1);
16534 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16535 NULL, 0, OPTAB_DIRECT);
16536 }
16537 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
16538 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
16539 NULL, 0, OPTAB_DIRECT);
16540 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
16541 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
16542 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
16543 else
16544 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
16545 }
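/* Worked example for the two-vector case with NUNITS = 4: a selector lane
   holding 6 is first reduced to 6 & 7 = 6; the TBL on OP0 sees an
   out-of-range index and yields 0, while the TBL on OP1 uses 6 - 4 = 2 and
   yields OP1's element 2, so the final IOR (or UNSPEC_IORF for float element
   modes) produces OP1[2] as required. */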
16546
16547 /* Recognize patterns suitable for the TRN instructions. */
16548 static bool
16549 aarch64_evpc_trn (struct expand_vec_perm_d *d)
16550 {
16551 HOST_WIDE_INT odd;
16552 poly_uint64 nelt = d->perm.length ();
16553 rtx out, in0, in1, x;
16554 machine_mode vmode = d->vmode;
16555
16556 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16557 return false;
16558
16559 /* Note that these are little-endian tests.
16560 We correct for big-endian later. */
16561 if (!d->perm[0].is_constant (&odd)
16562 || (odd != 0 && odd != 1)
16563 || !d->perm.series_p (0, 2, odd, 2)
16564 || !d->perm.series_p (1, 2, nelt + odd, 2))
16565 return false;
16566
16567 /* Success! */
16568 if (d->testing_p)
16569 return true;
16570
16571 in0 = d->op0;
16572 in1 = d->op1;
16573 /* We don't need a big-endian lane correction for SVE; see the comment
16574 at the head of aarch64-sve.md for details. */
16575 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16576 {
16577 x = in0, in0 = in1, in1 = x;
16578 odd = !odd;
16579 }
16580 out = d->target;
16581
16582 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16583 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16584 return true;
16585 }
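/* For a V4SI permutation this matches the index patterns {0, 4, 2, 6}
   (emitted as TRN1) and {1, 5, 3, 7} (emitted as TRN2), before any
   big-endian correction. */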
16586
16587 /* Recognize patterns suitable for the UZP instructions. */
16588 static bool
16589 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16590 {
16591 HOST_WIDE_INT odd;
16592 rtx out, in0, in1, x;
16593 machine_mode vmode = d->vmode;
16594
16595 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16596 return false;
16597
16598 /* Note that these are little-endian tests.
16599 We correct for big-endian later. */
16600 if (!d->perm[0].is_constant (&odd)
16601 || (odd != 0 && odd != 1)
16602 || !d->perm.series_p (0, 1, odd, 2))
16603 return false;
16604
16605 /* Success! */
16606 if (d->testing_p)
16607 return true;
16608
16609 in0 = d->op0;
16610 in1 = d->op1;
16611 /* We don't need a big-endian lane correction for SVE; see the comment
16612 at the head of aarch64-sve.md for details. */
16613 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16614 {
16615 x = in0, in0 = in1, in1 = x;
16616 odd = !odd;
16617 }
16618 out = d->target;
16619
16620 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16621 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16622 return true;
16623 }
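/* For a V4SI permutation this matches {0, 2, 4, 6} (UZP1) and
   {1, 3, 5, 7} (UZP2), before any big-endian correction. */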
16624
16625 /* Recognize patterns suitable for the ZIP instructions. */
16626 static bool
16627 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16628 {
16629 unsigned int high;
16630 poly_uint64 nelt = d->perm.length ();
16631 rtx out, in0, in1, x;
16632 machine_mode vmode = d->vmode;
16633
16634 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16635 return false;
16636
16637 /* Note that these are little-endian tests.
16638 We correct for big-endian later. */
16639 poly_uint64 first = d->perm[0];
16640 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16641 || !d->perm.series_p (0, 2, first, 1)
16642 || !d->perm.series_p (1, 2, first + nelt, 1))
16643 return false;
16644 high = maybe_ne (first, 0U);
16645
16646 /* Success! */
16647 if (d->testing_p)
16648 return true;
16649
16650 in0 = d->op0;
16651 in1 = d->op1;
16652 /* We don't need a big-endian lane correction for SVE; see the comment
16653 at the head of aarch64-sve.md for details. */
16654 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16655 {
16656 x = in0, in0 = in1, in1 = x;
16657 high = !high;
16658 }
16659 out = d->target;
16660
16661 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16662 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16663 return true;
16664 }
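/* For a V4SI permutation this matches {0, 4, 1, 5} (ZIP1) and
   {2, 6, 3, 7} (ZIP2), before any big-endian correction. */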
16665
16666 /* Recognize patterns for the EXT insn. */
16667
16668 static bool
16669 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16670 {
16671 HOST_WIDE_INT location;
16672 rtx offset;
16673
16674 /* The first element always refers to the first vector.
16675 Check if the extracted indices are increasing by one. */
16676 if (d->vec_flags == VEC_SVE_PRED
16677 || !d->perm[0].is_constant (&location)
16678 || !d->perm.series_p (0, 1, location, 1))
16679 return false;
16680
16681 /* Success! */
16682 if (d->testing_p)
16683 return true;
16684
16685 /* The case where (location == 0) is a no-op for both big- and little-endian,
16686 and is removed by the mid-end at optimization levels -O1 and higher.
16687
16688 We don't need a big-endian lane correction for SVE; see the comment
16689 at the head of aarch64-sve.md for details. */
16690 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16691 {
16692 /* After setup, we want the high elements of the first vector (stored
16693 at the LSB end of the register), and the low elements of the second
16694 vector (stored at the MSB end of the register). So swap. */
16695 std::swap (d->op0, d->op1);
16696 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16697 to_constant () is safe since this is restricted to Advanced SIMD
16698 vectors. */
16699 location = d->perm.length ().to_constant () - location;
16700 }
16701
16702 offset = GEN_INT (location);
16703 emit_set_insn (d->target,
16704 gen_rtx_UNSPEC (d->vmode,
16705 gen_rtvec (3, d->op0, d->op1, offset),
16706 UNSPEC_EXT));
16707 return true;
16708 }
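/* For example, a V4SI permutation with indices {1, 2, 3, 4} selects the
   three trailing elements of the first vector followed by the leading
   element of the second, i.e. four consecutive elements of the
   concatenation starting at element offset 1, which maps onto a single
   EXT instruction. */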
16709
16710 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16711 within each 64-bit, 32-bit or 16-bit granule. */
16712
16713 static bool
16714 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16715 {
16716 HOST_WIDE_INT diff;
16717 unsigned int i, size, unspec;
16718 machine_mode pred_mode;
16719
16720 if (d->vec_flags == VEC_SVE_PRED
16721 || !d->one_vector_p
16722 || !d->perm[0].is_constant (&diff))
16723 return false;
16724
16725 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16726 if (size == 8)
16727 {
16728 unspec = UNSPEC_REV64;
16729 pred_mode = VNx2BImode;
16730 }
16731 else if (size == 4)
16732 {
16733 unspec = UNSPEC_REV32;
16734 pred_mode = VNx4BImode;
16735 }
16736 else if (size == 2)
16737 {
16738 unspec = UNSPEC_REV16;
16739 pred_mode = VNx8BImode;
16740 }
16741 else
16742 return false;
16743
16744 unsigned int step = diff + 1;
16745 for (i = 0; i < step; ++i)
16746 if (!d->perm.series_p (i, step, diff - i, step))
16747 return false;
16748
16749 /* Success! */
16750 if (d->testing_p)
16751 return true;
16752
16753 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16754 if (d->vec_flags == VEC_SVE_DATA)
16755 {
16756 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16757 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16758 UNSPEC_MERGE_PTRUE);
16759 }
16760 emit_set_insn (d->target, src);
16761 return true;
16762 }
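/* For instance, a V8HI permutation of {1, 0, 3, 2, 5, 4, 7, 6} reverses the
   16-bit elements within each 32-bit granule and is matched here as REV32;
   {3, 2, 1, 0, 7, 6, 5, 4} would likewise be matched as REV64. */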
16763
16764 /* Recognize patterns for the REV insn, which reverses elements within
16765 a full vector. */
16766
16767 static bool
16768 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16769 {
16770 poly_uint64 nelt = d->perm.length ();
16771
16772 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16773 return false;
16774
16775 if (!d->perm.series_p (0, 1, nelt - 1, -1))
16776 return false;
16777
16778 /* Success! */
16779 if (d->testing_p)
16780 return true;
16781
16782 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16783 emit_set_insn (d->target, src);
16784 return true;
16785 }
16786
16787 static bool
16788 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16789 {
16790 rtx out = d->target;
16791 rtx in0;
16792 HOST_WIDE_INT elt;
16793 machine_mode vmode = d->vmode;
16794 rtx lane;
16795
16796 if (d->vec_flags == VEC_SVE_PRED
16797 || d->perm.encoding ().encoded_nelts () != 1
16798 || !d->perm[0].is_constant (&elt))
16799 return false;
16800
16801 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16802 return false;
16803
16804 /* Success! */
16805 if (d->testing_p)
16806 return true;
16807
16808 /* The generic preparation in aarch64_expand_vec_perm_const_1
16809 swaps the operand order and the permute indices if it finds
16810 d->perm[0] to be in the second operand. Thus, we can always
16811 use d->op0 and need not do any extra arithmetic to get the
16812 correct lane number. */
16813 in0 = d->op0;
16814 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
16815
16816 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16817 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16818 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16819 return true;
16820 }
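/* For example, a permutation whose encoding is a single repeated index,
   such as {2, 2, 2, 2} on V4SI, is matched here and emitted as a duplicate
   of lane 2 of the first operand. */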
16821
16822 static bool
16823 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16824 {
16825 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16826 machine_mode vmode = d->vmode;
16827
16828 /* Make sure that the indices are constant. */
16829 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16830 for (unsigned int i = 0; i < encoded_nelts; ++i)
16831 if (!d->perm[i].is_constant ())
16832 return false;
16833
16834 if (d->testing_p)
16835 return true;
16836
16837 /* Generic code will try constant permutation twice: once with the
16838 original mode and again with the elements lowered to QImode.
16839 So wait and don't do the selector expansion ourselves. */
16840 if (vmode != V8QImode && vmode != V16QImode)
16841 return false;
16842
16843 /* to_constant is safe since this routine is specific to Advanced SIMD
16844 vectors. */
16845 unsigned int nelt = d->perm.length ().to_constant ();
16846 for (unsigned int i = 0; i < nelt; ++i)
16847 /* If big-endian and two vectors we end up with a weird mixed-endian
16848 mode on NEON. Reverse the index within each word but not the word
16849 itself. to_constant is safe because we checked is_constant above. */
16850 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16851 ? d->perm[i].to_constant () ^ (nelt - 1)
16852 : d->perm[i].to_constant ());
16853
16854 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16855 sel = force_reg (vmode, sel);
16856
16857 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16858 return true;
16859 }
16860
16861 /* Try to implement D using an SVE TBL instruction. */
16862
16863 static bool
16864 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16865 {
16866 unsigned HOST_WIDE_INT nelt;
16867
16868 /* Permuting two variable-length vectors could overflow the
16869 index range. */
16870 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16871 return false;
16872
16873 if (d->testing_p)
16874 return true;
16875
16876 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16877 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16878 if (d->one_vector_p)
16879 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16880 else
16881 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16882 return true;
16883 }
16884
16885 static bool
16886 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16887 {
16888 /* The pattern matching functions above are written to look for a small
16889 number to begin the sequence (0, 1, N/2). If we begin with an index
16890 from the second operand, we can swap the operands. */
16891 poly_int64 nelt = d->perm.length ();
16892 if (known_ge (d->perm[0], nelt))
16893 {
16894 d->perm.rotate_inputs (1);
16895 std::swap (d->op0, d->op1);
16896 }
16897
16898 if ((d->vec_flags == VEC_ADVSIMD
16899 || d->vec_flags == VEC_SVE_DATA
16900 || d->vec_flags == VEC_SVE_PRED)
16901 && known_gt (nelt, 1))
16902 {
16903 if (aarch64_evpc_rev_local (d))
16904 return true;
16905 else if (aarch64_evpc_rev_global (d))
16906 return true;
16907 else if (aarch64_evpc_ext (d))
16908 return true;
16909 else if (aarch64_evpc_dup (d))
16910 return true;
16911 else if (aarch64_evpc_zip (d))
16912 return true;
16913 else if (aarch64_evpc_uzp (d))
16914 return true;
16915 else if (aarch64_evpc_trn (d))
16916 return true;
16917 if (d->vec_flags == VEC_SVE_DATA)
16918 return aarch64_evpc_sve_tbl (d);
16919 else if (d->vec_flags == VEC_ADVSIMD)
16920 return aarch64_evpc_tbl (d);
16921 }
16922 return false;
16923 }
16924
16925 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
16926
16927 static bool
16928 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
16929 rtx op1, const vec_perm_indices &sel)
16930 {
16931 struct expand_vec_perm_d d;
16932
16933 /* Check whether the mask can be applied to a single vector. */
16934 if (sel.ninputs () == 1
16935 || (op0 && rtx_equal_p (op0, op1)))
16936 d.one_vector_p = true;
16937 else if (sel.all_from_input_p (0))
16938 {
16939 d.one_vector_p = true;
16940 op1 = op0;
16941 }
16942 else if (sel.all_from_input_p (1))
16943 {
16944 d.one_vector_p = true;
16945 op0 = op1;
16946 }
16947 else
16948 d.one_vector_p = false;
16949
16950 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
16951 sel.nelts_per_input ());
16952 d.vmode = vmode;
16953 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
16954 d.target = target;
16955 d.op0 = op0;
16956 d.op1 = op1;
16957 d.testing_p = !target;
16958
16959 if (!d.testing_p)
16960 return aarch64_expand_vec_perm_const_1 (&d);
16961
16962 rtx_insn *last = get_last_insn ();
16963 bool ret = aarch64_expand_vec_perm_const_1 (&d);
16964 gcc_assert (last == get_last_insn ());
16965
16966 return ret;
16967 }
16968
16969 /* Generate a byte permute mask for a register of mode MODE,
16970 which has NUNITS units. */
16971
16972 rtx
16973 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
16974 {
16975 /* We have to reverse each vector because we don't have
16976 a permuted load that can reverse-load according to ABI rules. */
16977 rtx mask;
16978 rtvec v = rtvec_alloc (16);
16979 unsigned int i, j;
16980 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
16981
16982 gcc_assert (BYTES_BIG_ENDIAN);
16983 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
16984
16985 for (i = 0; i < nunits; i++)
16986 for (j = 0; j < usize; j++)
16987 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
16988 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
16989 return force_reg (V16QImode, mask);
16990 }
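/* For a V4SImode register this builds the byte selector
   {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}, i.e. the bytes of
   each 32-bit unit are reversed while the order of the units is kept. */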
16991
16992 /* Return true if X is a valid second operand for the SVE instruction
16993 that implements integer comparison OP_CODE. */
16994
16995 static bool
16996 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
16997 {
16998 if (register_operand (x, VOIDmode))
16999 return true;
17000
17001 switch (op_code)
17002 {
17003 case LTU:
17004 case LEU:
17005 case GEU:
17006 case GTU:
17007 return aarch64_sve_cmp_immediate_p (x, false);
17008 case LT:
17009 case LE:
17010 case GE:
17011 case GT:
17012 case NE:
17013 case EQ:
17014 return aarch64_sve_cmp_immediate_p (x, true);
17015 default:
17016 gcc_unreachable ();
17017 }
17018 }
17019
17020 /* Use predicated SVE instructions to implement the equivalent of:
17021
17022 (set TARGET OP)
17023
17024 given that PTRUE is an all-true predicate of the appropriate mode. */
17025
17026 static void
17027 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
17028 {
17029 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17030 gen_rtvec (2, ptrue, op),
17031 UNSPEC_MERGE_PTRUE);
17032 rtx_insn *insn = emit_set_insn (target, unspec);
17033 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17034 }
17035
17036 /* Likewise, but also clobber the condition codes. */
17037
17038 static void
17039 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
17040 {
17041 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17042 gen_rtvec (2, ptrue, op),
17043 UNSPEC_MERGE_PTRUE);
17044 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
17045 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17046 }
17047
17048 /* Return the UNSPEC_COND_* code for comparison CODE. */
17049
17050 static unsigned int
17051 aarch64_unspec_cond_code (rtx_code code)
17052 {
17053 switch (code)
17054 {
17055 case NE:
17056 return UNSPEC_COND_NE;
17057 case EQ:
17058 return UNSPEC_COND_EQ;
17059 case LT:
17060 return UNSPEC_COND_LT;
17061 case GT:
17062 return UNSPEC_COND_GT;
17063 case LE:
17064 return UNSPEC_COND_LE;
17065 case GE:
17066 return UNSPEC_COND_GE;
17067 default:
17068 gcc_unreachable ();
17069 }
17070 }
17071
17072 /* Emit:
17073
17074 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
17075
17076 where <X> is the operation associated with comparison CODE. This form
17077 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
17078 semantics, such as when PRED might not be all-true and when comparing
17079 inactive lanes could have side effects. */
17080
17081 static void
17082 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
17083 rtx pred, rtx op0, rtx op1)
17084 {
17085 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17086 gen_rtvec (3, pred, op0, op1),
17087 aarch64_unspec_cond_code (code));
17088 emit_set_insn (target, unspec);
17089 }
17090
17091 /* Expand an SVE integer comparison using the SVE equivalent of:
17092
17093 (set TARGET (CODE OP0 OP1)). */
17094
17095 void
17096 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17097 {
17098 machine_mode pred_mode = GET_MODE (target);
17099 machine_mode data_mode = GET_MODE (op0);
17100
17101 if (!aarch64_sve_cmp_operand_p (code, op1))
17102 op1 = force_reg (data_mode, op1);
17103
17104 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
17105 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17106 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
17107 }
17108
17109 /* Emit the SVE equivalent of:
17110
17111 (set TMP1 (CODE1 OP0 OP1))
17112 (set TMP2 (CODE2 OP0 OP1))
17113 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17114
17115 PTRUE is an all-true predicate with the same mode as TARGET. */
17116
17117 static void
17118 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
17119 rtx ptrue, rtx op0, rtx op1)
17120 {
17121 machine_mode pred_mode = GET_MODE (ptrue);
17122 rtx tmp1 = gen_reg_rtx (pred_mode);
17123 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
17124 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
17125 rtx tmp2 = gen_reg_rtx (pred_mode);
17126 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
17127 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
17128 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17129 }
17130
17131 /* Emit the SVE equivalent of:
17132
17133 (set TMP (CODE OP0 OP1))
17134 (set TARGET (not TMP))
17135
17136 PTRUE is an all-true predicate with the same mode as TARGET. */
17137
17138 static void
17139 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
17140 rtx op0, rtx op1)
17141 {
17142 machine_mode pred_mode = GET_MODE (ptrue);
17143 rtx tmp = gen_reg_rtx (pred_mode);
17144 aarch64_emit_sve_ptrue_op (tmp, ptrue,
17145 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
17146 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17147 }
17148
17149 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17150
17151 (set TARGET (CODE OP0 OP1))
17152
17153 If CAN_INVERT_P is true, the caller can also handle inverted results;
17154 return true if the result is in fact inverted. */
17155
17156 bool
17157 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
17158 rtx op0, rtx op1, bool can_invert_p)
17159 {
17160 machine_mode pred_mode = GET_MODE (target);
17161 machine_mode data_mode = GET_MODE (op0);
17162
17163 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
17164 switch (code)
17165 {
17166 case UNORDERED:
17167 /* UNORDERED has no immediate form. */
17168 op1 = force_reg (data_mode, op1);
17169 /* fall through */
17170 case LT:
17171 case LE:
17172 case GT:
17173 case GE:
17174 case EQ:
17175 case NE:
17176 {
17177 /* There is native support for the comparison. */
17178 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17179 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17180 return false;
17181 }
17182
17183 case LTGT:
17184 /* This is a trapping operation (LT or GT). */
17185 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
17186 return false;
17187
17188 case UNEQ:
17189 if (!flag_trapping_math)
17190 {
17191 /* This would trap for signaling NaNs. */
17192 op1 = force_reg (data_mode, op1);
17193 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
17194 return false;
17195 }
17196 /* fall through */
17197 case UNLT:
17198 case UNLE:
17199 case UNGT:
17200 case UNGE:
17201 if (flag_trapping_math)
17202 {
17203 /* Work out which elements are ordered. */
17204 rtx ordered = gen_reg_rtx (pred_mode);
17205 op1 = force_reg (data_mode, op1);
17206 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
17207
17208 /* Test the opposite condition for the ordered elements,
17209 then invert the result. */
17210 if (code == UNEQ)
17211 code = NE;
17212 else
17213 code = reverse_condition_maybe_unordered (code);
17214 if (can_invert_p)
17215 {
17216 aarch64_emit_sve_predicated_cond (target, code,
17217 ordered, op0, op1);
17218 return true;
17219 }
17220 rtx tmp = gen_reg_rtx (pred_mode);
17221 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
17222 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17223 return false;
17224 }
17225 break;
17226
17227 case ORDERED:
17228 /* ORDERED has no immediate form. */
17229 op1 = force_reg (data_mode, op1);
17230 break;
17231
17232 default:
17233 gcc_unreachable ();
17234 }
17235
17236 /* There is native support for the inverse comparison. */
17237 code = reverse_condition_maybe_unordered (code);
17238 if (can_invert_p)
17239 {
17240 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17241 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17242 return true;
17243 }
17244 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
17245 return false;
17246 }
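/* As a worked example of the trapping-math path above: UNLT (x, y) is
   equivalent to !(ORDERED (x, y) && GE (x, y)), so the expansion computes
   the ordered lanes by inverting an UNORDERED compare, performs the
   reversed comparison GE only on those lanes, and then either reports the
   result as inverted (when CAN_INVERT_P) or inverts it explicitly. */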
17247
17248 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
17249 of the data being selected and CMP_MODE is the mode of the values being
17250 compared. */
17251
17252 void
17253 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
17254 rtx *ops)
17255 {
17256 machine_mode pred_mode
17257 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
17258 GET_MODE_SIZE (cmp_mode)).require ();
17259 rtx pred = gen_reg_rtx (pred_mode);
17260 if (FLOAT_MODE_P (cmp_mode))
17261 {
17262 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
17263 ops[4], ops[5], true))
17264 std::swap (ops[1], ops[2]);
17265 }
17266 else
17267 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
17268
17269 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
17270 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
17271 }
17272
17273 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
17274 true. However, due to issues with register allocation it is preferable
17275 to avoid tying integer scalar and FP scalar modes. Executing integer
17276 operations in general registers is better than treating them as scalar
17277 vector operations. This reduces latency and avoids redundant int<->FP
17278 moves. So tie modes if they are either the same class, or vector modes
17279 with other vector modes, vector structs or any scalar mode. */
17280
17281 static bool
17282 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
17283 {
17284 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
17285 return true;
17286
17287 /* We specifically want to allow elements of "structure" modes to
17288 be tieable to the structure. This more general condition allows
17289 other rarer situations too. The reason we don't extend this to
17290 predicate modes is that there are no predicate structure modes
17291 nor any specific instructions for extracting part of a predicate
17292 register. */
17293 if (aarch64_vector_data_mode_p (mode1)
17294 && aarch64_vector_data_mode_p (mode2))
17295 return true;
17296
17297 /* Also allow any scalar modes with vectors. */
17298 if (aarch64_vector_mode_supported_p (mode1)
17299 || aarch64_vector_mode_supported_p (mode2))
17300 return true;
17301
17302 return false;
17303 }
17304
17305 /* Return a new RTX holding the result of moving POINTER forward by
17306 AMOUNT bytes. */
17307
17308 static rtx
17309 aarch64_move_pointer (rtx pointer, poly_int64 amount)
17310 {
17311 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
17312
17313 return adjust_automodify_address (pointer, GET_MODE (pointer),
17314 next, amount);
17315 }
17316
17317 /* Return a new RTX holding the result of moving POINTER forward by the
17318 size of the mode it points to. */
17319
17320 static rtx
17321 aarch64_progress_pointer (rtx pointer)
17322 {
17323 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
17324 }
17325
17326 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
17327 MODE bytes. */
17328
17329 static void
17330 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
17331 machine_mode mode)
17332 {
17333 rtx reg = gen_reg_rtx (mode);
17334
17335 /* "Cast" the pointers to the correct mode. */
17336 *src = adjust_address (*src, mode, 0);
17337 *dst = adjust_address (*dst, mode, 0);
17338 /* Emit the memcpy. */
17339 emit_move_insn (reg, *src);
17340 emit_move_insn (*dst, reg);
17341 /* Move the pointers forward. */
17342 *src = aarch64_progress_pointer (*src);
17343 *dst = aarch64_progress_pointer (*dst);
17344 }
17345
17346 /* Expand movmem, as if from a __builtin_memcpy. Return true if
17347 we succeed, otherwise return false. */
17348
17349 bool
17350 aarch64_expand_movmem (rtx *operands)
17351 {
17352 int n, mode_bits;
17353 rtx dst = operands[0];
17354 rtx src = operands[1];
17355 rtx base;
17356 machine_mode cur_mode = BLKmode, next_mode;
17357 bool speed_p = !optimize_function_for_size_p (cfun);
17358
17359 /* When optimizing for size, give a better estimate of the length of a
17360 memcpy call, but use the default otherwise. Moves larger than 8 bytes
17361 will always require an even number of instructions to do now, and each
17362 operation requires both a load and a store, so divide the max number by 2. */
17363 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
17364
17365 /* We can't do anything smart if the amount to copy is not constant. */
17366 if (!CONST_INT_P (operands[2]))
17367 return false;
17368
17369 n = INTVAL (operands[2]);
17370
17371 /* Try to keep the number of instructions low. For all cases we will do at
17372 most two moves for the residual amount, since we'll always overlap the
17373 remainder. */
17374 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
17375 return false;
17376
17377 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
17378 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
17379
17380 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
17381 src = adjust_automodify_address (src, VOIDmode, base, 0);
17382
17383 /* Convert n to bits to make the rest of the code simpler. */
17384 n = n * BITS_PER_UNIT;
17385
17386 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
17387 larger than TImode, but we should not use them for loads/stores here. */
17388 const int copy_limit = GET_MODE_BITSIZE (TImode);
17389
17390 while (n > 0)
17391 {
17392 /* Find the largest mode in which to do the copy without over-reading
17393 or over-writing. */
17394 opt_scalar_int_mode mode_iter;
17395 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
17396 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
17397 cur_mode = mode_iter.require ();
17398
17399 gcc_assert (cur_mode != BLKmode);
17400
17401 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
17402 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
17403
17404 n -= mode_bits;
17405
17406 /* Do certain trailing copies as overlapping if it's going to be
17407 cheaper, i.e. fewer instructions to do so. For instance, for a 15
17408 byte copy it's more efficient to do two overlapping 8 byte copies than
17409 separate 8 + 4 + 2 + 1 byte copies. */
17410 if (n > 0 && n <= 8 * BITS_PER_UNIT)
17411 {
17412 next_mode = smallest_mode_for_size (n, MODE_INT);
17413 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
17414 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
17415 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
17416 n = n_bits;
17417 }
17418 }
17419
17420 return true;
17421 }
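/* Worked example for a 15 byte copy (n = 120 bits): the first iteration
   selects DImode and copies 8 bytes, leaving 7 bytes; since that residue is
   at most 8 bytes, the pointers are moved back by one byte and a second,
   overlapping DImode copy finishes the job, so only two load/store pairs
   are emitted. */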
17422
17423 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17424 SImode stores. Handle the case when the constant has identical
17425 bottom and top halves. This is beneficial when the two stores can be
17426 merged into an STP and we avoid synthesising potentially expensive
17427 immediates twice. Return true if such a split is possible. */
17428
17429 bool
17430 aarch64_split_dimode_const_store (rtx dst, rtx src)
17431 {
17432 rtx lo = gen_lowpart (SImode, src);
17433 rtx hi = gen_highpart_mode (SImode, DImode, src);
17434
17435 bool size_p = optimize_function_for_size_p (cfun);
17436
17437 if (!rtx_equal_p (lo, hi))
17438 return false;
17439
17440 unsigned int orig_cost
17441 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
17442 unsigned int lo_cost
17443 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
17444
17445 /* We want to transform:
17446 MOV x1, 49370
17447 MOVK x1, 0x140, lsl 16
17448 MOVK x1, 0xc0da, lsl 32
17449 MOVK x1, 0x140, lsl 48
17450 STR x1, [x0]
17451 into:
17452 MOV w1, 49370
17453 MOVK w1, 0x140, lsl 16
17454 STP w1, w1, [x0]
17455 So we want to perform this only when we save two instructions
17456 or more. When optimizing for size, however, accept any code size
17457 savings we can. */
17458 if (size_p && orig_cost <= lo_cost)
17459 return false;
17460
17461 if (!size_p
17462 && (orig_cost <= lo_cost + 1))
17463 return false;
17464
17465 rtx mem_lo = adjust_address (dst, SImode, 0);
17466 if (!aarch64_mem_pair_operand (mem_lo, SImode))
17467 return false;
17468
17469 rtx tmp_reg = gen_reg_rtx (SImode);
17470 aarch64_expand_mov_immediate (tmp_reg, lo);
17471 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17472 /* Don't emit an explicit store pair as this may not always be profitable.
17473 Let the sched-fusion logic decide whether to merge them. */
17474 emit_move_insn (mem_lo, tmp_reg);
17475 emit_move_insn (mem_hi, tmp_reg);
17476
17477 return true;
17478 }
17479
17480 /* Generate RTL for a conditional branch with rtx comparison CODE in
17481 mode CC_MODE. The destination of the unlikely conditional branch
17482 is LABEL_REF. */
17483
17484 void
17485 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
17486 rtx label_ref)
17487 {
17488 rtx x;
17489 x = gen_rtx_fmt_ee (code, VOIDmode,
17490 gen_rtx_REG (cc_mode, CC_REGNUM),
17491 const0_rtx);
17492
17493 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17494 gen_rtx_LABEL_REF (VOIDmode, label_ref),
17495 pc_rtx);
17496 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17497 }
17498
17499 /* Generate DImode scratch registers for 128-bit (TImode) addition.
17500
17501 OP1 represents the TImode destination operand 1
17502 OP2 represents the TImode destination operand 2
17503 LOW_DEST represents the low half (DImode) of TImode operand 0
17504 LOW_IN1 represents the low half (DImode) of TImode operand 1
17505 LOW_IN2 represents the low half (DImode) of TImode operand 2
17506 HIGH_DEST represents the high half (DImode) of TImode operand 0
17507 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17508 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17509
17510 void
17511 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17512 rtx *low_in1, rtx *low_in2,
17513 rtx *high_dest, rtx *high_in1,
17514 rtx *high_in2)
17515 {
17516 *low_dest = gen_reg_rtx (DImode);
17517 *low_in1 = gen_lowpart (DImode, op1);
17518 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17519 subreg_lowpart_offset (DImode, TImode));
17520 *high_dest = gen_reg_rtx (DImode);
17521 *high_in1 = gen_highpart (DImode, op1);
17522 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17523 subreg_highpart_offset (DImode, TImode));
17524 }
17525
17526 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
17527
17528 This function differs from 'aarch64_addti_scratch_regs' in that
17529 OP1 can be an immediate constant (zero). We must call
17530 subreg_highpart_offset with DImode and TImode arguments, otherwise
17531 VOIDmode will be used for the const_int which generates an internal
17532 error from subreg_size_highpart_offset which does not expect a size of zero.
17533
17534 OP1 represents the TImode destination operand 1
17535 OP2 represents the TImode destination operand 2
17536 LOW_DEST represents the low half (DImode) of TImode operand 0
17537 LOW_IN1 represents the low half (DImode) of TImode operand 1
17538 LOW_IN2 represents the low half (DImode) of TImode operand 2
17539 HIGH_DEST represents the high half (DImode) of TImode operand 0
17540 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17541 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17542
17543
17544 void
17545 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17546 rtx *low_in1, rtx *low_in2,
17547 rtx *high_dest, rtx *high_in1,
17548 rtx *high_in2)
17549 {
17550 *low_dest = gen_reg_rtx (DImode);
17551 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
17552 subreg_lowpart_offset (DImode, TImode));
17553
17554 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17555 subreg_lowpart_offset (DImode, TImode));
17556 *high_dest = gen_reg_rtx (DImode);
17557
17558 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
17559 subreg_highpart_offset (DImode, TImode));
17560 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17561 subreg_highpart_offset (DImode, TImode));
17562 }
17563
17564 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
17565
17566 OP0 represents the TImode destination operand 0
17567 LOW_DEST represents the low half (DImode) of TImode operand 0
17568 LOW_IN1 represents the low half (DImode) of TImode operand 1
17569 LOW_IN2 represents the low half (DImode) of TImode operand 2
17570 HIGH_DEST represents the high half (DImode) of TImode operand 0
17571 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17572 HIGH_IN2 represents the high half (DImode) of TImode operand 2
17573 UNSIGNED_P is true if the operation is being performed on unsigned
17574 values. */
17575 void
17576 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17577 rtx low_in2, rtx high_dest, rtx high_in1,
17578 rtx high_in2, bool unsigned_p)
17579 {
17580 if (low_in2 == const0_rtx)
17581 {
17582 low_dest = low_in1;
17583 high_in2 = force_reg (DImode, high_in2);
17584 if (unsigned_p)
17585 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17586 else
17587 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17588 }
17589 else
17590 {
17591 if (CONST_INT_P (low_in2))
17592 {
17593 high_in2 = force_reg (DImode, high_in2);
17594 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17595 GEN_INT (-INTVAL (low_in2))));
17596 }
17597 else
17598 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17599
17600 if (unsigned_p)
17601 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17602 else
17603 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17604 }
17605
17606 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17607 emit_move_insn (gen_highpart (DImode, op0), high_dest);
17608
17609 }
17610
17611 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
17612
17613 static unsigned HOST_WIDE_INT
17614 aarch64_asan_shadow_offset (void)
17615 {
17616 if (TARGET_ILP32)
17617 return (HOST_WIDE_INT_1 << 29);
17618 else
17619 return (HOST_WIDE_INT_1 << 36);
17620 }
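/* With the usual ASan mapping of shadow = (address >> 3) + offset, this
   places the LP64 shadow region at (addr >> 3) + (1 << 36), and the ILP32
   one at (addr >> 3) + (1 << 29). The mapping described here is the generic
   ASan scheme rather than anything specific to this hook. */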
17621
17622 static rtx
17623 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17624 int code, tree treeop0, tree treeop1)
17625 {
17626 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17627 rtx op0, op1;
17628 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17629 insn_code icode;
17630 struct expand_operand ops[4];
17631
17632 start_sequence ();
17633 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17634
17635 op_mode = GET_MODE (op0);
17636 if (op_mode == VOIDmode)
17637 op_mode = GET_MODE (op1);
17638
17639 switch (op_mode)
17640 {
17641 case E_QImode:
17642 case E_HImode:
17643 case E_SImode:
17644 cmp_mode = SImode;
17645 icode = CODE_FOR_cmpsi;
17646 break;
17647
17648 case E_DImode:
17649 cmp_mode = DImode;
17650 icode = CODE_FOR_cmpdi;
17651 break;
17652
17653 case E_SFmode:
17654 cmp_mode = SFmode;
17655 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17656 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17657 break;
17658
17659 case E_DFmode:
17660 cmp_mode = DFmode;
17661 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17662 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17663 break;
17664
17665 default:
17666 end_sequence ();
17667 return NULL_RTX;
17668 }
17669
17670 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17671 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17672 if (!op0 || !op1)
17673 {
17674 end_sequence ();
17675 return NULL_RTX;
17676 }
17677 *prep_seq = get_insns ();
17678 end_sequence ();
17679
17680 create_fixed_operand (&ops[0], op0);
17681 create_fixed_operand (&ops[1], op1);
17682
17683 start_sequence ();
17684 if (!maybe_expand_insn (icode, 2, ops))
17685 {
17686 end_sequence ();
17687 return NULL_RTX;
17688 }
17689 *gen_seq = get_insns ();
17690 end_sequence ();
17691
17692 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17693 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17694 }
17695
17696 static rtx
17697 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17698 int cmp_code, tree treeop0, tree treeop1, int bit_code)
17699 {
17700 rtx op0, op1, target;
17701 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17702 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17703 insn_code icode;
17704 struct expand_operand ops[6];
17705 int aarch64_cond;
17706
17707 push_to_sequence (*prep_seq);
17708 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17709
17710 op_mode = GET_MODE (op0);
17711 if (op_mode == VOIDmode)
17712 op_mode = GET_MODE (op1);
17713
17714 switch (op_mode)
17715 {
17716 case E_QImode:
17717 case E_HImode:
17718 case E_SImode:
17719 cmp_mode = SImode;
17720 icode = CODE_FOR_ccmpsi;
17721 break;
17722
17723 case E_DImode:
17724 cmp_mode = DImode;
17725 icode = CODE_FOR_ccmpdi;
17726 break;
17727
17728 case E_SFmode:
17729 cmp_mode = SFmode;
17730 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17731 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17732 break;
17733
17734 case E_DFmode:
17735 cmp_mode = DFmode;
17736 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17737 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17738 break;
17739
17740 default:
17741 end_sequence ();
17742 return NULL_RTX;
17743 }
17744
17745 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17746 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17747 if (!op0 || !op1)
17748 {
17749 end_sequence ();
17750 return NULL_RTX;
17751 }
17752 *prep_seq = get_insns ();
17753 end_sequence ();
17754
17755 target = gen_rtx_REG (cc_mode, CC_REGNUM);
17756 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17757
17758 if (bit_code != AND)
17759 {
17760 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17761 GET_MODE (XEXP (prev, 0))),
17762 VOIDmode, XEXP (prev, 0), const0_rtx);
17763 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17764 }
17765
17766 create_fixed_operand (&ops[0], XEXP (prev, 0));
17767 create_fixed_operand (&ops[1], target);
17768 create_fixed_operand (&ops[2], op0);
17769 create_fixed_operand (&ops[3], op1);
17770 create_fixed_operand (&ops[4], prev);
17771 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17772
17773 push_to_sequence (*gen_seq);
17774 if (!maybe_expand_insn (icode, 6, ops))
17775 {
17776 end_sequence ();
17777 return NULL_RTX;
17778 }
17779
17780 *gen_seq = get_insns ();
17781 end_sequence ();
17782
17783 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17784 }
17785
17786 #undef TARGET_GEN_CCMP_FIRST
17787 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17788
17789 #undef TARGET_GEN_CCMP_NEXT
17790 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
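/* As an illustrative example (hypothetical source, not from this file), the
   two hooks above let a condition such as "a == 0 && b > 5" be expanded into
   a compare followed by a conditional compare instead of two branches:

     cmp   w0, #0           // flags for a == 0
     ccmp  w1, #5, #4, eq   // if EQ, compare b with 5; otherwise NZCV := 0b0100
     b.gt  .Ltaken          // taken only when a == 0 and b > 5

   The #4 immediate sets the Z flag when the first comparison fails, which
   makes the final GT test false.  */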
17791
17792 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
17793 instruction fusion of some sort. */
17794
17795 static bool
17796 aarch64_macro_fusion_p (void)
17797 {
17798 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17799 }
17800
17801
17802 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17803 should be kept together during scheduling. */
17804
17805 static bool
17806 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17807 {
17808 rtx set_dest;
17809 rtx prev_set = single_set (prev);
17810 rtx curr_set = single_set (curr);
17811 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
17812 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17813
17814 if (!aarch64_macro_fusion_p ())
17815 return false;
17816
17817 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17818 {
17819 /* We are trying to match:
17820 prev (mov) == (set (reg r0) (const_int imm16))
17821 curr (movk) == (set (zero_extract (reg r0)
17822 (const_int 16)
17823 (const_int 16))
17824 (const_int imm16_1)) */
17825
17826 set_dest = SET_DEST (curr_set);
17827
17828 if (GET_CODE (set_dest) == ZERO_EXTRACT
17829 && CONST_INT_P (SET_SRC (curr_set))
17830 && CONST_INT_P (SET_SRC (prev_set))
17831 && CONST_INT_P (XEXP (set_dest, 2))
17832 && INTVAL (XEXP (set_dest, 2)) == 16
17833 && REG_P (XEXP (set_dest, 0))
17834 && REG_P (SET_DEST (prev_set))
17835 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17836 {
17837 return true;
17838 }
17839 }
17840
17841 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17842 {
17843
17844 /* We're trying to match:
17845 prev (adrp) == (set (reg r1)
17846 (high (symbol_ref ("SYM"))))
17847 curr (add) == (set (reg r0)
17848 (lo_sum (reg r1)
17849 (symbol_ref ("SYM"))))
17850 Note that r0 need not necessarily be the same as r1, especially
17851 during pre-regalloc scheduling. */
17852
17853 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17854 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17855 {
17856 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17857 && REG_P (XEXP (SET_SRC (curr_set), 0))
17858 && REGNO (XEXP (SET_SRC (curr_set), 0))
17859 == REGNO (SET_DEST (prev_set))
17860 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17861 XEXP (SET_SRC (curr_set), 1)))
17862 return true;
17863 }
17864 }
17865
17866 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17867 {
17868
17869 /* We're trying to match:
17870 prev (movk) == (set (zero_extract (reg r0)
17871 (const_int 16)
17872 (const_int 32))
17873 (const_int imm16_1))
17874 curr (movk) == (set (zero_extract (reg r0)
17875 (const_int 16)
17876 (const_int 48))
17877 (const_int imm16_2)) */
17878
17879 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17880 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17881 && REG_P (XEXP (SET_DEST (prev_set), 0))
17882 && REG_P (XEXP (SET_DEST (curr_set), 0))
17883 && REGNO (XEXP (SET_DEST (prev_set), 0))
17884 == REGNO (XEXP (SET_DEST (curr_set), 0))
17885 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17886 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17887 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17888 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17889 && CONST_INT_P (SET_SRC (prev_set))
17890 && CONST_INT_P (SET_SRC (curr_set)))
17891 return true;
17892
17893 }
17894 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17895 {
17896 /* We're trying to match:
17897 prev (adrp) == (set (reg r0)
17898 (high (symbol_ref ("SYM"))))
17899 curr (ldr) == (set (reg r1)
17900 (mem (lo_sum (reg r0)
17901 (symbol_ref ("SYM")))))
17902 or
17903 curr (ldr) == (set (reg r1)
17904 (zero_extend (mem
17905 (lo_sum (reg r0)
17906 (symbol_ref ("SYM")))))) */
17907 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17908 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17909 {
17910 rtx curr_src = SET_SRC (curr_set);
17911
17912 if (GET_CODE (curr_src) == ZERO_EXTEND)
17913 curr_src = XEXP (curr_src, 0);
17914
17915 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17916 && REG_P (XEXP (XEXP (curr_src, 0), 0))
17917 && REGNO (XEXP (XEXP (curr_src, 0), 0))
17918 == REGNO (SET_DEST (prev_set))
17919 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
17920 XEXP (SET_SRC (prev_set), 0)))
17921 return true;
17922 }
17923 }
17924
17925 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
17926 && aarch_crypto_can_dual_issue (prev, curr))
17927 return true;
17928
17929 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
17930 && any_condjump_p (curr))
17931 {
17932 unsigned int condreg1, condreg2;
17933 rtx cc_reg_1;
17934 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
17935 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
17936
17937 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
17938 && prev
17939 && modified_in_p (cc_reg_1, prev))
17940 {
17941 enum attr_type prev_type = get_attr_type (prev);
17942
17946 17943 /* FIXME: this misses some cases that ThunderX considers simple
17947 17944 arithmetic instructions. Simple shifts are also missed here. */
17945 if (prev_type == TYPE_ALUS_SREG
17946 || prev_type == TYPE_ALUS_IMM
17947 || prev_type == TYPE_LOGICS_REG
17948 || prev_type == TYPE_LOGICS_IMM)
17949 return true;
17950 }
17951 }
17952
17953 if (prev_set
17954 && curr_set
17955 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
17956 && any_condjump_p (curr))
17957 {
17958 /* We're trying to match:
17959 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17960 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
17961 (const_int 0))
17962 (label_ref ("SYM"))
17963 (pc)) */
17964 if (SET_DEST (curr_set) == (pc_rtx)
17965 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
17966 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
17967 && REG_P (SET_DEST (prev_set))
17968 && REGNO (SET_DEST (prev_set))
17969 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
17970 {
17971 /* Fuse ALU operations followed by conditional branch instruction. */
17972 switch (get_attr_type (prev))
17973 {
17974 case TYPE_ALU_IMM:
17975 case TYPE_ALU_SREG:
17976 case TYPE_ADC_REG:
17977 case TYPE_ADC_IMM:
17978 case TYPE_ADCS_REG:
17979 case TYPE_ADCS_IMM:
17980 case TYPE_LOGIC_REG:
17981 case TYPE_LOGIC_IMM:
17982 case TYPE_CSEL:
17983 case TYPE_ADR:
17984 case TYPE_MOV_IMM:
17985 case TYPE_SHIFT_REG:
17986 case TYPE_SHIFT_IMM:
17987 case TYPE_BFM:
17988 case TYPE_RBIT:
17989 case TYPE_REV:
17990 case TYPE_EXTEND:
17991 return true;
17992
17993 default:;
17994 }
17995 }
17996 }
17997
17998 return false;
17999 }
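/* As a purely illustrative summary (register names, immediates and symbols
   are made up), the pairs accepted above correspond to sequences such as:

     mov   x0, #0x1234        +  movk  x0, #0x5678, lsl #16    (MOV_MOVK)
     adrp  x1, sym            +  add   x0, x1, :lo12:sym       (ADRP_ADD)
     adrp  x0, sym            +  ldr   x1, [x0, :lo12:sym]     (ADRP_LDR)
     aese  v0.16b, v1.16b     +  aesmc v0.16b, v0.16b          (AES_AESMC)
     cmp   w0, #0             +  b.ne  .L1                     (CMP_BRANCH)
     add   w0, w1, #1         +  cbz   w0, .L1                 (ALU_BRANCH)  */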
18000
18001 /* Return true iff the instruction fusion described by OP is enabled. */
18002
18003 bool
18004 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18005 {
18006 return (aarch64_tune_params.fusible_ops & op) != 0;
18007 }
18008
18012 18009 /* If MEM is in the form of [base+offset], extract the two parts of the
18013 18010 address, store them in BASE and OFFSET, and return true; otherwise
18014 18011 return false after clearing BASE and OFFSET. */
18012
18013 bool
18014 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18015 {
18016 rtx addr;
18017
18018 gcc_assert (MEM_P (mem));
18019
18020 addr = XEXP (mem, 0);
18021
18022 if (REG_P (addr))
18023 {
18024 *base = addr;
18025 *offset = const0_rtx;
18026 return true;
18027 }
18028
18029 if (GET_CODE (addr) == PLUS
18030 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18031 {
18032 *base = XEXP (addr, 0);
18033 *offset = XEXP (addr, 1);
18034 return true;
18035 }
18036
18037 *base = NULL_RTX;
18038 *offset = NULL_RTX;
18039
18040 return false;
18041 }
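/* For example, a MEM whose address is (plus (reg x1) (const_int 16)) yields
   BASE == (reg x1) and OFFSET == (const_int 16), while a MEM whose address
   is just (reg x1) yields OFFSET == (const_int 0).  Any other address form
   (pre/post increment, register index, ...) returns false.  */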
18042
18043 /* Types for scheduling fusion. */
18044 enum sched_fusion_type
18045 {
18046 SCHED_FUSION_NONE = 0,
18047 SCHED_FUSION_LD_SIGN_EXTEND,
18048 SCHED_FUSION_LD_ZERO_EXTEND,
18049 SCHED_FUSION_LD,
18050 SCHED_FUSION_ST,
18051 SCHED_FUSION_NUM
18052 };
18053
18057 18054 /* If INSN is a load or store with an address in the form of [base+offset],
18058 18055 extract the two parts into BASE and OFFSET. Return the scheduling
18059 18056 fusion type of INSN. */
18057
18058 static enum sched_fusion_type
18059 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18060 {
18061 rtx x, dest, src;
18062 enum sched_fusion_type fusion = SCHED_FUSION_LD;
18063
18064 gcc_assert (INSN_P (insn));
18065 x = PATTERN (insn);
18066 if (GET_CODE (x) != SET)
18067 return SCHED_FUSION_NONE;
18068
18069 src = SET_SRC (x);
18070 dest = SET_DEST (x);
18071
18072 machine_mode dest_mode = GET_MODE (dest);
18073
18074 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18075 return SCHED_FUSION_NONE;
18076
18077 if (GET_CODE (src) == SIGN_EXTEND)
18078 {
18079 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18080 src = XEXP (src, 0);
18081 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18082 return SCHED_FUSION_NONE;
18083 }
18084 else if (GET_CODE (src) == ZERO_EXTEND)
18085 {
18086 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18087 src = XEXP (src, 0);
18088 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18089 return SCHED_FUSION_NONE;
18090 }
18091
18092 if (GET_CODE (src) == MEM && REG_P (dest))
18093 extract_base_offset_in_addr (src, base, offset);
18094 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18095 {
18096 fusion = SCHED_FUSION_ST;
18097 extract_base_offset_in_addr (dest, base, offset);
18098 }
18099 else
18100 return SCHED_FUSION_NONE;
18101
18102 if (*base == NULL_RTX || *offset == NULL_RTX)
18103 fusion = SCHED_FUSION_NONE;
18104
18105 return fusion;
18106 }
18107
18108 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18109
18113 18110 Currently we only support fusing ldr and str instructions, so FUSION_PRI
18114 18111 and PRI are only calculated for these instructions. For other instructions,
18115 18112 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18116 18113 types of instruction fusion can be added by returning different priorities.
18114
18115 It's important that irrelevant instructions get the largest FUSION_PRI. */
18116
18117 static void
18118 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18119 int *fusion_pri, int *pri)
18120 {
18121 int tmp, off_val;
18122 rtx base, offset;
18123 enum sched_fusion_type fusion;
18124
18125 gcc_assert (INSN_P (insn));
18126
18127 tmp = max_pri - 1;
18128 fusion = fusion_load_store (insn, &base, &offset);
18129 if (fusion == SCHED_FUSION_NONE)
18130 {
18131 *pri = tmp;
18132 *fusion_pri = tmp;
18133 return;
18134 }
18135
18136 /* Set FUSION_PRI according to fusion type and base register. */
18137 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
18138
18139 /* Calculate PRI. */
18140 tmp /= 2;
18141
18142 /* INSN with smaller offset goes first. */
18143 off_val = (int)(INTVAL (offset));
18144 if (off_val >= 0)
18145 tmp -= (off_val & 0xfffff);
18146 else
18147 tmp += ((- off_val) & 0xfffff);
18148
18149 *pri = tmp;
18150 return;
18151 }
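/* As a worked example (max_pri chosen arbitrarily as 1000), two SImode loads
   from [x1, #4] and [x1, #8] both get the same FUSION_PRI,
   999 - SCHED_FUSION_LD * FIRST_PSEUDO_REGISTER - REGNO (x1), while their
   PRI values are 499 - 4 and 499 - 8 respectively, so the access with the
   smaller offset is preferred first.  */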
18152
18153 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18154 Adjust priority of sha1h instructions so they are scheduled before
18155 other SHA1 instructions. */
18156
18157 static int
18158 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
18159 {
18160 rtx x = PATTERN (insn);
18161
18162 if (GET_CODE (x) == SET)
18163 {
18164 x = SET_SRC (x);
18165
18166 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
18167 return priority + 10;
18168 }
18169
18170 return priority;
18171 }
18172
18173 /* Given OPERANDS of consecutive load/store, check if we can merge
18174 them into ldp/stp. LOAD is true if they are load instructions.
18175 MODE is the mode of memory operands. */
18176
18177 bool
18178 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
18179 machine_mode mode)
18180 {
18181 HOST_WIDE_INT offval_1, offval_2, msize;
18182 enum reg_class rclass_1, rclass_2;
18183 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
18184
18185 if (load)
18186 {
18187 mem_1 = operands[1];
18188 mem_2 = operands[3];
18189 reg_1 = operands[0];
18190 reg_2 = operands[2];
18191 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
18192 if (REGNO (reg_1) == REGNO (reg_2))
18193 return false;
18194 }
18195 else
18196 {
18197 mem_1 = operands[0];
18198 mem_2 = operands[2];
18199 reg_1 = operands[1];
18200 reg_2 = operands[3];
18201 }
18202
18203 /* The mems cannot be volatile. */
18204 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
18205 return false;
18206
18210 18207 /* If we have SImode and slow unaligned ldp,
18211 18208 check that the alignment is at least 8 bytes. */
18209 if (mode == SImode
18210 && (aarch64_tune_params.extra_tuning_flags
18211 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18212 && !optimize_size
18213 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
18214 return false;
18215
18216 /* Check if the addresses are in the form of [base+offset]. */
18217 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18218 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
18219 return false;
18220 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18221 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
18222 return false;
18223
18227 18224 /* Check if the bases are the same. */
18225 if (!rtx_equal_p (base_1, base_2))
18226 return false;
18227
18228 /* The operands must be of the same size. */
18229 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
18230 GET_MODE_SIZE (GET_MODE (mem_2))));
18231
18232 offval_1 = INTVAL (offset_1);
18233 offval_2 = INTVAL (offset_2);
18234 /* We should only be trying this for fixed-sized modes. There is no
18235 SVE LDP/STP instruction. */
18236 msize = GET_MODE_SIZE (mode).to_constant ();
18237 /* Check if the offsets are consecutive. */
18238 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
18239 return false;
18240
18241 /* Check if the addresses are clobbered by load. */
18242 if (load)
18243 {
18244 if (reg_mentioned_p (reg_1, mem_1))
18245 return false;
18246
18247 /* In increasing order, the last load can clobber the address. */
18248 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
18249 return false;
18250 }
18251
18252 /* One of the memory accesses must be a mempair operand.
18253 If it is not the first one, they need to be swapped by the
18254 peephole. */
18255 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
18256 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
18257 return false;
18258
18259 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
18260 rclass_1 = FP_REGS;
18261 else
18262 rclass_1 = GENERAL_REGS;
18263
18264 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
18265 rclass_2 = FP_REGS;
18266 else
18267 rclass_2 = GENERAL_REGS;
18268
18272 18269 /* Check if the registers are of the same class. */
18270 if (rclass_1 != rclass_2)
18271 return false;
18272
18273 return true;
18274 }
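/* For instance, a pair of DImode loads such as

     ldr  x0, [x3, #8]
     ldr  x1, [x3, #16]

   passes the checks above (same base, consecutive offsets of msize == 8,
   distinct destination registers of the same class) and is therefore a
   candidate for

     ldp  x0, x1, [x3, #8]

   (registers and offsets purely illustrative).  */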
18275
18276 /* Given OPERANDS of consecutive load/store that can be merged,
18277 swap them if they are not in ascending order. */
18278 void
18279 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
18280 {
18281 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
18282 HOST_WIDE_INT offval_1, offval_2;
18283
18284 if (load)
18285 {
18286 mem_1 = operands[1];
18287 mem_2 = operands[3];
18288 }
18289 else
18290 {
18291 mem_1 = operands[0];
18292 mem_2 = operands[2];
18293 }
18294
18295 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18296 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18297
18298 offval_1 = INTVAL (offset_1);
18299 offval_2 = INTVAL (offset_2);
18300
18301 if (offval_1 > offval_2)
18302 {
18303 /* Irrespective of whether this is a load or a store,
18304 we do the same swap. */
18305 std::swap (operands[0], operands[2]);
18306 std::swap (operands[1], operands[3]);
18307 }
18308 }
18309
18310 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
18311 comparison between the two. */
18312 int
18313 aarch64_host_wide_int_compare (const void *x, const void *y)
18314 {
18315 return wi::cmps (* ((const HOST_WIDE_INT *) x),
18316 * ((const HOST_WIDE_INT *) y));
18317 }
18318
18319 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
18320 other pointing to a REG rtx containing an offset, compare the offsets
18321 of the two pairs.
18322
18323 Return:
18324
18325 1 iff offset (X) > offset (Y)
18326 0 iff offset (X) == offset (Y)
18327 -1 iff offset (X) < offset (Y) */
18328 int
18329 aarch64_ldrstr_offset_compare (const void *x, const void *y)
18330 {
18331 const rtx * operands_1 = (const rtx *) x;
18332 const rtx * operands_2 = (const rtx *) y;
18333 rtx mem_1, mem_2, base, offset_1, offset_2;
18334
18335 if (MEM_P (operands_1[0]))
18336 mem_1 = operands_1[0];
18337 else
18338 mem_1 = operands_1[1];
18339
18340 if (MEM_P (operands_2[0]))
18341 mem_2 = operands_2[0];
18342 else
18343 mem_2 = operands_2[1];
18344
18345 /* Extract the offsets. */
18346 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18347 extract_base_offset_in_addr (mem_2, &base, &offset_2);
18348
18349 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
18350
18351 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
18352 }
18353
18354 /* Given OPERANDS of consecutive load/store, check if we can merge
18355 them into ldp/stp by adjusting the offset. LOAD is true if they
18356 are load instructions. MODE is the mode of memory operands.
18357
18361 18358 Given the following consecutive stores:
18359
18360 str w1, [xb, 0x100]
18361 str w1, [xb, 0x104]
18362 str w1, [xb, 0x108]
18363 str w1, [xb, 0x10c]
18364
18365 Though the offsets are out of the range supported by stp, we can
18366 still pair them after adjusting the offset, like:
18367
18368 add scratch, xb, 0x100
18369 stp w1, w1, [scratch]
18370 stp w1, w1, [scratch, 0x8]
18371
18375 18372 The peephole patterns detecting this opportunity should guarantee
18376 18373 the scratch register is available. */
18374
18375 bool
18376 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
18377 scalar_mode mode)
18378 {
18379 const int num_insns = 4;
18380 enum reg_class rclass;
18381 HOST_WIDE_INT offvals[num_insns], msize;
18382 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
18383
18384 if (load)
18385 {
18386 for (int i = 0; i < num_insns; i++)
18387 {
18388 reg[i] = operands[2 * i];
18389 mem[i] = operands[2 * i + 1];
18390
18391 gcc_assert (REG_P (reg[i]));
18392 }
18393
18394 /* Do not attempt to merge the loads if the loads clobber each other. */
18395 for (int i = 0; i < 8; i += 2)
18396 for (int j = i + 2; j < 8; j += 2)
18397 if (reg_overlap_mentioned_p (operands[i], operands[j]))
18398 return false;
18399 }
18400 else
18401 for (int i = 0; i < num_insns; i++)
18402 {
18403 mem[i] = operands[2 * i];
18404 reg[i] = operands[2 * i + 1];
18405 }
18406
18407 /* Skip if memory operand is by itself valid for ldp/stp. */
18408 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
18409 return false;
18410
18411 for (int i = 0; i < num_insns; i++)
18412 {
18413 /* The mems cannot be volatile. */
18414 if (MEM_VOLATILE_P (mem[i]))
18415 return false;
18416
18417 /* Check if the addresses are in the form of [base+offset]. */
18418 extract_base_offset_in_addr (mem[i], base + i, offset + i);
18419 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
18420 return false;
18421 }
18422
18426 18423 /* Check if the registers are of the same class. */
18424 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
18425 ? FP_REGS : GENERAL_REGS;
18426
18427 for (int i = 1; i < num_insns; i++)
18428 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
18429 {
18430 if (rclass != FP_REGS)
18431 return false;
18432 }
18433 else
18434 {
18435 if (rclass != GENERAL_REGS)
18436 return false;
18437 }
18438
18439 /* Only the last register in the order in which they occur
18440 may be clobbered by the load. */
18441 if (rclass == GENERAL_REGS && load)
18442 for (int i = 0; i < num_insns - 1; i++)
18443 if (reg_mentioned_p (reg[i], mem[i]))
18444 return false;
18445
18449 18446 /* Check if the bases are the same. */
18447 for (int i = 0; i < num_insns - 1; i++)
18448 if (!rtx_equal_p (base[i], base[i + 1]))
18449 return false;
18450
18451 for (int i = 0; i < num_insns; i++)
18452 offvals[i] = INTVAL (offset[i]);
18453
18454 msize = GET_MODE_SIZE (mode);
18455
18456 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18457 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18458 aarch64_host_wide_int_compare);
18459
18460 if (!(offvals[1] == offvals[0] + msize
18461 && offvals[3] == offvals[2] + msize))
18462 return false;
18463
18464 /* Check that offsets are within range of each other. The ldp/stp
18465 instructions have 7 bit immediate offsets, so use 0x80. */
18466 if (offvals[2] - offvals[0] >= msize * 0x80)
18467 return false;
18468
18469 /* The offsets must be aligned with respect to each other. */
18470 if (offvals[0] % msize != offvals[2] % msize)
18471 return false;
18472
18476 18473 /* If we have SImode and slow unaligned ldp,
18477 18474 check that the alignment is at least 8 bytes. */
18475 if (mode == SImode
18476 && (aarch64_tune_params.extra_tuning_flags
18477 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18478 && !optimize_size
18479 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18480 return false;
18481
18482 return true;
18483 }
18484
18485 /* Given OPERANDS of consecutive load/store, this function pairs them
18486 into LDP/STP after adjusting the offset. It depends on the fact
18487 that the operands can be sorted so the offsets are correct for STP.
18491 18488 MODE is the mode of the memory operands. CODE is the rtl operator
18492 18489 that should be applied to all memory operands; it is SIGN_EXTEND,
18493 18490 ZERO_EXTEND or UNKNOWN. */
18491
18492 bool
18493 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
18494 scalar_mode mode, RTX_CODE code)
18495 {
18496 rtx base, offset_1, offset_3, t1, t2;
18497 rtx mem_1, mem_2, mem_3, mem_4;
18498 rtx temp_operands[8];
18499 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
18500 stp_off_upper_limit, stp_off_lower_limit, msize;
18501
18502 /* We make changes on a copy as we may still bail out. */
18503 for (int i = 0; i < 8; i ++)
18504 temp_operands[i] = operands[i];
18505
18506 /* Sort the operands. */
18507 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
18508
18509 if (load)
18510 {
18511 mem_1 = temp_operands[1];
18512 mem_2 = temp_operands[3];
18513 mem_3 = temp_operands[5];
18514 mem_4 = temp_operands[7];
18515 }
18516 else
18517 {
18518 mem_1 = temp_operands[0];
18519 mem_2 = temp_operands[2];
18520 mem_3 = temp_operands[4];
18521 mem_4 = temp_operands[6];
18522 gcc_assert (code == UNKNOWN);
18523 }
18524
18525 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18526 extract_base_offset_in_addr (mem_3, &base, &offset_3);
18527 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
18528 && offset_3 != NULL_RTX);
18529
18530 /* Adjust offset so it can fit in LDP/STP instruction. */
18531 msize = GET_MODE_SIZE (mode);
18532 stp_off_upper_limit = msize * (0x40 - 1);
18533 stp_off_lower_limit = - msize * 0x40;
18534
18535 off_val_1 = INTVAL (offset_1);
18536 off_val_3 = INTVAL (offset_3);
18537
18538 /* The base offset is optimally half way between the two STP/LDP offsets. */
18539 if (msize <= 4)
18540 base_off = (off_val_1 + off_val_3) / 2;
18541 else
18545 18542 /* However, due to issues with negative LDP/STP offset generation for
18546 18543 larger modes (DF, DI and vector modes), we must not use negative
18547 18544 addresses smaller than 9 signed unadjusted bits can store. This
18548 18545 provides the most range in this case. */
18546 base_off = off_val_1;
18547
18548 /* Adjust the base so that it is aligned with the addresses but still
18549 optimal. */
18550 if (base_off % msize != off_val_1 % msize)
18551 /* Fix the offset, bearing in mind we want to make it bigger not
18552 smaller. */
18553 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18554 else if (msize <= 4)
18555 /* The negative range of LDP/STP is one larger than the positive range. */
18556 base_off += msize;
18557
18558 /* Check if base offset is too big or too small. We can attempt to resolve
18559 this issue by setting it to the maximum value and seeing if the offsets
18560 still fit. */
18561 if (base_off >= 0x1000)
18562 {
18563 base_off = 0x1000 - 1;
18567 18564 /* We must still make sure that the base offset is aligned with respect
18568 18565 to the address, but it may not be made any bigger. */
18566 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18567 }
18568
18569 /* Likewise for the case where the base is too small. */
18570 if (base_off <= -0x1000)
18571 {
18572 base_off = -0x1000 + 1;
18573 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18574 }
18575
18576 /* Offset of the first STP/LDP. */
18577 new_off_1 = off_val_1 - base_off;
18578
18579 /* Offset of the second STP/LDP. */
18580 new_off_3 = off_val_3 - base_off;
18581
18582 /* The offsets must be within the range of the LDP/STP instructions. */
18583 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18584 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18585 return false;
18586
18587 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18588 new_off_1), true);
18589 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18590 new_off_1 + msize), true);
18591 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18592 new_off_3), true);
18593 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18594 new_off_3 + msize), true);
18595
18596 if (!aarch64_mem_pair_operand (mem_1, mode)
18597 || !aarch64_mem_pair_operand (mem_3, mode))
18598 return false;
18599
18600 if (code == ZERO_EXTEND)
18601 {
18602 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18603 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18604 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18605 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18606 }
18607 else if (code == SIGN_EXTEND)
18608 {
18609 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18610 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18611 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18612 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18613 }
18614
18615 if (load)
18616 {
18617 operands[0] = temp_operands[0];
18618 operands[1] = mem_1;
18619 operands[2] = temp_operands[2];
18620 operands[3] = mem_2;
18621 operands[4] = temp_operands[4];
18622 operands[5] = mem_3;
18623 operands[6] = temp_operands[6];
18624 operands[7] = mem_4;
18625 }
18626 else
18627 {
18628 operands[0] = mem_1;
18629 operands[1] = temp_operands[1];
18630 operands[2] = mem_2;
18631 operands[3] = temp_operands[3];
18632 operands[4] = mem_3;
18633 operands[5] = temp_operands[5];
18634 operands[6] = mem_4;
18635 operands[7] = temp_operands[7];
18636 }
18637
18638 /* Emit adjusting instruction. */
18639 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18640 /* Emit ldp/stp instructions. */
18641 t1 = gen_rtx_SET (operands[0], operands[1]);
18642 t2 = gen_rtx_SET (operands[2], operands[3]);
18643 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18644 t1 = gen_rtx_SET (operands[4], operands[5]);
18645 t2 = gen_rtx_SET (operands[6], operands[7]);
18646 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18647 return true;
18648 }
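/* Tracing the arithmetic above for the earlier w1/xb example (SImode, so
   msize == 4, offsets 0x100 .. 0x10c): base_off starts at
   (0x100 + 0x108) / 2 == 0x104, is already aligned with off_val_1, and is
   therefore bumped by msize to 0x108; the two pair offsets then become
   -8 and 0, both comfortably within the SImode LDP/STP range of
   [-256, 252].  */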
18649
18650 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
18651 it isn't worth branching around empty masked ops (including masked
18652 stores). */
18653
18654 static bool
18655 aarch64_empty_mask_is_expensive (unsigned)
18656 {
18657 return false;
18658 }
18659
18663 18660 /* Return true if a pseudo register should be created and used to hold
18664 18661 the GOT address for PIC code. */
18662
18663 bool
18664 aarch64_use_pseudo_pic_reg (void)
18665 {
18666 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18667 }
18668
18669 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18670
18671 static int
18672 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18673 {
18674 switch (XINT (x, 1))
18675 {
18676 case UNSPEC_GOTSMALLPIC:
18677 case UNSPEC_GOTSMALLPIC28K:
18678 case UNSPEC_GOTTINYPIC:
18679 return 0;
18680 default:
18681 break;
18682 }
18683
18684 return default_unspec_may_trap_p (x, flags);
18685 }
18686
18687
18688 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18689 return the log2 of that value. Otherwise return -1. */
18690
18691 int
18692 aarch64_fpconst_pow_of_2 (rtx x)
18693 {
18694 const REAL_VALUE_TYPE *r;
18695
18696 if (!CONST_DOUBLE_P (x))
18697 return -1;
18698
18699 r = CONST_DOUBLE_REAL_VALUE (x);
18700
18701 if (REAL_VALUE_NEGATIVE (*r)
18702 || REAL_VALUE_ISNAN (*r)
18703 || REAL_VALUE_ISINF (*r)
18704 || !real_isinteger (r, DFmode))
18705 return -1;
18706
18707 return exact_log2 (real_to_integer (r));
18708 }
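/* For example, a CONST_DOUBLE of 4.0 gives 2 and 8.0 gives 3, while 6.0
   (not a power of two), 0.75 (not an integer) and -8.0 (negative) all
   give -1.  */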
18709
18710 /* If X is a vector of equal CONST_DOUBLE values and that value is
18711 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18712
18713 int
18714 aarch64_vec_fpconst_pow_of_2 (rtx x)
18715 {
18716 int nelts;
18717 if (GET_CODE (x) != CONST_VECTOR
18718 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18719 return -1;
18720
18721 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18722 return -1;
18723
18724 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18725 if (firstval <= 0)
18726 return -1;
18727
18728 for (int i = 1; i < nelts; i++)
18729 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18730 return -1;
18731
18732 return firstval;
18733 }
18734
18735 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18736 to float.
18737
18738 __fp16 always promotes through this hook.
18739 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18740 through the generic excess precision logic rather than here. */
18741
18742 static tree
18743 aarch64_promoted_type (const_tree t)
18744 {
18745 if (SCALAR_FLOAT_TYPE_P (t)
18746 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18747 return float_type_node;
18748
18749 return NULL_TREE;
18750 }
18751
18752 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18753
18754 static bool
18755 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18756 optimization_type opt_type)
18757 {
18758 switch (op)
18759 {
18760 case rsqrt_optab:
18761 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18762
18763 default:
18764 return true;
18765 }
18766 }
18767
18768 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18769
18770 static unsigned int
18771 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18772 int *offset)
18773 {
18774 /* Polynomial invariant 1 == (VG / 2) - 1. */
18775 gcc_assert (i == 1);
18776 *factor = 2;
18777 *offset = 1;
18778 return AARCH64_DWARF_VG;
18779 }
18780
18784 18781 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
18785 18782 if MODE is HFmode, and punt to the generic implementation otherwise. */
18783
18784 static bool
18785 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18786 {
18787 return (mode == HFmode
18788 ? true
18789 : default_libgcc_floating_mode_supported_p (mode));
18790 }
18791
18792 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18793 if MODE is HFmode, and punt to the generic implementation otherwise. */
18794
18795 static bool
18796 aarch64_scalar_mode_supported_p (scalar_mode mode)
18797 {
18798 return (mode == HFmode
18799 ? true
18800 : default_scalar_mode_supported_p (mode));
18801 }
18802
18803 /* Set the value of FLT_EVAL_METHOD.
18804 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18805
18806 0: evaluate all operations and constants, whose semantic type has at
18807 most the range and precision of type float, to the range and
18808 precision of float; evaluate all other operations and constants to
18809 the range and precision of the semantic type;
18810
18811 N, where _FloatN is a supported interchange floating type
18812 evaluate all operations and constants, whose semantic type has at
18813 most the range and precision of _FloatN type, to the range and
18814 precision of the _FloatN type; evaluate all other operations and
18815 constants to the range and precision of the semantic type;
18816
18817 If we have the ARMv8.2-A extensions then we support _Float16 in native
18818 precision, so we should set this to 16. Otherwise, we support the type,
18819 but want to evaluate expressions in float precision, so set this to
18820 0. */
18821
18822 static enum flt_eval_method
18823 aarch64_excess_precision (enum excess_precision_type type)
18824 {
18825 switch (type)
18826 {
18827 case EXCESS_PRECISION_TYPE_FAST:
18828 case EXCESS_PRECISION_TYPE_STANDARD:
18829 /* We can calculate either in 16-bit range and precision or
18830 32-bit range and precision. Make that decision based on whether
18831 we have native support for the ARMv8.2-A 16-bit floating-point
18832 instructions or not. */
18833 return (TARGET_FP_F16INST
18834 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18835 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18836 case EXCESS_PRECISION_TYPE_IMPLICIT:
18837 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18838 default:
18839 gcc_unreachable ();
18840 }
18841 return FLT_EVAL_METHOD_UNPREDICTABLE;
18842 }
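/* Concretely, for a hypothetical source fragment

     _Float16 a, b, c;
     c = a * b + c;

   the intermediate operations are evaluated in _Float16 when
   TARGET_FP_F16INST is available (FLT_EVAL_METHOD == 16) and in float
   otherwise (FLT_EVAL_METHOD == 0).  */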
18843
18844 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18845 scheduled for speculative execution. Reject the long-running division
18846 and square-root instructions. */
18847
18848 static bool
18849 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18850 {
18851 switch (get_attr_type (insn))
18852 {
18853 case TYPE_SDIV:
18854 case TYPE_UDIV:
18855 case TYPE_FDIVS:
18856 case TYPE_FDIVD:
18857 case TYPE_FSQRTS:
18858 case TYPE_FSQRTD:
18859 case TYPE_NEON_FP_SQRT_S:
18860 case TYPE_NEON_FP_SQRT_D:
18861 case TYPE_NEON_FP_SQRT_S_Q:
18862 case TYPE_NEON_FP_SQRT_D_Q:
18863 case TYPE_NEON_FP_DIV_S:
18864 case TYPE_NEON_FP_DIV_D:
18865 case TYPE_NEON_FP_DIV_S_Q:
18866 case TYPE_NEON_FP_DIV_D_Q:
18867 return false;
18868 default:
18869 return true;
18870 }
18871 }
18872
18873 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18874
18875 static int
18876 aarch64_compute_pressure_classes (reg_class *classes)
18877 {
18878 int i = 0;
18879 classes[i++] = GENERAL_REGS;
18880 classes[i++] = FP_REGS;
18881 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18882 registers need to go in PR_LO_REGS at some point during their
18883 lifetime. Splitting it into two halves has the effect of making
18884 all predicates count against PR_LO_REGS, so that we try whenever
18885 possible to restrict the number of live predicates to 8. This
18886 greatly reduces the amount of spilling in certain loops. */
18887 classes[i++] = PR_LO_REGS;
18888 classes[i++] = PR_HI_REGS;
18889 return i;
18890 }
18891
18892 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18893
18894 static bool
18895 aarch64_can_change_mode_class (machine_mode from,
18896 machine_mode to, reg_class_t)
18897 {
18898 if (BYTES_BIG_ENDIAN)
18899 {
18900 bool from_sve_p = aarch64_sve_data_mode_p (from);
18901 bool to_sve_p = aarch64_sve_data_mode_p (to);
18902
18903 /* Don't allow changes between SVE data modes and non-SVE modes.
18904 See the comment at the head of aarch64-sve.md for details. */
18905 if (from_sve_p != to_sve_p)
18906 return false;
18907
18908 /* Don't allow changes in element size: lane 0 of the new vector
18909 would not then be lane 0 of the old vector. See the comment
18910 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18911 description.
18912
18913 In the worst case, this forces a register to be spilled in
18914 one mode and reloaded in the other, which handles the
18915 endianness correctly. */
18916 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18917 return false;
18918 }
18919 return true;
18920 }
18921
18922 /* Implement TARGET_EARLY_REMAT_MODES. */
18923
18924 static void
18925 aarch64_select_early_remat_modes (sbitmap modes)
18926 {
18927 /* SVE values are not normally live across a call, so it should be
18928 worth doing early rematerialization even in VL-specific mode. */
18929 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18930 {
18931 machine_mode mode = (machine_mode) i;
18932 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18933 if (vec_flags & VEC_ANY_SVE)
18934 bitmap_set_bit (modes, i);
18935 }
18936 }
18937
18938 /* Override the default target speculation_safe_value. */
18939 static rtx
18940 aarch64_speculation_safe_value (machine_mode mode,
18941 rtx result, rtx val, rtx failval)
18942 {
18943 /* Maybe we should warn if falling back to hard barriers. They are
18947 18944 likely to be noticeably more expensive than the alternative below. */
18945 if (!aarch64_track_speculation)
18946 return default_speculation_safe_value (mode, result, val, failval);
18947
18948 if (!REG_P (val))
18949 val = copy_to_mode_reg (mode, val);
18950
18951 if (!aarch64_reg_or_zero (failval, mode))
18952 failval = copy_to_mode_reg (mode, failval);
18953
18954 emit_insn (gen_despeculate_copy (mode, result, val, failval));
18955 return result;
18956 }
18957
18958 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18959 Look into the tuning structure for an estimate.
18960 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18961 Advanced SIMD 128 bits. */
18962
18963 static HOST_WIDE_INT
18964 aarch64_estimated_poly_value (poly_int64 val)
18965 {
18966 enum aarch64_sve_vector_bits_enum width_source
18967 = aarch64_tune_params.sve_width;
18968
18969 /* If we still don't have an estimate, use the default. */
18970 if (width_source == SVE_SCALABLE)
18971 return default_estimated_poly_value (val);
18972
18973 HOST_WIDE_INT over_128 = width_source - 128;
18974 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
18975 }
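/* For example, assuming VAL is the number of bytes in an SVE data vector
   (16 + 16x in poly_int terms -- an assumption made purely for illustration)
   and the tuning structure reports sve_width == 256, the estimate is
   16 + 16 * (256 - 128) / 128 == 32 bytes.  */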
18976
18977
18978 /* Return true for types that could be supported as SIMD return or
18979 argument types. */
18980
18981 static bool
18982 supported_simd_type (tree t)
18983 {
18984 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
18985 {
18986 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
18987 return s == 1 || s == 2 || s == 4 || s == 8;
18988 }
18989 return false;
18990 }
18991
18992 /* Return true for types that currently are supported as SIMD return
18993 or argument types. */
18994
18995 static bool
18996 currently_supported_simd_type (tree t, tree b)
18997 {
18998 if (COMPLEX_FLOAT_TYPE_P (t))
18999 return false;
19000
19001 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19002 return false;
19003
19004 return supported_simd_type (t);
19005 }
19006
19007 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19008
19009 static int
19010 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19011 struct cgraph_simd_clone *clonei,
19012 tree base_type, int num)
19013 {
19014 tree t, ret_type, arg_type;
19015 unsigned int elt_bits, vec_bits, count;
19016
19017 if (!TARGET_SIMD)
19018 return 0;
19019
19020 if (clonei->simdlen
19021 && (clonei->simdlen < 2
19022 || clonei->simdlen > 1024
19023 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19024 {
19025 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19026 "unsupported simdlen %d", clonei->simdlen);
19027 return 0;
19028 }
19029
19030 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19031 if (TREE_CODE (ret_type) != VOID_TYPE
19032 && !currently_supported_simd_type (ret_type, base_type))
19033 {
19034 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19035 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19036 "GCC does not currently support mixed size types "
19037 "for %<simd%> functions");
19038 else if (supported_simd_type (ret_type))
19039 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19040 "GCC does not currently support return type %qT "
19041 "for %<simd%> functions", ret_type);
19042 else
19043 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19044 "unsupported return type %qT for %<simd%> functions",
19045 ret_type);
19046 return 0;
19047 }
19048
19049 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
19050 {
19051 arg_type = TREE_TYPE (t);
19052
19053 if (!currently_supported_simd_type (arg_type, base_type))
19054 {
19055 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19056 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19057 "GCC does not currently support mixed size types "
19058 "for %<simd%> functions");
19059 else
19060 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19061 "GCC does not currently support argument type %qT "
19062 "for %<simd%> functions", arg_type);
19063 return 0;
19064 }
19065 }
19066
19067 clonei->vecsize_mangle = 'n';
19068 clonei->mask_mode = VOIDmode;
19069 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19070 if (clonei->simdlen == 0)
19071 {
19072 count = 2;
19073 vec_bits = (num == 0 ? 64 : 128);
19074 clonei->simdlen = vec_bits / elt_bits;
19075 }
19076 else
19077 {
19078 count = 1;
19079 vec_bits = clonei->simdlen * elt_bits;
19080 if (vec_bits != 64 && vec_bits != 128)
19081 {
19082 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19083 "GCC does not currently support simdlen %d for type %qT",
19084 clonei->simdlen, base_type);
19085 return 0;
19086 }
19087 }
19088 clonei->vecsize_int = vec_bits;
19089 clonei->vecsize_float = vec_bits;
19090 return count;
19091 }
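/* For instance, for a "simd" function whose base type is float (32-bit
   elements) and which has no explicit simdlen, elt_bits == 32 and two
   clones are produced: a 64-bit one with simdlen 2 and a 128-bit one with
   simdlen 4, both using the Advanced SIMD 'n' mangling.  */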
19092
19093 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19094
19095 static void
19096 aarch64_simd_clone_adjust (struct cgraph_node *node)
19097 {
19098 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19099 use the correct ABI. */
19100
19101 tree t = TREE_TYPE (node->decl);
19102 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19103 TYPE_ATTRIBUTES (t));
19104 }
19105
19106 /* Implement TARGET_SIMD_CLONE_USABLE. */
19107
19108 static int
19109 aarch64_simd_clone_usable (struct cgraph_node *node)
19110 {
19111 switch (node->simdclone->vecsize_mangle)
19112 {
19113 case 'n':
19114 if (!TARGET_SIMD)
19115 return -1;
19116 return 0;
19117 default:
19118 gcc_unreachable ();
19119 }
19120 }
19121
19122 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
19123
19124 static int
19125 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19126 {
19127 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19128 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19129 return 0;
19130 return 1;
19131 }
19132
19133 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
19134
19135 static const char *
19136 aarch64_get_multilib_abi_name (void)
19137 {
19138 if (TARGET_BIG_END)
19139 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19140 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
19141 }
19142
19146 19143 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
19147 19144 global-variable-based guard, use the default; otherwise
19148 19145 return a null tree. */
19146 static tree
19147 aarch64_stack_protect_guard (void)
19148 {
19149 if (aarch64_stack_protector_guard == SSP_GLOBAL)
19150 return default_stack_protect_guard ();
19151
19152 return NULL_TREE;
19153 }
19154
19155 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
19156 section at the end if needed. */
19157 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
19158 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
19159 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
19160 void
19161 aarch64_file_end_indicate_exec_stack ()
19162 {
19163 file_end_indicate_exec_stack ();
19164
19165 unsigned feature_1_and = 0;
19166 if (aarch64_bti_enabled ())
19167 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
19168
19169 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
19170 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
19171
19172 if (feature_1_and)
19173 {
19174 /* Generate .note.gnu.property section. */
19175 switch_to_section (get_section (".note.gnu.property",
19176 SECTION_NOTYPE, NULL));
19177
19178 /* PT_NOTE header: namesz, descsz, type.
19179 namesz = 4 ("GNU\0")
19180 descsz = 16 (Size of the program property array)
19181 [(12 + padding) * Number of array elements]
19182 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
19183 assemble_align (POINTER_SIZE);
19184 assemble_integer (GEN_INT (4), 4, 32, 1);
19185 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
19186 assemble_integer (GEN_INT (5), 4, 32, 1);
19187
19188 /* PT_NOTE name. */
19189 assemble_string ("GNU", 4);
19190
19191 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
19192 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
19193 datasz = 4
19194 data = feature_1_and. */
19195 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
19196 assemble_integer (GEN_INT (4), 4, 32, 1);
19197 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
19198
19199 /* Pad the size of the note to the required alignment. */
19200 assemble_align (POINTER_SIZE);
19201 }
19202 }
19203 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
19204 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
19205 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
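/* As a concrete illustration (hand-assembled here, assuming LP64 with both
   BTI and pointer authentication enabled, so feature_1_and == 3), the note
   emitted above amounts to:

     .word  4            namesz ("GNU\0")
     .word  16           descsz (12 rounded up to POINTER_BYTES == 8)
     .word  5            type (NT_GNU_PROPERTY_TYPE_0)
     .asciz "GNU"
     .word  0xc0000000   GNU_PROPERTY_AARCH64_FEATURE_1_AND
     .word  4            datasz
     .word  3            GNU_PROPERTY_AARCH64_FEATURE_1_BTI | ..._PAC

   padded to an 8-byte boundary.  Exact assembler syntax will vary.  */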
19206
19207 /* Target-specific selftests. */
19208
19209 #if CHECKING_P
19210
19211 namespace selftest {
19212
19213 /* Selftest for the RTL loader.
19214 Verify that the RTL loader copes with a dump from
19215 print_rtx_function. This is essentially just a test that class
19216 function_reader can handle a real dump, but it also verifies
19217 that lookup_reg_by_dump_name correctly handles hard regs.
19218 The presence of hard reg names in the dump means that the test is
19219 target-specific, hence it is in this file. */
19220
19221 static void
19222 aarch64_test_loading_full_dump ()
19223 {
19224 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
19225
19226 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
19227
19228 rtx_insn *insn_1 = get_insn_by_uid (1);
19229 ASSERT_EQ (NOTE, GET_CODE (insn_1));
19230
19231 rtx_insn *insn_15 = get_insn_by_uid (15);
19232 ASSERT_EQ (INSN, GET_CODE (insn_15));
19233 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
19234
19235 /* Verify crtl->return_rtx. */
19236 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
19237 ASSERT_EQ (0, REGNO (crtl->return_rtx));
19238 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
19239 }
19240
19241 /* Run all target-specific selftests. */
19242
19243 static void
19244 aarch64_run_selftests (void)
19245 {
19246 aarch64_test_loading_full_dump ();
19247 }
19248
19249 } // namespace selftest
19250
19251 #endif /* #if CHECKING_P */
19252
19253 #undef TARGET_STACK_PROTECT_GUARD
19254 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
19255
19256 #undef TARGET_ADDRESS_COST
19257 #define TARGET_ADDRESS_COST aarch64_address_cost
19258
19262 19259 /* This hook determines whether unnamed bitfields affect the alignment
19260 of the containing structure. The hook returns true if the structure
19261 should inherit the alignment requirements of an unnamed bitfield's
19262 type. */
19263 #undef TARGET_ALIGN_ANON_BITFIELD
19264 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
19265
19266 #undef TARGET_ASM_ALIGNED_DI_OP
19267 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
19268
19269 #undef TARGET_ASM_ALIGNED_HI_OP
19270 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
19271
19272 #undef TARGET_ASM_ALIGNED_SI_OP
19273 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
19274
19275 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
19276 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
19277 hook_bool_const_tree_hwi_hwi_const_tree_true
19278
19279 #undef TARGET_ASM_FILE_START
19280 #define TARGET_ASM_FILE_START aarch64_start_file
19281
19282 #undef TARGET_ASM_OUTPUT_MI_THUNK
19283 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
19284
19285 #undef TARGET_ASM_SELECT_RTX_SECTION
19286 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
19287
19288 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
19289 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
19290
19291 #undef TARGET_BUILD_BUILTIN_VA_LIST
19292 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
19293
19294 #undef TARGET_CALLEE_COPIES
19295 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
19296
19297 #undef TARGET_CAN_ELIMINATE
19298 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
19299
19300 #undef TARGET_CAN_INLINE_P
19301 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
19302
19303 #undef TARGET_CANNOT_FORCE_CONST_MEM
19304 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
19305
19306 #undef TARGET_CASE_VALUES_THRESHOLD
19307 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
19308
19309 #undef TARGET_CONDITIONAL_REGISTER_USAGE
19310 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
19311
19312 /* Only the least significant bit is used for initialization guard
19313 variables. */
19314 #undef TARGET_CXX_GUARD_MASK_BIT
19315 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
19316
19317 #undef TARGET_C_MODE_FOR_SUFFIX
19318 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
19319
19320 #ifdef TARGET_BIG_ENDIAN_DEFAULT
19321 #undef TARGET_DEFAULT_TARGET_FLAGS
19322 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
19323 #endif
19324
19325 #undef TARGET_CLASS_MAX_NREGS
19326 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
19327
19328 #undef TARGET_BUILTIN_DECL
19329 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
19330
19331 #undef TARGET_BUILTIN_RECIPROCAL
19332 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
19333
19334 #undef TARGET_C_EXCESS_PRECISION
19335 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
19336
19337 #undef TARGET_EXPAND_BUILTIN
19338 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
19339
19340 #undef TARGET_EXPAND_BUILTIN_VA_START
19341 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
19342
19343 #undef TARGET_FOLD_BUILTIN
19344 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
19345
19346 #undef TARGET_FUNCTION_ARG
19347 #define TARGET_FUNCTION_ARG aarch64_function_arg
19348
19349 #undef TARGET_FUNCTION_ARG_ADVANCE
19350 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
19351
19352 #undef TARGET_FUNCTION_ARG_BOUNDARY
19353 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
19354
19355 #undef TARGET_FUNCTION_ARG_PADDING
19356 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
19357
19358 #undef TARGET_GET_RAW_RESULT_MODE
19359 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
19360 #undef TARGET_GET_RAW_ARG_MODE
19361 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
19362
19363 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
19364 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
19365
19366 #undef TARGET_FUNCTION_VALUE
19367 #define TARGET_FUNCTION_VALUE aarch64_function_value
19368
19369 #undef TARGET_FUNCTION_VALUE_REGNO_P
19370 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
19371
19372 #undef TARGET_GIMPLE_FOLD_BUILTIN
19373 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
19374
19375 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
19376 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
19377
19378 #undef TARGET_INIT_BUILTINS
19379 #define TARGET_INIT_BUILTINS aarch64_init_builtins
19380
19381 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
19382 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
19383 aarch64_ira_change_pseudo_allocno_class
19384
19385 #undef TARGET_LEGITIMATE_ADDRESS_P
19386 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
19387
19388 #undef TARGET_LEGITIMATE_CONSTANT_P
19389 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
19390
19391 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
19392 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
19393 aarch64_legitimize_address_displacement
19394
19395 #undef TARGET_LIBGCC_CMP_RETURN_MODE
19396 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
19397
19398 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
19399 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
19400 aarch64_libgcc_floating_mode_supported_p
19401
19402 #undef TARGET_MANGLE_TYPE
19403 #define TARGET_MANGLE_TYPE aarch64_mangle_type
19404
19405 #undef TARGET_MEMORY_MOVE_COST
19406 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
19407
19408 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
19409 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
19410
19411 #undef TARGET_MUST_PASS_IN_STACK
19412 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
19413
19414 /* This target hook should return true if accesses to volatile bitfields
19415 should use the narrowest mode possible. It should return false if these
19416 accesses should use the bitfield container type. */
19417 #undef TARGET_NARROW_VOLATILE_BITFIELD
19418 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
19419
19420 #undef TARGET_OPTION_OVERRIDE
19421 #define TARGET_OPTION_OVERRIDE aarch64_override_options
19422
19423 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
19424 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
19425 aarch64_override_options_after_change
19426
19427 #undef TARGET_OPTION_SAVE
19428 #define TARGET_OPTION_SAVE aarch64_option_save
19429
19430 #undef TARGET_OPTION_RESTORE
19431 #define TARGET_OPTION_RESTORE aarch64_option_restore
19432
19433 #undef TARGET_OPTION_PRINT
19434 #define TARGET_OPTION_PRINT aarch64_option_print
19435
19436 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
19437 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
19438
19439 #undef TARGET_SET_CURRENT_FUNCTION
19440 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
19441
19442 #undef TARGET_PASS_BY_REFERENCE
19443 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
19444
19445 #undef TARGET_PREFERRED_RELOAD_CLASS
19446 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
19447
19448 #undef TARGET_SCHED_REASSOCIATION_WIDTH
19449 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
19450
19451 #undef TARGET_PROMOTED_TYPE
19452 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
19453
19454 #undef TARGET_SECONDARY_RELOAD
19455 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
19456
19457 #undef TARGET_SHIFT_TRUNCATION_MASK
19458 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
19459
19460 #undef TARGET_SETUP_INCOMING_VARARGS
19461 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
19462
19463 #undef TARGET_STRUCT_VALUE_RTX
19464 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
19465
19466 #undef TARGET_REGISTER_MOVE_COST
19467 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
19468
19469 #undef TARGET_RETURN_IN_MEMORY
19470 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
19471
19472 #undef TARGET_RETURN_IN_MSB
19473 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
19474
19475 #undef TARGET_RTX_COSTS
19476 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
19477
19478 #undef TARGET_SCALAR_MODE_SUPPORTED_P
19479 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
19480
19481 #undef TARGET_SCHED_ISSUE_RATE
19482 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
19483
19484 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
19485 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
19486 aarch64_sched_first_cycle_multipass_dfa_lookahead
19487
19488 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
19489 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
19490 aarch64_first_cycle_multipass_dfa_lookahead_guard
19491
19492 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
19493 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
19494 aarch64_get_separate_components
19495
19496 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
19497 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
19498 aarch64_components_for_bb
19499
19500 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
19501 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
19502 aarch64_disqualify_components
19503
19504 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
19505 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
19506 aarch64_emit_prologue_components
19507
19508 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
19509 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
19510 aarch64_emit_epilogue_components
19511
19512 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
19513 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
19514 aarch64_set_handled_components
19515
19516 #undef TARGET_TRAMPOLINE_INIT
19517 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
19518
19519 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
19520 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
19521
19522 #undef TARGET_VECTOR_MODE_SUPPORTED_P
19523 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
19524
19525 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
19526 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
19527 aarch64_builtin_support_vector_misalignment
19528
19529 #undef TARGET_ARRAY_MODE
19530 #define TARGET_ARRAY_MODE aarch64_array_mode
19531
19532 #undef TARGET_ARRAY_MODE_SUPPORTED_P
19533 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
19534
19535 #undef TARGET_VECTORIZE_ADD_STMT_COST
19536 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
19537
19538 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
19539 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
19540 aarch64_builtin_vectorization_cost
19541
19542 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
19543 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
19544
19545 #undef TARGET_VECTORIZE_BUILTINS
19546 #define TARGET_VECTORIZE_BUILTINS
19547
19548 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
19549 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
19550 aarch64_builtin_vectorized_function
19551
19552 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
19553 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
19554 aarch64_autovectorize_vector_sizes
19555
19556 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
19557 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
19558 aarch64_atomic_assign_expand_fenv
19559
19560 /* Section anchor support. */
19561
19562 #undef TARGET_MIN_ANCHOR_OFFSET
19563 #define TARGET_MIN_ANCHOR_OFFSET -256
19564
19565 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
19566 byte offset; we can do much more for larger data types, but have no way
19567 to determine the size of the access. We assume accesses are aligned. */
19568 #undef TARGET_MAX_ANCHOR_OFFSET
19569 #define TARGET_MAX_ANCHOR_OFFSET 4095
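/* Roughly, the two limits above track the AArch64 addressing modes an
   anchored access can use, e.g.:
     ldur w0, [x1, #-256]    // 9-bit signed unscaled offset
     ldrb w0, [x1, #4095]    // 12-bit unsigned offset, scaled by the
                             // access size (1 byte here)
   so an object whose offset from its section anchor falls in
   [-256, 4095] can be addressed directly off the anchor register.  */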
19570
19571 #undef TARGET_VECTOR_ALIGNMENT
19572 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
19573
19574 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
19575 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
19576 aarch64_vectorize_preferred_vector_alignment
19577 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
19578 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
19579 aarch64_simd_vector_alignment_reachable
19580
19581 /* vec_perm support. */
19582
19583 #undef TARGET_VECTORIZE_VEC_PERM_CONST
19584 #define TARGET_VECTORIZE_VEC_PERM_CONST \
19585 aarch64_vectorize_vec_perm_const
19586
19587 #undef TARGET_VECTORIZE_GET_MASK_MODE
19588 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
19589 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
19590 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
19591 aarch64_empty_mask_is_expensive
19592 #undef TARGET_PREFERRED_ELSE_VALUE
19593 #define TARGET_PREFERRED_ELSE_VALUE \
19594 aarch64_preferred_else_value
19595
19596 #undef TARGET_INIT_LIBFUNCS
19597 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
19598
19599 #undef TARGET_FIXED_CONDITION_CODE_REGS
19600 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
19601
19602 #undef TARGET_FLAGS_REGNUM
19603 #define TARGET_FLAGS_REGNUM CC_REGNUM
19604
19605 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
19606 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
19607
19608 #undef TARGET_ASAN_SHADOW_OFFSET
19609 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
19610
19611 #undef TARGET_LEGITIMIZE_ADDRESS
19612 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
19613
19614 #undef TARGET_SCHED_CAN_SPECULATE_INSN
19615 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
19616
19617 #undef TARGET_CAN_USE_DOLOOP_P
19618 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
19619
19620 #undef TARGET_SCHED_ADJUST_PRIORITY
19621 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
19622
19623 #undef TARGET_SCHED_MACRO_FUSION_P
19624 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
19625
19626 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
19627 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
19628
19629 #undef TARGET_SCHED_FUSION_PRIORITY
19630 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
19631
19632 #undef TARGET_UNSPEC_MAY_TRAP_P
19633 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
19634
19635 #undef TARGET_USE_PSEUDO_PIC_REG
19636 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
19637
19638 #undef TARGET_PRINT_OPERAND
19639 #define TARGET_PRINT_OPERAND aarch64_print_operand
19640
19641 #undef TARGET_PRINT_OPERAND_ADDRESS
19642 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
19643
19644 #undef TARGET_OPTAB_SUPPORTED_P
19645 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
19646
19647 #undef TARGET_OMIT_STRUCT_RETURN_REG
19648 #define TARGET_OMIT_STRUCT_RETURN_REG true
19649
19650 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
19651 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
19652 aarch64_dwarf_poly_indeterminate_value
19653
19654 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
19655 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
19656 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
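/* 4 == 1 << 2, so descriptor addresses can be told apart from ordinary
   code addresses at run time by testing bit 2.  */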
19657
19658 #undef TARGET_HARD_REGNO_NREGS
19659 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
19660 #undef TARGET_HARD_REGNO_MODE_OK
19661 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
19662
19663 #undef TARGET_MODES_TIEABLE_P
19664 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
19665
19666 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
19667 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
19668 aarch64_hard_regno_call_part_clobbered
19669
19670 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
19671 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
19672 aarch64_remove_extra_call_preserved_regs
19673
19674 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
19675 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
19676 aarch64_return_call_with_max_clobbers
19677
19678 #undef TARGET_CONSTANT_ALIGNMENT
19679 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
19680
19681 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
19682 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
19683 aarch64_stack_clash_protection_alloca_probe_range
19684
19685 #undef TARGET_COMPUTE_PRESSURE_CLASSES
19686 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
19687
19688 #undef TARGET_CAN_CHANGE_MODE_CLASS
19689 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
19690
19691 #undef TARGET_SELECT_EARLY_REMAT_MODES
19692 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
19693
19694 #undef TARGET_SPECULATION_SAFE_VALUE
19695 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
19696
19697 #undef TARGET_ESTIMATED_POLY_VALUE
19698 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
19699
19700 #undef TARGET_ATTRIBUTE_TABLE
19701 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
19702
19703 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
19704 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
19705 aarch64_simd_clone_compute_vecsize_and_simdlen
19706
19707 #undef TARGET_SIMD_CLONE_ADJUST
19708 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
19709
19710 #undef TARGET_SIMD_CLONE_USABLE
19711 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
19712
19713 #undef TARGET_COMP_TYPE_ATTRIBUTES
19714 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
19715
19716 #undef TARGET_GET_MULTILIB_ABI_NAME
19717 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
19718
19719 #if CHECKING_P
19720 #undef TARGET_RUN_TARGET_SELFTESTS
19721 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
19722 #endif /* #if CHECKING_P */
19723
19724 #undef TARGET_ASM_POST_CFI_STARTPROC
19725 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
19726
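/* Instantiate the target hook vector.  TARGET_INITIALIZER (provided via
   target-def.h) expands to an initializer that uses each TARGET_* macro
   defined above and the documented default for every hook not
   overridden here.  */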
19727 struct gcc_target targetm = TARGET_INITIALIZER;
19728
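/* Include the gengtype-generated garbage-collector tables for the
   GTY(())-marked state in this file.  */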
19729 #include "gt-aarch64.h"