1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95
96 /* The mode of the elements. */
97 scalar_mode elt_mode;
98
99 /* The value of each element if all elements are the same, or the
100 first value if the constant is a series. */
101 rtx value;
102
103 /* The value of the step if the constant is a series, null otherwise. */
104 rtx step;
105
106 /* The instruction to use to move the immediate into a vector. */
107 insn_type insn;
108
109 /* The kind of shift modifier to use, and the number of bits to shift.
110 This is (LSL, 0) if no shift is needed. */
111 modifier_type modifier;
112 unsigned int shift;
113 };
114
115 /* Construct a floating-point immediate in which each element has mode
116 ELT_MODE_IN and value VALUE_IN. */
117 inline simd_immediate_info
118 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
119 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
120 modifier (LSL), shift (0)
121 {}
122
123 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
124 and value VALUE_IN. The other parameters are as for the structure
125 fields. */
126 inline simd_immediate_info
127 ::simd_immediate_info (scalar_int_mode elt_mode_in,
128 unsigned HOST_WIDE_INT value_in,
129 insn_type insn_in, modifier_type modifier_in,
130 unsigned int shift_in)
131 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
132 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
133 {}
134
135 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
136 and where element I is equal to VALUE_IN + I * STEP_IN. */
137 inline simd_immediate_info
138 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
139 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
140 modifier (LSL), shift (0)
141 {}
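/* Illustrative note (not part of the original sources): for an Advanced
   SIMD constant such as a V4SI vector whose every element is 0x2500,
   the immediate would be described roughly as

     elt_mode = SImode, value = 0x25, insn = MOV, modifier = LSL, shift = 8

   i.e. "movi v0.4s, #0x25, lsl #8".  The field values here are an assumed
   example for explanation only, not taken from a particular call site.  */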
142
143 /* The current code model. */
144 enum aarch64_code_model aarch64_cmodel;
145
146 /* The number of 64-bit elements in an SVE vector. */
147 poly_uint16 aarch64_sve_vg;
148
149 #ifdef HAVE_AS_TLS
150 #undef TARGET_HAVE_TLS
151 #define TARGET_HAVE_TLS 1
152 #endif
153
154 static bool aarch64_composite_type_p (const_tree, machine_mode);
155 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
156 const_tree,
157 machine_mode *, int *,
158 bool *);
159 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
160 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
161 static void aarch64_override_options_after_change (void);
162 static bool aarch64_vector_mode_supported_p (machine_mode);
163 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
164 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
165 const_tree type,
166 int misalignment,
167 bool is_packed);
168 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
169 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
170 aarch64_addr_query_type);
171 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
172
173 /* Major revision number of the ARM Architecture implemented by the target. */
174 unsigned aarch64_architecture_version;
175
176 /* The processor for which instructions should be scheduled. */
177 enum aarch64_processor aarch64_tune = cortexa53;
178
179 /* Mask to specify which instruction scheduling options should be used. */
180 unsigned long aarch64_tune_flags = 0;
181
182 /* Global flag for PC relative loads. */
183 bool aarch64_pcrelative_literal_loads;
184
185 /* Global flag for whether frame pointer is enabled. */
186 bool aarch64_use_frame_pointer;
187
188 #define BRANCH_PROTECT_STR_MAX 255
189 char *accepted_branch_protection_string = NULL;
190
191 static enum aarch64_parse_opt_result
192 aarch64_parse_branch_protection (const char*, char**);
193
194 /* Support for command line parsing of boolean flags in the tuning
195 structures. */
196 struct aarch64_flag_desc
197 {
198 const char* name;
199 unsigned int flag;
200 };
201
202 #define AARCH64_FUSION_PAIR(name, internal_name) \
203 { name, AARCH64_FUSE_##internal_name },
204 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
205 {
206 { "none", AARCH64_FUSE_NOTHING },
207 #include "aarch64-fusion-pairs.def"
208 { "all", AARCH64_FUSE_ALL },
209 { NULL, AARCH64_FUSE_NOTHING }
210 };
211
212 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
213 { name, AARCH64_EXTRA_TUNE_##internal_name },
214 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
215 {
216 { "none", AARCH64_EXTRA_TUNE_NONE },
217 #include "aarch64-tuning-flags.def"
218 { "all", AARCH64_EXTRA_TUNE_ALL },
219 { NULL, AARCH64_EXTRA_TUNE_NONE }
220 };
221
222 /* Tuning parameters. */
223
224 static const struct cpu_addrcost_table generic_addrcost_table =
225 {
226 {
227 1, /* hi */
228 0, /* si */
229 0, /* di */
230 1, /* ti */
231 },
232 0, /* pre_modify */
233 0, /* post_modify */
234 0, /* register_offset */
235 0, /* register_sextend */
236 0, /* register_zextend */
237 0 /* imm_offset */
238 };
239
240 static const struct cpu_addrcost_table exynosm1_addrcost_table =
241 {
242 {
243 0, /* hi */
244 0, /* si */
245 0, /* di */
246 2, /* ti */
247 },
248 0, /* pre_modify */
249 0, /* post_modify */
250 1, /* register_offset */
251 1, /* register_sextend */
252 2, /* register_zextend */
253 0, /* imm_offset */
254 };
255
256 static const struct cpu_addrcost_table xgene1_addrcost_table =
257 {
258 {
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
263 },
264 1, /* pre_modify */
265 1, /* post_modify */
266 0, /* register_offset */
267 1, /* register_sextend */
268 1, /* register_zextend */
269 0, /* imm_offset */
270 };
271
272 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
273 {
274 {
275 1, /* hi */
276 1, /* si */
277 1, /* di */
278 2, /* ti */
279 },
280 0, /* pre_modify */
281 0, /* post_modify */
282 2, /* register_offset */
283 3, /* register_sextend */
284 3, /* register_zextend */
285 0, /* imm_offset */
286 };
287
288 static const struct cpu_addrcost_table tsv110_addrcost_table =
289 {
290 {
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
295 },
296 0, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
302 };
303
304 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
305 {
306 {
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
311 },
312 1, /* pre_modify */
313 1, /* post_modify */
314 3, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 2, /* imm_offset */
318 };
319
320 static const struct cpu_regmove_cost generic_regmove_cost =
321 {
322 1, /* GP2GP */
323 /* Avoid the use of slow int<->fp moves for spilling by setting
324 their cost higher than memmov_cost. */
325 5, /* GP2FP */
326 5, /* FP2GP */
327 2 /* FP2FP */
328 };
329
330 static const struct cpu_regmove_cost cortexa57_regmove_cost =
331 {
332 1, /* GP2GP */
333 /* Avoid the use of slow int<->fp moves for spilling by setting
334 their cost higher than memmov_cost. */
335 5, /* GP2FP */
336 5, /* FP2GP */
337 2 /* FP2FP */
338 };
339
340 static const struct cpu_regmove_cost cortexa53_regmove_cost =
341 {
342 1, /* GP2GP */
343 /* Avoid the use of slow int<->fp moves for spilling by setting
344 their cost higher than memmov_cost. */
345 5, /* GP2FP */
346 5, /* FP2GP */
347 2 /* FP2FP */
348 };
349
350 static const struct cpu_regmove_cost exynosm1_regmove_cost =
351 {
352 1, /* GP2GP */
353 /* Avoid the use of slow int<->fp moves for spilling by setting
354 their cost higher than memmov_cost (the actual costs are 4 and 9). */
355 9, /* GP2FP */
356 9, /* FP2GP */
357 1 /* FP2FP */
358 };
359
360 static const struct cpu_regmove_cost thunderx_regmove_cost =
361 {
362 2, /* GP2GP */
363 2, /* GP2FP */
364 6, /* FP2GP */
365 4 /* FP2FP */
366 };
367
368 static const struct cpu_regmove_cost xgene1_regmove_cost =
369 {
370 1, /* GP2GP */
371 /* Avoid the use of slow int<->fp moves for spilling by setting
372 their cost higher than memmov_cost. */
373 8, /* GP2FP */
374 8, /* FP2GP */
375 2 /* FP2FP */
376 };
377
378 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
379 {
380 2, /* GP2GP */
381 /* Avoid the use of int<->fp moves for spilling. */
382 6, /* GP2FP */
383 6, /* FP2GP */
384 4 /* FP2FP */
385 };
386
387 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
388 {
389 1, /* GP2GP */
390 /* Avoid the use of int<->fp moves for spilling. */
391 8, /* GP2FP */
392 8, /* FP2GP */
393 4 /* FP2FP */
394 };
395
396 static const struct cpu_regmove_cost tsv110_regmove_cost =
397 {
398 1, /* GP2GP */
399 /* Avoid the use of slow int<->fp moves for spilling by setting
400 their cost higher than memmov_cost. */
401 2, /* GP2FP */
402 3, /* FP2GP */
403 2 /* FP2FP */
404 };
405
406 /* Generic costs for vector insn classes. */
407 static const struct cpu_vector_cost generic_vector_cost =
408 {
409 1, /* scalar_int_stmt_cost */
410 1, /* scalar_fp_stmt_cost */
411 1, /* scalar_load_cost */
412 1, /* scalar_store_cost */
413 1, /* vec_int_stmt_cost */
414 1, /* vec_fp_stmt_cost */
415 2, /* vec_permute_cost */
416 1, /* vec_to_scalar_cost */
417 1, /* scalar_to_vec_cost */
418 1, /* vec_align_load_cost */
419 1, /* vec_unalign_load_cost */
420 1, /* vec_unalign_store_cost */
421 1, /* vec_store_cost */
422 3, /* cond_taken_branch_cost */
423 1 /* cond_not_taken_branch_cost */
424 };
425
426 /* QDF24XX costs for vector insn classes. */
427 static const struct cpu_vector_cost qdf24xx_vector_cost =
428 {
429 1, /* scalar_int_stmt_cost */
430 1, /* scalar_fp_stmt_cost */
431 1, /* scalar_load_cost */
432 1, /* scalar_store_cost */
433 1, /* vec_int_stmt_cost */
434 3, /* vec_fp_stmt_cost */
435 2, /* vec_permute_cost */
436 1, /* vec_to_scalar_cost */
437 1, /* scalar_to_vec_cost */
438 1, /* vec_align_load_cost */
439 1, /* vec_unalign_load_cost */
440 1, /* vec_unalign_store_cost */
441 1, /* vec_store_cost */
442 3, /* cond_taken_branch_cost */
443 1 /* cond_not_taken_branch_cost */
444 };
445
446 /* ThunderX costs for vector insn classes. */
447 static const struct cpu_vector_cost thunderx_vector_cost =
448 {
449 1, /* scalar_int_stmt_cost */
450 1, /* scalar_fp_stmt_cost */
451 3, /* scalar_load_cost */
452 1, /* scalar_store_cost */
453 4, /* vec_int_stmt_cost */
454 1, /* vec_fp_stmt_cost */
455 4, /* vec_permute_cost */
456 2, /* vec_to_scalar_cost */
457 2, /* scalar_to_vec_cost */
458 3, /* vec_align_load_cost */
459 5, /* vec_unalign_load_cost */
460 5, /* vec_unalign_store_cost */
461 1, /* vec_store_cost */
462 3, /* cond_taken_branch_cost */
463 3 /* cond_not_taken_branch_cost */
464 };
465
466 static const struct cpu_vector_cost tsv110_vector_cost =
467 {
468 1, /* scalar_int_stmt_cost */
469 1, /* scalar_fp_stmt_cost */
470 5, /* scalar_load_cost */
471 1, /* scalar_store_cost */
472 2, /* vec_int_stmt_cost */
473 2, /* vec_fp_stmt_cost */
474 2, /* vec_permute_cost */
475 3, /* vec_to_scalar_cost */
476 2, /* scalar_to_vec_cost */
477 5, /* vec_align_load_cost */
478 5, /* vec_unalign_load_cost */
479 1, /* vec_unalign_store_cost */
480 1, /* vec_store_cost */
481 1, /* cond_taken_branch_cost */
482 1 /* cond_not_taken_branch_cost */
483 };
484
485 /* Generic costs for vector insn classes. */
486 static const struct cpu_vector_cost cortexa57_vector_cost =
487 {
488 1, /* scalar_int_stmt_cost */
489 1, /* scalar_fp_stmt_cost */
490 4, /* scalar_load_cost */
491 1, /* scalar_store_cost */
492 2, /* vec_int_stmt_cost */
493 2, /* vec_fp_stmt_cost */
494 3, /* vec_permute_cost */
495 8, /* vec_to_scalar_cost */
496 8, /* scalar_to_vec_cost */
497 4, /* vec_align_load_cost */
498 4, /* vec_unalign_load_cost */
499 1, /* vec_unalign_store_cost */
500 1, /* vec_store_cost */
501 1, /* cond_taken_branch_cost */
502 1 /* cond_not_taken_branch_cost */
503 };
504
505 static const struct cpu_vector_cost exynosm1_vector_cost =
506 {
507 1, /* scalar_int_stmt_cost */
508 1, /* scalar_fp_stmt_cost */
509 5, /* scalar_load_cost */
510 1, /* scalar_store_cost */
511 3, /* vec_int_stmt_cost */
512 3, /* vec_fp_stmt_cost */
513 3, /* vec_permute_cost */
514 3, /* vec_to_scalar_cost */
515 3, /* scalar_to_vec_cost */
516 5, /* vec_align_load_cost */
517 5, /* vec_unalign_load_cost */
518 1, /* vec_unalign_store_cost */
519 1, /* vec_store_cost */
520 1, /* cond_taken_branch_cost */
521 1 /* cond_not_taken_branch_cost */
522 };
523
524 /* Generic costs for vector insn classes. */
525 static const struct cpu_vector_cost xgene1_vector_cost =
526 {
527 1, /* scalar_int_stmt_cost */
528 1, /* scalar_fp_stmt_cost */
529 5, /* scalar_load_cost */
530 1, /* scalar_store_cost */
531 2, /* vec_int_stmt_cost */
532 2, /* vec_fp_stmt_cost */
533 2, /* vec_permute_cost */
534 4, /* vec_to_scalar_cost */
535 4, /* scalar_to_vec_cost */
536 10, /* vec_align_load_cost */
537 10, /* vec_unalign_load_cost */
538 2, /* vec_unalign_store_cost */
539 2, /* vec_store_cost */
540 2, /* cond_taken_branch_cost */
541 1 /* cond_not_taken_branch_cost */
542 };
543
544 /* Costs for vector insn classes for Vulcan. */
545 static const struct cpu_vector_cost thunderx2t99_vector_cost =
546 {
547 1, /* scalar_int_stmt_cost */
548 6, /* scalar_fp_stmt_cost */
549 4, /* scalar_load_cost */
550 1, /* scalar_store_cost */
551 5, /* vec_int_stmt_cost */
552 6, /* vec_fp_stmt_cost */
553 3, /* vec_permute_cost */
554 6, /* vec_to_scalar_cost */
555 5, /* scalar_to_vec_cost */
556 8, /* vec_align_load_cost */
557 8, /* vec_unalign_load_cost */
558 4, /* vec_unalign_store_cost */
559 4, /* vec_store_cost */
560 2, /* cond_taken_branch_cost */
561 1 /* cond_not_taken_branch_cost */
562 };
563
564 /* Generic costs for branch instructions. */
565 static const struct cpu_branch_cost generic_branch_cost =
566 {
567 1, /* Predictable. */
568 3 /* Unpredictable. */
569 };
570
571 /* Generic approximation modes. */
572 static const cpu_approx_modes generic_approx_modes =
573 {
574 AARCH64_APPROX_NONE, /* division */
575 AARCH64_APPROX_NONE, /* sqrt */
576 AARCH64_APPROX_NONE /* recip_sqrt */
577 };
578
579 /* Approximation modes for Exynos M1. */
580 static const cpu_approx_modes exynosm1_approx_modes =
581 {
582 AARCH64_APPROX_NONE, /* division */
583 AARCH64_APPROX_ALL, /* sqrt */
584 AARCH64_APPROX_ALL /* recip_sqrt */
585 };
586
587 /* Approximation modes for X-Gene 1. */
588 static const cpu_approx_modes xgene1_approx_modes =
589 {
590 AARCH64_APPROX_NONE, /* division */
591 AARCH64_APPROX_NONE, /* sqrt */
592 AARCH64_APPROX_ALL /* recip_sqrt */
593 };
594
595 /* Generic prefetch settings (which disable prefetch). */
596 static const cpu_prefetch_tune generic_prefetch_tune =
597 {
598 0, /* num_slots */
599 -1, /* l1_cache_size */
600 -1, /* l1_cache_line_size */
601 -1, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 -1 /* default_opt_level */
605 };
606
607 static const cpu_prefetch_tune exynosm1_prefetch_tune =
608 {
609 0, /* num_slots */
610 -1, /* l1_cache_size */
611 64, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
616 };
617
618 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
619 {
620 4, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 512, /* l2_cache_size */
624 false, /* prefetch_dynamic_strides */
625 2048, /* minimum_stride */
626 3 /* default_opt_level */
627 };
628
629 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
630 {
631 8, /* num_slots */
632 32, /* l1_cache_size */
633 128, /* l1_cache_line_size */
634 16*1024, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 3 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune thunderx_prefetch_tune =
641 {
642 8, /* num_slots */
643 32, /* l1_cache_size */
644 128, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
652 {
653 8, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 256, /* l2_cache_size */
657 true, /* prefetch_dynamic_strides */
658 -1, /* minimum_stride */
659 -1 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune tsv110_prefetch_tune =
663 {
664 0, /* num_slots */
665 64, /* l1_cache_size */
666 64, /* l1_cache_line_size */
667 512, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 -1 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune xgene1_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 64, /* l1_cache_line_size */
678 256, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const struct tune_params generic_tunings =
685 {
686 &cortexa57_extra_costs,
687 &generic_addrcost_table,
688 &generic_regmove_cost,
689 &generic_vector_cost,
690 &generic_branch_cost,
691 &generic_approx_modes,
692 SVE_NOT_IMPLEMENTED, /* sve_width */
693 4, /* memmov_cost */
694 2, /* issue_rate */
695 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
696 "8", /* function_align. */
697 "4", /* jump_align. */
698 "8", /* loop_align. */
699 2, /* int_reassoc_width. */
700 4, /* fp_reassoc_width. */
701 1, /* vec_reassoc_width. */
702 2, /* min_div_recip_mul_sf. */
703 2, /* min_div_recip_mul_df. */
704 0, /* max_case_values. */
705 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
706 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
707 &generic_prefetch_tune
708 };
709
710 static const struct tune_params cortexa35_tunings =
711 {
712 &cortexa53_extra_costs,
713 &generic_addrcost_table,
714 &cortexa53_regmove_cost,
715 &generic_vector_cost,
716 &generic_branch_cost,
717 &generic_approx_modes,
718 SVE_NOT_IMPLEMENTED, /* sve_width */
719 4, /* memmov_cost */
720 1, /* issue_rate */
721 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
722 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
723 "16", /* function_align. */
724 "4", /* jump_align. */
725 "8", /* loop_align. */
726 2, /* int_reassoc_width. */
727 4, /* fp_reassoc_width. */
728 1, /* vec_reassoc_width. */
729 2, /* min_div_recip_mul_sf. */
730 2, /* min_div_recip_mul_df. */
731 0, /* max_case_values. */
732 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
733 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
734 &generic_prefetch_tune
735 };
736
737 static const struct tune_params cortexa53_tunings =
738 {
739 &cortexa53_extra_costs,
740 &generic_addrcost_table,
741 &cortexa53_regmove_cost,
742 &generic_vector_cost,
743 &generic_branch_cost,
744 &generic_approx_modes,
745 SVE_NOT_IMPLEMENTED, /* sve_width */
746 4, /* memmov_cost */
747 2, /* issue_rate */
748 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
749 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
750 "16", /* function_align. */
751 "4", /* jump_align. */
752 "8", /* loop_align. */
753 2, /* int_reassoc_width. */
754 4, /* fp_reassoc_width. */
755 1, /* vec_reassoc_width. */
756 2, /* min_div_recip_mul_sf. */
757 2, /* min_div_recip_mul_df. */
758 0, /* max_case_values. */
759 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
760 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
761 &generic_prefetch_tune
762 };
763
764 static const struct tune_params cortexa57_tunings =
765 {
766 &cortexa57_extra_costs,
767 &generic_addrcost_table,
768 &cortexa57_regmove_cost,
769 &cortexa57_vector_cost,
770 &generic_branch_cost,
771 &generic_approx_modes,
772 SVE_NOT_IMPLEMENTED, /* sve_width */
773 4, /* memmov_cost */
774 3, /* issue_rate */
775 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
776 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
777 "16", /* function_align. */
778 "4", /* jump_align. */
779 "8", /* loop_align. */
780 2, /* int_reassoc_width. */
781 4, /* fp_reassoc_width. */
782 1, /* vec_reassoc_width. */
783 2, /* min_div_recip_mul_sf. */
784 2, /* min_div_recip_mul_df. */
785 0, /* max_case_values. */
786 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
787 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
788 &generic_prefetch_tune
789 };
790
791 static const struct tune_params cortexa72_tunings =
792 {
793 &cortexa57_extra_costs,
794 &generic_addrcost_table,
795 &cortexa57_regmove_cost,
796 &cortexa57_vector_cost,
797 &generic_branch_cost,
798 &generic_approx_modes,
799 SVE_NOT_IMPLEMENTED, /* sve_width */
800 4, /* memmov_cost */
801 3, /* issue_rate */
802 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
803 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
804 "16", /* function_align. */
805 "4", /* jump_align. */
806 "8", /* loop_align. */
807 2, /* int_reassoc_width. */
808 4, /* fp_reassoc_width. */
809 1, /* vec_reassoc_width. */
810 2, /* min_div_recip_mul_sf. */
811 2, /* min_div_recip_mul_df. */
812 0, /* max_case_values. */
813 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
814 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
815 &generic_prefetch_tune
816 };
817
818 static const struct tune_params cortexa73_tunings =
819 {
820 &cortexa57_extra_costs,
821 &generic_addrcost_table,
822 &cortexa57_regmove_cost,
823 &cortexa57_vector_cost,
824 &generic_branch_cost,
825 &generic_approx_modes,
826 SVE_NOT_IMPLEMENTED, /* sve_width */
827 4, /* memmov_cost. */
828 2, /* issue_rate. */
829 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
830 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
831 "16", /* function_align. */
832 "4", /* jump_align. */
833 "8", /* loop_align. */
834 2, /* int_reassoc_width. */
835 4, /* fp_reassoc_width. */
836 1, /* vec_reassoc_width. */
837 2, /* min_div_recip_mul_sf. */
838 2, /* min_div_recip_mul_df. */
839 0, /* max_case_values. */
840 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
841 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
842 &generic_prefetch_tune
843 };
844
845
846
847 static const struct tune_params exynosm1_tunings =
848 {
849 &exynosm1_extra_costs,
850 &exynosm1_addrcost_table,
851 &exynosm1_regmove_cost,
852 &exynosm1_vector_cost,
853 &generic_branch_cost,
854 &exynosm1_approx_modes,
855 SVE_NOT_IMPLEMENTED, /* sve_width */
856 4, /* memmov_cost */
857 3, /* issue_rate */
858 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
859 "4", /* function_align. */
860 "4", /* jump_align. */
861 "4", /* loop_align. */
862 2, /* int_reassoc_width. */
863 4, /* fp_reassoc_width. */
864 1, /* vec_reassoc_width. */
865 2, /* min_div_recip_mul_sf. */
866 2, /* min_div_recip_mul_df. */
867 48, /* max_case_values. */
868 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
869 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
870 &exynosm1_prefetch_tune
871 };
872
873 static const struct tune_params thunderxt88_tunings =
874 {
875 &thunderx_extra_costs,
876 &generic_addrcost_table,
877 &thunderx_regmove_cost,
878 &thunderx_vector_cost,
879 &generic_branch_cost,
880 &generic_approx_modes,
881 SVE_NOT_IMPLEMENTED, /* sve_width */
882 6, /* memmov_cost */
883 2, /* issue_rate */
884 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
885 "8", /* function_align. */
886 "8", /* jump_align. */
887 "8", /* loop_align. */
888 2, /* int_reassoc_width. */
889 4, /* fp_reassoc_width. */
890 1, /* vec_reassoc_width. */
891 2, /* min_div_recip_mul_sf. */
892 2, /* min_div_recip_mul_df. */
893 0, /* max_case_values. */
894 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
895 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
896 &thunderxt88_prefetch_tune
897 };
898
899 static const struct tune_params thunderx_tunings =
900 {
901 &thunderx_extra_costs,
902 &generic_addrcost_table,
903 &thunderx_regmove_cost,
904 &thunderx_vector_cost,
905 &generic_branch_cost,
906 &generic_approx_modes,
907 SVE_NOT_IMPLEMENTED, /* sve_width */
908 6, /* memmov_cost */
909 2, /* issue_rate */
910 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
911 "8", /* function_align. */
912 "8", /* jump_align. */
913 "8", /* loop_align. */
914 2, /* int_reassoc_width. */
915 4, /* fp_reassoc_width. */
916 1, /* vec_reassoc_width. */
917 2, /* min_div_recip_mul_sf. */
918 2, /* min_div_recip_mul_df. */
919 0, /* max_case_values. */
920 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
921 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
922 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
923 &thunderx_prefetch_tune
924 };
925
926 static const struct tune_params tsv110_tunings =
927 {
928 &tsv110_extra_costs,
929 &tsv110_addrcost_table,
930 &tsv110_regmove_cost,
931 &tsv110_vector_cost,
932 &generic_branch_cost,
933 &generic_approx_modes,
934 SVE_NOT_IMPLEMENTED, /* sve_width */
935 4, /* memmov_cost */
936 4, /* issue_rate */
937 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
938 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
939 "16", /* function_align. */
940 "4", /* jump_align. */
941 "8", /* loop_align. */
942 2, /* int_reassoc_width. */
943 4, /* fp_reassoc_width. */
944 1, /* vec_reassoc_width. */
945 2, /* min_div_recip_mul_sf. */
946 2, /* min_div_recip_mul_df. */
947 0, /* max_case_values. */
948 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
949 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
950 &tsv110_prefetch_tune
951 };
952
953 static const struct tune_params xgene1_tunings =
954 {
955 &xgene1_extra_costs,
956 &xgene1_addrcost_table,
957 &xgene1_regmove_cost,
958 &xgene1_vector_cost,
959 &generic_branch_cost,
960 &xgene1_approx_modes,
961 SVE_NOT_IMPLEMENTED, /* sve_width */
962 6, /* memmov_cost */
963 4, /* issue_rate */
964 AARCH64_FUSE_NOTHING, /* fusible_ops */
965 "16", /* function_align. */
966 "16", /* jump_align. */
967 "16", /* loop_align. */
968 2, /* int_reassoc_width. */
969 4, /* fp_reassoc_width. */
970 1, /* vec_reassoc_width. */
971 2, /* min_div_recip_mul_sf. */
972 2, /* min_div_recip_mul_df. */
973 17, /* max_case_values. */
974 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
975 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
976 &xgene1_prefetch_tune
977 };
978
979 static const struct tune_params emag_tunings =
980 {
981 &xgene1_extra_costs,
982 &xgene1_addrcost_table,
983 &xgene1_regmove_cost,
984 &xgene1_vector_cost,
985 &generic_branch_cost,
986 &xgene1_approx_modes,
987 SVE_NOT_IMPLEMENTED,
988 6, /* memmov_cost */
989 4, /* issue_rate */
990 AARCH64_FUSE_NOTHING, /* fusible_ops */
991 "16", /* function_align. */
992 "16", /* jump_align. */
993 "16", /* loop_align. */
994 2, /* int_reassoc_width. */
995 4, /* fp_reassoc_width. */
996 1, /* vec_reassoc_width. */
997 2, /* min_div_recip_mul_sf. */
998 2, /* min_div_recip_mul_df. */
999 17, /* max_case_values. */
1000 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1001 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1002 &xgene1_prefetch_tune
1003 };
1004
1005 static const struct tune_params qdf24xx_tunings =
1006 {
1007 &qdf24xx_extra_costs,
1008 &qdf24xx_addrcost_table,
1009 &qdf24xx_regmove_cost,
1010 &qdf24xx_vector_cost,
1011 &generic_branch_cost,
1012 &generic_approx_modes,
1013 SVE_NOT_IMPLEMENTED, /* sve_width */
1014 4, /* memmov_cost */
1015 4, /* issue_rate */
1016 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1017 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1018 "16", /* function_align. */
1019 "8", /* jump_align. */
1020 "16", /* loop_align. */
1021 2, /* int_reassoc_width. */
1022 4, /* fp_reassoc_width. */
1023 1, /* vec_reassoc_width. */
1024 2, /* min_div_recip_mul_sf. */
1025 2, /* min_div_recip_mul_df. */
1026 0, /* max_case_values. */
1027 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1028 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1029 &qdf24xx_prefetch_tune
1030 };
1031
1032 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1033 for now. */
1034 static const struct tune_params saphira_tunings =
1035 {
1036 &generic_extra_costs,
1037 &generic_addrcost_table,
1038 &generic_regmove_cost,
1039 &generic_vector_cost,
1040 &generic_branch_cost,
1041 &generic_approx_modes,
1042 SVE_NOT_IMPLEMENTED, /* sve_width */
1043 4, /* memmov_cost */
1044 4, /* issue_rate */
1045 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1046 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1047 "16", /* function_align. */
1048 "8", /* jump_align. */
1049 "16", /* loop_align. */
1050 2, /* int_reassoc_width. */
1051 4, /* fp_reassoc_width. */
1052 1, /* vec_reassoc_width. */
1053 2, /* min_div_recip_mul_sf. */
1054 2, /* min_div_recip_mul_df. */
1055 0, /* max_case_values. */
1056 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1057 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1058 &generic_prefetch_tune
1059 };
1060
1061 static const struct tune_params thunderx2t99_tunings =
1062 {
1063 &thunderx2t99_extra_costs,
1064 &thunderx2t99_addrcost_table,
1065 &thunderx2t99_regmove_cost,
1066 &thunderx2t99_vector_cost,
1067 &generic_branch_cost,
1068 &generic_approx_modes,
1069 SVE_NOT_IMPLEMENTED, /* sve_width */
1070 4, /* memmov_cost. */
1071 4, /* issue_rate. */
1072 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1073 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1074 "16", /* function_align. */
1075 "8", /* jump_align. */
1076 "16", /* loop_align. */
1077 3, /* int_reassoc_width. */
1078 2, /* fp_reassoc_width. */
1079 2, /* vec_reassoc_width. */
1080 2, /* min_div_recip_mul_sf. */
1081 2, /* min_div_recip_mul_df. */
1082 0, /* max_case_values. */
1083 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1084 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1085 &thunderx2t99_prefetch_tune
1086 };
1087
1088 static const struct tune_params neoversen1_tunings =
1089 {
1090 &cortexa57_extra_costs,
1091 &generic_addrcost_table,
1092 &generic_regmove_cost,
1093 &cortexa57_vector_cost,
1094 &generic_branch_cost,
1095 &generic_approx_modes,
1096 SVE_NOT_IMPLEMENTED, /* sve_width */
1097 4, /* memmov_cost */
1098 3, /* issue_rate */
1099 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1100 "32:16", /* function_align. */
1101 "32:16", /* jump_align. */
1102 "32:16", /* loop_align. */
1103 2, /* int_reassoc_width. */
1104 4, /* fp_reassoc_width. */
1105 2, /* vec_reassoc_width. */
1106 2, /* min_div_recip_mul_sf. */
1107 2, /* min_div_recip_mul_df. */
1108 0, /* max_case_values. */
1109 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1110 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1111 &generic_prefetch_tune
1112 };
1113
1114 /* Support for fine-grained override of the tuning structures. */
1115 struct aarch64_tuning_override_function
1116 {
1117 const char* name;
1118 void (*parse_override)(const char*, struct tune_params*);
1119 };
1120
1121 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1122 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1123 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1124
1125 static const struct aarch64_tuning_override_function
1126 aarch64_tuning_override_functions[] =
1127 {
1128 { "fuse", aarch64_parse_fuse_string },
1129 { "tune", aarch64_parse_tune_string },
1130 { "sve_width", aarch64_parse_sve_width_string },
1131 { NULL, NULL }
1132 };
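/* Illustrative note (not part of the original sources): the table above is
   matched against the name=value pairs given to -moverride.  For example, a
   hypothetical command line of

     -moverride=tune=rename_fma_regs:sve_width=256

   would route "rename_fma_regs" to aarch64_parse_tune_string and "256" to
   aarch64_parse_sve_width_string.  The specific flag name is drawn from the
   .def files included earlier and is used here only as an example.  */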
1133
1134 /* A processor implementing AArch64. */
1135 struct processor
1136 {
1137 const char *const name;
1138 enum aarch64_processor ident;
1139 enum aarch64_processor sched_core;
1140 enum aarch64_arch arch;
1141 unsigned architecture_version;
1142 const unsigned long flags;
1143 const struct tune_params *const tune;
1144 };
1145
1146 /* Architectures implementing AArch64. */
1147 static const struct processor all_architectures[] =
1148 {
1149 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1150 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1151 #include "aarch64-arches.def"
1152 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1153 };
1154
1155 /* Processor cores implementing AArch64. */
1156 static const struct processor all_cores[] =
1157 {
1158 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1159 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1160 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1161 FLAGS, &COSTS##_tunings},
1162 #include "aarch64-cores.def"
1163 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1164 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1165 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1166 };
1167
1168
1169 /* Target specification. These are populated by the -march, -mtune, -mcpu
1170 handling code or by target attributes. */
1171 static const struct processor *selected_arch;
1172 static const struct processor *selected_cpu;
1173 static const struct processor *selected_tune;
1174
1175 /* The current tuning set. */
1176 struct tune_params aarch64_tune_params = generic_tunings;
1177
1178 /* Table of machine attributes. */
1179 static const struct attribute_spec aarch64_attribute_table[] =
1180 {
1181 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1182 affects_type_identity, handler, exclude } */
1183 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1184 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1185 };
1186
1187 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1188
1189 /* An ISA extension in the co-processor and main instruction set space. */
1190 struct aarch64_option_extension
1191 {
1192 const char *const name;
1193 const unsigned long flags_on;
1194 const unsigned long flags_off;
1195 };
1196
1197 typedef enum aarch64_cond_code
1198 {
1199 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1200 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1201 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1202 }
1203 aarch64_cc;
1204
1205 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1206
1207 struct aarch64_branch_protect_type
1208 {
1209 /* The type's name that the user passes to the branch-protection option
1210 string. */
1211 const char* name;
1212 /* Function to handle the protection type and set global variables.
1213 First argument is the string token corresponding with this type and the
1214 second argument is the next token in the option string.
1215 Return values:
1216 * AARCH64_PARSE_OK: Handling was successful.
1217 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1218 should print an error.
1219 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1220 own error. */
1221 enum aarch64_parse_opt_result (*handler)(char*, char*);
1222 /* A list of types that can follow this type in the option string. */
1223 const aarch64_branch_protect_type* subtypes;
1224 unsigned int num_subtypes;
1225 };
1226
1227 static enum aarch64_parse_opt_result
1228 aarch64_handle_no_branch_protection (char* str, char* rest)
1229 {
1230 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1231 aarch64_enable_bti = 0;
1232 if (rest)
1233 {
1234 error ("unexpected %<%s%> after %<%s%>", rest, str);
1235 return AARCH64_PARSE_INVALID_FEATURE;
1236 }
1237 return AARCH64_PARSE_OK;
1238 }
1239
1240 static enum aarch64_parse_opt_result
1241 aarch64_handle_standard_branch_protection (char* str, char* rest)
1242 {
1243 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1244 aarch64_enable_bti = 1;
1245 if (rest)
1246 {
1247 error ("unexpected %<%s%> after %<%s%>", rest, str);
1248 return AARCH64_PARSE_INVALID_FEATURE;
1249 }
1250 return AARCH64_PARSE_OK;
1251 }
1252
1253 static enum aarch64_parse_opt_result
1254 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1255 char* rest ATTRIBUTE_UNUSED)
1256 {
1257 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1258 return AARCH64_PARSE_OK;
1259 }
1260
1261 static enum aarch64_parse_opt_result
1262 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1263 char* rest ATTRIBUTE_UNUSED)
1264 {
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1266 return AARCH64_PARSE_OK;
1267 }
1268
1269 static enum aarch64_parse_opt_result
1270 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1271 char* rest ATTRIBUTE_UNUSED)
1272 {
1273 aarch64_enable_bti = 1;
1274 return AARCH64_PARSE_OK;
1275 }
1276
1277 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1278 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1279 { NULL, NULL, NULL, 0 }
1280 };
1281
1282 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1283 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1284 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1285 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1286 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1287 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1288 { NULL, NULL, NULL, 0 }
1289 };
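/* Illustrative note (not part of the original sources): the tables above
   drive the parsing of -mbranch-protection.  For example, a hypothetical

     -mbranch-protection=pac-ret+leaf+bti

   first invokes aarch64_handle_pac_ret_protection for "pac-ret", then the
   "leaf" subtype handler aarch64_handle_pac_ret_leaf, and finally
   aarch64_handle_bti_protection, leaving aarch64_ra_sign_scope set to
   AARCH64_FUNCTION_ALL and aarch64_enable_bti set to 1.  */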
1290
1291 /* The condition codes of the processor, and the inverse function. */
1292 static const char * const aarch64_condition_codes[] =
1293 {
1294 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1295 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1296 };
1297
1298 /* Generate code to enable conditional branches in functions over 1 MiB. */
1299 const char *
1300 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1301 const char * branch_format)
1302 {
1303 rtx_code_label * tmp_label = gen_label_rtx ();
1304 char label_buf[256];
1305 char buffer[128];
1306 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1307 CODE_LABEL_NUMBER (tmp_label));
1308 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1309 rtx dest_label = operands[pos_label];
1310 operands[pos_label] = tmp_label;
1311
1312 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1313 output_asm_insn (buffer, operands);
1314
1315 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1316 operands[pos_label] = dest_label;
1317 output_asm_insn (buffer, operands);
1318 return "";
1319 }
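/* Illustrative note (not part of the original sources): for an out-of-range
   conditional branch the routine above emits a short-range branch (the
   caller normally supplies BRANCH_FORMAT with the condition inverted) over
   an unconditional branch to the real target, roughly

     <branch_format> .Lbf42
     b       <original target>
   .Lbf42:

   where .Lbf42 stands in for the internal label generated from DEST; the
   label name is only an assumed example.  */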
1320
1321 void
1322 aarch64_err_no_fpadvsimd (machine_mode mode)
1323 {
1324 if (TARGET_GENERAL_REGS_ONLY)
1325 if (FLOAT_MODE_P (mode))
1326 error ("%qs is incompatible with the use of floating-point types",
1327 "-mgeneral-regs-only");
1328 else
1329 error ("%qs is incompatible with the use of vector types",
1330 "-mgeneral-regs-only");
1331 else
1332 if (FLOAT_MODE_P (mode))
1333 error ("%qs feature modifier is incompatible with the use of"
1334 " floating-point types", "+nofp");
1335 else
1336 error ("%qs feature modifier is incompatible with the use of"
1337 " vector types", "+nofp");
1338 }
1339
1340 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1341 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1342 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1343 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1344 and GENERAL_REGS is lower than the memory cost (in this case the best class
1345 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1346 cost results in bad allocations with many redundant int<->FP moves which
1347 are expensive on various cores.
1348 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1349 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1350 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1351 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1352 The result of this is that it is no longer inefficient to have a higher
1353 memory move cost than the register move cost.
1354 */
1355
1356 static reg_class_t
1357 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1358 reg_class_t best_class)
1359 {
1360 machine_mode mode;
1361
1362 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1363 || !reg_class_subset_p (FP_REGS, allocno_class))
1364 return allocno_class;
1365
1366 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1367 || !reg_class_subset_p (FP_REGS, best_class))
1368 return best_class;
1369
1370 mode = PSEUDO_REGNO_MODE (regno);
1371 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1372 }
1373
1374 static unsigned int
1375 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1376 {
1377 if (GET_MODE_UNIT_SIZE (mode) == 4)
1378 return aarch64_tune_params.min_div_recip_mul_sf;
1379 return aarch64_tune_params.min_div_recip_mul_df;
1380 }
1381
1382 /* Return the reassociation width of treeop OPC with mode MODE. */
1383 static int
1384 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1385 {
1386 if (VECTOR_MODE_P (mode))
1387 return aarch64_tune_params.vec_reassoc_width;
1388 if (INTEGRAL_MODE_P (mode))
1389 return aarch64_tune_params.int_reassoc_width;
1390 /* Avoid reassociating floating point addition so we emit more FMAs. */
1391 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1392 return aarch64_tune_params.fp_reassoc_width;
1393 return 1;
1394 }
1395
1396 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1397 unsigned
1398 aarch64_dbx_register_number (unsigned regno)
1399 {
1400 if (GP_REGNUM_P (regno))
1401 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1402 else if (regno == SP_REGNUM)
1403 return AARCH64_DWARF_SP;
1404 else if (FP_REGNUM_P (regno))
1405 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1406 else if (PR_REGNUM_P (regno))
1407 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1408 else if (regno == VG_REGNUM)
1409 return AARCH64_DWARF_VG;
1410
1411 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1412 equivalent DWARF register. */
1413 return DWARF_FRAME_REGISTERS;
1414 }
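/* Illustrative note (not part of the original sources): with the DWARF
   numbering assumed here, x0 maps to 0, sp to 31, v0 to 64, p0 to 48 and
   the VG pseudo register to 46, following the AArch64 DWARF register
   mapping; anything else (for example the condition flags) gets
   DWARF_FRAME_REGISTERS, meaning "no DWARF equivalent".  */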
1415
1416 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1417 static bool
1418 aarch64_advsimd_struct_mode_p (machine_mode mode)
1419 {
1420 return (TARGET_SIMD
1421 && (mode == OImode || mode == CImode || mode == XImode));
1422 }
1423
1424 /* Return true if MODE is an SVE predicate mode. */
1425 static bool
1426 aarch64_sve_pred_mode_p (machine_mode mode)
1427 {
1428 return (TARGET_SVE
1429 && (mode == VNx16BImode
1430 || mode == VNx8BImode
1431 || mode == VNx4BImode
1432 || mode == VNx2BImode));
1433 }
1434
1435 /* Three mutually-exclusive flags describing a vector or predicate type. */
1436 const unsigned int VEC_ADVSIMD = 1;
1437 const unsigned int VEC_SVE_DATA = 2;
1438 const unsigned int VEC_SVE_PRED = 4;
1439 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1440 a structure of 2, 3 or 4 vectors. */
1441 const unsigned int VEC_STRUCT = 8;
1442 /* Useful combinations of the above. */
1443 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1444 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1445
1446 /* Return a set of flags describing the vector properties of mode MODE.
1447 Ignore modes that are not supported by the current target. */
1448 static unsigned int
1449 aarch64_classify_vector_mode (machine_mode mode)
1450 {
1451 if (aarch64_advsimd_struct_mode_p (mode))
1452 return VEC_ADVSIMD | VEC_STRUCT;
1453
1454 if (aarch64_sve_pred_mode_p (mode))
1455 return VEC_SVE_PRED;
1456
1457 scalar_mode inner = GET_MODE_INNER (mode);
1458 if (VECTOR_MODE_P (mode)
1459 && (inner == QImode
1460 || inner == HImode
1461 || inner == HFmode
1462 || inner == SImode
1463 || inner == SFmode
1464 || inner == DImode
1465 || inner == DFmode))
1466 {
1467 if (TARGET_SVE)
1468 {
1469 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1470 return VEC_SVE_DATA;
1471 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1472 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1473 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1474 return VEC_SVE_DATA | VEC_STRUCT;
1475 }
1476
1477 /* This includes V1DF but not V1DI (which doesn't exist). */
1478 if (TARGET_SIMD
1479 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1480 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1481 return VEC_ADVSIMD;
1482 }
1483
1484 return 0;
1485 }
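/* Illustrative note (not part of the original sources): with both SVE and
   Advanced SIMD enabled, the classification above gives, for example,

     V4SImode    -> VEC_ADVSIMD                (128-bit Advanced SIMD)
     OImode      -> VEC_ADVSIMD | VEC_STRUCT   (pair of 128-bit vectors)
     VNx4SImode  -> VEC_SVE_DATA               (one SVE data vector)
     VNx16BImode -> VEC_SVE_PRED               (SVE predicate)

   and 0 for anything the current target does not support.  */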
1486
1487 /* Return true if MODE is any of the data vector modes, including
1488 structure modes. */
1489 static bool
1490 aarch64_vector_data_mode_p (machine_mode mode)
1491 {
1492 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1493 }
1494
1495 /* Return true if MODE is an SVE data vector mode; either a single vector
1496 or a structure of vectors. */
1497 static bool
1498 aarch64_sve_data_mode_p (machine_mode mode)
1499 {
1500 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1501 }
1502
1503 /* Implement target hook TARGET_ARRAY_MODE. */
1504 static opt_machine_mode
1505 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1506 {
1507 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1508 && IN_RANGE (nelems, 2, 4))
1509 return mode_for_vector (GET_MODE_INNER (mode),
1510 GET_MODE_NUNITS (mode) * nelems);
1511
1512 return opt_machine_mode ();
1513 }
1514
1515 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1516 static bool
1517 aarch64_array_mode_supported_p (machine_mode mode,
1518 unsigned HOST_WIDE_INT nelems)
1519 {
1520 if (TARGET_SIMD
1521 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1522 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1523 && (nelems >= 2 && nelems <= 4))
1524 return true;
1525
1526 return false;
1527 }
1528
1529 /* Return the SVE predicate mode to use for elements that have
1530 ELEM_NBYTES bytes, if such a mode exists. */
1531
1532 opt_machine_mode
1533 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1534 {
1535 if (TARGET_SVE)
1536 {
1537 if (elem_nbytes == 1)
1538 return VNx16BImode;
1539 if (elem_nbytes == 2)
1540 return VNx8BImode;
1541 if (elem_nbytes == 4)
1542 return VNx4BImode;
1543 if (elem_nbytes == 8)
1544 return VNx2BImode;
1545 }
1546 return opt_machine_mode ();
1547 }
1548
1549 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1550
1551 static opt_machine_mode
1552 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1553 {
1554 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1555 {
1556 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1557 machine_mode pred_mode;
1558 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1559 return pred_mode;
1560 }
1561
1562 return default_get_mask_mode (nunits, nbytes);
1563 }
1564
1565 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1566 prefer to use the first arithmetic operand as the else value if
1567 the else value doesn't matter, since that exactly matches the SVE
1568 destructive merging form. For ternary operations we could either
1569 pick the first operand and use FMAD-like instructions or the last
1570 operand and use FMLA-like instructions; the latter seems more
1571 natural. */
1572
1573 static tree
1574 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1575 {
1576 return nops == 3 ? ops[2] : ops[0];
1577 }
1578
1579 /* Implement TARGET_HARD_REGNO_NREGS. */
1580
1581 static unsigned int
1582 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1583 {
1584 /* ??? Logically we should only need to provide a value when
1585 HARD_REGNO_MODE_OK says that the combination is valid,
1586 but at the moment we need to handle all modes. Just ignore
1587 any runtime parts for registers that can't store them. */
1588 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1589 switch (aarch64_regno_regclass (regno))
1590 {
1591 case FP_REGS:
1592 case FP_LO_REGS:
1593 if (aarch64_sve_data_mode_p (mode))
1594 return exact_div (GET_MODE_SIZE (mode),
1595 BYTES_PER_SVE_VECTOR).to_constant ();
1596 return CEIL (lowest_size, UNITS_PER_VREG);
1597 case PR_REGS:
1598 case PR_LO_REGS:
1599 case PR_HI_REGS:
1600 return 1;
1601 default:
1602 return CEIL (lowest_size, UNITS_PER_WORD);
1603 }
1604 gcc_unreachable ();
1605 }
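/* Illustrative note (not part of the original sources): for example, a
   V4SImode value occupies one FP/SIMD register, an OImode structure two,
   a TImode value two general registers, and any SVE predicate mode exactly
   one P register.  */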
1606
1607 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1608
1609 static bool
1610 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1611 {
1612 if (GET_MODE_CLASS (mode) == MODE_CC)
1613 return regno == CC_REGNUM;
1614
1615 if (regno == VG_REGNUM)
1616 /* This must have the same size as _Unwind_Word. */
1617 return mode == DImode;
1618
1619 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1620 if (vec_flags & VEC_SVE_PRED)
1621 return PR_REGNUM_P (regno);
1622
1623 if (PR_REGNUM_P (regno))
1624 return 0;
1625
1626 if (regno == SP_REGNUM)
1627 /* The purpose of comparing with ptr_mode is to support the
1628 global register variable associated with the stack pointer
1629 register via the syntax of asm ("wsp") in ILP32. */
1630 return mode == Pmode || mode == ptr_mode;
1631
1632 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1633 return mode == Pmode;
1634
1635 if (GP_REGNUM_P (regno))
1636 {
1637 if (known_le (GET_MODE_SIZE (mode), 8))
1638 return true;
1639 else if (known_le (GET_MODE_SIZE (mode), 16))
1640 return (regno & 1) == 0;
1641 }
1642 else if (FP_REGNUM_P (regno))
1643 {
1644 if (vec_flags & VEC_STRUCT)
1645 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1646 else
1647 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1648 }
1649
1650 return false;
1651 }
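/* Illustrative note (not part of the original sources): the checks above
   mean, for instance, that CCmode lives only in CC_REGNUM, SVE predicate
   modes live only in P registers, a TImode value in general registers must
   start at an even register number, and Advanced SIMD structure modes must
   not run past v31.  */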
1652
1653 /* Return true if this is a definition of a vectorized simd function. */
1654
1655 static bool
1656 aarch64_simd_decl_p (tree fndecl)
1657 {
1658 tree fntype;
1659
1660 if (fndecl == NULL)
1661 return false;
1662 fntype = TREE_TYPE (fndecl);
1663 if (fntype == NULL)
1664 return false;
1665
1666 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1667 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1668 return true;
1669
1670 return false;
1671 }
1672
1673 /* Return the mode a register save/restore should use. DImode for integer
1674 registers, DFmode for FP registers in non-SIMD functions (they only save
1675 the bottom half of a 128 bit register), or TFmode for FP registers in
1676 SIMD functions. */
1677
1678 static machine_mode
1679 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1680 {
1681 return GP_REGNUM_P (regno)
1682 ? E_DImode
1683 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1684 }
1685
1686 /* Return true if the instruction is a call to a SIMD function, false
1687 if it is not a SIMD function or if we do not know anything about
1688 the function. */
1689
1690 static bool
1691 aarch64_simd_call_p (rtx_insn *insn)
1692 {
1693 rtx symbol;
1694 rtx call;
1695 tree fndecl;
1696
1697 gcc_assert (CALL_P (insn));
1698 call = get_call_rtx_from (insn);
1699 symbol = XEXP (XEXP (call, 0), 0);
1700 if (GET_CODE (symbol) != SYMBOL_REF)
1701 return false;
1702 fndecl = SYMBOL_REF_DECL (symbol);
1703 if (!fndecl)
1704 return false;
1705
1706 return aarch64_simd_decl_p (fndecl);
1707 }
1708
1709 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1710 a function that uses the SIMD ABI, take advantage of the extra
1711 call-preserved registers that the ABI provides. */
1712
1713 void
1714 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1715 HARD_REG_SET *return_set)
1716 {
1717 if (aarch64_simd_call_p (insn))
1718 {
1719 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1720 if (FP_SIMD_SAVED_REGNUM_P (regno))
1721 CLEAR_HARD_REG_BIT (*return_set, regno);
1722 }
1723 }
1724
1725 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1726 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1727 clobbers the top 64 bits when restoring the bottom 64 bits. */
1728
1729 static bool
1730 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1731 machine_mode mode)
1732 {
1733 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1734 return FP_REGNUM_P (regno)
1735 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1736 }
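/* Illustrative note (not part of the original sources): for example, a
   V4SFmode value (16 bytes) held in v8 is partially clobbered by a call to
   an ordinary function, since only the low 64 bits of v8-v15 are preserved,
   but not by a call to an aarch64_vector_pcs function, which preserves the
   full 128 bits of that register (the SIMD_P case above).  */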
1737
1738 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1739
1740 rtx_insn *
1741 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1742 {
1743 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1744
1745 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1746 return call_1;
1747 else
1748 return call_2;
1749 }
1750
1751 /* Implement REGMODE_NATURAL_SIZE. */
1752 poly_uint64
1753 aarch64_regmode_natural_size (machine_mode mode)
1754 {
1755 /* The natural size for SVE data modes is one SVE data vector,
1756 and similarly for predicates. We can't independently modify
1757 anything smaller than that. */
1758 /* ??? For now, only do this for variable-width SVE registers.
1759 Doing it for constant-sized registers breaks lower-subreg.c. */
1760 /* ??? And once that's fixed, we should probably have similar
1761 code for Advanced SIMD. */
1762 if (!aarch64_sve_vg.is_constant ())
1763 {
1764 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1765 if (vec_flags & VEC_SVE_PRED)
1766 return BYTES_PER_SVE_PRED;
1767 if (vec_flags & VEC_SVE_DATA)
1768 return BYTES_PER_SVE_VECTOR;
1769 }
1770 return UNITS_PER_WORD;
1771 }
1772
1773 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1774 machine_mode
1775 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1776 machine_mode mode)
1777 {
1778 /* The predicate mode determines which bits are significant and
1779 which are "don't care". Decreasing the number of lanes would
1780 lose data while increasing the number of lanes would make bits
1781 unnecessarily significant. */
1782 if (PR_REGNUM_P (regno))
1783 return mode;
1784 if (known_ge (GET_MODE_SIZE (mode), 4))
1785 return mode;
1786 else
1787 return SImode;
1788 }
1789
1790 /* Return true if I's bits are consecutive ones from the MSB. */
1791 bool
1792 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1793 {
1794 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1795 }
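
/* For example, 0xffffffff00000000 qualifies: its negation is
   0x0000000100000000, a power of two, so exact_log2 returns 32 rather
   than -1.  0xff00000000000001 does not, since its negation is not a
   power of two.  (Worked example; a 64-bit HOST_WIDE_INT is assumed.)  */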
1796
1797 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1798 that strcpy from constants will be faster. */
1799
1800 static HOST_WIDE_INT
1801 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1802 {
1803 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1804 return MAX (align, BITS_PER_WORD);
1805 return align;
1806 }
1807
1808 /* Return true if calls to DECL should be treated as
1809 long-calls (i.e. called via a register). */
1810 static bool
1811 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1812 {
1813 return false;
1814 }
1815
1816 /* Return true if calls to symbol-ref SYM should be treated as
1817 long-calls (i.e. called via a register). */
1818 bool
1819 aarch64_is_long_call_p (rtx sym)
1820 {
1821 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1822 }
1823
1824 /* Return true if calls to symbol-ref SYM should not go through
1825 plt stubs. */
1826
1827 bool
1828 aarch64_is_noplt_call_p (rtx sym)
1829 {
1830 const_tree decl = SYMBOL_REF_DECL (sym);
1831
1832 if (flag_pic
1833 && decl
1834 && (!flag_plt
1835 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1836 && !targetm.binds_local_p (decl))
1837 return true;
1838
1839 return false;
1840 }
1841
1842 /* Return true if the offsets to a zero/sign-extract operation
1843 represent an expression that matches an extend operation. The
1844 operands represent the parameters from
1845
1846 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1847 bool
1848 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1849 rtx extract_imm)
1850 {
1851 HOST_WIDE_INT mult_val, extract_val;
1852
1853 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1854 return false;
1855
1856 mult_val = INTVAL (mult_imm);
1857 extract_val = INTVAL (extract_imm);
1858
1859 if (extract_val > 8
1860 && extract_val < GET_MODE_BITSIZE (mode)
1861 && exact_log2 (extract_val & ~7) > 0
1862 && (extract_val & 7) <= 4
1863 && mult_val == (1 << (extract_val & 7)))
1864 return true;
1865
1866 return false;
1867 }
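
/* As a worked example of the test above: with MODE == DImode,
   EXTRACT_IMM == 34 and MULT_IMM == 4 we return true, because
   34 & ~7 == 32 is a power of two, 34 & 7 == 2 is at most 4, and
   4 == 1 << 2.  This roughly corresponds to a 32-bit value that is
   extended and shifted left by 2, as in an extended-register operand
   such as "sxtw #2".  The same EXTRACT_IMM with MULT_IMM == 8 fails,
   since 8 != 1 << 2.  (Illustrative only.)  */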
1868
1869 /* Emit an insn that's a simple single-set. Both the operands must be
1870 known to be valid. */
1871 inline static rtx_insn *
1872 emit_set_insn (rtx x, rtx y)
1873 {
1874 return emit_insn (gen_rtx_SET (x, y));
1875 }
1876
1877 /* X and Y are two things to compare using CODE. Emit the compare insn and
1878 return the rtx for register 0 in the proper mode. */
1879 rtx
1880 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1881 {
1882 machine_mode mode = SELECT_CC_MODE (code, x, y);
1883 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1884
1885 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1886 return cc_reg;
1887 }
1888
1889 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1890
1891 static rtx
1892 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1893 machine_mode y_mode)
1894 {
1895 if (y_mode == E_QImode || y_mode == E_HImode)
1896 {
1897 if (CONST_INT_P (y))
1898 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1899 else
1900 {
1901 rtx t, cc_reg;
1902 machine_mode cc_mode;
1903
1904 t = gen_rtx_ZERO_EXTEND (SImode, y);
1905 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1906 cc_mode = CC_SWPmode;
1907 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1908 emit_set_insn (cc_reg, t);
1909 return cc_reg;
1910 }
1911 }
1912
1913 return aarch64_gen_compare_reg (code, x, y);
1914 }
1915
1916 /* Build the SYMBOL_REF for __tls_get_addr. */
1917
1918 static GTY(()) rtx tls_get_addr_libfunc;
1919
1920 rtx
1921 aarch64_tls_get_addr (void)
1922 {
1923 if (!tls_get_addr_libfunc)
1924 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1925 return tls_get_addr_libfunc;
1926 }
1927
1928 /* Return the TLS model to use for ADDR. */
1929
1930 static enum tls_model
1931 tls_symbolic_operand_type (rtx addr)
1932 {
1933 enum tls_model tls_kind = TLS_MODEL_NONE;
1934 if (GET_CODE (addr) == CONST)
1935 {
1936 poly_int64 addend;
1937 rtx sym = strip_offset (addr, &addend);
1938 if (GET_CODE (sym) == SYMBOL_REF)
1939 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1940 }
1941 else if (GET_CODE (addr) == SYMBOL_REF)
1942 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1943
1944 return tls_kind;
1945 }
1946
1947 /* We'll allow lo_sum's in addresses in our legitimate addresses
1948 so that combine can take care of combining addresses where
1949 necessary, but for generation purposes, we'll generate the address
1950 as:
1951 RTL Absolute
1952 tmp = hi (symbol_ref); adrp x1, foo
1953 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo12:foo
1954 nop
1955
1956 PIC TLS
1957 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1958 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1959 bl __tls_get_addr
1960 nop
1961
1962 Load TLS symbol, depending on TLS mechanism and TLS access model.
1963
1964 Global Dynamic - Traditional TLS:
1965 adrp tmp, :tlsgd:imm
1966 add dest, tmp, #:tlsgd_lo12:imm
1967 bl __tls_get_addr
1968
1969 Global Dynamic - TLS Descriptors:
1970 adrp dest, :tlsdesc:imm
1971 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1972 add dest, dest, #:tlsdesc_lo12:imm
1973 blr tmp
1974 mrs tp, tpidr_el0
1975 add dest, dest, tp
1976
1977 Initial Exec:
1978 mrs tp, tpidr_el0
1979 adrp tmp, :gottprel:imm
1980 ldr dest, [tmp, #:gottprel_lo12:imm]
1981 add dest, dest, tp
1982
1983 Local Exec:
1984 mrs tp, tpidr_el0
1985 add t0, tp, #:tprel_hi12:imm, lsl #12
1986 add t0, t0, #:tprel_lo12_nc:imm
1987 */
1988
1989 static void
1990 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1991 enum aarch64_symbol_type type)
1992 {
1993 switch (type)
1994 {
1995 case SYMBOL_SMALL_ABSOLUTE:
1996 {
1997 /* In ILP32, the mode of dest can be either SImode or DImode. */
1998 rtx tmp_reg = dest;
1999 machine_mode mode = GET_MODE (dest);
2000
2001 gcc_assert (mode == Pmode || mode == ptr_mode);
2002
2003 if (can_create_pseudo_p ())
2004 tmp_reg = gen_reg_rtx (mode);
2005
2006 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2007 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2008 return;
2009 }
2010
2011 case SYMBOL_TINY_ABSOLUTE:
2012 emit_insn (gen_rtx_SET (dest, imm));
2013 return;
2014
2015 case SYMBOL_SMALL_GOT_28K:
2016 {
2017 machine_mode mode = GET_MODE (dest);
2018 rtx gp_rtx = pic_offset_table_rtx;
2019 rtx insn;
2020 rtx mem;
2021
2022 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2023 here before rtl expansion. Tree IVOPT will generate an rtl pattern
2024 to decide rtx costs, in which case pic_offset_table_rtx is not
2025 initialized. In that case there is no need to generate the first
2026 adrp instruction, as the final cost for global variable access is
2027 one instruction. */
2028 if (gp_rtx != NULL)
2029 {
2030 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2031 use the page base as the GOT base, the first page may be wasted;
2032 in the worst case there is only 28K of space for the GOT).
2033
2034 The generated instruction sequence for accessing a global variable
2035 is:
2036
2037 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2038
2039 Only one instruction is needed, but we must initialize
2040 pic_offset_table_rtx properly. We generate the initializing insn
2041 for every global access, and let CSE remove all the redundant ones.
2042
2043 The final instruction sequence will look like the following
2044 for multiple global variable accesses:
2045
2046 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2047
2048 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2049 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2050 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2051 ... */
2052
2053 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2054 crtl->uses_pic_offset_table = 1;
2055 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2056
2057 if (mode != GET_MODE (gp_rtx))
2058 gp_rtx = gen_lowpart (mode, gp_rtx);
2059
2060 }
2061
2062 if (mode == ptr_mode)
2063 {
2064 if (mode == DImode)
2065 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2066 else
2067 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2068
2069 mem = XVECEXP (SET_SRC (insn), 0, 0);
2070 }
2071 else
2072 {
2073 gcc_assert (mode == Pmode);
2074
2075 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2076 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2077 }
2078
2079 /* The operand is expected to be a MEM. Whenever the related insn
2080 pattern changes, the code above which calculates MEM should be
2081 updated. */
2082 gcc_assert (GET_CODE (mem) == MEM);
2083 MEM_READONLY_P (mem) = 1;
2084 MEM_NOTRAP_P (mem) = 1;
2085 emit_insn (insn);
2086 return;
2087 }
2088
2089 case SYMBOL_SMALL_GOT_4G:
2090 {
2091 /* In ILP32, the mode of dest can be either SImode or DImode,
2092 while the got entry is always of SImode size. The mode of
2093 dest depends on how dest is used: if dest is assigned to a
2094 pointer (e.g. in the memory), it has SImode; it may have
2095 DImode if dest is dereferenced to access the memory.
2096 This is why we have to handle three different ldr_got_small
2097 patterns here (two patterns for ILP32). */
2098
2099 rtx insn;
2100 rtx mem;
2101 rtx tmp_reg = dest;
2102 machine_mode mode = GET_MODE (dest);
2103
2104 if (can_create_pseudo_p ())
2105 tmp_reg = gen_reg_rtx (mode);
2106
2107 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2108 if (mode == ptr_mode)
2109 {
2110 if (mode == DImode)
2111 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2112 else
2113 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2114
2115 mem = XVECEXP (SET_SRC (insn), 0, 0);
2116 }
2117 else
2118 {
2119 gcc_assert (mode == Pmode);
2120
2121 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2122 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2123 }
2124
2125 gcc_assert (GET_CODE (mem) == MEM);
2126 MEM_READONLY_P (mem) = 1;
2127 MEM_NOTRAP_P (mem) = 1;
2128 emit_insn (insn);
2129 return;
2130 }
2131
2132 case SYMBOL_SMALL_TLSGD:
2133 {
2134 rtx_insn *insns;
2135 machine_mode mode = GET_MODE (dest);
2136 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2137
2138 start_sequence ();
2139 if (TARGET_ILP32)
2140 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2141 else
2142 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2143 insns = get_insns ();
2144 end_sequence ();
2145
2146 RTL_CONST_CALL_P (insns) = 1;
2147 emit_libcall_block (insns, dest, result, imm);
2148 return;
2149 }
2150
2151 case SYMBOL_SMALL_TLSDESC:
2152 {
2153 machine_mode mode = GET_MODE (dest);
2154 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2155 rtx tp;
2156
2157 gcc_assert (mode == Pmode || mode == ptr_mode);
2158
2159 /* In ILP32, the got entry is always of SImode size. Unlike
2160 small GOT, the dest is fixed at reg 0. */
2161 if (TARGET_ILP32)
2162 emit_insn (gen_tlsdesc_small_si (imm));
2163 else
2164 emit_insn (gen_tlsdesc_small_di (imm));
2165 tp = aarch64_load_tp (NULL);
2166
2167 if (mode != Pmode)
2168 tp = gen_lowpart (mode, tp);
2169
2170 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2171 if (REG_P (dest))
2172 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2173 return;
2174 }
2175
2176 case SYMBOL_SMALL_TLSIE:
2177 {
2178 /* In ILP32, the mode of dest can be either SImode or DImode,
2179 while the got entry is always of SImode size. The mode of
2180 dest depends on how dest is used: if dest is assigned to a
2181 pointer (e.g. in the memory), it has SImode; it may have
2182 DImode if dest is dereferenced to access the memory.
2183 This is why we have to handle three different tlsie_small
2184 patterns here (two patterns for ILP32). */
2185 machine_mode mode = GET_MODE (dest);
2186 rtx tmp_reg = gen_reg_rtx (mode);
2187 rtx tp = aarch64_load_tp (NULL);
2188
2189 if (mode == ptr_mode)
2190 {
2191 if (mode == DImode)
2192 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2193 else
2194 {
2195 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2196 tp = gen_lowpart (mode, tp);
2197 }
2198 }
2199 else
2200 {
2201 gcc_assert (mode == Pmode);
2202 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2203 }
2204
2205 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2206 if (REG_P (dest))
2207 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2208 return;
2209 }
2210
2211 case SYMBOL_TLSLE12:
2212 case SYMBOL_TLSLE24:
2213 case SYMBOL_TLSLE32:
2214 case SYMBOL_TLSLE48:
2215 {
2216 machine_mode mode = GET_MODE (dest);
2217 rtx tp = aarch64_load_tp (NULL);
2218
2219 if (mode != Pmode)
2220 tp = gen_lowpart (mode, tp);
2221
2222 switch (type)
2223 {
2224 case SYMBOL_TLSLE12:
2225 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2226 (dest, tp, imm));
2227 break;
2228 case SYMBOL_TLSLE24:
2229 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2230 (dest, tp, imm));
2231 break;
2232 case SYMBOL_TLSLE32:
2233 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2234 (dest, imm));
2235 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2236 (dest, dest, tp));
2237 break;
2238 case SYMBOL_TLSLE48:
2239 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2240 (dest, imm));
2241 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2242 (dest, dest, tp));
2243 break;
2244 default:
2245 gcc_unreachable ();
2246 }
2247
2248 if (REG_P (dest))
2249 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2250 return;
2251 }
2252
2253 case SYMBOL_TINY_GOT:
2254 emit_insn (gen_ldr_got_tiny (dest, imm));
2255 return;
2256
2257 case SYMBOL_TINY_TLSIE:
2258 {
2259 machine_mode mode = GET_MODE (dest);
2260 rtx tp = aarch64_load_tp (NULL);
2261
2262 if (mode == ptr_mode)
2263 {
2264 if (mode == DImode)
2265 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2266 else
2267 {
2268 tp = gen_lowpart (mode, tp);
2269 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2270 }
2271 }
2272 else
2273 {
2274 gcc_assert (mode == Pmode);
2275 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2276 }
2277
2278 if (REG_P (dest))
2279 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2280 return;
2281 }
2282
2283 default:
2284 gcc_unreachable ();
2285 }
2286 }
2287
2288 /* Emit a move from SRC to DEST. Assume that the move expanders can
2289 handle all moves if !can_create_pseudo_p (). The distinction is
2290 important because, unlike emit_move_insn, the move expanders know
2291 how to force Pmode objects into the constant pool even when the
2292 constant pool address is not itself legitimate. */
2293 static rtx
2294 aarch64_emit_move (rtx dest, rtx src)
2295 {
2296 return (can_create_pseudo_p ()
2297 ? emit_move_insn (dest, src)
2298 : emit_move_insn_1 (dest, src));
2299 }
2300
2301 /* Apply UNOPTAB to OP and store the result in DEST. */
2302
2303 static void
2304 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2305 {
2306 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2307 if (dest != tmp)
2308 emit_move_insn (dest, tmp);
2309 }
2310
2311 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2312
2313 static void
2314 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2315 {
2316 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2317 OPTAB_DIRECT);
2318 if (dest != tmp)
2319 emit_move_insn (dest, tmp);
2320 }
2321
2322 /* Split a 128-bit move operation into two 64-bit move operations,
2323 taking care to handle partial overlap of register to register
2324 copies. Special cases are needed when moving between GP regs and
2325 FP regs. SRC can be a register, constant or memory; DST a register
2326 or memory. If either operand is memory it must not have any side
2327 effects. */
2328 void
2329 aarch64_split_128bit_move (rtx dst, rtx src)
2330 {
2331 rtx dst_lo, dst_hi;
2332 rtx src_lo, src_hi;
2333
2334 machine_mode mode = GET_MODE (dst);
2335
2336 gcc_assert (mode == TImode || mode == TFmode);
2337 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2338 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2339
2340 if (REG_P (dst) && REG_P (src))
2341 {
2342 int src_regno = REGNO (src);
2343 int dst_regno = REGNO (dst);
2344
2345 /* Handle FP <-> GP regs. */
2346 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2347 {
2348 src_lo = gen_lowpart (word_mode, src);
2349 src_hi = gen_highpart (word_mode, src);
2350
2351 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2352 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2353 return;
2354 }
2355 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2356 {
2357 dst_lo = gen_lowpart (word_mode, dst);
2358 dst_hi = gen_highpart (word_mode, dst);
2359
2360 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2361 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2362 return;
2363 }
2364 }
2365
2366 dst_lo = gen_lowpart (word_mode, dst);
2367 dst_hi = gen_highpart (word_mode, dst);
2368 src_lo = gen_lowpart (word_mode, src);
2369 src_hi = gen_highpart_mode (word_mode, mode, src);
2370
2371 /* At most one pairing may overlap. */
2372 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2373 {
2374 aarch64_emit_move (dst_hi, src_hi);
2375 aarch64_emit_move (dst_lo, src_lo);
2376 }
2377 else
2378 {
2379 aarch64_emit_move (dst_lo, src_lo);
2380 aarch64_emit_move (dst_hi, src_hi);
2381 }
2382 }
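
/* A short example of the overlap handling above, assuming the usual
   little-endian layout in which the low half of a TImode pair lives
   in the lower-numbered register:

     (x1,x2) <- (x2,x3): dst_lo x1 does not overlap src_hi x3, so the
       low half is copied first (x1 = x2, then x2 = x3);
     (x2,x3) <- (x1,x2): dst_lo x2 overlaps src_hi x2, so the high
       half is copied first (x3 = x2, then x2 = x1).

   Copying in the other order in either case would clobber a source
   half before it had been read.  (Illustrative only.)  */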
2383
2384 bool
2385 aarch64_split_128bit_move_p (rtx dst, rtx src)
2386 {
2387 return (! REG_P (src)
2388 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2389 }
2390
2391 /* Split a complex SIMD combine. */
2392
2393 void
2394 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2395 {
2396 machine_mode src_mode = GET_MODE (src1);
2397 machine_mode dst_mode = GET_MODE (dst);
2398
2399 gcc_assert (VECTOR_MODE_P (dst_mode));
2400 gcc_assert (register_operand (dst, dst_mode)
2401 && register_operand (src1, src_mode)
2402 && register_operand (src2, src_mode));
2403
2404 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2405 return;
2406 }
2407
2408 /* Split a complex SIMD move. */
2409
2410 void
2411 aarch64_split_simd_move (rtx dst, rtx src)
2412 {
2413 machine_mode src_mode = GET_MODE (src);
2414 machine_mode dst_mode = GET_MODE (dst);
2415
2416 gcc_assert (VECTOR_MODE_P (dst_mode));
2417
2418 if (REG_P (dst) && REG_P (src))
2419 {
2420 gcc_assert (VECTOR_MODE_P (src_mode));
2421 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2422 }
2423 }
2424
2425 bool
2426 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2427 machine_mode ymode, rtx y)
2428 {
2429 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2430 gcc_assert (r != NULL);
2431 return rtx_equal_p (x, r);
2432 }
2433
2434
2435 static rtx
2436 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2437 {
2438 if (can_create_pseudo_p ())
2439 return force_reg (mode, value);
2440 else
2441 {
2442 gcc_assert (x);
2443 aarch64_emit_move (x, value);
2444 return x;
2445 }
2446 }
2447
2448 /* Return true if we can move VALUE into a register using a single
2449 CNT[BHWD] instruction. */
2450
2451 static bool
2452 aarch64_sve_cnt_immediate_p (poly_int64 value)
2453 {
2454 HOST_WIDE_INT factor = value.coeffs[0];
2455 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2456 return (value.coeffs[1] == factor
2457 && IN_RANGE (factor, 2, 16 * 16)
2458 && (factor & 1) == 0
2459 && factor <= 16 * (factor & -factor));
2460 }
2461
2462 /* Likewise for rtx X. */
2463
2464 bool
2465 aarch64_sve_cnt_immediate_p (rtx x)
2466 {
2467 poly_int64 value;
2468 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2469 }
2470
2471 /* Return the asm string for an instruction with a CNT-like vector size
2472 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2473 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2474 first part of the operands template (the part that comes before the
2475 vector size itself). FACTOR is the number of quadwords.
2476 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2477 If it is zero, we can use any element size. */
2478
2479 static char *
2480 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2481 unsigned int factor,
2482 unsigned int nelts_per_vq)
2483 {
2484 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2485
2486 if (nelts_per_vq == 0)
2487 /* There is some overlap in the ranges of the four CNT instructions.
2488 Here we always use the smallest possible element size, so that the
2489 multiplier is 1 wherever possible. */
2490 nelts_per_vq = factor & -factor;
2491 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2492 gcc_assert (IN_RANGE (shift, 1, 4));
2493 char suffix = "dwhb"[shift - 1];
2494
2495 factor >>= shift;
2496 unsigned int written;
2497 if (factor == 1)
2498 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2499 prefix, suffix, operands);
2500 else
2501 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2502 prefix, suffix, operands, factor);
2503 gcc_assert (written < sizeof (buffer));
2504 return buffer;
2505 }
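
/* For instance, the VL-dependent value 10 * VQ (poly_int64 (10, 10))
   passes aarch64_sve_cnt_immediate_p: the factor 10 is even and no
   more than 16 times its lowest set bit.  With no fixed element size
   the routine above picks the D form, since 10 & -10 == 2, and prints
   something like "cntd x0, all, mul #5".  (Worked example; the exact
   operand text depends on the OPERANDS template.)  */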
2506
2507 /* Return the asm string for an instruction with a CNT-like vector size
2508 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2509 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2510 first part of the operands template (the part that comes before the
2511 vector size itself). X is the value of the vector size operand,
2512 as a polynomial integer rtx. */
2513
2514 char *
2515 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2516 rtx x)
2517 {
2518 poly_int64 value = rtx_to_poly_int64 (x);
2519 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2520 return aarch64_output_sve_cnt_immediate (prefix, operands,
2521 value.coeffs[1], 0);
2522 }
2523
2524 /* Return true if we can add VALUE to a register using a single ADDVL
2525 or ADDPL instruction. */
2526
2527 static bool
2528 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2529 {
2530 HOST_WIDE_INT factor = value.coeffs[0];
2531 if (factor == 0 || value.coeffs[1] != factor)
2532 return false;
2533 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2534 and a value of 16 is one vector width. */
2535 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2536 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2537 }
2538
2539 /* Likewise for rtx X. */
2540
2541 bool
2542 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2543 {
2544 poly_int64 value;
2545 return (poly_int_rtx_p (x, &value)
2546 && aarch64_sve_addvl_addpl_immediate_p (value));
2547 }
2548
2549 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2550 and storing the result in operand 0. */
2551
2552 char *
2553 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2554 {
2555 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2556 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2557 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2558
2559 /* Use INC or DEC if possible. */
2560 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2561 {
2562 if (aarch64_sve_cnt_immediate_p (offset_value))
2563 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2564 offset_value.coeffs[1], 0);
2565 if (aarch64_sve_cnt_immediate_p (-offset_value))
2566 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2567 -offset_value.coeffs[1], 0);
2568 }
2569
2570 int factor = offset_value.coeffs[1];
2571 if ((factor & 15) == 0)
2572 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2573 else
2574 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2575 return buffer;
2576 }
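
/* Two worked examples of the factor handling above: an offset of
   32 * VQ bytes (factor 32) is a whole number of vector lengths and
   prints as roughly "addvl x0, x1, #2", while an offset of 6 * VQ
   bytes (factor 6) is expressed in predicate widths and prints as
   roughly "addpl x0, x1, #3".  (Illustrative; the register names come
   from the %x0/%x1 template above.)  */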
2577
2578 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2579 instruction. If it is, store the number of elements in each vector
2580 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2581 factor in *FACTOR_OUT (if nonnull). */
2582
2583 bool
2584 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2585 unsigned int *nelts_per_vq_out)
2586 {
2587 rtx elt;
2588 poly_int64 value;
2589
2590 if (!const_vec_duplicate_p (x, &elt)
2591 || !poly_int_rtx_p (elt, &value))
2592 return false;
2593
2594 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2595 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2596 /* There's no vector INCB. */
2597 return false;
2598
2599 HOST_WIDE_INT factor = value.coeffs[0];
2600 if (value.coeffs[1] != factor)
2601 return false;
2602
2603 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2604 if ((factor % nelts_per_vq) != 0
2605 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2606 return false;
2607
2608 if (factor_out)
2609 *factor_out = factor;
2610 if (nelts_per_vq_out)
2611 *nelts_per_vq_out = nelts_per_vq;
2612 return true;
2613 }
2614
2615 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2616 instruction. */
2617
2618 bool
2619 aarch64_sve_inc_dec_immediate_p (rtx x)
2620 {
2621 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2622 }
2623
2624 /* Return the asm template for an SVE vector INC or DEC instruction.
2625 OPERANDS gives the operands before the vector count and X is the
2626 value of the vector count operand itself. */
2627
2628 char *
2629 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2630 {
2631 int factor;
2632 unsigned int nelts_per_vq;
2633 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2634 gcc_unreachable ();
2635 if (factor < 0)
2636 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2637 nelts_per_vq);
2638 else
2639 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2640 nelts_per_vq);
2641 }
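
/* For example, a VNx8HI duplicate of the value 8 * VQ is accepted by
   aarch64_sve_inc_dec_immediate_p: NELTS_PER_VQ is 8 and the factor 8
   lies in [8, 128], so the routine above emits the H form with
   multiplier 1, i.e. an "inch" template.  A factor of -16 instead
   yields "dech ..., all, mul #2".  (Worked example; the operand text
   depends on the OPERANDS argument.)  */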
2642
2643 static int
2644 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2645 scalar_int_mode mode)
2646 {
2647 int i;
2648 unsigned HOST_WIDE_INT val, val2, mask;
2649 int one_match, zero_match;
2650 int num_insns;
2651
2652 val = INTVAL (imm);
2653
2654 if (aarch64_move_imm (val, mode))
2655 {
2656 if (generate)
2657 emit_insn (gen_rtx_SET (dest, imm));
2658 return 1;
2659 }
2660
2661 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2662 (with XXXX non-zero). In that case check to see if the move can be done in
2663 a smaller mode. */
2664 val2 = val & 0xffffffff;
2665 if (mode == DImode
2666 && aarch64_move_imm (val2, SImode)
2667 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2668 {
2669 if (generate)
2670 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2671
2672 /* Check if we have to emit a second instruction by checking to see
2673 if any of the upper 32 bits of the original DI mode value is set. */
2674 if (val == val2)
2675 return 1;
2676
2677 i = (val >> 48) ? 48 : 32;
2678
2679 if (generate)
2680 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2681 GEN_INT ((val >> i) & 0xffff)));
2682
2683 return 2;
2684 }
2685
2686 if ((val >> 32) == 0 || mode == SImode)
2687 {
2688 if (generate)
2689 {
2690 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2691 if (mode == SImode)
2692 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2693 GEN_INT ((val >> 16) & 0xffff)));
2694 else
2695 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2696 GEN_INT ((val >> 16) & 0xffff)));
2697 }
2698 return 2;
2699 }
2700
2701 /* Remaining cases are all for DImode. */
2702
2703 mask = 0xffff;
2704 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2705 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2706 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2707 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2708
2709 if (zero_match != 2 && one_match != 2)
2710 {
2711 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2712 For a 64-bit bitmask try whether changing 16 bits to all ones or
2713 zeroes creates a valid bitmask. To check any repeated bitmask,
2714 try using 16 bits from the other 32-bit half of val. */
2715
2716 for (i = 0; i < 64; i += 16, mask <<= 16)
2717 {
2718 val2 = val & ~mask;
2719 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2720 break;
2721 val2 = val | mask;
2722 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2723 break;
2724 val2 = val2 & ~mask;
2725 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2726 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2727 break;
2728 }
2729 if (i != 64)
2730 {
2731 if (generate)
2732 {
2733 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2734 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2735 GEN_INT ((val >> i) & 0xffff)));
2736 }
2737 return 2;
2738 }
2739 }
2740
2741 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2742 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2743 otherwise skip zero bits. */
2744
2745 num_insns = 1;
2746 mask = 0xffff;
2747 val2 = one_match > zero_match ? ~val : val;
2748 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2749
2750 if (generate)
2751 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2752 ? (val | ~(mask << i))
2753 : (val & (mask << i)))));
2754 for (i += 16; i < 64; i += 16)
2755 {
2756 if ((val2 & (mask << i)) == 0)
2757 continue;
2758 if (generate)
2759 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2760 GEN_INT ((val >> i) & 0xffff)));
2761 num_insns ++;
2762 }
2763
2764 return num_insns;
2765 }
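
/* As a small worked example: for the DImode constant
   0x1234000000005678, the low 32 bits are a valid single MOV and the
   only other nonzero 16-bit chunk sits in bits 48-63, so the code
   above emits roughly

     mov  x0, 0x5678
     movk x0, 0x1234, lsl 48

   and returns 2.  (Illustrative; the exact assembly text comes from
   the mov/insv insn patterns.)  */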
2766
2767 /* Return whether imm is a 128-bit immediate which is simple enough to
2768 expand inline. */
2769 bool
2770 aarch64_mov128_immediate (rtx imm)
2771 {
2772 if (GET_CODE (imm) == CONST_INT)
2773 return true;
2774
2775 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2776
2777 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2778 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2779
2780 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2781 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2782 }
2783
2784
2785 /* Return the number of temporary registers that aarch64_add_offset_1
2786 would need to add OFFSET to a register. */
2787
2788 static unsigned int
2789 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2790 {
2791 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2792 }
2793
2794 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2795 a non-polynomial OFFSET. MODE is the mode of the addition.
2796 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2797 be set and CFA adjustments added to the generated instructions.
2798
2799 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2800 temporary if register allocation is already complete. This temporary
2801 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2802 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2803 the immediate again.
2804
2805 Since this function may be used to adjust the stack pointer, we must
2806 ensure that it cannot cause transient stack deallocation (for example
2807 by first incrementing SP and then decrementing when adjusting by a
2808 large immediate). */
2809
2810 static void
2811 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2812 rtx src, HOST_WIDE_INT offset, rtx temp1,
2813 bool frame_related_p, bool emit_move_imm)
2814 {
2815 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2816 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2817
2818 HOST_WIDE_INT moffset = abs_hwi (offset);
2819 rtx_insn *insn;
2820
2821 if (!moffset)
2822 {
2823 if (!rtx_equal_p (dest, src))
2824 {
2825 insn = emit_insn (gen_rtx_SET (dest, src));
2826 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2827 }
2828 return;
2829 }
2830
2831 /* Single instruction adjustment. */
2832 if (aarch64_uimm12_shift (moffset))
2833 {
2834 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2835 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2836 return;
2837 }
2838
2839 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2840 and either:
2841
2842 a) the offset cannot be loaded by a 16-bit move or
2843 b) there is no spare register into which we can move it. */
2844 if (moffset < 0x1000000
2845 && ((!temp1 && !can_create_pseudo_p ())
2846 || !aarch64_move_imm (moffset, mode)))
2847 {
2848 HOST_WIDE_INT low_off = moffset & 0xfff;
2849
2850 low_off = offset < 0 ? -low_off : low_off;
2851 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2852 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2853 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2854 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2855 return;
2856 }
2857
2858 /* Emit a move immediate if required and an addition/subtraction. */
2859 if (emit_move_imm)
2860 {
2861 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2862 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2863 }
2864 insn = emit_insn (offset < 0
2865 ? gen_sub3_insn (dest, src, temp1)
2866 : gen_add3_insn (dest, src, temp1));
2867 if (frame_related_p)
2868 {
2869 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2870 rtx adj = plus_constant (mode, src, offset);
2871 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2872 }
2873 }
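
/* For instance, adding the offset 0x12345 to SP after register
   allocation with no spare temporary: the value does not fit a single
   ADD immediate but is below 1 << 24, so the two-addition path above
   produces roughly

     add sp, sp, 0x345
     add sp, sp, 0x12000

   adjusting SP monotonically towards its final value.  (Illustrative
   only.)  */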
2874
2875 /* Return the number of temporary registers that aarch64_add_offset
2876 would need to move OFFSET into a register or add OFFSET to a register;
2877 ADD_P is true if we want the latter rather than the former. */
2878
2879 static unsigned int
2880 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2881 {
2882 /* This follows the same structure as aarch64_add_offset. */
2883 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2884 return 0;
2885
2886 unsigned int count = 0;
2887 HOST_WIDE_INT factor = offset.coeffs[1];
2888 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2889 poly_int64 poly_offset (factor, factor);
2890 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2891 /* Need one register for the ADDVL/ADDPL result. */
2892 count += 1;
2893 else if (factor != 0)
2894 {
2895 factor = abs (factor);
2896 if (factor > 16 * (factor & -factor))
2897 /* Need one register for the CNT result and one for the multiplication
2898 factor. If necessary, the second temporary can be reused for the
2899 constant part of the offset. */
2900 return 2;
2901 /* Need one register for the CNT result (which might then
2902 be shifted). */
2903 count += 1;
2904 }
2905 return count + aarch64_add_offset_1_temporaries (constant);
2906 }
2907
2908 /* If X can be represented as a poly_int64, return the number
2909 of temporaries that are required to add it to a register.
2910 Return -1 otherwise. */
2911
2912 int
2913 aarch64_add_offset_temporaries (rtx x)
2914 {
2915 poly_int64 offset;
2916 if (!poly_int_rtx_p (x, &offset))
2917 return -1;
2918 return aarch64_offset_temporaries (true, offset);
2919 }
2920
2921 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2922 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2923 be set and CFA adjustments added to the generated instructions.
2924
2925 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2926 temporary if register allocation is already complete. This temporary
2927 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2928 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2929 false to avoid emitting the immediate again.
2930
2931 TEMP2, if nonnull, is a second temporary register that doesn't
2932 overlap either DEST or SRC.
2933
2934 Since this function may be used to adjust the stack pointer, we must
2935 ensure that it cannot cause transient stack deallocation (for example
2936 by first incrementing SP and then decrementing when adjusting by a
2937 large immediate). */
2938
2939 static void
2940 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2941 poly_int64 offset, rtx temp1, rtx temp2,
2942 bool frame_related_p, bool emit_move_imm = true)
2943 {
2944 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2945 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2946 gcc_assert (temp1 == NULL_RTX
2947 || !frame_related_p
2948 || !reg_overlap_mentioned_p (temp1, dest));
2949 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2950
2951 /* Try using ADDVL or ADDPL to add the whole value. */
2952 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2953 {
2954 rtx offset_rtx = gen_int_mode (offset, mode);
2955 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2956 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2957 return;
2958 }
2959
2960 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2961 SVE vector register, over and above the minimum size of 128 bits.
2962 This is equivalent to half the value returned by CNTD with a
2963 vector shape of ALL. */
2964 HOST_WIDE_INT factor = offset.coeffs[1];
2965 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2966
2967 /* Try using ADDVL or ADDPL to add the VG-based part. */
2968 poly_int64 poly_offset (factor, factor);
2969 if (src != const0_rtx
2970 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2971 {
2972 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2973 if (frame_related_p)
2974 {
2975 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2976 RTX_FRAME_RELATED_P (insn) = true;
2977 src = dest;
2978 }
2979 else
2980 {
2981 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2982 src = aarch64_force_temporary (mode, temp1, addr);
2983 temp1 = temp2;
2984 temp2 = NULL_RTX;
2985 }
2986 }
2987 /* Otherwise use a CNT-based sequence. */
2988 else if (factor != 0)
2989 {
2990 /* Use a subtraction if we have a negative factor. */
2991 rtx_code code = PLUS;
2992 if (factor < 0)
2993 {
2994 factor = -factor;
2995 code = MINUS;
2996 }
2997
2998 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2999 into the multiplication. */
3000 rtx val;
3001 int shift = 0;
3002 if (factor & 1)
3003 /* Use a right shift by 1. */
3004 shift = -1;
3005 else
3006 factor /= 2;
3007 HOST_WIDE_INT low_bit = factor & -factor;
3008 if (factor <= 16 * low_bit)
3009 {
3010 if (factor > 16 * 8)
3011 {
3012 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3013 the value with the minimum multiplier and shift it into
3014 position. */
3015 int extra_shift = exact_log2 (low_bit);
3016 shift += extra_shift;
3017 factor >>= extra_shift;
3018 }
3019 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3020 }
3021 else
3022 {
3023 /* Use CNTD, then multiply it by FACTOR. */
3024 val = gen_int_mode (poly_int64 (2, 2), mode);
3025 val = aarch64_force_temporary (mode, temp1, val);
3026
3027 /* Go back to using a negative multiplication factor if we have
3028 no register from which to subtract. */
3029 if (code == MINUS && src == const0_rtx)
3030 {
3031 factor = -factor;
3032 code = PLUS;
3033 }
3034 rtx coeff1 = gen_int_mode (factor, mode);
3035 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3036 val = gen_rtx_MULT (mode, val, coeff1);
3037 }
3038
3039 if (shift > 0)
3040 {
3041 /* Multiply by 1 << SHIFT. */
3042 val = aarch64_force_temporary (mode, temp1, val);
3043 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3044 }
3045 else if (shift == -1)
3046 {
3047 /* Divide by 2. */
3048 val = aarch64_force_temporary (mode, temp1, val);
3049 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3050 }
3051
3052 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3053 if (src != const0_rtx)
3054 {
3055 val = aarch64_force_temporary (mode, temp1, val);
3056 val = gen_rtx_fmt_ee (code, mode, src, val);
3057 }
3058 else if (code == MINUS)
3059 {
3060 val = aarch64_force_temporary (mode, temp1, val);
3061 val = gen_rtx_NEG (mode, val);
3062 }
3063
3064 if (constant == 0 || frame_related_p)
3065 {
3066 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3067 if (frame_related_p)
3068 {
3069 RTX_FRAME_RELATED_P (insn) = true;
3070 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3071 gen_rtx_SET (dest, plus_constant (Pmode, src,
3072 poly_offset)));
3073 }
3074 src = dest;
3075 if (constant == 0)
3076 return;
3077 }
3078 else
3079 {
3080 src = aarch64_force_temporary (mode, temp1, val);
3081 temp1 = temp2;
3082 temp2 = NULL_RTX;
3083 }
3084
3085 emit_move_imm = true;
3086 }
3087
3088 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3089 frame_related_p, emit_move_imm);
3090 }
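
/* A brief SVE example of the decomposition above: an offset of
   16 * VQ + 8 bytes splits into factor 16 and constant 8; the
   VG-based part is a single ADDVL and the remainder is a plain ADD,
   giving roughly

     addvl x0, x1, #1
     add   x0, x0, 8

   whereas a factor that is not an ADDVL/ADDPL immediate falls back to
   the CNT-based multiply sequence.  (Illustrative only.)  */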
3091
3092 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3093 than a poly_int64. */
3094
3095 void
3096 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3097 rtx offset_rtx, rtx temp1, rtx temp2)
3098 {
3099 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3100 temp1, temp2, false);
3101 }
3102
3103 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3104 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3105 if TEMP1 already contains abs (DELTA). */
3106
3107 static inline void
3108 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3109 {
3110 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3111 temp1, temp2, true, emit_move_imm);
3112 }
3113
3114 /* Subtract DELTA from the stack pointer, marking the instructions
3115 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3116 if nonnull. */
3117
3118 static inline void
3119 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3120 bool emit_move_imm = true)
3121 {
3122 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3123 temp1, temp2, frame_related_p, emit_move_imm);
3124 }
3125
3126 /* Set DEST to (vec_series BASE STEP). */
3127
3128 static void
3129 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3130 {
3131 machine_mode mode = GET_MODE (dest);
3132 scalar_mode inner = GET_MODE_INNER (mode);
3133
3134 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3135 if (!aarch64_sve_index_immediate_p (base))
3136 base = force_reg (inner, base);
3137 if (!aarch64_sve_index_immediate_p (step))
3138 step = force_reg (inner, step);
3139
3140 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3141 }
3142
3143 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3144 integer of mode SRC_MODE. Return true on success. */
3145
3146 static bool
3147 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3148 rtx src)
3149 {
3150 /* If the constant is smaller than 128 bits, we can do the move
3151 using a vector of SRC_MODEs. */
3152 if (src_mode != TImode)
3153 {
3154 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3155 GET_MODE_SIZE (src_mode));
3156 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3157 emit_move_insn (gen_lowpart (dup_mode, dest),
3158 gen_const_vec_duplicate (dup_mode, src));
3159 return true;
3160 }
3161
3162 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3163 src = force_const_mem (src_mode, src);
3164 if (!src)
3165 return false;
3166
3167 /* Make sure that the address is legitimate. */
3168 if (!aarch64_sve_ld1r_operand_p (src))
3169 {
3170 rtx addr = force_reg (Pmode, XEXP (src, 0));
3171 src = replace_equiv_address (src, addr);
3172 }
3173
3174 machine_mode mode = GET_MODE (dest);
3175 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3176 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3177 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3178 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3179 emit_insn (gen_rtx_SET (dest, src));
3180 return true;
3181 }
3182
3183 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3184 isn't a simple duplicate or series. */
3185
3186 static void
3187 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3188 {
3189 machine_mode mode = GET_MODE (src);
3190 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3191 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3192 gcc_assert (npatterns > 1);
3193
3194 if (nelts_per_pattern == 1)
3195 {
3196 /* The constant is a repeating sequence of at least two elements,
3197 where the repeating elements occupy no more than 128 bits.
3198 Get an integer representation of the replicated value. */
3199 scalar_int_mode int_mode;
3200 if (BYTES_BIG_ENDIAN)
3201 /* For now, always use LD1RQ to load the value on big-endian
3202 targets, since the handling of smaller integers includes a
3203 subreg that is semantically an element reverse. */
3204 int_mode = TImode;
3205 else
3206 {
3207 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3208 gcc_assert (int_bits <= 128);
3209 int_mode = int_mode_for_size (int_bits, 0).require ();
3210 }
3211 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3212 if (int_value
3213 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3214 return;
3215 }
3216
3217 /* Expand each pattern individually. */
3218 rtx_vector_builder builder;
3219 auto_vec<rtx, 16> vectors (npatterns);
3220 for (unsigned int i = 0; i < npatterns; ++i)
3221 {
3222 builder.new_vector (mode, 1, nelts_per_pattern);
3223 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3224 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3225 vectors.quick_push (force_reg (mode, builder.build ()));
3226 }
3227
3228 /* Use permutes to interleave the separate vectors. */
3229 while (npatterns > 1)
3230 {
3231 npatterns /= 2;
3232 for (unsigned int i = 0; i < npatterns; ++i)
3233 {
3234 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3235 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3236 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3237 vectors[i] = tmp;
3238 }
3239 }
3240 gcc_assert (vectors[0] == dest);
3241 }
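
/* To illustrate the interleaving loop above: a VNx4SI constant with
   npatterns == 2 such as { 0, 100, 1, 101, 2, 102, ... } is built by
   expanding the two series { 0, 1, 2, ... } and { 100, 101, 102, ... }
   into separate registers (e.g. via INDEX) and then combining them
   with a single ZIP1, which interleaves their lanes back into the
   required order.  (Illustrative only.)  */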
3242
3243 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3244 is a pattern that can be used to set DEST to a replicated scalar
3245 element. */
3246
3247 void
3248 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3249 rtx (*gen_vec_duplicate) (rtx, rtx))
3250 {
3251 machine_mode mode = GET_MODE (dest);
3252
3253 /* Check on what type of symbol it is. */
3254 scalar_int_mode int_mode;
3255 if ((GET_CODE (imm) == SYMBOL_REF
3256 || GET_CODE (imm) == LABEL_REF
3257 || GET_CODE (imm) == CONST
3258 || GET_CODE (imm) == CONST_POLY_INT)
3259 && is_a <scalar_int_mode> (mode, &int_mode))
3260 {
3261 rtx mem;
3262 poly_int64 offset;
3263 HOST_WIDE_INT const_offset;
3264 enum aarch64_symbol_type sty;
3265
3266 /* If we have (const (plus symbol offset)), separate out the offset
3267 before we start classifying the symbol. */
3268 rtx base = strip_offset (imm, &offset);
3269
3270 /* We must always add an offset involving VL separately, rather than
3271 folding it into the relocation. */
3272 if (!offset.is_constant (&const_offset))
3273 {
3274 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3275 emit_insn (gen_rtx_SET (dest, imm));
3276 else
3277 {
3278 /* Do arithmetic on 32-bit values if the result is smaller
3279 than that. */
3280 if (partial_subreg_p (int_mode, SImode))
3281 {
3282 /* It is invalid to do symbol calculations in modes
3283 narrower than SImode. */
3284 gcc_assert (base == const0_rtx);
3285 dest = gen_lowpart (SImode, dest);
3286 int_mode = SImode;
3287 }
3288 if (base != const0_rtx)
3289 {
3290 base = aarch64_force_temporary (int_mode, dest, base);
3291 aarch64_add_offset (int_mode, dest, base, offset,
3292 NULL_RTX, NULL_RTX, false);
3293 }
3294 else
3295 aarch64_add_offset (int_mode, dest, base, offset,
3296 dest, NULL_RTX, false);
3297 }
3298 return;
3299 }
3300
3301 sty = aarch64_classify_symbol (base, const_offset);
3302 switch (sty)
3303 {
3304 case SYMBOL_FORCE_TO_MEM:
3305 if (const_offset != 0
3306 && targetm.cannot_force_const_mem (int_mode, imm))
3307 {
3308 gcc_assert (can_create_pseudo_p ());
3309 base = aarch64_force_temporary (int_mode, dest, base);
3310 aarch64_add_offset (int_mode, dest, base, const_offset,
3311 NULL_RTX, NULL_RTX, false);
3312 return;
3313 }
3314
3315 mem = force_const_mem (ptr_mode, imm);
3316 gcc_assert (mem);
3317
3318 /* If we aren't generating PC relative literals, then
3319 we need to expand the literal pool access carefully.
3320 This is something that needs to be done in a number
3321 of places, so could well live as a separate function. */
3322 if (!aarch64_pcrelative_literal_loads)
3323 {
3324 gcc_assert (can_create_pseudo_p ());
3325 base = gen_reg_rtx (ptr_mode);
3326 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3327 if (ptr_mode != Pmode)
3328 base = convert_memory_address (Pmode, base);
3329 mem = gen_rtx_MEM (ptr_mode, base);
3330 }
3331
3332 if (int_mode != ptr_mode)
3333 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3334
3335 emit_insn (gen_rtx_SET (dest, mem));
3336
3337 return;
3338
3339 case SYMBOL_SMALL_TLSGD:
3340 case SYMBOL_SMALL_TLSDESC:
3341 case SYMBOL_SMALL_TLSIE:
3342 case SYMBOL_SMALL_GOT_28K:
3343 case SYMBOL_SMALL_GOT_4G:
3344 case SYMBOL_TINY_GOT:
3345 case SYMBOL_TINY_TLSIE:
3346 if (const_offset != 0)
3347 {
3348 gcc_assert (can_create_pseudo_p ());
3349 base = aarch64_force_temporary (int_mode, dest, base);
3350 aarch64_add_offset (int_mode, dest, base, const_offset,
3351 NULL_RTX, NULL_RTX, false);
3352 return;
3353 }
3354 /* FALLTHRU */
3355
3356 case SYMBOL_SMALL_ABSOLUTE:
3357 case SYMBOL_TINY_ABSOLUTE:
3358 case SYMBOL_TLSLE12:
3359 case SYMBOL_TLSLE24:
3360 case SYMBOL_TLSLE32:
3361 case SYMBOL_TLSLE48:
3362 aarch64_load_symref_appropriately (dest, imm, sty);
3363 return;
3364
3365 default:
3366 gcc_unreachable ();
3367 }
3368 }
3369
3370 if (!CONST_INT_P (imm))
3371 {
3372 rtx base, step, value;
3373 if (GET_CODE (imm) == HIGH
3374 || aarch64_simd_valid_immediate (imm, NULL))
3375 emit_insn (gen_rtx_SET (dest, imm));
3376 else if (const_vec_series_p (imm, &base, &step))
3377 aarch64_expand_vec_series (dest, base, step);
3378 else if (const_vec_duplicate_p (imm, &value))
3379 {
3380 /* If the constant is out of range of an SVE vector move,
3381 load it from memory if we can, otherwise move it into
3382 a register and use a DUP. */
3383 scalar_mode inner_mode = GET_MODE_INNER (mode);
3384 rtx op = force_const_mem (inner_mode, value);
3385 if (!op)
3386 op = force_reg (inner_mode, value);
3387 else if (!aarch64_sve_ld1r_operand_p (op))
3388 {
3389 rtx addr = force_reg (Pmode, XEXP (op, 0));
3390 op = replace_equiv_address (op, addr);
3391 }
3392 emit_insn (gen_vec_duplicate (dest, op));
3393 }
3394 else if (GET_CODE (imm) == CONST_VECTOR
3395 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3396 aarch64_expand_sve_const_vector (dest, imm);
3397 else
3398 {
3399 rtx mem = force_const_mem (mode, imm);
3400 gcc_assert (mem);
3401 emit_move_insn (dest, mem);
3402 }
3403
3404 return;
3405 }
3406
3407 aarch64_internal_mov_immediate (dest, imm, true,
3408 as_a <scalar_int_mode> (mode));
3409 }
3410
3411 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3412 that is known to contain PTRUE. */
3413
3414 void
3415 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3416 {
3417 expand_operand ops[3];
3418 machine_mode mode = GET_MODE (dest);
3419 create_output_operand (&ops[0], dest, mode);
3420 create_input_operand (&ops[1], pred, GET_MODE (pred));
3421 create_input_operand (&ops[2], src, mode);
3422 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3423 }
3424
3425 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3426 operand is in memory. In this case we need to use the predicated LD1
3427 and ST1 instead of LDR and STR, both for correctness on big-endian
3428 targets and because LD1 and ST1 support a wider range of addressing modes.
3429 PRED_MODE is the mode of the predicate.
3430
3431 See the comment at the head of aarch64-sve.md for details about the
3432 big-endian handling. */
3433
3434 void
3435 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3436 {
3437 machine_mode mode = GET_MODE (dest);
3438 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3439 if (!register_operand (src, mode)
3440 && !register_operand (dest, mode))
3441 {
3442 rtx tmp = gen_reg_rtx (mode);
3443 if (MEM_P (src))
3444 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3445 else
3446 emit_move_insn (tmp, src);
3447 src = tmp;
3448 }
3449 aarch64_emit_sve_pred_move (dest, ptrue, src);
3450 }
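
/* For example, a memory-to-memory copy of a VNx2DI value cannot be a
   single predicated move, so it goes through a temporary and becomes
   roughly

     ptrue p0.d, all
     ld1d  z0.d, p0/z, [x1]
     st1d  z0.d, p0, [x0]

   with the actual registers chosen later by the register allocator.
   (Illustrative only.)  */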
3451
3452 /* Called only on big-endian targets. See whether an SVE vector move
3453 from SRC to DEST is effectively a REV[BHW] instruction, because at
3454 least one operand is a subreg of an SVE vector that has wider or
3455 narrower elements. Return true and emit the instruction if so.
3456
3457 For example:
3458
3459 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3460
3461 represents a VIEW_CONVERT between the following vectors, viewed
3462 in memory order:
3463
3464 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3465 R1: { [0], [1], [2], [3], ... }
3466
3467 The high part of lane X in R2 should therefore correspond to lane X*2
3468 of R1, but the register representations are:
3469
3470 msb lsb
3471 R2: ...... [1].high [1].low [0].high [0].low
3472 R1: ...... [3] [2] [1] [0]
3473
3474 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3475 We therefore need a reverse operation to swap the high and low values
3476 around.
3477
3478 This is purely an optimization. Without it we would spill the
3479 subreg operand to the stack in one mode and reload it in the
3480 other mode, which has the same effect as the REV. */
3481
3482 bool
3483 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3484 {
3485 gcc_assert (BYTES_BIG_ENDIAN);
3486 if (GET_CODE (dest) == SUBREG)
3487 dest = SUBREG_REG (dest);
3488 if (GET_CODE (src) == SUBREG)
3489 src = SUBREG_REG (src);
3490
3491 /* The optimization handles two single SVE REGs with different element
3492 sizes. */
3493 if (!REG_P (dest)
3494 || !REG_P (src)
3495 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3496 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3497 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3498 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3499 return false;
3500
3501 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3502 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3503 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3504 UNSPEC_REV_SUBREG);
3505 emit_insn (gen_rtx_SET (dest, unspec));
3506 return true;
3507 }
3508
3509 /* Return a copy of X with mode MODE, without changing its other
3510 attributes. Unlike gen_lowpart, this doesn't care whether the
3511 mode change is valid. */
3512
3513 static rtx
3514 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3515 {
3516 if (GET_MODE (x) == mode)
3517 return x;
3518
3519 x = shallow_copy_rtx (x);
3520 set_mode_and_regno (x, mode, REGNO (x));
3521 return x;
3522 }
3523
3524 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3525 operands. */
3526
3527 void
3528 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3529 {
3530 /* Decide which REV operation we need. The mode with narrower elements
3531 determines the mode of the operands and the mode with the wider
3532 elements determines the reverse width. */
3533 machine_mode mode_with_wider_elts = GET_MODE (dest);
3534 machine_mode mode_with_narrower_elts = GET_MODE (src);
3535 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3536 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3537 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3538
3539 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3540 unsigned int unspec;
3541 if (wider_bytes == 8)
3542 unspec = UNSPEC_REV64;
3543 else if (wider_bytes == 4)
3544 unspec = UNSPEC_REV32;
3545 else if (wider_bytes == 2)
3546 unspec = UNSPEC_REV16;
3547 else
3548 gcc_unreachable ();
3549 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3550
3551 /* Emit:
3552
3553 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3554 UNSPEC_MERGE_PTRUE))
3555
3556 with the appropriate modes. */
3557 ptrue = gen_lowpart (pred_mode, ptrue);
3558 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3559 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3560 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3561 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3562 UNSPEC_MERGE_PTRUE);
3563 emit_insn (gen_rtx_SET (dest, src));
3564 }
3565
3566 static bool
3567 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3568 tree exp ATTRIBUTE_UNUSED)
3569 {
3570 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3571 return false;
3572
3573 return true;
3574 }
3575
3576 /* Implement TARGET_PASS_BY_REFERENCE. */
3577
3578 static bool
3579 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3580 machine_mode mode,
3581 const_tree type,
3582 bool named ATTRIBUTE_UNUSED)
3583 {
3584 HOST_WIDE_INT size;
3585 machine_mode dummymode;
3586 int nregs;
3587
3588 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3589 if (mode == BLKmode && type)
3590 size = int_size_in_bytes (type);
3591 else
3592 /* No frontends can create types with variable-sized modes, so we
3593 shouldn't be asked to pass or return them. */
3594 size = GET_MODE_SIZE (mode).to_constant ();
3595
3596 /* Aggregates are passed by reference based on their size. */
3597 if (type && AGGREGATE_TYPE_P (type))
3598 {
3599 size = int_size_in_bytes (type);
3600 }
3601
3602 /* Variable-sized arguments are always passed by reference. */
3603 if (size < 0)
3604 return true;
3605
3606 /* Can this be a candidate to be passed in fp/simd register(s)? */
3607 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3608 &dummymode, &nregs,
3609 NULL))
3610 return false;
3611
3612 /* Arguments which are variable-sized or larger than 2 registers are
3613 passed by reference unless they form a homogeneous floating-point
3614 aggregate. */
3615 return size > 2 * UNITS_PER_WORD;
3616 }
3617
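/* A few worked examples of the rules above (a sketch, assuming the usual
   LP64 configuration where UNITS_PER_WORD is 8, making the by-value limit
   16 bytes):

     __int128                    size 16, not an aggregate  -> by value
     struct { double d[4]; }     HFA of four doubles        -> by value
                                 (an fp/simd candidate, so the size test
                                 is never reached)
     struct { char c[24]; }      24-byte aggregate, not HFA -> by reference
     variably sized aggregates   size < 0                   -> by reference  */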
3618 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3619 static bool
3620 aarch64_return_in_msb (const_tree valtype)
3621 {
3622 machine_mode dummy_mode;
3623 int dummy_int;
3624
3625 /* Never happens in little-endian mode. */
3626 if (!BYTES_BIG_ENDIAN)
3627 return false;
3628
3629 /* Only composite types smaller than or equal to 16 bytes can
3630 be potentially returned in registers. */
3631 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3632 || int_size_in_bytes (valtype) <= 0
3633 || int_size_in_bytes (valtype) > 16)
3634 return false;
3635
3636 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3637 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3638 is always passed/returned in the least significant bits of fp/simd
3639 register(s). */
3640 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3641 &dummy_mode, &dummy_int, NULL))
3642 return false;
3643
3644 return true;
3645 }
3646
3647 /* Implement TARGET_FUNCTION_VALUE.
3648 Define how to find the value returned by a function. */
3649
3650 static rtx
3651 aarch64_function_value (const_tree type, const_tree func,
3652 bool outgoing ATTRIBUTE_UNUSED)
3653 {
3654 machine_mode mode;
3655 int unsignedp;
3656 int count;
3657 machine_mode ag_mode;
3658
3659 mode = TYPE_MODE (type);
3660 if (INTEGRAL_TYPE_P (type))
3661 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3662
3663 if (aarch64_return_in_msb (type))
3664 {
3665 HOST_WIDE_INT size = int_size_in_bytes (type);
3666
3667 if (size % UNITS_PER_WORD != 0)
3668 {
3669 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3670 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3671 }
3672 }
3673
3674 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3675 &ag_mode, &count, NULL))
3676 {
3677 if (!aarch64_composite_type_p (type, mode))
3678 {
3679 gcc_assert (count == 1 && mode == ag_mode);
3680 return gen_rtx_REG (mode, V0_REGNUM);
3681 }
3682 else
3683 {
3684 int i;
3685 rtx par;
3686
3687 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3688 for (i = 0; i < count; i++)
3689 {
3690 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3691 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3692 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3693 XVECEXP (par, 0, i) = tmp;
3694 }
3695 return par;
3696 }
3697 }
3698 else
3699 return gen_rtx_REG (mode, R0_REGNUM);
3700 }
3701
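/* As a concrete sketch of the PARALLEL built above: returning an HFA such
   as struct { float x, y, z; } gives count == 3 with ag_mode == SFmode, so
   the result is described roughly as

     (parallel [(expr_list (reg:SF v0) (const_int 0))
                (expr_list (reg:SF v1) (const_int 4))
                (expr_list (reg:SF v2) (const_int 8))])

   i.e. one fp/simd register per member, each annotated with that member's
   byte offset in the aggregate.  A scalar integer result simply ends up
   in x0 via the final gen_rtx_REG call.  */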
3702 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
3703 Return true if REGNO is the number of a hard register in which the value
3704 of a called function may come back. */
3705
3706 static bool
3707 aarch64_function_value_regno_p (const unsigned int regno)
3708 {
3709 /* A maximum of 16 bytes can be returned in the general registers. Examples
3710 of 16-byte return values are: 128-bit integers and 16-byte small
3711 structures (excluding homogeneous floating-point aggregates). */
3712 if (regno == R0_REGNUM || regno == R1_REGNUM)
3713 return true;
3714
3715 /* Up to four fp/simd registers can return a function value, e.g. a
3716 homogeneous floating-point aggregate having four members. */
3717 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3718 return TARGET_FLOAT;
3719
3720 return false;
3721 }
3722
3723 /* Implement TARGET_RETURN_IN_MEMORY.
3724
3725 If the type T of the result of a function is such that
3726 void func (T arg)
3727 would require that arg be passed as a value in a register (or set of
3728 registers) according to the parameter passing rules, then the result
3729 is returned in the same registers as would be used for such an
3730 argument. */
3731
3732 static bool
3733 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3734 {
3735 HOST_WIDE_INT size;
3736 machine_mode ag_mode;
3737 int count;
3738
3739 if (!AGGREGATE_TYPE_P (type)
3740 && TREE_CODE (type) != COMPLEX_TYPE
3741 && TREE_CODE (type) != VECTOR_TYPE)
3742 /* Simple scalar types are always returned in registers. */
3743 return false;
3744
3745 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3746 type,
3747 &ag_mode,
3748 &count,
3749 NULL))
3750 return false;
3751
3752 /* Types larger than 2 registers are returned in memory. */
3753 size = int_size_in_bytes (type);
3754 return (size < 0 || size > 2 * UNITS_PER_WORD);
3755 }
3756
3757 static bool
3758 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3759 const_tree type, int *nregs)
3760 {
3761 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3762 return aarch64_vfp_is_call_or_return_candidate (mode,
3763 type,
3764 &pcum->aapcs_vfp_rmode,
3765 nregs,
3766 NULL);
3767 }
3768
3769 /* Given MODE and TYPE of a function argument, return the alignment in
3770 bits. The idea is to suppress any stronger alignment requested by
3771 the user and opt for the natural alignment (specified in AAPCS64 \S
3772 4.1). ABI_BREAK is set to true if the alignment was incorrectly
3773 calculated in versions of GCC prior to GCC-9. This is a helper
3774 function for local use only. */
3775
3776 static unsigned int
3777 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
3778 bool *abi_break)
3779 {
3780 *abi_break = false;
3781 if (!type)
3782 return GET_MODE_ALIGNMENT (mode);
3783
3784 if (integer_zerop (TYPE_SIZE (type)))
3785 return 0;
3786
3787 gcc_assert (TYPE_MODE (type) == mode);
3788
3789 if (!AGGREGATE_TYPE_P (type))
3790 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3791
3792 if (TREE_CODE (type) == ARRAY_TYPE)
3793 return TYPE_ALIGN (TREE_TYPE (type));
3794
3795 unsigned int alignment = 0;
3796 unsigned int bitfield_alignment = 0;
3797 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3798 if (TREE_CODE (field) == FIELD_DECL)
3799 {
3800 alignment = std::max (alignment, DECL_ALIGN (field));
3801 if (DECL_BIT_FIELD_TYPE (field))
3802 bitfield_alignment
3803 = std::max (bitfield_alignment,
3804 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
3805 }
3806
3807 if (bitfield_alignment > alignment)
3808 {
3809 *abi_break = true;
3810 return bitfield_alignment;
3811 }
3812
3813 return alignment;
3814 }
3815
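/* Some examples of what the helper above computes, using typical AArch64
   type layouts (a sketch, not a statement of the AAPCS64 itself):

     double                        -> 64  (natural alignment of the type)
     an array type                 -> the alignment of its element type
     struct { int a; double b; }   -> 64  (largest field alignment)

   Over-alignment requested on the aggregate type itself is not picked up
   here, because the walk only inspects the alignment of the fields; that
   is how the "suppress stronger user alignment" intent above plays out.  */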
3816 /* Lay out a function argument according to the AAPCS64 rules. The rule
3817 numbers refer to the rule numbers in the AAPCS64 document. */
3818
3819 static void
3820 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3821 const_tree type,
3822 bool named ATTRIBUTE_UNUSED)
3823 {
3824 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3825 int ncrn, nvrn, nregs;
3826 bool allocate_ncrn, allocate_nvrn;
3827 HOST_WIDE_INT size;
3828 bool abi_break;
3829
3830 /* We need to do this once per argument. */
3831 if (pcum->aapcs_arg_processed)
3832 return;
3833
3834 pcum->aapcs_arg_processed = true;
3835
3836 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3837 if (type)
3838 size = int_size_in_bytes (type);
3839 else
3840 /* No frontends can create types with variable-sized modes, so we
3841 shouldn't be asked to pass or return them. */
3842 size = GET_MODE_SIZE (mode).to_constant ();
3843 size = ROUND_UP (size, UNITS_PER_WORD);
3844
3845 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3846 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3847 mode,
3848 type,
3849 &nregs);
3850
3851 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3852 The following code thus handles passing by SIMD/FP registers first. */
3853
3854 nvrn = pcum->aapcs_nvrn;
3855
3856 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3857 and homogeneous short-vector aggregates (HVA). */
3858 if (allocate_nvrn)
3859 {
3860 if (!TARGET_FLOAT)
3861 aarch64_err_no_fpadvsimd (mode);
3862
3863 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3864 {
3865 pcum->aapcs_nextnvrn = nvrn + nregs;
3866 if (!aarch64_composite_type_p (type, mode))
3867 {
3868 gcc_assert (nregs == 1);
3869 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3870 }
3871 else
3872 {
3873 rtx par;
3874 int i;
3875 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3876 for (i = 0; i < nregs; i++)
3877 {
3878 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3879 V0_REGNUM + nvrn + i);
3880 rtx offset = gen_int_mode
3881 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3882 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3883 XVECEXP (par, 0, i) = tmp;
3884 }
3885 pcum->aapcs_reg = par;
3886 }
3887 return;
3888 }
3889 else
3890 {
3891 /* C.3 NSRN is set to 8. */
3892 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3893 goto on_stack;
3894 }
3895 }
3896
3897 ncrn = pcum->aapcs_ncrn;
3898 nregs = size / UNITS_PER_WORD;
3899
3900 /* C6 - C9, though the sign and zero extension semantics are
3901 handled elsewhere. This is the case where the argument fits
3902 entirely in general registers. */
3903 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3904 {
3905 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3906
3907 /* C.8 if the argument has an alignment of 16 then the NGRN is
3908 rounded up to the next even number. */
3909 if (nregs == 2
3910 && ncrn % 2
3911 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3912 comparison is there because for > 16 * BITS_PER_UNIT
3913 alignment nregs should be > 2 and therefore it should be
3914 passed by reference rather than value. */
3915 && (aarch64_function_arg_alignment (mode, type, &abi_break)
3916 == 16 * BITS_PER_UNIT))
3917 {
3918 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
3919 inform (input_location, "parameter passing for argument of type "
3920 "%qT changed in GCC 9.1", type);
3921 ++ncrn;
3922 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3923 }
3924
3925 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3926 A reg is still generated for it, but the caller should be smart
3927 enough not to use it. */
3928 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3929 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3930 else
3931 {
3932 rtx par;
3933 int i;
3934
3935 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3936 for (i = 0; i < nregs; i++)
3937 {
3938 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3939 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3940 GEN_INT (i * UNITS_PER_WORD));
3941 XVECEXP (par, 0, i) = tmp;
3942 }
3943 pcum->aapcs_reg = par;
3944 }
3945
3946 pcum->aapcs_nextncrn = ncrn + nregs;
3947 return;
3948 }
3949
3950 /* C.11 */
3951 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3952
3953 /* The argument is passed on stack; record the needed number of words for
3954 this argument and align the total size if necessary. */
3955 on_stack:
3956 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3957
3958 if (aarch64_function_arg_alignment (mode, type, &abi_break)
3959 == 16 * BITS_PER_UNIT)
3960 {
3961 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
3962 if (pcum->aapcs_stack_size != new_size)
3963 {
3964 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
3965 inform (input_location, "parameter passing for argument of type "
3966 "%qT changed in GCC 9.1", type);
3967 pcum->aapcs_stack_size = new_size;
3968 }
3969 }
3970 return;
3971 }
3972
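/* A short worked example of rule C.8 above (a sketch; x0-x7 are the
   general-purpose argument registers):

     void f (int a, __int128 b);

   'a' takes x0, so NGRN is 1 when 'b' is laid out.  'b' needs nregs == 2
   and has 16-byte alignment, so NGRN is rounded up to 2 and 'b' is passed
   in the even/odd pair x2/x3, leaving x1 unused.  */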
3973 /* Implement TARGET_FUNCTION_ARG. */
3974
3975 static rtx
3976 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3977 const_tree type, bool named)
3978 {
3979 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3980 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3981
3982 if (mode == VOIDmode)
3983 return NULL_RTX;
3984
3985 aarch64_layout_arg (pcum_v, mode, type, named);
3986 return pcum->aapcs_reg;
3987 }
3988
3989 void
3990 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3991 const_tree fntype ATTRIBUTE_UNUSED,
3992 rtx libname ATTRIBUTE_UNUSED,
3993 const_tree fndecl ATTRIBUTE_UNUSED,
3994 unsigned n_named ATTRIBUTE_UNUSED)
3995 {
3996 pcum->aapcs_ncrn = 0;
3997 pcum->aapcs_nvrn = 0;
3998 pcum->aapcs_nextncrn = 0;
3999 pcum->aapcs_nextnvrn = 0;
4000 pcum->pcs_variant = ARM_PCS_AAPCS64;
4001 pcum->aapcs_reg = NULL_RTX;
4002 pcum->aapcs_arg_processed = false;
4003 pcum->aapcs_stack_words = 0;
4004 pcum->aapcs_stack_size = 0;
4005
4006 if (!TARGET_FLOAT
4007 && fndecl && TREE_PUBLIC (fndecl)
4008 && fntype && fntype != error_mark_node)
4009 {
4010 const_tree type = TREE_TYPE (fntype);
4011 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4012 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4013 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4014 &mode, &nregs, NULL))
4015 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4016 }
4017 return;
4018 }
4019
4020 static void
4021 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4022 machine_mode mode,
4023 const_tree type,
4024 bool named)
4025 {
4026 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4027 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4028 {
4029 aarch64_layout_arg (pcum_v, mode, type, named);
4030 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4031 != (pcum->aapcs_stack_words != 0));
4032 pcum->aapcs_arg_processed = false;
4033 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4034 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4035 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4036 pcum->aapcs_stack_words = 0;
4037 pcum->aapcs_reg = NULL_RTX;
4038 }
4039 }
4040
4041 bool
4042 aarch64_function_arg_regno_p (unsigned regno)
4043 {
4044 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4045 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4046 }
4047
4048 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4049 PARM_BOUNDARY bits of alignment, but will be given anything up
4050 to STACK_BOUNDARY bits if the type requires it. This makes sure
4051 that both before and after the layout of each argument, the Next
4052 Stacked Argument Address (NSAA) will have a minimum alignment of
4053 8 bytes. */
4054
4055 static unsigned int
4056 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4057 {
4058 bool abi_break;
4059 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4060 &abi_break);
4061 if (abi_break && warn_psabi)
4062 inform (input_location, "parameter passing for argument of type "
4063 "%qT changed in GCC 9.1", type);
4064
4065 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4066 }
4067
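/* For example (a sketch, assuming the usual PARM_BOUNDARY of 64 bits and
   STACK_BOUNDARY of 128 bits for this target): a char argument has an
   alignment of 8 bits, which the MAX clamps up to 64, so it still occupies
   a full 8-byte stack slot; an __int128 argument has an alignment of 128
   bits, which passes through unchanged, so the NSAA is rounded up to a
   16-byte boundary before the argument is laid out.  */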
4068 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4069
4070 static fixed_size_mode
4071 aarch64_get_reg_raw_mode (int regno)
4072 {
4073 if (TARGET_SVE && FP_REGNUM_P (regno))
4074 /* Don't use the SVE part of the register for __builtin_apply and
4075 __builtin_return. The SVE registers aren't used by the normal PCS,
4076 so using them there would be a waste of time. The PCS extensions
4077 for SVE types are fundamentally incompatible with the
4078 __builtin_return/__builtin_apply interface. */
4079 return as_a <fixed_size_mode> (V16QImode);
4080 return default_get_reg_raw_mode (regno);
4081 }
4082
4083 /* Implement TARGET_FUNCTION_ARG_PADDING.
4084
4085 Small aggregate types are placed in the lowest memory address.
4086
4087 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4088
4089 static pad_direction
4090 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4091 {
4092 /* On little-endian targets, the least significant byte of every stack
4093 argument is passed at the lowest byte address of the stack slot. */
4094 if (!BYTES_BIG_ENDIAN)
4095 return PAD_UPWARD;
4096
4097 /* Otherwise, integral, floating-point and pointer types are padded downward:
4098 the least significant byte of a stack argument is passed at the highest
4099 byte address of the stack slot. */
4100 if (type
4101 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4102 || POINTER_TYPE_P (type))
4103 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4104 return PAD_DOWNWARD;
4105
4106 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4107 return PAD_UPWARD;
4108 }
4109
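/* Reading the rules above with concrete types: on a big-endian target a
   short argument that ends up on the stack is padded downward, i.e. its
   bytes occupy the high-address end of the 8-byte slot, whereas a small
   aggregate such as struct { char a, b, c; } is padded upward and starts
   at the lowest address of its slot.  On little-endian targets everything
   is padded upward.  */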
4110 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4111
4112 It specifies padding for the last (possibly also the only)
4113 element of a block move between registers and memory. Viewing
4114 the block as if it were in memory, padding upward means that
4115 the last element is padded after its most significant byte,
4116 while with downward padding the last element is padded on its
4117 least significant byte side.
4118
4119 Small aggregates and small complex types are always padded
4120 upwards.
4121
4122 We don't need to worry about homogeneous floating-point or
4123 short-vector aggregates; their move is not affected by the
4124 padding direction determined here. Regardless of endianness,
4125 each element of such an aggregate is put in the least
4126 significant bits of a fp/simd register.
4127
4128 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4129 register has useful data, and return the opposite if the most
4130 significant byte does. */
4131
4132 bool
4133 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4134 bool first ATTRIBUTE_UNUSED)
4135 {
4136
4137 /* Small composite types are always padded upward. */
4138 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4139 {
4140 HOST_WIDE_INT size;
4141 if (type)
4142 size = int_size_in_bytes (type);
4143 else
4144 /* No frontends can create types with variable-sized modes, so we
4145 shouldn't be asked to pass or return them. */
4146 size = GET_MODE_SIZE (mode).to_constant ();
4147 if (size < 2 * UNITS_PER_WORD)
4148 return true;
4149 }
4150
4151 /* Otherwise, use the default padding. */
4152 return !BYTES_BIG_ENDIAN;
4153 }
4154
4155 static scalar_int_mode
4156 aarch64_libgcc_cmp_return_mode (void)
4157 {
4158 return SImode;
4159 }
4160
4161 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4162
4163 /* We use the 12-bit shifted immediate arithmetic instructions, so values
4164 must be a multiple of (1 << 12), i.e. 4096. */
4165 #define ARITH_FACTOR 4096
4166
4167 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4168 #error Cannot use simple address calculation for stack probing
4169 #endif
4170
4171 /* The pair of scratch registers used for stack probing. */
4172 #define PROBE_STACK_FIRST_REG R9_REGNUM
4173 #define PROBE_STACK_SECOND_REG R10_REGNUM
4174
4175 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4176 inclusive. These are offsets from the current stack pointer. */
4177
4178 static void
4179 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4180 {
4181 HOST_WIDE_INT size;
4182 if (!poly_size.is_constant (&size))
4183 {
4184 sorry ("stack probes for SVE frames");
4185 return;
4186 }
4187
4188 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4189
4190 /* See the same assertion on PROBE_INTERVAL above. */
4191 gcc_assert ((first % ARITH_FACTOR) == 0);
4192
4193 /* See if we have a constant small number of probes to generate. If so,
4194 that's the easy case. */
4195 if (size <= PROBE_INTERVAL)
4196 {
4197 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4198
4199 emit_set_insn (reg1,
4200 plus_constant (Pmode,
4201 stack_pointer_rtx, -(first + base)));
4202 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4203 }
4204
4205 /* The run-time loop is made up of 8 insns in the generic case while the
4206 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
4207 else if (size <= 4 * PROBE_INTERVAL)
4208 {
4209 HOST_WIDE_INT i, rem;
4210
4211 emit_set_insn (reg1,
4212 plus_constant (Pmode,
4213 stack_pointer_rtx,
4214 -(first + PROBE_INTERVAL)));
4215 emit_stack_probe (reg1);
4216
4217 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4218 it exceeds SIZE. If only two probes are needed, this will not
4219 generate any code. Then probe at FIRST + SIZE. */
4220 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4221 {
4222 emit_set_insn (reg1,
4223 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4224 emit_stack_probe (reg1);
4225 }
4226
4227 rem = size - (i - PROBE_INTERVAL);
4228 if (rem > 256)
4229 {
4230 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4231
4232 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4233 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4234 }
4235 else
4236 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4237 }
4238
4239 /* Otherwise, do the same as above, but in a loop. Note that we must be
4240 extra careful with variables wrapping around because we might be at
4241 the very top (or the very bottom) of the address space and we have
4242 to be able to handle this case properly; in particular, we use an
4243 equality test for the loop condition. */
4244 else
4245 {
4246 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4247
4248 /* Step 1: round SIZE to the previous multiple of the interval. */
4249
4250 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4251
4252
4253 /* Step 2: compute initial and final value of the loop counter. */
4254
4255 /* TEST_ADDR = SP + FIRST. */
4256 emit_set_insn (reg1,
4257 plus_constant (Pmode, stack_pointer_rtx, -first));
4258
4259 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4260 HOST_WIDE_INT adjustment = - (first + rounded_size);
4261 if (! aarch64_uimm12_shift (adjustment))
4262 {
4263 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4264 true, Pmode);
4265 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4266 }
4267 else
4268 emit_set_insn (reg2,
4269 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4270
4271 /* Step 3: the loop
4272
4273 do
4274 {
4275 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4276 probe at TEST_ADDR
4277 }
4278 while (TEST_ADDR != LAST_ADDR)
4279
4280 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4281 until it is equal to ROUNDED_SIZE. */
4282
4283 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4284
4285
4286 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4287 that SIZE is equal to ROUNDED_SIZE. */
4288
4289 if (size != rounded_size)
4290 {
4291 HOST_WIDE_INT rem = size - rounded_size;
4292
4293 if (rem > 256)
4294 {
4295 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4296
4297 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4298 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4299 }
4300 else
4301 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4302 }
4303 }
4304
4305 /* Make sure nothing is scheduled before we are done. */
4306 emit_insn (gen_blockage ());
4307 }
4308
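/* A worked example of the middle case above (a sketch, assuming
   PROBE_INTERVAL is 4096): for FIRST = F and SIZE = 10000 we probe at
   SP - (F + 4096), one loop iteration then probes SP - (F + 8192), and
   since the remainder 10000 - 8192 = 1808 exceeds 256 a final adjusted
   probe lands at SP - (F + 10000).  Every address between SP - F and
   SP - (F + 10000) is therefore within one PROBE_INTERVAL of a probe.  */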
4309 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4310 absolute addresses. */
4311
4312 const char *
4313 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4314 {
4315 static int labelno = 0;
4316 char loop_lab[32];
4317 rtx xops[2];
4318
4319 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4320
4321 /* Loop. */
4322 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4323
4324 HOST_WIDE_INT stack_clash_probe_interval
4325 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4326
4327 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4328 xops[0] = reg1;
4329 HOST_WIDE_INT interval;
4330 if (flag_stack_clash_protection)
4331 interval = stack_clash_probe_interval;
4332 else
4333 interval = PROBE_INTERVAL;
4334
4335 gcc_assert (aarch64_uimm12_shift (interval));
4336 xops[1] = GEN_INT (interval);
4337
4338 output_asm_insn ("sub\t%0, %0, %1", xops);
4339
4340 /* If doing stack clash protection then we probe up by the ABI specified
4341 amount. We do this because we're dropping full pages at a time in the
4342 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4343 if (flag_stack_clash_protection)
4344 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4345 else
4346 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4347
4348 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4349 by this amount for each iteration. */
4350 output_asm_insn ("str\txzr, [%0, %1]", xops);
4351
4352 /* Test if TEST_ADDR == LAST_ADDR. */
4353 xops[1] = reg2;
4354 output_asm_insn ("cmp\t%0, %1", xops);
4355
4356 /* Branch. */
4357 fputs ("\tb.ne\t", asm_out_file);
4358 assemble_name_raw (asm_out_file, loop_lab);
4359 fputc ('\n', asm_out_file);
4360
4361 return "";
4362 }
4363
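/* When called via gen_probe_stack_range from aarch64_emit_probe_stack_range
   above, REG1/REG2 are x9/x10 and the emitted loop looks roughly like this
   (a sketch; 4096 stands for the interval chosen above):

	.LPSRL0:
		sub	x9, x9, 4096
		str	xzr, [x9, 0]
		cmp	x9, x10
		b.ne	.LPSRL0

   With -fstack-clash-protection the interval is the guard size and the
   store instead probes STACK_CLASH_CALLER_GUARD bytes above the new
   TEST_ADDR.  */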
4364 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4365 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4366 of GUARD_SIZE. Each probe is emitted at most MIN_PROBE_THRESHOLD
4367 bytes from the current BASE, so successive probes are at most
4368 MIN_PROBE_THRESHOLD bytes apart. By the end of this function
4369 BASE = BASE - ADJUSTMENT. */
4370
4371 const char *
4372 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4373 rtx min_probe_threshold, rtx guard_size)
4374 {
4375 /* This function is not allowed to use any instruction generation function
4376 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4377 so instead emit the code you want using output_asm_insn. */
4378 gcc_assert (flag_stack_clash_protection);
4379 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4380 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4381
4382 /* The minimum required allocation before the residual requires probing. */
4383 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4384
4385 /* Clamp the value down to the nearest value that can be used with a cmp. */
4386 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4387 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4388
4389 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4390 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4391
4392 static int labelno = 0;
4393 char loop_start_lab[32];
4394 char loop_end_lab[32];
4395 rtx xops[2];
4396
4397 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4398 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4399
4400 /* Emit loop start label. */
4401 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4402
4403 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4404 xops[0] = adjustment;
4405 xops[1] = probe_offset_value_rtx;
4406 output_asm_insn ("cmp\t%0, %1", xops);
4407
4408 /* Branch to end if not enough adjustment to probe. */
4409 fputs ("\tb.lt\t", asm_out_file);
4410 assemble_name_raw (asm_out_file, loop_end_lab);
4411 fputc ('\n', asm_out_file);
4412
4413 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4414 xops[0] = base;
4415 xops[1] = probe_offset_value_rtx;
4416 output_asm_insn ("sub\t%0, %0, %1", xops);
4417
4418 /* Probe at BASE. */
4419 xops[1] = const0_rtx;
4420 output_asm_insn ("str\txzr, [%0, %1]", xops);
4421
4422 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4423 xops[0] = adjustment;
4424 xops[1] = probe_offset_value_rtx;
4425 output_asm_insn ("sub\t%0, %0, %1", xops);
4426
4427 /* Branch to start if still more bytes to allocate. */
4428 fputs ("\tb\t", asm_out_file);
4429 assemble_name_raw (asm_out_file, loop_start_lab);
4430 fputc ('\n', asm_out_file);
4431
4432 /* No probe needed; leave the loop. */
4433 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4434
4435 /* BASE = BASE - ADJUSTMENT. */
4436 xops[0] = base;
4437 xops[1] = adjustment;
4438 output_asm_insn ("sub\t%0, %0, %1", xops);
4439 return "";
4440 }
4441
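/* Putting the pieces above together, the emitted sequence is roughly
   (a sketch; "guard" stands for the clamped RESIDUAL_PROBE_GUARD value and
   "base"/"adj" for the BASE and ADJUSTMENT operands):

	.SVLPSPL0:
		cmp	adj, guard
		b.lt	.SVLPEND0
		sub	base, base, guard
		str	xzr, [base, 0]
		sub	adj, adj, guard
		b	.SVLPSPL0
	.SVLPEND0:
		sub	base, base, adj

   so BASE drops in guard-sized steps with a probe after each step, and the
   final sub handles whatever residual is too small to need a probe.  */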
4442 /* Determine whether a frame chain needs to be generated. */
4443 static bool
4444 aarch64_needs_frame_chain (void)
4445 {
4446 /* Force a frame chain for EH returns so the return address is at FP+8. */
4447 if (frame_pointer_needed || crtl->calls_eh_return)
4448 return true;
4449
4450 /* A leaf function cannot have calls or write LR. */
4451 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4452
4453 /* Don't use a frame chain in leaf functions if leaf frame pointers
4454 are disabled. */
4455 if (flag_omit_leaf_frame_pointer && is_leaf)
4456 return false;
4457
4458 return aarch64_use_frame_pointer;
4459 }
4460
4461 /* Mark the registers that need to be saved by the callee and calculate
4462 the size of the callee-saved registers area and frame record (both FP
4463 and LR may be omitted). */
4464 static void
4465 aarch64_layout_frame (void)
4466 {
4467 HOST_WIDE_INT offset = 0;
4468 int regno, last_fp_reg = INVALID_REGNUM;
4469 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4470
4471 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4472
4473 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4474 the mid-end is doing. */
4475 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4476
4477 #define SLOT_NOT_REQUIRED (-2)
4478 #define SLOT_REQUIRED (-1)
4479
4480 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4481 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4482
4483 /* If this is a non-leaf simd function with calls we assume that
4484 at least one of those calls is to a non-simd function and thus
4485 we must save V8 to V23 in the prologue. */
4486
4487 if (simd_function && !crtl->is_leaf)
4488 {
4489 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4490 if (FP_SIMD_SAVED_REGNUM_P (regno))
4491 df_set_regs_ever_live (regno, true);
4492 }
4493
4494 /* First mark all the registers that really need to be saved... */
4495 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4496 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4497
4498 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4499 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4500
4501 /* ... that includes the eh data registers (if needed)... */
4502 if (crtl->calls_eh_return)
4503 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4504 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4505 = SLOT_REQUIRED;
4506
4507 /* ... and any callee saved register that dataflow says is live. */
4508 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4509 if (df_regs_ever_live_p (regno)
4510 && (regno == R30_REGNUM
4511 || !call_used_regs[regno]))
4512 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4513
4514 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4515 if (df_regs_ever_live_p (regno)
4516 && (!call_used_regs[regno]
4517 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4518 {
4519 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4520 last_fp_reg = regno;
4521 }
4522
4523 if (cfun->machine->frame.emit_frame_chain)
4524 {
4525 /* FP and LR are placed in the linkage record. */
4526 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4527 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4528 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4529 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4530 offset = 2 * UNITS_PER_WORD;
4531 }
4532
4533 /* With stack-clash, LR must be saved in non-leaf functions. */
4534 gcc_assert (crtl->is_leaf
4535 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4536 != SLOT_NOT_REQUIRED));
4537
4538 /* Now assign stack slots for them. */
4539 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4540 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4541 {
4542 cfun->machine->frame.reg_offset[regno] = offset;
4543 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4544 cfun->machine->frame.wb_candidate1 = regno;
4545 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4546 cfun->machine->frame.wb_candidate2 = regno;
4547 offset += UNITS_PER_WORD;
4548 }
4549
4550 HOST_WIDE_INT max_int_offset = offset;
4551 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4552 bool has_align_gap = offset != max_int_offset;
4553
4554 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4555 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4556 {
4557 /* If there is an alignment gap between integer and fp callee-saves,
4558 allocate the last fp register to it if possible. */
4559 if (regno == last_fp_reg
4560 && has_align_gap
4561 && !simd_function
4562 && (offset & 8) == 0)
4563 {
4564 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4565 break;
4566 }
4567
4568 cfun->machine->frame.reg_offset[regno] = offset;
4569 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4570 cfun->machine->frame.wb_candidate1 = regno;
4571 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4572 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4573 cfun->machine->frame.wb_candidate2 = regno;
4574 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4575 }
4576
4577 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4578
4579 cfun->machine->frame.saved_regs_size = offset;
4580
4581 HOST_WIDE_INT varargs_and_saved_regs_size
4582 = offset + cfun->machine->frame.saved_varargs_size;
4583
4584 cfun->machine->frame.hard_fp_offset
4585 = aligned_upper_bound (varargs_and_saved_regs_size
4586 + get_frame_size (),
4587 STACK_BOUNDARY / BITS_PER_UNIT);
4588
4589 /* Both these values are already aligned. */
4590 gcc_assert (multiple_p (crtl->outgoing_args_size,
4591 STACK_BOUNDARY / BITS_PER_UNIT));
4592 cfun->machine->frame.frame_size
4593 = (cfun->machine->frame.hard_fp_offset
4594 + crtl->outgoing_args_size);
4595
4596 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4597
4598 cfun->machine->frame.initial_adjust = 0;
4599 cfun->machine->frame.final_adjust = 0;
4600 cfun->machine->frame.callee_adjust = 0;
4601 cfun->machine->frame.callee_offset = 0;
4602
4603 HOST_WIDE_INT max_push_offset = 0;
4604 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4605 max_push_offset = 512;
4606 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4607 max_push_offset = 256;
4608
4609 HOST_WIDE_INT const_size, const_fp_offset;
4610 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4611 && const_size < max_push_offset
4612 && known_eq (crtl->outgoing_args_size, 0))
4613 {
4614 /* Simple, small frame with no outgoing arguments:
4615 stp reg1, reg2, [sp, -frame_size]!
4616 stp reg3, reg4, [sp, 16] */
4617 cfun->machine->frame.callee_adjust = const_size;
4618 }
4619 else if (known_lt (crtl->outgoing_args_size
4620 + cfun->machine->frame.saved_regs_size, 512)
4621 && !(cfun->calls_alloca
4622 && known_lt (cfun->machine->frame.hard_fp_offset,
4623 max_push_offset)))
4624 {
4625 /* Frame with small outgoing arguments:
4626 sub sp, sp, frame_size
4627 stp reg1, reg2, [sp, outgoing_args_size]
4628 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4629 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4630 cfun->machine->frame.callee_offset
4631 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4632 }
4633 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4634 && const_fp_offset < max_push_offset)
4635 {
4636 /* Frame with large outgoing arguments but a small local area:
4637 stp reg1, reg2, [sp, -hard_fp_offset]!
4638 stp reg3, reg4, [sp, 16]
4639 sub sp, sp, outgoing_args_size */
4640 cfun->machine->frame.callee_adjust = const_fp_offset;
4641 cfun->machine->frame.final_adjust
4642 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4643 }
4644 else
4645 {
4646 /* Frame with large local area and outgoing arguments using frame pointer:
4647 sub sp, sp, hard_fp_offset
4648 stp x29, x30, [sp, 0]
4649 add x29, sp, 0
4650 stp reg3, reg4, [sp, 16]
4651 sub sp, sp, outgoing_args_size */
4652 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4653 cfun->machine->frame.final_adjust
4654 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4655 }
4656
4657 cfun->machine->frame.laid_out = true;
4658 }
4659
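/* A small worked example of the first case above (a sketch; the actual
   code generation happens in the prologue expander, not here): a function
   that needs a frame chain, also saves x19/x20, has 16 bytes of locals and
   no outgoing arguments ends up with reg_offset[x29] = 0,
   reg_offset[x30] = 8, reg_offset[x19] = 16, reg_offset[x20] = 24,
   saved_regs_size = 32, hard_fp_offset = frame_size = 48 and
   callee_adjust = 48, matching the "stp reg1, reg2, [sp, -frame_size]!"
   shape sketched above.  */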
4660 /* Return true if the register REGNO is saved on entry to
4661 the current function. */
4662
4663 static bool
4664 aarch64_register_saved_on_entry (int regno)
4665 {
4666 return cfun->machine->frame.reg_offset[regno] >= 0;
4667 }
4668
4669 /* Return the next register, from REGNO up to LIMIT, that the callee
4670 needs to save. */
4671
4672 static unsigned
4673 aarch64_next_callee_save (unsigned regno, unsigned limit)
4674 {
4675 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4676 regno ++;
4677 return regno;
4678 }
4679
4680 /* Push the register number REGNO of mode MODE to the stack with write-back
4681 adjusting the stack by ADJUSTMENT. */
4682
4683 static void
4684 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4685 HOST_WIDE_INT adjustment)
4686 {
4687 rtx base_rtx = stack_pointer_rtx;
4688 rtx insn, reg, mem;
4689
4690 reg = gen_rtx_REG (mode, regno);
4691 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4692 plus_constant (Pmode, base_rtx, -adjustment));
4693 mem = gen_frame_mem (mode, mem);
4694
4695 insn = emit_move_insn (mem, reg);
4696 RTX_FRAME_RELATED_P (insn) = 1;
4697 }
4698
4699 /* Generate and return an instruction to store the pair of registers
4700 REG and REG2 of mode MODE to location BASE with write-back adjusting
4701 the stack location BASE by ADJUSTMENT. */
4702
4703 static rtx
4704 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4705 HOST_WIDE_INT adjustment)
4706 {
4707 switch (mode)
4708 {
4709 case E_DImode:
4710 return gen_storewb_pairdi_di (base, base, reg, reg2,
4711 GEN_INT (-adjustment),
4712 GEN_INT (UNITS_PER_WORD - adjustment));
4713 case E_DFmode:
4714 return gen_storewb_pairdf_di (base, base, reg, reg2,
4715 GEN_INT (-adjustment),
4716 GEN_INT (UNITS_PER_WORD - adjustment));
4717 case E_TFmode:
4718 return gen_storewb_pairtf_di (base, base, reg, reg2,
4719 GEN_INT (-adjustment),
4720 GEN_INT (UNITS_PER_VREG - adjustment));
4721 default:
4722 gcc_unreachable ();
4723 }
4724 }
4725
4726 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4727 stack pointer by ADJUSTMENT. */
4728
4729 static void
4730 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4731 {
4732 rtx_insn *insn;
4733 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4734
4735 if (regno2 == INVALID_REGNUM)
4736 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4737
4738 rtx reg1 = gen_rtx_REG (mode, regno1);
4739 rtx reg2 = gen_rtx_REG (mode, regno2);
4740
4741 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4742 reg2, adjustment));
4743 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4744 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4745 RTX_FRAME_RELATED_P (insn) = 1;
4746 }
4747
4748 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4749 adjusting it by ADJUSTMENT afterwards. */
4750
4751 static rtx
4752 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4753 HOST_WIDE_INT adjustment)
4754 {
4755 switch (mode)
4756 {
4757 case E_DImode:
4758 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4759 GEN_INT (UNITS_PER_WORD));
4760 case E_DFmode:
4761 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4762 GEN_INT (UNITS_PER_WORD));
4763 case E_TFmode:
4764 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4765 GEN_INT (UNITS_PER_VREG));
4766 default:
4767 gcc_unreachable ();
4768 }
4769 }
4770
4771 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4772 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4773 into CFI_OPS. */
4774
4775 static void
4776 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4777 rtx *cfi_ops)
4778 {
4779 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4780 rtx reg1 = gen_rtx_REG (mode, regno1);
4781
4782 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4783
4784 if (regno2 == INVALID_REGNUM)
4785 {
4786 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4787 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4788 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4789 }
4790 else
4791 {
4792 rtx reg2 = gen_rtx_REG (mode, regno2);
4793 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4794 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4795 reg2, adjustment));
4796 }
4797 }
4798
4799 /* Generate and return a store pair instruction of mode MODE to store
4800 register REG1 to MEM1 and register REG2 to MEM2. */
4801
4802 static rtx
4803 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4804 rtx reg2)
4805 {
4806 switch (mode)
4807 {
4808 case E_DImode:
4809 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4810
4811 case E_DFmode:
4812 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4813
4814 case E_TFmode:
4815 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4816
4817 default:
4818 gcc_unreachable ();
4819 }
4820 }
4821
4822 /* Generate and return a load pair instruction of mode MODE to load register
4823 REG1 from MEM1 and register REG2 from MEM2. */
4824
4825 static rtx
4826 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4827 rtx mem2)
4828 {
4829 switch (mode)
4830 {
4831 case E_DImode:
4832 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4833
4834 case E_DFmode:
4835 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4836
4837 case E_TFmode:
4838 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4839
4840 default:
4841 gcc_unreachable ();
4842 }
4843 }
4844
4845 /* Return TRUE if return address signing should be enabled for the current
4846 function, otherwise return FALSE. */
4847
4848 bool
4849 aarch64_return_address_signing_enabled (void)
4850 {
4851 /* This function should only be called after the frame has been laid out. */
4852 gcc_assert (cfun->machine->frame.laid_out);
4853
4854 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4855 if its LR is pushed onto the stack. */
4856 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4857 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4858 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4859 }
4860
4861 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4862 bool
4863 aarch64_bti_enabled (void)
4864 {
4865 return (aarch64_enable_bti == 1);
4866 }
4867
4868 /* Emit code to save the callee-saved registers from register number START
4869 to LIMIT to the stack at the location starting at offset START_OFFSET,
4870 skipping any write-back candidates if SKIP_WB is true. */
4871
4872 static void
4873 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4874 unsigned start, unsigned limit, bool skip_wb)
4875 {
4876 rtx_insn *insn;
4877 unsigned regno;
4878 unsigned regno2;
4879
4880 for (regno = aarch64_next_callee_save (start, limit);
4881 regno <= limit;
4882 regno = aarch64_next_callee_save (regno + 1, limit))
4883 {
4884 rtx reg, mem;
4885 poly_int64 offset;
4886 int offset_diff;
4887
4888 if (skip_wb
4889 && (regno == cfun->machine->frame.wb_candidate1
4890 || regno == cfun->machine->frame.wb_candidate2))
4891 continue;
4892
4893 if (cfun->machine->reg_is_wrapped_separately[regno])
4894 continue;
4895
4896 reg = gen_rtx_REG (mode, regno);
4897 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4898 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4899 offset));
4900
4901 regno2 = aarch64_next_callee_save (regno + 1, limit);
4902 offset_diff = cfun->machine->frame.reg_offset[regno2]
4903 - cfun->machine->frame.reg_offset[regno];
4904
4905 if (regno2 <= limit
4906 && !cfun->machine->reg_is_wrapped_separately[regno2]
4907 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4908 {
4909 rtx reg2 = gen_rtx_REG (mode, regno2);
4910 rtx mem2;
4911
4912 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4913 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4914 offset));
4915 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4916 reg2));
4917
4918 /* The first part of a frame-related parallel insn is
4919 always assumed to be relevant to the frame
4920 calculations; subsequent parts are only
4921 frame-related if explicitly marked. */
4922 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4923 regno = regno2;
4924 }
4925 else
4926 insn = emit_move_insn (mem, reg);
4927
4928 RTX_FRAME_RELATED_P (insn) = 1;
4929 }
4930 }
4931
4932 /* Emit code to restore the callee registers of mode MODE from register
4933 number START up to and including LIMIT. Restore from the stack offset
4934 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4935 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4936
4937 static void
4938 aarch64_restore_callee_saves (machine_mode mode,
4939 poly_int64 start_offset, unsigned start,
4940 unsigned limit, bool skip_wb, rtx *cfi_ops)
4941 {
4942 rtx base_rtx = stack_pointer_rtx;
4943 unsigned regno;
4944 unsigned regno2;
4945 poly_int64 offset;
4946
4947 for (regno = aarch64_next_callee_save (start, limit);
4948 regno <= limit;
4949 regno = aarch64_next_callee_save (regno + 1, limit))
4950 {
4951 if (cfun->machine->reg_is_wrapped_separately[regno])
4952 continue;
4953
4954 rtx reg, mem;
4955 int offset_diff;
4956
4957 if (skip_wb
4958 && (regno == cfun->machine->frame.wb_candidate1
4959 || regno == cfun->machine->frame.wb_candidate2))
4960 continue;
4961
4962 reg = gen_rtx_REG (mode, regno);
4963 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4964 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4965
4966 regno2 = aarch64_next_callee_save (regno + 1, limit);
4967 offset_diff = cfun->machine->frame.reg_offset[regno2]
4968 - cfun->machine->frame.reg_offset[regno];
4969
4970 if (regno2 <= limit
4971 && !cfun->machine->reg_is_wrapped_separately[regno2]
4972 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4973 {
4974 rtx reg2 = gen_rtx_REG (mode, regno2);
4975 rtx mem2;
4976
4977 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4978 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4979 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4980
4981 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4982 regno = regno2;
4983 }
4984 else
4985 emit_move_insn (reg, mem);
4986 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4987 }
4988 }
4989
4990 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4991 of MODE. */
4992
4993 static inline bool
4994 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4995 {
4996 HOST_WIDE_INT multiple;
4997 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4998 && IN_RANGE (multiple, -8, 7));
4999 }
5000
5001 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5002 of MODE. */
5003
5004 static inline bool
5005 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5006 {
5007 HOST_WIDE_INT multiple;
5008 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5009 && IN_RANGE (multiple, 0, 63));
5010 }
5011
5012 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5013 of MODE. */
5014
5015 bool
5016 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5017 {
5018 HOST_WIDE_INT multiple;
5019 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5020 && IN_RANGE (multiple, -64, 63));
5021 }
5022
5023 /* Return true if OFFSET is a signed 9-bit value. */
5024
5025 bool
5026 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5027 poly_int64 offset)
5028 {
5029 HOST_WIDE_INT const_offset;
5030 return (offset.is_constant (&const_offset)
5031 && IN_RANGE (const_offset, -256, 255));
5032 }
5033
5034 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5035 of MODE. */
5036
5037 static inline bool
5038 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5039 {
5040 HOST_WIDE_INT multiple;
5041 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5042 && IN_RANGE (multiple, -256, 255));
5043 }
5044
5045 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5046 of MODE. */
5047
5048 static inline bool
5049 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5050 {
5051 HOST_WIDE_INT multiple;
5052 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5053 && IN_RANGE (multiple, 0, 4095));
5054 }
5055
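/* To make the ranges above concrete (simple arithmetic, taking DImode and
   its 8-byte size as the example):

     offset_4bit_signed_scaled_p           accepts   -64 ... 56    (step 8)
     offset_6bit_unsigned_scaled_p         accepts     0 ... 504   (step 8)
     aarch64_offset_7bit_signed_scaled_p   accepts  -512 ... 504   (step 8)
     aarch64_offset_9bit_signed_unscaled_p accepts  -256 ... 255   (any byte)
     offset_9bit_signed_scaled_p           accepts -2048 ... 2040  (step 8)
     offset_12bit_unsigned_scaled_p        accepts     0 ... 32760 (step 8)  */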
5056 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5057
5058 static sbitmap
5059 aarch64_get_separate_components (void)
5060 {
5061 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5062 bitmap_clear (components);
5063
5064 /* The registers we need saved to the frame. */
5065 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5066 if (aarch64_register_saved_on_entry (regno))
5067 {
5068 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5069 if (!frame_pointer_needed)
5070 offset += cfun->machine->frame.frame_size
5071 - cfun->machine->frame.hard_fp_offset;
5072 /* Check that we can access the stack slot of the register with one
5073 direct load with no adjustments needed. */
5074 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5075 bitmap_set_bit (components, regno);
5076 }
5077
5078 /* Don't mess with the hard frame pointer. */
5079 if (frame_pointer_needed)
5080 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5081
5082 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5083 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5084 /* If registers have been chosen to be stored/restored with
5085 writeback don't interfere with them to avoid having to output explicit
5086 stack adjustment instructions. */
5087 if (reg2 != INVALID_REGNUM)
5088 bitmap_clear_bit (components, reg2);
5089 if (reg1 != INVALID_REGNUM)
5090 bitmap_clear_bit (components, reg1);
5091
5092 bitmap_clear_bit (components, LR_REGNUM);
5093 bitmap_clear_bit (components, SP_REGNUM);
5094
5095 return components;
5096 }
5097
5098 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5099
5100 static sbitmap
5101 aarch64_components_for_bb (basic_block bb)
5102 {
5103 bitmap in = DF_LIVE_IN (bb);
5104 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5105 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5106 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5107
5108 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5109 bitmap_clear (components);
5110
5111 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5112 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5113 if ((!call_used_regs[regno]
5114 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5115 && (bitmap_bit_p (in, regno)
5116 || bitmap_bit_p (gen, regno)
5117 || bitmap_bit_p (kill, regno)))
5118 {
5119 unsigned regno2, offset, offset2;
5120 bitmap_set_bit (components, regno);
5121
5122 /* If there is a callee-save at an adjacent offset, add it as well
5123 to increase the use of LDP/STP. */
5124 offset = cfun->machine->frame.reg_offset[regno];
5125 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5126
5127 if (regno2 <= LAST_SAVED_REGNUM)
5128 {
5129 offset2 = cfun->machine->frame.reg_offset[regno2];
5130 if ((offset & ~8) == (offset2 & ~8))
5131 bitmap_set_bit (components, regno2);
5132 }
5133 }
5134
5135 return components;
5136 }
5137
5138 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5139 Nothing to do for aarch64. */
5140
5141 static void
5142 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5143 {
5144 }
5145
5146 /* Return the next set bit in BMP from START onwards. Return the total number
5147 of bits in BMP if no set bit is found at or after START. */
5148
5149 static unsigned int
5150 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5151 {
5152 unsigned int nbits = SBITMAP_SIZE (bmp);
5153 if (start == nbits)
5154 return start;
5155
5156 gcc_assert (start < nbits);
5157 for (unsigned int i = start; i < nbits; i++)
5158 if (bitmap_bit_p (bmp, i))
5159 return i;
5160
5161 return nbits;
5162 }
5163
5164 /* Do the work for aarch64_emit_prologue_components and
5165 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5166 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5167 for these components or the epilogue sequence. That is, it determines
5168 whether we should emit stores or loads and what kind of CFA notes to attach
5169 to the insns. Otherwise the logic for the two sequences is very
5170 similar. */
5171
5172 static void
5173 aarch64_process_components (sbitmap components, bool prologue_p)
5174 {
5175 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5176 ? HARD_FRAME_POINTER_REGNUM
5177 : STACK_POINTER_REGNUM);
5178
5179 unsigned last_regno = SBITMAP_SIZE (components);
5180 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5181 rtx_insn *insn = NULL;
5182
5183 while (regno != last_regno)
5184 {
5185 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved,
5186 so DFmode for the vector registers is enough. For simd functions
5187 we want to save the low 128 bits. */
5188 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5189
5190 rtx reg = gen_rtx_REG (mode, regno);
5191 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5192 if (!frame_pointer_needed)
5193 offset += cfun->machine->frame.frame_size
5194 - cfun->machine->frame.hard_fp_offset;
5195 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5196 rtx mem = gen_frame_mem (mode, addr);
5197
5198 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5199 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5200 /* No more registers to handle after REGNO.
5201 Emit a single save/restore and exit. */
5202 if (regno2 == last_regno)
5203 {
5204 insn = emit_insn (set);
5205 RTX_FRAME_RELATED_P (insn) = 1;
5206 if (prologue_p)
5207 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5208 else
5209 add_reg_note (insn, REG_CFA_RESTORE, reg);
5210 break;
5211 }
5212
5213 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5214 /* The next register is not of the same class or its offset is not
5215 mergeable with the current one into a pair. */
5216 if (!satisfies_constraint_Ump (mem)
5217 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5218 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5219 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5220 GET_MODE_SIZE (mode)))
5221 {
5222 insn = emit_insn (set);
5223 RTX_FRAME_RELATED_P (insn) = 1;
5224 if (prologue_p)
5225 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5226 else
5227 add_reg_note (insn, REG_CFA_RESTORE, reg);
5228
5229 regno = regno2;
5230 continue;
5231 }
5232
5233 /* REGNO2 can be saved/restored in a pair with REGNO. */
5234 rtx reg2 = gen_rtx_REG (mode, regno2);
5235 if (!frame_pointer_needed)
5236 offset2 += cfun->machine->frame.frame_size
5237 - cfun->machine->frame.hard_fp_offset;
5238 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5239 rtx mem2 = gen_frame_mem (mode, addr2);
5240 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5241 : gen_rtx_SET (reg2, mem2);
5242
5243 if (prologue_p)
5244 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5245 else
5246 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5247
5248 RTX_FRAME_RELATED_P (insn) = 1;
5249 if (prologue_p)
5250 {
5251 add_reg_note (insn, REG_CFA_OFFSET, set);
5252 add_reg_note (insn, REG_CFA_OFFSET, set2);
5253 }
5254 else
5255 {
5256 add_reg_note (insn, REG_CFA_RESTORE, reg);
5257 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5258 }
5259
5260 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5261 }
5262 }
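/* Illustrative note (added for exposition, not from the original sources):
   if COMPONENTS selects, say, x22 and x23 and their frame slots sit at
   adjacent 8-byte offsets, the loop above merges them into a single STP in
   the prologue or LDP in the epilogue; if the next set register is of a
   different class or its slot is not adjacent, the code falls back to a
   single store or load and continues from that register.  */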
5263
5264 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5265
5266 static void
5267 aarch64_emit_prologue_components (sbitmap components)
5268 {
5269 aarch64_process_components (components, true);
5270 }
5271
5272 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5273
5274 static void
5275 aarch64_emit_epilogue_components (sbitmap components)
5276 {
5277 aarch64_process_components (components, false);
5278 }
5279
5280 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5281
5282 static void
5283 aarch64_set_handled_components (sbitmap components)
5284 {
5285 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5286 if (bitmap_bit_p (components, regno))
5287 cfun->machine->reg_is_wrapped_separately[regno] = true;
5288 }
5289
5290 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
5291 determine the probe offset for alloca. */
5292
5293 static HOST_WIDE_INT
5294 aarch64_stack_clash_protection_alloca_probe_range (void)
5295 {
5296 return STACK_CLASH_CALLER_GUARD;
5297 }
5298
5299
5300 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5301 registers. If POLY_SIZE is not large enough to require a probe, this
5302 function will only adjust the stack. When allocating the stack space,
5303 FRAME_RELATED_P is used to indicate whether the allocation is frame related.
5304 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5305 arguments. If we are, we ensure that any allocation larger than the
5306 ABI-defined buffer is probed, so that the invariant of having a 1KB buffer
5307 is maintained.
5308
5309 We emit barriers after each stack adjustment to prevent optimizations from
5310 breaking the invariant that we never drop the stack by more than a page.
5311 This invariant makes it easier to handle asynchronous events correctly:
5312 if we were allowed to drop the stack by more than a page and emit the
5313 probes for it afterwards, and a signal arrived in between, the signal
5314 handler would not know the state of the stack and could make no
5315 assumptions about which pages had been probed. */
5316
5317 static void
5318 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5319 poly_int64 poly_size,
5320 bool frame_related_p,
5321 bool final_adjustment_p)
5322 {
5323 HOST_WIDE_INT guard_size
5324 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5325 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5326 /* When doing the final adjustment for the outgoing argument size we can't
5327 assume that LR was saved at position 0. So subtract its offset from the
5328 ABI safe buffer so that we don't accidentally allow an adjustment that
5329 would result in an allocation larger than the ABI buffer without
5330 probing. */
5331 HOST_WIDE_INT min_probe_threshold
5332 = final_adjustment_p
5333 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5334 : guard_size - guard_used_by_caller;
5335
5336 poly_int64 frame_size = cfun->machine->frame.frame_size;
5337
5338 /* We should always have a positive probe threshold. */
5339 gcc_assert (min_probe_threshold > 0);
5340
5341 if (flag_stack_clash_protection && !final_adjustment_p)
5342 {
5343 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5344 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5345
5346 if (known_eq (frame_size, 0))
5347 {
5348 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5349 }
5350 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5351 && known_lt (final_adjust, guard_used_by_caller))
5352 {
5353 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5354 }
5355 }
5356
5357 /* If SIZE is not large enough to require probing, just adjust the stack and
5358 exit. */
5359 if (known_lt (poly_size, min_probe_threshold)
5360 || !flag_stack_clash_protection)
5361 {
5362 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5363 return;
5364 }
5365
5366 HOST_WIDE_INT size;
5367 /* Handle the SVE non-constant case first. */
5368 if (!poly_size.is_constant (&size))
5369 {
5370 if (dump_file)
5371 {
5372 fprintf (dump_file, "Stack clash SVE prologue: ");
5373 print_dec (poly_size, dump_file);
5374 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5375 }
5376
5377 /* First calculate the number of bytes we're actually spilling. */
5378 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5379 poly_size, temp1, temp2, false, true);
5380
5381 rtx_insn *insn = get_last_insn ();
5382
5383 if (frame_related_p)
5384 {
5385 /* This is done to provide unwinding information for the stack
5386 adjustments we're about to do; however, to prevent the optimizers
5387 from removing the R11 move and leaving the CFA note (which would be
5388 very wrong), we tie the old and new stack pointers together.
5389 The tie will expand to nothing, but the optimizers will not touch
5390 the instruction. */
5391 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5392 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5393 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5394
5395 /* We want the CFA independent of the stack pointer for the
5396 duration of the loop. */
5397 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5398 RTX_FRAME_RELATED_P (insn) = 1;
5399 }
5400
5401 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5402 rtx guard_const = gen_int_mode (guard_size, Pmode);
5403
5404 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5405 stack_pointer_rtx, temp1,
5406 probe_const, guard_const));
5407
5408 /* Now reset the CFA register if needed. */
5409 if (frame_related_p)
5410 {
5411 add_reg_note (insn, REG_CFA_DEF_CFA,
5412 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5413 gen_int_mode (poly_size, Pmode)));
5414 RTX_FRAME_RELATED_P (insn) = 1;
5415 }
5416
5417 return;
5418 }
5419
5420 if (dump_file)
5421 fprintf (dump_file,
5422 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5423 " bytes, probing will be required.\n", size);
5424
5425 /* Round SIZE down to a multiple of GUARD_SIZE, and calculate the
5426 residual as the difference between the original size and the rounded
5427 size. */
5428 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5429 HOST_WIDE_INT residual = size - rounded_size;
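/* Worked example with hypothetical numbers: for SIZE == 150000 and a
   64KB (65536-byte) guard, ROUNDED_SIZE == 131072 (two full guard-size
   chunks) and RESIDUAL == 18928.  */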
5430
5431 /* We can handle a small number of allocations/probes inline. Otherwise
5432 punt to a loop. */
5433 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5434 {
5435 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5436 {
5437 aarch64_sub_sp (NULL, temp2, guard_size, true);
5438 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5439 guard_used_by_caller));
5440 emit_insn (gen_blockage ());
5441 }
5442 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5443 }
5444 else
5445 {
5446 /* Compute the ending address. */
5447 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5448 temp1, NULL, false, true);
5449 rtx_insn *insn = get_last_insn ();
5450
5451 /* For the initial allocation, we don't have a frame pointer
5452 set up, so we always need CFI notes. If we're doing the
5453 final allocation, then we may have a frame pointer, in which
5454 case it is the CFA, otherwise we need CFI notes.
5455
5456 We can determine which allocation we are doing by looking at
5457 the value of FRAME_RELATED_P since the final allocations are not
5458 frame related. */
5459 if (frame_related_p)
5460 {
5461 /* We want the CFA independent of the stack pointer for the
5462 duration of the loop. */
5463 add_reg_note (insn, REG_CFA_DEF_CFA,
5464 plus_constant (Pmode, temp1, rounded_size));
5465 RTX_FRAME_RELATED_P (insn) = 1;
5466 }
5467
5468 /* This allocates and probes the stack. Note that this re-uses some of
5469 the existing Ada stack protection code. However, we are guaranteed not
5470 to enter the non-loop or residual branches of that code.
5471
5472 The non-loop part won't be entered because if our allocation amount
5473 doesn't require a loop, the case above would handle it.
5474
5475 The residual branch won't be entered because TEMP1 is a multiple of
5476 the allocation size. The residual will always be 0. As such, the only
5477 part we are actually using from that code is the loop setup. The
5478 actual probing is done in aarch64_output_probe_stack_range. */
5479 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5480 stack_pointer_rtx, temp1));
5481
5482 /* Now reset the CFA register if needed. */
5483 if (frame_related_p)
5484 {
5485 add_reg_note (insn, REG_CFA_DEF_CFA,
5486 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5487 RTX_FRAME_RELATED_P (insn) = 1;
5488 }
5489
5490 emit_insn (gen_blockage ());
5491 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5492 }
5493
5494 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5495 be probed. This maintains the requirement that each page is probed at
5496 least once. For initial probing we probe only if the allocation is
5497 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5498 if the amount is larger than buffer; GUARD_SIZE - buffer + buffer ==
5499 GUARD_SIZE. This works because any allocation that is large enough to
5500 trigger a probe here gets at least one, and any allocation that is not
5501 large enough for this code to emit anything will have had its page
5502 probed by the saving of FP/LR, either by this function or by a callee.
5503 If we don't have any callees then we won't have more stack adjustments
5504 and so are still safe. */
5505 if (residual)
5506 {
5507 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5508 /* If we're doing final adjustments, and we've done any full page
5509 allocations then any residual needs to be probed. */
5510 if (final_adjustment_p && rounded_size != 0)
5511 min_probe_threshold = 0;
5512 /* If doing a small final adjustment, we always probe at offset 0.
5513 This is done to avoid issues when LR is not at position 0 or when
5514 the final adjustment is smaller than the probing offset. */
5515 else if (final_adjustment_p && rounded_size == 0)
5516 residual_probe_offset = 0;
5517
5518 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5519 if (residual >= min_probe_threshold)
5520 {
5521 if (dump_file)
5522 fprintf (dump_file,
5523 "Stack clash AArch64 prologue residuals: "
5524 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5525 "\n", residual);
5526
5527 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5528 residual_probe_offset));
5529 emit_insn (gen_blockage ());
5530 }
5531 }
5532 }
5533
5534 /* Return 1 if the register is used by the epilogue. We need to say the
5535 return register is used, but only after epilogue generation is complete.
5536 Note that in the case of sibcalls, the values "used by the epilogue" are
5537 considered live at the start of the called function.
5538
5539 For SIMD functions we need to return 1 for FP registers that are saved and
5540 restored by a function but are not zero in call_used_regs. If we do not do
5541 this, optimizations may remove the restore of the register. */
5542
5543 int
5544 aarch64_epilogue_uses (int regno)
5545 {
5546 if (epilogue_completed)
5547 {
5548 if (regno == LR_REGNUM)
5549 return 1;
5550 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5551 return 1;
5552 }
5553 return 0;
5554 }
5555
5556 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5557 is saved at BASE + OFFSET. */
5558
5559 static void
5560 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5561 rtx base, poly_int64 offset)
5562 {
5563 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5564 add_reg_note (insn, REG_CFA_EXPRESSION,
5565 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5566 }
5567
5568 /* AArch64 stack frames generated by this compiler look like:
5569
5570 +-------------------------------+
5571 | |
5572 | incoming stack arguments |
5573 | |
5574 +-------------------------------+
5575 | | <-- incoming stack pointer (aligned)
5576 | callee-allocated save area |
5577 | for register varargs |
5578 | |
5579 +-------------------------------+
5580 | local variables | <-- frame_pointer_rtx
5581 | |
5582 +-------------------------------+
5583 | padding | \
5584 +-------------------------------+ |
5585 | callee-saved registers | | frame.saved_regs_size
5586 +-------------------------------+ |
5587 | LR' | |
5588 +-------------------------------+ |
5589 | FP' | / <- hard_frame_pointer_rtx (aligned)
5590 +-------------------------------+
5591 | dynamic allocation |
5592 +-------------------------------+
5593 | padding |
5594 +-------------------------------+
5595 | outgoing stack arguments | <-- arg_pointer
5596 | |
5597 +-------------------------------+
5598 | | <-- stack_pointer_rtx (aligned)
5599
5600 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5601 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5602 unchanged.
5603
5604 By default for stack-clash we assume the guard is at least 64KB, but this
5605 value is configurable to either 4KB or 64KB. We also force the guard size to
5606 be the same as the probing interval and both values are kept in sync.
5607
5608 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5609 on the guard size) of stack space without probing.
5610
5611 When probing is needed, we emit a probe at the start of the prologue
5612 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5613
5614 We have to track how much space has been allocated; the only stores
5615 to the stack that we track as implicit probes are the FP/LR stores.
5616
5617 For outgoing arguments we probe if the size is larger than 1KB, such that
5618 the ABI specified buffer is maintained for the next callee.
5619
5620 The following registers are reserved during frame layout and should not be
5621 used for any other purpose:
5622
5623 - r11: Used by stack clash protection when SVE is enabled.
5624 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
5625 - r14 and r15: Used for speculation tracking.
5626 - r16(IP0), r17(IP1): Used by indirect tailcalls.
5627 - r30(LR), r29(FP): Used by standard frame layout.
5628
5629 These registers must be avoided in frame layout related code unless the
5630 explicit intention is to interact with one of the features listed above. */
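/* Worked example of the probing thresholds above (default parameters
   assumed): with a 64KB guard and the 1KB ABI-defined buffer, an initial
   allocation of up to 65536 - 1024 = 64512 bytes (63KB) needs no probe,
   while an outgoing-argument allocation is probed as soon as it exceeds
   1024 bytes.  */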
5631
5632 /* Generate the prologue instructions for entry into a function.
5633 Establish the stack frame by decreasing the stack pointer with a
5634 properly calculated size and, if necessary, create a frame record
5635 filled with the values of LR and previous frame pointer. The
5636 current FP is also set up if it is in use. */
5637
5638 void
5639 aarch64_expand_prologue (void)
5640 {
5641 poly_int64 frame_size = cfun->machine->frame.frame_size;
5642 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5643 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5644 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5645 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5646 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5647 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5648 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5649 rtx_insn *insn;
5650
5651 /* Sign return address for functions. */
5652 if (aarch64_return_address_signing_enabled ())
5653 {
5654 insn = emit_insn (gen_pacisp ());
5655 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5656 RTX_FRAME_RELATED_P (insn) = 1;
5657 }
5658
5659 if (flag_stack_usage_info)
5660 current_function_static_stack_size = constant_lower_bound (frame_size);
5661
5662 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5663 {
5664 if (crtl->is_leaf && !cfun->calls_alloca)
5665 {
5666 if (maybe_gt (frame_size, PROBE_INTERVAL)
5667 && maybe_gt (frame_size, get_stack_check_protect ()))
5668 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5669 (frame_size
5670 - get_stack_check_protect ()));
5671 }
5672 else if (maybe_gt (frame_size, 0))
5673 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5674 }
5675
5676 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5677 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5678
5679 /* In theory we should never have both an initial adjustment
5680 and a callee save adjustment. Verify that is the case since the
5681 code below does not handle it for -fstack-clash-protection. */
5682 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5683
5684 /* Will only probe if the initial adjustment is larger than the guard
5685 less the amount of the guard reserved for use by the caller's
5686 outgoing args. */
5687 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5688 true, false);
5689
5690 if (callee_adjust != 0)
5691 aarch64_push_regs (reg1, reg2, callee_adjust);
5692
5693 if (emit_frame_chain)
5694 {
5695 poly_int64 reg_offset = callee_adjust;
5696 if (callee_adjust == 0)
5697 {
5698 reg1 = R29_REGNUM;
5699 reg2 = R30_REGNUM;
5700 reg_offset = callee_offset;
5701 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5702 }
5703 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5704 stack_pointer_rtx, callee_offset,
5705 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5706 if (frame_pointer_needed && !frame_size.is_constant ())
5707 {
5708 /* Variable-sized frames need to describe the save slot
5709 address using DW_CFA_expression rather than DW_CFA_offset.
5710 This means that, without taking further action, the
5711 locations of the registers that we've already saved would
5712 remain based on the stack pointer even after we redefine
5713 the CFA based on the frame pointer. We therefore need new
5714 DW_CFA_expressions to re-express the save slots with addresses
5715 based on the frame pointer. */
5716 rtx_insn *insn = get_last_insn ();
5717 gcc_assert (RTX_FRAME_RELATED_P (insn));
5718
5719 /* Add an explicit CFA definition if this was previously
5720 implicit. */
5721 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5722 {
5723 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5724 callee_offset);
5725 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5726 gen_rtx_SET (hard_frame_pointer_rtx, src));
5727 }
5728
5729 /* Change the save slot expressions for the registers that
5730 we've already saved. */
5731 reg_offset -= callee_offset;
5732 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5733 reg_offset + UNITS_PER_WORD);
5734 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5735 reg_offset);
5736 }
5737 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5738 }
5739
5740 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5741 callee_adjust != 0 || emit_frame_chain);
5742 if (aarch64_simd_decl_p (cfun->decl))
5743 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5744 callee_adjust != 0 || emit_frame_chain);
5745 else
5746 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5747 callee_adjust != 0 || emit_frame_chain);
5748
5749 /* We may need to probe the final adjustment if it is larger than the guard
5750 that is assumed by the callee. */
5751 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5752 !frame_pointer_needed, true);
5753 }
5754
5755 /* Return TRUE if we can use a simple_return insn.
5756
5757 This function checks whether the callee-saved stack is empty, which
5758 means no restore actions are needed. The pro_and_epilogue pass will use
5759 this to check whether the shrink-wrapping optimization is feasible. */
5760
5761 bool
5762 aarch64_use_return_insn_p (void)
5763 {
5764 if (!reload_completed)
5765 return false;
5766
5767 if (crtl->profile)
5768 return false;
5769
5770 return known_eq (cfun->machine->frame.frame_size, 0);
5771 }
5772
5773 /* Return false for non-leaf SIMD functions in order to avoid
5774 shrink-wrapping them, since doing so would lose the necessary
5775 save/restore of FP registers. */
5776
5777 bool
5778 aarch64_use_simple_return_insn_p (void)
5779 {
5780 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5781 return false;
5782
5783 return true;
5784 }
5785
5786 /* Generate the epilogue instructions for returning from a function.
5787 This is almost exactly the reverse of the prolog sequence, except
5788 that we need to insert barriers to avoid scheduling loads that read
5789 from a deallocated stack, and we optimize the unwind records by
5790 emitting them all together if possible. */
5791 void
5792 aarch64_expand_epilogue (bool for_sibcall)
5793 {
5794 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5795 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5796 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5797 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5798 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5799 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5800 rtx cfi_ops = NULL;
5801 rtx_insn *insn;
5802 /* A stack clash protection prologue may not have left EP0_REGNUM or
5803 EP1_REGNUM in a usable state. The same is true for allocations
5804 with an SVE component, since we then need both temporary registers
5805 for each allocation. For stack clash we are in a usable state if
5806 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5807 HOST_WIDE_INT guard_size
5808 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5809 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5810
5811 /* We can re-use the registers when the allocation amount is smaller than
5812 guard_size - guard_used_by_caller because we won't be doing any probes
5813 then. In such situations the register should remain live with the correct
5814 value. */
5815 bool can_inherit_p = (initial_adjust.is_constant ()
5816 && final_adjust.is_constant ())
5817 && (!flag_stack_clash_protection
5818 || known_lt (initial_adjust,
5819 guard_size - guard_used_by_caller));
5820
5821 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
5822 bool need_barrier_p
5823 = maybe_ne (get_frame_size ()
5824 + cfun->machine->frame.saved_varargs_size, 0);
5825
5826 /* Emit a barrier to prevent loads from a deallocated stack. */
5827 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5828 || cfun->calls_alloca
5829 || crtl->calls_eh_return)
5830 {
5831 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5832 need_barrier_p = false;
5833 }
5834
5835 /* Restore the stack pointer from the frame pointer if it may not
5836 be the same as the stack pointer. */
5837 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5838 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5839 if (frame_pointer_needed
5840 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5841 /* If writeback is used when restoring callee-saves, the CFA
5842 is restored on the instruction doing the writeback. */
5843 aarch64_add_offset (Pmode, stack_pointer_rtx,
5844 hard_frame_pointer_rtx, -callee_offset,
5845 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5846 else
5847 /* The case where we need to re-use the register here is very rare, so
5848 avoid the complicated condition and just always emit a move if the
5849 immediate doesn't fit. */
5850 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5851
5852 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5853 callee_adjust != 0, &cfi_ops);
5854 if (aarch64_simd_decl_p (cfun->decl))
5855 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5856 callee_adjust != 0, &cfi_ops);
5857 else
5858 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5859 callee_adjust != 0, &cfi_ops);
5860
5861 if (need_barrier_p)
5862 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5863
5864 if (callee_adjust != 0)
5865 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5866
5867 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5868 {
5869 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5870 insn = get_last_insn ();
5871 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5872 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5873 RTX_FRAME_RELATED_P (insn) = 1;
5874 cfi_ops = NULL;
5875 }
5876
5877 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
5878 restrict the emit_move optimization to leaf functions. */
5879 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5880 (!can_inherit_p || !crtl->is_leaf
5881 || df_regs_ever_live_p (EP0_REGNUM)));
5882
5883 if (cfi_ops)
5884 {
5885 /* Emit delayed restores and reset the CFA to be SP. */
5886 insn = get_last_insn ();
5887 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5888 REG_NOTES (insn) = cfi_ops;
5889 RTX_FRAME_RELATED_P (insn) = 1;
5890 }
5891
5892 /* We prefer to emit the combined return/authenticate instruction RETAA,
5893 however there are three cases in which we must instead emit an explicit
5894 authentication instruction.
5895
5896 1) Sibcalls don't return in a normal way, so if we're about to call one
5897 we must authenticate.
5898
5899 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5900 generating code for !TARGET_ARMV8_3 we can't use it and must
5901 explicitly authenticate.
5902
5903 3) On an eh_return path we make extra stack adjustments to update the
5904 canonical frame address to be the exception handler's CFA. We want
5905 to authenticate using the CFA of the function which calls eh_return.
5906 */
5907 if (aarch64_return_address_signing_enabled ()
5908 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5909 {
5910 insn = emit_insn (gen_autisp ());
5911 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5912 RTX_FRAME_RELATED_P (insn) = 1;
5913 }
5914
5915 /* Stack adjustment for exception handler. */
5916 if (crtl->calls_eh_return)
5917 {
5918 /* We need to unwind the stack by the offset computed by
5919 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5920 to be SP; letting the CFA move during this adjustment
5921 is just as correct as retaining the CFA from the body
5922 of the function. Therefore, do nothing special. */
5923 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5924 }
5925
5926 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5927 if (!for_sibcall)
5928 emit_jump_insn (ret_rtx);
5929 }
5930
5931 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5932 normally or return to a previous frame after unwinding.
5933
5934 An EH return uses a single shared return sequence. The epilogue is
5935 exactly like a normal epilogue except that it has an extra input
5936 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5937 that must be applied after the frame has been destroyed. An extra label
5938 is inserted before the epilogue which initializes this register to zero,
5939 and this is the entry point for a normal return.
5940
5941 An actual EH return updates the return address, initializes the stack
5942 adjustment and jumps directly into the epilogue (bypassing the zeroing
5943 of the adjustment). Since the return address is typically saved on the
5944 stack when a function makes a call, the saved LR must be updated outside
5945 the epilogue.
5946
5947 This poses problems as the store is generated well before the epilogue,
5948 so the offset of LR is not known yet. Also optimizations will remove the
5949 store as it appears dead, even after the epilogue is generated (as the
5950 base or offset for loading LR is different in many cases).
5951
5952 To avoid these problems this implementation forces the frame pointer
5953 in eh_return functions so that the location of LR is fixed and known early.
5954 It also marks the store volatile, so no optimization is permitted to
5955 remove the store. */
5956 rtx
5957 aarch64_eh_return_handler_rtx (void)
5958 {
5959 rtx tmp = gen_frame_mem (Pmode,
5960 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5961
5962 /* Mark the store volatile, so no optimization is permitted to remove it. */
5963 MEM_VOLATILE_P (tmp) = true;
5964 return tmp;
5965 }
5966
5967 /* Output code to add DELTA to the first argument, and then jump
5968 to FUNCTION. Used for C++ multiple inheritance. */
5969 static void
5970 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5971 HOST_WIDE_INT delta,
5972 HOST_WIDE_INT vcall_offset,
5973 tree function)
5974 {
5975 /* The this pointer is always in x0. Note that this differs from
5976 Arm where the this pointer may be bumped to r1 if r0 is required
5977 to return a pointer to an aggregate. On AArch64 a result value
5978 pointer will be in x8. */
5979 int this_regno = R0_REGNUM;
5980 rtx this_rtx, temp0, temp1, addr, funexp;
5981 rtx_insn *insn;
5982
5983 if (aarch64_bti_enabled ())
5984 emit_insn (gen_bti_c ());
5985
5986 reload_completed = 1;
5987 emit_note (NOTE_INSN_PROLOGUE_END);
5988
5989 this_rtx = gen_rtx_REG (Pmode, this_regno);
5990 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
5991 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
5992
5993 if (vcall_offset == 0)
5994 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5995 else
5996 {
5997 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5998
5999 addr = this_rtx;
6000 if (delta != 0)
6001 {
6002 if (delta >= -256 && delta < 256)
6003 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6004 plus_constant (Pmode, this_rtx, delta));
6005 else
6006 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6007 temp1, temp0, false);
6008 }
6009
6010 if (Pmode == ptr_mode)
6011 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6012 else
6013 aarch64_emit_move (temp0,
6014 gen_rtx_ZERO_EXTEND (Pmode,
6015 gen_rtx_MEM (ptr_mode, addr)));
6016
6017 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6018 addr = plus_constant (Pmode, temp0, vcall_offset);
6019 else
6020 {
6021 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6022 Pmode);
6023 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6024 }
6025
6026 if (Pmode == ptr_mode)
6027 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6028 else
6029 aarch64_emit_move (temp1,
6030 gen_rtx_SIGN_EXTEND (Pmode,
6031 gen_rtx_MEM (ptr_mode, addr)));
6032
6033 emit_insn (gen_add2_insn (this_rtx, temp1));
6034 }
6035
6036 /* Generate a tail call to the target function. */
6037 if (!TREE_USED (function))
6038 {
6039 assemble_external (function);
6040 TREE_USED (function) = 1;
6041 }
6042 funexp = XEXP (DECL_RTL (function), 0);
6043 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6044 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6045 SIBLING_CALL_P (insn) = 1;
6046
6047 insn = get_insns ();
6048 shorten_branches (insn);
6049 final_start_function (insn, file, 1);
6050 final (insn, file, 1);
6051 final_end_function ();
6052
6053 /* Stop pretending to be a post-reload pass. */
6054 reload_completed = 0;
6055 }
6056
6057 static bool
6058 aarch64_tls_referenced_p (rtx x)
6059 {
6060 if (!TARGET_HAVE_TLS)
6061 return false;
6062 subrtx_iterator::array_type array;
6063 FOR_EACH_SUBRTX (iter, array, x, ALL)
6064 {
6065 const_rtx x = *iter;
6066 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6067 return true;
6068 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6069 TLS offsets, not real symbol references. */
6070 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6071 iter.skip_subrtxes ();
6072 }
6073 return false;
6074 }
6075
6076
6077 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6078 a left shift of 0 or 12 bits. */
6079 bool
6080 aarch64_uimm12_shift (HOST_WIDE_INT val)
6081 {
6082 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6083 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6084 );
6085 }
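/* A minimal standalone sketch (not part of this file) restating the check
   above for illustration: a value is accepted when all of its set bits fit
   in bit positions [0,11] or all fit in [12,23].  The helper name below is
   made up for the example.  */

#include <assert.h>
#include <stdint.h>

static int uimm12_shift_example (int64_t val)
{
  return (val & (int64_t) 0xfff) == val
         || (val & ((int64_t) 0xfff << 12)) == val;
}

int main (void)
{
  assert (uimm12_shift_example (0xabc));     /* Fits with a shift of 0.  */
  assert (uimm12_shift_example (0xabc000));  /* Fits with a shift of 12.  */
  assert (!uimm12_shift_example (0xabc001)); /* Set bits in both halves.  */
  return 0;
}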
6086
6087 /* Return the largest value no greater than VAL that will fit as a 12-bit
6088 unsigned immediate created with a left shift of 0 or 12. */
6089 static HOST_WIDE_INT
6090 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6091 {
6092 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6093 handle correctly. */
6094 gcc_assert ((val & 0xffffff) == val);
6095
6096 if (((val & 0xfff) << 0) == val)
6097 return val;
6098
6099 return val & (0xfff << 12);
6100 }
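/* For example (hypothetical value): 0x123456 does not fit in the low 12
   bits, so the function above returns 0x123456 & (0xfff << 12), i.e.
   0x123000, dropping the low bits.  */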
6101
6102 /* Return true if val is an immediate that can be loaded into a
6103 register by a MOVZ instruction. */
6104 static bool
6105 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6106 {
6107 if (GET_MODE_SIZE (mode) > 4)
6108 {
6109 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6110 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6111 return 1;
6112 }
6113 else
6114 {
6115 /* Ignore sign extension. */
6116 val &= (HOST_WIDE_INT) 0xffffffff;
6117 }
6118 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6119 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6120 }
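/* Illustrative examples (not from the original sources): for DImode,
   0x0000ffff00000000 is accepted because all of its set bits lie in the
   16-bit field at position 32 (a single MOVZ with LSL #32 can build it),
   whereas 0x0000000100010000 is rejected because its set bits span two
   16-bit fields.  */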
6121
6122 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6123 64-bit (DImode) integer. */
6124
6125 static unsigned HOST_WIDE_INT
6126 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6127 {
6128 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6129 while (size < 64)
6130 {
6131 val &= (HOST_WIDE_INT_1U << size) - 1;
6132 val |= val << size;
6133 size *= 2;
6134 }
6135 return val;
6136 }
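/* For example: replicating the 16-bit (HImode) value 0x1234 with the loop
   above yields 0x1234123412341234.  */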
6137
6138 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6139
6140 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6141 {
6142 0x0000000100000001ull,
6143 0x0001000100010001ull,
6144 0x0101010101010101ull,
6145 0x1111111111111111ull,
6146 0x5555555555555555ull,
6147 };
6148
6149
6150 /* Return true if val is a valid bitmask immediate. */
6151
6152 bool
6153 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6154 {
6155 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6156 int bits;
6157
6158 /* Check for a single sequence of one bits and return quickly if so.
6159 The special cases of all ones and all zeroes return false. */
6160 val = aarch64_replicate_bitmask_imm (val_in, mode);
6161 tmp = val + (val & -val);
6162
6163 if (tmp == (tmp & -tmp))
6164 return (val + 1) > 1;
6165
6166 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6167 if (mode == SImode)
6168 val = (val << 32) | (val & 0xffffffff);
6169
6170 /* Invert if the immediate doesn't start with a zero bit - this means we
6171 only need to search for sequences of one bits. */
6172 if (val & 1)
6173 val = ~val;
6174
6175 /* Find the first set bit and set tmp to val with the first sequence of one
6176 bits removed. Return success if there is a single sequence of ones. */
6177 first_one = val & -val;
6178 tmp = val & (val + first_one);
6179
6180 if (tmp == 0)
6181 return true;
6182
6183 /* Find the next set bit and compute the difference in bit position. */
6184 next_one = tmp & -tmp;
6185 bits = clz_hwi (first_one) - clz_hwi (next_one);
6186 mask = val ^ tmp;
6187
6188 /* Check the bit position difference is a power of 2, and that the first
6189 sequence of one bits fits within 'bits' bits. */
6190 if ((mask >> bits) != 0 || bits != (bits & -bits))
6191 return false;
6192
6193 /* Check the sequence of one bits is repeated 64/bits times. */
6194 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6195 }
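/* Worked examples (added for illustration): the DImode value 0xff00 is a
   single run of ones, so the quick check succeeds (TMP becomes 0x10000, a
   power of two) and the value is accepted.  The repeating pattern
   0x0f0f0f0f0f0f0f0f starts with a one bit and is therefore inverted to
   0xf0f0f0f0f0f0f0f0; the run width works out to BITS == 8 with
   MASK == 0xf0, and multiplying by bitmask_imm_mul[2] reproduces the
   inverted value, so it is accepted too.  0 and ~0 are rejected by the
   quick check.  */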
6196
6197 /* Create a mask of ones covering the range from the lowest to the highest
6198 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
6199
6200 unsigned HOST_WIDE_INT
6201 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6202 {
6203 int lowest_bit_set = ctz_hwi (val_in);
6204 int highest_bit_set = floor_log2 (val_in);
6205 gcc_assert (val_in != 0);
6206
6207 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6208 (HOST_WIDE_INT_1U << lowest_bit_set));
6209 }
6210
6211 /* Create a constant in which all bits outside the range from the lowest
6212 to the highest bit set in VAL_IN are set to 1. */
6213
6214 unsigned HOST_WIDE_INT
6215 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6216 {
6217 return val_in | ~aarch64_and_split_imm1 (val_in);
6218 }
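/* Worked example (illustrative value): for VAL_IN == 0x00ff0f00 the lowest
   and highest set bits are 8 and 23, so aarch64_and_split_imm1 returns
   0x00ffff00 and aarch64_and_split_imm2 returns 0xffffffffffff0fff.
   Since imm1 & imm2 == VAL_IN, ANDing with the two masks in sequence is
   equivalent to ANDing with the original value.  */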
6219
6220 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6221
6222 bool
6223 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6224 {
6225 scalar_int_mode int_mode;
6226 if (!is_a <scalar_int_mode> (mode, &int_mode))
6227 return false;
6228
6229 if (aarch64_bitmask_imm (val_in, int_mode))
6230 return false;
6231
6232 if (aarch64_move_imm (val_in, int_mode))
6233 return false;
6234
6235 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6236
6237 return aarch64_bitmask_imm (imm2, int_mode);
6238 }
6239
6240 /* Return true if val is an immediate that can be loaded into a
6241 register in a single instruction. */
6242 bool
6243 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6244 {
6245 scalar_int_mode int_mode;
6246 if (!is_a <scalar_int_mode> (mode, &int_mode))
6247 return false;
6248
6249 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6250 return 1;
6251 return aarch64_bitmask_imm (val, int_mode);
6252 }
6253
6254 static bool
6255 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6256 {
6257 rtx base, offset;
6258
6259 if (GET_CODE (x) == HIGH)
6260 return true;
6261
6262 /* There's no way to calculate VL-based values using relocations. */
6263 subrtx_iterator::array_type array;
6264 FOR_EACH_SUBRTX (iter, array, x, ALL)
6265 if (GET_CODE (*iter) == CONST_POLY_INT)
6266 return true;
6267
6268 split_const (x, &base, &offset);
6269 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6270 {
6271 if (aarch64_classify_symbol (base, INTVAL (offset))
6272 != SYMBOL_FORCE_TO_MEM)
6273 return true;
6274 else
6275 /* Avoid generating a 64-bit relocation in ILP32; leave
6276 to aarch64_expand_mov_immediate to handle it properly. */
6277 return mode != ptr_mode;
6278 }
6279
6280 return aarch64_tls_referenced_p (x);
6281 }
6282
6283 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6284 The expansion for a table switch is quite expensive due to the number
6285 of instructions, the table lookup and the hard-to-predict indirect jump.
6286 When optimizing for speed with -O3 enabled, use the per-core tuning if
6287 set, otherwise use tables for > 16 cases as a tradeoff between size and
6288 performance. When optimizing for size, use the default setting. */
6289
6290 static unsigned int
6291 aarch64_case_values_threshold (void)
6292 {
6293 /* Use the specified limit for the number of cases before using jump
6294 tables at higher optimization levels. */
6295 if (optimize > 2
6296 && selected_cpu->tune->max_case_values != 0)
6297 return selected_cpu->tune->max_case_values;
6298 else
6299 return optimize_size ? default_case_values_threshold () : 17;
6300 }
6301
6302 /* Return true if register REGNO is a valid index register.
6303 STRICT_P is true if REG_OK_STRICT is in effect. */
6304
6305 bool
6306 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6307 {
6308 if (!HARD_REGISTER_NUM_P (regno))
6309 {
6310 if (!strict_p)
6311 return true;
6312
6313 if (!reg_renumber)
6314 return false;
6315
6316 regno = reg_renumber[regno];
6317 }
6318 return GP_REGNUM_P (regno);
6319 }
6320
6321 /* Return true if register REGNO is a valid base register for mode MODE.
6322 STRICT_P is true if REG_OK_STRICT is in effect. */
6323
6324 bool
6325 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6326 {
6327 if (!HARD_REGISTER_NUM_P (regno))
6328 {
6329 if (!strict_p)
6330 return true;
6331
6332 if (!reg_renumber)
6333 return false;
6334
6335 regno = reg_renumber[regno];
6336 }
6337
6338 /* The fake registers will be eliminated to either the stack or
6339 hard frame pointer, both of which are usually valid base registers.
6340 Reload deals with the cases where the eliminated form isn't valid. */
6341 return (GP_REGNUM_P (regno)
6342 || regno == SP_REGNUM
6343 || regno == FRAME_POINTER_REGNUM
6344 || regno == ARG_POINTER_REGNUM);
6345 }
6346
6347 /* Return true if X is a valid base register for mode MODE.
6348 STRICT_P is true if REG_OK_STRICT is in effect. */
6349
6350 static bool
6351 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6352 {
6353 if (!strict_p
6354 && GET_CODE (x) == SUBREG
6355 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6356 x = SUBREG_REG (x);
6357
6358 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6359 }
6360
6361 /* Return true if address offset is a valid index. If it is, fill in INFO
6362 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6363
6364 static bool
6365 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6366 machine_mode mode, bool strict_p)
6367 {
6368 enum aarch64_address_type type;
6369 rtx index;
6370 int shift;
6371
6372 /* (reg:P) */
6373 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6374 && GET_MODE (x) == Pmode)
6375 {
6376 type = ADDRESS_REG_REG;
6377 index = x;
6378 shift = 0;
6379 }
6380 /* (sign_extend:DI (reg:SI)) */
6381 else if ((GET_CODE (x) == SIGN_EXTEND
6382 || GET_CODE (x) == ZERO_EXTEND)
6383 && GET_MODE (x) == DImode
6384 && GET_MODE (XEXP (x, 0)) == SImode)
6385 {
6386 type = (GET_CODE (x) == SIGN_EXTEND)
6387 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6388 index = XEXP (x, 0);
6389 shift = 0;
6390 }
6391 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6392 else if (GET_CODE (x) == MULT
6393 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6394 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6395 && GET_MODE (XEXP (x, 0)) == DImode
6396 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6397 && CONST_INT_P (XEXP (x, 1)))
6398 {
6399 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6400 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6401 index = XEXP (XEXP (x, 0), 0);
6402 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6403 }
6404 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6405 else if (GET_CODE (x) == ASHIFT
6406 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6407 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6408 && GET_MODE (XEXP (x, 0)) == DImode
6409 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6410 && CONST_INT_P (XEXP (x, 1)))
6411 {
6412 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6413 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6414 index = XEXP (XEXP (x, 0), 0);
6415 shift = INTVAL (XEXP (x, 1));
6416 }
6417 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6418 else if ((GET_CODE (x) == SIGN_EXTRACT
6419 || GET_CODE (x) == ZERO_EXTRACT)
6420 && GET_MODE (x) == DImode
6421 && GET_CODE (XEXP (x, 0)) == MULT
6422 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6423 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6424 {
6425 type = (GET_CODE (x) == SIGN_EXTRACT)
6426 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6427 index = XEXP (XEXP (x, 0), 0);
6428 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6429 if (INTVAL (XEXP (x, 1)) != 32 + shift
6430 || INTVAL (XEXP (x, 2)) != 0)
6431 shift = -1;
6432 }
6433 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6434 (const_int 0xffffffff<<shift)) */
6435 else if (GET_CODE (x) == AND
6436 && GET_MODE (x) == DImode
6437 && GET_CODE (XEXP (x, 0)) == MULT
6438 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6439 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6440 && CONST_INT_P (XEXP (x, 1)))
6441 {
6442 type = ADDRESS_REG_UXTW;
6443 index = XEXP (XEXP (x, 0), 0);
6444 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6445 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6446 shift = -1;
6447 }
6448 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6449 else if ((GET_CODE (x) == SIGN_EXTRACT
6450 || GET_CODE (x) == ZERO_EXTRACT)
6451 && GET_MODE (x) == DImode
6452 && GET_CODE (XEXP (x, 0)) == ASHIFT
6453 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6454 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6455 {
6456 type = (GET_CODE (x) == SIGN_EXTRACT)
6457 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6458 index = XEXP (XEXP (x, 0), 0);
6459 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6460 if (INTVAL (XEXP (x, 1)) != 32 + shift
6461 || INTVAL (XEXP (x, 2)) != 0)
6462 shift = -1;
6463 }
6464 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6465 (const_int 0xffffffff<<shift)) */
6466 else if (GET_CODE (x) == AND
6467 && GET_MODE (x) == DImode
6468 && GET_CODE (XEXP (x, 0)) == ASHIFT
6469 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6470 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6471 && CONST_INT_P (XEXP (x, 1)))
6472 {
6473 type = ADDRESS_REG_UXTW;
6474 index = XEXP (XEXP (x, 0), 0);
6475 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6476 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6477 shift = -1;
6478 }
6479 /* (mult:P (reg:P) (const_int scale)) */
6480 else if (GET_CODE (x) == MULT
6481 && GET_MODE (x) == Pmode
6482 && GET_MODE (XEXP (x, 0)) == Pmode
6483 && CONST_INT_P (XEXP (x, 1)))
6484 {
6485 type = ADDRESS_REG_REG;
6486 index = XEXP (x, 0);
6487 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6488 }
6489 /* (ashift:P (reg:P) (const_int shift)) */
6490 else if (GET_CODE (x) == ASHIFT
6491 && GET_MODE (x) == Pmode
6492 && GET_MODE (XEXP (x, 0)) == Pmode
6493 && CONST_INT_P (XEXP (x, 1)))
6494 {
6495 type = ADDRESS_REG_REG;
6496 index = XEXP (x, 0);
6497 shift = INTVAL (XEXP (x, 1));
6498 }
6499 else
6500 return false;
6501
6502 if (!strict_p
6503 && GET_CODE (index) == SUBREG
6504 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6505 index = SUBREG_REG (index);
6506
6507 if (aarch64_sve_data_mode_p (mode))
6508 {
6509 if (type != ADDRESS_REG_REG
6510 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6511 return false;
6512 }
6513 else
6514 {
6515 if (shift != 0
6516 && !(IN_RANGE (shift, 1, 3)
6517 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6518 return false;
6519 }
6520
6521 if (REG_P (index)
6522 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6523 {
6524 info->type = type;
6525 info->offset = index;
6526 info->shift = shift;
6527 return true;
6528 }
6529
6530 return false;
6531 }
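/* Illustrative example (assuming Pmode is DImode): for a DImode access,
   an index of the form (ashift:DI (reg:DI Xm) (const_int 3)) is accepted
   with SHIFT == 3, since 1 << 3 matches the 8-byte access size; a shift of
   2 would be rejected for DImode by the size check above.  */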
6532
6533 /* Return true if MODE is one of the modes for which we
6534 support LDP/STP operations. */
6535
6536 static bool
6537 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6538 {
6539 return mode == SImode || mode == DImode
6540 || mode == SFmode || mode == DFmode
6541 || (aarch64_vector_mode_supported_p (mode)
6542 && (known_eq (GET_MODE_SIZE (mode), 8)
6543 || (known_eq (GET_MODE_SIZE (mode), 16)
6544 && (aarch64_tune_params.extra_tuning_flags
6545 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6546 }
6547
6548 /* Return true if REGNO is a virtual pointer register, or an eliminable
6549 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6550 include stack_pointer or hard_frame_pointer. */
6551 static bool
6552 virt_or_elim_regno_p (unsigned regno)
6553 {
6554 return ((regno >= FIRST_VIRTUAL_REGISTER
6555 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6556 || regno == FRAME_POINTER_REGNUM
6557 || regno == ARG_POINTER_REGNUM);
6558 }
6559
6560 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6561 If it is, fill in INFO appropriately. STRICT_P is true if
6562 REG_OK_STRICT is in effect. */
6563
6564 bool
6565 aarch64_classify_address (struct aarch64_address_info *info,
6566 rtx x, machine_mode mode, bool strict_p,
6567 aarch64_addr_query_type type)
6568 {
6569 enum rtx_code code = GET_CODE (x);
6570 rtx op0, op1;
6571 poly_int64 offset;
6572
6573 HOST_WIDE_INT const_size;
6574
6575 /* On BE, we use load/store pair for all large int mode load/stores.
6576 TI/TFmode may also use a load/store pair. */
6577 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6578 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6579 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6580 || type == ADDR_QUERY_LDP_STP_N
6581 || mode == TImode
6582 || mode == TFmode
6583 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6584
6585 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
6586 corresponds to the actual size of the memory being loaded/stored and the
6587 mode used for the addressing calculation is half of that size. */
6588 if (type == ADDR_QUERY_LDP_STP_N
6589 && known_eq (GET_MODE_SIZE (mode), 16))
6590 mode = DFmode;
6591
6592 bool allow_reg_index_p = (!load_store_pair_p
6593 && (known_lt (GET_MODE_SIZE (mode), 16)
6594 || vec_flags == VEC_ADVSIMD
6595 || vec_flags == VEC_SVE_DATA));
6596
6597 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6598 [Rn, #offset, MUL VL]. */
6599 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6600 && (code != REG && code != PLUS))
6601 return false;
6602
6603 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6604 REG addressing. */
6605 if (advsimd_struct_p
6606 && !BYTES_BIG_ENDIAN
6607 && (code != POST_INC && code != REG))
6608 return false;
6609
6610 gcc_checking_assert (GET_MODE (x) == VOIDmode
6611 || SCALAR_INT_MODE_P (GET_MODE (x)));
6612
6613 switch (code)
6614 {
6615 case REG:
6616 case SUBREG:
6617 info->type = ADDRESS_REG_IMM;
6618 info->base = x;
6619 info->offset = const0_rtx;
6620 info->const_offset = 0;
6621 return aarch64_base_register_rtx_p (x, strict_p);
6622
6623 case PLUS:
6624 op0 = XEXP (x, 0);
6625 op1 = XEXP (x, 1);
6626
6627 if (! strict_p
6628 && REG_P (op0)
6629 && virt_or_elim_regno_p (REGNO (op0))
6630 && poly_int_rtx_p (op1, &offset))
6631 {
6632 info->type = ADDRESS_REG_IMM;
6633 info->base = op0;
6634 info->offset = op1;
6635 info->const_offset = offset;
6636
6637 return true;
6638 }
6639
6640 if (maybe_ne (GET_MODE_SIZE (mode), 0)
6641 && aarch64_base_register_rtx_p (op0, strict_p)
6642 && poly_int_rtx_p (op1, &offset))
6643 {
6644 info->type = ADDRESS_REG_IMM;
6645 info->base = op0;
6646 info->offset = op1;
6647 info->const_offset = offset;
6648
6649 /* TImode and TFmode values are allowed in both pairs of X
6650 registers and individual Q registers. The available
6651 address modes are:
6652 X,X: 7-bit signed scaled offset
6653 Q: 9-bit signed offset
6654 We conservatively require an offset representable in either mode.
6655 When performing the check for pairs of X registers i.e. LDP/STP
6656 pass down DImode since that is the natural size of the LDP/STP
6657 instruction memory accesses. */
6658 if (mode == TImode || mode == TFmode)
6659 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6660 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6661 || offset_12bit_unsigned_scaled_p (mode, offset)));
6662
6663 /* A 7-bit offset check because OImode will emit an ldp/stp
6664 instruction (only big endian will get here).
6665 For ldp/stp instructions, the offset is scaled for the size of a
6666 single element of the pair. */
6667 if (mode == OImode)
6668 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6669
6670 /* Three 9/12-bit offset checks because CImode will emit three
6671 ldr/str instructions (only big endian will get here). */
6672 if (mode == CImode)
6673 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6674 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6675 offset + 32)
6676 || offset_12bit_unsigned_scaled_p (V16QImode,
6677 offset + 32)));
6678
6679 /* Two 7-bit offset checks because XImode will emit two ldp/stp
6680 instructions (only big endian will get here). */
6681 if (mode == XImode)
6682 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6683 && aarch64_offset_7bit_signed_scaled_p (TImode,
6684 offset + 32));
6685
6686 /* Make "m" use the LD1 offset range for SVE data modes, so
6687 that pre-RTL optimizers like ivopts will work with that range
6688 instead of the wider LDR/STR range. */
6689 if (vec_flags == VEC_SVE_DATA)
6690 return (type == ADDR_QUERY_M
6691 ? offset_4bit_signed_scaled_p (mode, offset)
6692 : offset_9bit_signed_scaled_p (mode, offset));
6693
6694 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6695 {
6696 poly_int64 end_offset = (offset
6697 + GET_MODE_SIZE (mode)
6698 - BYTES_PER_SVE_VECTOR);
6699 return (type == ADDR_QUERY_M
6700 ? offset_4bit_signed_scaled_p (mode, offset)
6701 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6702 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6703 end_offset)));
6704 }
6705
6706 if (vec_flags == VEC_SVE_PRED)
6707 return offset_9bit_signed_scaled_p (mode, offset);
6708
6709 if (load_store_pair_p)
6710 return ((known_eq (GET_MODE_SIZE (mode), 4)
6711 || known_eq (GET_MODE_SIZE (mode), 8)
6712 || known_eq (GET_MODE_SIZE (mode), 16))
6713 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6714 else
6715 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6716 || offset_12bit_unsigned_scaled_p (mode, offset));
6717 }
6718
6719 if (allow_reg_index_p)
6720 {
6721 /* Look for base + (scaled/extended) index register. */
6722 if (aarch64_base_register_rtx_p (op0, strict_p)
6723 && aarch64_classify_index (info, op1, mode, strict_p))
6724 {
6725 info->base = op0;
6726 return true;
6727 }
6728 if (aarch64_base_register_rtx_p (op1, strict_p)
6729 && aarch64_classify_index (info, op0, mode, strict_p))
6730 {
6731 info->base = op1;
6732 return true;
6733 }
6734 }
6735
6736 return false;
6737
6738 case POST_INC:
6739 case POST_DEC:
6740 case PRE_INC:
6741 case PRE_DEC:
6742 info->type = ADDRESS_REG_WB;
6743 info->base = XEXP (x, 0);
6744 info->offset = NULL_RTX;
6745 return aarch64_base_register_rtx_p (info->base, strict_p);
6746
6747 case POST_MODIFY:
6748 case PRE_MODIFY:
6749 info->type = ADDRESS_REG_WB;
6750 info->base = XEXP (x, 0);
6751 if (GET_CODE (XEXP (x, 1)) == PLUS
6752 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6753 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6754 && aarch64_base_register_rtx_p (info->base, strict_p))
6755 {
6756 info->offset = XEXP (XEXP (x, 1), 1);
6757 info->const_offset = offset;
6758
6759 /* TImode and TFmode values are allowed in both pairs of X
6760 registers and individual Q registers. The available
6761 address modes are:
6762 X,X: 7-bit signed scaled offset
6763 Q: 9-bit signed offset
6764 We conservatively require an offset representable in either mode.
6765 */
6766 if (mode == TImode || mode == TFmode)
6767 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6768 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6769
6770 if (load_store_pair_p)
6771 return ((known_eq (GET_MODE_SIZE (mode), 4)
6772 || known_eq (GET_MODE_SIZE (mode), 8)
6773 || known_eq (GET_MODE_SIZE (mode), 16))
6774 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6775 else
6776 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6777 }
6778 return false;
6779
6780 case CONST:
6781 case SYMBOL_REF:
6782 case LABEL_REF:
6783 /* load literal: pc-relative constant pool entry. Only supported
6784 for SI mode or larger. */
6785 info->type = ADDRESS_SYMBOLIC;
6786
6787 if (!load_store_pair_p
6788 && GET_MODE_SIZE (mode).is_constant (&const_size)
6789 && const_size >= 4)
6790 {
6791 rtx sym, addend;
6792
6793 split_const (x, &sym, &addend);
6794 return ((GET_CODE (sym) == LABEL_REF
6795 || (GET_CODE (sym) == SYMBOL_REF
6796 && CONSTANT_POOL_ADDRESS_P (sym)
6797 && aarch64_pcrelative_literal_loads)));
6798 }
6799 return false;
6800
6801 case LO_SUM:
6802 info->type = ADDRESS_LO_SUM;
6803 info->base = XEXP (x, 0);
6804 info->offset = XEXP (x, 1);
6805 if (allow_reg_index_p
6806 && aarch64_base_register_rtx_p (info->base, strict_p))
6807 {
6808 rtx sym, offs;
6809 split_const (info->offset, &sym, &offs);
6810 if (GET_CODE (sym) == SYMBOL_REF
6811 && (aarch64_classify_symbol (sym, INTVAL (offs))
6812 == SYMBOL_SMALL_ABSOLUTE))
6813 {
6814 /* The symbol and offset must be aligned to the access size. */
6815 unsigned int align;
6816
6817 if (CONSTANT_POOL_ADDRESS_P (sym))
6818 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6819 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6820 {
6821 tree exp = SYMBOL_REF_DECL (sym);
6822 align = TYPE_ALIGN (TREE_TYPE (exp));
6823 align = aarch64_constant_alignment (exp, align);
6824 }
6825 else if (SYMBOL_REF_DECL (sym))
6826 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6827 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6828 && SYMBOL_REF_BLOCK (sym) != NULL)
6829 align = SYMBOL_REF_BLOCK (sym)->alignment;
6830 else
6831 align = BITS_PER_UNIT;
6832
6833 poly_int64 ref_size = GET_MODE_SIZE (mode);
6834 if (known_eq (ref_size, 0))
6835 ref_size = GET_MODE_SIZE (DImode);
6836
6837 return (multiple_p (INTVAL (offs), ref_size)
6838 && multiple_p (align / BITS_PER_UNIT, ref_size));
6839 }
6840 }
6841 return false;
6842
6843 default:
6844 return false;
6845 }
6846 }
6847
6848 /* Return true if the address X is valid for a PRFM instruction.
6849 STRICT_P is true if we should do strict checking with
6850 aarch64_classify_address. */
6851
6852 bool
6853 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6854 {
6855 struct aarch64_address_info addr;
6856
6857 /* PRFM accepts the same addresses as DImode... */
6858 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6859 if (!res)
6860 return false;
6861
6862 /* ... except writeback forms. */
6863 return addr.type != ADDRESS_REG_WB;
6864 }
6865
6866 bool
6867 aarch64_symbolic_address_p (rtx x)
6868 {
6869 rtx offset;
6870
6871 split_const (x, &x, &offset);
6872 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6873 }
6874
6875 /* Classify the base of symbolic expression X. */
6876
6877 enum aarch64_symbol_type
6878 aarch64_classify_symbolic_expression (rtx x)
6879 {
6880 rtx offset;
6881
6882 split_const (x, &x, &offset);
6883 return aarch64_classify_symbol (x, INTVAL (offset));
6884 }
6885
6886
6887 /* Return TRUE if X is a legitimate address for accessing memory in
6888 mode MODE. */
6889 static bool
6890 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6891 {
6892 struct aarch64_address_info addr;
6893
6894 return aarch64_classify_address (&addr, x, mode, strict_p);
6895 }
6896
6897 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6898 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6899 bool
6900 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6901 aarch64_addr_query_type type)
6902 {
6903 struct aarch64_address_info addr;
6904
6905 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6906 }
6907
6908 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6909
6910 static bool
6911 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6912 poly_int64 orig_offset,
6913 machine_mode mode)
6914 {
6915 HOST_WIDE_INT size;
6916 if (GET_MODE_SIZE (mode).is_constant (&size))
6917 {
6918 HOST_WIDE_INT const_offset, second_offset;
6919
6920 /* A general SVE offset is A * VQ + B. Remove the A component from
6921 coefficient 0 in order to get the constant B. */
6922 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6923
6924 /* Split an out-of-range address displacement into a base and
6925 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
6926 range otherwise to increase opportunities for sharing the base
6927 address between accesses of different sizes. Unaligned accesses
6928 use the signed 9-bit range; TImode/TFmode use the intersection
6929 of the signed scaled 7-bit and signed 9-bit offset ranges. */
6930 if (mode == TImode || mode == TFmode)
6931 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6932 else if ((const_offset & (size - 1)) != 0)
6933 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6934 else
6935 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6936
6937 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6938 return false;
6939
6940 /* Split the offset into second_offset and the rest. */
6941 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6942 *offset2 = gen_int_mode (second_offset, Pmode);
6943 return true;
6944 }
6945 else
6946 {
6947 /* Get the mode we should use as the basis of the range. For structure
6948 modes this is the mode of one vector. */
6949 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6950 machine_mode step_mode
6951 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6952
6953 /* Get the "mul vl" multiplier we'd like to use. */
6954 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6955 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6956 if (vec_flags & VEC_SVE_DATA)
6957 /* LDR supports a 9-bit range, but the move patterns for
6958 structure modes require all vectors to be in range of the
6959 same base. The simplest way of accommodating that while still
6960 promoting reuse of anchor points between different modes is
6961 to use an 8-bit range unconditionally. */
6962 vnum = ((vnum + 128) & 255) - 128;
6963 else
6964 /* Predicates are only handled singly, so we might as well use
6965 the full range. */
6966 vnum = ((vnum + 256) & 511) - 256;
6967 if (vnum == 0)
6968 return false;
6969
6970 /* Convert the "mul vl" multiplier into a byte offset. */
6971 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6972 if (known_eq (second_offset, orig_offset))
6973 return false;
6974
6975 /* Split the offset into second_offset and the rest. */
6976 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6977 *offset2 = gen_int_mode (second_offset, Pmode);
6978 return true;
6979 }
6980 }
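
/* A worked example of the constant-size path above (illustrative only):
   for an SImode access (4 bytes) with an out-of-range displacement of
   0x10020, the aligned case computes second_offset = 0x10020 & 0x3ffc
   = 0x20, so the displacement is split into

       *offset1 = 0x10000   (folded into the base, shareable via CSE)
       *offset2 = 0x20      (fits the unsigned scaled 12-bit LDR/STR range)

   The unaligned and TImode/TFmode cases instead round to the signed
   9-bit range or to its intersection with the scaled 7-bit range.  */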
6981
6982 /* Return the binary representation of floating point constant VALUE in INTVAL.
6983 If the value cannot be converted, return false without setting INTVAL.
6984 The conversion is done in the mode of VALUE. */
6985 bool
6986 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6987 {
6988
6989 /* We make a general exception for 0. */
6990 if (aarch64_float_const_zero_rtx_p (value))
6991 {
6992 *intval = 0;
6993 return true;
6994 }
6995
6996 scalar_float_mode mode;
6997 if (GET_CODE (value) != CONST_DOUBLE
6998 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6999 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7000 /* Only support up to DF mode. */
7001 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7002 return false;
7003
7004 unsigned HOST_WIDE_INT ival = 0;
7005
7006 long res[2];
7007 real_to_target (res,
7008 CONST_DOUBLE_REAL_VALUE (value),
7009 REAL_MODE_FORMAT (mode));
7010
7011 if (mode == DFmode)
7012 {
7013 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7014 ival = zext_hwi (res[order], 32);
7015 ival |= (zext_hwi (res[1 - order], 32) << 32);
7016 }
7017 else
7018 ival = zext_hwi (res[0], 32);
7019
7020 *intval = ival;
7021 return true;
7022 }
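
/* For illustration: the DFmode constant 1.0 is returned by the function
   above as *intval == 0x3ff0000000000000 (its IEEE binary64 encoding),
   and the SFmode constant 1.0 as 0x3f800000.  Callers such as
   aarch64_float_const_rtx_p then ask how many MOV/MOVK instructions are
   needed to materialize that integer.  */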
7023
7024 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7025 single MOV(+MOVK) followed by an FMOV. */
7026 bool
7027 aarch64_float_const_rtx_p (rtx x)
7028 {
7029 machine_mode mode = GET_MODE (x);
7030 if (mode == VOIDmode)
7031 return false;
7032
7033 /* Determine whether it's cheaper to write float constants as
7034 mov/movk pairs than as ldr/adrp pairs. */
7035 unsigned HOST_WIDE_INT ival;
7036
7037 if (GET_CODE (x) == CONST_DOUBLE
7038 && SCALAR_FLOAT_MODE_P (mode)
7039 && aarch64_reinterpret_float_as_int (x, &ival))
7040 {
7041 scalar_int_mode imode = (mode == HFmode
7042 ? SImode
7043 : int_mode_for_mode (mode).require ());
7044 int num_instr = aarch64_internal_mov_immediate
7045 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7046 return num_instr < 3;
7047 }
7048
7049 return false;
7050 }
7051
7052 /* Return TRUE if rtx X is the immediate constant 0.0. */
7053 bool
7054 aarch64_float_const_zero_rtx_p (rtx x)
7055 {
7056 if (GET_MODE (x) == VOIDmode)
7057 return false;
7058
7059 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7060 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7061 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7062 }
7063
7064 /* Return TRUE if rtx X is an immediate constant that fits in a single
7065 MOVI immediate operation. */
7066 bool
7067 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7068 {
7069 if (!TARGET_SIMD)
7070 return false;
7071
7072 machine_mode vmode;
7073 scalar_int_mode imode;
7074 unsigned HOST_WIDE_INT ival;
7075
7076 if (GET_CODE (x) == CONST_DOUBLE
7077 && SCALAR_FLOAT_MODE_P (mode))
7078 {
7079 if (!aarch64_reinterpret_float_as_int (x, &ival))
7080 return false;
7081
7082 /* We make a general exception for 0. */
7083 if (aarch64_float_const_zero_rtx_p (x))
7084 return true;
7085
7086 imode = int_mode_for_mode (mode).require ();
7087 }
7088 else if (GET_CODE (x) == CONST_INT
7089 && is_a <scalar_int_mode> (mode, &imode))
7090 ival = INTVAL (x);
7091 else
7092 return false;
7093
7094 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
7095 a 128-bit vector mode. */
7096 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7097
7098 vmode = aarch64_simd_container_mode (imode, width);
7099 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7100
7101 return aarch64_simd_valid_immediate (v_op, NULL);
7102 }
7103
7104
7105 /* Return the fixed registers used for condition codes. */
7106
7107 static bool
7108 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7109 {
7110 *p1 = CC_REGNUM;
7111 *p2 = INVALID_REGNUM;
7112 return true;
7113 }
7114
7115 /* This function is used by the call expanders of the machine description.
7116 RESULT is the register in which the result is returned. It's NULL for
7117 "call" and "sibcall".
7118 MEM is the location of the function call.
7119 SIBCALL indicates whether this function call is a normal call or a sibling
7120 call. It will generate a different pattern accordingly. */
7121
7122 void
7123 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7124 {
7125 rtx call, callee, tmp;
7126 rtvec vec;
7127 machine_mode mode;
7128
7129 gcc_assert (MEM_P (mem));
7130 callee = XEXP (mem, 0);
7131 mode = GET_MODE (callee);
7132 gcc_assert (mode == Pmode);
7133
7134 /* Decide if we should generate indirect calls by loading the
7135 address of the callee into a register before performing
7136 the branch-and-link. */
7137 if (SYMBOL_REF_P (callee)
7138 ? (aarch64_is_long_call_p (callee)
7139 || aarch64_is_noplt_call_p (callee))
7140 : !REG_P (callee))
7141 XEXP (mem, 0) = force_reg (mode, callee);
7142
7143 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7144
7145 if (result != NULL_RTX)
7146 call = gen_rtx_SET (result, call);
7147
7148 if (sibcall)
7149 tmp = ret_rtx;
7150 else
7151 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7152
7153 vec = gen_rtvec (2, call, tmp);
7154 call = gen_rtx_PARALLEL (VOIDmode, vec);
7155
7156 aarch64_emit_call_insn (call);
7157 }
7158
7159 /* Emit call insn with PAT and do aarch64-specific handling. */
7160
7161 void
7162 aarch64_emit_call_insn (rtx pat)
7163 {
7164 rtx insn = emit_call_insn (pat);
7165
7166 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7167 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7168 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7169 }
7170
7171 machine_mode
7172 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7173 {
7174 machine_mode mode_x = GET_MODE (x);
7175 rtx_code code_x = GET_CODE (x);
7176
7177 /* All floating point compares return CCFP if it is an equality
7178 comparison, and CCFPE otherwise. */
7179 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7180 {
7181 switch (code)
7182 {
7183 case EQ:
7184 case NE:
7185 case UNORDERED:
7186 case ORDERED:
7187 case UNLT:
7188 case UNLE:
7189 case UNGT:
7190 case UNGE:
7191 case UNEQ:
7192 return CCFPmode;
7193
7194 case LT:
7195 case LE:
7196 case GT:
7197 case GE:
7198 case LTGT:
7199 return CCFPEmode;
7200
7201 default:
7202 gcc_unreachable ();
7203 }
7204 }
7205
7206 /* Equality comparisons of short modes against zero can be performed
7207 using the TST instruction with the appropriate bitmask. */
7208 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7209 && (code == EQ || code == NE)
7210 && (mode_x == HImode || mode_x == QImode))
7211 return CC_NZmode;
7212
7213 /* Similarly, comparisons of zero_extends from shorter modes can
7214 be performed using an ANDS with an immediate mask. */
7215 if (y == const0_rtx && code_x == ZERO_EXTEND
7216 && (mode_x == SImode || mode_x == DImode)
7217 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7218 && (code == EQ || code == NE))
7219 return CC_NZmode;
7220
7221 if ((mode_x == SImode || mode_x == DImode)
7222 && y == const0_rtx
7223 && (code == EQ || code == NE || code == LT || code == GE)
7224 && (code_x == PLUS || code_x == MINUS || code_x == AND
7225 || code_x == NEG
7226 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7227 && CONST_INT_P (XEXP (x, 2)))))
7228 return CC_NZmode;
7229
7230 /* A compare with a shifted operand. Because of canonicalization,
7231 the comparison will have to be swapped when we emit the assembly
7232 code. */
7233 if ((mode_x == SImode || mode_x == DImode)
7234 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7235 && (code_x == ASHIFT || code_x == ASHIFTRT
7236 || code_x == LSHIFTRT
7237 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7238 return CC_SWPmode;
7239
7240 /* Similarly for a negated operand, but we can only do this for
7241 equalities. */
7242 if ((mode_x == SImode || mode_x == DImode)
7243 && (REG_P (y) || GET_CODE (y) == SUBREG)
7244 && (code == EQ || code == NE)
7245 && code_x == NEG)
7246 return CC_Zmode;
7247
7248 /* A test for unsigned overflow from an addition. */
7249 if ((mode_x == DImode || mode_x == TImode)
7250 && (code == LTU || code == GEU)
7251 && code_x == PLUS
7252 && rtx_equal_p (XEXP (x, 0), y))
7253 return CC_Cmode;
7254
7255 /* A test for unsigned overflow from an add with carry. */
7256 if ((mode_x == DImode || mode_x == TImode)
7257 && (code == LTU || code == GEU)
7258 && code_x == PLUS
7259 && CONST_SCALAR_INT_P (y)
7260 && (rtx_mode_t (y, mode_x)
7261 == (wi::shwi (1, mode_x)
7262 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7263 return CC_ADCmode;
7264
7265 /* A test for signed overflow. */
7266 if ((mode_x == DImode || mode_x == TImode)
7267 && code == NE
7268 && code_x == PLUS
7269 && GET_CODE (y) == SIGN_EXTEND)
7270 return CC_Vmode;
7271
7272 /* For everything else, return CCmode. */
7273 return CCmode;
7274 }
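
/* For illustration: a DImode comparison such as (compare (ashift x 2) y)
   is given CC_SWPmode above, because the shifted operand must appear as
   the second operand of the emitted CMP and the condition therefore has
   to be swapped, whereas a plain (compare x y) falls through to CCmode.
   Similarly, (compare (plus x y) 0) tested only for EQ/NE/LT/GE gets
   CC_NZmode so that an ADDS can set the flags directly.  */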
7275
7276 static int
7277 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7278
7279 int
7280 aarch64_get_condition_code (rtx x)
7281 {
7282 machine_mode mode = GET_MODE (XEXP (x, 0));
7283 enum rtx_code comp_code = GET_CODE (x);
7284
7285 if (GET_MODE_CLASS (mode) != MODE_CC)
7286 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7287 return aarch64_get_condition_code_1 (mode, comp_code);
7288 }
7289
7290 static int
7291 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7292 {
7293 switch (mode)
7294 {
7295 case E_CCFPmode:
7296 case E_CCFPEmode:
7297 switch (comp_code)
7298 {
7299 case GE: return AARCH64_GE;
7300 case GT: return AARCH64_GT;
7301 case LE: return AARCH64_LS;
7302 case LT: return AARCH64_MI;
7303 case NE: return AARCH64_NE;
7304 case EQ: return AARCH64_EQ;
7305 case ORDERED: return AARCH64_VC;
7306 case UNORDERED: return AARCH64_VS;
7307 case UNLT: return AARCH64_LT;
7308 case UNLE: return AARCH64_LE;
7309 case UNGT: return AARCH64_HI;
7310 case UNGE: return AARCH64_PL;
7311 default: return -1;
7312 }
7313 break;
7314
7315 case E_CCmode:
7316 switch (comp_code)
7317 {
7318 case NE: return AARCH64_NE;
7319 case EQ: return AARCH64_EQ;
7320 case GE: return AARCH64_GE;
7321 case GT: return AARCH64_GT;
7322 case LE: return AARCH64_LE;
7323 case LT: return AARCH64_LT;
7324 case GEU: return AARCH64_CS;
7325 case GTU: return AARCH64_HI;
7326 case LEU: return AARCH64_LS;
7327 case LTU: return AARCH64_CC;
7328 default: return -1;
7329 }
7330 break;
7331
7332 case E_CC_SWPmode:
7333 switch (comp_code)
7334 {
7335 case NE: return AARCH64_NE;
7336 case EQ: return AARCH64_EQ;
7337 case GE: return AARCH64_LE;
7338 case GT: return AARCH64_LT;
7339 case LE: return AARCH64_GE;
7340 case LT: return AARCH64_GT;
7341 case GEU: return AARCH64_LS;
7342 case GTU: return AARCH64_CC;
7343 case LEU: return AARCH64_CS;
7344 case LTU: return AARCH64_HI;
7345 default: return -1;
7346 }
7347 break;
7348
7349 case E_CC_NZmode:
7350 switch (comp_code)
7351 {
7352 case NE: return AARCH64_NE;
7353 case EQ: return AARCH64_EQ;
7354 case GE: return AARCH64_PL;
7355 case LT: return AARCH64_MI;
7356 default: return -1;
7357 }
7358 break;
7359
7360 case E_CC_Zmode:
7361 switch (comp_code)
7362 {
7363 case NE: return AARCH64_NE;
7364 case EQ: return AARCH64_EQ;
7365 default: return -1;
7366 }
7367 break;
7368
7369 case E_CC_Cmode:
7370 switch (comp_code)
7371 {
7372 case LTU: return AARCH64_CS;
7373 case GEU: return AARCH64_CC;
7374 default: return -1;
7375 }
7376 break;
7377
7378 case E_CC_ADCmode:
7379 switch (comp_code)
7380 {
7381 case GEU: return AARCH64_CS;
7382 case LTU: return AARCH64_CC;
7383 default: return -1;
7384 }
7385 break;
7386
7387 case E_CC_Vmode:
7388 switch (comp_code)
7389 {
7390 case NE: return AARCH64_VS;
7391 case EQ: return AARCH64_VC;
7392 default: return -1;
7393 }
7394 break;
7395
7396 default:
7397 return -1;
7398 }
7399
7400 return -1;
7401 }
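
/* For illustration: with CCmode flags, (gt ...) maps to the "gt"
   condition, but with CC_SWPmode the operands were swapped when the
   flags were set, so the table above returns the mirrored condition and
   (gt ...) is printed as "lt".  */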
7402
7403 bool
7404 aarch64_const_vec_all_same_in_range_p (rtx x,
7405 HOST_WIDE_INT minval,
7406 HOST_WIDE_INT maxval)
7407 {
7408 rtx elt;
7409 return (const_vec_duplicate_p (x, &elt)
7410 && CONST_INT_P (elt)
7411 && IN_RANGE (INTVAL (elt), minval, maxval));
7412 }
7413
7414 bool
7415 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7416 {
7417 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7418 }
7419
7420 /* Return true if VEC is a constant in which every element is in the range
7421 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7422
7423 static bool
7424 aarch64_const_vec_all_in_range_p (rtx vec,
7425 HOST_WIDE_INT minval,
7426 HOST_WIDE_INT maxval)
7427 {
7428 if (GET_CODE (vec) != CONST_VECTOR
7429 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7430 return false;
7431
7432 int nunits;
7433 if (!CONST_VECTOR_STEPPED_P (vec))
7434 nunits = const_vector_encoded_nelts (vec);
7435 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7436 return false;
7437
7438 for (int i = 0; i < nunits; i++)
7439 {
7440 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7441 if (!CONST_INT_P (vec_elem)
7442 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7443 return false;
7444 }
7445 return true;
7446 }
7447
7448 /* N Z C V. */
7449 #define AARCH64_CC_V 1
7450 #define AARCH64_CC_C (1 << 1)
7451 #define AARCH64_CC_Z (1 << 2)
7452 #define AARCH64_CC_N (1 << 3)
7453
7454 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7455 static const int aarch64_nzcv_codes[] =
7456 {
7457 0, /* EQ, Z == 1. */
7458 AARCH64_CC_Z, /* NE, Z == 0. */
7459 0, /* CS, C == 1. */
7460 AARCH64_CC_C, /* CC, C == 0. */
7461 0, /* MI, N == 1. */
7462 AARCH64_CC_N, /* PL, N == 0. */
7463 0, /* VS, V == 1. */
7464 AARCH64_CC_V, /* VC, V == 0. */
7465 0, /* HI, C == 1 && Z == 0. */
7466 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7467 AARCH64_CC_V, /* GE, N == V. */
7468 0, /* LT, N != V. */
7469 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7470 0, /* LE, !(Z == 0 && N == V). */
7471 0, /* AL, Any. */
7472 0 /* NV, Any. */
7473 };
7474
7475 /* Print floating-point vector immediate operand X to F, negating it
7476 first if NEGATE is true. Return true on success, false if it isn't
7477 a constant we can handle. */
7478
7479 static bool
7480 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7481 {
7482 rtx elt;
7483
7484 if (!const_vec_duplicate_p (x, &elt))
7485 return false;
7486
7487 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7488 if (negate)
7489 r = real_value_negate (&r);
7490
7491 /* We only handle the SVE single-bit immediates here. */
7492 if (real_equal (&r, &dconst0))
7493 asm_fprintf (f, "0.0");
7494 else if (real_equal (&r, &dconst1))
7495 asm_fprintf (f, "1.0");
7496 else if (real_equal (&r, &dconsthalf))
7497 asm_fprintf (f, "0.5");
7498 else
7499 return false;
7500
7501 return true;
7502 }
7503
7504 /* Return the equivalent letter for size. */
7505 static char
7506 sizetochar (int size)
7507 {
7508 switch (size)
7509 {
7510 case 64: return 'd';
7511 case 32: return 's';
7512 case 16: return 'h';
7513 case 8 : return 'b';
7514 default: gcc_unreachable ();
7515 }
7516 }
7517
7518 /* Print operand X to file F in a target specific manner according to CODE.
7519 The acceptable formatting commands given by CODE are:
7520 'c': An integer or symbol address without a preceding #
7521 sign.
7522 'C': Take the duplicated element in a vector constant
7523 and print it in hex.
7524 'D': Take the duplicated element in a vector constant
7525 and print it as an unsigned integer, in decimal.
7526 'e': Print the sign/zero-extend size as a character 8->b,
7527 16->h, 32->w.
7528 'p': Prints N such that 2^N == X (X must be power of 2 and
7529 const int).
7530 'P': Print the number of non-zero bits in X (a const_int).
7531 'H': Print the higher numbered register of a pair (TImode)
7532 of regs.
7533 'm': Print a condition (eq, ne, etc).
7534 'M': Same as 'm', but invert condition.
7535 'N': Take the duplicated element in a vector constant
7536 and print the negative of it in decimal.
7537 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7538 'S/T/U/V': Print a FP/SIMD register name for a register list.
7539 The register printed is the FP/SIMD register name
7540 of X + 0/1/2/3 for S/T/U/V.
7541 'R': Print a scalar FP/SIMD register name + 1.
7542 'X': Print bottom 16 bits of integer constant in hex.
7543 'w/x': Print a general register name or the zero register
7544 (32-bit or 64-bit).
7545 '0': Print a normal operand; if it's a general register,
7546 then we assume DImode.
7547 'k': Print NZCV for conditional compare instructions.
7548 'A': Output address constant representing the first
7549 argument of X, specifying a relocation offset
7550 if appropriate.
7551 'L': Output constant address specified by X
7552 with a relocation offset if appropriate.
7553 'G': Prints address of X, specifying a PC relative
7554 relocation mode if appropriate.
7555 'y': Output address of LDP or STP - this is used for
7556 some LDP/STPs which don't use a PARALLEL in their
7557 pattern (so the mode needs to be adjusted).
7558 'z': Output address of a typical LDP or STP. */
7559
7560 static void
7561 aarch64_print_operand (FILE *f, rtx x, int code)
7562 {
7563 rtx elt;
7564 switch (code)
7565 {
7566 case 'c':
7567 switch (GET_CODE (x))
7568 {
7569 case CONST_INT:
7570 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7571 break;
7572
7573 case SYMBOL_REF:
7574 output_addr_const (f, x);
7575 break;
7576
7577 case CONST:
7578 if (GET_CODE (XEXP (x, 0)) == PLUS
7579 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7580 {
7581 output_addr_const (f, x);
7582 break;
7583 }
7584 /* Fall through. */
7585
7586 default:
7587 output_operand_lossage ("unsupported operand for code '%c'", code);
7588 }
7589 break;
7590
7591 case 'e':
7592 {
7593 int n;
7594
7595 if (!CONST_INT_P (x)
7596 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7597 {
7598 output_operand_lossage ("invalid operand for '%%%c'", code);
7599 return;
7600 }
7601
7602 switch (n)
7603 {
7604 case 3:
7605 fputc ('b', f);
7606 break;
7607 case 4:
7608 fputc ('h', f);
7609 break;
7610 case 5:
7611 fputc ('w', f);
7612 break;
7613 default:
7614 output_operand_lossage ("invalid operand for '%%%c'", code);
7615 return;
7616 }
7617 }
7618 break;
7619
7620 case 'p':
7621 {
7622 int n;
7623
7624 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7625 {
7626 output_operand_lossage ("invalid operand for '%%%c'", code);
7627 return;
7628 }
7629
7630 asm_fprintf (f, "%d", n);
7631 }
7632 break;
7633
7634 case 'P':
7635 if (!CONST_INT_P (x))
7636 {
7637 output_operand_lossage ("invalid operand for '%%%c'", code);
7638 return;
7639 }
7640
7641 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7642 break;
7643
7644 case 'H':
7645 if (x == const0_rtx)
7646 {
7647 asm_fprintf (f, "xzr");
7648 break;
7649 }
7650
7651 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7652 {
7653 output_operand_lossage ("invalid operand for '%%%c'", code);
7654 return;
7655 }
7656
7657 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7658 break;
7659
7660 case 'M':
7661 case 'm':
7662 {
7663 int cond_code;
7664 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7665 if (x == const_true_rtx)
7666 {
7667 if (code == 'M')
7668 fputs ("nv", f);
7669 return;
7670 }
7671
7672 if (!COMPARISON_P (x))
7673 {
7674 output_operand_lossage ("invalid operand for '%%%c'", code);
7675 return;
7676 }
7677
7678 cond_code = aarch64_get_condition_code (x);
7679 gcc_assert (cond_code >= 0);
7680 if (code == 'M')
7681 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7682 fputs (aarch64_condition_codes[cond_code], f);
7683 }
7684 break;
7685
7686 case 'N':
7687 if (!const_vec_duplicate_p (x, &elt))
7688 {
7689 output_operand_lossage ("invalid vector constant");
7690 return;
7691 }
7692
7693 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7694 asm_fprintf (f, "%wd", -INTVAL (elt));
7695 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7696 && aarch64_print_vector_float_operand (f, x, true))
7697 ;
7698 else
7699 {
7700 output_operand_lossage ("invalid vector constant");
7701 return;
7702 }
7703 break;
7704
7705 case 'b':
7706 case 'h':
7707 case 's':
7708 case 'd':
7709 case 'q':
7710 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7711 {
7712 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7713 return;
7714 }
7715 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7716 break;
7717
7718 case 'S':
7719 case 'T':
7720 case 'U':
7721 case 'V':
7722 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7723 {
7724 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7725 return;
7726 }
7727 asm_fprintf (f, "%c%d",
7728 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7729 REGNO (x) - V0_REGNUM + (code - 'S'));
7730 break;
7731
7732 case 'R':
7733 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7734 {
7735 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7736 return;
7737 }
7738 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7739 break;
7740
7741 case 'X':
7742 if (!CONST_INT_P (x))
7743 {
7744 output_operand_lossage ("invalid operand for '%%%c'", code);
7745 return;
7746 }
7747 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7748 break;
7749
7750 case 'C':
7751 {
7752 /* Print a replicated constant in hex. */
7753 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7754 {
7755 output_operand_lossage ("invalid operand for '%%%c'", code);
7756 return;
7757 }
7758 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7759 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7760 }
7761 break;
7762
7763 case 'D':
7764 {
7765 /* Print a replicated constant in decimal, treating it as
7766 unsigned. */
7767 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7768 {
7769 output_operand_lossage ("invalid operand for '%%%c'", code);
7770 return;
7771 }
7772 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7773 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7774 }
7775 break;
7776
7777 case 'w':
7778 case 'x':
7779 if (x == const0_rtx
7780 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7781 {
7782 asm_fprintf (f, "%czr", code);
7783 break;
7784 }
7785
7786 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7787 {
7788 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7789 break;
7790 }
7791
7792 if (REG_P (x) && REGNO (x) == SP_REGNUM)
7793 {
7794 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7795 break;
7796 }
7797
7798 /* Fall through */
7799
7800 case 0:
7801 if (x == NULL)
7802 {
7803 output_operand_lossage ("missing operand");
7804 return;
7805 }
7806
7807 switch (GET_CODE (x))
7808 {
7809 case REG:
7810 if (aarch64_sve_data_mode_p (GET_MODE (x)))
7811 {
7812 if (REG_NREGS (x) == 1)
7813 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7814 else
7815 {
7816 char suffix
7817 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7818 asm_fprintf (f, "{z%d.%c - z%d.%c}",
7819 REGNO (x) - V0_REGNUM, suffix,
7820 END_REGNO (x) - V0_REGNUM - 1, suffix);
7821 }
7822 }
7823 else
7824 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7825 break;
7826
7827 case MEM:
7828 output_address (GET_MODE (x), XEXP (x, 0));
7829 break;
7830
7831 case LABEL_REF:
7832 case SYMBOL_REF:
7833 output_addr_const (asm_out_file, x);
7834 break;
7835
7836 case CONST_INT:
7837 asm_fprintf (f, "%wd", INTVAL (x));
7838 break;
7839
7840 case CONST:
7841 if (!VECTOR_MODE_P (GET_MODE (x)))
7842 {
7843 output_addr_const (asm_out_file, x);
7844 break;
7845 }
7846 /* fall through */
7847
7848 case CONST_VECTOR:
7849 if (!const_vec_duplicate_p (x, &elt))
7850 {
7851 output_operand_lossage ("invalid vector constant");
7852 return;
7853 }
7854
7855 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7856 asm_fprintf (f, "%wd", INTVAL (elt));
7857 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7858 && aarch64_print_vector_float_operand (f, x, false))
7859 ;
7860 else
7861 {
7862 output_operand_lossage ("invalid vector constant");
7863 return;
7864 }
7865 break;
7866
7867 case CONST_DOUBLE:
7868 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7869 be getting CONST_DOUBLEs holding integers. */
7870 gcc_assert (GET_MODE (x) != VOIDmode);
7871 if (aarch64_float_const_zero_rtx_p (x))
7872 {
7873 fputc ('0', f);
7874 break;
7875 }
7876 else if (aarch64_float_const_representable_p (x))
7877 {
7878 #define buf_size 20
7879 char float_buf[buf_size] = {'\0'};
7880 real_to_decimal_for_mode (float_buf,
7881 CONST_DOUBLE_REAL_VALUE (x),
7882 buf_size, buf_size,
7883 1, GET_MODE (x));
7884 asm_fprintf (asm_out_file, "%s", float_buf);
7885 break;
7886 #undef buf_size
7887 }
7888 output_operand_lossage ("invalid constant");
7889 return;
7890 default:
7891 output_operand_lossage ("invalid operand");
7892 return;
7893 }
7894 break;
7895
7896 case 'A':
7897 if (GET_CODE (x) == HIGH)
7898 x = XEXP (x, 0);
7899
7900 switch (aarch64_classify_symbolic_expression (x))
7901 {
7902 case SYMBOL_SMALL_GOT_4G:
7903 asm_fprintf (asm_out_file, ":got:");
7904 break;
7905
7906 case SYMBOL_SMALL_TLSGD:
7907 asm_fprintf (asm_out_file, ":tlsgd:");
7908 break;
7909
7910 case SYMBOL_SMALL_TLSDESC:
7911 asm_fprintf (asm_out_file, ":tlsdesc:");
7912 break;
7913
7914 case SYMBOL_SMALL_TLSIE:
7915 asm_fprintf (asm_out_file, ":gottprel:");
7916 break;
7917
7918 case SYMBOL_TLSLE24:
7919 asm_fprintf (asm_out_file, ":tprel:");
7920 break;
7921
7922 case SYMBOL_TINY_GOT:
7923 gcc_unreachable ();
7924 break;
7925
7926 default:
7927 break;
7928 }
7929 output_addr_const (asm_out_file, x);
7930 break;
7931
7932 case 'L':
7933 switch (aarch64_classify_symbolic_expression (x))
7934 {
7935 case SYMBOL_SMALL_GOT_4G:
7936 asm_fprintf (asm_out_file, ":lo12:");
7937 break;
7938
7939 case SYMBOL_SMALL_TLSGD:
7940 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7941 break;
7942
7943 case SYMBOL_SMALL_TLSDESC:
7944 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7945 break;
7946
7947 case SYMBOL_SMALL_TLSIE:
7948 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7949 break;
7950
7951 case SYMBOL_TLSLE12:
7952 asm_fprintf (asm_out_file, ":tprel_lo12:");
7953 break;
7954
7955 case SYMBOL_TLSLE24:
7956 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7957 break;
7958
7959 case SYMBOL_TINY_GOT:
7960 asm_fprintf (asm_out_file, ":got:");
7961 break;
7962
7963 case SYMBOL_TINY_TLSIE:
7964 asm_fprintf (asm_out_file, ":gottprel:");
7965 break;
7966
7967 default:
7968 break;
7969 }
7970 output_addr_const (asm_out_file, x);
7971 break;
7972
7973 case 'G':
7974 switch (aarch64_classify_symbolic_expression (x))
7975 {
7976 case SYMBOL_TLSLE24:
7977 asm_fprintf (asm_out_file, ":tprel_hi12:");
7978 break;
7979 default:
7980 break;
7981 }
7982 output_addr_const (asm_out_file, x);
7983 break;
7984
7985 case 'k':
7986 {
7987 HOST_WIDE_INT cond_code;
7988
7989 if (!CONST_INT_P (x))
7990 {
7991 output_operand_lossage ("invalid operand for '%%%c'", code);
7992 return;
7993 }
7994
7995 cond_code = INTVAL (x);
7996 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7997 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7998 }
7999 break;
8000
8001 case 'y':
8002 case 'z':
8003 {
8004 machine_mode mode = GET_MODE (x);
8005
8006 if (GET_CODE (x) != MEM
8007 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8008 {
8009 output_operand_lossage ("invalid operand for '%%%c'", code);
8010 return;
8011 }
8012
8013 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8014 code == 'y'
8015 ? ADDR_QUERY_LDP_STP_N
8016 : ADDR_QUERY_LDP_STP))
8017 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8018 }
8019 break;
8020
8021 default:
8022 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8023 return;
8024 }
8025 }
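
/* For illustration (the templates below are representative examples):
   in a machine-description template along the lines of

       "add\t%w0, %w1, %2"
       "fmov\t%d0, %x1"

   the 'w' and 'x' codes above print 32-bit and 64-bit general register
   names (or wzr/xzr for a zero operand), code 0 prints the operand in
   its natural form, and 'd' prints the scalar D view of a FP/SIMD
   register.  */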
8026
8027 /* Print address 'x' of a memory access with mode 'mode'.
8028 TYPE is the aarch64_addr_query_type context required by aarch64_classify_address,
8029 e.g. ADDR_QUERY_ANY for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP. */
8030 static bool
8031 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8032 aarch64_addr_query_type type)
8033 {
8034 struct aarch64_address_info addr;
8035 unsigned int size;
8036
8037 /* Check all addresses are Pmode - including ILP32. */
8038 if (GET_MODE (x) != Pmode
8039 && (!CONST_INT_P (x)
8040 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8041 {
8042 output_operand_lossage ("invalid address mode");
8043 return false;
8044 }
8045
8046 if (aarch64_classify_address (&addr, x, mode, true, type))
8047 switch (addr.type)
8048 {
8049 case ADDRESS_REG_IMM:
8050 if (known_eq (addr.const_offset, 0))
8051 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8052 else if (aarch64_sve_data_mode_p (mode))
8053 {
8054 HOST_WIDE_INT vnum
8055 = exact_div (addr.const_offset,
8056 BYTES_PER_SVE_VECTOR).to_constant ();
8057 asm_fprintf (f, "[%s, #%wd, mul vl]",
8058 reg_names[REGNO (addr.base)], vnum);
8059 }
8060 else if (aarch64_sve_pred_mode_p (mode))
8061 {
8062 HOST_WIDE_INT vnum
8063 = exact_div (addr.const_offset,
8064 BYTES_PER_SVE_PRED).to_constant ();
8065 asm_fprintf (f, "[%s, #%wd, mul vl]",
8066 reg_names[REGNO (addr.base)], vnum);
8067 }
8068 else
8069 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8070 INTVAL (addr.offset));
8071 return true;
8072
8073 case ADDRESS_REG_REG:
8074 if (addr.shift == 0)
8075 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8076 reg_names [REGNO (addr.offset)]);
8077 else
8078 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8079 reg_names [REGNO (addr.offset)], addr.shift);
8080 return true;
8081
8082 case ADDRESS_REG_UXTW:
8083 if (addr.shift == 0)
8084 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8085 REGNO (addr.offset) - R0_REGNUM);
8086 else
8087 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8088 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8089 return true;
8090
8091 case ADDRESS_REG_SXTW:
8092 if (addr.shift == 0)
8093 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8094 REGNO (addr.offset) - R0_REGNUM);
8095 else
8096 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8097 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8098 return true;
8099
8100 case ADDRESS_REG_WB:
8101 /* Writeback is only supported for fixed-width modes. */
8102 size = GET_MODE_SIZE (mode).to_constant ();
8103 switch (GET_CODE (x))
8104 {
8105 case PRE_INC:
8106 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8107 return true;
8108 case POST_INC:
8109 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8110 return true;
8111 case PRE_DEC:
8112 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8113 return true;
8114 case POST_DEC:
8115 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8116 return true;
8117 case PRE_MODIFY:
8118 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8119 INTVAL (addr.offset));
8120 return true;
8121 case POST_MODIFY:
8122 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8123 INTVAL (addr.offset));
8124 return true;
8125 default:
8126 break;
8127 }
8128 break;
8129
8130 case ADDRESS_LO_SUM:
8131 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8132 output_addr_const (f, addr.offset);
8133 asm_fprintf (f, "]");
8134 return true;
8135
8136 case ADDRESS_SYMBOLIC:
8137 output_addr_const (f, x);
8138 return true;
8139 }
8140
8141 return false;
8142 }
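
/* For illustration, the syntax produced above (register numbers chosen
   arbitrarily, DImode access):

       ADDRESS_REG_IMM    [x0, 16]
       ADDRESS_REG_REG    [x0, x1, lsl 3]
       ADDRESS_REG_UXTW   [x0, w1, uxtw 3]
       ADDRESS_REG_WB     [x0, 16]!  or  [x0], 16
       ADDRESS_LO_SUM     [x0, #:lo12:symbol]
       ADDRESS_SYMBOLIC   symbol  */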
8143
8144 /* Print address 'x' of a memory access with mode 'mode'. */
8145 static void
8146 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8147 {
8148 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8149 output_addr_const (f, x);
8150 }
8151
8152 bool
8153 aarch64_label_mentioned_p (rtx x)
8154 {
8155 const char *fmt;
8156 int i;
8157
8158 if (GET_CODE (x) == LABEL_REF)
8159 return true;
8160
8161 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8162 referencing instruction, but they are constant offsets, not
8163 symbols. */
8164 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8165 return false;
8166
8167 fmt = GET_RTX_FORMAT (GET_CODE (x));
8168 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8169 {
8170 if (fmt[i] == 'E')
8171 {
8172 int j;
8173
8174 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8175 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8176 return 1;
8177 }
8178 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8179 return 1;
8180 }
8181
8182 return 0;
8183 }
8184
8185 /* Implement REGNO_REG_CLASS. */
8186
8187 enum reg_class
8188 aarch64_regno_regclass (unsigned regno)
8189 {
8190 if (GP_REGNUM_P (regno))
8191 return GENERAL_REGS;
8192
8193 if (regno == SP_REGNUM)
8194 return STACK_REG;
8195
8196 if (regno == FRAME_POINTER_REGNUM
8197 || regno == ARG_POINTER_REGNUM)
8198 return POINTER_REGS;
8199
8200 if (FP_REGNUM_P (regno))
8201 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
8202
8203 if (PR_REGNUM_P (regno))
8204 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8205
8206 return NO_REGS;
8207 }
8208
8209 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8210 If OFFSET is out of range, return an offset of an anchor point
8211 that is in range. Return 0 otherwise. */
8212
8213 static HOST_WIDE_INT
8214 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8215 machine_mode mode)
8216 {
8217 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8218 if (size > 16)
8219 return (offset + 0x400) & ~0x7f0;
8220
8221 /* For offsets that aren't a multiple of the access size, the limit is
8222 -256...255. */
8223 if (offset & (size - 1))
8224 {
8225 /* BLKmode typically uses LDP of X-registers. */
8226 if (mode == BLKmode)
8227 return (offset + 512) & ~0x3ff;
8228 return (offset + 0x100) & ~0x1ff;
8229 }
8230
8231 /* Small negative offsets are supported. */
8232 if (IN_RANGE (offset, -256, 0))
8233 return 0;
8234
8235 if (mode == TImode || mode == TFmode)
8236 return (offset + 0x100) & ~0x1ff;
8237
8238 /* Use a 12-bit offset, scaled by the access size. */
8239 return offset & (~0xfff * size);
8240 }
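
/* A worked example (illustrative only): for an aligned SImode access
   (size 4) at offset 0x12340, the code above returns

       0x12340 & (~0xfff * 4) == 0x10000

   so the anchor goes at base + 0x10000 and the access keeps the
   remaining offset 0x2340, which fits the unsigned scaled 12-bit range
   0..0x3ffc.  An unaligned offset would instead be rounded towards the
   signed 9-bit range -256..255.  */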
8241
8242 static rtx
8243 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8244 {
8245 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8246 where mask is selected by alignment and size of the offset.
8247 We try to pick as large a range for the offset as possible to
8248 maximize the chance of a CSE. However, for aligned addresses
8249 we limit the range to 4k so that structures with different sized
8250 elements are likely to use the same base. We need to be careful
8251 not to split a CONST for some forms of address expression, otherwise
8252 it will generate sub-optimal code. */
8253
8254 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8255 {
8256 rtx base = XEXP (x, 0);
8257 rtx offset_rtx = XEXP (x, 1);
8258 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8259
8260 if (GET_CODE (base) == PLUS)
8261 {
8262 rtx op0 = XEXP (base, 0);
8263 rtx op1 = XEXP (base, 1);
8264
8265 /* Force any scaling into a temp for CSE. */
8266 op0 = force_reg (Pmode, op0);
8267 op1 = force_reg (Pmode, op1);
8268
8269 /* Let the pointer register be in op0. */
8270 if (REG_POINTER (op1))
8271 std::swap (op0, op1);
8272
8273 /* If the pointer is virtual or frame related, then we know that
8274 virtual register instantiation or register elimination is going
8275 to apply a second constant. We want the two constants folded
8276 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8277 if (virt_or_elim_regno_p (REGNO (op0)))
8278 {
8279 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8280 NULL_RTX, true, OPTAB_DIRECT);
8281 return gen_rtx_PLUS (Pmode, base, op1);
8282 }
8283
8284 /* Otherwise, in order to encourage CSE (and thence loop strength
8285 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8286 base = expand_binop (Pmode, add_optab, op0, op1,
8287 NULL_RTX, true, OPTAB_DIRECT);
8288 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8289 }
8290
8291 HOST_WIDE_INT size;
8292 if (GET_MODE_SIZE (mode).is_constant (&size))
8293 {
8294 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8295 mode);
8296 if (base_offset != 0)
8297 {
8298 base = plus_constant (Pmode, base, base_offset);
8299 base = force_operand (base, NULL_RTX);
8300 return plus_constant (Pmode, base, offset - base_offset);
8301 }
8302 }
8303 }
8304
8305 return x;
8306 }
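
/* For illustration (registers and temporaries chosen arbitrarily):
   legitimizing (plus (reg x0) (const_int 0x12340)) for an SImode access
   uses the anchor above and produces code along the lines of

       add   x1, x0, 0x10000     // anchor, shareable between accesses
       ldr   w2, [x1, 0x2340]    // remaining in-range offset

   so nearby accesses can CSE the same anchor register.  */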
8307
8308 static reg_class_t
8309 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8310 reg_class_t rclass,
8311 machine_mode mode,
8312 secondary_reload_info *sri)
8313 {
8314 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8315 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8316 comment at the head of aarch64-sve.md for more details about the
8317 big-endian handling. */
8318 if (BYTES_BIG_ENDIAN
8319 && reg_class_subset_p (rclass, FP_REGS)
8320 && !((REG_P (x) && HARD_REGISTER_P (x))
8321 || aarch64_simd_valid_immediate (x, NULL))
8322 && aarch64_sve_data_mode_p (mode))
8323 {
8324 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8325 return NO_REGS;
8326 }
8327
8328 /* If we have to disable direct literal pool loads and stores because the
8329 function is too big, then we need a scratch register. */
8330 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8331 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8332 || targetm.vector_mode_supported_p (GET_MODE (x)))
8333 && !aarch64_pcrelative_literal_loads)
8334 {
8335 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8336 return NO_REGS;
8337 }
8338
8339 /* Without the TARGET_SIMD instructions we cannot move a Q register
8340 to a Q register directly. We need a scratch. */
8341 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8342 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8343 && reg_class_subset_p (rclass, FP_REGS))
8344 {
8345 sri->icode = code_for_aarch64_reload_mov (mode);
8346 return NO_REGS;
8347 }
8348
8349 /* A TFmode or TImode memory access should be handled via FP_REGS
8350 because AArch64 has richer addressing modes for LDR/STR instructions
8351 than LDP/STP instructions. */
8352 if (TARGET_FLOAT && rclass == GENERAL_REGS
8353 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8354 return FP_REGS;
8355
8356 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8357 return GENERAL_REGS;
8358
8359 return NO_REGS;
8360 }
8361
8362 static bool
8363 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8364 {
8365 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8366
8367 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8368 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8369 if (frame_pointer_needed)
8370 return to == HARD_FRAME_POINTER_REGNUM;
8371 return true;
8372 }
8373
8374 poly_int64
8375 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8376 {
8377 if (to == HARD_FRAME_POINTER_REGNUM)
8378 {
8379 if (from == ARG_POINTER_REGNUM)
8380 return cfun->machine->frame.hard_fp_offset;
8381
8382 if (from == FRAME_POINTER_REGNUM)
8383 return cfun->machine->frame.hard_fp_offset
8384 - cfun->machine->frame.locals_offset;
8385 }
8386
8387 if (to == STACK_POINTER_REGNUM)
8388 {
8389 if (from == FRAME_POINTER_REGNUM)
8390 return cfun->machine->frame.frame_size
8391 - cfun->machine->frame.locals_offset;
8392 }
8393
8394 return cfun->machine->frame.frame_size;
8395 }
8396
8397 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8398 previous frame. */
8399
8400 rtx
8401 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8402 {
8403 if (count != 0)
8404 return const0_rtx;
8405 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8406 }
8407
8408
8409 static void
8410 aarch64_asm_trampoline_template (FILE *f)
8411 {
8412 int offset1 = 16;
8413 int offset2 = 20;
8414
8415 if (aarch64_bti_enabled ())
8416 {
8417 asm_fprintf (f, "\thint\t34 // bti c\n");
8418 offset1 -= 4;
8419 offset2 -= 4;
8420 }
8421
8422 if (TARGET_ILP32)
8423 {
8424 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8425 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8426 offset1);
8427 }
8428 else
8429 {
8430 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8431 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8432 offset2);
8433 }
8434 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8435
8436 /* The trampoline needs an extra padding instruction. If BTI is
8437 enabled, the padding instruction is replaced by the BTI instruction at
8438 the beginning. */
8439 if (!aarch64_bti_enabled ())
8440 assemble_aligned_integer (4, const0_rtx);
8441
8442 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8443 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8444 }
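
/* For illustration: without BTI and in LP64 the template above expands
   to roughly

       ldr   x17, .+16      // load the target function address
       ldr   x18, .+20      // load the static chain value
       br    x17
       <4 bytes of padding>
       <8-byte slot: function address, filled in by aarch64_trampoline_init>
       <8-byte slot: static chain,    filled in by aarch64_trampoline_init>

   where x17 is IP1_REGNUM and x18 is STATIC_CHAIN_REGNUM.  */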
8445
8446 static void
8447 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8448 {
8449 rtx fnaddr, mem, a_tramp;
8450 const int tramp_code_sz = 16;
8451
8452 /* Don't need to copy the trailing D-words; we fill those in below. */
8453 emit_block_move (m_tramp, assemble_trampoline_template (),
8454 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8455 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8456 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8457 if (GET_MODE (fnaddr) != ptr_mode)
8458 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8459 emit_move_insn (mem, fnaddr);
8460
8461 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8462 emit_move_insn (mem, chain_value);
8463
8464 /* XXX We should really define a "clear_cache" pattern and use
8465 gen_clear_cache(). */
8466 a_tramp = XEXP (m_tramp, 0);
8467 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8468 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8469 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8470 ptr_mode);
8471 }
8472
8473 static unsigned char
8474 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8475 {
8476 /* ??? Logically we should only need to provide a value when
8477 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8478 can hold MODE, but at the moment we need to handle all modes.
8479 Just ignore any runtime parts for registers that can't store them. */
8480 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8481 unsigned int nregs;
8482 switch (regclass)
8483 {
8484 case TAILCALL_ADDR_REGS:
8485 case POINTER_REGS:
8486 case GENERAL_REGS:
8487 case ALL_REGS:
8488 case POINTER_AND_FP_REGS:
8489 case FP_REGS:
8490 case FP_LO_REGS:
8491 if (aarch64_sve_data_mode_p (mode)
8492 && constant_multiple_p (GET_MODE_SIZE (mode),
8493 BYTES_PER_SVE_VECTOR, &nregs))
8494 return nregs;
8495 return (aarch64_vector_data_mode_p (mode)
8496 ? CEIL (lowest_size, UNITS_PER_VREG)
8497 : CEIL (lowest_size, UNITS_PER_WORD));
8498 case STACK_REG:
8499 case PR_REGS:
8500 case PR_LO_REGS:
8501 case PR_HI_REGS:
8502 return 1;
8503
8504 case NO_REGS:
8505 return 0;
8506
8507 default:
8508 break;
8509 }
8510 gcc_unreachable ();
8511 }
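
/* For illustration: with the rules above, aarch64_class_max_nregs
   (FP_REGS, V4SImode) is 1 (a single 128-bit Q register), while
   aarch64_class_max_nregs (GENERAL_REGS, TImode) is 2, since a TImode
   value occupies a pair of X registers.  */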
8512
8513 static reg_class_t
8514 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8515 {
8516 if (regclass == POINTER_REGS)
8517 return GENERAL_REGS;
8518
8519 if (regclass == STACK_REG)
8520 {
8521 if (REG_P(x)
8522 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8523 return regclass;
8524
8525 return NO_REGS;
8526 }
8527
8528 /* Register elimination can result in a request for
8529 SP+constant->FP_REGS. We cannot support such operations, which
8530 use SP as source and an FP_REG as destination, so reject them
8531 right now. */
8532 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8533 {
8534 rtx lhs = XEXP (x, 0);
8535
8536 /* Look through a possible SUBREG introduced by ILP32. */
8537 if (GET_CODE (lhs) == SUBREG)
8538 lhs = SUBREG_REG (lhs);
8539
8540 gcc_assert (REG_P (lhs));
8541 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8542 POINTER_REGS));
8543 return NO_REGS;
8544 }
8545
8546 return regclass;
8547 }
8548
8549 void
8550 aarch64_asm_output_labelref (FILE* f, const char *name)
8551 {
8552 asm_fprintf (f, "%U%s", name);
8553 }
8554
8555 static void
8556 aarch64_elf_asm_constructor (rtx symbol, int priority)
8557 {
8558 if (priority == DEFAULT_INIT_PRIORITY)
8559 default_ctor_section_asm_out_constructor (symbol, priority);
8560 else
8561 {
8562 section *s;
8563 /* Although priority is known to be in the range [0, 65535], so that
8564 18 bytes would be enough, the compiler might not know that. To avoid
8565 a -Wformat-truncation false positive, use a larger size. */
8566 char buf[23];
8567 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8568 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8569 switch_to_section (s);
8570 assemble_align (POINTER_SIZE);
8571 assemble_aligned_integer (POINTER_BYTES, symbol);
8572 }
8573 }
8574
8575 static void
8576 aarch64_elf_asm_destructor (rtx symbol, int priority)
8577 {
8578 if (priority == DEFAULT_INIT_PRIORITY)
8579 default_dtor_section_asm_out_destructor (symbol, priority);
8580 else
8581 {
8582 section *s;
8583 /* Although priority is known to be in the range [0, 65535], so that
8584 18 bytes would be enough, the compiler might not know that. To avoid
8585 a -Wformat-truncation false positive, use a larger size. */
8586 char buf[23];
8587 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8588 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8589 switch_to_section (s);
8590 assemble_align (POINTER_SIZE);
8591 assemble_aligned_integer (POINTER_BYTES, symbol);
8592 }
8593 }
8594
8595 const char*
8596 aarch64_output_casesi (rtx *operands)
8597 {
8598 char buf[100];
8599 char label[100];
8600 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8601 int index;
8602 static const char *const patterns[4][2] =
8603 {
8604 {
8605 "ldrb\t%w3, [%0,%w1,uxtw]",
8606 "add\t%3, %4, %w3, sxtb #2"
8607 },
8608 {
8609 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8610 "add\t%3, %4, %w3, sxth #2"
8611 },
8612 {
8613 "ldr\t%w3, [%0,%w1,uxtw #2]",
8614 "add\t%3, %4, %w3, sxtw #2"
8615 },
8616 /* We assume that DImode is only generated when not optimizing and
8617 that we don't really need 64-bit address offsets. That would
8618 imply an object file with 8GB of code in a single function! */
8619 {
8620 "ldr\t%w3, [%0,%w1,uxtw #2]",
8621 "add\t%3, %4, %w3, sxtw #2"
8622 }
8623 };
8624
8625 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8626
8627 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8628 index = exact_log2 (GET_MODE_SIZE (mode));
8629
8630 gcc_assert (index >= 0 && index <= 3);
8631
8632 /* Need to implement table size reduction, by changing the code below. */
8633 output_asm_insn (patterns[index][0], operands);
8634 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8635 snprintf (buf, sizeof (buf),
8636 "adr\t%%4, %s", targetm.strip_name_encoding (label));
8637 output_asm_insn (buf, operands);
8638 output_asm_insn (patterns[index][1], operands);
8639 output_asm_insn ("br\t%3", operands);
8640 assemble_label (asm_out_file, label);
8641 return "";
8642 }
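
/* For illustration (register numbers follow the operand order, label
   name abbreviated): for a HImode dispatch table the routine above emits
   a sequence along the lines of

       ldrh  w3, [x0, w1, uxtw #1]   // load the 16-bit table entry
       adr   x4, .Lrtx<N>            // address of the anchor label
       add   x3, x4, w3, sxth #2     // entry is a scaled label difference
       br    x3
   .Lrtx<N>:  */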
8643
8644
8645 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8646 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8647 operator. */
8648
8649 int
8650 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8651 {
8652 if (shift >= 0 && shift <= 3)
8653 {
8654 int size;
8655 for (size = 8; size <= 32; size *= 2)
8656 {
8657 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8658 if (mask == bits << shift)
8659 return size;
8660 }
8661 }
8662 return 0;
8663 }
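
/* For illustration: aarch64_uxt_size (2, 0x3fc) returns 8, because
   0x3fc == 0xff << 2, i.e. the operand is a zero-extended byte shifted
   left by two -- exactly what the "uxtb #2" extended-register form
   expresses.  For a mask that is not a shifted 0xff/0xffff/0xffffffff
   the function returns 0.  */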
8664
8665 /* Constant pools are per function only when PC-relative
8666 literal loads are enabled or we are in the large memory
8667 model. */
8668
8669 static inline bool
8670 aarch64_can_use_per_function_literal_pools_p (void)
8671 {
8672 return (aarch64_pcrelative_literal_loads
8673 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8674 }
8675
8676 static bool
8677 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8678 {
8679 /* We can't use blocks for constants when we're using a per-function
8680 constant pool. */
8681 return !aarch64_can_use_per_function_literal_pools_p ();
8682 }
8683
8684 /* Select appropriate section for constants depending
8685 on where we place literal pools. */
8686
8687 static section *
8688 aarch64_select_rtx_section (machine_mode mode,
8689 rtx x,
8690 unsigned HOST_WIDE_INT align)
8691 {
8692 if (aarch64_can_use_per_function_literal_pools_p ())
8693 return function_section (current_function_decl);
8694
8695 return default_elf_select_rtx_section (mode, x, align);
8696 }
8697
8698 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8699 void
8700 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8701 HOST_WIDE_INT offset)
8702 {
8703 /* When using per-function literal pools, we must ensure that any code
8704 section is aligned to the minimal instruction length, lest we get
8705 errors from the assembler re "unaligned instructions". */
8706 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8707 ASM_OUTPUT_ALIGN (f, 2);
8708 }
8709
8710 /* Costs. */
8711
8712 /* Helper function for rtx cost calculation. Strip a shift expression
8713 from X. Returns the inner operand if successful, or the original
8714 expression on failure. */
8715 static rtx
8716 aarch64_strip_shift (rtx x)
8717 {
8718 rtx op = x;
8719
8720 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8721 we can convert both to ROR during final output. */
8722 if ((GET_CODE (op) == ASHIFT
8723 || GET_CODE (op) == ASHIFTRT
8724 || GET_CODE (op) == LSHIFTRT
8725 || GET_CODE (op) == ROTATERT
8726 || GET_CODE (op) == ROTATE)
8727 && CONST_INT_P (XEXP (op, 1)))
8728 return XEXP (op, 0);
8729
8730 if (GET_CODE (op) == MULT
8731 && CONST_INT_P (XEXP (op, 1))
8732 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8733 return XEXP (op, 0);
8734
8735 return x;
8736 }
8737
8738 /* Helper function for rtx cost calculation. Strip an extend
8739 expression from X. Returns the inner operand if successful, or the
8740 original expression on failure. We deal with a number of possible
8741 canonicalization variations here. If STRIP_SHIFT is true, then
8742 we can strip off a shift also. */
8743 static rtx
8744 aarch64_strip_extend (rtx x, bool strip_shift)
8745 {
8746 scalar_int_mode mode;
8747 rtx op = x;
8748
8749 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8750 return op;
8751
8752 /* Zero and sign extraction of a widened value. */
8753 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8754 && XEXP (op, 2) == const0_rtx
8755 && GET_CODE (XEXP (op, 0)) == MULT
8756 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8757 XEXP (op, 1)))
8758 return XEXP (XEXP (op, 0), 0);
8759
8760 /* It can also be represented (for zero-extend) as an AND with an
8761 immediate. */
8762 if (GET_CODE (op) == AND
8763 && GET_CODE (XEXP (op, 0)) == MULT
8764 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8765 && CONST_INT_P (XEXP (op, 1))
8766 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8767 INTVAL (XEXP (op, 1))) != 0)
8768 return XEXP (XEXP (op, 0), 0);
8769
8770 /* Now handle extended register, as this may also have an optional
8771 left shift by 1..4. */
8772 if (strip_shift
8773 && GET_CODE (op) == ASHIFT
8774 && CONST_INT_P (XEXP (op, 1))
8775 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8776 op = XEXP (op, 0);
8777
8778 if (GET_CODE (op) == ZERO_EXTEND
8779 || GET_CODE (op) == SIGN_EXTEND)
8780 op = XEXP (op, 0);
8781
8782 if (op != x)
8783 return op;
8784
8785 return x;
8786 }
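
/* For illustration: given the costed expression
   (plus (ashift (sign_extend:DI (reg:SI x)) (const_int 2)) (reg:DI y)),
   calling aarch64_strip_extend on the first operand with STRIP_SHIFT
   true peels off both the shift and the sign_extend and returns
   (reg:SI x), so only the inner register is costed again.  */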
8787
8788 /* Return true iff CODE is a shift supported in combination
8789 with arithmetic instructions. */
8790
8791 static bool
8792 aarch64_shift_p (enum rtx_code code)
8793 {
8794 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8795 }
8796
8797
8798 /* Return true iff X is a cheap shift without a sign extend. */
8799
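/* Here "cheap" means a left shift, or a multiply by a power of two, of at
   most four bit positions, on cores that set the
   AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND tuning flag. */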
8800 static bool
8801 aarch64_cheap_mult_shift_p (rtx x)
8802 {
8803 rtx op0, op1;
8804
8805 op0 = XEXP (x, 0);
8806 op1 = XEXP (x, 1);
8807
8808 if (!(aarch64_tune_params.extra_tuning_flags
8809 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8810 return false;
8811
8812 if (GET_CODE (op0) == SIGN_EXTEND)
8813 return false;
8814
8815 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8816 && UINTVAL (op1) <= 4)
8817 return true;
8818
8819 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8820 return false;
8821
8822 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8823
8824 if (l2 > 0 && l2 <= 4)
8825 return true;
8826
8827 return false;
8828 }
8829
8830 /* Helper function for rtx cost calculation. Calculate the cost of
8831 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8832 Return the calculated cost of the expression, recursing manually into
8833 operands where needed. */
8834
8835 static int
8836 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8837 {
8838 rtx op0, op1;
8839 const struct cpu_cost_table *extra_cost
8840 = aarch64_tune_params.insn_extra_cost;
8841 int cost = 0;
8842 bool compound_p = (outer == PLUS || outer == MINUS);
8843 machine_mode mode = GET_MODE (x);
8844
8845 gcc_checking_assert (code == MULT);
8846
8847 op0 = XEXP (x, 0);
8848 op1 = XEXP (x, 1);
8849
8850 if (VECTOR_MODE_P (mode))
8851 mode = GET_MODE_INNER (mode);
8852
8853 /* Integer multiply/fma. */
8854 if (GET_MODE_CLASS (mode) == MODE_INT)
8855 {
8856 /* The multiply will be canonicalized as a shift, so cost it as such. */
8857 if (aarch64_shift_p (GET_CODE (x))
8858 || (CONST_INT_P (op1)
8859 && exact_log2 (INTVAL (op1)) > 0))
8860 {
8861 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8862 || GET_CODE (op0) == SIGN_EXTEND;
8863 if (speed)
8864 {
8865 if (compound_p)
8866 {
8867 /* If the shift is considered cheap,
8868 then don't add any cost. */
8869 if (aarch64_cheap_mult_shift_p (x))
8870 ;
8871 else if (REG_P (op1))
8872 /* ARITH + shift-by-register. */
8873 cost += extra_cost->alu.arith_shift_reg;
8874 else if (is_extend)
8875 /* ARITH + extended register. We don't have a cost field
8876 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8877 cost += extra_cost->alu.extend_arith;
8878 else
8879 /* ARITH + shift-by-immediate. */
8880 cost += extra_cost->alu.arith_shift;
8881 }
8882 else
8883 /* LSL (immediate). */
8884 cost += extra_cost->alu.shift;
8885
8886 }
8887 /* Strip extends as we will have costed them in the case above. */
8888 if (is_extend)
8889 op0 = aarch64_strip_extend (op0, true);
8890
8891 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8892
8893 return cost;
8894 }
8895
8896 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8897 compound and let the cases below handle it. After all, MNEG is a
8898 special-case alias of MSUB. */
8899 if (GET_CODE (op0) == NEG)
8900 {
8901 op0 = XEXP (op0, 0);
8902 compound_p = true;
8903 }
8904
8905 /* Integer multiplies or FMAs have zero/sign extending variants. */
8906 if ((GET_CODE (op0) == ZERO_EXTEND
8907 && GET_CODE (op1) == ZERO_EXTEND)
8908 || (GET_CODE (op0) == SIGN_EXTEND
8909 && GET_CODE (op1) == SIGN_EXTEND))
8910 {
8911 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8912 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8913
8914 if (speed)
8915 {
8916 if (compound_p)
8917 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8918 cost += extra_cost->mult[0].extend_add;
8919 else
8920 /* MUL/SMULL/UMULL. */
8921 cost += extra_cost->mult[0].extend;
8922 }
8923
8924 return cost;
8925 }
8926
8927 /* This is either an integer multiply or a MADD. In both cases
8928 we want to recurse and cost the operands. */
8929 cost += rtx_cost (op0, mode, MULT, 0, speed);
8930 cost += rtx_cost (op1, mode, MULT, 1, speed);
8931
8932 if (speed)
8933 {
8934 if (compound_p)
8935 /* MADD/MSUB. */
8936 cost += extra_cost->mult[mode == DImode].add;
8937 else
8938 /* MUL. */
8939 cost += extra_cost->mult[mode == DImode].simple;
8940 }
8941
8942 return cost;
8943 }
8944 else
8945 {
8946 if (speed)
8947 {
8948 /* Floating-point FMA/FMUL can also support negations of the
8949 operands, unless the rounding mode is upward or downward, in
8950 which case FNMUL differs from FMUL with operand negation. */
8951 bool neg0 = GET_CODE (op0) == NEG;
8952 bool neg1 = GET_CODE (op1) == NEG;
8953 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8954 {
8955 if (neg0)
8956 op0 = XEXP (op0, 0);
8957 if (neg1)
8958 op1 = XEXP (op1, 0);
8959 }
8960
8961 if (compound_p)
8962 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8963 cost += extra_cost->fp[mode == DFmode].fma;
8964 else
8965 /* FMUL/FNMUL. */
8966 cost += extra_cost->fp[mode == DFmode].mult;
8967 }
8968
8969 cost += rtx_cost (op0, mode, MULT, 0, speed);
8970 cost += rtx_cost (op1, mode, MULT, 1, speed);
8971 return cost;
8972 }
8973 }
8974
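/* Implement TARGET_ADDRESS_COST: estimate the cost of using X as an address
   for an access of mode MODE, in the units of the per-core address cost
   tables. */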
8975 static int
8976 aarch64_address_cost (rtx x,
8977 machine_mode mode,
8978 addr_space_t as ATTRIBUTE_UNUSED,
8979 bool speed)
8980 {
8981 enum rtx_code c = GET_CODE (x);
8982 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8983 struct aarch64_address_info info;
8984 int cost = 0;
8985 info.shift = 0;
8986
8987 if (!aarch64_classify_address (&info, x, mode, false))
8988 {
8989 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8990 {
8991 /* This is a CONST or SYMBOL ref which will be split
8992 in a different way depending on the code model in use.
8993 Cost it through the generic infrastructure. */
8994 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8995 /* Divide through by the cost of one instruction to
8996 bring it to the same units as the address costs. */
8997 cost_symbol_ref /= COSTS_N_INSNS (1);
8998 /* The cost is then the cost of preparing the address,
8999 followed by an immediate (possibly 0) offset. */
9000 return cost_symbol_ref + addr_cost->imm_offset;
9001 }
9002 else
9003 {
9004 /* This is most likely a jump table from a case
9005 statement. */
9006 return addr_cost->register_offset;
9007 }
9008 }
9009
9010 switch (info.type)
9011 {
9012 case ADDRESS_LO_SUM:
9013 case ADDRESS_SYMBOLIC:
9014 case ADDRESS_REG_IMM:
9015 cost += addr_cost->imm_offset;
9016 break;
9017
9018 case ADDRESS_REG_WB:
9019 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9020 cost += addr_cost->pre_modify;
9021 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9022 cost += addr_cost->post_modify;
9023 else
9024 gcc_unreachable ();
9025
9026 break;
9027
9028 case ADDRESS_REG_REG:
9029 cost += addr_cost->register_offset;
9030 break;
9031
9032 case ADDRESS_REG_SXTW:
9033 cost += addr_cost->register_sextend;
9034 break;
9035
9036 case ADDRESS_REG_UXTW:
9037 cost += addr_cost->register_zextend;
9038 break;
9039
9040 default:
9041 gcc_unreachable ();
9042 }
9043
9044
9045 if (info.shift > 0)
9046 {
9047 /* For the sake of calculating the cost of the shifted register
9048 component, we can treat same-sized modes in the same way. */
9049 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9050 cost += addr_cost->addr_scale_costs.hi;
9051 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9052 cost += addr_cost->addr_scale_costs.si;
9053 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9054 cost += addr_cost->addr_scale_costs.di;
9055 else
9056 /* We can't tell, or this is a 128-bit vector. */
9057 cost += addr_cost->addr_scale_costs.ti;
9058 }
9059
9060 return cost;
9061 }
9062
9063 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9064 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9065 to be taken. */
9066
9067 int
9068 aarch64_branch_cost (bool speed_p, bool predictable_p)
9069 {
9070 /* When optimizing for speed, use the cost of unpredictable branches. */
9071 const struct cpu_branch_cost *branch_costs =
9072 aarch64_tune_params.branch_costs;
9073
9074 if (!speed_p || predictable_p)
9075 return branch_costs->predictable;
9076 else
9077 return branch_costs->unpredictable;
9078 }
9079
9080 /* Return true if the RTX X in mode MODE is a zero or sign extract
9081 usable in an ADD or SUB (extended register) instruction. */
9082 static bool
9083 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9084 {
9085 /* Catch add with a sign extract.
9086 This is add_<optab><mode>_multp2. */
9087 if (GET_CODE (x) == SIGN_EXTRACT
9088 || GET_CODE (x) == ZERO_EXTRACT)
9089 {
9090 rtx op0 = XEXP (x, 0);
9091 rtx op1 = XEXP (x, 1);
9092 rtx op2 = XEXP (x, 2);
9093
9094 if (GET_CODE (op0) == MULT
9095 && CONST_INT_P (op1)
9096 && op2 == const0_rtx
9097 && CONST_INT_P (XEXP (op0, 1))
9098 && aarch64_is_extend_from_extract (mode,
9099 XEXP (op0, 1),
9100 op1))
9101 {
9102 return true;
9103 }
9104 }
9105 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9106 No shift. */
9107 else if (GET_CODE (x) == SIGN_EXTEND
9108 || GET_CODE (x) == ZERO_EXTEND)
9109 return REG_P (XEXP (x, 0));
9110
9111 return false;
9112 }
9113
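/* Return true if U is the UNSPEC number of one of the FRINT floating-point
   rounding instructions. */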
9114 static bool
9115 aarch64_frint_unspec_p (unsigned int u)
9116 {
9117 switch (u)
9118 {
9119 case UNSPEC_FRINTZ:
9120 case UNSPEC_FRINTP:
9121 case UNSPEC_FRINTM:
9122 case UNSPEC_FRINTA:
9123 case UNSPEC_FRINTN:
9124 case UNSPEC_FRINTX:
9125 case UNSPEC_FRINTI:
9126 return true;
9127
9128 default:
9129 return false;
9130 }
9131 }
9132
9133 /* Return true iff X is an rtx that will match an extr instruction,
9134 i.e. as described in the *extr<mode>5_insn family of patterns.
9135 OP0 and OP1 will be set to the operands of the shifts involved
9136 on success and will be NULL_RTX otherwise. */
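/* For example, in DImode (ior (ashift X (const_int 48))
   (lshiftrt Y (const_int 16))) matches, setting *RES_OP0 to X and
   *RES_OP1 to Y, since the shift amounts sum to the mode's bitsize. */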
9137
9138 static bool
9139 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9140 {
9141 rtx op0, op1;
9142 scalar_int_mode mode;
9143 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9144 return false;
9145
9146 *res_op0 = NULL_RTX;
9147 *res_op1 = NULL_RTX;
9148
9149 if (GET_CODE (x) != IOR)
9150 return false;
9151
9152 op0 = XEXP (x, 0);
9153 op1 = XEXP (x, 1);
9154
9155 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9156 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9157 {
9158 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9159 if (GET_CODE (op1) == ASHIFT)
9160 std::swap (op0, op1);
9161
9162 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9163 return false;
9164
9165 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9166 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9167
9168 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9169 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9170 {
9171 *res_op0 = XEXP (op0, 0);
9172 *res_op1 = XEXP (op1, 0);
9173 return true;
9174 }
9175 }
9176
9177 return false;
9178 }
9179
9180 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9181 storing it in *COST. Result is true if the total cost of the operation
9182 has now been calculated. */
9183 static bool
9184 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9185 {
9186 rtx inner;
9187 rtx comparator;
9188 enum rtx_code cmpcode;
9189
9190 if (COMPARISON_P (op0))
9191 {
9192 inner = XEXP (op0, 0);
9193 comparator = XEXP (op0, 1);
9194 cmpcode = GET_CODE (op0);
9195 }
9196 else
9197 {
9198 inner = op0;
9199 comparator = const0_rtx;
9200 cmpcode = NE;
9201 }
9202
9203 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9204 {
9205 /* Conditional branch. */
9206 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9207 return true;
9208 else
9209 {
9210 if (cmpcode == NE || cmpcode == EQ)
9211 {
9212 if (comparator == const0_rtx)
9213 {
9214 /* TBZ/TBNZ/CBZ/CBNZ. */
9215 if (GET_CODE (inner) == ZERO_EXTRACT)
9216 /* TBZ/TBNZ. */
9217 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9218 ZERO_EXTRACT, 0, speed);
9219 else
9220 /* CBZ/CBNZ. */
9221 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9222
9223 return true;
9224 }
9225 }
9226 else if (cmpcode == LT || cmpcode == GE)
9227 {
9228 /* TBZ/TBNZ. */
9229 if (comparator == const0_rtx)
9230 return true;
9231 }
9232 }
9233 }
9234 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9235 {
9236 /* CCMP. */
9237 if (GET_CODE (op1) == COMPARE)
9238 {
9239 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9240 if (XEXP (op1, 1) == const0_rtx)
9241 *cost += 1;
9242 if (speed)
9243 {
9244 machine_mode mode = GET_MODE (XEXP (op1, 0));
9245 const struct cpu_cost_table *extra_cost
9246 = aarch64_tune_params.insn_extra_cost;
9247
9248 if (GET_MODE_CLASS (mode) == MODE_INT)
9249 *cost += extra_cost->alu.arith;
9250 else
9251 *cost += extra_cost->fp[mode == DFmode].compare;
9252 }
9253 return true;
9254 }
9255
9256 /* It's a conditional operation based on the status flags,
9257 so it must be some flavor of CSEL. */
9258
9259 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9260 if (GET_CODE (op1) == NEG
9261 || GET_CODE (op1) == NOT
9262 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9263 op1 = XEXP (op1, 0);
9264 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9265 {
9266 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9267 op1 = XEXP (op1, 0);
9268 op2 = XEXP (op2, 0);
9269 }
9270
9271 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9272 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9273 return true;
9274 }
9275
9276 /* We don't know what this is, cost all operands. */
9277 return false;
9278 }
9279
9280 /* Check whether X is a bitfield operation of the form shift + extend that
9281 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9282 operand to which the bitfield operation is applied. Otherwise return
9283 NULL_RTX. */
9284
9285 static rtx
9286 aarch64_extend_bitfield_pattern_p (rtx x)
9287 {
9288 rtx_code outer_code = GET_CODE (x);
9289 machine_mode outer_mode = GET_MODE (x);
9290
9291 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9292 && outer_mode != SImode && outer_mode != DImode)
9293 return NULL_RTX;
9294
9295 rtx inner = XEXP (x, 0);
9296 rtx_code inner_code = GET_CODE (inner);
9297 machine_mode inner_mode = GET_MODE (inner);
9298 rtx op = NULL_RTX;
9299
9300 switch (inner_code)
9301 {
9302 case ASHIFT:
9303 if (CONST_INT_P (XEXP (inner, 1))
9304 && (inner_mode == QImode || inner_mode == HImode))
9305 op = XEXP (inner, 0);
9306 break;
9307 case LSHIFTRT:
9308 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9309 && (inner_mode == QImode || inner_mode == HImode))
9310 op = XEXP (inner, 0);
9311 break;
9312 case ASHIFTRT:
9313 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9314 && (inner_mode == QImode || inner_mode == HImode))
9315 op = XEXP (inner, 0);
9316 break;
9317 default:
9318 break;
9319 }
9320
9321 return op;
9322 }
9323
9324 /* Return true if the mask and a shift amount from an RTX of the form
9325 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9326 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
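/* For example, in SImode a MASK of 0xff0 and a SHFT_AMNT of 4 are valid:
   (0xff0 >> 4) + 1 is 0x100, a power of two, and the low four bits of the
   mask are clear, so (x << 4) & 0xff0 is a UBFIZ of an 8-bit field at bit
   position 4. */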
9327
9328 bool
9329 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9330 rtx shft_amnt)
9331 {
9332 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9333 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9334 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9335 && (INTVAL (mask)
9336 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9337 }
9338
9339 /* Calculate the cost of calculating X, storing it in *COST. Result
9340 is true if the total cost of the operation has now been calculated. */
9341 static bool
9342 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9343 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9344 {
9345 rtx op0, op1, op2;
9346 const struct cpu_cost_table *extra_cost
9347 = aarch64_tune_params.insn_extra_cost;
9348 int code = GET_CODE (x);
9349 scalar_int_mode int_mode;
9350
9351 /* By default, assume that everything has equivalent cost to the
9352 cheapest instruction. Any additional costs are applied as a delta
9353 above this default. */
9354 *cost = COSTS_N_INSNS (1);
9355
9356 switch (code)
9357 {
9358 case SET:
9359 /* The cost depends entirely on the operands to SET. */
9360 *cost = 0;
9361 op0 = SET_DEST (x);
9362 op1 = SET_SRC (x);
9363
9364 switch (GET_CODE (op0))
9365 {
9366 case MEM:
9367 if (speed)
9368 {
9369 rtx address = XEXP (op0, 0);
9370 if (VECTOR_MODE_P (mode))
9371 *cost += extra_cost->ldst.storev;
9372 else if (GET_MODE_CLASS (mode) == MODE_INT)
9373 *cost += extra_cost->ldst.store;
9374 else if (mode == SFmode)
9375 *cost += extra_cost->ldst.storef;
9376 else if (mode == DFmode)
9377 *cost += extra_cost->ldst.stored;
9378
9379 *cost +=
9380 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9381 0, speed));
9382 }
9383
9384 *cost += rtx_cost (op1, mode, SET, 1, speed);
9385 return true;
9386
9387 case SUBREG:
9388 if (! REG_P (SUBREG_REG (op0)))
9389 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9390
9391 /* Fall through. */
9392 case REG:
9393 /* The cost is one per vector-register copied. */
9394 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9395 {
9396 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9397 *cost = COSTS_N_INSNS (nregs);
9398 }
9399 /* const0_rtx is in general free, but we will use an
9400 instruction to set a register to 0. */
9401 else if (REG_P (op1) || op1 == const0_rtx)
9402 {
9403 /* The cost is 1 per register copied. */
9404 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9405 *cost = COSTS_N_INSNS (nregs);
9406 }
9407 else
9408 /* Cost is just the cost of the RHS of the set. */
9409 *cost += rtx_cost (op1, mode, SET, 1, speed);
9410 return true;
9411
9412 case ZERO_EXTRACT:
9413 case SIGN_EXTRACT:
9414 /* Bit-field insertion. Strip any redundant widening of
9415 the RHS to meet the width of the target. */
9416 if (GET_CODE (op1) == SUBREG)
9417 op1 = SUBREG_REG (op1);
9418 if ((GET_CODE (op1) == ZERO_EXTEND
9419 || GET_CODE (op1) == SIGN_EXTEND)
9420 && CONST_INT_P (XEXP (op0, 1))
9421 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9422 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9423 op1 = XEXP (op1, 0);
9424
9425 if (CONST_INT_P (op1))
9426 {
9427 /* MOV immediate is assumed to always be cheap. */
9428 *cost = COSTS_N_INSNS (1);
9429 }
9430 else
9431 {
9432 /* BFM. */
9433 if (speed)
9434 *cost += extra_cost->alu.bfi;
9435 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9436 }
9437
9438 return true;
9439
9440 default:
9441 /* We can't make sense of this, assume default cost. */
9442 *cost = COSTS_N_INSNS (1);
9443 return false;
9444 }
9445 return false;
9446
9447 case CONST_INT:
9448 /* If an instruction can incorporate a constant directly, its
9449 expression avoids calling rtx_cost() on that constant. If
9450 rtx_cost() is called on a
9451 constant, then it is usually because the constant must be
9452 moved into a register by one or more instructions.
9453
9454 The exception is constant 0, which can be expressed
9455 as XZR/WZR and is therefore free. The one case that is not free
9456 is (set (reg) (const0_rtx)), where we must cost
9457 the move. However, we can catch that when we cost the SET, so
9458 we don't need to consider that here. */
9459 if (x == const0_rtx)
9460 *cost = 0;
9461 else
9462 {
9463 /* To an approximation, the cost of building any other constant is
9464 proportional to the number of instructions required to build
9465 that constant. This is true whether we are compiling for
9466 SPEED or otherwise. */
9467 if (!is_a <scalar_int_mode> (mode, &int_mode))
9468 int_mode = word_mode;
9469 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9470 (NULL_RTX, x, false, int_mode));
9471 }
9472 return true;
9473
9474 case CONST_DOUBLE:
9475
9476 /* First determine number of instructions to do the move
9477 as an integer constant. */
9478 if (!aarch64_float_const_representable_p (x)
9479 && !aarch64_can_const_movi_rtx_p (x, mode)
9480 && aarch64_float_const_rtx_p (x))
9481 {
9482 unsigned HOST_WIDE_INT ival;
9483 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9484 gcc_assert (succeed);
9485
9486 scalar_int_mode imode = (mode == HFmode
9487 ? SImode
9488 : int_mode_for_mode (mode).require ());
9489 int ncost = aarch64_internal_mov_immediate
9490 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9491 *cost += COSTS_N_INSNS (ncost);
9492 return true;
9493 }
9494
9495 if (speed)
9496 {
9497 /* mov[df,sf]_aarch64. */
9498 if (aarch64_float_const_representable_p (x))
9499 /* FMOV (scalar immediate). */
9500 *cost += extra_cost->fp[mode == DFmode].fpconst;
9501 else if (!aarch64_float_const_zero_rtx_p (x))
9502 {
9503 /* This will be a load from memory. */
9504 if (mode == DFmode)
9505 *cost += extra_cost->ldst.loadd;
9506 else
9507 *cost += extra_cost->ldst.loadf;
9508 }
9509 else
9510 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9511 or MOV v0.s[0], wzr - neither of which is modeled by the
9512 cost tables. Just use the default cost. */
9513 {
9514 }
9515 }
9516
9517 return true;
9518
9519 case MEM:
9520 if (speed)
9521 {
9522 /* For loads we want the base cost of a load, plus an
9523 approximation for the additional cost of the addressing
9524 mode. */
9525 rtx address = XEXP (x, 0);
9526 if (VECTOR_MODE_P (mode))
9527 *cost += extra_cost->ldst.loadv;
9528 else if (GET_MODE_CLASS (mode) == MODE_INT)
9529 *cost += extra_cost->ldst.load;
9530 else if (mode == SFmode)
9531 *cost += extra_cost->ldst.loadf;
9532 else if (mode == DFmode)
9533 *cost += extra_cost->ldst.loadd;
9534
9535 *cost +=
9536 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9537 0, speed));
9538 }
9539
9540 return true;
9541
9542 case NEG:
9543 op0 = XEXP (x, 0);
9544
9545 if (VECTOR_MODE_P (mode))
9546 {
9547 if (speed)
9548 {
9549 /* FNEG. */
9550 *cost += extra_cost->vect.alu;
9551 }
9552 return false;
9553 }
9554
9555 if (GET_MODE_CLASS (mode) == MODE_INT)
9556 {
9557 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9558 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9559 {
9560 /* CSETM. */
9561 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9562 return true;
9563 }
9564
9565 /* Cost this as SUB wzr, X. */
9566 op0 = CONST0_RTX (mode);
9567 op1 = XEXP (x, 0);
9568 goto cost_minus;
9569 }
9570
9571 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9572 {
9573 /* Support (neg(fma...)) as a single instruction only if
9574 sign of zeros is unimportant. This matches the decision
9575 making in aarch64.md. */
9576 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9577 {
9578 /* FNMADD. */
9579 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9580 return true;
9581 }
9582 if (GET_CODE (op0) == MULT)
9583 {
9584 /* FNMUL. */
9585 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9586 return true;
9587 }
9588 if (speed)
9589 /* FNEG. */
9590 *cost += extra_cost->fp[mode == DFmode].neg;
9591 return false;
9592 }
9593
9594 return false;
9595
9596 case CLRSB:
9597 case CLZ:
9598 if (speed)
9599 {
9600 if (VECTOR_MODE_P (mode))
9601 *cost += extra_cost->vect.alu;
9602 else
9603 *cost += extra_cost->alu.clz;
9604 }
9605
9606 return false;
9607
9608 case COMPARE:
9609 op0 = XEXP (x, 0);
9610 op1 = XEXP (x, 1);
9611
9612 if (op1 == const0_rtx
9613 && GET_CODE (op0) == AND)
9614 {
9615 x = op0;
9616 mode = GET_MODE (op0);
9617 goto cost_logic;
9618 }
9619
9620 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9621 {
9622 /* TODO: A write to the CC flags possibly costs extra, this
9623 needs encoding in the cost tables. */
9624
9625 mode = GET_MODE (op0);
9626 /* ANDS. */
9627 if (GET_CODE (op0) == AND)
9628 {
9629 x = op0;
9630 goto cost_logic;
9631 }
9632
9633 if (GET_CODE (op0) == PLUS)
9634 {
9635 /* ADDS (and CMN alias). */
9636 x = op0;
9637 goto cost_plus;
9638 }
9639
9640 if (GET_CODE (op0) == MINUS)
9641 {
9642 /* SUBS. */
9643 x = op0;
9644 goto cost_minus;
9645 }
9646
9647 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9648 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9649 && CONST_INT_P (XEXP (op0, 2)))
9650 {
9651 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9652 Handle it here directly rather than going to cost_logic
9653 since we know the immediate generated for the TST is valid,
9654 so we can avoid creating an intermediate rtx for it only
9655 for costing purposes. */
9656 if (speed)
9657 *cost += extra_cost->alu.logical;
9658
9659 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9660 ZERO_EXTRACT, 0, speed);
9661 return true;
9662 }
9663
9664 if (GET_CODE (op1) == NEG)
9665 {
9666 /* CMN. */
9667 if (speed)
9668 *cost += extra_cost->alu.arith;
9669
9670 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9671 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9672 return true;
9673 }
9674
9675 /* CMP.
9676
9677 Compare can freely swap the order of operands, and
9678 canonicalization puts the more complex operation first.
9679 But the integer MINUS logic expects the shift/extend
9680 operation in op1. */
9681 if (! (REG_P (op0)
9682 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9683 {
9684 op0 = XEXP (x, 1);
9685 op1 = XEXP (x, 0);
9686 }
9687 goto cost_minus;
9688 }
9689
9690 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9691 {
9692 /* FCMP. */
9693 if (speed)
9694 *cost += extra_cost->fp[mode == DFmode].compare;
9695
9696 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9697 {
9698 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9699 /* FCMP supports constant 0.0 for no extra cost. */
9700 return true;
9701 }
9702 return false;
9703 }
9704
9705 if (VECTOR_MODE_P (mode))
9706 {
9707 /* Vector compare. */
9708 if (speed)
9709 *cost += extra_cost->vect.alu;
9710
9711 if (aarch64_float_const_zero_rtx_p (op1))
9712 {
9713 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9714 cost. */
9715 return true;
9716 }
9717 return false;
9718 }
9719 return false;
9720
9721 case MINUS:
9722 {
9723 op0 = XEXP (x, 0);
9724 op1 = XEXP (x, 1);
9725
9726 cost_minus:
9727 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9728
9729 /* Detect valid immediates. */
9730 if ((GET_MODE_CLASS (mode) == MODE_INT
9731 || (GET_MODE_CLASS (mode) == MODE_CC
9732 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9733 && CONST_INT_P (op1)
9734 && aarch64_uimm12_shift (INTVAL (op1)))
9735 {
9736 if (speed)
9737 /* SUB(S) (immediate). */
9738 *cost += extra_cost->alu.arith;
9739 return true;
9740 }
9741
9742 /* Look for SUB (extended register). */
9743 if (is_a <scalar_int_mode> (mode, &int_mode)
9744 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9745 {
9746 if (speed)
9747 *cost += extra_cost->alu.extend_arith;
9748
9749 op1 = aarch64_strip_extend (op1, true);
9750 *cost += rtx_cost (op1, VOIDmode,
9751 (enum rtx_code) GET_CODE (op1), 0, speed);
9752 return true;
9753 }
9754
9755 rtx new_op1 = aarch64_strip_extend (op1, false);
9756
9757 /* Cost this as an FMA-alike operation. */
9758 if ((GET_CODE (new_op1) == MULT
9759 || aarch64_shift_p (GET_CODE (new_op1)))
9760 && code != COMPARE)
9761 {
9762 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9763 (enum rtx_code) code,
9764 speed);
9765 return true;
9766 }
9767
9768 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9769
9770 if (speed)
9771 {
9772 if (VECTOR_MODE_P (mode))
9773 {
9774 /* Vector SUB. */
9775 *cost += extra_cost->vect.alu;
9776 }
9777 else if (GET_MODE_CLASS (mode) == MODE_INT)
9778 {
9779 /* SUB(S). */
9780 *cost += extra_cost->alu.arith;
9781 }
9782 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9783 {
9784 /* FSUB. */
9785 *cost += extra_cost->fp[mode == DFmode].addsub;
9786 }
9787 }
9788 return true;
9789 }
9790
9791 case PLUS:
9792 {
9793 rtx new_op0;
9794
9795 op0 = XEXP (x, 0);
9796 op1 = XEXP (x, 1);
9797
9798 cost_plus:
9799 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9800 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9801 {
9802 /* CSINC. */
9803 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9804 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9805 return true;
9806 }
9807
9808 if (GET_MODE_CLASS (mode) == MODE_INT
9809 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9810 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9811 {
9812 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9813
9814 if (speed)
9815 /* ADD (immediate). */
9816 *cost += extra_cost->alu.arith;
9817 return true;
9818 }
9819
9820 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9821
9822 /* Look for ADD (extended register). */
9823 if (is_a <scalar_int_mode> (mode, &int_mode)
9824 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9825 {
9826 if (speed)
9827 *cost += extra_cost->alu.extend_arith;
9828
9829 op0 = aarch64_strip_extend (op0, true);
9830 *cost += rtx_cost (op0, VOIDmode,
9831 (enum rtx_code) GET_CODE (op0), 0, speed);
9832 return true;
9833 }
9834
9835 /* Strip any extend, leave shifts behind as we will
9836 cost them through mult_cost. */
9837 new_op0 = aarch64_strip_extend (op0, false);
9838
9839 if (GET_CODE (new_op0) == MULT
9840 || aarch64_shift_p (GET_CODE (new_op0)))
9841 {
9842 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
9843 speed);
9844 return true;
9845 }
9846
9847 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
9848
9849 if (speed)
9850 {
9851 if (VECTOR_MODE_P (mode))
9852 {
9853 /* Vector ADD. */
9854 *cost += extra_cost->vect.alu;
9855 }
9856 else if (GET_MODE_CLASS (mode) == MODE_INT)
9857 {
9858 /* ADD. */
9859 *cost += extra_cost->alu.arith;
9860 }
9861 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9862 {
9863 /* FADD. */
9864 *cost += extra_cost->fp[mode == DFmode].addsub;
9865 }
9866 }
9867 return true;
9868 }
9869
9870 case BSWAP:
9871 *cost = COSTS_N_INSNS (1);
9872
9873 if (speed)
9874 {
9875 if (VECTOR_MODE_P (mode))
9876 *cost += extra_cost->vect.alu;
9877 else
9878 *cost += extra_cost->alu.rev;
9879 }
9880 return false;
9881
9882 case IOR:
9883 if (aarch_rev16_p (x))
9884 {
9885 *cost = COSTS_N_INSNS (1);
9886
9887 if (speed)
9888 {
9889 if (VECTOR_MODE_P (mode))
9890 *cost += extra_cost->vect.alu;
9891 else
9892 *cost += extra_cost->alu.rev;
9893 }
9894 return true;
9895 }
9896
9897 if (aarch64_extr_rtx_p (x, &op0, &op1))
9898 {
9899 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9900 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9901 if (speed)
9902 *cost += extra_cost->alu.shift;
9903
9904 return true;
9905 }
9906 /* Fall through. */
9907 case XOR:
9908 case AND:
9909 cost_logic:
9910 op0 = XEXP (x, 0);
9911 op1 = XEXP (x, 1);
9912
9913 if (VECTOR_MODE_P (mode))
9914 {
9915 if (speed)
9916 *cost += extra_cost->vect.alu;
9917 return true;
9918 }
9919
9920 if (code == AND
9921 && GET_CODE (op0) == MULT
9922 && CONST_INT_P (XEXP (op0, 1))
9923 && CONST_INT_P (op1)
9924 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9925 INTVAL (op1)) != 0)
9926 {
9927 /* This is a UBFM/SBFM. */
9928 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9929 if (speed)
9930 *cost += extra_cost->alu.bfx;
9931 return true;
9932 }
9933
9934 if (is_int_mode (mode, &int_mode))
9935 {
9936 if (CONST_INT_P (op1))
9937 {
9938 /* We have a mask + shift version of a UBFIZ
9939 i.e. the *andim_ashift<mode>_bfiz pattern. */
9940 if (GET_CODE (op0) == ASHIFT
9941 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9942 XEXP (op0, 1)))
9943 {
9944 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9945 (enum rtx_code) code, 0, speed);
9946 if (speed)
9947 *cost += extra_cost->alu.bfx;
9948
9949 return true;
9950 }
9951 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9952 {
9953 /* We possibly get the immediate for free; this is not
9954 modelled. */
9955 *cost += rtx_cost (op0, int_mode,
9956 (enum rtx_code) code, 0, speed);
9957 if (speed)
9958 *cost += extra_cost->alu.logical;
9959
9960 return true;
9961 }
9962 }
9963 else
9964 {
9965 rtx new_op0 = op0;
9966
9967 /* Handle ORN, EON, or BIC. */
9968 if (GET_CODE (op0) == NOT)
9969 op0 = XEXP (op0, 0);
9970
9971 new_op0 = aarch64_strip_shift (op0);
9972
9973 /* If we had a shift on op0 then this is a logical-shift-
9974 by-register/immediate operation. Otherwise, this is just
9975 a logical operation. */
9976 if (speed)
9977 {
9978 if (new_op0 != op0)
9979 {
9980 /* Shift by immediate. */
9981 if (CONST_INT_P (XEXP (op0, 1)))
9982 *cost += extra_cost->alu.log_shift;
9983 else
9984 *cost += extra_cost->alu.log_shift_reg;
9985 }
9986 else
9987 *cost += extra_cost->alu.logical;
9988 }
9989
9990 /* In both cases we want to cost both operands. */
9991 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9992 0, speed);
9993 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9994 1, speed);
9995
9996 return true;
9997 }
9998 }
9999 return false;
10000
10001 case NOT:
10002 x = XEXP (x, 0);
10003 op0 = aarch64_strip_shift (x);
10004
10005 if (VECTOR_MODE_P (mode))
10006 {
10007 /* Vector NOT. */
10008 *cost += extra_cost->vect.alu;
10009 return false;
10010 }
10011
10012 /* MVN-shifted-reg. */
10013 if (op0 != x)
10014 {
10015 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10016
10017 if (speed)
10018 *cost += extra_cost->alu.log_shift;
10019
10020 return true;
10021 }
10022 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10023 Handle the second form here, taking care that 'a' in the above can
10024 be a shift. */
10025 else if (GET_CODE (op0) == XOR)
10026 {
10027 rtx newop0 = XEXP (op0, 0);
10028 rtx newop1 = XEXP (op0, 1);
10029 rtx op0_stripped = aarch64_strip_shift (newop0);
10030
10031 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10032 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10033
10034 if (speed)
10035 {
10036 if (op0_stripped != newop0)
10037 *cost += extra_cost->alu.log_shift;
10038 else
10039 *cost += extra_cost->alu.logical;
10040 }
10041
10042 return true;
10043 }
10044 /* MVN. */
10045 if (speed)
10046 *cost += extra_cost->alu.logical;
10047
10048 return false;
10049
10050 case ZERO_EXTEND:
10051
10052 op0 = XEXP (x, 0);
10053 /* If a value is written in SI mode and then zero-extended to DI
10054 mode, the operation will in general be free, as a write to
10055 a 'w' register implicitly zeroes the upper bits of an 'x'
10056 register. However, if this is
10057
10058 (set (reg) (zero_extend (reg)))
10059
10060 we must cost the explicit register move. */
10061 if (mode == DImode
10062 && GET_MODE (op0) == SImode
10063 && outer == SET)
10064 {
10065 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10066
10067 /* If OP_COST is non-zero, then the cost of the zero extend
10068 is effectively the cost of the inner operation. Otherwise
10069 we have a MOV instruction and we take the cost from the MOV
10070 itself. This is true independently of whether we are
10071 optimizing for space or time. */
10072 if (op_cost)
10073 *cost = op_cost;
10074
10075 return true;
10076 }
10077 else if (MEM_P (op0))
10078 {
10079 /* All loads can zero extend to any size for free. */
10080 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10081 return true;
10082 }
10083
10084 op0 = aarch64_extend_bitfield_pattern_p (x);
10085 if (op0)
10086 {
10087 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10088 if (speed)
10089 *cost += extra_cost->alu.bfx;
10090 return true;
10091 }
10092
10093 if (speed)
10094 {
10095 if (VECTOR_MODE_P (mode))
10096 {
10097 /* UMOV. */
10098 *cost += extra_cost->vect.alu;
10099 }
10100 else
10101 {
10102 /* We generate an AND instead of UXTB/UXTH. */
10103 *cost += extra_cost->alu.logical;
10104 }
10105 }
10106 return false;
10107
10108 case SIGN_EXTEND:
10109 if (MEM_P (XEXP (x, 0)))
10110 {
10111 /* LDRSH. */
10112 if (speed)
10113 {
10114 rtx address = XEXP (XEXP (x, 0), 0);
10115 *cost += extra_cost->ldst.load_sign_extend;
10116
10117 *cost +=
10118 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10119 0, speed));
10120 }
10121 return true;
10122 }
10123
10124 op0 = aarch64_extend_bitfield_pattern_p (x);
10125 if (op0)
10126 {
10127 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10128 if (speed)
10129 *cost += extra_cost->alu.bfx;
10130 return true;
10131 }
10132
10133 if (speed)
10134 {
10135 if (VECTOR_MODE_P (mode))
10136 *cost += extra_cost->vect.alu;
10137 else
10138 *cost += extra_cost->alu.extend;
10139 }
10140 return false;
10141
10142 case ASHIFT:
10143 op0 = XEXP (x, 0);
10144 op1 = XEXP (x, 1);
10145
10146 if (CONST_INT_P (op1))
10147 {
10148 if (speed)
10149 {
10150 if (VECTOR_MODE_P (mode))
10151 {
10152 /* Vector shift (immediate). */
10153 *cost += extra_cost->vect.alu;
10154 }
10155 else
10156 {
10157 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10158 aliases. */
10159 *cost += extra_cost->alu.shift;
10160 }
10161 }
10162
10163 /* We can incorporate zero/sign extend for free. */
10164 if (GET_CODE (op0) == ZERO_EXTEND
10165 || GET_CODE (op0) == SIGN_EXTEND)
10166 op0 = XEXP (op0, 0);
10167
10168 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10169 return true;
10170 }
10171 else
10172 {
10173 if (VECTOR_MODE_P (mode))
10174 {
10175 if (speed)
10176 /* Vector shift (register). */
10177 *cost += extra_cost->vect.alu;
10178 }
10179 else
10180 {
10181 if (speed)
10182 /* LSLV. */
10183 *cost += extra_cost->alu.shift_reg;
10184
10185 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10186 && CONST_INT_P (XEXP (op1, 1))
10187 && known_eq (INTVAL (XEXP (op1, 1)),
10188 GET_MODE_BITSIZE (mode) - 1))
10189 {
10190 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10191 /* We already demanded XEXP (op1, 0) to be REG_P, so
10192 don't recurse into it. */
10193 return true;
10194 }
10195 }
10196 return false; /* All arguments need to be in registers. */
10197 }
10198
10199 case ROTATE:
10200 case ROTATERT:
10201 case LSHIFTRT:
10202 case ASHIFTRT:
10203 op0 = XEXP (x, 0);
10204 op1 = XEXP (x, 1);
10205
10206 if (CONST_INT_P (op1))
10207 {
10208 /* ASR (immediate) and friends. */
10209 if (speed)
10210 {
10211 if (VECTOR_MODE_P (mode))
10212 *cost += extra_cost->vect.alu;
10213 else
10214 *cost += extra_cost->alu.shift;
10215 }
10216
10217 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10218 return true;
10219 }
10220 else
10221 {
10222 if (VECTOR_MODE_P (mode))
10223 {
10224 if (speed)
10225 /* Vector shift (register). */
10226 *cost += extra_cost->vect.alu;
10227 }
10228 else
10229 {
10230 if (speed)
10231 /* ASR (register) and friends. */
10232 *cost += extra_cost->alu.shift_reg;
10233
10234 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10235 && CONST_INT_P (XEXP (op1, 1))
10236 && known_eq (INTVAL (XEXP (op1, 1)),
10237 GET_MODE_BITSIZE (mode) - 1))
10238 {
10239 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10240 /* We already demanded XEXP (op1, 0) to be REG_P, so
10241 don't recurse into it. */
10242 return true;
10243 }
10244 }
10245 return false; /* All arguments need to be in registers. */
10246 }
10247
10248 case SYMBOL_REF:
10249
10250 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10251 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10252 {
10253 /* LDR. */
10254 if (speed)
10255 *cost += extra_cost->ldst.load;
10256 }
10257 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10258 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10259 {
10260 /* ADRP, followed by ADD. */
10261 *cost += COSTS_N_INSNS (1);
10262 if (speed)
10263 *cost += 2 * extra_cost->alu.arith;
10264 }
10265 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10266 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10267 {
10268 /* ADR. */
10269 if (speed)
10270 *cost += extra_cost->alu.arith;
10271 }
10272
10273 if (flag_pic)
10274 {
10275 /* One extra load instruction, after accessing the GOT. */
10276 *cost += COSTS_N_INSNS (1);
10277 if (speed)
10278 *cost += extra_cost->ldst.load;
10279 }
10280 return true;
10281
10282 case HIGH:
10283 case LO_SUM:
10284 /* ADRP/ADD (immediate). */
10285 if (speed)
10286 *cost += extra_cost->alu.arith;
10287 return true;
10288
10289 case ZERO_EXTRACT:
10290 case SIGN_EXTRACT:
10291 /* UBFX/SBFX. */
10292 if (speed)
10293 {
10294 if (VECTOR_MODE_P (mode))
10295 *cost += extra_cost->vect.alu;
10296 else
10297 *cost += extra_cost->alu.bfx;
10298 }
10299
10300 /* We can trust that the immediates used will be correct (there
10301 are no by-register forms), so we need only cost op0. */
10302 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10303 return true;
10304
10305 case MULT:
10306 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10307 /* aarch64_rtx_mult_cost always handles recursion to its
10308 operands. */
10309 return true;
10310
10311 case MOD:
10312 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10313 ANDs and a CSNEG. Assume here that a CSNEG costs the same as
10314 an unconditional negate. This case should only ever be reached through
10315 the set_smod_pow2_cheap check in expmed.c. */
10316 if (CONST_INT_P (XEXP (x, 1))
10317 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10318 && (mode == SImode || mode == DImode))
10319 {
10320 /* We expand to 4 instructions. Reset the baseline. */
10321 *cost = COSTS_N_INSNS (4);
10322
10323 if (speed)
10324 *cost += 2 * extra_cost->alu.logical
10325 + 2 * extra_cost->alu.arith;
10326
10327 return true;
10328 }
10329
10330 /* Fall through. */
10331 case UMOD:
10332 if (speed)
10333 {
10334 /* Slightly prefer UMOD over SMOD. */
10335 if (VECTOR_MODE_P (mode))
10336 *cost += extra_cost->vect.alu;
10337 else if (GET_MODE_CLASS (mode) == MODE_INT)
10338 *cost += (extra_cost->mult[mode == DImode].add
10339 + extra_cost->mult[mode == DImode].idiv
10340 + (code == MOD ? 1 : 0));
10341 }
10342 return false; /* All arguments need to be in registers. */
10343
10344 case DIV:
10345 case UDIV:
10346 case SQRT:
10347 if (speed)
10348 {
10349 if (VECTOR_MODE_P (mode))
10350 *cost += extra_cost->vect.alu;
10351 else if (GET_MODE_CLASS (mode) == MODE_INT)
10352 /* There is no integer SQRT, so only DIV and UDIV can get
10353 here. */
10354 *cost += (extra_cost->mult[mode == DImode].idiv
10355 /* Slightly prefer UDIV over SDIV. */
10356 + (code == DIV ? 1 : 0));
10357 else
10358 *cost += extra_cost->fp[mode == DFmode].div;
10359 }
10360 return false; /* All arguments need to be in registers. */
10361
10362 case IF_THEN_ELSE:
10363 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10364 XEXP (x, 2), cost, speed);
10365
10366 case EQ:
10367 case NE:
10368 case GT:
10369 case GTU:
10370 case LT:
10371 case LTU:
10372 case GE:
10373 case GEU:
10374 case LE:
10375 case LEU:
10376
10377 return false; /* All arguments must be in registers. */
10378
10379 case FMA:
10380 op0 = XEXP (x, 0);
10381 op1 = XEXP (x, 1);
10382 op2 = XEXP (x, 2);
10383
10384 if (speed)
10385 {
10386 if (VECTOR_MODE_P (mode))
10387 *cost += extra_cost->vect.alu;
10388 else
10389 *cost += extra_cost->fp[mode == DFmode].fma;
10390 }
10391
10392 /* FMSUB, FNMADD, and FNMSUB are free. */
10393 if (GET_CODE (op0) == NEG)
10394 op0 = XEXP (op0, 0);
10395
10396 if (GET_CODE (op2) == NEG)
10397 op2 = XEXP (op2, 0);
10398
10399 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10400 and the by-element operand as operand 0. */
10401 if (GET_CODE (op1) == NEG)
10402 op1 = XEXP (op1, 0);
10403
10404 /* Catch vector-by-element operations. The by-element operand can
10405 either be (vec_duplicate (vec_select (x))) or just
10406 (vec_select (x)), depending on whether we are multiplying by
10407 a vector or a scalar.
10408
10409 Canonicalization is not very good in these cases: FMA4 will put the
10410 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
10411 if (GET_CODE (op0) == VEC_DUPLICATE)
10412 op0 = XEXP (op0, 0);
10413 else if (GET_CODE (op1) == VEC_DUPLICATE)
10414 op1 = XEXP (op1, 0);
10415
10416 if (GET_CODE (op0) == VEC_SELECT)
10417 op0 = XEXP (op0, 0);
10418 else if (GET_CODE (op1) == VEC_SELECT)
10419 op1 = XEXP (op1, 0);
10420
10421 /* If the remaining parameters are not registers,
10422 get the cost to put them into registers. */
10423 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10424 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10425 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10426 return true;
10427
10428 case FLOAT:
10429 case UNSIGNED_FLOAT:
10430 if (speed)
10431 *cost += extra_cost->fp[mode == DFmode].fromint;
10432 return false;
10433
10434 case FLOAT_EXTEND:
10435 if (speed)
10436 {
10437 if (VECTOR_MODE_P (mode))
10438 {
10439 /* Vector widen. */
10440 *cost += extra_cost->vect.alu;
10441 }
10442 else
10443 *cost += extra_cost->fp[mode == DFmode].widen;
10444 }
10445 return false;
10446
10447 case FLOAT_TRUNCATE:
10448 if (speed)
10449 {
10450 if (VECTOR_MODE_P (mode))
10451 {
10452 /* Vector narrow. */
10453 *cost += extra_cost->vect.alu;
10454 }
10455 else
10456 *cost += extra_cost->fp[mode == DFmode].narrow;
10457 }
10458 return false;
10459
10460 case FIX:
10461 case UNSIGNED_FIX:
10462 x = XEXP (x, 0);
10463 /* Strip the rounding part. They will all be implemented
10464 by the fcvt* family of instructions anyway. */
10465 if (GET_CODE (x) == UNSPEC)
10466 {
10467 unsigned int uns_code = XINT (x, 1);
10468
10469 if (uns_code == UNSPEC_FRINTA
10470 || uns_code == UNSPEC_FRINTM
10471 || uns_code == UNSPEC_FRINTN
10472 || uns_code == UNSPEC_FRINTP
10473 || uns_code == UNSPEC_FRINTZ)
10474 x = XVECEXP (x, 0, 0);
10475 }
10476
10477 if (speed)
10478 {
10479 if (VECTOR_MODE_P (mode))
10480 *cost += extra_cost->vect.alu;
10481 else
10482 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10483 }
10484
10485 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10486 fixed-point fcvt. */
10487 if (GET_CODE (x) == MULT
10488 && ((VECTOR_MODE_P (mode)
10489 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10490 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10491 {
10492 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10493 0, speed);
10494 return true;
10495 }
10496
10497 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10498 return true;
10499
10500 case ABS:
10501 if (VECTOR_MODE_P (mode))
10502 {
10503 /* ABS (vector). */
10504 if (speed)
10505 *cost += extra_cost->vect.alu;
10506 }
10507 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10508 {
10509 op0 = XEXP (x, 0);
10510
10511 /* FABD, which is analogous to FADD. */
10512 if (GET_CODE (op0) == MINUS)
10513 {
10514 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10515 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10516 if (speed)
10517 *cost += extra_cost->fp[mode == DFmode].addsub;
10518
10519 return true;
10520 }
10521 /* Simple FABS is analogous to FNEG. */
10522 if (speed)
10523 *cost += extra_cost->fp[mode == DFmode].neg;
10524 }
10525 else
10526 {
10527 /* Integer ABS will either be split into
10528 two arithmetic instructions, or will be an ABS
10529 (scalar), which we don't model. */
10530 *cost = COSTS_N_INSNS (2);
10531 if (speed)
10532 *cost += 2 * extra_cost->alu.arith;
10533 }
10534 return false;
10535
10536 case SMAX:
10537 case SMIN:
10538 if (speed)
10539 {
10540 if (VECTOR_MODE_P (mode))
10541 *cost += extra_cost->vect.alu;
10542 else
10543 {
10544 /* FMAXNM/FMINNM/FMAX/FMIN.
10545 TODO: This may not be accurate for all implementations, but
10546 we do not model this in the cost tables. */
10547 *cost += extra_cost->fp[mode == DFmode].addsub;
10548 }
10549 }
10550 return false;
10551
10552 case UNSPEC:
10553 /* The floating point round to integer frint* instructions. */
10554 if (aarch64_frint_unspec_p (XINT (x, 1)))
10555 {
10556 if (speed)
10557 *cost += extra_cost->fp[mode == DFmode].roundint;
10558
10559 return false;
10560 }
10561
10562 if (XINT (x, 1) == UNSPEC_RBIT)
10563 {
10564 if (speed)
10565 *cost += extra_cost->alu.rev;
10566
10567 return false;
10568 }
10569 break;
10570
10571 case TRUNCATE:
10572
10573 /* Decompose <su>muldi3_highpart. */
10574 if (/* (truncate:DI */
10575 mode == DImode
10576 /* (lshiftrt:TI */
10577 && GET_MODE (XEXP (x, 0)) == TImode
10578 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10579 /* (mult:TI */
10580 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10581 /* (ANY_EXTEND:TI (reg:DI))
10582 (ANY_EXTEND:TI (reg:DI))) */
10583 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10584 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10585 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10586 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10587 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10588 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10589 /* (const_int 64) */
10590 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10591 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10592 {
10593 /* UMULH/SMULH. */
10594 if (speed)
10595 *cost += extra_cost->mult[mode == DImode].extend;
10596 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10597 mode, MULT, 0, speed);
10598 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10599 mode, MULT, 1, speed);
10600 return true;
10601 }
10602
10603 /* Fall through. */
10604 default:
10605 break;
10606 }
10607
10608 if (dump_file
10609 && flag_aarch64_verbose_cost)
10610 fprintf (dump_file,
10611 "\nFailed to cost RTX. Assuming default cost.\n");
10612
10613 return true;
10614 }
10615
10616 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10617 calculated for X. This cost is stored in *COST. Returns true
10618 if the total cost of X was calculated. */
10619 static bool
10620 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10621 int param, int *cost, bool speed)
10622 {
10623 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10624
10625 if (dump_file
10626 && flag_aarch64_verbose_cost)
10627 {
10628 print_rtl_single (dump_file, x);
10629 fprintf (dump_file, "\n%s cost: %d (%s)\n",
10630 speed ? "Hot" : "Cold",
10631 *cost, result ? "final" : "partial");
10632 }
10633
10634 return result;
10635 }
10636
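/* Implement TARGET_REGISTER_MOVE_COST: return the cost of moving a value of
   mode MODE from register class FROM_I to register class TO_I, based on the
   tuned register-move cost table. */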
10637 static int
10638 aarch64_register_move_cost (machine_mode mode,
10639 reg_class_t from_i, reg_class_t to_i)
10640 {
10641 enum reg_class from = (enum reg_class) from_i;
10642 enum reg_class to = (enum reg_class) to_i;
10643 const struct cpu_regmove_cost *regmove_cost
10644 = aarch64_tune_params.regmove_cost;
10645
10646 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10647 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10648 to = GENERAL_REGS;
10649
10650 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10651 from = GENERAL_REGS;
10652
10653 /* Moving between a GPR and the stack register costs the same as GP2GP. */
10654 if ((from == GENERAL_REGS && to == STACK_REG)
10655 || (to == GENERAL_REGS && from == STACK_REG))
10656 return regmove_cost->GP2GP;
10657
10658 /* To/from the stack register, we move via the GPRs. */
10659 if (to == STACK_REG || from == STACK_REG)
10660 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10661 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10662
10663 if (known_eq (GET_MODE_SIZE (mode), 16))
10664 {
10665 /* 128-bit operations on general registers require 2 instructions. */
10666 if (from == GENERAL_REGS && to == GENERAL_REGS)
10667 return regmove_cost->GP2GP * 2;
10668 else if (from == GENERAL_REGS)
10669 return regmove_cost->GP2FP * 2;
10670 else if (to == GENERAL_REGS)
10671 return regmove_cost->FP2GP * 2;
10672
10673 /* When AdvSIMD instructions are disabled it is not possible to move
10674 a 128-bit value directly between Q registers. This is handled in
10675 secondary reload. A general register is used as a scratch to move
10676 the upper DI value and the lower DI value is moved directly,
10677 hence the cost is the sum of three moves. */
10678 if (! TARGET_SIMD)
10679 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10680
10681 return regmove_cost->FP2FP;
10682 }
10683
10684 if (from == GENERAL_REGS && to == GENERAL_REGS)
10685 return regmove_cost->GP2GP;
10686 else if (from == GENERAL_REGS)
10687 return regmove_cost->GP2FP;
10688 else if (to == GENERAL_REGS)
10689 return regmove_cost->FP2GP;
10690
10691 return regmove_cost->FP2FP;
10692 }
10693
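/* Implement TARGET_MEMORY_MOVE_COST. The current tuning model uses a single
   memmov_cost value for all modes and register classes. */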
10694 static int
10695 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10696 reg_class_t rclass ATTRIBUTE_UNUSED,
10697 bool in ATTRIBUTE_UNUSED)
10698 {
10699 return aarch64_tune_params.memmov_cost;
10700 }
10701
10702 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10703 to optimize 1.0/sqrt. */
10704
10705 static bool
10706 use_rsqrt_p (machine_mode mode)
10707 {
10708 return (!flag_trapping_math
10709 && flag_unsafe_math_optimizations
10710 && ((aarch64_tune_params.approx_modes->recip_sqrt
10711 & AARCH64_APPROX_MODE (mode))
10712 || flag_mrecip_low_precision_sqrt));
10713 }
10714
10715 /* Function to decide when to use the approximate reciprocal square root
10716 builtin. */
10717
10718 static tree
10719 aarch64_builtin_reciprocal (tree fndecl)
10720 {
10721 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10722
10723 if (!use_rsqrt_p (mode))
10724 return NULL_TREE;
10725 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10726 }
10727
10728 /* Emit instruction sequence to compute either the approximate square root
10729 or its approximate reciprocal, depending on the flag RECP, and return
10730 whether the sequence was emitted or not. */
10731
10732 bool
10733 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10734 {
10735 machine_mode mode = GET_MODE (dst);
10736
10737 if (GET_MODE_INNER (mode) == HFmode)
10738 {
10739 gcc_assert (!recp);
10740 return false;
10741 }
10742
10743 if (!recp)
10744 {
10745 if (!(flag_mlow_precision_sqrt
10746 || (aarch64_tune_params.approx_modes->sqrt
10747 & AARCH64_APPROX_MODE (mode))))
10748 return false;
10749
10750 if (flag_finite_math_only
10751 || flag_trapping_math
10752 || !flag_unsafe_math_optimizations
10753 || optimize_function_for_size_p (cfun))
10754 return false;
10755 }
10756 else
10757 /* Caller assumes we cannot fail. */
10758 gcc_assert (use_rsqrt_p (mode));
10759
10760 machine_mode mmsk = mode_for_int_vector (mode).require ();
10761 rtx xmsk = gen_reg_rtx (mmsk);
10762 if (!recp)
10763 /* When calculating the approximate square root, compare the
10764 argument with 0.0 and create a mask. */
10765 emit_insn (gen_rtx_SET (xmsk,
10766 gen_rtx_NEG (mmsk,
10767 gen_rtx_EQ (mmsk, src,
10768 CONST0_RTX (mode)))));
10769
10770 /* Estimate the approximate reciprocal square root. */
10771 rtx xdst = gen_reg_rtx (mode);
10772 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10773
10774 /* Iterate over the series twice for SF and thrice for DF. */
10775 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10776
10777 /* Optionally iterate over the series once less for faster performance
10778 while sacrificing some accuracy. */
10779 if ((recp && flag_mrecip_low_precision_sqrt)
10780 || (!recp && flag_mlow_precision_sqrt))
10781 iterations--;
10782
10783 /* Iterate over the series to calculate the approximate reciprocal square
10784 root. */
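/* Each step computes xdst * FRSQRTS (src, xdst * xdst); FRSQRTS (a, b)
   evaluates (3 - a * b) / 2, so this is a Newton-Raphson refinement of
   1/sqrt (src). */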
10785 rtx x1 = gen_reg_rtx (mode);
10786 while (iterations--)
10787 {
10788 rtx x2 = gen_reg_rtx (mode);
10789 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10790
10791 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10792
10793 if (iterations > 0)
10794 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10795 }
10796
10797 if (!recp)
10798 {
10799 /* Qualify the approximate reciprocal square root when the argument is
10800 0.0 by squashing the intermediate result to 0.0. */
10801 rtx xtmp = gen_reg_rtx (mmsk);
10802 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10803 gen_rtx_SUBREG (mmsk, xdst, 0)));
10804 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10805
10806 /* Calculate the approximate square root. */
10807 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10808 }
10809
10810 /* Finalize the approximation. */
10811 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10812
10813 return true;
10814 }
10815
10816 /* Emit the instruction sequence to compute the approximation for the division
10817 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10818
10819 bool
10820 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10821 {
10822 machine_mode mode = GET_MODE (quo);
10823
10824 if (GET_MODE_INNER (mode) == HFmode)
10825 return false;
10826
10827 bool use_approx_division_p = (flag_mlow_precision_div
10828 || (aarch64_tune_params.approx_modes->division
10829 & AARCH64_APPROX_MODE (mode)));
10830
10831 if (!flag_finite_math_only
10832 || flag_trapping_math
10833 || !flag_unsafe_math_optimizations
10834 || optimize_function_for_size_p (cfun)
10835 || !use_approx_division_p)
10836 return false;
10837
10838 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10839 return false;
10840
10841 /* Estimate the approximate reciprocal. */
10842 rtx xrcp = gen_reg_rtx (mode);
10843 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
10844
10845 /* Iterate over the series twice for SF and thrice for DF. */
10846 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10847
10848 /* Optionally iterate over the series once less for faster performance,
10849 at the cost of some accuracy. */
10850 if (flag_mlow_precision_div)
10851 iterations--;
10852
10853 /* Iterate over the series to calculate the approximate reciprocal. */
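/* Each FRECPS step below is one Newton-Raphson refinement of the FRECPE
   estimate: with X approximating 1/DEN, FRECPS computes 2 - DEN * X and
   X * (2 - DEN * X) is the refined reciprocal.  As with the square root
   above, each step roughly doubles the number of accurate bits (an
   informal sketch, not a precision guarantee).  */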
10854 rtx xtmp = gen_reg_rtx (mode);
10855 while (iterations--)
10856 {
10857 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
10858
10859 if (iterations > 0)
10860 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10861 }
10862
10863 if (num != CONST1_RTX (mode))
10864 {
10865 /* As the approximate reciprocal of DEN is already calculated, only
10866 calculate the approximate division when NUM is not 1.0. */
10867 rtx xnum = force_reg (mode, num);
10868 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10869 }
10870
10871 /* Finalize the approximation. */
10872 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10873 return true;
10874 }
10875
10876 /* Return the number of instructions that can be issued per cycle. */
10877 static int
10878 aarch64_sched_issue_rate (void)
10879 {
10880 return aarch64_tune_params.issue_rate;
10881 }
10882
10883 static int
10884 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10885 {
10886 int issue_rate = aarch64_sched_issue_rate ();
10887
10888 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10889 }
10890
10891
10892 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10893 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10894 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10895
10896 static int
10897 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10898 int ready_index)
10899 {
10900 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10901 }
10902
10903
10904 /* Vectorizer cost model target hooks. */
10905
10906 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10907 static int
10908 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10909 tree vectype,
10910 int misalign ATTRIBUTE_UNUSED)
10911 {
10912 unsigned elements;
10913 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10914 bool fp = false;
10915
10916 if (vectype != NULL)
10917 fp = FLOAT_TYPE_P (vectype);
10918
10919 switch (type_of_cost)
10920 {
10921 case scalar_stmt:
10922 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10923
10924 case scalar_load:
10925 return costs->scalar_load_cost;
10926
10927 case scalar_store:
10928 return costs->scalar_store_cost;
10929
10930 case vector_stmt:
10931 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10932
10933 case vector_load:
10934 return costs->vec_align_load_cost;
10935
10936 case vector_store:
10937 return costs->vec_store_cost;
10938
10939 case vec_to_scalar:
10940 return costs->vec_to_scalar_cost;
10941
10942 case scalar_to_vec:
10943 return costs->scalar_to_vec_cost;
10944
10945 case unaligned_load:
10946 case vector_gather_load:
10947 return costs->vec_unalign_load_cost;
10948
10949 case unaligned_store:
10950 case vector_scatter_store:
10951 return costs->vec_unalign_store_cost;
10952
10953 case cond_branch_taken:
10954 return costs->cond_taken_branch_cost;
10955
10956 case cond_branch_not_taken:
10957 return costs->cond_not_taken_branch_cost;
10958
10959 case vec_perm:
10960 return costs->vec_permute_cost;
10961
10962 case vec_promote_demote:
10963 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10964
10965 case vec_construct:
10966 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10967 return elements / 2 + 1;
10968
10969 default:
10970 gcc_unreachable ();
10971 }
10972 }
10973
10974 /* Implement targetm.vectorize.add_stmt_cost. */
10975 static unsigned
10976 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10977 struct _stmt_vec_info *stmt_info, int misalign,
10978 enum vect_cost_model_location where)
10979 {
10980 unsigned *cost = (unsigned *) data;
10981 unsigned retval = 0;
10982
10983 if (flag_vect_cost_model)
10984 {
10985 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10986 int stmt_cost =
10987 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10988
10989 /* Statements in an inner loop relative to the loop being
10990 vectorized are weighted more heavily. The value here is
10991 arbitrary and could potentially be improved with analysis. */
10992 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10993 count *= 50; /* FIXME */
10994
10995 retval = (unsigned) (count * stmt_cost);
10996 cost[where] += retval;
10997 }
10998
10999 return retval;
11000 }
11001
11002 static void initialize_aarch64_code_model (struct gcc_options *);
11003
11004 /* Parse the TO_PARSE string and put the architecture struct that it
11005 selects into RES and the architectural features into ISA_FLAGS.
11006 Return an aarch64_parse_opt_result describing the parse result.
11007 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11008 When the TO_PARSE string contains an invalid extension,
11009 a copy of the string is created and stored to INVALID_EXTENSION. */
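/* For example (an illustrative value only), parsing "armv8.2-a+fp16"
   matches the "armv8.2-a" entry in all_architectures and then hands the
   "+fp16" suffix to aarch64_parse_extension.  */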
11010
11011 static enum aarch64_parse_opt_result
11012 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11013 unsigned long *isa_flags, std::string *invalid_extension)
11014 {
11015 const char *ext;
11016 const struct processor *arch;
11017 size_t len;
11018
11019 ext = strchr (to_parse, '+');
11020
11021 if (ext != NULL)
11022 len = ext - to_parse;
11023 else
11024 len = strlen (to_parse);
11025
11026 if (len == 0)
11027 return AARCH64_PARSE_MISSING_ARG;
11028
11029
11030 /* Loop through the list of supported ARCHes to find a match. */
11031 for (arch = all_architectures; arch->name != NULL; arch++)
11032 {
11033 if (strlen (arch->name) == len
11034 && strncmp (arch->name, to_parse, len) == 0)
11035 {
11036 unsigned long isa_temp = arch->flags;
11037
11038 if (ext != NULL)
11039 {
11040 /* TO_PARSE string contains at least one extension. */
11041 enum aarch64_parse_opt_result ext_res
11042 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11043
11044 if (ext_res != AARCH64_PARSE_OK)
11045 return ext_res;
11046 }
11047 /* Extension parsing was successful. Confirm the result
11048 arch and ISA flags. */
11049 *res = arch;
11050 *isa_flags = isa_temp;
11051 return AARCH64_PARSE_OK;
11052 }
11053 }
11054
11055 /* ARCH name not found in list. */
11056 return AARCH64_PARSE_INVALID_ARG;
11057 }
11058
11059 /* Parse the TO_PARSE string and put the result tuning in RES and the
11060 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11061 describing the parse result. If there is an error parsing, RES and
11062 ISA_FLAGS are left unchanged.
11063 When the TO_PARSE string contains an invalid extension,
11064 a copy of the string is created and stored to INVALID_EXTENSION. */
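/* For example (an illustrative value only), parsing "cortex-a57+crypto"
   matches the "cortex-a57" entry in all_cores and then hands the
   "+crypto" suffix to aarch64_parse_extension.  */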
11065
11066 static enum aarch64_parse_opt_result
11067 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11068 unsigned long *isa_flags, std::string *invalid_extension)
11069 {
11070 const char *ext;
11071 const struct processor *cpu;
11072 size_t len;
11073
11074 ext = strchr (to_parse, '+');
11075
11076 if (ext != NULL)
11077 len = ext - to_parse;
11078 else
11079 len = strlen (to_parse);
11080
11081 if (len == 0)
11082 return AARCH64_PARSE_MISSING_ARG;
11083
11084
11085 /* Loop through the list of supported CPUs to find a match. */
11086 for (cpu = all_cores; cpu->name != NULL; cpu++)
11087 {
11088 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11089 {
11090 unsigned long isa_temp = cpu->flags;
11091
11092
11093 if (ext != NULL)
11094 {
11095 /* TO_PARSE string contains at least one extension. */
11096 enum aarch64_parse_opt_result ext_res
11097 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11098
11099 if (ext_res != AARCH64_PARSE_OK)
11100 return ext_res;
11101 }
11102 /* Extension parsing was successful. Confirm the result
11103 cpu and ISA flags. */
11104 *res = cpu;
11105 *isa_flags = isa_temp;
11106 return AARCH64_PARSE_OK;
11107 }
11108 }
11109
11110 /* CPU name not found in list. */
11111 return AARCH64_PARSE_INVALID_ARG;
11112 }
11113
11114 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11115 Return an aarch64_parse_opt_result describing the parse result.
11116 If the parsing fails, RES does not change. */
11117
11118 static enum aarch64_parse_opt_result
11119 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11120 {
11121 const struct processor *cpu;
11122
11123 /* Loop through the list of supported CPUs to find a match. */
11124 for (cpu = all_cores; cpu->name != NULL; cpu++)
11125 {
11126 if (strcmp (cpu->name, to_parse) == 0)
11127 {
11128 *res = cpu;
11129 return AARCH64_PARSE_OK;
11130 }
11131 }
11132
11133 /* CPU name not found in list. */
11134 return AARCH64_PARSE_INVALID_ARG;
11135 }
11136
11137 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11138 described in FLAG. If it is, return the index bit for that fusion type.
11139 If not, report an error (printing OPTION_NAME) and return zero. */
11140
11141 static unsigned int
11142 aarch64_parse_one_option_token (const char *token,
11143 size_t length,
11144 const struct aarch64_flag_desc *flag,
11145 const char *option_name)
11146 {
11147 for (; flag->name != NULL; flag++)
11148 {
11149 if (length == strlen (flag->name)
11150 && !strncmp (flag->name, token, length))
11151 return flag->flag;
11152 }
11153
11154 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11155 return 0;
11156 }
11157
11158 /* Parse OPTION which is a comma-separated list of flags to enable.
11159 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11160 default state we inherit from the CPU tuning structures. OPTION_NAME
11161 gives the top-level option we are parsing in the -moverride string,
11162 for use in error messages. */
11163
11164 static unsigned int
11165 aarch64_parse_boolean_options (const char *option,
11166 const struct aarch64_flag_desc *flags,
11167 unsigned int initial_state,
11168 const char *option_name)
11169 {
11170 const char separator = '.';
11171 const char* specs = option;
11172 const char* ntoken = option;
11173 unsigned int found_flags = initial_state;
11174
11175 while ((ntoken = strchr (specs, separator)))
11176 {
11177 size_t token_length = ntoken - specs;
11178 unsigned token_ops = aarch64_parse_one_option_token (specs,
11179 token_length,
11180 flags,
11181 option_name);
11182 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11183 in the token stream, reset the supported operations. So:
11184
11185 adrp+add.cmp+branch.none.adrp+add
11186
11187 would have the result of turning on only adrp+add fusion. */
11188 if (!token_ops)
11189 found_flags = 0;
11190
11191 found_flags |= token_ops;
11192 specs = ++ntoken;
11193 }
11194
11195 /* The string ended with a trailing separator; report the malformed input. */
11196 if (!(*specs))
11197 {
11198 error ("%s string ill-formed\n", option_name);
11199 return 0;
11200 }
11201
11202 /* We still have one more token to parse. */
11203 size_t token_length = strlen (specs);
11204 unsigned token_ops = aarch64_parse_one_option_token (specs,
11205 token_length,
11206 flags,
11207 option_name);
11208 if (!token_ops)
11209 found_flags = 0;
11210
11211 found_flags |= token_ops;
11212 return found_flags;
11213 }
11214
11215 /* Support for overriding instruction fusion. */
11216
11217 static void
11218 aarch64_parse_fuse_string (const char *fuse_string,
11219 struct tune_params *tune)
11220 {
11221 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11222 aarch64_fusible_pairs,
11223 tune->fusible_ops,
11224 "fuse=");
11225 }
11226
11227 /* Support for overriding other tuning flags. */
11228
11229 static void
11230 aarch64_parse_tune_string (const char *tune_string,
11231 struct tune_params *tune)
11232 {
11233 tune->extra_tuning_flags
11234 = aarch64_parse_boolean_options (tune_string,
11235 aarch64_tuning_flags,
11236 tune->extra_tuning_flags,
11237 "tune=");
11238 }
11239
11240 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11241 Accept the valid SVE vector widths allowed by
11242 aarch64_sve_vector_bits_enum and use it to override sve_width
11243 in TUNE. */
11244
11245 static void
11246 aarch64_parse_sve_width_string (const char *tune_string,
11247 struct tune_params *tune)
11248 {
11249 int width = -1;
11250
11251 int n = sscanf (tune_string, "%d", &width);
11252 if (n == EOF)
11253 {
11254 error ("invalid format for sve_width");
11255 return;
11256 }
11257 switch (width)
11258 {
11259 case SVE_128:
11260 case SVE_256:
11261 case SVE_512:
11262 case SVE_1024:
11263 case SVE_2048:
11264 break;
11265 default:
11266 error ("invalid sve_width value: %d", width);
11267 }
11268 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11269 }
11270
11271 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
11272 we understand. If it is, extract the option string and hand it off to
11273 the appropriate function. */
11274
11275 void
11276 aarch64_parse_one_override_token (const char* token,
11277 size_t length,
11278 struct tune_params *tune)
11279 {
11280 const struct aarch64_tuning_override_function *fn
11281 = aarch64_tuning_override_functions;
11282
11283 const char *option_part = strchr (token, '=');
11284 if (!option_part)
11285 {
11286 error ("tuning string missing in option (%s)", token);
11287 return;
11288 }
11289
11290 /* Get the length of the option name. */
11291 length = option_part - token;
11292 /* Skip the '=' to get to the option string. */
11293 option_part++;
11294
11295 for (; fn->name != NULL; fn++)
11296 {
11297 if (!strncmp (fn->name, token, length))
11298 {
11299 fn->parse_override (option_part, tune);
11300 return;
11301 }
11302 }
11303
11304 error ("unknown tuning option (%s)",token);
11305 return;
11306 }
11307
11308 /* Validate the TLS size and clamp it to what the code model in OPTS allows. */
11309
11310 static void
11311 initialize_aarch64_tls_size (struct gcc_options *opts)
11312 {
11313 if (aarch64_tls_size == 0)
11314 aarch64_tls_size = 24;
11315
11316 switch (opts->x_aarch64_cmodel_var)
11317 {
11318 case AARCH64_CMODEL_TINY:
11319 /* Both the default and maximum TLS size allowed under tiny is 1M, which
11320 needs two instructions to address, so we clamp the size to 24. */
11321 if (aarch64_tls_size > 24)
11322 aarch64_tls_size = 24;
11323 break;
11324 case AARCH64_CMODEL_SMALL:
11325 /* The maximum TLS size allowed under small is 4G. */
11326 if (aarch64_tls_size > 32)
11327 aarch64_tls_size = 32;
11328 break;
11329 case AARCH64_CMODEL_LARGE:
11330 /* The maximum TLS size allowed under large is 16E.
11331 FIXME: 16E would need a 64-bit offset; we only support 48-bit offsets now. */
11332 if (aarch64_tls_size > 48)
11333 aarch64_tls_size = 48;
11334 break;
11335 default:
11336 gcc_unreachable ();
11337 }
11338
11339 return;
11340 }
11341
11342 /* Parse STRING looking for options in the format:
11343 string :: option:string
11344 option :: name=substring
11345 name :: {a-z}
11346 substring :: defined by option. */
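/* For instance (an illustrative value only), the string
   "sve_width=256:fuse=adrp+add.cmp+branch" contains two colon-separated
   options; the names assume the "sve_width" and "fuse" override handlers
   and the fusion pair names defined elsewhere in this file.  */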
11347
11348 static void
11349 aarch64_parse_override_string (const char* input_string,
11350 struct tune_params* tune)
11351 {
11352 const char separator = ':';
11353 size_t string_length = strlen (input_string) + 1;
11354 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11355 char *string = string_root;
11356 strncpy (string, input_string, string_length);
11357 string[string_length - 1] = '\0';
11358
11359 char* ntoken = string;
11360
11361 while ((ntoken = strchr (string, separator)))
11362 {
11363 size_t token_length = ntoken - string;
11364 /* Make this substring look like a string. */
11365 *ntoken = '\0';
11366 aarch64_parse_one_override_token (string, token_length, tune);
11367 string = ++ntoken;
11368 }
11369
11370 /* One last option to parse. */
11371 aarch64_parse_one_override_token (string, strlen (string), tune);
11372 free (string_root);
11373 }
11374
11375
11376 static void
11377 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11378 {
11379 if (accepted_branch_protection_string)
11380 {
11381 opts->x_aarch64_branch_protection_string
11382 = xstrdup (accepted_branch_protection_string);
11383 }
11384
11385 /* PR 70044: We have to be careful about being called multiple times for the
11386 same function. This means all changes should be repeatable. */
11387
11388 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11389 Disable the frame pointer flag so the mid-end will not use a frame
11390 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11391 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11392 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11393 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11394 if (opts->x_flag_omit_frame_pointer == 0)
11395 opts->x_flag_omit_frame_pointer = 2;
11396
11397 /* If not optimizing for size, set the default
11398 alignment to what the target wants. */
11399 if (!opts->x_optimize_size)
11400 {
11401 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11402 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11403 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11404 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11405 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11406 opts->x_str_align_functions = aarch64_tune_params.function_align;
11407 }
11408
11409 /* We default to no pc-relative literal loads. */
11410
11411 aarch64_pcrelative_literal_loads = false;
11412
11413 /* If -mpc-relative-literal-loads is set on the command line, this
11414 implies that the user asked for PC relative literal loads. */
11415 if (opts->x_pcrelative_literal_loads == 1)
11416 aarch64_pcrelative_literal_loads = true;
11417
11418 /* In the tiny memory model it makes no sense to disallow PC relative
11419 literal pool loads. */
11420 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11421 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11422 aarch64_pcrelative_literal_loads = true;
11423
11424 /* When enabling the lower precision Newton series for the square root, also
11425 enable it for the reciprocal square root, since the latter is an
11426 intermediate step for the former. */
11427 if (flag_mlow_precision_sqrt)
11428 flag_mrecip_low_precision_sqrt = true;
11429 }
11430
11431 /* 'Unpack' the internal tuning structs and update the options
11432 in OPTS. The caller must have set up selected_tune and selected_arch
11433 as all the other target-specific codegen decisions are
11434 derived from them. */
11435
11436 void
11437 aarch64_override_options_internal (struct gcc_options *opts)
11438 {
11439 aarch64_tune_flags = selected_tune->flags;
11440 aarch64_tune = selected_tune->sched_core;
11441 /* Make a copy of the tuning parameters attached to the core, which
11442 we may later overwrite. */
11443 aarch64_tune_params = *(selected_tune->tune);
11444 aarch64_architecture_version = selected_arch->architecture_version;
11445
11446 if (opts->x_aarch64_override_tune_string)
11447 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11448 &aarch64_tune_params);
11449
11450 /* This target defaults to strict volatile bitfields. */
11451 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11452 opts->x_flag_strict_volatile_bitfields = 1;
11453
11454 if (aarch64_stack_protector_guard == SSP_GLOBAL
11455 && opts->x_aarch64_stack_protector_guard_offset_str)
11456 {
11457 error ("incompatible options %<-mstack-protector-guard=global%> and"
11458 "%<-mstack-protector-guard-offset=%qs%>",
11459 aarch64_stack_protector_guard_offset_str);
11460 }
11461
11462 if (aarch64_stack_protector_guard == SSP_SYSREG
11463 && !(opts->x_aarch64_stack_protector_guard_offset_str
11464 && opts->x_aarch64_stack_protector_guard_reg_str))
11465 {
11466 error ("both %<-mstack-protector-guard-offset%> and "
11467 "%<-mstack-protector-guard-reg%> must be used "
11468 "with %<-mstack-protector-guard=sysreg%>");
11469 }
11470
11471 if (opts->x_aarch64_stack_protector_guard_reg_str)
11472 {
11473 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11474 error ("specify a system register with a small string length.");
11475 }
11476
11477 if (opts->x_aarch64_stack_protector_guard_offset_str)
11478 {
11479 char *end;
11480 const char *str = aarch64_stack_protector_guard_offset_str;
11481 errno = 0;
11482 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11483 if (!*str || *end || errno)
11484 error ("%qs is not a valid offset in %qs", str,
11485 "%<-mstack-protector-guard-offset=%>");
11486 aarch64_stack_protector_guard_offset = offs;
11487 }
11488
11489 initialize_aarch64_code_model (opts);
11490 initialize_aarch64_tls_size (opts);
11491
11492 int queue_depth = 0;
11493 switch (aarch64_tune_params.autoprefetcher_model)
11494 {
11495 case tune_params::AUTOPREFETCHER_OFF:
11496 queue_depth = -1;
11497 break;
11498 case tune_params::AUTOPREFETCHER_WEAK:
11499 queue_depth = 0;
11500 break;
11501 case tune_params::AUTOPREFETCHER_STRONG:
11502 queue_depth = max_insn_queue_index + 1;
11503 break;
11504 default:
11505 gcc_unreachable ();
11506 }
11507
11508 /* We don't mind passing in global_options_set here as we don't use
11509 the *options_set structs anyway. */
11510 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11511 queue_depth,
11512 opts->x_param_values,
11513 global_options_set.x_param_values);
11514
11515 /* Set up parameters to be used in prefetching algorithm. Do not
11516 override the defaults unless we are tuning for a core we have
11517 researched values for. */
11518 if (aarch64_tune_params.prefetch->num_slots > 0)
11519 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11520 aarch64_tune_params.prefetch->num_slots,
11521 opts->x_param_values,
11522 global_options_set.x_param_values);
11523 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11524 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11525 aarch64_tune_params.prefetch->l1_cache_size,
11526 opts->x_param_values,
11527 global_options_set.x_param_values);
11528 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11529 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11530 aarch64_tune_params.prefetch->l1_cache_line_size,
11531 opts->x_param_values,
11532 global_options_set.x_param_values);
11533 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11534 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11535 aarch64_tune_params.prefetch->l2_cache_size,
11536 opts->x_param_values,
11537 global_options_set.x_param_values);
11538 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11539 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11540 0,
11541 opts->x_param_values,
11542 global_options_set.x_param_values);
11543 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11544 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11545 aarch64_tune_params.prefetch->minimum_stride,
11546 opts->x_param_values,
11547 global_options_set.x_param_values);
11548
11549 /* Use the alternative scheduling-pressure algorithm by default. */
11550 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11551 opts->x_param_values,
11552 global_options_set.x_param_values);
11553
11554 /* If the user hasn't changed it via configure then set the default to 64 KB
11555 for the backend. */
11556 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11557 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11558 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11559 opts->x_param_values,
11560 global_options_set.x_param_values);
11561
11562 /* Validate the guard size. */
11563 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11564
11565 /* Enforce that the probing interval equals the guard size so the mid-end
11566 does the right thing. */
11567 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11568 guard_size,
11569 opts->x_param_values,
11570 global_options_set.x_param_values);
11571
11572 /* The maybe_set calls won't update the value if the user has explicitly set
11573 one. Which means we need to validate that probing interval and guard size
11574 are equal. */
11575 int probe_interval
11576 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11577 if (guard_size != probe_interval)
11578 error ("stack clash guard size '%d' must be equal to probing interval "
11579 "'%d'", guard_size, probe_interval);
11580
11581 /* Enable sw prefetching at specified optimization level for
11582 CPUS that have prefetch. Lower optimization level threshold by 1
11583 when profiling is enabled. */
11584 if (opts->x_flag_prefetch_loop_arrays < 0
11585 && !opts->x_optimize_size
11586 && aarch64_tune_params.prefetch->default_opt_level >= 0
11587 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11588 opts->x_flag_prefetch_loop_arrays = 1;
11589
11590 if (opts->x_aarch64_arch_string == NULL)
11591 opts->x_aarch64_arch_string = selected_arch->name;
11592 if (opts->x_aarch64_cpu_string == NULL)
11593 opts->x_aarch64_cpu_string = selected_cpu->name;
11594 if (opts->x_aarch64_tune_string == NULL)
11595 opts->x_aarch64_tune_string = selected_tune->name;
11596
11597 aarch64_override_options_after_change_1 (opts);
11598 }
11599
11600 /* Print a hint with a suggestion for a core or architecture name that
11601 most closely resembles what the user passed in STR. ARCH is true if
11602 the user is asking for an architecture name. ARCH is false if the user
11603 is asking for a core name. */
11604
11605 static void
11606 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11607 {
11608 auto_vec<const char *> candidates;
11609 const struct processor *entry = arch ? all_architectures : all_cores;
11610 for (; entry->name != NULL; entry++)
11611 candidates.safe_push (entry->name);
11612
11613 #ifdef HAVE_LOCAL_CPU_DETECT
11614 /* Add also "native" as possible value. */
11615 if (arch)
11616 candidates.safe_push ("native");
11617 #endif
11618
11619 char *s;
11620 const char *hint = candidates_list_and_hint (str, s, candidates);
11621 if (hint)
11622 inform (input_location, "valid arguments are: %s;"
11623 " did you mean %qs?", s, hint);
11624 else
11625 inform (input_location, "valid arguments are: %s", s);
11626
11627 XDELETEVEC (s);
11628 }
11629
11630 /* Print a hint with a suggestion for a core name that most closely resembles
11631 what the user passed in STR. */
11632
11633 inline static void
11634 aarch64_print_hint_for_core (const char *str)
11635 {
11636 aarch64_print_hint_for_core_or_arch (str, false);
11637 }
11638
11639 /* Print a hint with a suggestion for an architecture name that most closely
11640 resembles what the user passed in STR. */
11641
11642 inline static void
11643 aarch64_print_hint_for_arch (const char *str)
11644 {
11645 aarch64_print_hint_for_core_or_arch (str, true);
11646 }
11647
11648
11649 /* Print a hint with a suggestion for an extension name
11650 that most closely resembles what the user passed in STR. */
11651
11652 void
11653 aarch64_print_hint_for_extensions (const std::string &str)
11654 {
11655 auto_vec<const char *> candidates;
11656 aarch64_get_all_extension_candidates (&candidates);
11657 char *s;
11658 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11659 if (hint)
11660 inform (input_location, "valid arguments are: %s;"
11661 " did you mean %qs?", s, hint);
11662 else
11663 inform (input_location, "valid arguments are: %s", s);
11664
11665 XDELETEVEC (s);
11666 }
11667
11668 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11669 specified in STR and throw errors if appropriate. Put the results if
11670 they are valid in RES and ISA_FLAGS. Return whether the option is
11671 valid. */
11672
11673 static bool
11674 aarch64_validate_mcpu (const char *str, const struct processor **res,
11675 unsigned long *isa_flags)
11676 {
11677 std::string invalid_extension;
11678 enum aarch64_parse_opt_result parse_res
11679 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11680
11681 if (parse_res == AARCH64_PARSE_OK)
11682 return true;
11683
11684 switch (parse_res)
11685 {
11686 case AARCH64_PARSE_MISSING_ARG:
11687 error ("missing cpu name in %<-mcpu=%s%>", str);
11688 break;
11689 case AARCH64_PARSE_INVALID_ARG:
11690 error ("unknown value %qs for %<-mcpu%>", str);
11691 aarch64_print_hint_for_core (str);
11692 break;
11693 case AARCH64_PARSE_INVALID_FEATURE:
11694 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11695 invalid_extension.c_str (), str);
11696 aarch64_print_hint_for_extensions (invalid_extension);
11697 break;
11698 default:
11699 gcc_unreachable ();
11700 }
11701
11702 return false;
11703 }
11704
11705 /* Parse CONST_STR for branch protection features specified in
11706 aarch64_branch_protect_types, and set any global variables required. Return
11707 the parsing result and assign LAST_STR to the last processed token from
11708 CONST_STR so that it can be used for error reporting. */
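/* For example (an illustrative value only), a string such as
   "pac-ret+leaf+bti" is split on '+' into the type "pac-ret", its subtype
   "leaf" and the further type "bti"; the exact set of accepted names is
   defined by aarch64_branch_protect_types.  */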
11709
11710 static enum aarch64_parse_opt_result
11711 aarch64_parse_branch_protection (const char *const_str,
11712 char **last_str)
11713 {
11714 char *str_root = xstrdup (const_str);
11715 char* token_save = NULL;
11716 char *str = strtok_r (str_root, "+", &token_save);
11717 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11718 if (!str)
11719 res = AARCH64_PARSE_MISSING_ARG;
11720 else
11721 {
11722 char *next_str = strtok_r (NULL, "+", &token_save);
11723 /* Reset the branch protection features to their defaults. */
11724 aarch64_handle_no_branch_protection (NULL, NULL);
11725
11726 while (str && res == AARCH64_PARSE_OK)
11727 {
11728 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11729 bool found = false;
11730 /* Search for this type. */
11731 while (type && type->name && !found && res == AARCH64_PARSE_OK)
11732 {
11733 if (strcmp (str, type->name) == 0)
11734 {
11735 found = true;
11736 res = type->handler (str, next_str);
11737 str = next_str;
11738 next_str = strtok_r (NULL, "+", &token_save);
11739 }
11740 else
11741 type++;
11742 }
11743 if (found && res == AARCH64_PARSE_OK)
11744 {
11745 bool found_subtype = true;
11746 /* Loop through each token until we find one that isn't a
11747 subtype. */
11748 while (found_subtype)
11749 {
11750 found_subtype = false;
11751 const aarch64_branch_protect_type *subtype = type->subtypes;
11752 /* Search for the subtype. */
11753 while (str && subtype && subtype->name && !found_subtype
11754 && res == AARCH64_PARSE_OK)
11755 {
11756 if (strcmp (str, subtype->name) == 0)
11757 {
11758 found_subtype = true;
11759 res = subtype->handler (str, next_str);
11760 str = next_str;
11761 next_str = strtok_r (NULL, "+", &token_save);
11762 }
11763 else
11764 subtype++;
11765 }
11766 }
11767 }
11768 else if (!found)
11769 res = AARCH64_PARSE_INVALID_ARG;
11770 }
11771 }
11772 /* Copy the last processed token into the argument to pass it back.
11773 Used by option and attribute validation to print the offending token. */
11774 if (last_str)
11775 {
11776 if (str) strcpy (*last_str, str);
11777 else *last_str = NULL;
11778 }
11779 if (res == AARCH64_PARSE_OK)
11780 {
11781 /* If needed, alloc the accepted string then copy in const_str.
11782 Used by aarch64_override_options_after_change_1. */
11783 if (!accepted_branch_protection_string)
11784 accepted_branch_protection_string = (char *) xmalloc (
11785 BRANCH_PROTECT_STR_MAX
11786 + 1);
11787 strncpy (accepted_branch_protection_string, const_str,
11788 BRANCH_PROTECT_STR_MAX + 1);
11789 /* Forcibly null-terminate. */
11790 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11791 }
11792 return res;
11793 }
11794
11795 static bool
11796 aarch64_validate_mbranch_protection (const char *const_str)
11797 {
11798 char *str = (char *) xmalloc (strlen (const_str) + 1);
11799 enum aarch64_parse_opt_result res =
11800 aarch64_parse_branch_protection (const_str, &str);
11801 if (res == AARCH64_PARSE_INVALID_ARG)
11802 error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str);
11803 else if (res == AARCH64_PARSE_MISSING_ARG)
11804 error ("missing arg for %<-mbranch-protection=%>");
11805 free (str);
11806 return res == AARCH64_PARSE_OK;
11807 }
11808
11809 /* Validate a command-line -march option. Parse the arch and extensions
11810 (if any) specified in STR and throw errors if appropriate. Put the
11811 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11812 option is valid. */
11813
11814 static bool
11815 aarch64_validate_march (const char *str, const struct processor **res,
11816 unsigned long *isa_flags)
11817 {
11818 std::string invalid_extension;
11819 enum aarch64_parse_opt_result parse_res
11820 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11821
11822 if (parse_res == AARCH64_PARSE_OK)
11823 return true;
11824
11825 switch (parse_res)
11826 {
11827 case AARCH64_PARSE_MISSING_ARG:
11828 error ("missing arch name in %<-march=%s%>", str);
11829 break;
11830 case AARCH64_PARSE_INVALID_ARG:
11831 error ("unknown value %qs for %<-march%>", str);
11832 aarch64_print_hint_for_arch (str);
11833 break;
11834 case AARCH64_PARSE_INVALID_FEATURE:
11835 error ("invalid feature modifier %qs in %<-march=%s%>",
11836 invalid_extension.c_str (), str);
11837 aarch64_print_hint_for_extensions (invalid_extension);
11838 break;
11839 default:
11840 gcc_unreachable ();
11841 }
11842
11843 return false;
11844 }
11845
11846 /* Validate a command-line -mtune option. Parse the cpu
11847 specified in STR and throw errors if appropriate. Put the
11848 result, if it is valid, in RES. Return whether the option is
11849 valid. */
11850
11851 static bool
11852 aarch64_validate_mtune (const char *str, const struct processor **res)
11853 {
11854 enum aarch64_parse_opt_result parse_res
11855 = aarch64_parse_tune (str, res);
11856
11857 if (parse_res == AARCH64_PARSE_OK)
11858 return true;
11859
11860 switch (parse_res)
11861 {
11862 case AARCH64_PARSE_MISSING_ARG:
11863 error ("missing cpu name in %<-mtune=%s%>", str);
11864 break;
11865 case AARCH64_PARSE_INVALID_ARG:
11866 error ("unknown value %qs for %<-mtune%>", str);
11867 aarch64_print_hint_for_core (str);
11868 break;
11869 default:
11870 gcc_unreachable ();
11871 }
11872 return false;
11873 }
11874
11875 /* Return the CPU corresponding to the enum CPU.
11876 If it doesn't specify a cpu, return the default. */
11877
11878 static const struct processor *
11879 aarch64_get_tune_cpu (enum aarch64_processor cpu)
11880 {
11881 if (cpu != aarch64_none)
11882 return &all_cores[cpu];
11883
11884 /* The & 0x3f is to extract the bottom 6 bits that encode the
11885 default cpu as selected by the --with-cpu GCC configure option
11886 in config.gcc.
11887 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11888 flags mechanism should be reworked to make it more sane. */
11889 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11890 }
11891
11892 /* Return the architecture corresponding to the enum ARCH.
11893 If it doesn't specify a valid architecture, return the default. */
11894
11895 static const struct processor *
11896 aarch64_get_arch (enum aarch64_arch arch)
11897 {
11898 if (arch != aarch64_no_arch)
11899 return &all_architectures[arch];
11900
11901 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11902
11903 return &all_architectures[cpu->arch];
11904 }
11905
11906 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
11907
11908 static poly_uint16
11909 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
11910 {
11911 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11912 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11913 deciding which .md file patterns to use and when deciding whether
11914 something is a legitimate address or constant. */
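  /* For example, -msve-vector-bits=512 yields the constant VG 512/64 = 8,
     whereas SVE_SCALABLE and SVE_128 yield the runtime-variable value
     2 + 2x, where x counts the 128-bit blocks beyond the minimum vector
     length (an illustration of the conversion, not an exhaustive list).  */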
11915 if (value == SVE_SCALABLE || value == SVE_128)
11916 return poly_uint16 (2, 2);
11917 else
11918 return (int) value / 64;
11919 }
11920
11921 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
11922 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
11923 tuning structs. In particular it must set selected_tune and
11924 aarch64_isa_flags that define the available ISA features and tuning
11925 decisions. It must also set selected_arch as this will be used to
11926 output the .arch asm tags for each function. */
11927
11928 static void
11929 aarch64_override_options (void)
11930 {
11931 unsigned long cpu_isa = 0;
11932 unsigned long arch_isa = 0;
11933 aarch64_isa_flags = 0;
11934
11935 bool valid_cpu = true;
11936 bool valid_tune = true;
11937 bool valid_arch = true;
11938
11939 selected_cpu = NULL;
11940 selected_arch = NULL;
11941 selected_tune = NULL;
11942
11943 if (aarch64_branch_protection_string)
11944 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
11945
11946 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
11947 If either of -march or -mtune is given, they override their
11948 respective component of -mcpu. */
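  /* For example (illustrative only), "-mcpu=cortex-a72 -march=armv8.2-a"
     takes its ISA flags from the -march value while still tuning for
     cortex-a72, and the mismatch between the two architectures is
     diagnosed by the warning further down.  */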
11949 if (aarch64_cpu_string)
11950 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
11951 &cpu_isa);
11952
11953 if (aarch64_arch_string)
11954 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
11955 &arch_isa);
11956
11957 if (aarch64_tune_string)
11958 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
11959
11960 #ifdef SUBTARGET_OVERRIDE_OPTIONS
11961 SUBTARGET_OVERRIDE_OPTIONS;
11962 #endif
11963
11964 /* If the user did not specify a processor, choose the default
11965 one for them. This will be the CPU set during configuration using
11966 --with-cpu, otherwise it is "generic". */
11967 if (!selected_cpu)
11968 {
11969 if (selected_arch)
11970 {
11971 selected_cpu = &all_cores[selected_arch->ident];
11972 aarch64_isa_flags = arch_isa;
11973 explicit_arch = selected_arch->arch;
11974 }
11975 else
11976 {
11977 /* Get default configure-time CPU. */
11978 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
11979 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
11980 }
11981
11982 if (selected_tune)
11983 explicit_tune_core = selected_tune->ident;
11984 }
11985 /* If both -mcpu and -march are specified check that they are architecturally
11986 compatible, warn if they're not and prefer the -march ISA flags. */
11987 else if (selected_arch)
11988 {
11989 if (selected_arch->arch != selected_cpu->arch)
11990 {
11991 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
11992 all_architectures[selected_cpu->arch].name,
11993 selected_arch->name);
11994 }
11995 aarch64_isa_flags = arch_isa;
11996 explicit_arch = selected_arch->arch;
11997 explicit_tune_core = selected_tune ? selected_tune->ident
11998 : selected_cpu->ident;
11999 }
12000 else
12001 {
12002 /* -mcpu but no -march. */
12003 aarch64_isa_flags = cpu_isa;
12004 explicit_tune_core = selected_tune ? selected_tune->ident
12005 : selected_cpu->ident;
12006 gcc_assert (selected_cpu);
12007 selected_arch = &all_architectures[selected_cpu->arch];
12008 explicit_arch = selected_arch->arch;
12009 }
12010
12011 /* Set the arch as well, as we will need it when outputting
12012 the .arch directive in assembly. */
12013 if (!selected_arch)
12014 {
12015 gcc_assert (selected_cpu);
12016 selected_arch = &all_architectures[selected_cpu->arch];
12017 }
12018
12019 if (!selected_tune)
12020 selected_tune = selected_cpu;
12021
12022 if (aarch64_enable_bti == 2)
12023 {
12024 #ifdef TARGET_ENABLE_BTI
12025 aarch64_enable_bti = 1;
12026 #else
12027 aarch64_enable_bti = 0;
12028 #endif
12029 }
12030
12031 /* Return address signing is currently not supported for ILP32 targets. For
12032 LP64 targets use the configured option in the absence of a command-line
12033 option for -mbranch-protection. */
12034 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12035 {
12036 #ifdef TARGET_ENABLE_PAC_RET
12037 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12038 #else
12039 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12040 #endif
12041 }
12042
12043 #ifndef HAVE_AS_MABI_OPTION
12044 /* The compiler may have been configured with 2.23.* binutils, which does
12045 not have support for ILP32. */
12046 if (TARGET_ILP32)
12047 error ("assembler does not support %<-mabi=ilp32%>");
12048 #endif
12049
12050 /* Convert -msve-vector-bits to a VG count. */
12051 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12052
12053 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12054 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12055
12056 /* Make sure we properly set up the explicit options. */
12057 if ((aarch64_cpu_string && valid_cpu)
12058 || (aarch64_tune_string && valid_tune))
12059 gcc_assert (explicit_tune_core != aarch64_none);
12060
12061 if ((aarch64_cpu_string && valid_cpu)
12062 || (aarch64_arch_string && valid_arch))
12063 gcc_assert (explicit_arch != aarch64_no_arch);
12064
12065 /* The pass to insert speculation tracking runs before
12066 shrink-wrapping and the latter does not know how to update the
12067 tracking status. So disable it in this case. */
12068 if (aarch64_track_speculation)
12069 flag_shrink_wrap = 0;
12070
12071 aarch64_override_options_internal (&global_options);
12072
12073 /* Save these options as the default ones in case we push and pop them later
12074 while processing functions with potential target attributes. */
12075 target_option_default_node = target_option_current_node
12076 = build_target_option_node (&global_options);
12077 }
12078
12079 /* Implement targetm.override_options_after_change. */
12080
12081 static void
12082 aarch64_override_options_after_change (void)
12083 {
12084 aarch64_override_options_after_change_1 (&global_options);
12085 }
12086
12087 static struct machine_function *
12088 aarch64_init_machine_status (void)
12089 {
12090 struct machine_function *machine;
12091 machine = ggc_cleared_alloc<machine_function> ();
12092 return machine;
12093 }
12094
12095 void
12096 aarch64_init_expanders (void)
12097 {
12098 init_machine_status = aarch64_init_machine_status;
12099 }
12100
12101 /* Derive aarch64_cmodel from the code model and PIC options in OPTS. */
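/* For instance, with -fpic (flag_pic == 1) the small code model is mapped
   to AARCH64_CMODEL_SMALL_SPIC when the assembler supports the small PIC
   relocations, whereas -fPIC (flag_pic == 2) selects
   AARCH64_CMODEL_SMALL_PIC; this only illustrates the mapping below.  */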
12102 static void
12103 initialize_aarch64_code_model (struct gcc_options *opts)
12104 {
12105 if (opts->x_flag_pic)
12106 {
12107 switch (opts->x_aarch64_cmodel_var)
12108 {
12109 case AARCH64_CMODEL_TINY:
12110 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12111 break;
12112 case AARCH64_CMODEL_SMALL:
12113 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12114 aarch64_cmodel = (flag_pic == 2
12115 ? AARCH64_CMODEL_SMALL_PIC
12116 : AARCH64_CMODEL_SMALL_SPIC);
12117 #else
12118 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12119 #endif
12120 break;
12121 case AARCH64_CMODEL_LARGE:
12122 sorry ("code model %qs with %<-f%s%>", "large",
12123 opts->x_flag_pic > 1 ? "PIC" : "pic");
12124 break;
12125 default:
12126 gcc_unreachable ();
12127 }
12128 }
12129 else
12130 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12131 }
12132
12133 /* Implement TARGET_OPTION_SAVE. */
12134
12135 static void
12136 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12137 {
12138 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12139 ptr->x_aarch64_branch_protection_string
12140 = opts->x_aarch64_branch_protection_string;
12141 }
12142
12143 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12144 using the information saved in PTR. */
12145
12146 static void
12147 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12148 {
12149 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12150 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12151 opts->x_explicit_arch = ptr->x_explicit_arch;
12152 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12153 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12154 opts->x_aarch64_branch_protection_string
12155 = ptr->x_aarch64_branch_protection_string;
12156 if (opts->x_aarch64_branch_protection_string)
12157 {
12158 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12159 NULL);
12160 }
12161
12162 aarch64_override_options_internal (opts);
12163 }
12164
12165 /* Implement TARGET_OPTION_PRINT. */
12166
12167 static void
12168 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12169 {
12170 const struct processor *cpu
12171 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12172 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
12173 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12174 std::string extension
12175 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12176
12177 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12178 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12179 arch->name, extension.c_str ());
12180 }
12181
12182 static GTY(()) tree aarch64_previous_fndecl;
12183
12184 void
12185 aarch64_reset_previous_fndecl (void)
12186 {
12187 aarch64_previous_fndecl = NULL;
12188 }
12189
12190 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12191 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12192 make sure optab availability predicates are recomputed when necessary. */
12193
12194 void
12195 aarch64_save_restore_target_globals (tree new_tree)
12196 {
12197 if (TREE_TARGET_GLOBALS (new_tree))
12198 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12199 else if (new_tree == target_option_default_node)
12200 restore_target_globals (&default_target_globals);
12201 else
12202 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12203 }
12204
12205 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12206 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12207 of the function, if such exists. This function may be called multiple
12208 times on a single function so use aarch64_previous_fndecl to avoid
12209 setting up identical state. */
12210
12211 static void
12212 aarch64_set_current_function (tree fndecl)
12213 {
12214 if (!fndecl || fndecl == aarch64_previous_fndecl)
12215 return;
12216
12217 tree old_tree = (aarch64_previous_fndecl
12218 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12219 : NULL_TREE);
12220
12221 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12222
12223 /* If current function has no attributes but the previous one did,
12224 use the default node. */
12225 if (!new_tree && old_tree)
12226 new_tree = target_option_default_node;
12227
12228 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12229 the default have been handled by aarch64_save_restore_target_globals from
12230 aarch64_pragma_target_parse. */
12231 if (old_tree == new_tree)
12232 return;
12233
12234 aarch64_previous_fndecl = fndecl;
12235
12236 /* First set the target options. */
12237 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12238
12239 aarch64_save_restore_target_globals (new_tree);
12240 }
12241
12242 /* Enum describing the various ways we can handle attributes.
12243 In many cases we can reuse the generic option handling machinery. */
12244
12245 enum aarch64_attr_opt_type
12246 {
12247 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12248 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12249 aarch64_attr_enum, /* Attribute sets an enum variable. */
12250 aarch64_attr_custom /* Attribute requires a custom handling function. */
12251 };
12252
12253 /* All the information needed to handle a target attribute.
12254 NAME is the name of the attribute.
12255 ATTR_TYPE specifies the type of behavior of the attribute as described
12256 in the definition of enum aarch64_attr_opt_type.
12257 ALLOW_NEG is true if the attribute supports a "no-" form.
12258 HANDLER is the function that takes the attribute string as an argument.
12259 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12260 OPT_NUM is the enum specifying the option that the attribute modifies.
12261 This is needed for attributes that mirror the behavior of a command-line
12262 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12263 aarch64_attr_enum. */
12264
12265 struct aarch64_attribute_info
12266 {
12267 const char *name;
12268 enum aarch64_attr_opt_type attr_type;
12269 bool allow_neg;
12270 bool (*handler) (const char *);
12271 enum opt_code opt_num;
12272 };
12273
12274 /* Handle the argument STR to the arch= target attribute. */
12275
12276 static bool
12277 aarch64_handle_attr_arch (const char *str)
12278 {
12279 const struct processor *tmp_arch = NULL;
12280 std::string invalid_extension;
12281 enum aarch64_parse_opt_result parse_res
12282 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12283
12284 if (parse_res == AARCH64_PARSE_OK)
12285 {
12286 gcc_assert (tmp_arch);
12287 selected_arch = tmp_arch;
12288 explicit_arch = selected_arch->arch;
12289 return true;
12290 }
12291
12292 switch (parse_res)
12293 {
12294 case AARCH64_PARSE_MISSING_ARG:
12295 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12296 break;
12297 case AARCH64_PARSE_INVALID_ARG:
12298 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12299 aarch64_print_hint_for_arch (str);
12300 break;
12301 case AARCH64_PARSE_INVALID_FEATURE:
12302 error ("invalid feature modifier %s of value (\"%s\") in "
12303 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12304 aarch64_print_hint_for_extensions (invalid_extension);
12305 break;
12306 default:
12307 gcc_unreachable ();
12308 }
12309
12310 return false;
12311 }
12312
12313 /* Handle the argument STR to the cpu= target attribute. */
12314
12315 static bool
12316 aarch64_handle_attr_cpu (const char *str)
12317 {
12318 const struct processor *tmp_cpu = NULL;
12319 std::string invalid_extension;
12320 enum aarch64_parse_opt_result parse_res
12321 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12322
12323 if (parse_res == AARCH64_PARSE_OK)
12324 {
12325 gcc_assert (tmp_cpu);
12326 selected_tune = tmp_cpu;
12327 explicit_tune_core = selected_tune->ident;
12328
12329 selected_arch = &all_architectures[tmp_cpu->arch];
12330 explicit_arch = selected_arch->arch;
12331 return true;
12332 }
12333
12334 switch (parse_res)
12335 {
12336 case AARCH64_PARSE_MISSING_ARG:
12337 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12338 break;
12339 case AARCH64_PARSE_INVALID_ARG:
12340 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12341 aarch64_print_hint_for_core (str);
12342 break;
12343 case AARCH64_PARSE_INVALID_FEATURE:
12344 error ("invalid feature modifier %s of value (\"%s\") in "
12345 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12346 aarch64_print_hint_for_extensions (invalid_extension);
12347 break;
12348 default:
12349 gcc_unreachable ();
12350 }
12351
12352 return false;
12353 }
12354
12355 /* Handle the argument STR to the branch-protection= attribute. */
12356
12357 static bool
12358 aarch64_handle_attr_branch_protection (const char* str)
12359 {
12360 char *err_str = (char *) xmalloc (strlen (str) + 1);
12361 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12362 &err_str);
12363 bool success = false;
12364 switch (res)
12365 {
12366 case AARCH64_PARSE_MISSING_ARG:
12367 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12368 " attribute");
12369 break;
12370 case AARCH64_PARSE_INVALID_ARG:
12371 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12372 "=\")%> pragma or attribute", err_str);
12373 break;
12374 case AARCH64_PARSE_OK:
12375 success = true;
12376 /* Fall through. */
12377 case AARCH64_PARSE_INVALID_FEATURE:
12378 break;
12379 default:
12380 gcc_unreachable ();
12381 }
12382 free (err_str);
12383 return success;
12384 }
12385
12386 /* Handle the argument STR to the tune= target attribute. */
12387
12388 static bool
12389 aarch64_handle_attr_tune (const char *str)
12390 {
12391 const struct processor *tmp_tune = NULL;
12392 enum aarch64_parse_opt_result parse_res
12393 = aarch64_parse_tune (str, &tmp_tune);
12394
12395 if (parse_res == AARCH64_PARSE_OK)
12396 {
12397 gcc_assert (tmp_tune);
12398 selected_tune = tmp_tune;
12399 explicit_tune_core = selected_tune->ident;
12400 return true;
12401 }
12402
12403 switch (parse_res)
12404 {
12405 case AARCH64_PARSE_INVALID_ARG:
12406 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12407 aarch64_print_hint_for_core (str);
12408 break;
12409 default:
12410 gcc_unreachable ();
12411 }
12412
12413 return false;
12414 }
12415
12416 /* Parse an architecture extensions target attribute string specified in STR.
12417 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12418 if successful. Update aarch64_isa_flags to reflect the ISA features
12419 modified. */
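/* For example (illustrative only), a string such as "+nothing+fp" first
   clears all extension bits and then re-enables just the floating-point
   extension together with whatever aarch64_parse_extension implies
   for it.  */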
12420
12421 static bool
12422 aarch64_handle_attr_isa_flags (char *str)
12423 {
12424 enum aarch64_parse_opt_result parse_res;
12425 unsigned long isa_flags = aarch64_isa_flags;
12426
12427 /* We allow "+nothing" in the beginning to clear out all architectural
12428 features if the user wants to handpick specific features. */
12429 if (strncmp ("+nothing", str, 8) == 0)
12430 {
12431 isa_flags = 0;
12432 str += 8;
12433 }
12434
12435 std::string invalid_extension;
12436 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12437
12438 if (parse_res == AARCH64_PARSE_OK)
12439 {
12440 aarch64_isa_flags = isa_flags;
12441 return true;
12442 }
12443
12444 switch (parse_res)
12445 {
12446 case AARCH64_PARSE_MISSING_ARG:
12447 error ("missing value in %<target()%> pragma or attribute");
12448 break;
12449
12450 case AARCH64_PARSE_INVALID_FEATURE:
12451 error ("invalid feature modifier %s of value (\"%s\") in "
12452 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12453 break;
12454
12455 default:
12456 gcc_unreachable ();
12457 }
12458
12459 return false;
12460 }
12461
12462 /* The target attributes that we support. On top of these we also support just
12463 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12464 handled explicitly in aarch64_process_one_target_attr. */
12465
12466 static const struct aarch64_attribute_info aarch64_attributes[] =
12467 {
12468 { "general-regs-only", aarch64_attr_mask, false, NULL,
12469 OPT_mgeneral_regs_only },
12470 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12471 OPT_mfix_cortex_a53_835769 },
12472 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12473 OPT_mfix_cortex_a53_843419 },
12474 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12475 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12476 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12477 OPT_momit_leaf_frame_pointer },
12478 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12479 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12480 OPT_march_ },
12481 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12482 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12483 OPT_mtune_ },
12484 { "branch-protection", aarch64_attr_custom, false,
12485 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12486 { "sign-return-address", aarch64_attr_enum, false, NULL,
12487 OPT_msign_return_address_ },
12488 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12489 };
12490
12491 /* Parse ARG_STR which contains the definition of one target attribute.
12492 Show appropriate errors if any or return true if the attribute is valid. */
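/* For example, an ARG_STR of "no-strict-align" matches the "strict-align"
   entry (which allows the negated form) and clears the corresponding bit
   in target_flags, while "arch=armv8.2-a" dispatches to the custom handler
   aarch64_handle_attr_arch; these values are only illustrations.  */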
12493
12494 static bool
12495 aarch64_process_one_target_attr (char *arg_str)
12496 {
12497 bool invert = false;
12498
12499 size_t len = strlen (arg_str);
12500
12501 if (len == 0)
12502 {
12503 error ("malformed %<target()%> pragma or attribute");
12504 return false;
12505 }
12506
12507 char *str_to_check = (char *) alloca (len + 1);
12508 strcpy (str_to_check, arg_str);
12509
12510 /* Skip leading whitespace. */
12511 while (*str_to_check == ' ' || *str_to_check == '\t')
12512 str_to_check++;
12513
12514 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12515 It is easier to detect and handle it explicitly here rather than going
12516 through the machinery for the rest of the target attributes in this
12517 function. */
12518 if (*str_to_check == '+')
12519 return aarch64_handle_attr_isa_flags (str_to_check);
12520
12521 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12522 {
12523 invert = true;
12524 str_to_check += 3;
12525 }
12526 char *arg = strchr (str_to_check, '=');
12527
12528 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12529 and point ARG to "foo". */
12530 if (arg)
12531 {
12532 *arg = '\0';
12533 arg++;
12534 }
12535 const struct aarch64_attribute_info *p_attr;
12536 bool found = false;
12537 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12538 {
12539 /* If the names don't match up, or the user has given an argument
12540 to an attribute that doesn't accept one, or didn't give an argument
12541 to an attribute that expects one, fail to match. */
12542 if (strcmp (str_to_check, p_attr->name) != 0)
12543 continue;
12544
12545 found = true;
12546 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12547 || p_attr->attr_type == aarch64_attr_enum;
12548
12549 if (attr_need_arg_p ^ (arg != NULL))
12550 {
12551 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12552 return false;
12553 }
12554
12555 /* If the name matches but the attribute does not allow "no-" versions
12556 then we can't match. */
12557 if (invert && !p_attr->allow_neg)
12558 {
12559 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12560 return false;
12561 }
12562
12563 switch (p_attr->attr_type)
12564 {
12565 /* Has a custom handler registered.
12566 For example, cpu=, arch=, tune=. */
12567 case aarch64_attr_custom:
12568 gcc_assert (p_attr->handler);
12569 if (!p_attr->handler (arg))
12570 return false;
12571 break;
12572
12573 /* Either set or unset a boolean option. */
12574 case aarch64_attr_bool:
12575 {
12576 struct cl_decoded_option decoded;
12577
12578 generate_option (p_attr->opt_num, NULL, !invert,
12579 CL_TARGET, &decoded);
12580 aarch64_handle_option (&global_options, &global_options_set,
12581 &decoded, input_location);
12582 break;
12583 }
12584 /* Set or unset a bit in the target_flags. aarch64_handle_option
12585 should know what mask to apply given the option number. */
12586 case aarch64_attr_mask:
12587 {
12588 struct cl_decoded_option decoded;
12589 /* We only need to specify the option number.
12590 aarch64_handle_option will know which mask to apply. */
12591 decoded.opt_index = p_attr->opt_num;
12592 decoded.value = !invert;
12593 aarch64_handle_option (&global_options, &global_options_set,
12594 &decoded, input_location);
12595 break;
12596 }
12597 /* Use the option setting machinery to set an option to an enum. */
12598 case aarch64_attr_enum:
12599 {
12600 gcc_assert (arg);
12601 bool valid;
12602 int value;
12603 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12604 &value, CL_TARGET);
12605 if (valid)
12606 {
12607 set_option (&global_options, NULL, p_attr->opt_num, value,
12608 NULL, DK_UNSPECIFIED, input_location,
12609 global_dc);
12610 }
12611 else
12612 {
12613 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12614 }
12615 break;
12616 }
12617 default:
12618 gcc_unreachable ();
12619 }
12620 }
12621
12622 /* If we reached here we either have found an attribute and validated
12623 it or didn't match any. If we matched an attribute but its arguments
12624 were malformed we will have returned false already. */
12625 return found;
12626 }
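
/* A sketch of the control flow above for two representative inputs (the
   attribute names come from aarch64_attributes):

     "no-fix-cortex-a53-835769"  ->  INVERT is set by the leading "no-", the
                                     rest matches an aarch64_attr_bool entry
                                     that allows negation, and generate_option
                                     is called with value !invert == 0.
     "cpu=cortex-a57"            ->  ARG points at "cortex-a57" and is passed
                                     to the custom handler
                                     aarch64_handle_attr_cpu.  */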
12627
12628 /* Count how many times the character C appears in
12629 NULL-terminated string STR. */
12630
12631 static unsigned int
12632 num_occurences_in_str (char c, char *str)
12633 {
12634 unsigned int res = 0;
12635 while (*str != '\0')
12636 {
12637 if (*str == c)
12638 res++;
12639
12640 str++;
12641 }
12642
12643 return res;
12644 }
12645
12646 /* Parse the tree in ARGS that contains the target attribute information
12647 and update the global target options space. */
12648
12649 bool
12650 aarch64_process_target_attr (tree args)
12651 {
12652 if (TREE_CODE (args) == TREE_LIST)
12653 {
12654 do
12655 {
12656 tree head = TREE_VALUE (args);
12657 if (head)
12658 {
12659 if (!aarch64_process_target_attr (head))
12660 return false;
12661 }
12662 args = TREE_CHAIN (args);
12663 } while (args);
12664
12665 return true;
12666 }
12667
12668 if (TREE_CODE (args) != STRING_CST)
12669 {
12670 error ("attribute %<target%> argument not a string");
12671 return false;
12672 }
12673
12674 size_t len = strlen (TREE_STRING_POINTER (args));
12675 char *str_to_check = (char *) alloca (len + 1);
12676 strcpy (str_to_check, TREE_STRING_POINTER (args));
12677
12678 if (len == 0)
12679 {
12680 error ("malformed %<target()%> pragma or attribute");
12681 return false;
12682 }
12683
12684 /* Used to catch empty strings between commas, e.g.
12685 attribute ((target ("attr1,,attr2"))). */
12686 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12687
12688 /* Handle multiple target attributes separated by ','. */
12689 char *token = strtok_r (str_to_check, ",", &str_to_check);
12690
12691 unsigned int num_attrs = 0;
12692 while (token)
12693 {
12694 num_attrs++;
12695 if (!aarch64_process_one_target_attr (token))
12696 {
12697 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12698 return false;
12699 }
12700
12701 token = strtok_r (NULL, ",", &str_to_check);
12702 }
12703
12704 if (num_attrs != num_commas + 1)
12705 {
12706 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12707 return false;
12708 }
12709
12710 return true;
12711 }
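
/* For instance (an illustrative combination, not a recommendation):

     __attribute__ ((target ("tune=cortex-a72,no-omit-leaf-frame-pointer")))

   is split on ',' into two tokens, each processed by
   aarch64_process_one_target_attr, whereas "attr1,,attr2" yields only two
   tokens for two commas and so fails the num_attrs != num_commas + 1 check
   above.  */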
12712
12713 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12714 process attribute ((target ("..."))). */
12715
12716 static bool
12717 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12718 {
12719 struct cl_target_option cur_target;
12720 bool ret;
12721 tree old_optimize;
12722 tree new_target, new_optimize;
12723 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12724
12725 /* If what we're processing is the current pragma string then the
12726 target option node is already stored in target_option_current_node
12727 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12728 having to re-parse the string. This is especially useful to keep
12729 arm_neon.h compile times down since that header contains a lot
12730 of intrinsics enclosed in pragmas. */
12731 if (!existing_target && args == current_target_pragma)
12732 {
12733 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12734 return true;
12735 }
12736 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12737
12738 old_optimize = build_optimization_node (&global_options);
12739 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12740
12741 /* If the function changed the optimization levels as well as setting
12742 target options, start with the optimizations specified. */
12743 if (func_optimize && func_optimize != old_optimize)
12744 cl_optimization_restore (&global_options,
12745 TREE_OPTIMIZATION (func_optimize));
12746
12747 /* Save the current target options to restore at the end. */
12748 cl_target_option_save (&cur_target, &global_options);
12749
12750 /* If fndecl already has some target attributes applied to it, unpack
12751 them so that we add this attribute on top of them, rather than
12752 overwriting them. */
12753 if (existing_target)
12754 {
12755 struct cl_target_option *existing_options
12756 = TREE_TARGET_OPTION (existing_target);
12757
12758 if (existing_options)
12759 cl_target_option_restore (&global_options, existing_options);
12760 }
12761 else
12762 cl_target_option_restore (&global_options,
12763 TREE_TARGET_OPTION (target_option_current_node));
12764
12765 ret = aarch64_process_target_attr (args);
12766
12767 /* Set up any additional state. */
12768 if (ret)
12769 {
12770 aarch64_override_options_internal (&global_options);
12771 /* Initialize SIMD builtins if we haven't already.
12772 Set current_target_pragma to NULL for the duration so that
12773 the builtin initialization code doesn't try to tag the functions
12774 being built with the attributes specified by any current pragma, thus
12775 going into an infinite recursion. */
12776 if (TARGET_SIMD)
12777 {
12778 tree saved_current_target_pragma = current_target_pragma;
12779 current_target_pragma = NULL;
12780 aarch64_init_simd_builtins ();
12781 current_target_pragma = saved_current_target_pragma;
12782 }
12783 new_target = build_target_option_node (&global_options);
12784 }
12785 else
12786 new_target = NULL;
12787
12788 new_optimize = build_optimization_node (&global_options);
12789
12790 if (fndecl && ret)
12791 {
12792 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12793
12794 if (old_optimize != new_optimize)
12795 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12796 }
12797
12798 cl_target_option_restore (&global_options, &cur_target);
12799
12800 if (old_optimize != new_optimize)
12801 cl_optimization_restore (&global_options,
12802 TREE_OPTIMIZATION (old_optimize));
12803 return ret;
12804 }
12805
12806 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE
12807 hold the values of tri-bool options (yes, no, don't care) and the default
12808 value is DEF, determine whether to reject inlining. */
12809
12810 static bool
12811 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12812 int dont_care, int def)
12813 {
12814 /* If the callee doesn't care, always allow inlining. */
12815 if (callee == dont_care)
12816 return true;
12817
12818 /* If the caller doesn't care, always allow inlining. */
12819 if (caller == dont_care)
12820 return true;
12821
12822 /* Otherwise, allow inlining if either the callee and caller values
12823 agree, or if the callee is using the default value. */
12824 return (callee == caller || callee == def);
12825 }
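
/* A worked example of the rule above, using the encoding of the callers
   below (2 == "don't care"):

     caller  callee  def   result
       2       0      *    inline  (caller doesn't care)
       0       2      *    inline  (callee doesn't care)
       1       1      *    inline  (both agree)
       0       1      1    inline  (callee matches the default)
       0       1      0    reject  (explicit mismatch)  */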
12826
12827 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12828 to inline CALLEE into CALLER based on target-specific info.
12829 Make sure that the caller and callee have compatible architectural
12830 features. Then go through the other possible target attributes
12831 and see if they can block inlining. Try not to reject always_inline
12832 callees unless they are incompatible architecturally. */
12833
12834 static bool
12835 aarch64_can_inline_p (tree caller, tree callee)
12836 {
12837 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
12838 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
12839
12840 struct cl_target_option *caller_opts
12841 = TREE_TARGET_OPTION (caller_tree ? caller_tree
12842 : target_option_default_node);
12843
12844 struct cl_target_option *callee_opts
12845 = TREE_TARGET_OPTION (callee_tree ? callee_tree
12846 : target_option_default_node);
12847
12848 /* Callee's ISA flags should be a subset of the caller's. */
12849 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
12850 != callee_opts->x_aarch64_isa_flags)
12851 return false;
12852
12853 /* Allow non-strict aligned functions inlining into strict
12854 aligned ones. */
12855 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
12856 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
12857 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
12858 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
12859 return false;
12860
12861 bool always_inline = lookup_attribute ("always_inline",
12862 DECL_ATTRIBUTES (callee));
12863
12864 /* If the architectural features match up and the callee is always_inline
12865 then the other attributes don't matter. */
12866 if (always_inline)
12867 return true;
12868
12869 if (caller_opts->x_aarch64_cmodel_var
12870 != callee_opts->x_aarch64_cmodel_var)
12871 return false;
12872
12873 if (caller_opts->x_aarch64_tls_dialect
12874 != callee_opts->x_aarch64_tls_dialect)
12875 return false;
12876
12877 /* Honour explicit requests to workaround errata. */
12878 if (!aarch64_tribools_ok_for_inlining_p (
12879 caller_opts->x_aarch64_fix_a53_err835769,
12880 callee_opts->x_aarch64_fix_a53_err835769,
12881 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
12882 return false;
12883
12884 if (!aarch64_tribools_ok_for_inlining_p (
12885 caller_opts->x_aarch64_fix_a53_err843419,
12886 callee_opts->x_aarch64_fix_a53_err843419,
12887 2, TARGET_FIX_ERR_A53_843419))
12888 return false;
12889
12890 /* If the user explicitly specified -momit-leaf-frame-pointer for the
12891 caller and callee and they don't match up, reject inlining. */
12892 if (!aarch64_tribools_ok_for_inlining_p (
12893 caller_opts->x_flag_omit_leaf_frame_pointer,
12894 callee_opts->x_flag_omit_leaf_frame_pointer,
12895 2, 1))
12896 return false;
12897
12898 /* If the callee has specific tuning overrides, respect them. */
12899 if (callee_opts->x_aarch64_override_tune_string != NULL
12900 && caller_opts->x_aarch64_override_tune_string == NULL)
12901 return false;
12902
12903 /* If the user specified tuning override strings for the
12904 caller and callee and they don't match up, reject inlining.
12905 We just do a string compare here, we don't analyze the meaning
12906 of the string, as it would be too costly for little gain. */
12907 if (callee_opts->x_aarch64_override_tune_string
12908 && caller_opts->x_aarch64_override_tune_string
12909 && (strcmp (callee_opts->x_aarch64_override_tune_string,
12910 caller_opts->x_aarch64_override_tune_string) != 0))
12911 return false;
12912
12913 return true;
12914 }
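
/* For example, a callee declared as

     int f (int) __attribute__ ((target ("+crc")));

   cannot be inlined into a caller that does not itself enable CRC, because
   the callee's ISA flags would not be a subset of the caller's.  (A sketch
   only; for callees that are not always_inline the remaining checks above
   also apply.)  */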
12915
12916 /* Return true if SYMBOL_REF X binds locally. */
12917
12918 static bool
12919 aarch64_symbol_binds_local_p (const_rtx x)
12920 {
12921 return (SYMBOL_REF_DECL (x)
12922 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
12923 : SYMBOL_REF_LOCAL_P (x));
12924 }
12925
12926 /* Return true if SYMBOL_REF X is thread local. */
12927 static bool
12928 aarch64_tls_symbol_p (rtx x)
12929 {
12930 if (! TARGET_HAVE_TLS)
12931 return false;
12932
12933 if (GET_CODE (x) != SYMBOL_REF)
12934 return false;
12935
12936 return SYMBOL_REF_TLS_MODEL (x) != 0;
12937 }
12938
12939 /* Classify a TLS symbol into one of the TLS kinds. */
12940 enum aarch64_symbol_type
12941 aarch64_classify_tls_symbol (rtx x)
12942 {
12943 enum tls_model tls_kind = tls_symbolic_operand_type (x);
12944
12945 switch (tls_kind)
12946 {
12947 case TLS_MODEL_GLOBAL_DYNAMIC:
12948 case TLS_MODEL_LOCAL_DYNAMIC:
12949 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
12950
12951 case TLS_MODEL_INITIAL_EXEC:
12952 switch (aarch64_cmodel)
12953 {
12954 case AARCH64_CMODEL_TINY:
12955 case AARCH64_CMODEL_TINY_PIC:
12956 return SYMBOL_TINY_TLSIE;
12957 default:
12958 return SYMBOL_SMALL_TLSIE;
12959 }
12960
12961 case TLS_MODEL_LOCAL_EXEC:
12962 if (aarch64_tls_size == 12)
12963 return SYMBOL_TLSLE12;
12964 else if (aarch64_tls_size == 24)
12965 return SYMBOL_TLSLE24;
12966 else if (aarch64_tls_size == 32)
12967 return SYMBOL_TLSLE32;
12968 else if (aarch64_tls_size == 48)
12969 return SYMBOL_TLSLE48;
12970 else
12971 gcc_unreachable ();
12972
12973 case TLS_MODEL_EMULATED:
12974 case TLS_MODEL_NONE:
12975 return SYMBOL_FORCE_TO_MEM;
12976
12977 default:
12978 gcc_unreachable ();
12979 }
12980 }
12981
12982 /* Return the correct method for accessing X + OFFSET, where X is either
12983 a SYMBOL_REF or LABEL_REF. */
12984
12985 enum aarch64_symbol_type
12986 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
12987 {
12988 if (GET_CODE (x) == LABEL_REF)
12989 {
12990 switch (aarch64_cmodel)
12991 {
12992 case AARCH64_CMODEL_LARGE:
12993 return SYMBOL_FORCE_TO_MEM;
12994
12995 case AARCH64_CMODEL_TINY_PIC:
12996 case AARCH64_CMODEL_TINY:
12997 return SYMBOL_TINY_ABSOLUTE;
12998
12999 case AARCH64_CMODEL_SMALL_SPIC:
13000 case AARCH64_CMODEL_SMALL_PIC:
13001 case AARCH64_CMODEL_SMALL:
13002 return SYMBOL_SMALL_ABSOLUTE;
13003
13004 default:
13005 gcc_unreachable ();
13006 }
13007 }
13008
13009 if (GET_CODE (x) == SYMBOL_REF)
13010 {
13011 if (aarch64_tls_symbol_p (x))
13012 return aarch64_classify_tls_symbol (x);
13013
13014 switch (aarch64_cmodel)
13015 {
13016 case AARCH64_CMODEL_TINY:
13017 /* When we retrieve symbol + offset address, we have to make sure
13018 the offset does not cause overflow of the final address. But
13019 we have no way of knowing the address of symbol at compile time
13020 so we can't accurately say if the distance between the PC and
13021 symbol + offset is outside the addressable range of +/-1M in the
13022 TINY code model. So we rely on images not being greater than
13023 1M and cap the offset at 1M and anything beyond 1M will have to
13024 be loaded using an alternative mechanism. Furthermore if the
13025 symbol is a weak reference to something that isn't known to
13026 resolve to a symbol in this module, then force to memory. */
13027 if ((SYMBOL_REF_WEAK (x)
13028 && !aarch64_symbol_binds_local_p (x))
13029 || !IN_RANGE (offset, -1048575, 1048575))
13030 return SYMBOL_FORCE_TO_MEM;
13031 return SYMBOL_TINY_ABSOLUTE;
13032
13033 case AARCH64_CMODEL_SMALL:
13034 /* Same reasoning as the tiny code model, but the offset cap here is
13035 4G. */
13036 if ((SYMBOL_REF_WEAK (x)
13037 && !aarch64_symbol_binds_local_p (x))
13038 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13039 HOST_WIDE_INT_C (4294967264)))
13040 return SYMBOL_FORCE_TO_MEM;
13041 return SYMBOL_SMALL_ABSOLUTE;
13042
13043 case AARCH64_CMODEL_TINY_PIC:
13044 if (!aarch64_symbol_binds_local_p (x))
13045 return SYMBOL_TINY_GOT;
13046 return SYMBOL_TINY_ABSOLUTE;
13047
13048 case AARCH64_CMODEL_SMALL_SPIC:
13049 case AARCH64_CMODEL_SMALL_PIC:
13050 if (!aarch64_symbol_binds_local_p (x))
13051 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13052 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13053 return SYMBOL_SMALL_ABSOLUTE;
13054
13055 case AARCH64_CMODEL_LARGE:
13056 /* This is alright even in PIC code as the constant
13057 pool reference is always PC relative and within
13058 the same translation unit. */
13059 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13060 return SYMBOL_SMALL_ABSOLUTE;
13061 else
13062 return SYMBOL_FORCE_TO_MEM;
13063
13064 default:
13065 gcc_unreachable ();
13066 }
13067 }
13068
13069 /* By default push everything into the constant pool. */
13070 return SYMBOL_FORCE_TO_MEM;
13071 }
13072
13073 bool
13074 aarch64_constant_address_p (rtx x)
13075 {
13076 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13077 }
13078
13079 bool
13080 aarch64_legitimate_pic_operand_p (rtx x)
13081 {
13082 if (GET_CODE (x) == SYMBOL_REF
13083 || (GET_CODE (x) == CONST
13084 && GET_CODE (XEXP (x, 0)) == PLUS
13085 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13086 return false;
13087
13088 return true;
13089 }
13090
13091 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13092 that should be rematerialized rather than spilled. */
13093
13094 static bool
13095 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13096 {
13097 /* Support CSE and rematerialization of common constants. */
13098 if (CONST_INT_P (x)
13099 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13100 || GET_CODE (x) == CONST_VECTOR)
13101 return true;
13102
13103 /* Do not allow vector struct mode constants for Advanced SIMD.
13104 We could support 0 and -1 easily, but they need support in
13105 aarch64-simd.md. */
13106 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13107 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13108 return false;
13109
13110 /* Only accept variable-length vector constants if they can be
13111 handled directly.
13112
13113 ??? It would be possible to handle rematerialization of other
13114 constants via secondary reloads. */
13115 if (vec_flags & VEC_ANY_SVE)
13116 return aarch64_simd_valid_immediate (x, NULL);
13117
13118 if (GET_CODE (x) == HIGH)
13119 x = XEXP (x, 0);
13120
13121 /* Accept polynomial constants that can be calculated by using the
13122 destination of a move as the sole temporary. Constants that
13123 require a second temporary cannot be rematerialized (they can't be
13124 forced to memory and also aren't legitimate constants). */
13125 poly_int64 offset;
13126 if (poly_int_rtx_p (x, &offset))
13127 return aarch64_offset_temporaries (false, offset) <= 1;
13128
13129 /* If an offset is being added to something else, we need to allow the
13130 base to be moved into the destination register, meaning that there
13131 are no free temporaries for the offset. */
13132 x = strip_offset (x, &offset);
13133 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13134 return false;
13135
13136 /* Do not allow const (plus (anchor_symbol, const_int)). */
13137 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13138 return false;
13139
13140 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13141 so spilling them is better than rematerialization. */
13142 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13143 return true;
13144
13145 /* Label references are always constant. */
13146 if (GET_CODE (x) == LABEL_REF)
13147 return true;
13148
13149 return false;
13150 }
13151
13152 rtx
13153 aarch64_load_tp (rtx target)
13154 {
13155 if (!target
13156 || GET_MODE (target) != Pmode
13157 || !register_operand (target, Pmode))
13158 target = gen_reg_rtx (Pmode);
13159
13160 /* Can return in any reg. */
13161 emit_insn (gen_aarch64_load_tp_hard (target));
13162 return target;
13163 }
13164
13165 /* On AAPCS systems, this is the "struct __va_list". */
13166 static GTY(()) tree va_list_type;
13167
13168 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13169 Return the type to use as __builtin_va_list.
13170
13171 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13172
13173 struct __va_list
13174 {
13175 void *__stack;
13176 void *__gr_top;
13177 void *__vr_top;
13178 int __gr_offs;
13179 int __vr_offs;
13180 }; */
13181
13182 static tree
13183 aarch64_build_builtin_va_list (void)
13184 {
13185 tree va_list_name;
13186 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13187
13188 /* Create the type. */
13189 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13190 /* Give it the required name. */
13191 va_list_name = build_decl (BUILTINS_LOCATION,
13192 TYPE_DECL,
13193 get_identifier ("__va_list"),
13194 va_list_type);
13195 DECL_ARTIFICIAL (va_list_name) = 1;
13196 TYPE_NAME (va_list_type) = va_list_name;
13197 TYPE_STUB_DECL (va_list_type) = va_list_name;
13198
13199 /* Create the fields. */
13200 f_stack = build_decl (BUILTINS_LOCATION,
13201 FIELD_DECL, get_identifier ("__stack"),
13202 ptr_type_node);
13203 f_grtop = build_decl (BUILTINS_LOCATION,
13204 FIELD_DECL, get_identifier ("__gr_top"),
13205 ptr_type_node);
13206 f_vrtop = build_decl (BUILTINS_LOCATION,
13207 FIELD_DECL, get_identifier ("__vr_top"),
13208 ptr_type_node);
13209 f_groff = build_decl (BUILTINS_LOCATION,
13210 FIELD_DECL, get_identifier ("__gr_offs"),
13211 integer_type_node);
13212 f_vroff = build_decl (BUILTINS_LOCATION,
13213 FIELD_DECL, get_identifier ("__vr_offs"),
13214 integer_type_node);
13215
13216 /* Tell tree-stdarg pass about our internal offset fields.
13217 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13218 purposes, to identify whether the code is updating va_list internal
13219 offset fields in an irregular way. */
13220 va_list_gpr_counter_field = f_groff;
13221 va_list_fpr_counter_field = f_vroff;
13222
13223 DECL_ARTIFICIAL (f_stack) = 1;
13224 DECL_ARTIFICIAL (f_grtop) = 1;
13225 DECL_ARTIFICIAL (f_vrtop) = 1;
13226 DECL_ARTIFICIAL (f_groff) = 1;
13227 DECL_ARTIFICIAL (f_vroff) = 1;
13228
13229 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13230 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13231 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13232 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13233 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13234
13235 TYPE_FIELDS (va_list_type) = f_stack;
13236 DECL_CHAIN (f_stack) = f_grtop;
13237 DECL_CHAIN (f_grtop) = f_vrtop;
13238 DECL_CHAIN (f_vrtop) = f_groff;
13239 DECL_CHAIN (f_groff) = f_vroff;
13240
13241 /* Compute its layout. */
13242 layout_type (va_list_type);
13243
13244 return va_list_type;
13245 }
13246
13247 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13248 static void
13249 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13250 {
13251 const CUMULATIVE_ARGS *cum;
13252 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13253 tree stack, grtop, vrtop, groff, vroff;
13254 tree t;
13255 int gr_save_area_size = cfun->va_list_gpr_size;
13256 int vr_save_area_size = cfun->va_list_fpr_size;
13257 int vr_offset;
13258
13259 cum = &crtl->args.info;
13260 if (cfun->va_list_gpr_size)
13261 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13262 cfun->va_list_gpr_size);
13263 if (cfun->va_list_fpr_size)
13264 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13265 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13266
13267 if (!TARGET_FLOAT)
13268 {
13269 gcc_assert (cum->aapcs_nvrn == 0);
13270 vr_save_area_size = 0;
13271 }
13272
13273 f_stack = TYPE_FIELDS (va_list_type_node);
13274 f_grtop = DECL_CHAIN (f_stack);
13275 f_vrtop = DECL_CHAIN (f_grtop);
13276 f_groff = DECL_CHAIN (f_vrtop);
13277 f_vroff = DECL_CHAIN (f_groff);
13278
13279 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13280 NULL_TREE);
13281 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13282 NULL_TREE);
13283 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13284 NULL_TREE);
13285 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13286 NULL_TREE);
13287 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13288 NULL_TREE);
13289
13290 /* Emit code to initialize STACK, which points to the next varargs stack
13291 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13292 by named arguments. STACK is 8-byte aligned. */
13293 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13294 if (cum->aapcs_stack_size > 0)
13295 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13296 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13297 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13298
13299 /* Emit code to initialize GRTOP, the top of the GR save area.
13300 virtual_incoming_args_rtx should have been 16 byte aligned. */
13301 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13302 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13303 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13304
13305 /* Emit code to initialize VRTOP, the top of the VR save area.
13306 This address is gr_save_area_bytes below GRTOP, rounded
13307 down to the next 16-byte boundary. */
13308 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13309 vr_offset = ROUND_UP (gr_save_area_size,
13310 STACK_BOUNDARY / BITS_PER_UNIT);
13311
13312 if (vr_offset)
13313 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13314 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13315 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13316
13317 /* Emit code to initialize GROFF, the offset from GRTOP of the
13318 next GPR argument. */
13319 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13320 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13321 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13322
13323 /* Likewise emit code to initialize VROFF, the offset from VRTOP
13324 of the next VR argument. */
13325 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13326 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13327 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13328 }
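
/* A rough sketch of what the code above emits for va_start (ap, n) in
   void f (int n, ...), assuming the tree-stdarg pass did not shrink the
   save areas (the concrete numbers are examples only):

     ap.__stack   = <address of the first stack-passed vararg>;
     ap.__gr_top  = <top of the GR save area>;
     ap.__vr_top  = ap.__gr_top - ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;     e.g. -56 when x1-x7 are saved
     ap.__vr_offs = -vr_save_area_size;     e.g. -128 when q0-q7 are saved  */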
13329
13330 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13331
13332 static tree
13333 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13334 gimple_seq *post_p ATTRIBUTE_UNUSED)
13335 {
13336 tree addr;
13337 bool indirect_p;
13338 bool is_ha; /* is HFA or HVA. */
13339 bool dw_align; /* double-word align. */
13340 machine_mode ag_mode = VOIDmode;
13341 int nregs;
13342 machine_mode mode;
13343
13344 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13345 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13346 HOST_WIDE_INT size, rsize, adjust, align;
13347 tree t, u, cond1, cond2;
13348
13349 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13350 if (indirect_p)
13351 type = build_pointer_type (type);
13352
13353 mode = TYPE_MODE (type);
13354
13355 f_stack = TYPE_FIELDS (va_list_type_node);
13356 f_grtop = DECL_CHAIN (f_stack);
13357 f_vrtop = DECL_CHAIN (f_grtop);
13358 f_groff = DECL_CHAIN (f_vrtop);
13359 f_vroff = DECL_CHAIN (f_groff);
13360
13361 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13362 f_stack, NULL_TREE);
13363 size = int_size_in_bytes (type);
13364
13365 bool abi_break;
13366 align
13367 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13368
13369 dw_align = false;
13370 adjust = 0;
13371 if (aarch64_vfp_is_call_or_return_candidate (mode,
13372 type,
13373 &ag_mode,
13374 &nregs,
13375 &is_ha))
13376 {
13377 /* No frontends can create types with variable-sized modes, so we
13378 shouldn't be asked to pass or return them. */
13379 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13380
13381 /* TYPE passed in fp/simd registers. */
13382 if (!TARGET_FLOAT)
13383 aarch64_err_no_fpadvsimd (mode);
13384
13385 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13386 unshare_expr (valist), f_vrtop, NULL_TREE);
13387 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13388 unshare_expr (valist), f_vroff, NULL_TREE);
13389
13390 rsize = nregs * UNITS_PER_VREG;
13391
13392 if (is_ha)
13393 {
13394 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13395 adjust = UNITS_PER_VREG - ag_size;
13396 }
13397 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13398 && size < UNITS_PER_VREG)
13399 {
13400 adjust = UNITS_PER_VREG - size;
13401 }
13402 }
13403 else
13404 {
13405 /* TYPE passed in general registers. */
13406 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13407 unshare_expr (valist), f_grtop, NULL_TREE);
13408 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13409 unshare_expr (valist), f_groff, NULL_TREE);
13410 rsize = ROUND_UP (size, UNITS_PER_WORD);
13411 nregs = rsize / UNITS_PER_WORD;
13412
13413 if (align > 8)
13414 {
13415 if (abi_break && warn_psabi)
13416 inform (input_location, "parameter passing for argument of type "
13417 "%qT changed in GCC 9.1", type);
13418 dw_align = true;
13419 }
13420
13421 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13422 && size < UNITS_PER_WORD)
13423 {
13424 adjust = UNITS_PER_WORD - size;
13425 }
13426 }
13427
13428 /* Get a local temporary for the field value. */
13429 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13430
13431 /* Emit code to branch if off >= 0. */
13432 t = build2 (GE_EXPR, boolean_type_node, off,
13433 build_int_cst (TREE_TYPE (off), 0));
13434 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13435
13436 if (dw_align)
13437 {
13438 /* Emit: offs = (offs + 15) & -16. */
13439 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13440 build_int_cst (TREE_TYPE (off), 15));
13441 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13442 build_int_cst (TREE_TYPE (off), -16));
13443 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13444 }
13445 else
13446 roundup = NULL;
13447
13448 /* Update ap.__[g|v]r_offs */
13449 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13450 build_int_cst (TREE_TYPE (off), rsize));
13451 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13452
13453 /* String up. */
13454 if (roundup)
13455 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13456
13457 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13458 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13459 build_int_cst (TREE_TYPE (f_off), 0));
13460 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13461
13462 /* String up: make sure the assignment happens before the use. */
13463 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13464 COND_EXPR_ELSE (cond1) = t;
13465
13466 /* Prepare the trees handling the argument that is passed on the stack;
13467 the top level node will store in ON_STACK. */
13468 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13469 if (align > 8)
13470 {
13471 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13472 t = fold_build_pointer_plus_hwi (arg, 15);
13473 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13474 build_int_cst (TREE_TYPE (t), -16));
13475 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13476 }
13477 else
13478 roundup = NULL;
13479 /* Advance ap.__stack */
13480 t = fold_build_pointer_plus_hwi (arg, size + 7);
13481 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13482 build_int_cst (TREE_TYPE (t), -8));
13483 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13484 /* String up roundup and advance. */
13485 if (roundup)
13486 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13487 /* String up with arg */
13488 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13489 /* Big-endianness related address adjustment. */
13490 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13491 && size < UNITS_PER_WORD)
13492 {
13493 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13494 size_int (UNITS_PER_WORD - size));
13495 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13496 }
13497
13498 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13499 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13500
13501 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13502 t = off;
13503 if (adjust)
13504 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13505 build_int_cst (TREE_TYPE (off), adjust));
13506
13507 t = fold_convert (sizetype, t);
13508 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13509
13510 if (is_ha)
13511 {
13512 /* type ha; // treat as "struct {ftype field[n];}"
13513 ... [computing offs]
13514 for (i = 0; i < nregs; ++i, offs += 16)
13515 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13516 return ha; */
13517 int i;
13518 tree tmp_ha, field_t, field_ptr_t;
13519
13520 /* Declare a local variable. */
13521 tmp_ha = create_tmp_var_raw (type, "ha");
13522 gimple_add_tmp_var (tmp_ha);
13523
13524 /* Establish the base type. */
13525 switch (ag_mode)
13526 {
13527 case E_SFmode:
13528 field_t = float_type_node;
13529 field_ptr_t = float_ptr_type_node;
13530 break;
13531 case E_DFmode:
13532 field_t = double_type_node;
13533 field_ptr_t = double_ptr_type_node;
13534 break;
13535 case E_TFmode:
13536 field_t = long_double_type_node;
13537 field_ptr_t = long_double_ptr_type_node;
13538 break;
13539 case E_HFmode:
13540 field_t = aarch64_fp16_type_node;
13541 field_ptr_t = aarch64_fp16_ptr_type_node;
13542 break;
13543 case E_V2SImode:
13544 case E_V4SImode:
13545 {
13546 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13547 field_t = build_vector_type_for_mode (innertype, ag_mode);
13548 field_ptr_t = build_pointer_type (field_t);
13549 }
13550 break;
13551 default:
13552 gcc_assert (0);
13553 }
13554
13555 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
13556 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13557 addr = t;
13558 t = fold_convert (field_ptr_t, addr);
13559 t = build2 (MODIFY_EXPR, field_t,
13560 build1 (INDIRECT_REF, field_t, tmp_ha),
13561 build1 (INDIRECT_REF, field_t, t));
13562
13563 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13564 for (i = 1; i < nregs; ++i)
13565 {
13566 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13567 u = fold_convert (field_ptr_t, addr);
13568 u = build2 (MODIFY_EXPR, field_t,
13569 build2 (MEM_REF, field_t, tmp_ha,
13570 build_int_cst (field_ptr_t,
13571 (i *
13572 int_size_in_bytes (field_t)))),
13573 build1 (INDIRECT_REF, field_t, u));
13574 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13575 }
13576
13577 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13578 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13579 }
13580
13581 COND_EXPR_ELSE (cond2) = t;
13582 addr = fold_convert (build_pointer_type (type), cond1);
13583 addr = build_va_arg_indirect_ref (addr);
13584
13585 if (indirect_p)
13586 addr = build_va_arg_indirect_ref (addr);
13587
13588 return addr;
13589 }
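
/* A simplified sketch of the expansion built above, shown for a plain "int"
   argument taken from the general-register save area (pseudo C, ignoring
   big-endian adjustments):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;
     ap.__gr_offs = off + 8;
     if (ap.__gr_offs > 0)
       goto on_stack;
     addr = ap.__gr_top + off;                   register save area
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (ap.__stack + size + 7) & -8;
   done:
     result = *(int *) addr;  */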
13590
13591 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13592
13593 static void
13594 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13595 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13596 int no_rtl)
13597 {
13598 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13599 CUMULATIVE_ARGS local_cum;
13600 int gr_saved = cfun->va_list_gpr_size;
13601 int vr_saved = cfun->va_list_fpr_size;
13602
13603 /* The caller has advanced CUM up to, but not beyond, the last named
13604 argument. Advance a local copy of CUM past the last "real" named
13605 argument, to find out how many registers are left over. */
13606 local_cum = *cum;
13607 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
13608
13609 /* Find out how many registers we need to save.
13610 Honor the tree-stdarg analysis results. */
13611 if (cfun->va_list_gpr_size)
13612 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13613 cfun->va_list_gpr_size / UNITS_PER_WORD);
13614 if (cfun->va_list_fpr_size)
13615 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13616 cfun->va_list_fpr_size / UNITS_PER_VREG);
13617
13618 if (!TARGET_FLOAT)
13619 {
13620 gcc_assert (local_cum.aapcs_nvrn == 0);
13621 vr_saved = 0;
13622 }
13623
13624 if (!no_rtl)
13625 {
13626 if (gr_saved > 0)
13627 {
13628 rtx ptr, mem;
13629
13630 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13631 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13632 - gr_saved * UNITS_PER_WORD);
13633 mem = gen_frame_mem (BLKmode, ptr);
13634 set_mem_alias_set (mem, get_varargs_alias_set ());
13635
13636 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13637 mem, gr_saved);
13638 }
13639 if (vr_saved > 0)
13640 {
13641 /* We can't use move_block_from_reg, because it will use
13642 the wrong mode, storing D regs only. */
13643 machine_mode mode = TImode;
13644 int off, i, vr_start;
13645
13646 /* Set OFF to the offset from virtual_incoming_args_rtx of
13647 the first vector register. The VR save area lies below
13648 the GR one, and is aligned to 16 bytes. */
13649 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13650 STACK_BOUNDARY / BITS_PER_UNIT);
13651 off -= vr_saved * UNITS_PER_VREG;
13652
13653 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13654 for (i = 0; i < vr_saved; ++i)
13655 {
13656 rtx ptr, mem;
13657
13658 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13659 mem = gen_frame_mem (mode, ptr);
13660 set_mem_alias_set (mem, get_varargs_alias_set ());
13661 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13662 off += UNITS_PER_VREG;
13663 }
13664 }
13665 }
13666
13667 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13668 any complication of having crtl->args.pretend_args_size changed. */
13669 cfun->machine->frame.saved_varargs_size
13670 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13671 STACK_BOUNDARY / BITS_PER_UNIT)
13672 + vr_saved * UNITS_PER_VREG);
13673 }
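
/* A sketch of the resulting save-area layout just below
   virtual_incoming_args_rtx (higher addresses at the top), assuming
   gr_saved == 7 and vr_saved == 8:

       incoming stack arguments
     ---- virtual_incoming_args_rtx ----
       x1 ... x7   (56 bytes, padded up to 64)
     -----------------------------------
       q0 ... q7   (128 bytes)
     -----------------------------------

   saved_varargs_size is then ROUND_UP (56, 16) + 128 == 192.  */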
13674
13675 static void
13676 aarch64_conditional_register_usage (void)
13677 {
13678 int i;
13679 if (!TARGET_FLOAT)
13680 {
13681 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13682 {
13683 fixed_regs[i] = 1;
13684 call_used_regs[i] = 1;
13685 }
13686 }
13687 if (!TARGET_SVE)
13688 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13689 {
13690 fixed_regs[i] = 1;
13691 call_used_regs[i] = 1;
13692 }
13693
13694 /* When tracking speculation, we need a couple of call-clobbered registers
13695 to track the speculation state. It would be nice to just use
13696 IP0 and IP1, but currently there are numerous places that just
13697 assume these registers are free for other uses (eg pointer
13698 authentication). */
13699 if (aarch64_track_speculation)
13700 {
13701 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13702 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13703 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13704 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13705 }
13706 }
13707
13708 /* Walk down the type tree of TYPE counting consecutive base elements.
13709 If *MODEP is VOIDmode, then set it to the first valid floating point
13710 type. If a non-floating point type is found, or if a floating point
13711 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13712 otherwise return the count in the sub-tree. */
13713 static int
13714 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13715 {
13716 machine_mode mode;
13717 HOST_WIDE_INT size;
13718
13719 switch (TREE_CODE (type))
13720 {
13721 case REAL_TYPE:
13722 mode = TYPE_MODE (type);
13723 if (mode != DFmode && mode != SFmode
13724 && mode != TFmode && mode != HFmode)
13725 return -1;
13726
13727 if (*modep == VOIDmode)
13728 *modep = mode;
13729
13730 if (*modep == mode)
13731 return 1;
13732
13733 break;
13734
13735 case COMPLEX_TYPE:
13736 mode = TYPE_MODE (TREE_TYPE (type));
13737 if (mode != DFmode && mode != SFmode
13738 && mode != TFmode && mode != HFmode)
13739 return -1;
13740
13741 if (*modep == VOIDmode)
13742 *modep = mode;
13743
13744 if (*modep == mode)
13745 return 2;
13746
13747 break;
13748
13749 case VECTOR_TYPE:
13750 /* Use V2SImode and V4SImode as representatives of all 64-bit
13751 and 128-bit vector types. */
13752 size = int_size_in_bytes (type);
13753 switch (size)
13754 {
13755 case 8:
13756 mode = V2SImode;
13757 break;
13758 case 16:
13759 mode = V4SImode;
13760 break;
13761 default:
13762 return -1;
13763 }
13764
13765 if (*modep == VOIDmode)
13766 *modep = mode;
13767
13768 /* Vector modes are considered to be opaque: two vectors are
13769 equivalent for the purposes of being homogeneous aggregates
13770 if they are the same size. */
13771 if (*modep == mode)
13772 return 1;
13773
13774 break;
13775
13776 case ARRAY_TYPE:
13777 {
13778 int count;
13779 tree index = TYPE_DOMAIN (type);
13780
13781 /* Can't handle incomplete types nor sizes that are not
13782 fixed. */
13783 if (!COMPLETE_TYPE_P (type)
13784 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13785 return -1;
13786
13787 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13788 if (count == -1
13789 || !index
13790 || !TYPE_MAX_VALUE (index)
13791 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13792 || !TYPE_MIN_VALUE (index)
13793 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13794 || count < 0)
13795 return -1;
13796
13797 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13798 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13799
13800 /* There must be no padding. */
13801 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13802 count * GET_MODE_BITSIZE (*modep)))
13803 return -1;
13804
13805 return count;
13806 }
13807
13808 case RECORD_TYPE:
13809 {
13810 int count = 0;
13811 int sub_count;
13812 tree field;
13813
13814 /* Can't handle incomplete types nor sizes that are not
13815 fixed. */
13816 if (!COMPLETE_TYPE_P (type)
13817 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13818 return -1;
13819
13820 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13821 {
13822 if (TREE_CODE (field) != FIELD_DECL)
13823 continue;
13824
13825 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13826 if (sub_count < 0)
13827 return -1;
13828 count += sub_count;
13829 }
13830
13831 /* There must be no padding. */
13832 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13833 count * GET_MODE_BITSIZE (*modep)))
13834 return -1;
13835
13836 return count;
13837 }
13838
13839 case UNION_TYPE:
13840 case QUAL_UNION_TYPE:
13841 {
13842 /* These aren't very interesting except in a degenerate case. */
13843 int count = 0;
13844 int sub_count;
13845 tree field;
13846
13847 /* Can't handle incomplete types nor sizes that are not
13848 fixed. */
13849 if (!COMPLETE_TYPE_P (type)
13850 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13851 return -1;
13852
13853 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13854 {
13855 if (TREE_CODE (field) != FIELD_DECL)
13856 continue;
13857
13858 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13859 if (sub_count < 0)
13860 return -1;
13861 count = count > sub_count ? count : sub_count;
13862 }
13863
13864 /* There must be no padding. */
13865 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13866 count * GET_MODE_BITSIZE (*modep)))
13867 return -1;
13868
13869 return count;
13870 }
13871
13872 default:
13873 break;
13874 }
13875
13876 return -1;
13877 }
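
/* Some illustrative inputs and results for the walk above (assumed
   examples, not an exhaustive list):

     struct { double x, y; }          ->  2, *modep == DFmode
     struct { float v[3]; }           ->  3, *modep == SFmode
     _Complex double                  ->  2, *modep == DFmode
     struct { float x; double y; }    -> -1  (mixed element modes)
     struct { float x; int y; }       -> -1  (non-FP member)  */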
13878
13879 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13880 type as described in AAPCS64 \S 4.1.2.
13881
13882 See the comment above aarch64_composite_type_p for the notes on MODE. */
13883
13884 static bool
13885 aarch64_short_vector_p (const_tree type,
13886 machine_mode mode)
13887 {
13888 poly_int64 size = -1;
13889
13890 if (type && TREE_CODE (type) == VECTOR_TYPE)
13891 size = int_size_in_bytes (type);
13892 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
13893 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
13894 size = GET_MODE_SIZE (mode);
13895
13896 return known_eq (size, 8) || known_eq (size, 16);
13897 }
13898
13899 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13900 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
13901 array types. The C99 floating-point complex types are also considered
13902 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
13903 types, which are GCC extensions and out of the scope of AAPCS64, are
13904 treated as composite types here as well.
13905
13906 Note that MODE itself is not sufficient in determining whether a type
13907 is such a composite type or not. This is because
13908 stor-layout.c:compute_record_mode may have already changed the MODE
13909 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
13910 structure with only one field may have its MODE set to the mode of the
13911 field. Also an integer mode whose size matches the size of the
13912 RECORD_TYPE type may be used to substitute the original mode
13913 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
13914 solely relied on. */
13915
13916 static bool
13917 aarch64_composite_type_p (const_tree type,
13918 machine_mode mode)
13919 {
13920 if (aarch64_short_vector_p (type, mode))
13921 return false;
13922
13923 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
13924 return true;
13925
13926 if (mode == BLKmode
13927 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
13928 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
13929 return true;
13930
13931 return false;
13932 }
13933
13934 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13935 shall be passed or returned in simd/fp register(s) (providing these
13936 parameter passing registers are available).
13937
13938 Upon successful return, *COUNT returns the number of needed registers,
13939 *BASE_MODE returns the mode of the individual register and, when IS_HA
13940 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
13941 floating-point aggregate or a homogeneous short-vector aggregate. */
13942
13943 static bool
13944 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
13945 const_tree type,
13946 machine_mode *base_mode,
13947 int *count,
13948 bool *is_ha)
13949 {
13950 machine_mode new_mode = VOIDmode;
13951 bool composite_p = aarch64_composite_type_p (type, mode);
13952
13953 if (is_ha != NULL) *is_ha = false;
13954
13955 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
13956 || aarch64_short_vector_p (type, mode))
13957 {
13958 *count = 1;
13959 new_mode = mode;
13960 }
13961 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
13962 {
13963 if (is_ha != NULL) *is_ha = true;
13964 *count = 2;
13965 new_mode = GET_MODE_INNER (mode);
13966 }
13967 else if (type && composite_p)
13968 {
13969 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
13970
13971 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
13972 {
13973 if (is_ha != NULL) *is_ha = true;
13974 *count = ag_count;
13975 }
13976 else
13977 return false;
13978 }
13979 else
13980 return false;
13981
13982 *base_mode = new_mode;
13983 return true;
13984 }
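
/* For instance (a hedged sketch, not an exhaustive list):

     double                      ->  true, *count = 1, *base_mode = DFmode
     _Complex float              ->  true, *count = 2, *base_mode = SFmode,
                                     *is_ha = true
     struct { float x, y, z; }   ->  true, *count = 3, *base_mode = SFmode,
                                     *is_ha = true  (an HFA)
     struct { float v[5]; }      ->  false  (exceeds HA_MAX_NUM_FLDS)
     struct { float x; int y; }  ->  false  */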
13985
13986 /* Implement TARGET_STRUCT_VALUE_RTX. */
13987
13988 static rtx
13989 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
13990 int incoming ATTRIBUTE_UNUSED)
13991 {
13992 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
13993 }
13994
13995 /* Implements target hook vector_mode_supported_p. */
13996 static bool
13997 aarch64_vector_mode_supported_p (machine_mode mode)
13998 {
13999 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14000 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14001 }
14002
14003 /* Return appropriate SIMD container
14004 for MODE within a vector of WIDTH bits. */
14005 static machine_mode
14006 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14007 {
14008 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14009 switch (mode)
14010 {
14011 case E_DFmode:
14012 return VNx2DFmode;
14013 case E_SFmode:
14014 return VNx4SFmode;
14015 case E_HFmode:
14016 return VNx8HFmode;
14017 case E_DImode:
14018 return VNx2DImode;
14019 case E_SImode:
14020 return VNx4SImode;
14021 case E_HImode:
14022 return VNx8HImode;
14023 case E_QImode:
14024 return VNx16QImode;
14025 default:
14026 return word_mode;
14027 }
14028
14029 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14030 if (TARGET_SIMD)
14031 {
14032 if (known_eq (width, 128))
14033 switch (mode)
14034 {
14035 case E_DFmode:
14036 return V2DFmode;
14037 case E_SFmode:
14038 return V4SFmode;
14039 case E_HFmode:
14040 return V8HFmode;
14041 case E_SImode:
14042 return V4SImode;
14043 case E_HImode:
14044 return V8HImode;
14045 case E_QImode:
14046 return V16QImode;
14047 case E_DImode:
14048 return V2DImode;
14049 default:
14050 break;
14051 }
14052 else
14053 switch (mode)
14054 {
14055 case E_SFmode:
14056 return V2SFmode;
14057 case E_HFmode:
14058 return V4HFmode;
14059 case E_SImode:
14060 return V2SImode;
14061 case E_HImode:
14062 return V4HImode;
14063 case E_QImode:
14064 return V8QImode;
14065 default:
14066 break;
14067 }
14068 }
14069 return word_mode;
14070 }
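
/* Example mappings (illustrative, assuming TARGET_SIMD; the SVE row also
   assumes TARGET_SVE):

     SFmode, 128 bits             -> V4SFmode
     SFmode,  64 bits             -> V2SFmode
     DFmode,  64 bits             -> word_mode  (no 64-bit DF container)
     SFmode, BITS_PER_SVE_VECTOR  -> VNx4SFmode  */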
14071
14072 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14073 static machine_mode
14074 aarch64_preferred_simd_mode (scalar_mode mode)
14075 {
14076 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14077 return aarch64_simd_container_mode (mode, bits);
14078 }
14079
14080 /* Return a list of possible vector sizes for the vectorizer
14081 to iterate over. */
14082 static void
14083 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
14084 {
14085 if (TARGET_SVE)
14086 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14087 sizes->safe_push (16);
14088 sizes->safe_push (8);
14089 }
14090
14091 /* Implement TARGET_MANGLE_TYPE. */
14092
14093 static const char *
14094 aarch64_mangle_type (const_tree type)
14095 {
14096 /* The AArch64 ABI documents say that "__va_list" has to be
14097 mangled as if it is in the "std" namespace. */
14098 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14099 return "St9__va_list";
14100
14101 /* Half-precision float. */
14102 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14103 return "Dh";
14104
14105 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14106 builtin types. */
14107 if (TYPE_NAME (type) != NULL)
14108 return aarch64_mangle_builtin_type (type);
14109
14110 /* Use the default mangling. */
14111 return NULL;
14112 }
14113
14114 /* Find the first rtx_insn before insn that will generate an assembly
14115 instruction. */
14116
14117 static rtx_insn *
14118 aarch64_prev_real_insn (rtx_insn *insn)
14119 {
14120 if (!insn)
14121 return NULL;
14122
14123 do
14124 {
14125 insn = prev_real_insn (insn);
14126 }
14127 while (insn && recog_memoized (insn) < 0);
14128
14129 return insn;
14130 }
14131
14132 static bool
14133 is_madd_op (enum attr_type t1)
14134 {
14135 unsigned int i;
14136 /* A number of these may be AArch32 only. */
14137 enum attr_type mlatypes[] = {
14138 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14139 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14140 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14141 };
14142
14143 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14144 {
14145 if (t1 == mlatypes[i])
14146 return true;
14147 }
14148
14149 return false;
14150 }
14151
14152 /* Check if there is a register dependency between a load and the insn
14153 for which we hold recog_data. */
14154
14155 static bool
14156 dep_between_memop_and_curr (rtx memop)
14157 {
14158 rtx load_reg;
14159 int opno;
14160
14161 gcc_assert (GET_CODE (memop) == SET);
14162
14163 if (!REG_P (SET_DEST (memop)))
14164 return false;
14165
14166 load_reg = SET_DEST (memop);
14167 for (opno = 1; opno < recog_data.n_operands; opno++)
14168 {
14169 rtx operand = recog_data.operand[opno];
14170 if (REG_P (operand)
14171 && reg_overlap_mentioned_p (load_reg, operand))
14172 return true;
14173
14174 }
14175 return false;
14176 }
14177
14178
14179 /* When working around the Cortex-A53 erratum 835769,
14180 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14181 instruction and has a preceding memory instruction such that a NOP
14182 should be inserted between them. */
14183
14184 bool
14185 aarch64_madd_needs_nop (rtx_insn* insn)
14186 {
14187 enum attr_type attr_type;
14188 rtx_insn *prev;
14189 rtx body;
14190
14191 if (!TARGET_FIX_ERR_A53_835769)
14192 return false;
14193
14194 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14195 return false;
14196
14197 attr_type = get_attr_type (insn);
14198 if (!is_madd_op (attr_type))
14199 return false;
14200
14201 prev = aarch64_prev_real_insn (insn);
14202 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14203 Restore recog state to INSN to avoid state corruption. */
14204 extract_constrain_insn_cached (insn);
14205
14206 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14207 return false;
14208
14209 body = single_set (prev);
14210
14211 /* If the previous insn is a memory op and there is no dependency between
14212 it and the DImode madd, emit a NOP between them. If body is NULL then we
14213 have a complex memory operation, probably a load/store pair.
14214 Be conservative for now and emit a NOP. */
14215 if (GET_MODE (recog_data.operand[0]) == DImode
14216 && (!body || !dep_between_memop_and_curr (body)))
14217 return true;
14218
14219 return false;
14220
14221 }
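
/* The resulting workaround, as emitted by aarch64_final_prescan_insn below,
   looks roughly like this in the output assembly (register names are
   arbitrary):

       ldr   x5, [x10]
       nop                     // between mem op and mult-accumulate
       madd  x0, x1, x2, x3

   i.e. a NOP separates a memory operation from a following 64-bit
   multiply-accumulate when the erratum conditions are met.  */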
14222
14223
14224 /* Implement FINAL_PRESCAN_INSN. */
14225
14226 void
14227 aarch64_final_prescan_insn (rtx_insn *insn)
14228 {
14229 if (aarch64_madd_needs_nop (insn))
14230 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14231 }
14232
14233
14234 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14235 instruction. */
14236
14237 bool
14238 aarch64_sve_index_immediate_p (rtx base_or_step)
14239 {
14240 return (CONST_INT_P (base_or_step)
14241 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14242 }
14243
14244 /* Return true if X is a valid immediate for the SVE ADD and SUB
14245 instructions. Negate X first if NEGATE_P is true. */
14246
14247 bool
14248 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14249 {
14250 rtx elt;
14251
14252 if (!const_vec_duplicate_p (x, &elt)
14253 || !CONST_INT_P (elt))
14254 return false;
14255
14256 HOST_WIDE_INT val = INTVAL (elt);
14257 if (negate_p)
14258 val = -val;
14259 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14260
14261 if (val & 0xff)
14262 return IN_RANGE (val, 0, 0xff);
14263 return IN_RANGE (val, 0, 0xff00);
14264 }
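
/* Examples of values accepted above (after any negation and masking):
   1, 0xff, 0x100 and 0xff00 are valid, being either an 8-bit immediate or
   an 8-bit immediate shifted left by 8, while 0x101 is not.  */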
14265
14266 /* Return true if X is a valid immediate operand for an SVE logical
14267 instruction such as AND. */
14268
14269 bool
14270 aarch64_sve_bitmask_immediate_p (rtx x)
14271 {
14272 rtx elt;
14273
14274 return (const_vec_duplicate_p (x, &elt)
14275 && CONST_INT_P (elt)
14276 && aarch64_bitmask_imm (INTVAL (elt),
14277 GET_MODE_INNER (GET_MODE (x))));
14278 }
14279
14280 /* Return true if X is a valid immediate for the SVE DUP and CPY
14281 instructions. */
14282
14283 bool
14284 aarch64_sve_dup_immediate_p (rtx x)
14285 {
14286 rtx elt;
14287
14288 if (!const_vec_duplicate_p (x, &elt)
14289 || !CONST_INT_P (elt))
14290 return false;
14291
14292 HOST_WIDE_INT val = INTVAL (elt);
14293 if (val & 0xff)
14294 return IN_RANGE (val, -0x80, 0x7f);
14295 return IN_RANGE (val, -0x8000, 0x7f00);
14296 }
14297
14298 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14299 SIGNED_P says whether the operand is signed rather than unsigned. */
14300
14301 bool
14302 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14303 {
14304 rtx elt;
14305
14306 return (const_vec_duplicate_p (x, &elt)
14307 && CONST_INT_P (elt)
14308 && (signed_p
14309 ? IN_RANGE (INTVAL (elt), -16, 15)
14310 : IN_RANGE (INTVAL (elt), 0, 127)));
14311 }
14312
14313 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14314 instruction. Negate X first if NEGATE_P is true. */
14315
14316 bool
14317 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14318 {
14319 rtx elt;
14320 REAL_VALUE_TYPE r;
14321
14322 if (!const_vec_duplicate_p (x, &elt)
14323 || GET_CODE (elt) != CONST_DOUBLE)
14324 return false;
14325
14326 r = *CONST_DOUBLE_REAL_VALUE (elt);
14327
14328 if (negate_p)
14329 r = real_value_negate (&r);
14330
14331 if (real_equal (&r, &dconst1))
14332 return true;
14333 if (real_equal (&r, &dconsthalf))
14334 return true;
14335 return false;
14336 }
14337
14338 /* Return true if X is a valid immediate operand for an SVE FMUL
14339 instruction. */
14340
14341 bool
14342 aarch64_sve_float_mul_immediate_p (rtx x)
14343 {
14344 rtx elt;
14345
14346 /* GCC will never generate a multiply with an immediate of 2, so there is no
14347 point testing for it (even though it is a valid constant). */
14348 return (const_vec_duplicate_p (x, &elt)
14349 && GET_CODE (elt) == CONST_DOUBLE
14350 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14351 }
14352
14353 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14354 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14355 is nonnull, use it to describe valid immediates. */
14356 static bool
14357 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14358 simd_immediate_info *info,
14359 enum simd_immediate_check which,
14360 simd_immediate_info::insn_type insn)
14361 {
14362 /* Try a 4-byte immediate with LSL. */
14363 for (unsigned int shift = 0; shift < 32; shift += 8)
14364 if ((val32 & (0xff << shift)) == val32)
14365 {
14366 if (info)
14367 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14368 simd_immediate_info::LSL, shift);
14369 return true;
14370 }
14371
14372 /* Try a 2-byte immediate with LSL. */
14373 unsigned int imm16 = val32 & 0xffff;
14374 if (imm16 == (val32 >> 16))
14375 for (unsigned int shift = 0; shift < 16; shift += 8)
14376 if ((imm16 & (0xff << shift)) == imm16)
14377 {
14378 if (info)
14379 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14380 simd_immediate_info::LSL, shift);
14381 return true;
14382 }
14383
14384 /* Try a 4-byte immediate with MSL, except for cases that MVN
14385 can handle. */
14386 if (which == AARCH64_CHECK_MOV)
14387 for (unsigned int shift = 8; shift < 24; shift += 8)
14388 {
14389 unsigned int low = (1 << shift) - 1;
14390 if (((val32 & (0xff << shift)) | low) == val32)
14391 {
14392 if (info)
14393 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14394 simd_immediate_info::MSL, shift);
14395 return true;
14396 }
14397 }
14398
14399 return false;
14400 }
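
/* Worked examples (illustrative, not from the original sources):
   val32 == 0x00ab0000 matches the 4-byte LSL case with value 0xab and
   shift 16; val32 == 0x0000abff matches the MSL case with value 0xab
   and shift 8, since ORing in the low (1 << 8) - 1 bits reproduces the
   original constant.  */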
14401
14402 /* Return true if replicating VAL64 is a valid immediate for the
14403 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14404 use it to describe valid immediates. */
14405 static bool
14406 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14407 simd_immediate_info *info,
14408 enum simd_immediate_check which)
14409 {
14410 unsigned int val32 = val64 & 0xffffffff;
14411 unsigned int val16 = val64 & 0xffff;
14412 unsigned int val8 = val64 & 0xff;
14413
14414 if (val32 == (val64 >> 32))
14415 {
14416 if ((which & AARCH64_CHECK_ORR) != 0
14417 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14418 simd_immediate_info::MOV))
14419 return true;
14420
14421 if ((which & AARCH64_CHECK_BIC) != 0
14422 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14423 simd_immediate_info::MVN))
14424 return true;
14425
14426 /* Try using a replicated byte. */
14427 if (which == AARCH64_CHECK_MOV
14428 && val16 == (val32 >> 16)
14429 && val8 == (val16 >> 8))
14430 {
14431 if (info)
14432 *info = simd_immediate_info (QImode, val8);
14433 return true;
14434 }
14435 }
14436
14437 /* Try using a bit-to-bytemask. */
14438 if (which == AARCH64_CHECK_MOV)
14439 {
14440 unsigned int i;
14441 for (i = 0; i < 64; i += 8)
14442 {
14443 unsigned char byte = (val64 >> i) & 0xff;
14444 if (byte != 0 && byte != 0xff)
14445 break;
14446 }
14447 if (i == 64)
14448 {
14449 if (info)
14450 *info = simd_immediate_info (DImode, val64);
14451 return true;
14452 }
14453 }
14454 return false;
14455 }
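
/* Illustrative example (not from the original sources): for the
   bit-to-bytemask case, val64 == 0xff00ff0000ff00ff is accepted because
   every byte is either 0x00 or 0xff, so each byte of the 64-bit
   immediate can be selected as all-ones or all-zeros.  */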
14456
14457 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14458 instruction. If INFO is nonnull, use it to describe valid immediates. */
14459
14460 static bool
14461 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14462 simd_immediate_info *info)
14463 {
14464 scalar_int_mode mode = DImode;
14465 unsigned int val32 = val64 & 0xffffffff;
14466 if (val32 == (val64 >> 32))
14467 {
14468 mode = SImode;
14469 unsigned int val16 = val32 & 0xffff;
14470 if (val16 == (val32 >> 16))
14471 {
14472 mode = HImode;
14473 unsigned int val8 = val16 & 0xff;
14474 if (val8 == (val16 >> 8))
14475 mode = QImode;
14476 }
14477 }
14478 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14479 if (IN_RANGE (val, -0x80, 0x7f))
14480 {
14481 /* DUP with no shift. */
14482 if (info)
14483 *info = simd_immediate_info (mode, val);
14484 return true;
14485 }
14486 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14487 {
14488 /* DUP with LSL #8. */
14489 if (info)
14490 *info = simd_immediate_info (mode, val);
14491 return true;
14492 }
14493 if (aarch64_bitmask_imm (val64, mode))
14494 {
14495 /* DUPM. */
14496 if (info)
14497 *info = simd_immediate_info (mode, val);
14498 return true;
14499 }
14500 return false;
14501 }
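
/* Worked examples (illustrative, not from the original sources):
   val64 == 0x1212121212121212 reduces to a QImode value of 0x12 and is
   handled by DUP with no shift; val64 == 0x1200120012001200 reduces to
   an HImode value of 0x1200, whose low byte is zero and which fits in
   -0x8000..0x7f00, so it is handled by DUP with LSL #8.  */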
14502
14503 /* Return true if OP is a valid SIMD immediate for the operation
14504 described by WHICH. If INFO is nonnull, use it to describe valid
14505 immediates. */
14506 bool
14507 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14508 enum simd_immediate_check which)
14509 {
14510 machine_mode mode = GET_MODE (op);
14511 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14512 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14513 return false;
14514
14515 scalar_mode elt_mode = GET_MODE_INNER (mode);
14516 rtx base, step;
14517 unsigned int n_elts;
14518 if (GET_CODE (op) == CONST_VECTOR
14519 && CONST_VECTOR_DUPLICATE_P (op))
14520 n_elts = CONST_VECTOR_NPATTERNS (op);
14521 else if ((vec_flags & VEC_SVE_DATA)
14522 && const_vec_series_p (op, &base, &step))
14523 {
14524 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14525 if (!aarch64_sve_index_immediate_p (base)
14526 || !aarch64_sve_index_immediate_p (step))
14527 return false;
14528
14529 if (info)
14530 *info = simd_immediate_info (elt_mode, base, step);
14531 return true;
14532 }
14533 else if (GET_CODE (op) == CONST_VECTOR
14534 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14535 /* N_ELTS set above. */;
14536 else
14537 return false;
14538
14539 /* Handle PFALSE and PTRUE. */
14540 if (vec_flags & VEC_SVE_PRED)
14541 return (op == CONST0_RTX (mode)
14542 || op == CONSTM1_RTX (mode));
14543
14544 scalar_float_mode elt_float_mode;
14545 if (n_elts == 1
14546 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14547 {
14548 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14549 if (aarch64_float_const_zero_rtx_p (elt)
14550 || aarch64_float_const_representable_p (elt))
14551 {
14552 if (info)
14553 *info = simd_immediate_info (elt_float_mode, elt);
14554 return true;
14555 }
14556 }
14557
14558 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14559 if (elt_size > 8)
14560 return false;
14561
14562 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14563
14564 /* Expand the vector constant out into a byte vector, with the least
14565 significant byte of the register first. */
14566 auto_vec<unsigned char, 16> bytes;
14567 bytes.reserve (n_elts * elt_size);
14568 for (unsigned int i = 0; i < n_elts; i++)
14569 {
14570 /* The vector is provided in gcc endian-neutral fashion.
14571 For aarch64_be Advanced SIMD, it must be laid out in the vector
14572 register in reverse order. */
14573 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14574 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14575
14576 if (elt_mode != elt_int_mode)
14577 elt = gen_lowpart (elt_int_mode, elt);
14578
14579 if (!CONST_INT_P (elt))
14580 return false;
14581
14582 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14583 for (unsigned int byte = 0; byte < elt_size; byte++)
14584 {
14585 bytes.quick_push (elt_val & 0xff);
14586 elt_val >>= BITS_PER_UNIT;
14587 }
14588 }
14589
14590 /* The immediate must repeat every eight bytes. */
14591 unsigned int nbytes = bytes.length ();
14592 for (unsigned i = 8; i < nbytes; ++i)
14593 if (bytes[i] != bytes[i - 8])
14594 return false;
14595
14596 /* Get the repeating 8-byte value as an integer. No endian correction
14597 is needed here because bytes is already in lsb-first order. */
14598 unsigned HOST_WIDE_INT val64 = 0;
14599 for (unsigned int i = 0; i < 8; i++)
14600 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14601 << (i * BITS_PER_UNIT));
14602
14603 if (vec_flags & VEC_SVE_DATA)
14604 return aarch64_sve_valid_immediate (val64, info);
14605 else
14606 return aarch64_advsimd_valid_immediate (val64, info, which);
14607 }
14608
14609 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14610 has a step in the range of INDEX. Return the index expression if so,
14611 otherwise return null. */
14612 rtx
14613 aarch64_check_zero_based_sve_index_immediate (rtx x)
14614 {
14615 rtx base, step;
14616 if (const_vec_series_p (x, &base, &step)
14617 && base == const0_rtx
14618 && aarch64_sve_index_immediate_p (step))
14619 return step;
14620 return NULL_RTX;
14621 }
14622
14623 /* Check whether immediate shift constants are within range. */
14624 bool
14625 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14626 {
14627 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14628 if (left)
14629 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14630 else
14631 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14632 }
14633
14634 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14635 operation of width WIDTH at bit position POS. */
14636
14637 rtx
14638 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14639 {
14640 gcc_assert (CONST_INT_P (width));
14641 gcc_assert (CONST_INT_P (pos));
14642
14643 unsigned HOST_WIDE_INT mask
14644 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14645 return GEN_INT (mask << UINTVAL (pos));
14646 }
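
/* Illustrative example (not from the original sources): for WIDTH == 8
   and POS == 16 the result is 0x00ff0000, i.e. eight consecutive ones
   starting at bit 16.  */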
14647
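/* Return true if X is a valid immediate or symbolic operand for a move
   of mode MODE.  */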
14648 bool
14649 aarch64_mov_operand_p (rtx x, machine_mode mode)
14650 {
14651 if (GET_CODE (x) == HIGH
14652 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14653 return true;
14654
14655 if (CONST_INT_P (x))
14656 return true;
14657
14658 if (VECTOR_MODE_P (GET_MODE (x)))
14659 return aarch64_simd_valid_immediate (x, NULL);
14660
14661 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14662 return true;
14663
14664 if (aarch64_sve_cnt_immediate_p (x))
14665 return true;
14666
14667 return aarch64_classify_symbolic_expression (x)
14668 == SYMBOL_TINY_ABSOLUTE;
14669 }
14670
14671 /* Return a const_int vector of VAL. */
14672 rtx
14673 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14674 {
14675 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14676 return gen_const_vec_duplicate (mode, c);
14677 }
14678
14679 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14680
14681 bool
14682 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14683 {
14684 machine_mode vmode;
14685
14686 vmode = aarch64_simd_container_mode (mode, 64);
14687 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14688 return aarch64_simd_valid_immediate (op_v, NULL);
14689 }
14690
14691 /* Construct and return a PARALLEL RTX vector with elements numbering the
14692 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14693 the vector - from the perspective of the architecture. This does not
14694 line up with GCC's perspective on lane numbers, so we end up with
14695 different masks depending on our target endian-ness. The diagram
14696 below may help. We must draw the distinction when building masks
14697 which select one half of the vector. An instruction selecting
14698 architectural low-lanes for a big-endian target must be described using
14699 a mask selecting GCC high-lanes.
14700
14701 Big-Endian Little-Endian
14702
14703 GCC 0 1 2 3 3 2 1 0
14704 | x | x | x | x | | x | x | x | x |
14705 Architecture 3 2 1 0 3 2 1 0
14706
14707 Low Mask: { 2, 3 } { 0, 1 }
14708 High Mask: { 0, 1 } { 2, 3 }
14709
14710 MODE Is the mode of the vector and NUNITS is the number of units in it. */
14711
14712 rtx
14713 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14714 {
14715 rtvec v = rtvec_alloc (nunits / 2);
14716 int high_base = nunits / 2;
14717 int low_base = 0;
14718 int base;
14719 rtx t1;
14720 int i;
14721
14722 if (BYTES_BIG_ENDIAN)
14723 base = high ? low_base : high_base;
14724 else
14725 base = high ? high_base : low_base;
14726
14727 for (i = 0; i < nunits / 2; i++)
14728 RTVEC_ELT (v, i) = GEN_INT (base + i);
14729
14730 t1 = gen_rtx_PARALLEL (mode, v);
14731 return t1;
14732 }
14733
14734 /* Check OP for validity as a PARALLEL RTX vector with elements
14735 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14736 from the perspective of the architecture. See the diagram above
14737 aarch64_simd_vect_par_cnst_half for more details. */
14738
14739 bool
14740 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14741 bool high)
14742 {
14743 int nelts;
14744 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14745 return false;
14746
14747 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14748 HOST_WIDE_INT count_op = XVECLEN (op, 0);
14749 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14750 int i = 0;
14751
14752 if (count_op != count_ideal)
14753 return false;
14754
14755 for (i = 0; i < count_ideal; i++)
14756 {
14757 rtx elt_op = XVECEXP (op, 0, i);
14758 rtx elt_ideal = XVECEXP (ideal, 0, i);
14759
14760 if (!CONST_INT_P (elt_op)
14761 || INTVAL (elt_ideal) != INTVAL (elt_op))
14762 return false;
14763 }
14764 return true;
14765 }
14766
14767 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14768 HIGH (exclusive). */
14769 void
14770 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14771 const_tree exp)
14772 {
14773 HOST_WIDE_INT lane;
14774 gcc_assert (CONST_INT_P (operand));
14775 lane = INTVAL (operand);
14776
14777 if (lane < low || lane >= high)
14778 {
14779 if (exp)
14780 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14781 else
14782 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14783 }
14784 }
14785
14786 /* Perform endian correction on lane number N, which indexes a vector
14787 of mode MODE, and return the result as an SImode rtx. */
14788
14789 rtx
14790 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14791 {
14792 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14793 }
14794
14795 /* Return TRUE if OP is a valid vector addressing mode. */
14796
14797 bool
14798 aarch64_simd_mem_operand_p (rtx op)
14799 {
14800 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14801 || REG_P (XEXP (op, 0)));
14802 }
14803
14804 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
14805
14806 bool
14807 aarch64_sve_ld1r_operand_p (rtx op)
14808 {
14809 struct aarch64_address_info addr;
14810 scalar_mode mode;
14811
14812 return (MEM_P (op)
14813 && is_a <scalar_mode> (GET_MODE (op), &mode)
14814 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14815 && addr.type == ADDRESS_REG_IMM
14816 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14817 }
14818
14819 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14820 The conditions for STR are the same. */
14821 bool
14822 aarch64_sve_ldr_operand_p (rtx op)
14823 {
14824 struct aarch64_address_info addr;
14825
14826 return (MEM_P (op)
14827 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14828 false, ADDR_QUERY_ANY)
14829 && addr.type == ADDRESS_REG_IMM);
14830 }
14831
14832 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14833 We need to be able to access the individual pieces, so the range
14834 is different from LD[234] and ST[234]. */
14835 bool
14836 aarch64_sve_struct_memory_operand_p (rtx op)
14837 {
14838 if (!MEM_P (op))
14839 return false;
14840
14841 machine_mode mode = GET_MODE (op);
14842 struct aarch64_address_info addr;
14843 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
14844 ADDR_QUERY_ANY)
14845 || addr.type != ADDRESS_REG_IMM)
14846 return false;
14847
14848 poly_int64 first = addr.const_offset;
14849 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
14850 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
14851 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
14852 }
14853
14854 /* Emit a register copy from operand to operand, taking care not to
14855 early-clobber source registers in the process.
14856
14857 COUNT is the number of components into which the copy needs to be
14858 decomposed. */
14859 void
14860 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
14861 unsigned int count)
14862 {
14863 unsigned int i;
14864 int rdest = REGNO (operands[0]);
14865 int rsrc = REGNO (operands[1]);
14866
14867 if (!reg_overlap_mentioned_p (operands[0], operands[1])
14868 || rdest < rsrc)
14869 for (i = 0; i < count; i++)
14870 emit_move_insn (gen_rtx_REG (mode, rdest + i),
14871 gen_rtx_REG (mode, rsrc + i));
14872 else
14873 for (i = 0; i < count; i++)
14874 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
14875 gen_rtx_REG (mode, rsrc + count - i - 1));
14876 }
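
/* For example (illustrative, not from the original sources): with
   COUNT == 2 and REGNO (operands[0]) == REGNO (operands[1]) + 1 the two
   register ranges overlap and the destination is the higher-numbered
   range, so the copy is emitted from the highest register downwards;
   copying upwards first would overwrite the second source register
   before it has been read.  */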
14877
14878 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14879 one of VSTRUCT modes: OI, CI, or XI. */
14880 int
14881 aarch64_simd_attr_length_rglist (machine_mode mode)
14882 {
14883 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
14884 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
14885 }
14886
14887 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
14888 alignment of a vector to 128 bits. SVE predicates have an alignment of
14889 16 bits. */
14890 static HOST_WIDE_INT
14891 aarch64_simd_vector_alignment (const_tree type)
14892 {
14893 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14894 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14895 be set for non-predicate vectors of booleans. Modes are the most
14896 direct way we have of identifying real SVE predicate types. */
14897 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
14898 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
14899 return MIN (align, 128);
14900 }
14901
14902 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
14903 static poly_uint64
14904 aarch64_vectorize_preferred_vector_alignment (const_tree type)
14905 {
14906 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
14907 {
14908 /* If the length of the vector is fixed, try to align to that length,
14909 otherwise don't try to align at all. */
14910 HOST_WIDE_INT result;
14911 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
14912 result = TYPE_ALIGN (TREE_TYPE (type));
14913 return result;
14914 }
14915 return TYPE_ALIGN (type);
14916 }
14917
14918 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
14919 static bool
14920 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
14921 {
14922 if (is_packed)
14923 return false;
14924
14925 /* For fixed-length vectors, check that the vectorizer will aim for
14926 full-vector alignment. This isn't true for generic GCC vectors
14927 that are wider than the ABI maximum of 128 bits. */
14928 poly_uint64 preferred_alignment =
14929 aarch64_vectorize_preferred_vector_alignment (type);
14930 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
14931 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
14932 preferred_alignment))
14933 return false;
14934
14935 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
14936 return true;
14937 }
14938
14939 /* Return true if the vector misalignment factor is supported by the
14940 target. */
14941 static bool
14942 aarch64_builtin_support_vector_misalignment (machine_mode mode,
14943 const_tree type, int misalignment,
14944 bool is_packed)
14945 {
14946 if (TARGET_SIMD && STRICT_ALIGNMENT)
14947 {
14948 /* Return if movmisalign pattern is not supported for this mode. */
14949 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
14950 return false;
14951
14952 /* Misalignment factor is unknown at compile time. */
14953 if (misalignment == -1)
14954 return false;
14955 }
14956 return default_builtin_support_vector_misalignment (mode, type, misalignment,
14957 is_packed);
14958 }
14959
14960 /* If VALS is a vector constant that can be loaded into a register
14961 using DUP, generate instructions to do so and return an RTX to
14962 assign to the register. Otherwise return NULL_RTX. */
14963 static rtx
14964 aarch64_simd_dup_constant (rtx vals)
14965 {
14966 machine_mode mode = GET_MODE (vals);
14967 machine_mode inner_mode = GET_MODE_INNER (mode);
14968 rtx x;
14969
14970 if (!const_vec_duplicate_p (vals, &x))
14971 return NULL_RTX;
14972
14973 /* We can load this constant by using DUP and a constant in a
14974 single ARM register. This will be cheaper than a vector
14975 load. */
14976 x = copy_to_mode_reg (inner_mode, x);
14977 return gen_vec_duplicate (mode, x);
14978 }
14979
14980
14981 /* Generate code to load VALS, which is a PARALLEL containing only
14982 constants (for vec_init) or CONST_VECTOR, efficiently into a
14983 register. Returns an RTX to copy into the register, or NULL_RTX
14984 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
14985 static rtx
14986 aarch64_simd_make_constant (rtx vals)
14987 {
14988 machine_mode mode = GET_MODE (vals);
14989 rtx const_dup;
14990 rtx const_vec = NULL_RTX;
14991 int n_const = 0;
14992 int i;
14993
14994 if (GET_CODE (vals) == CONST_VECTOR)
14995 const_vec = vals;
14996 else if (GET_CODE (vals) == PARALLEL)
14997 {
14998 /* A CONST_VECTOR must contain only CONST_INTs and
14999 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15000 Only store valid constants in a CONST_VECTOR. */
15001 int n_elts = XVECLEN (vals, 0);
15002 for (i = 0; i < n_elts; ++i)
15003 {
15004 rtx x = XVECEXP (vals, 0, i);
15005 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15006 n_const++;
15007 }
15008 if (n_const == n_elts)
15009 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15010 }
15011 else
15012 gcc_unreachable ();
15013
15014 if (const_vec != NULL_RTX
15015 && aarch64_simd_valid_immediate (const_vec, NULL))
15016 /* Load using MOVI/MVNI. */
15017 return const_vec;
15018 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15019 /* Loaded using DUP. */
15020 return const_dup;
15021 else if (const_vec != NULL_RTX)
15022 /* Load from constant pool. We cannot take advantage of single-cycle
15023 LD1 because we need a PC-relative addressing mode. */
15024 return const_vec;
15025 else
15026 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15027 We cannot construct an initializer. */
15028 return NULL_RTX;
15029 }
15030
15031 /* Expand a vector initialisation sequence, such that TARGET is
15032 initialised to contain VALS. */
15033
15034 void
15035 aarch64_expand_vector_init (rtx target, rtx vals)
15036 {
15037 machine_mode mode = GET_MODE (target);
15038 scalar_mode inner_mode = GET_MODE_INNER (mode);
15039 /* The number of vector elements. */
15040 int n_elts = XVECLEN (vals, 0);
15041 /* The number of vector elements which are not constant. */
15042 int n_var = 0;
15043 rtx any_const = NULL_RTX;
15044 /* The first element of vals. */
15045 rtx v0 = XVECEXP (vals, 0, 0);
15046 bool all_same = true;
15047
15048 /* Count the number of variable elements to initialise. */
15049 for (int i = 0; i < n_elts; ++i)
15050 {
15051 rtx x = XVECEXP (vals, 0, i);
15052 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15053 ++n_var;
15054 else
15055 any_const = x;
15056
15057 all_same &= rtx_equal_p (x, v0);
15058 }
15059
15060 /* No variable elements; hand off to aarch64_simd_make_constant, which knows
15061 how best to handle this. */
15062 if (n_var == 0)
15063 {
15064 rtx constant = aarch64_simd_make_constant (vals);
15065 if (constant != NULL_RTX)
15066 {
15067 emit_move_insn (target, constant);
15068 return;
15069 }
15070 }
15071
15072 /* Splat a single non-constant element if we can. */
15073 if (all_same)
15074 {
15075 rtx x = copy_to_mode_reg (inner_mode, v0);
15076 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15077 return;
15078 }
15079
15080 enum insn_code icode = optab_handler (vec_set_optab, mode);
15081 gcc_assert (icode != CODE_FOR_nothing);
15082
15083 /* If there are only variable elements, try to optimize
15084 the insertion using dup for the most common element
15085 followed by insertions. */
15086
15087 /* The algorithm will fill matches[*][0] with the earliest matching element,
15088 and matches[X][1] with the count of duplicate elements (if X is the
15089 earliest element which has duplicates). */
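  /* For example (illustrative): for VALS == {a, b, a, a}, the loop below
     leaves matches[0][1] == 3 and matches[1][1] == 1, so element 0 is
     chosen as the most common element, broadcast with DUP, and only
     element 1 is inserted afterwards.  */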
15090
15091 if (n_var == n_elts && n_elts <= 16)
15092 {
15093 int matches[16][2] = {0};
15094 for (int i = 0; i < n_elts; i++)
15095 {
15096 for (int j = 0; j <= i; j++)
15097 {
15098 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15099 {
15100 matches[i][0] = j;
15101 matches[j][1]++;
15102 break;
15103 }
15104 }
15105 }
15106 int maxelement = 0;
15107 int maxv = 0;
15108 for (int i = 0; i < n_elts; i++)
15109 if (matches[i][1] > maxv)
15110 {
15111 maxelement = i;
15112 maxv = matches[i][1];
15113 }
15114
15115 /* Create a duplicate of the most common element, unless all elements
15116 are equally useless to us, in which case just immediately set the
15117 vector register using the first element. */
15118
15119 if (maxv == 1)
15120 {
15121 /* For vectors of two 64-bit elements, we can do even better. */
15122 if (n_elts == 2
15123 && (inner_mode == E_DImode
15124 || inner_mode == E_DFmode))
15125
15126 {
15127 rtx x0 = XVECEXP (vals, 0, 0);
15128 rtx x1 = XVECEXP (vals, 0, 1);
15129 /* Combine can pick up this case, but handling it directly
15130 here leaves clearer RTL.
15131
15132 This is load_pair_lanes<mode>, and also gives us a clean-up
15133 for store_pair_lanes<mode>. */
15134 if (memory_operand (x0, inner_mode)
15135 && memory_operand (x1, inner_mode)
15136 && !STRICT_ALIGNMENT
15137 && rtx_equal_p (XEXP (x1, 0),
15138 plus_constant (Pmode,
15139 XEXP (x0, 0),
15140 GET_MODE_SIZE (inner_mode))))
15141 {
15142 rtx t;
15143 if (inner_mode == DFmode)
15144 t = gen_load_pair_lanesdf (target, x0, x1);
15145 else
15146 t = gen_load_pair_lanesdi (target, x0, x1);
15147 emit_insn (t);
15148 return;
15149 }
15150 }
15151 /* The subreg-move sequence below will move into lane zero of the
15152 vector register. For big-endian we want that position to hold
15153 the last element of VALS. */
15154 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15155 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15156 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15157 }
15158 else
15159 {
15160 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15161 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15162 }
15163
15164 /* Insert the rest. */
15165 for (int i = 0; i < n_elts; i++)
15166 {
15167 rtx x = XVECEXP (vals, 0, i);
15168 if (matches[i][0] == maxelement)
15169 continue;
15170 x = copy_to_mode_reg (inner_mode, x);
15171 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15172 }
15173 return;
15174 }
15175
15176 /* Initialise a vector which is part-variable. We want to first try
15177 to build those lanes which are constant in the most efficient way we
15178 can. */
15179 if (n_var != n_elts)
15180 {
15181 rtx copy = copy_rtx (vals);
15182
15183 /* Load constant part of vector. We really don't care what goes into the
15184 parts we will overwrite, but we're more likely to be able to load the
15185 constant efficiently if it has fewer, larger, repeating parts
15186 (see aarch64_simd_valid_immediate). */
15187 for (int i = 0; i < n_elts; i++)
15188 {
15189 rtx x = XVECEXP (vals, 0, i);
15190 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15191 continue;
15192 rtx subst = any_const;
15193 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15194 {
15195 /* Look in the copied vector, as more elements are const. */
15196 rtx test = XVECEXP (copy, 0, i ^ bit);
15197 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15198 {
15199 subst = test;
15200 break;
15201 }
15202 }
15203 XVECEXP (copy, 0, i) = subst;
15204 }
15205 aarch64_expand_vector_init (target, copy);
15206 }
15207
15208 /* Insert the variable lanes directly. */
15209 for (int i = 0; i < n_elts; i++)
15210 {
15211 rtx x = XVECEXP (vals, 0, i);
15212 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15213 continue;
15214 x = copy_to_mode_reg (inner_mode, x);
15215 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15216 }
15217 }
15218
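/* Return the mask to apply to shift amounts of mode MODE:
   GET_MODE_UNIT_BITSIZE (MODE) - 1 when scalar shift counts are
   truncated, or zero for vector modes and when SHIFT_COUNT_TRUNCATED
   is false.  */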
15219 static unsigned HOST_WIDE_INT
15220 aarch64_shift_truncation_mask (machine_mode mode)
15221 {
15222 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15223 return 0;
15224 return GET_MODE_UNIT_BITSIZE (mode) - 1;
15225 }
15226
15227 /* Select a format to encode pointers in exception handling data. */
15228 int
15229 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15230 {
15231 int type;
15232 switch (aarch64_cmodel)
15233 {
15234 case AARCH64_CMODEL_TINY:
15235 case AARCH64_CMODEL_TINY_PIC:
15236 case AARCH64_CMODEL_SMALL:
15237 case AARCH64_CMODEL_SMALL_PIC:
15238 case AARCH64_CMODEL_SMALL_SPIC:
15239 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15240 for everything. */
15241 type = DW_EH_PE_sdata4;
15242 break;
15243 default:
15244 /* No assumptions here. 8-byte relocs required. */
15245 type = DW_EH_PE_sdata8;
15246 break;
15247 }
15248 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15249 }
15250
15251 /* The last .arch and .tune assembly strings that we printed. */
15252 static std::string aarch64_last_printed_arch_string;
15253 static std::string aarch64_last_printed_tune_string;
15254
15255 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15256 by the function fndecl. */
15257
15258 void
15259 aarch64_declare_function_name (FILE *stream, const char* name,
15260 tree fndecl)
15261 {
15262 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15263
15264 struct cl_target_option *targ_options;
15265 if (target_parts)
15266 targ_options = TREE_TARGET_OPTION (target_parts);
15267 else
15268 targ_options = TREE_TARGET_OPTION (target_option_current_node);
15269 gcc_assert (targ_options);
15270
15271 const struct processor *this_arch
15272 = aarch64_get_arch (targ_options->x_explicit_arch);
15273
15274 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
15275 std::string extension
15276 = aarch64_get_extension_string_for_isa_flags (isa_flags,
15277 this_arch->flags);
15278 /* Only update the assembler .arch string if it is distinct from the last
15279 such string we printed. */
15280 std::string to_print = this_arch->name + extension;
15281 if (to_print != aarch64_last_printed_arch_string)
15282 {
15283 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15284 aarch64_last_printed_arch_string = to_print;
15285 }
15286
15287 /* Print the cpu name we're tuning for in the comments; it might be
15288 useful to readers of the generated asm. Do it only when it changes
15289 from function to function and verbose assembly is requested. */
15290 const struct processor *this_tune
15291 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15292
15293 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15294 {
15295 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15296 this_tune->name);
15297 aarch64_last_printed_tune_string = this_tune->name;
15298 }
15299
15300 /* Don't forget the type directive for ELF. */
15301 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15302 ASM_OUTPUT_LABEL (stream, name);
15303 }
15304
15305 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15306
15307 static void
15308 aarch64_start_file (void)
15309 {
15310 struct cl_target_option *default_options
15311 = TREE_TARGET_OPTION (target_option_default_node);
15312
15313 const struct processor *default_arch
15314 = aarch64_get_arch (default_options->x_explicit_arch);
15315 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
15316 std::string extension
15317 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15318 default_arch->flags);
15319
15320 aarch64_last_printed_arch_string = default_arch->name + extension;
15321 aarch64_last_printed_tune_string = "";
15322 asm_fprintf (asm_out_file, "\t.arch %s\n",
15323 aarch64_last_printed_arch_string.c_str ());
15324
15325 default_file_start ();
15326 }
15327
15328 /* Emit load exclusive. */
15329
15330 static void
15331 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15332 rtx mem, rtx model_rtx)
15333 {
15334 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15335 }
15336
15337 /* Emit store exclusive. */
15338
15339 static void
15340 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15341 rtx mem, rtx rval, rtx model_rtx)
15342 {
15343 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
15344 }
15345
15346 /* Mark the previous jump instruction as unlikely. */
15347
15348 static void
15349 aarch64_emit_unlikely_jump (rtx insn)
15350 {
15351 rtx_insn *jump = emit_jump_insn (insn);
15352 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15353 }
15354
15355 /* Expand a compare and swap pattern. */
15356
15357 void
15358 aarch64_expand_compare_and_swap (rtx operands[])
15359 {
15360 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15361 machine_mode mode, r_mode;
15362
15363 bval = operands[0];
15364 rval = operands[1];
15365 mem = operands[2];
15366 oldval = operands[3];
15367 newval = operands[4];
15368 is_weak = operands[5];
15369 mod_s = operands[6];
15370 mod_f = operands[7];
15371 mode = GET_MODE (mem);
15372
15373 /* Normally the succ memory model must be stronger than fail, but in the
15374 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15375 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15376 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15377 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15378 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
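  /* Illustrative only: a call such as
       __atomic_compare_exchange_n (p, &expected, desired, 0,
                                    __ATOMIC_RELEASE, __ATOMIC_ACQUIRE)
     is one case where the success model is promoted to ACQ_REL here, so
     that the acquire semantics of the failure ordering are preserved.  */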
15379
15380 r_mode = mode;
15381 if (mode == QImode || mode == HImode)
15382 {
15383 r_mode = SImode;
15384 rval = gen_reg_rtx (r_mode);
15385 }
15386
15387 if (TARGET_LSE)
15388 {
15389 /* The CAS insn requires oldval and rval overlap, but we need to
15390 have a copy of oldval saved across the operation to tell if
15391 the operation is successful. */
15392 if (reg_overlap_mentioned_p (rval, oldval))
15393 rval = copy_to_mode_reg (r_mode, oldval);
15394 else
15395 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15396
15397 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15398 newval, mod_s));
15399 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15400 }
15401 else
15402 {
15403 /* The oldval predicate varies by mode. Test it and force to reg. */
15404 insn_code code = code_for_aarch64_compare_and_swap (mode);
15405 if (!insn_data[code].operand[2].predicate (oldval, mode))
15406 oldval = force_reg (mode, oldval);
15407
15408 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15409 is_weak, mod_s, mod_f));
15410 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15411 }
15412
15413 if (r_mode != mode)
15414 rval = gen_lowpart (mode, rval);
15415 emit_move_insn (operands[1], rval);
15416
15417 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15418 emit_insn (gen_rtx_SET (bval, x));
15419 }
15420
15421 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15422 sequence implementing an atomic operation. */
15423
15424 static void
15425 aarch64_emit_post_barrier (enum memmodel model)
15426 {
15427 const enum memmodel base_model = memmodel_base (model);
15428
15429 if (is_mm_sync (model)
15430 && (base_model == MEMMODEL_ACQUIRE
15431 || base_model == MEMMODEL_ACQ_REL
15432 || base_model == MEMMODEL_SEQ_CST))
15433 {
15434 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15435 }
15436 }
15437
15438 /* Split a compare and swap pattern. */
15439
15440 void
15441 aarch64_split_compare_and_swap (rtx operands[])
15442 {
15443 rtx rval, mem, oldval, newval, scratch;
15444 machine_mode mode;
15445 bool is_weak;
15446 rtx_code_label *label1, *label2;
15447 rtx x, cond;
15448 enum memmodel model;
15449 rtx model_rtx;
15450
15451 rval = operands[0];
15452 mem = operands[1];
15453 oldval = operands[2];
15454 newval = operands[3];
15455 is_weak = (operands[4] != const0_rtx);
15456 model_rtx = operands[5];
15457 scratch = operands[7];
15458 mode = GET_MODE (mem);
15459 model = memmodel_from_int (INTVAL (model_rtx));
15460
15461 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15462 loop:
15463 .label1:
15464 LD[A]XR rval, [mem]
15465 CBNZ rval, .label2
15466 ST[L]XR scratch, newval, [mem]
15467 CBNZ scratch, .label1
15468 .label2:
15469 CMP rval, 0. */
15470 bool strong_zero_p = !is_weak && oldval == const0_rtx;
15471
15472 label1 = NULL;
15473 if (!is_weak)
15474 {
15475 label1 = gen_label_rtx ();
15476 emit_label (label1);
15477 }
15478 label2 = gen_label_rtx ();
15479
15480 /* The initial load can be relaxed for a __sync operation since a final
15481 barrier will be emitted to stop code hoisting. */
15482 if (is_mm_sync (model))
15483 aarch64_emit_load_exclusive (mode, rval, mem,
15484 GEN_INT (MEMMODEL_RELAXED));
15485 else
15486 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15487
15488 if (strong_zero_p)
15489 {
15490 if (aarch64_track_speculation)
15491 {
15492 /* Emit an explicit compare instruction, so that we can correctly
15493 track the condition codes. */
15494 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
15495 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15496 }
15497 else
15498 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15499
15500 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15501 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15502 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15503 }
15504 else
15505 {
15506 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15507 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15508 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15509 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15510 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15511 }
15512
15513 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
15514
15515 if (!is_weak)
15516 {
15517 if (aarch64_track_speculation)
15518 {
15519 /* Emit an explicit compare instruction, so that we can correctly
15520 track the condition codes. */
15521 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
15522 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15523 }
15524 else
15525 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
15526
15527 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15528 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
15529 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15530 }
15531 else
15532 {
15533 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15534 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
15535 emit_insn (gen_rtx_SET (cond, x));
15536 }
15537
15538 emit_label (label2);
15539 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
15540 to set the condition flags. If this is not used it will be removed by
15541 later passes. */
15542 if (strong_zero_p)
15543 {
15544 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15545 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
15546 emit_insn (gen_rtx_SET (cond, x));
15547 }
15548 /* Emit any final barrier needed for a __sync operation. */
15549 if (is_mm_sync (model))
15550 aarch64_emit_post_barrier (model);
15551 }
15552
15553 /* Split an atomic operation. */
15554
15555 void
15556 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
15557 rtx value, rtx model_rtx, rtx cond)
15558 {
15559 machine_mode mode = GET_MODE (mem);
15560 machine_mode wmode = (mode == DImode ? DImode : SImode);
15561 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
15562 const bool is_sync = is_mm_sync (model);
15563 rtx_code_label *label;
15564 rtx x;
15565
15566 /* Split the atomic operation into a sequence. */
15567 label = gen_label_rtx ();
15568 emit_label (label);
15569
15570 if (new_out)
15571 new_out = gen_lowpart (wmode, new_out);
15572 if (old_out)
15573 old_out = gen_lowpart (wmode, old_out);
15574 else
15575 old_out = new_out;
15576 value = simplify_gen_subreg (wmode, value, mode, 0);
15577
15578 /* The initial load can be relaxed for a __sync operation since a final
15579 barrier will be emitted to stop code hoisting. */
15580 if (is_sync)
15581 aarch64_emit_load_exclusive (mode, old_out, mem,
15582 GEN_INT (MEMMODEL_RELAXED));
15583 else
15584 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
15585
15586 switch (code)
15587 {
15588 case SET:
15589 new_out = value;
15590 break;
15591
15592 case NOT:
15593 x = gen_rtx_AND (wmode, old_out, value);
15594 emit_insn (gen_rtx_SET (new_out, x));
15595 x = gen_rtx_NOT (wmode, new_out);
15596 emit_insn (gen_rtx_SET (new_out, x));
15597 break;
15598
15599 case MINUS:
15600 if (CONST_INT_P (value))
15601 {
15602 value = GEN_INT (-INTVAL (value));
15603 code = PLUS;
15604 }
15605 /* Fall through. */
15606
15607 default:
15608 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
15609 emit_insn (gen_rtx_SET (new_out, x));
15610 break;
15611 }
15612
15613 aarch64_emit_store_exclusive (mode, cond, mem,
15614 gen_lowpart (mode, new_out), model_rtx);
15615
15616 if (aarch64_track_speculation)
15617 {
15618 /* Emit an explicit compare instruction, so that we can correctly
15619 track the condition codes. */
15620 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
15621 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15622 }
15623 else
15624 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15625
15626 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15627 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
15628 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15629
15630 /* Emit any final barrier needed for a __sync operation. */
15631 if (is_sync)
15632 aarch64_emit_post_barrier (model);
15633 }
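
/* Illustrative only (register numbers and the relaxed memory model are
   arbitrary): for CODE == PLUS the split sequence above corresponds
   roughly to

	.loop:
	ldxr	x0, [x2]
	add	x1, x0, x3
	stxr	w4, x1, [x2]
	cbnz	w4, .loop

   with LDAXR/STLXR variants used instead when the memory model requires
   acquire or release semantics.  */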
15634
15635 static void
15636 aarch64_init_libfuncs (void)
15637 {
15638 /* Half-precision float operations. The compiler handles all operations
15639 with NULL libfuncs by converting to SFmode. */
15640
15641 /* Conversions. */
15642 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
15643 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
15644
15645 /* Arithmetic. */
15646 set_optab_libfunc (add_optab, HFmode, NULL);
15647 set_optab_libfunc (sdiv_optab, HFmode, NULL);
15648 set_optab_libfunc (smul_optab, HFmode, NULL);
15649 set_optab_libfunc (neg_optab, HFmode, NULL);
15650 set_optab_libfunc (sub_optab, HFmode, NULL);
15651
15652 /* Comparisons. */
15653 set_optab_libfunc (eq_optab, HFmode, NULL);
15654 set_optab_libfunc (ne_optab, HFmode, NULL);
15655 set_optab_libfunc (lt_optab, HFmode, NULL);
15656 set_optab_libfunc (le_optab, HFmode, NULL);
15657 set_optab_libfunc (ge_optab, HFmode, NULL);
15658 set_optab_libfunc (gt_optab, HFmode, NULL);
15659 set_optab_libfunc (unord_optab, HFmode, NULL);
15660 }
15661
15662 /* Target hook for c_mode_for_suffix. */
15663 static machine_mode
15664 aarch64_c_mode_for_suffix (char suffix)
15665 {
15666 if (suffix == 'q')
15667 return TFmode;
15668
15669 return VOIDmode;
15670 }
15671
15672 /* We can only represent floating point constants which will fit in
15673 "quarter-precision" values. These values are characterised by
15674 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
15675 by:
15676
15677 (-1)^s * (n/16) * 2^r
15678
15679 Where:
15680 's' is the sign bit.
15681 'n' is an integer in the range 16 <= n <= 31.
15682 'r' is an integer in the range -3 <= r <= 4. */
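
/* Worked examples (illustrative): 1.0 = (16/16) * 2^0, the smallest
   positive value is 0.125 = (16/16) * 2^-3 and the largest is
   31.0 = (31/16) * 2^4.  */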
15683
15684 /* Return true iff X can be represented by a quarter-precision
15685 floating point immediate operand. Note, we cannot represent 0.0. */
15686 bool
15687 aarch64_float_const_representable_p (rtx x)
15688 {
15689 /* This represents our current view of how many bits
15690 make up the mantissa. */
15691 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
15692 int exponent;
15693 unsigned HOST_WIDE_INT mantissa, mask;
15694 REAL_VALUE_TYPE r, m;
15695 bool fail;
15696
15697 if (!CONST_DOUBLE_P (x))
15698 return false;
15699
15700 if (GET_MODE (x) == VOIDmode
15701 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
15702 return false;
15703
15704 r = *CONST_DOUBLE_REAL_VALUE (x);
15705
15706 /* We cannot represent infinities, NaNs or +/-zero. We won't
15707 know if we have +zero until we analyse the mantissa, but we
15708 can reject the other invalid values. */
15709 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
15710 || REAL_VALUE_MINUS_ZERO (r))
15711 return false;
15712
15713 /* Extract exponent. */
15714 r = real_value_abs (&r);
15715 exponent = REAL_EXP (&r);
15716
15717 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
15718 highest (sign) bit, with a fixed binary point at bit point_pos.
15719 The low HOST_WIDE_INT of W holds the low part of the mantissa, the high HOST_WIDE_INT the high part.
15720 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
15721 bits for the mantissa, this can fail (low bits will be lost). */
15722 real_ldexp (&m, &r, point_pos - exponent);
15723 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
15724
15725 /* If the low part of the mantissa has bits set we cannot represent
15726 the value. */
15727 if (w.ulow () != 0)
15728 return false;
15729 /* We have rejected the lower HOST_WIDE_INT, so update our
15730 understanding of how many bits lie in the mantissa and
15731 look only at the high HOST_WIDE_INT. */
15732 mantissa = w.elt (1);
15733 point_pos -= HOST_BITS_PER_WIDE_INT;
15734
15735 /* We can only represent values with a mantissa of the form 1.xxxx. */
15736 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
15737 if ((mantissa & mask) != 0)
15738 return false;
15739
15740 /* Having filtered unrepresentable values, we may now remove all
15741 but the highest 5 bits. */
15742 mantissa >>= point_pos - 5;
15743
15744 /* We cannot represent the value 0.0, so reject it. This is handled
15745 elsewhere. */
15746 if (mantissa == 0)
15747 return false;
15748
15749 /* Then, as bit 4 is always set, we can mask it off, leaving
15750 the mantissa in the range [0, 15]. */
15751 mantissa &= ~(1 << 4);
15752 gcc_assert (mantissa <= 15);
15753
15754 /* GCC internally does not use IEEE754-like encoding (where normalized
15755 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
15756 Our mantissa values are shifted 4 places to the left relative to
15757 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
15758 by 5 places to correct for GCC's representation. */
15759 exponent = 5 - exponent;
15760
15761 return (exponent >= 0 && exponent <= 7);
15762 }
15763
15764 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
15765 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
15766 output MOVI/MVNI, ORR or BIC immediate. */
15767 char*
15768 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
15769 enum simd_immediate_check which)
15770 {
15771 bool is_valid;
15772 static char templ[40];
15773 const char *mnemonic;
15774 const char *shift_op;
15775 unsigned int lane_count = 0;
15776 char element_char;
15777
15778 struct simd_immediate_info info;
15779
15780 /* This will return true to show const_vector is legal for use as either
15781 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
15782 It will also update INFO to show how the immediate should be generated.
15783 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
15784 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
15785 gcc_assert (is_valid);
15786
15787 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15788 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
15789
15790 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15791 {
15792 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15793 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15794 move immediate path. */
15795 if (aarch64_float_const_zero_rtx_p (info.value))
15796 info.value = GEN_INT (0);
15797 else
15798 {
15799 const unsigned int buf_size = 20;
15800 char float_buf[buf_size] = {'\0'};
15801 real_to_decimal_for_mode (float_buf,
15802 CONST_DOUBLE_REAL_VALUE (info.value),
15803 buf_size, buf_size, 1, info.elt_mode);
15804
15805 if (lane_count == 1)
15806 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15807 else
15808 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15809 lane_count, element_char, float_buf);
15810 return templ;
15811 }
15812 }
15813
15814 gcc_assert (CONST_INT_P (info.value));
15815
15816 if (which == AARCH64_CHECK_MOV)
15817 {
15818 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15819 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15820 if (lane_count == 1)
15821 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15822 mnemonic, UINTVAL (info.value));
15823 else if (info.shift)
15824 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15825 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15826 element_char, UINTVAL (info.value), shift_op, info.shift);
15827 else
15828 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15829 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15830 element_char, UINTVAL (info.value));
15831 }
15832 else
15833 {
15834 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15835 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15836 if (info.shift)
15837 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15838 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15839 element_char, UINTVAL (info.value), "lsl", info.shift);
15840 else
15841 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15842 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15843 element_char, UINTVAL (info.value));
15844 }
15845 return templ;
15846 }
15847
15848 char*
15849 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15850 {
15851
15852 /* If a floating point number was passed and we desire to use it in an
15853 integer mode, do the conversion to integer. */
15854 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15855 {
15856 unsigned HOST_WIDE_INT ival;
15857 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15858 gcc_unreachable ();
15859 immediate = gen_int_mode (ival, mode);
15860 }
15861
15862 machine_mode vmode;
15863 /* Use a 64-bit mode for everything except DI/DF mode, where we use
15864 a 128-bit vector mode. */
15865 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15866
15867 vmode = aarch64_simd_container_mode (mode, width);
15868 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15869 return aarch64_output_simd_mov_immediate (v_op, width);
15870 }
15871
15872 /* Return the output string to use for moving immediate CONST_VECTOR
15873 into an SVE register. */
15874
15875 char *
15876 aarch64_output_sve_mov_immediate (rtx const_vector)
15877 {
15878 static char templ[40];
15879 struct simd_immediate_info info;
15880 char element_char;
15881
15882 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15883 gcc_assert (is_valid);
15884
15885 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15886
15887 if (info.step)
15888 {
15889 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15890 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15891 element_char, INTVAL (info.value), INTVAL (info.step));
15892 return templ;
15893 }
15894
15895 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15896 {
15897 if (aarch64_float_const_zero_rtx_p (info.value))
15898 info.value = GEN_INT (0);
15899 else
15900 {
15901 const int buf_size = 20;
15902 char float_buf[buf_size] = {};
15903 real_to_decimal_for_mode (float_buf,
15904 CONST_DOUBLE_REAL_VALUE (info.value),
15905 buf_size, buf_size, 1, info.elt_mode);
15906
15907 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15908 element_char, float_buf);
15909 return templ;
15910 }
15911 }
15912
15913 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15914 element_char, INTVAL (info.value));
15915 return templ;
15916 }
15917
15918 /* Return the asm format for a PTRUE instruction whose destination has
15919 mode MODE. SUFFIX is the element size suffix. */
15920
15921 char *
15922 aarch64_output_ptrue (machine_mode mode, char suffix)
15923 {
15924 unsigned int nunits;
15925 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15926 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15927 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15928 else
15929 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15930 return buf;
15931 }
15932
15933 /* Split operands into moves from op[1] + op[2] into op[0]. */
15934
15935 void
15936 aarch64_split_combinev16qi (rtx operands[3])
15937 {
15938 unsigned int dest = REGNO (operands[0]);
15939 unsigned int src1 = REGNO (operands[1]);
15940 unsigned int src2 = REGNO (operands[2]);
15941 machine_mode halfmode = GET_MODE (operands[1]);
15942 unsigned int halfregs = REG_NREGS (operands[1]);
15943 rtx destlo, desthi;
15944
15945 gcc_assert (halfmode == V16QImode);
15946
15947 if (src1 == dest && src2 == dest + halfregs)
15948 {
15949 /* No-op move. Can't split to nothing; emit something. */
15950 emit_note (NOTE_INSN_DELETED);
15951 return;
15952 }
15953
15954 /* Preserve register attributes for variable tracking. */
15955 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15956 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15957 GET_MODE_SIZE (halfmode));
15958
15959 /* Special case of reversed high/low parts. */
15960 if (reg_overlap_mentioned_p (operands[2], destlo)
15961 && reg_overlap_mentioned_p (operands[1], desthi))
15962 {
15963 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15964 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15965 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15966 }
15967 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15968 {
15969 /* Try to avoid unnecessary moves if part of the result
15970 is in the right place already. */
15971 if (src1 != dest)
15972 emit_move_insn (destlo, operands[1]);
15973 if (src2 != dest + halfregs)
15974 emit_move_insn (desthi, operands[2]);
15975 }
15976 else
15977 {
15978 if (src2 != dest + halfregs)
15979 emit_move_insn (desthi, operands[2]);
15980 if (src1 != dest)
15981 emit_move_insn (destlo, operands[1]);
15982 }
15983 }
15984
15985 /* vec_perm support. */
15986
15987 struct expand_vec_perm_d
15988 {
15989 rtx target, op0, op1;
15990 vec_perm_indices perm;
15991 machine_mode vmode;
15992 unsigned int vec_flags;
15993 bool one_vector_p;
15994 bool testing_p;
15995 };
15996
15997 /* Generate a variable permutation. */
15998
15999 static void
16000 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
16001 {
16002 machine_mode vmode = GET_MODE (target);
16003 bool one_vector_p = rtx_equal_p (op0, op1);
16004
16005 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
16006 gcc_checking_assert (GET_MODE (op0) == vmode);
16007 gcc_checking_assert (GET_MODE (op1) == vmode);
16008 gcc_checking_assert (GET_MODE (sel) == vmode);
16009 gcc_checking_assert (TARGET_SIMD);
16010
16011 if (one_vector_p)
16012 {
16013 if (vmode == V8QImode)
16014 {
16015 /* Expand the argument to a V16QI mode by duplicating it. */
16016 rtx pair = gen_reg_rtx (V16QImode);
16017 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
16018 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16019 }
16020 else
16021 {
16022 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
16023 }
16024 }
16025 else
16026 {
16027 rtx pair;
16028
16029 if (vmode == V8QImode)
16030 {
16031 pair = gen_reg_rtx (V16QImode);
16032 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
16033 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16034 }
16035 else
16036 {
16037 pair = gen_reg_rtx (OImode);
16038 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
16039 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
16040 }
16041 }
16042 }
16043
16044 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16045 NELT is the number of elements in the vector. */
16046
16047 void
16048 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16049 unsigned int nelt)
16050 {
16051 machine_mode vmode = GET_MODE (target);
16052 bool one_vector_p = rtx_equal_p (op0, op1);
16053 rtx mask;
16054
16055 /* The TBL instruction does not use a modulo index, so we must take care
16056 of that ourselves. */
16057 mask = aarch64_simd_gen_const_vector_dup (vmode,
16058 one_vector_p ? nelt - 1 : 2 * nelt - 1);
16059 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16060
16061 /* For big-endian, we also need to reverse the index within the vector
16062 (but not which vector). */
16063 if (BYTES_BIG_ENDIAN)
16064 {
16065 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16066 if (!one_vector_p)
16067 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16068 sel = expand_simple_binop (vmode, XOR, sel, mask,
16069 NULL, 0, OPTAB_LIB_WIDEN);
16070 }
16071 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
16072 }
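
/* Illustrative scalar model of the selector adjustment above (hypothetical
   helper; the index arithmetic is what matters, not the element type).
   Wrap each index into range with AND, since TBL has no modulo semantics,
   then flip the index within its vector with XOR when the target is
   big-endian.  */
static void
model_tbl_selector (unsigned char *sel, unsigned int nelt,
                    bool one_vector_p, bool big_endian_p)
{
  unsigned int wrap = one_vector_p ? nelt - 1 : 2 * nelt - 1;
  for (unsigned int i = 0; i < nelt; i++)
    {
      sel[i] &= wrap;           /* Take the index modulo the input size.  */
      if (big_endian_p)
        sel[i] ^= nelt - 1;     /* Reverse the index within a vector.  */
    }
}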
16073
16074 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
16075
16076 static void
16077 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
16078 {
16079 emit_insn (gen_rtx_SET (target,
16080 gen_rtx_UNSPEC (GET_MODE (target),
16081 gen_rtvec (2, op0, op1), code)));
16082 }
16083
16084 /* Expand an SVE vec_perm with the given operands. */
16085
16086 void
16087 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
16088 {
16089 machine_mode data_mode = GET_MODE (target);
16090 machine_mode sel_mode = GET_MODE (sel);
16091 /* Enforced by the pattern condition. */
16092 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
16093
16094 /* Note: vec_perm indices are supposed to wrap when they go beyond the
16095 size of the two value vectors, i.e. the upper bits of the indices
16096 are effectively ignored. SVE TBL instead produces 0 for any
16097 out-of-range indices, so we need to modulo all the vec_perm indices
16098 to ensure they are all in range. */
16099 rtx sel_reg = force_reg (sel_mode, sel);
16100
16101 /* Check if SEL only references the first values vector. */
16102 if (GET_CODE (sel) == CONST_VECTOR
16103 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
16104 {
16105 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
16106 return;
16107 }
16108
16109 /* Check if the two values vectors are the same. */
16110 if (rtx_equal_p (op0, op1))
16111 {
16112 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
16113 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16114 NULL, 0, OPTAB_DIRECT);
16115 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
16116 return;
16117 }
16118
16119 /* Run TBL on each value vector and combine the results. */
16120
16121 rtx res0 = gen_reg_rtx (data_mode);
16122 rtx res1 = gen_reg_rtx (data_mode);
16123 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
16124 if (GET_CODE (sel) != CONST_VECTOR
16125 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
16126 {
16127 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
16128 2 * nunits - 1);
16129 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16130 NULL, 0, OPTAB_DIRECT);
16131 }
16132 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
16133 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
16134 NULL, 0, OPTAB_DIRECT);
16135 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
16136 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
16137 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
16138 else
16139 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
16140 }
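
/* Scalar model of the general two-vector case above (hypothetical helper,
   unsigned elements assumed).  SVE TBL yields 0 for out-of-range indices,
   so a TBL of OP0 with the wrapped index plus a TBL of OP1 with the index
   shifted down by NUNITS leaves exactly one in-range lookup per lane, and
   the two results can simply be ORed together.  */
static void
model_sve_two_vector_tbl (const unsigned int *op0, const unsigned int *op1,
                          const unsigned int *sel, unsigned int *out,
                          unsigned int nunits)
{
  for (unsigned int i = 0; i < nunits; i++)
    {
      unsigned int idx = sel[i] & (2 * nunits - 1);
      unsigned int idx1 = idx - nunits; /* Wraps out of range if idx < nunits.  */
      unsigned int r0 = idx < nunits ? op0[idx] : 0;
      unsigned int r1 = idx1 < nunits ? op1[idx1] : 0;
      out[i] = r0 | r1;
    }
}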
16141
16142 /* Recognize patterns suitable for the TRN instructions. */
16143 static bool
16144 aarch64_evpc_trn (struct expand_vec_perm_d *d)
16145 {
16146 HOST_WIDE_INT odd;
16147 poly_uint64 nelt = d->perm.length ();
16148 rtx out, in0, in1, x;
16149 machine_mode vmode = d->vmode;
16150
16151 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16152 return false;
16153
16154 /* Note that these are little-endian tests.
16155 We correct for big-endian later. */
16156 if (!d->perm[0].is_constant (&odd)
16157 || (odd != 0 && odd != 1)
16158 || !d->perm.series_p (0, 2, odd, 2)
16159 || !d->perm.series_p (1, 2, nelt + odd, 2))
16160 return false;
16161
16162 /* Success! */
16163 if (d->testing_p)
16164 return true;
16165
16166 in0 = d->op0;
16167 in1 = d->op1;
16168 /* We don't need a big-endian lane correction for SVE; see the comment
16169 at the head of aarch64-sve.md for details. */
16170 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16171 {
16172 x = in0, in0 = in1, in1 = x;
16173 odd = !odd;
16174 }
16175 out = d->target;
16176
16177 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16178 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16179 return true;
16180 }
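
/* Hypothetical helper showing the index series the matcher above accepts:
   for a permutation of length NELT over the concatenation of two inputs,
   TRN1 (ODD == 0) selects 0, NELT, 2, NELT + 2, ... and TRN2 (ODD == 1)
   selects 1, NELT + 1, 3, NELT + 3, ...  */
static void
build_trn_indices (unsigned int *sel, unsigned int nelt, unsigned int odd)
{
  for (unsigned int i = 0; i < nelt; i++)
    sel[i] = (i & ~1U) + odd + ((i & 1) ? nelt : 0);
}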
16181
16182 /* Recognize patterns suitable for the UZP instructions. */
16183 static bool
16184 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16185 {
16186 HOST_WIDE_INT odd;
16187 rtx out, in0, in1, x;
16188 machine_mode vmode = d->vmode;
16189
16190 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16191 return false;
16192
16193 /* Note that these are little-endian tests.
16194 We correct for big-endian later. */
16195 if (!d->perm[0].is_constant (&odd)
16196 || (odd != 0 && odd != 1)
16197 || !d->perm.series_p (0, 1, odd, 2))
16198 return false;
16199
16200 /* Success! */
16201 if (d->testing_p)
16202 return true;
16203
16204 in0 = d->op0;
16205 in1 = d->op1;
16206 /* We don't need a big-endian lane correction for SVE; see the comment
16207 at the head of aarch64-sve.md for details. */
16208 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16209 {
16210 x = in0, in0 = in1, in1 = x;
16211 odd = !odd;
16212 }
16213 out = d->target;
16214
16215 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16216 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16217 return true;
16218 }
16219
16220 /* Recognize patterns suitable for the ZIP instructions. */
16221 static bool
16222 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16223 {
16224 unsigned int high;
16225 poly_uint64 nelt = d->perm.length ();
16226 rtx out, in0, in1, x;
16227 machine_mode vmode = d->vmode;
16228
16229 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16230 return false;
16231
16232 /* Note that these are little-endian tests.
16233 We correct for big-endian later. */
16234 poly_uint64 first = d->perm[0];
16235 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16236 || !d->perm.series_p (0, 2, first, 1)
16237 || !d->perm.series_p (1, 2, first + nelt, 1))
16238 return false;
16239 high = maybe_ne (first, 0U);
16240
16241 /* Success! */
16242 if (d->testing_p)
16243 return true;
16244
16245 in0 = d->op0;
16246 in1 = d->op1;
16247 /* We don't need a big-endian lane correction for SVE; see the comment
16248 at the head of aarch64-sve.md for details. */
16249 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16250 {
16251 x = in0, in0 = in1, in1 = x;
16252 high = !high;
16253 }
16254 out = d->target;
16255
16256 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16257 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16258 return true;
16259 }
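
/* Hypothetical counterpart for the ZIP matcher above: ZIP1 (FIRST == 0)
   interleaves the low halves of the two inputs, giving
   0, NELT, 1, NELT + 1, ..., while ZIP2 (FIRST == NELT / 2) interleaves
   the high halves.  */
static void
build_zip_indices (unsigned int *sel, unsigned int nelt, unsigned int first)
{
  for (unsigned int i = 0; i < nelt; i++)
    sel[i] = first + i / 2 + ((i & 1) ? nelt : 0);
}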
16260
16261 /* Recognize patterns for the EXT insn. */
16262
16263 static bool
16264 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16265 {
16266 HOST_WIDE_INT location;
16267 rtx offset;
16268
16269 /* The first element always refers to the first vector.
16270 Check if the extracted indices are increasing by one. */
16271 if (d->vec_flags == VEC_SVE_PRED
16272 || !d->perm[0].is_constant (&location)
16273 || !d->perm.series_p (0, 1, location, 1))
16274 return false;
16275
16276 /* Success! */
16277 if (d->testing_p)
16278 return true;
16279
16280 /* The case where (location == 0) is a no-op for both big- and little-endian,
16281 and is removed by the mid-end at optimization levels -O1 and higher.
16282
16283 We don't need a big-endian lane correction for SVE; see the comment
16284 at the head of aarch64-sve.md for details. */
16285 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16286 {
16287 /* After setup, we want the high elements of the first vector (stored
16288 at the LSB end of the register), and the low elements of the second
16289 vector (stored at the MSB end of the register). So swap. */
16290 std::swap (d->op0, d->op1);
16291 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16292 to_constant () is safe since this is restricted to Advanced SIMD
16293 vectors. */
16294 location = d->perm.length ().to_constant () - location;
16295 }
16296
16297 offset = GEN_INT (location);
16298 emit_set_insn (d->target,
16299 gen_rtx_UNSPEC (d->vmode,
16300 gen_rtvec (3, d->op0, d->op1, offset),
16301 UNSPEC_EXT));
16302 return true;
16303 }
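
/* Hypothetical index builder for the EXT matcher above: an extract at
   LOCATION selects the contiguous window of NELT elements starting at
   LOCATION within the concatenation of the two inputs.  */
static void
build_ext_indices (unsigned int *sel, unsigned int nelt,
                   unsigned int location)
{
  for (unsigned int i = 0; i < nelt; i++)
    sel[i] = location + i;
}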
16304
16305 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16306 within each 64-bit, 32-bit or 16-bit granule. */
16307
16308 static bool
16309 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16310 {
16311 HOST_WIDE_INT diff;
16312 unsigned int i, size, unspec;
16313 machine_mode pred_mode;
16314
16315 if (d->vec_flags == VEC_SVE_PRED
16316 || !d->one_vector_p
16317 || !d->perm[0].is_constant (&diff))
16318 return false;
16319
16320 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16321 if (size == 8)
16322 {
16323 unspec = UNSPEC_REV64;
16324 pred_mode = VNx2BImode;
16325 }
16326 else if (size == 4)
16327 {
16328 unspec = UNSPEC_REV32;
16329 pred_mode = VNx4BImode;
16330 }
16331 else if (size == 2)
16332 {
16333 unspec = UNSPEC_REV16;
16334 pred_mode = VNx8BImode;
16335 }
16336 else
16337 return false;
16338
16339 unsigned int step = diff + 1;
16340 for (i = 0; i < step; ++i)
16341 if (!d->perm.series_p (i, step, diff - i, step))
16342 return false;
16343
16344 /* Success! */
16345 if (d->testing_p)
16346 return true;
16347
16348 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16349 if (d->vec_flags == VEC_SVE_DATA)
16350 {
16351 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16352 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16353 UNSPEC_MERGE_PTRUE);
16354 }
16355 emit_set_insn (d->target, src);
16356 return true;
16357 }
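
/* Hypothetical helper for the matcher above: REV{64,32,16} reverses the
   element order within each granule of DIFF + 1 elements, e.g. for byte
   elements and DIFF == 3 (REV32) the accepted series is
   3, 2, 1, 0, 7, 6, 5, 4, ...  */
static void
build_rev_local_indices (unsigned int *sel, unsigned int nelt,
                         unsigned int diff)
{
  unsigned int step = diff + 1;
  for (unsigned int i = 0; i < nelt; i++)
    sel[i] = (i / step) * step + (diff - i % step);
}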
16358
16359 /* Recognize patterns for the REV insn, which reverses elements within
16360 a full vector. */
16361
16362 static bool
16363 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16364 {
16365 poly_uint64 nelt = d->perm.length ();
16366
16367 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16368 return false;
16369
16370 if (!d->perm.series_p (0, 1, nelt - 1, -1))
16371 return false;
16372
16373 /* Success! */
16374 if (d->testing_p)
16375 return true;
16376
16377 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16378 emit_set_insn (d->target, src);
16379 return true;
16380 }
16381
16382 static bool
16383 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16384 {
16385 rtx out = d->target;
16386 rtx in0;
16387 HOST_WIDE_INT elt;
16388 machine_mode vmode = d->vmode;
16389 rtx lane;
16390
16391 if (d->vec_flags == VEC_SVE_PRED
16392 || d->perm.encoding ().encoded_nelts () != 1
16393 || !d->perm[0].is_constant (&elt))
16394 return false;
16395
16396 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16397 return false;
16398
16399 /* Success! */
16400 if (d->testing_p)
16401 return true;
16402
16403 /* The generic preparation in aarch64_expand_vec_perm_const_1
16404 swaps the operand order and the permute indices if it finds
16405 d->perm[0] to be in the second operand. Thus, we can always
16406 use d->op0 and need not do any extra arithmetic to get the
16407 correct lane number. */
16408 in0 = d->op0;
16409 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
16410
16411 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16412 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16413 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16414 return true;
16415 }
16416
16417 static bool
16418 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16419 {
16420 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16421 machine_mode vmode = d->vmode;
16422
16423 /* Make sure that the indices are constant. */
16424 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16425 for (unsigned int i = 0; i < encoded_nelts; ++i)
16426 if (!d->perm[i].is_constant ())
16427 return false;
16428
16429 if (d->testing_p)
16430 return true;
16431
16432 /* Generic code will try constant permutation twice: once with the
16433 original mode and again with the elements lowered to QImode.
16434 So wait and don't do the selector expansion ourselves. */
16435 if (vmode != V8QImode && vmode != V16QImode)
16436 return false;
16437
16438 /* to_constant is safe since this routine is specific to Advanced SIMD
16439 vectors. */
16440 unsigned int nelt = d->perm.length ().to_constant ();
16441 for (unsigned int i = 0; i < nelt; ++i)
16442 /* If big-endian and two vectors we end up with a weird mixed-endian
16443 mode on NEON. Reverse the index within each word but not the word
16444 itself. to_constant is safe because we checked is_constant above. */
16445 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16446 ? d->perm[i].to_constant () ^ (nelt - 1)
16447 : d->perm[i].to_constant ());
16448
16449 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16450 sel = force_reg (vmode, sel);
16451
16452 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16453 return true;
16454 }
16455
16456 /* Try to implement D using an SVE TBL instruction. */
16457
16458 static bool
16459 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16460 {
16461 unsigned HOST_WIDE_INT nelt;
16462
16463 /* Permuting two variable-length vectors could overflow the
16464 index range. */
16465 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16466 return false;
16467
16468 if (d->testing_p)
16469 return true;
16470
16471 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16472 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16473 if (d->one_vector_p)
16474 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16475 else
16476 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16477 return true;
16478 }
16479
16480 static bool
16481 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16482 {
16483 /* The pattern matching functions above are written to look for a small
16484 number to begin the sequence (0, 1, N/2). If we begin with an index
16485 from the second operand, we can swap the operands. */
16486 poly_int64 nelt = d->perm.length ();
16487 if (known_ge (d->perm[0], nelt))
16488 {
16489 d->perm.rotate_inputs (1);
16490 std::swap (d->op0, d->op1);
16491 }
16492
16493 if ((d->vec_flags == VEC_ADVSIMD
16494 || d->vec_flags == VEC_SVE_DATA
16495 || d->vec_flags == VEC_SVE_PRED)
16496 && known_gt (nelt, 1))
16497 {
16498 if (aarch64_evpc_rev_local (d))
16499 return true;
16500 else if (aarch64_evpc_rev_global (d))
16501 return true;
16502 else if (aarch64_evpc_ext (d))
16503 return true;
16504 else if (aarch64_evpc_dup (d))
16505 return true;
16506 else if (aarch64_evpc_zip (d))
16507 return true;
16508 else if (aarch64_evpc_uzp (d))
16509 return true;
16510 else if (aarch64_evpc_trn (d))
16511 return true;
16512 if (d->vec_flags == VEC_SVE_DATA)
16513 return aarch64_evpc_sve_tbl (d);
16514 else if (d->vec_flags == VEC_ADVSIMD)
16515 return aarch64_evpc_tbl (d);
16516 }
16517 return false;
16518 }
16519
16520 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
16521
16522 static bool
16523 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
16524 rtx op1, const vec_perm_indices &sel)
16525 {
16526 struct expand_vec_perm_d d;
16527
16528 /* Check whether the mask can be applied to a single vector. */
16529 if (sel.ninputs () == 1
16530 || (op0 && rtx_equal_p (op0, op1)))
16531 d.one_vector_p = true;
16532 else if (sel.all_from_input_p (0))
16533 {
16534 d.one_vector_p = true;
16535 op1 = op0;
16536 }
16537 else if (sel.all_from_input_p (1))
16538 {
16539 d.one_vector_p = true;
16540 op0 = op1;
16541 }
16542 else
16543 d.one_vector_p = false;
16544
16545 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
16546 sel.nelts_per_input ());
16547 d.vmode = vmode;
16548 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
16549 d.target = target;
16550 d.op0 = op0;
16551 d.op1 = op1;
16552 d.testing_p = !target;
16553
16554 if (!d.testing_p)
16555 return aarch64_expand_vec_perm_const_1 (&d);
16556
16557 rtx_insn *last = get_last_insn ();
16558 bool ret = aarch64_expand_vec_perm_const_1 (&d);
16559 gcc_assert (last == get_last_insn ());
16560
16561 return ret;
16562 }
16563
16564 /* Generate a byte permute mask for a register of mode MODE,
16565 which has NUNITS units. */
16566
16567 rtx
16568 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
16569 {
16570 /* We have to reverse each vector because we don't have
16571 a permuted load that can reverse-load according to ABI rules. */
16572 rtx mask;
16573 rtvec v = rtvec_alloc (16);
16574 unsigned int i, j;
16575 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
16576
16577 gcc_assert (BYTES_BIG_ENDIAN);
16578 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
16579
16580 for (i = 0; i < nunits; i++)
16581 for (j = 0; j < usize; j++)
16582 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
16583 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
16584 return force_reg (V16QImode, mask);
16585 }
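
/* Worked example (for illustration only): for V8HImode, NUNITS is 8 and the
   unit size is 2, so the mask built above is the byte sequence
   1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 - the two bytes of
   each halfword are swapped while the halfwords themselves stay in place.  */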
16586
16587 /* Return true if X is a valid second operand for the SVE instruction
16588 that implements integer comparison OP_CODE. */
16589
16590 static bool
16591 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
16592 {
16593 if (register_operand (x, VOIDmode))
16594 return true;
16595
16596 switch (op_code)
16597 {
16598 case LTU:
16599 case LEU:
16600 case GEU:
16601 case GTU:
16602 return aarch64_sve_cmp_immediate_p (x, false);
16603 case LT:
16604 case LE:
16605 case GE:
16606 case GT:
16607 case NE:
16608 case EQ:
16609 return aarch64_sve_cmp_immediate_p (x, true);
16610 default:
16611 gcc_unreachable ();
16612 }
16613 }
16614
16615 /* Use predicated SVE instructions to implement the equivalent of:
16616
16617 (set TARGET OP)
16618
16619 given that PTRUE is an all-true predicate of the appropriate mode. */
16620
16621 static void
16622 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
16623 {
16624 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16625 gen_rtvec (2, ptrue, op),
16626 UNSPEC_MERGE_PTRUE);
16627 rtx_insn *insn = emit_set_insn (target, unspec);
16628 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16629 }
16630
16631 /* Likewise, but also clobber the condition codes. */
16632
16633 static void
16634 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
16635 {
16636 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16637 gen_rtvec (2, ptrue, op),
16638 UNSPEC_MERGE_PTRUE);
16639 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
16640 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16641 }
16642
16643 /* Return the UNSPEC_COND_* code for comparison CODE. */
16644
16645 static unsigned int
16646 aarch64_unspec_cond_code (rtx_code code)
16647 {
16648 switch (code)
16649 {
16650 case NE:
16651 return UNSPEC_COND_NE;
16652 case EQ:
16653 return UNSPEC_COND_EQ;
16654 case LT:
16655 return UNSPEC_COND_LT;
16656 case GT:
16657 return UNSPEC_COND_GT;
16658 case LE:
16659 return UNSPEC_COND_LE;
16660 case GE:
16661 return UNSPEC_COND_GE;
16662 default:
16663 gcc_unreachable ();
16664 }
16665 }
16666
16667 /* Emit:
16668
16669 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
16670
16671 where <X> is the operation associated with comparison CODE. This form
16672 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
16673 semantics, such as when PRED might not be all-true and when comparing
16674 inactive lanes could have side effects. */
16675
16676 static void
16677 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
16678 rtx pred, rtx op0, rtx op1)
16679 {
16680 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
16681 gen_rtvec (3, pred, op0, op1),
16682 aarch64_unspec_cond_code (code));
16683 emit_set_insn (target, unspec);
16684 }
16685
16686 /* Expand an SVE integer comparison using the SVE equivalent of:
16687
16688 (set TARGET (CODE OP0 OP1)). */
16689
16690 void
16691 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
16692 {
16693 machine_mode pred_mode = GET_MODE (target);
16694 machine_mode data_mode = GET_MODE (op0);
16695
16696 if (!aarch64_sve_cmp_operand_p (code, op1))
16697 op1 = force_reg (data_mode, op1);
16698
16699 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16700 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16701 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
16702 }
16703
16704 /* Emit the SVE equivalent of:
16705
16706 (set TMP1 (CODE1 OP0 OP1))
16707 (set TMP2 (CODE2 OP0 OP1))
16708 (set TARGET (ior:PRED_MODE TMP1 TMP2))
16709
16710 PTRUE is an all-true predicate with the same mode as TARGET. */
16711
16712 static void
16713 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
16714 rtx ptrue, rtx op0, rtx op1)
16715 {
16716 machine_mode pred_mode = GET_MODE (ptrue);
16717 rtx tmp1 = gen_reg_rtx (pred_mode);
16718 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
16719 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
16720 rtx tmp2 = gen_reg_rtx (pred_mode);
16721 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
16722 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
16723 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
16724 }
16725
16726 /* Emit the SVE equivalent of:
16727
16728 (set TMP (CODE OP0 OP1))
16729 (set TARGET (not TMP))
16730
16731 PTRUE is an all-true predicate with the same mode as TARGET. */
16732
16733 static void
16734 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
16735 rtx op0, rtx op1)
16736 {
16737 machine_mode pred_mode = GET_MODE (ptrue);
16738 rtx tmp = gen_reg_rtx (pred_mode);
16739 aarch64_emit_sve_ptrue_op (tmp, ptrue,
16740 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
16741 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16742 }
16743
16744 /* Expand an SVE floating-point comparison using the SVE equivalent of:
16745
16746 (set TARGET (CODE OP0 OP1))
16747
16748 If CAN_INVERT_P is true, the caller can also handle inverted results;
16749 return true if the result is in fact inverted. */
16750
16751 bool
16752 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
16753 rtx op0, rtx op1, bool can_invert_p)
16754 {
16755 machine_mode pred_mode = GET_MODE (target);
16756 machine_mode data_mode = GET_MODE (op0);
16757
16758 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16759 switch (code)
16760 {
16761 case UNORDERED:
16762 /* UNORDERED has no immediate form. */
16763 op1 = force_reg (data_mode, op1);
16764 /* fall through */
16765 case LT:
16766 case LE:
16767 case GT:
16768 case GE:
16769 case EQ:
16770 case NE:
16771 {
16772 /* There is native support for the comparison. */
16773 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16774 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16775 return false;
16776 }
16777
16778 case LTGT:
16779 /* This is a trapping operation (LT or GT). */
16780 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
16781 return false;
16782
16783 case UNEQ:
16784 if (!flag_trapping_math)
16785 {
16786 /* This would trap for signaling NaNs. */
16787 op1 = force_reg (data_mode, op1);
16788 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
16789 return false;
16790 }
16791 /* fall through */
16792 case UNLT:
16793 case UNLE:
16794 case UNGT:
16795 case UNGE:
16796 if (flag_trapping_math)
16797 {
16798 /* Work out which elements are ordered. */
16799 rtx ordered = gen_reg_rtx (pred_mode);
16800 op1 = force_reg (data_mode, op1);
16801 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16802
16803 /* Test the opposite condition for the ordered elements,
16804 then invert the result. */
16805 if (code == UNEQ)
16806 code = NE;
16807 else
16808 code = reverse_condition_maybe_unordered (code);
16809 if (can_invert_p)
16810 {
16811 aarch64_emit_sve_predicated_cond (target, code,
16812 ordered, op0, op1);
16813 return true;
16814 }
16815 rtx tmp = gen_reg_rtx (pred_mode);
16816 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16817 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16818 return false;
16819 }
16820 break;
16821
16822 case ORDERED:
16823 /* ORDERED has no immediate form. */
16824 op1 = force_reg (data_mode, op1);
16825 break;
16826
16827 default:
16828 gcc_unreachable ();
16829 }
16830
16831 /* There is native support for the inverse comparison. */
16832 code = reverse_condition_maybe_unordered (code);
16833 if (can_invert_p)
16834 {
16835 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16836 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16837 return true;
16838 }
16839 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16840 return false;
16841 }
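
/* Scalar model (hypothetical, a single double-precision lane) of the
   trapping-math path above for UNGE: the comparison is evaluated as
   "NOT (ordered AND a < b)", i.e. the reversed condition is tested only on
   ordered lanes and the result is then inverted, so no ordering comparison
   is performed on lanes that contain a NaN.  */
static bool
model_unge_lane (double a, double b)
{
  bool ordered = !(a != a || b != b);   /* Neither operand is a NaN.  */
  bool lt = ordered && a < b;           /* Reversed condition, ordered lanes.  */
  return !lt;                           /* UNGE: unordered or a >= b.  */
}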
16842
16843 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16844 of the data being selected and CMP_MODE is the mode of the values being
16845 compared. */
16846
16847 void
16848 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16849 rtx *ops)
16850 {
16851 machine_mode pred_mode
16852 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16853 GET_MODE_SIZE (cmp_mode)).require ();
16854 rtx pred = gen_reg_rtx (pred_mode);
16855 if (FLOAT_MODE_P (cmp_mode))
16856 {
16857 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16858 ops[4], ops[5], true))
16859 std::swap (ops[1], ops[2]);
16860 }
16861 else
16862 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16863
16864 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16865 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16866 }
16867
16868 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16869 true. However, due to issues with register allocation it is preferable
16870 to avoid tying integer scalar and FP scalar modes. Executing integer
16871 operations in general registers is better than treating them as scalar
16872 vector operations. This reduces latency and avoids redundant int<->FP
16873 moves. So tie modes if they are either the same class, or vector modes
16874 with other vector modes, vector structs or any scalar mode. */
16875
16876 static bool
16877 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16878 {
16879 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16880 return true;
16881
16882 /* We specifically want to allow elements of "structure" modes to
16883 be tieable to the structure. This more general condition allows
16884 other rarer situations too. The reason we don't extend this to
16885 predicate modes is that there are no predicate structure modes
16886 nor any specific instructions for extracting part of a predicate
16887 register. */
16888 if (aarch64_vector_data_mode_p (mode1)
16889 && aarch64_vector_data_mode_p (mode2))
16890 return true;
16891
16892 /* Also allow any scalar modes with vectors. */
16893 if (aarch64_vector_mode_supported_p (mode1)
16894 || aarch64_vector_mode_supported_p (mode2))
16895 return true;
16896
16897 return false;
16898 }
16899
16900 /* Return a new RTX holding the result of moving POINTER forward by
16901 AMOUNT bytes. */
16902
16903 static rtx
16904 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16905 {
16906 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16907
16908 return adjust_automodify_address (pointer, GET_MODE (pointer),
16909 next, amount);
16910 }
16911
16912 /* Return a new RTX holding the result of moving POINTER forward by the
16913 size of the mode it points to. */
16914
16915 static rtx
16916 aarch64_progress_pointer (rtx pointer)
16917 {
16918 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16919 }
16920
16921 /* Copy one MODE sized block from SRC to DST, then advance SRC and DST by
16922 the size of MODE. */
16923
16924 static void
16925 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16926 machine_mode mode)
16927 {
16928 rtx reg = gen_reg_rtx (mode);
16929
16930 /* "Cast" the pointers to the correct mode. */
16931 *src = adjust_address (*src, mode, 0);
16932 *dst = adjust_address (*dst, mode, 0);
16933 /* Emit the memcpy. */
16934 emit_move_insn (reg, *src);
16935 emit_move_insn (*dst, reg);
16936 /* Move the pointers forward. */
16937 *src = aarch64_progress_pointer (*src);
16938 *dst = aarch64_progress_pointer (*dst);
16939 }
16940
16941 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16942 we succeed, otherwise return false. */
16943
16944 bool
16945 aarch64_expand_movmem (rtx *operands)
16946 {
16947 int n, mode_bits;
16948 rtx dst = operands[0];
16949 rtx src = operands[1];
16950 rtx base;
16951 machine_mode cur_mode = BLKmode, next_mode;
16952 bool speed_p = !optimize_function_for_size_p (cfun);
16953
16954 /* When optimizing for size, give a better estimate of the length of a
16955 memcpy call, but use the default otherwise. Moves larger than 8 bytes
16956 will always require an even number of instructions, and each
16957 operation requires both a load and a store, so divide the max number by 2. */
16958 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16959
16960 /* We can't do anything smart if the amount to copy is not constant. */
16961 if (!CONST_INT_P (operands[2]))
16962 return false;
16963
16964 n = INTVAL (operands[2]);
16965
16966 /* Try to keep the number of instructions low. For all cases we will do at
16967 most two moves for the residual amount, since we'll always overlap the
16968 remainder. */
16969 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16970 return false;
16971
16972 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16973 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16974
16975 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16976 src = adjust_automodify_address (src, VOIDmode, base, 0);
16977
16978 /* Convert n to bits to make the rest of the code simpler. */
16979 n = n * BITS_PER_UNIT;
16980
16981 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
16982 larger than TImode, but we should not use them for loads/stores here. */
16983 const int copy_limit = GET_MODE_BITSIZE (TImode);
16984
16985 while (n > 0)
16986 {
16987 /* Find the largest mode in which to do the copy without over-reading
16988 or over-writing. */
16989 opt_scalar_int_mode mode_iter;
16990 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16991 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
16992 cur_mode = mode_iter.require ();
16993
16994 gcc_assert (cur_mode != BLKmode);
16995
16996 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16997 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16998
16999 n -= mode_bits;
17000
17001 /* Do certain trailing copies as overlapping if that is going to be
17002 cheaper, i.e. if it takes fewer instructions. For instance, for a 15
17003 byte copy it's more efficient to do two overlapping 8 byte copies than
17004 8 + 4 + 2 + 1. */
17005 if (n > 0 && n <= 8 * BITS_PER_UNIT)
17006 {
17007 next_mode = smallest_mode_for_size (n, MODE_INT);
17008 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
17009 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
17010 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
17011 n = n_bits;
17012 }
17013 }
17014
17015 return true;
17016 }
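
/* Hypothetical standalone sketch of the schedule computed above: copy the
   largest power-of-two chunk of at most 16 bytes that still fits, and handle
   a remaining tail of at most 8 bytes by rounding it up to a power of two
   and backing up so that the final copy overlaps bytes already copied.
   Returns the number of (offset, size) pairs written; for a 15-byte copy
   the result is (0, 8) and (7, 8), matching the comment in the loop.  */
static int
model_copy_schedule (int n_bytes, int *offsets, int *sizes)
{
  int pos = 0, count = 0;
  while (n_bytes > 0)
    {
      int chunk = 16;
      while (chunk > n_bytes)
        chunk >>= 1;
      offsets[count] = pos;
      sizes[count] = chunk;
      count++;
      pos += chunk;
      n_bytes -= chunk;
      if (n_bytes > 0 && n_bytes <= 8)
        {
          int rounded = 1;
          while (rounded < n_bytes)
            rounded <<= 1;
          pos -= rounded - n_bytes;     /* Overlap the tail copy.  */
          n_bytes = rounded;
        }
    }
  return count;
}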
17017
17018 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17019 SImode stores. Handle the case when the constant has identical
17020 bottom and top halves. This is beneficial when the two stores can be
17021 merged into an STP and we avoid synthesising potentially expensive
17022 immediates twice. Return true if such a split is possible. */
17023
17024 bool
17025 aarch64_split_dimode_const_store (rtx dst, rtx src)
17026 {
17027 rtx lo = gen_lowpart (SImode, src);
17028 rtx hi = gen_highpart_mode (SImode, DImode, src);
17029
17030 bool size_p = optimize_function_for_size_p (cfun);
17031
17032 if (!rtx_equal_p (lo, hi))
17033 return false;
17034
17035 unsigned int orig_cost
17036 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
17037 unsigned int lo_cost
17038 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
17039
17040 /* We want to transform:
17041 MOV x1, 49370
17042 MOVK x1, 0x140, lsl 16
17043 MOVK x1, 0xc0da, lsl 32
17044 MOVK x1, 0x140, lsl 48
17045 STR x1, [x0]
17046 into:
17047 MOV w1, 49370
17048 MOVK w1, 0x140, lsl 16
17049 STP w1, w1, [x0]
17050 So we want to perform this only when we save two instructions
17051 or more. When optimizing for size, however, accept any code size
17052 savings we can. */
17053 if (size_p && orig_cost <= lo_cost)
17054 return false;
17055
17056 if (!size_p
17057 && (orig_cost <= lo_cost + 1))
17058 return false;
17059
17060 rtx mem_lo = adjust_address (dst, SImode, 0);
17061 if (!aarch64_mem_pair_operand (mem_lo, SImode))
17062 return false;
17063
17064 rtx tmp_reg = gen_reg_rtx (SImode);
17065 aarch64_expand_mov_immediate (tmp_reg, lo);
17066 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17067 /* Don't emit an explicit store pair as this may not always be profitable.
17068 Let the sched-fusion logic decide whether to merge them. */
17069 emit_move_insn (mem_lo, tmp_reg);
17070 emit_move_insn (mem_hi, tmp_reg);
17071
17072 return true;
17073 }
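
/* Hypothetical standalone check for the condition exploited above: the
   constant from the example, 0x0140c0da0140c0da, has identical 32-bit
   halves, so the DImode store can become an STP of one W register.  */
static bool
dimode_const_has_equal_halves (unsigned long long val)
{
  return (val & 0xffffffffULL) == (val >> 32);
}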
17074
17075 /* Generate RTL for a conditional branch with rtx comparison CODE in
17076 mode CC_MODE. The destination of the unlikely conditional branch
17077 is LABEL_REF. */
17078
17079 void
17080 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
17081 rtx label_ref)
17082 {
17083 rtx x;
17084 x = gen_rtx_fmt_ee (code, VOIDmode,
17085 gen_rtx_REG (cc_mode, CC_REGNUM),
17086 const0_rtx);
17087
17088 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17089 gen_rtx_LABEL_REF (VOIDmode, label_ref),
17090 pc_rtx);
17091 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17092 }
17093
17094 /* Generate DImode scratch registers for 128-bit (TImode) addition.
17095
17096 OP1 represents the TImode destination operand 1
17097 OP2 represents the TImode destination operand 2
17098 LOW_DEST represents the low half (DImode) of TImode operand 0
17099 LOW_IN1 represents the low half (DImode) of TImode operand 1
17100 LOW_IN2 represents the low half (DImode) of TImode operand 2
17101 HIGH_DEST represents the high half (DImode) of TImode operand 0
17102 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17103 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17104
17105 void
17106 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17107 rtx *low_in1, rtx *low_in2,
17108 rtx *high_dest, rtx *high_in1,
17109 rtx *high_in2)
17110 {
17111 *low_dest = gen_reg_rtx (DImode);
17112 *low_in1 = gen_lowpart (DImode, op1);
17113 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17114 subreg_lowpart_offset (DImode, TImode));
17115 *high_dest = gen_reg_rtx (DImode);
17116 *high_in1 = gen_highpart (DImode, op1);
17117 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17118 subreg_highpart_offset (DImode, TImode));
17119 }
17120
17121 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
17122
17123 This function differs from 'aarch64_addti_scratch_regs' in that
17124 OP1 can be an immediate constant (zero). We must call
17125 subreg_highpart_offset with DImode and TImode arguments, otherwise
17126 VOIDmode will be used for the const_int, which generates an internal
17127 error from subreg_size_highpart_offset, which does not expect a size of zero.
17128
17129 OP1 represents the TImode destination operand 1
17130 OP2 represents the TImode destination operand 2
17131 LOW_DEST represents the low half (DImode) of TImode operand 0
17132 LOW_IN1 represents the low half (DImode) of TImode operand 1
17133 LOW_IN2 represents the low half (DImode) of TImode operand 2
17134 HIGH_DEST represents the high half (DImode) of TImode operand 0
17135 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17136 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17137
17138
17139 void
17140 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17141 rtx *low_in1, rtx *low_in2,
17142 rtx *high_dest, rtx *high_in1,
17143 rtx *high_in2)
17144 {
17145 *low_dest = gen_reg_rtx (DImode);
17146 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
17147 subreg_lowpart_offset (DImode, TImode));
17148
17149 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17150 subreg_lowpart_offset (DImode, TImode));
17151 *high_dest = gen_reg_rtx (DImode);
17152
17153 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
17154 subreg_highpart_offset (DImode, TImode));
17155 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17156 subreg_highpart_offset (DImode, TImode));
17157 }
17158
17159 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
17160
17161 OP0 represents the TImode destination operand 0
17162 LOW_DEST represents the low half (DImode) of TImode operand 0
17163 LOW_IN1 represents the low half (DImode) of TImode operand 1
17164 LOW_IN2 represents the low half (DImode) of TImode operand 2
17165 HIGH_DEST represents the high half (DImode) of TImode operand 0
17166 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17167 HIGH_IN2 represents the high half (DImode) of TImode operand 2
17168 UNSIGNED_P is true if the operation is being performed on unsigned
17169 values. */
17170 void
17171 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17172 rtx low_in2, rtx high_dest, rtx high_in1,
17173 rtx high_in2, bool unsigned_p)
17174 {
17175 if (low_in2 == const0_rtx)
17176 {
17177 low_dest = low_in1;
17178 high_in2 = force_reg (DImode, high_in2);
17179 if (unsigned_p)
17180 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17181 else
17182 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17183 }
17184 else
17185 {
17186 if (CONST_INT_P (low_in2))
17187 {
17188 high_in2 = force_reg (DImode, high_in2);
17189 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17190 GEN_INT (-INTVAL (low_in2))));
17191 }
17192 else
17193 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17194
17195 if (unsigned_p)
17196 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17197 else
17198 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17199 }
17200
17201 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17202 emit_move_insn (gen_highpart (DImode, op0), high_dest);
17203
17204 }
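
/* Scalar model (hypothetical helper) of the split performed above: the low
   64-bit halves are subtracted first and the resulting borrow is propagated
   into the subtraction of the high halves, conceptually a SUBS followed by
   a subtract-with-borrow.  Overflow detection is not modelled here.  */
static void
model_subti_split (unsigned long long lo1, unsigned long long hi1,
                   unsigned long long lo2, unsigned long long hi2,
                   unsigned long long *lo_out, unsigned long long *hi_out)
{
  unsigned long long borrow = lo1 < lo2;
  *lo_out = lo1 - lo2;
  *hi_out = hi1 - hi2 - borrow;
}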
17205
17206 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
17207
17208 static unsigned HOST_WIDE_INT
17209 aarch64_asan_shadow_offset (void)
17210 {
17211 return (HOST_WIDE_INT_1 << 36);
17212 }
17213
17214 static rtx
17215 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17216 int code, tree treeop0, tree treeop1)
17217 {
17218 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17219 rtx op0, op1;
17220 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17221 insn_code icode;
17222 struct expand_operand ops[4];
17223
17224 start_sequence ();
17225 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17226
17227 op_mode = GET_MODE (op0);
17228 if (op_mode == VOIDmode)
17229 op_mode = GET_MODE (op1);
17230
17231 switch (op_mode)
17232 {
17233 case E_QImode:
17234 case E_HImode:
17235 case E_SImode:
17236 cmp_mode = SImode;
17237 icode = CODE_FOR_cmpsi;
17238 break;
17239
17240 case E_DImode:
17241 cmp_mode = DImode;
17242 icode = CODE_FOR_cmpdi;
17243 break;
17244
17245 case E_SFmode:
17246 cmp_mode = SFmode;
17247 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17248 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17249 break;
17250
17251 case E_DFmode:
17252 cmp_mode = DFmode;
17253 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17254 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17255 break;
17256
17257 default:
17258 end_sequence ();
17259 return NULL_RTX;
17260 }
17261
17262 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17263 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17264 if (!op0 || !op1)
17265 {
17266 end_sequence ();
17267 return NULL_RTX;
17268 }
17269 *prep_seq = get_insns ();
17270 end_sequence ();
17271
17272 create_fixed_operand (&ops[0], op0);
17273 create_fixed_operand (&ops[1], op1);
17274
17275 start_sequence ();
17276 if (!maybe_expand_insn (icode, 2, ops))
17277 {
17278 end_sequence ();
17279 return NULL_RTX;
17280 }
17281 *gen_seq = get_insns ();
17282 end_sequence ();
17283
17284 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17285 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17286 }
17287
17288 static rtx
17289 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17290 int cmp_code, tree treeop0, tree treeop1, int bit_code)
17291 {
17292 rtx op0, op1, target;
17293 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17294 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17295 insn_code icode;
17296 struct expand_operand ops[6];
17297 int aarch64_cond;
17298
17299 push_to_sequence (*prep_seq);
17300 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17301
17302 op_mode = GET_MODE (op0);
17303 if (op_mode == VOIDmode)
17304 op_mode = GET_MODE (op1);
17305
17306 switch (op_mode)
17307 {
17308 case E_QImode:
17309 case E_HImode:
17310 case E_SImode:
17311 cmp_mode = SImode;
17312 icode = CODE_FOR_ccmpsi;
17313 break;
17314
17315 case E_DImode:
17316 cmp_mode = DImode;
17317 icode = CODE_FOR_ccmpdi;
17318 break;
17319
17320 case E_SFmode:
17321 cmp_mode = SFmode;
17322 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17323 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17324 break;
17325
17326 case E_DFmode:
17327 cmp_mode = DFmode;
17328 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17329 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17330 break;
17331
17332 default:
17333 end_sequence ();
17334 return NULL_RTX;
17335 }
17336
17337 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17338 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17339 if (!op0 || !op1)
17340 {
17341 end_sequence ();
17342 return NULL_RTX;
17343 }
17344 *prep_seq = get_insns ();
17345 end_sequence ();
17346
17347 target = gen_rtx_REG (cc_mode, CC_REGNUM);
17348 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17349
17350 if (bit_code != AND)
17351 {
17352 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17353 GET_MODE (XEXP (prev, 0))),
17354 VOIDmode, XEXP (prev, 0), const0_rtx);
17355 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17356 }
17357
17358 create_fixed_operand (&ops[0], XEXP (prev, 0));
17359 create_fixed_operand (&ops[1], target);
17360 create_fixed_operand (&ops[2], op0);
17361 create_fixed_operand (&ops[3], op1);
17362 create_fixed_operand (&ops[4], prev);
17363 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17364
17365 push_to_sequence (*gen_seq);
17366 if (!maybe_expand_insn (icode, 6, ops))
17367 {
17368 end_sequence ();
17369 return NULL_RTX;
17370 }
17371
17372 *gen_seq = get_insns ();
17373 end_sequence ();
17374
17375 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17376 }
17377
17378 #undef TARGET_GEN_CCMP_FIRST
17379 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17380
17381 #undef TARGET_GEN_CCMP_NEXT
17382 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17383
17384 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target supports
17385 instruction fusion of some sort. */
17386
17387 static bool
17388 aarch64_macro_fusion_p (void)
17389 {
17390 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17391 }
17392
17393
17394 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17395 should be kept together during scheduling. */
17396
17397 static bool
17398 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17399 {
17400 rtx set_dest;
17401 rtx prev_set = single_set (prev);
17402 rtx curr_set = single_set (curr);
17403 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
17404 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17405
17406 if (!aarch64_macro_fusion_p ())
17407 return false;
17408
17409 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17410 {
17411 /* We are trying to match:
17412 prev (mov) == (set (reg r0) (const_int imm16))
17413 curr (movk) == (set (zero_extract (reg r0)
17414 (const_int 16)
17415 (const_int 16))
17416 (const_int imm16_1)) */
17417
17418 set_dest = SET_DEST (curr_set);
17419
17420 if (GET_CODE (set_dest) == ZERO_EXTRACT
17421 && CONST_INT_P (SET_SRC (curr_set))
17422 && CONST_INT_P (SET_SRC (prev_set))
17423 && CONST_INT_P (XEXP (set_dest, 2))
17424 && INTVAL (XEXP (set_dest, 2)) == 16
17425 && REG_P (XEXP (set_dest, 0))
17426 && REG_P (SET_DEST (prev_set))
17427 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17428 {
17429 return true;
17430 }
17431 }
17432
17433 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17434 {
17435
17436 /* We're trying to match:
17437 prev (adrp) == (set (reg r1)
17438 (high (symbol_ref ("SYM"))))
17439 curr (add) == (set (reg r0)
17440 (lo_sum (reg r1)
17441 (symbol_ref ("SYM"))))
17442 Note that r0 need not necessarily be the same as r1, especially
17443 during pre-regalloc scheduling. */
17444
17445 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17446 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17447 {
17448 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17449 && REG_P (XEXP (SET_SRC (curr_set), 0))
17450 && REGNO (XEXP (SET_SRC (curr_set), 0))
17451 == REGNO (SET_DEST (prev_set))
17452 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17453 XEXP (SET_SRC (curr_set), 1)))
17454 return true;
17455 }
17456 }
17457
17458 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17459 {
17460
17461 /* We're trying to match:
17462 prev (movk) == (set (zero_extract (reg r0)
17463 (const_int 16)
17464 (const_int 32))
17465 (const_int imm16_1))
17466 curr (movk) == (set (zero_extract (reg r0)
17467 (const_int 16)
17468 (const_int 48))
17469 (const_int imm16_2)) */
17470
17471 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17472 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17473 && REG_P (XEXP (SET_DEST (prev_set), 0))
17474 && REG_P (XEXP (SET_DEST (curr_set), 0))
17475 && REGNO (XEXP (SET_DEST (prev_set), 0))
17476 == REGNO (XEXP (SET_DEST (curr_set), 0))
17477 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17478 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17479 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17480 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17481 && CONST_INT_P (SET_SRC (prev_set))
17482 && CONST_INT_P (SET_SRC (curr_set)))
17483 return true;
17484
17485 }
17486 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17487 {
17488 /* We're trying to match:
17489 prev (adrp) == (set (reg r0)
17490 (high (symbol_ref ("SYM"))))
17491 curr (ldr) == (set (reg r1)
17492 (mem (lo_sum (reg r0)
17493 (symbol_ref ("SYM")))))
17494 or
17495 curr (ldr) == (set (reg r1)
17496 (zero_extend (mem
17497 (lo_sum (reg r0)
17498 (symbol_ref ("SYM")))))) */
17499 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17500 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17501 {
17502 rtx curr_src = SET_SRC (curr_set);
17503
17504 if (GET_CODE (curr_src) == ZERO_EXTEND)
17505 curr_src = XEXP (curr_src, 0);
17506
17507 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17508 && REG_P (XEXP (XEXP (curr_src, 0), 0))
17509 && REGNO (XEXP (XEXP (curr_src, 0), 0))
17510 == REGNO (SET_DEST (prev_set))
17511 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
17512 XEXP (SET_SRC (prev_set), 0)))
17513 return true;
17514 }
17515 }
17516
17517 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
17518 && aarch_crypto_can_dual_issue (prev, curr))
17519 return true;
17520
17521 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
17522 && any_condjump_p (curr))
17523 {
17524 unsigned int condreg1, condreg2;
17525 rtx cc_reg_1;
17526 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
17527 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
17528
17529 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
17530 && prev
17531 && modified_in_p (cc_reg_1, prev))
17532 {
17533 enum attr_type prev_type = get_attr_type (prev);
17534
17535 /* FIXME: this misses some instructions that ThunderX considers simple
17536 arithmetic instructions. Simple shifts are missed here. */
17537 if (prev_type == TYPE_ALUS_SREG
17538 || prev_type == TYPE_ALUS_IMM
17539 || prev_type == TYPE_LOGICS_REG
17540 || prev_type == TYPE_LOGICS_IMM)
17541 return true;
17542 }
17543 }
17544
17545 if (prev_set
17546 && curr_set
17547 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
17548 && any_condjump_p (curr))
17549 {
17550 /* We're trying to match:
17551 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17552 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
17553 (const_int 0))
17554 (label_ref ("SYM"))
17555 (pc)) */
17556 if (SET_DEST (curr_set) == (pc_rtx)
17557 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
17558 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
17559 && REG_P (SET_DEST (prev_set))
17560 && REGNO (SET_DEST (prev_set))
17561 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
17562 {
17563 /* Fuse an ALU operation followed by a conditional branch instruction. */
17564 switch (get_attr_type (prev))
17565 {
17566 case TYPE_ALU_IMM:
17567 case TYPE_ALU_SREG:
17568 case TYPE_ADC_REG:
17569 case TYPE_ADC_IMM:
17570 case TYPE_ADCS_REG:
17571 case TYPE_ADCS_IMM:
17572 case TYPE_LOGIC_REG:
17573 case TYPE_LOGIC_IMM:
17574 case TYPE_CSEL:
17575 case TYPE_ADR:
17576 case TYPE_MOV_IMM:
17577 case TYPE_SHIFT_REG:
17578 case TYPE_SHIFT_IMM:
17579 case TYPE_BFM:
17580 case TYPE_RBIT:
17581 case TYPE_REV:
17582 case TYPE_EXTEND:
17583 return true;
17584
17585 default:;
17586 }
17587 }
17588 }
17589
17590 return false;
17591 }
17592
17593 /* Return true iff the instruction fusion described by OP is enabled. */
17594
17595 bool
17596 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
17597 {
17598 return (aarch64_tune_params.fusible_ops & op) != 0;
17599 }
17600
17601 /* If MEM is in the form of [base+offset], extract the two parts
17602 of the address into BASE and OFFSET, otherwise return false
17603 after clearing BASE and OFFSET. */
17604
17605 bool
17606 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
17607 {
17608 rtx addr;
17609
17610 gcc_assert (MEM_P (mem));
17611
17612 addr = XEXP (mem, 0);
17613
17614 if (REG_P (addr))
17615 {
17616 *base = addr;
17617 *offset = const0_rtx;
17618 return true;
17619 }
17620
17621 if (GET_CODE (addr) == PLUS
17622 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
17623 {
17624 *base = XEXP (addr, 0);
17625 *offset = XEXP (addr, 1);
17626 return true;
17627 }
17628
17629 *base = NULL_RTX;
17630 *offset = NULL_RTX;
17631
17632 return false;
17633 }
17634
17635 /* Types for scheduling fusion. */
17636 enum sched_fusion_type
17637 {
17638 SCHED_FUSION_NONE = 0,
17639 SCHED_FUSION_LD_SIGN_EXTEND,
17640 SCHED_FUSION_LD_ZERO_EXTEND,
17641 SCHED_FUSION_LD,
17642 SCHED_FUSION_ST,
17643 SCHED_FUSION_NUM
17644 };
17645
17646 /* If INSN is a load or store whose address is in the form [base+offset],
17647 extract the two parts into BASE and OFFSET. Return the scheduling
17648 fusion type of this INSN. */
17649
17650 static enum sched_fusion_type
17651 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
17652 {
17653 rtx x, dest, src;
17654 enum sched_fusion_type fusion = SCHED_FUSION_LD;
17655
17656 gcc_assert (INSN_P (insn));
17657 x = PATTERN (insn);
17658 if (GET_CODE (x) != SET)
17659 return SCHED_FUSION_NONE;
17660
17661 src = SET_SRC (x);
17662 dest = SET_DEST (x);
17663
17664 machine_mode dest_mode = GET_MODE (dest);
17665
17666 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
17667 return SCHED_FUSION_NONE;
17668
17669 if (GET_CODE (src) == SIGN_EXTEND)
17670 {
17671 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
17672 src = XEXP (src, 0);
17673 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17674 return SCHED_FUSION_NONE;
17675 }
17676 else if (GET_CODE (src) == ZERO_EXTEND)
17677 {
17678 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
17679 src = XEXP (src, 0);
17680 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17681 return SCHED_FUSION_NONE;
17682 }
17683
17684 if (GET_CODE (src) == MEM && REG_P (dest))
17685 extract_base_offset_in_addr (src, base, offset);
17686 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
17687 {
17688 fusion = SCHED_FUSION_ST;
17689 extract_base_offset_in_addr (dest, base, offset);
17690 }
17691 else
17692 return SCHED_FUSION_NONE;
17693
17694 if (*base == NULL_RTX || *offset == NULL_RTX)
17695 fusion = SCHED_FUSION_NONE;
17696
17697 return fusion;
17698 }
17699
17700 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
17701
17702 Currently we only support fusing ldr and str instructions, so FUSION_PRI
17703 and PRI are only calculated for these instructions. For other instructions,
17704 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
17705 types of instruction fusion can be added by returning different priorities.
17706
17707 It's important that irrelevant instructions get the largest FUSION_PRI. */
17708
17709 static void
17710 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
17711 int *fusion_pri, int *pri)
17712 {
17713 int tmp, off_val;
17714 rtx base, offset;
17715 enum sched_fusion_type fusion;
17716
17717 gcc_assert (INSN_P (insn));
17718
17719 tmp = max_pri - 1;
17720 fusion = fusion_load_store (insn, &base, &offset);
17721 if (fusion == SCHED_FUSION_NONE)
17722 {
17723 *pri = tmp;
17724 *fusion_pri = tmp;
17725 return;
17726 }
17727
17728 /* Set FUSION_PRI according to fusion type and base register. */
17729 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
17730
17731 /* Calculate PRI. */
17732 tmp /= 2;
17733
17734 /* INSN with smaller offset goes first. */
17735 off_val = (int)(INTVAL (offset));
17736 if (off_val >= 0)
17737 tmp -= (off_val & 0xfffff);
17738 else
17739 tmp += ((- off_val) & 0xfffff);
17740
17741 *pri = tmp;
17742 return;
17743 }
17744
17745 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
17746 Adjust priority of sha1h instructions so they are scheduled before
17747 other SHA1 instructions. */
17748
17749 static int
17750 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
17751 {
17752 rtx x = PATTERN (insn);
17753
17754 if (GET_CODE (x) == SET)
17755 {
17756 x = SET_SRC (x);
17757
17758 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
17759 return priority + 10;
17760 }
17761
17762 return priority;
17763 }
17764
17765 /* Given OPERANDS of consecutive load/store, check if we can merge
17766 them into ldp/stp. LOAD is true if they are load instructions.
17767 MODE is the mode of memory operands. */
17768
17769 bool
17770 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
17771 machine_mode mode)
17772 {
17773 HOST_WIDE_INT offval_1, offval_2, msize;
17774 enum reg_class rclass_1, rclass_2;
17775 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
17776
17777 if (load)
17778 {
17779 mem_1 = operands[1];
17780 mem_2 = operands[3];
17781 reg_1 = operands[0];
17782 reg_2 = operands[2];
17783 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
17784 if (REGNO (reg_1) == REGNO (reg_2))
17785 return false;
17786 }
17787 else
17788 {
17789 mem_1 = operands[0];
17790 mem_2 = operands[2];
17791 reg_1 = operands[1];
17792 reg_2 = operands[3];
17793 }
17794
17795 /* The mems cannot be volatile. */
17796 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
17797 return false;
17798
17799 /* If we have SImode and slow unaligned ldp,
17800 check that the alignment is at least 8 bytes. */
17801 if (mode == SImode
17802 && (aarch64_tune_params.extra_tuning_flags
17803 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17804 && !optimize_size
17805 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17806 return false;
17807
17808 /* Check if the addresses are in the form of [base+offset]. */
17809 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17810 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17811 return false;
17812 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17813 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17814 return false;
17815
17816 /* Check if the bases are the same. */
17817 if (!rtx_equal_p (base_1, base_2))
17818 return false;
17819
17820 /* The operands must be of the same size. */
17821 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
17822 GET_MODE_SIZE (GET_MODE (mem_2))));
17823
17824 offval_1 = INTVAL (offset_1);
17825 offval_2 = INTVAL (offset_2);
17826 /* We should only be trying this for fixed-sized modes. There is no
17827 SVE LDP/STP instruction. */
17828 msize = GET_MODE_SIZE (mode).to_constant ();
17829 /* Check if the offsets are consecutive. */
17830 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
17831 return false;
17832
17833 /* Check if the addresses are clobbered by the loads. */
17834 if (load)
17835 {
17836 if (reg_mentioned_p (reg_1, mem_1))
17837 return false;
17838
17839 /* In increasing order, the last load can clobber the address. */
17840 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
17841 return false;
17842 }
17843
17844 /* One of the memory accesses must be a mempair operand.
17845 If it is not the first one, they need to be swapped by the
17846 peephole. */
17847 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
17848 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
17849 return false;
17850
17851 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17852 rclass_1 = FP_REGS;
17853 else
17854 rclass_1 = GENERAL_REGS;
17855
17856 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17857 rclass_2 = FP_REGS;
17858 else
17859 rclass_2 = GENERAL_REGS;
17860
17861 /* Check if the registers are of the same class. */
17862 if (rclass_1 != rclass_2)
17863 return false;
17864
17865 return true;
17866 }
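
/* For illustration (hypothetical operands rather than ones taken from a
   particular pattern): the two loads

     ldr x0, [x2, 8]
     ldr x1, [x2, 16]

   pass the checks above: the destination registers are distinct, both
   addresses use base X2, the offsets differ by exactly the mode size
   (8 bytes for DImode), both registers are GENERAL_REGS and the first
   address is a valid mem-pair operand, so the peephole may rewrite the
   pair as "ldp x0, x1, [x2, 8]".  If the second load instead used base
   X3, or an offset of 24, the function would return false.  */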
17867
17868 /* Given OPERANDS of consecutive load/store that can be merged,
17869 swap them if they are not in ascending order. */
17870 void
17871 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
17872 {
17873 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
17874 HOST_WIDE_INT offval_1, offval_2;
17875
17876 if (load)
17877 {
17878 mem_1 = operands[1];
17879 mem_2 = operands[3];
17880 }
17881 else
17882 {
17883 mem_1 = operands[0];
17884 mem_2 = operands[2];
17885 }
17886
17887 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17888 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17889
17890 offval_1 = INTVAL (offset_1);
17891 offval_2 = INTVAL (offset_2);
17892
17893 if (offval_1 > offval_2)
17894 {
17895 /* Irrespective of whether this is a load or a store,
17896 we do the same swap. */
17897 std::swap (operands[0], operands[2]);
17898 std::swap (operands[1], operands[3]);
17899 }
17900 }
17901
17902 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17903 comparison between the two. */
17904 int
17905 aarch64_host_wide_int_compare (const void *x, const void *y)
17906 {
17907 return wi::cmps (* ((const HOST_WIDE_INT *) x),
17908 * ((const HOST_WIDE_INT *) y));
17909 }
17910
17911 /* Taking X and Y to be pairs of RTX operands, each pair consisting of
17912 a MEM rtx and a REG rtx, compare the address offsets of the two
17913 MEM operands.
17914
17915 Return:
17916
17917 1 iff offset (X) > offset (Y)
17918 0 iff offset (X) == offset (Y)
17919 -1 iff offset (X) < offset (Y) */
17920 int
17921 aarch64_ldrstr_offset_compare (const void *x, const void *y)
17922 {
17923 const rtx * operands_1 = (const rtx *) x;
17924 const rtx * operands_2 = (const rtx *) y;
17925 rtx mem_1, mem_2, base, offset_1, offset_2;
17926
17927 if (MEM_P (operands_1[0]))
17928 mem_1 = operands_1[0];
17929 else
17930 mem_1 = operands_1[1];
17931
17932 if (MEM_P (operands_2[0]))
17933 mem_2 = operands_2[0];
17934 else
17935 mem_2 = operands_2[1];
17936
17937 /* Extract the offsets. */
17938 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17939 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17940
17941 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17942
17943 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17944 }
17945
17946 /* Given OPERANDS of consecutive load/store, check if we can merge
17947 them into ldp/stp by adjusting the offset. LOAD is true if they
17948 are load instructions. MODE is the mode of memory operands.
17949
17950 Given the following consecutive stores:
17951
17952 str w1, [xb, 0x100]
17953 str w1, [xb, 0x104]
17954 str w1, [xb, 0x108]
17955 str w1, [xb, 0x10c]
17956
17957 Though the offsets are out of the range supported by stp, we can
17958 still pair them after adjusting the offset, like:
17959
17960 add scratch, xb, 0x100
17961 stp w1, w1, [scratch]
17962 stp w1, w1, [scratch, 0x8]
17963
17964 The peephole patterns detecting this opportunity should guarantee
17965 the scratch register is available. */
17966
17967 bool
17968 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17969 scalar_mode mode)
17970 {
17971 const int num_insns = 4;
17972 enum reg_class rclass;
17973 HOST_WIDE_INT offvals[num_insns], msize;
17974 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
17975
17976 if (load)
17977 {
17978 for (int i = 0; i < num_insns; i++)
17979 {
17980 reg[i] = operands[2 * i];
17981 mem[i] = operands[2 * i + 1];
17982
17983 gcc_assert (REG_P (reg[i]));
17984 }
17985
17986 /* Do not attempt to merge the loads if the loads clobber each other. */
17987 for (int i = 0; i < 8; i += 2)
17988 for (int j = i + 2; j < 8; j += 2)
17989 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17990 return false;
17991 }
17992 else
17993 for (int i = 0; i < num_insns; i++)
17994 {
17995 mem[i] = operands[2 * i];
17996 reg[i] = operands[2 * i + 1];
17997 }
17998
17999 /* Skip if memory operand is by itself valid for ldp/stp. */
18000 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
18001 return false;
18002
18003 for (int i = 0; i < num_insns; i++)
18004 {
18005 /* The mems cannot be volatile. */
18006 if (MEM_VOLATILE_P (mem[i]))
18007 return false;
18008
18009 /* Check if the addresses are in the form of [base+offset]. */
18010 extract_base_offset_in_addr (mem[i], base + i, offset + i);
18011 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
18012 return false;
18013 }
18014
18015 /* Check if the registers are of the same class. */
18016 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
18017 ? FP_REGS : GENERAL_REGS;
18018
18019 for (int i = 1; i < num_insns; i++)
18020 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
18021 {
18022 if (rclass != FP_REGS)
18023 return false;
18024 }
18025 else
18026 {
18027 if (rclass != GENERAL_REGS)
18028 return false;
18029 }
18030
18031 /* Only the last register in the order in which they occur
18032 may be clobbered by the load. */
18033 if (rclass == GENERAL_REGS && load)
18034 for (int i = 0; i < num_insns - 1; i++)
18035 if (reg_mentioned_p (reg[i], mem[i]))
18036 return false;
18037
18038 /* Check if the bases are the same. */
18039 for (int i = 0; i < num_insns - 1; i++)
18040 if (!rtx_equal_p (base[i], base[i + 1]))
18041 return false;
18042
18043 for (int i = 0; i < num_insns; i++)
18044 offvals[i] = INTVAL (offset[i]);
18045
18046 msize = GET_MODE_SIZE (mode);
18047
18048 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18049 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18050 aarch64_host_wide_int_compare);
18051
18052 if (!(offvals[1] == offvals[0] + msize
18053 && offvals[3] == offvals[2] + msize))
18054 return false;
18055
18056 /* Check that the offsets are within range of each other. The ldp/stp
18057 instructions have 7-bit immediate offsets, so use 0x80. */
18058 if (offvals[2] - offvals[0] >= msize * 0x80)
18059 return false;
18060
18061 /* The offsets must be aligned with respect to each other. */
18062 if (offvals[0] % msize != offvals[2] % msize)
18063 return false;
18064
18065 /* If we have SImode and slow unaligned ldp,
18066 check that the alignment is at least 8 bytes. */
18067 if (mode == SImode
18068 && (aarch64_tune_params.extra_tuning_flags
18069 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18070 && !optimize_size
18071 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18072 return false;
18073
18074 return true;
18075 }
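
/* Relating this to the str example in the comment before this function
   (SImode, so MSIZE == 4): the offsets 0x100, 0x104, 0x108 and 0x10c
   sort into two consecutive pairs (0x104 == 0x100 + 4 and
   0x10c == 0x108 + 4), the distance between the pairs (8) is far below
   MSIZE * 0x80, both pairs share the same alignment, and 0x100 is
   outside the 7-bit scaled range of a single stp, so the function
   returns true and aarch64_gen_adjusted_ldpstp below can do the actual
   rewriting.  */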
18076
18077 /* Given OPERANDS of consecutive load/store, this function pairs them
18078 into LDP/STP after adjusting the offset. It depends on the fact
18079 that the operands can be sorted so the offsets are correct for STP.
18080 MODE is the mode of the memory operands. CODE is the rtl operator
18081 which should be applied to all memory operands; it is SIGN_EXTEND,
18082 ZERO_EXTEND or UNKNOWN. */
18083
18084 bool
18085 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
18086 scalar_mode mode, RTX_CODE code)
18087 {
18088 rtx base, offset_1, offset_3, t1, t2;
18089 rtx mem_1, mem_2, mem_3, mem_4;
18090 rtx temp_operands[8];
18091 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
18092 stp_off_upper_limit, stp_off_lower_limit, msize;
18093
18094 /* We make changes on a copy as we may still bail out. */
18095 for (int i = 0; i < 8; i ++)
18096 temp_operands[i] = operands[i];
18097
18098 /* Sort the operands. */
18099 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
18100
18101 if (load)
18102 {
18103 mem_1 = temp_operands[1];
18104 mem_2 = temp_operands[3];
18105 mem_3 = temp_operands[5];
18106 mem_4 = temp_operands[7];
18107 }
18108 else
18109 {
18110 mem_1 = temp_operands[0];
18111 mem_2 = temp_operands[2];
18112 mem_3 = temp_operands[4];
18113 mem_4 = temp_operands[6];
18114 gcc_assert (code == UNKNOWN);
18115 }
18116
18117 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18118 extract_base_offset_in_addr (mem_3, &base, &offset_3);
18119 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
18120 && offset_3 != NULL_RTX);
18121
18122 /* Adjust the offset so it can fit in an LDP/STP instruction. */
18123 msize = GET_MODE_SIZE (mode);
18124 stp_off_upper_limit = msize * (0x40 - 1);
18125 stp_off_lower_limit = - msize * 0x40;
18126
18127 off_val_1 = INTVAL (offset_1);
18128 off_val_3 = INTVAL (offset_3);
18129
18130 /* The base offset is optimally half way between the two STP/LDP offsets. */
18131 if (msize <= 4)
18132 base_off = (off_val_1 + off_val_3) / 2;
18133 else
18134 /* However, due to issues with negative LDP/STP offset generation for
18135 larger modes (DF, DI and vector modes), we must not use negative
18136 addresses smaller than 9 signed unadjusted bits can store. This
18137 provides the most range in this case. */
18138 base_off = off_val_1;
18139
18140 /* Adjust the base so that it is aligned with the addresses but still
18141 optimal. */
18142 if (base_off % msize != off_val_1 % msize)
18143 /* Fix the offset, bearing in mind we want to make it bigger not
18144 smaller. */
18145 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18146 else if (msize <= 4)
18147 /* The negative range of LDP/STP is one larger than the positive range. */
18148 base_off += msize;
18149
18150 /* Check if base offset is too big or too small. We can attempt to resolve
18151 this issue by setting it to the maximum value and seeing if the offsets
18152 still fit. */
18153 if (base_off >= 0x1000)
18154 {
18155 base_off = 0x1000 - 1;
18156 /* We must still make sure that the base offset is aligned with respect
18157 to the address. But it may not be made any bigger. */
18158 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18159 }
18160
18161 /* Likewise for the case where the base is too small. */
18162 if (base_off <= -0x1000)
18163 {
18164 base_off = -0x1000 + 1;
18165 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18166 }
18167
18168 /* Offset of the first STP/LDP. */
18169 new_off_1 = off_val_1 - base_off;
18170
18171 /* Offset of the second STP/LDP. */
18172 new_off_3 = off_val_3 - base_off;
18173
18174 /* The offsets must be within the range of the LDP/STP instructions. */
18175 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18176 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18177 return false;
18178
18179 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18180 new_off_1), true);
18181 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18182 new_off_1 + msize), true);
18183 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18184 new_off_3), true);
18185 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18186 new_off_3 + msize), true);
18187
18188 if (!aarch64_mem_pair_operand (mem_1, mode)
18189 || !aarch64_mem_pair_operand (mem_3, mode))
18190 return false;
18191
18192 if (code == ZERO_EXTEND)
18193 {
18194 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18195 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18196 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18197 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18198 }
18199 else if (code == SIGN_EXTEND)
18200 {
18201 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18202 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18203 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18204 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18205 }
18206
18207 if (load)
18208 {
18209 operands[0] = temp_operands[0];
18210 operands[1] = mem_1;
18211 operands[2] = temp_operands[2];
18212 operands[3] = mem_2;
18213 operands[4] = temp_operands[4];
18214 operands[5] = mem_3;
18215 operands[6] = temp_operands[6];
18216 operands[7] = mem_4;
18217 }
18218 else
18219 {
18220 operands[0] = mem_1;
18221 operands[1] = temp_operands[1];
18222 operands[2] = mem_2;
18223 operands[3] = temp_operands[3];
18224 operands[4] = mem_3;
18225 operands[5] = temp_operands[5];
18226 operands[6] = mem_4;
18227 operands[7] = temp_operands[7];
18228 }
18229
18230 /* Emit adjusting instruction. */
18231 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18232 /* Emit ldp/stp instructions. */
18233 t1 = gen_rtx_SET (operands[0], operands[1]);
18234 t2 = gen_rtx_SET (operands[2], operands[3]);
18235 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18236 t1 = gen_rtx_SET (operands[4], operands[5]);
18237 t2 = gen_rtx_SET (operands[6], operands[7]);
18238 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18239 return true;
18240 }
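
/* A rough trace of the arithmetic above for the SImode example quoted
   earlier (offsets 0x100 .. 0x10c, MSIZE == 4): BASE_OFF starts at the
   midpoint (0x100 + 0x108) / 2 == 0x104, which is already suitably
   aligned, and is then bumped by MSIZE to 0x108 to make use of the
   larger negative range.  That gives NEW_OFF_1 == -8 and
   NEW_OFF_3 == 0, both within the [-256, 252] stp range for SImode,
   so the emitted sequence is roughly

     add scratch, xb, 0x108
     stp w1, w1, [scratch, -8]
     stp w1, w1, [scratch]

   (register names purely illustrative).  */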
18241
18242 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
18243 it isn't worth branching around empty masked ops (including masked
18244 stores). */
18245
18246 static bool
18247 aarch64_empty_mask_is_expensive (unsigned)
18248 {
18249 return false;
18250 }
18251
18252 /* Return true if a pseudo register should be created and used to hold
18253 the GOT address for PIC code. */
18254
18255 bool
18256 aarch64_use_pseudo_pic_reg (void)
18257 {
18258 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18259 }
18260
18261 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18262
18263 static int
18264 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18265 {
18266 switch (XINT (x, 1))
18267 {
18268 case UNSPEC_GOTSMALLPIC:
18269 case UNSPEC_GOTSMALLPIC28K:
18270 case UNSPEC_GOTTINYPIC:
18271 return 0;
18272 default:
18273 break;
18274 }
18275
18276 return default_unspec_may_trap_p (x, flags);
18277 }
18278
18279
18280 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18281 return the log2 of that value. Otherwise return -1. */
18282
18283 int
18284 aarch64_fpconst_pow_of_2 (rtx x)
18285 {
18286 const REAL_VALUE_TYPE *r;
18287
18288 if (!CONST_DOUBLE_P (x))
18289 return -1;
18290
18291 r = CONST_DOUBLE_REAL_VALUE (x);
18292
18293 if (REAL_VALUE_NEGATIVE (*r)
18294 || REAL_VALUE_ISNAN (*r)
18295 || REAL_VALUE_ISINF (*r)
18296 || !real_isinteger (r, DFmode))
18297 return -1;
18298
18299 return exact_log2 (real_to_integer (r));
18300 }
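
/* For example (values chosen purely for illustration): 1.0 yields 0,
   4.0 yields 2 and 32.0 yields 5, whereas -8.0, 0.75, 6.0, infinities
   and NaNs all yield -1.  */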
18301
18302 /* If X is a vector of equal CONST_DOUBLE values and that value is
18303 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18304
18305 int
18306 aarch64_vec_fpconst_pow_of_2 (rtx x)
18307 {
18308 int nelts;
18309 if (GET_CODE (x) != CONST_VECTOR
18310 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18311 return -1;
18312
18313 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18314 return -1;
18315
18316 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18317 if (firstval <= 0)
18318 return -1;
18319
18320 for (int i = 1; i < nelts; i++)
18321 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18322 return -1;
18323
18324 return firstval;
18325 }
18326
18327 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18328 to float.
18329
18330 __fp16 always promotes through this hook.
18331 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18332 through the generic excess precision logic rather than here. */
18333
18334 static tree
18335 aarch64_promoted_type (const_tree t)
18336 {
18337 if (SCALAR_FLOAT_TYPE_P (t)
18338 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18339 return float_type_node;
18340
18341 return NULL_TREE;
18342 }
18343
18344 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18345
18346 static bool
18347 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18348 optimization_type opt_type)
18349 {
18350 switch (op)
18351 {
18352 case rsqrt_optab:
18353 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18354
18355 default:
18356 return true;
18357 }
18358 }
18359
18360 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18361
18362 static unsigned int
18363 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18364 int *offset)
18365 {
18366 /* Polynomial invariant 1 == (VG / 2) - 1. */
18367 gcc_assert (i == 1);
18368 *factor = 2;
18369 *offset = 1;
18370 return AARCH64_DWARF_VG;
18371 }
18372
18373 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
18374 if MODE is HFmode, and punt to the generic implementation otherwise. */
18375
18376 static bool
18377 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18378 {
18379 return (mode == HFmode
18380 ? true
18381 : default_libgcc_floating_mode_supported_p (mode));
18382 }
18383
18384 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18385 if MODE is HFmode, and punt to the generic implementation otherwise. */
18386
18387 static bool
18388 aarch64_scalar_mode_supported_p (scalar_mode mode)
18389 {
18390 return (mode == HFmode
18391 ? true
18392 : default_scalar_mode_supported_p (mode));
18393 }
18394
18395 /* Set the value of FLT_EVAL_METHOD.
18396 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18397
18398 0: evaluate all operations and constants, whose semantic type has at
18399 most the range and precision of type float, to the range and
18400 precision of float; evaluate all other operations and constants to
18401 the range and precision of the semantic type;
18402
18403 N, where _FloatN is a supported interchange floating type
18404 evaluate all operations and constants, whose semantic type has at
18405 most the range and precision of _FloatN type, to the range and
18406 precision of the _FloatN type; evaluate all other operations and
18407 constants to the range and precision of the semantic type;
18408
18409 If we have the ARMv8.2-A extensions then we support _Float16 in native
18410 precision, so we should set this to 16. Otherwise, we support the type,
18411 but want to evaluate expressions in float precision, so set this to
18412 0. */
18413
18414 static enum flt_eval_method
18415 aarch64_excess_precision (enum excess_precision_type type)
18416 {
18417 switch (type)
18418 {
18419 case EXCESS_PRECISION_TYPE_FAST:
18420 case EXCESS_PRECISION_TYPE_STANDARD:
18421 /* We can calculate either in 16-bit range and precision or
18422 32-bit range and precision. Make that decision based on whether
18423 we have native support for the ARMv8.2-A 16-bit floating-point
18424 instructions or not. */
18425 return (TARGET_FP_F16INST
18426 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18427 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18428 case EXCESS_PRECISION_TYPE_IMPLICIT:
18429 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18430 default:
18431 gcc_unreachable ();
18432 }
18433 return FLT_EVAL_METHOD_UNPREDICTABLE;
18434 }
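
/* An illustrative consequence (hypothetical user code, not part of this
   file): given

     __fp16   a, b;
     _Float16 c, d;

   the expression a + b is always evaluated in float, because __fp16
   promotes through aarch64_promoted_type above.  c + d is evaluated in
   float when the ARMv8.2-A FP16 instructions are unavailable
   (FLT_EVAL_METHOD == 0) but stays in _Float16 precision when they are
   available (FLT_EVAL_METHOD == 16).  */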
18435
18436 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18437 scheduled for speculative execution. Reject the long-running division
18438 and square-root instructions. */
18439
18440 static bool
18441 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18442 {
18443 switch (get_attr_type (insn))
18444 {
18445 case TYPE_SDIV:
18446 case TYPE_UDIV:
18447 case TYPE_FDIVS:
18448 case TYPE_FDIVD:
18449 case TYPE_FSQRTS:
18450 case TYPE_FSQRTD:
18451 case TYPE_NEON_FP_SQRT_S:
18452 case TYPE_NEON_FP_SQRT_D:
18453 case TYPE_NEON_FP_SQRT_S_Q:
18454 case TYPE_NEON_FP_SQRT_D_Q:
18455 case TYPE_NEON_FP_DIV_S:
18456 case TYPE_NEON_FP_DIV_D:
18457 case TYPE_NEON_FP_DIV_S_Q:
18458 case TYPE_NEON_FP_DIV_D_Q:
18459 return false;
18460 default:
18461 return true;
18462 }
18463 }
18464
18465 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18466
18467 static int
18468 aarch64_compute_pressure_classes (reg_class *classes)
18469 {
18470 int i = 0;
18471 classes[i++] = GENERAL_REGS;
18472 classes[i++] = FP_REGS;
18473 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18474 registers need to go in PR_LO_REGS at some point during their
18475 lifetime. Splitting it into two halves has the effect of making
18476 all predicates count against PR_LO_REGS, so that we try whenever
18477 possible to restrict the number of live predicates to 8. This
18478 greatly reduces the amount of spilling in certain loops. */
18479 classes[i++] = PR_LO_REGS;
18480 classes[i++] = PR_HI_REGS;
18481 return i;
18482 }
18483
18484 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18485
18486 static bool
18487 aarch64_can_change_mode_class (machine_mode from,
18488 machine_mode to, reg_class_t)
18489 {
18490 if (BYTES_BIG_ENDIAN)
18491 {
18492 bool from_sve_p = aarch64_sve_data_mode_p (from);
18493 bool to_sve_p = aarch64_sve_data_mode_p (to);
18494
18495 /* Don't allow changes between SVE data modes and non-SVE modes.
18496 See the comment at the head of aarch64-sve.md for details. */
18497 if (from_sve_p != to_sve_p)
18498 return false;
18499
18500 /* Don't allow changes in element size: lane 0 of the new vector
18501 would not then be lane 0 of the old vector. See the comment
18502 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18503 description.
18504
18505 In the worst case, this forces a register to be spilled in
18506 one mode and reloaded in the other, which handles the
18507 endianness correctly. */
18508 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18509 return false;
18510 }
18511 return true;
18512 }
18513
18514 /* Implement TARGET_EARLY_REMAT_MODES. */
18515
18516 static void
18517 aarch64_select_early_remat_modes (sbitmap modes)
18518 {
18519 /* SVE values are not normally live across a call, so it should be
18520 worth doing early rematerialization even in VL-specific mode. */
18521 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18522 {
18523 machine_mode mode = (machine_mode) i;
18524 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18525 if (vec_flags & VEC_ANY_SVE)
18526 bitmap_set_bit (modes, i);
18527 }
18528 }
18529
18530 /* Override the default target speculation_safe_value. */
18531 static rtx
18532 aarch64_speculation_safe_value (machine_mode mode,
18533 rtx result, rtx val, rtx failval)
18534 {
18535 /* Maybe we should warn if falling back to hard barriers. They are
18536 likely to be noticeably more expensive than the alternative below. */
18537 if (!aarch64_track_speculation)
18538 return default_speculation_safe_value (mode, result, val, failval);
18539
18540 if (!REG_P (val))
18541 val = copy_to_mode_reg (mode, val);
18542
18543 if (!aarch64_reg_or_zero (failval, mode))
18544 failval = copy_to_mode_reg (mode, failval);
18545
18546 emit_insn (gen_despeculate_copy (mode, result, val, failval));
18547 return result;
18548 }
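
/* A source-level sketch (hypothetical user code) of what reaches this
   hook through __builtin_speculation_safe_value:

     int
     load_checked (int *array, unsigned int idx, unsigned int bound)
     {
       if (idx < bound)
         return array[__builtin_speculation_safe_value (idx, 0)];
       return 0;
     }

   With -mtrack-speculation the copy above becomes a conditional select
   guarded by the speculation tracker (see the despeculate_copy
   patterns); otherwise we fall back to the generic hard speculation
   barrier.  */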
18549
18550 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18551 Look into the tuning structure for an estimate.
18552 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18553 Advanced SIMD 128 bits. */
18554
18555 static HOST_WIDE_INT
18556 aarch64_estimated_poly_value (poly_int64 val)
18557 {
18558 enum aarch64_sve_vector_bits_enum width_source
18559 = aarch64_tune_params.sve_width;
18560
18561 /* If we still don't have an estimate, use the default. */
18562 if (width_source == SVE_SCALABLE)
18563 return default_estimated_poly_value (val);
18564
18565 HOST_WIDE_INT over_128 = width_source - 128;
18566 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
18567 }
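
/* A worked example (numbers for illustration only): if the selected
   tuning sets sve_width to SVE_256, then for an SVE byte vector whose
   size is the poly_int64 16 + 16x, OVER_128 is 256 - 128 == 128 and
   the estimate is 16 + 16 * 128 / 128 == 32 bytes, i.e. a 256-bit
   vector.  When sve_width is SVE_SCALABLE we defer to the generic
   estimate instead.  */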
18568
18569
18570 /* Return true for types that could be supported as SIMD return or
18571 argument types. */
18572
18573 static bool
18574 supported_simd_type (tree t)
18575 {
18576 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
18577 {
18578 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
18579 return s == 1 || s == 2 || s == 4 || s == 8;
18580 }
18581 return false;
18582 }
18583
18584 /* Return true for types that currently are supported as SIMD return
18585 or argument types. */
18586
18587 static bool
18588 currently_supported_simd_type (tree t, tree b)
18589 {
18590 if (COMPLEX_FLOAT_TYPE_P (t))
18591 return false;
18592
18593 if (TYPE_SIZE (t) != TYPE_SIZE (b))
18594 return false;
18595
18596 return supported_simd_type (t);
18597 }
18598
18599 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
18600
18601 static int
18602 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
18603 struct cgraph_simd_clone *clonei,
18604 tree base_type, int num)
18605 {
18606 tree t, ret_type, arg_type;
18607 unsigned int elt_bits, vec_bits, count;
18608
18609 if (!TARGET_SIMD)
18610 return 0;
18611
18612 if (clonei->simdlen
18613 && (clonei->simdlen < 2
18614 || clonei->simdlen > 1024
18615 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
18616 {
18617 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18618 "unsupported simdlen %d", clonei->simdlen);
18619 return 0;
18620 }
18621
18622 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
18623 if (TREE_CODE (ret_type) != VOID_TYPE
18624 && !currently_supported_simd_type (ret_type, base_type))
18625 {
18626 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
18627 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18628 "GCC does not currently support mixed size types "
18629 "for %<simd%> functions");
18630 else if (supported_simd_type (ret_type))
18631 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18632 "GCC does not currently support return type %qT "
18633 "for %<simd%> functions", ret_type);
18634 else
18635 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18636 "unsupported return type %qT for %<simd%> functions",
18637 ret_type);
18638 return 0;
18639 }
18640
18641 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
18642 {
18643 arg_type = TREE_TYPE (t);
18644
18645 if (!currently_supported_simd_type (arg_type, base_type))
18646 {
18647 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
18648 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18649 "GCC does not currently support mixed size types "
18650 "for %<simd%> functions");
18651 else
18652 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18653 "GCC does not currently support argument type %qT "
18654 "for %<simd%> functions", arg_type);
18655 return 0;
18656 }
18657 }
18658
18659 clonei->vecsize_mangle = 'n';
18660 clonei->mask_mode = VOIDmode;
18661 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
18662 if (clonei->simdlen == 0)
18663 {
18664 count = 2;
18665 vec_bits = (num == 0 ? 64 : 128);
18666 clonei->simdlen = vec_bits / elt_bits;
18667 }
18668 else
18669 {
18670 count = 1;
18671 vec_bits = clonei->simdlen * elt_bits;
18672 if (vec_bits != 64 && vec_bits != 128)
18673 {
18674 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18675 "GCC does not currently support simdlen %d for type %qT",
18676 clonei->simdlen, base_type);
18677 return 0;
18678 }
18679 }
18680 clonei->vecsize_int = vec_bits;
18681 clonei->vecsize_float = vec_bits;
18682 return count;
18683 }
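
/* For example (hypothetical declaration, not taken from this file):

     #pragma omp declare simd
     float scale (float x);

   Here BASE_TYPE is float, so ELT_BITS == 32.  With no explicit
   simdlen the hook is invoked twice and yields two Advanced SIMD
   clones, one using 64-bit vectors (simdlen 2) and one using 128-bit
   vectors (simdlen 4).  An explicit simdlen (4) clause produces just
   the single 128-bit clone, while simdlen (8) is rejected with a
   warning because 8 * 32 bits fits in neither 64 nor 128 bits.  */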
18684
18685 /* Implement TARGET_SIMD_CLONE_ADJUST. */
18686
18687 static void
18688 aarch64_simd_clone_adjust (struct cgraph_node *node)
18689 {
18690 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
18691 use the correct ABI. */
18692
18693 tree t = TREE_TYPE (node->decl);
18694 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
18695 TYPE_ATTRIBUTES (t));
18696 }
18697
18698 /* Implement TARGET_SIMD_CLONE_USABLE. */
18699
18700 static int
18701 aarch64_simd_clone_usable (struct cgraph_node *node)
18702 {
18703 switch (node->simdclone->vecsize_mangle)
18704 {
18705 case 'n':
18706 if (!TARGET_SIMD)
18707 return -1;
18708 return 0;
18709 default:
18710 gcc_unreachable ();
18711 }
18712 }
18713
18714 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
18715
18716 static int
18717 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
18718 {
18719 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
18720 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
18721 return 0;
18722 return 1;
18723 }
18724
18725 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
18726
18727 static const char *
18728 aarch64_get_multilib_abi_name (void)
18729 {
18730 if (TARGET_BIG_END)
18731 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
18732 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
18733 }
18734
18735 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
18736 global-variable-based guard, use the default; otherwise
18737 return a null tree. */
18738 static tree
18739 aarch64_stack_protect_guard (void)
18740 {
18741 if (aarch64_stack_protector_guard == SSP_GLOBAL)
18742 return default_stack_protect_guard ();
18743
18744 return NULL_TREE;
18745 }
18746
18747
18748 /* Target-specific selftests. */
18749
18750 #if CHECKING_P
18751
18752 namespace selftest {
18753
18754 /* Selftest for the RTL loader.
18755 Verify that the RTL loader copes with a dump from
18756 print_rtx_function. This is essentially just a test that class
18757 function_reader can handle a real dump, but it also verifies
18758 that lookup_reg_by_dump_name correctly handles hard regs.
18759 The presence of hard reg names in the dump means that the test is
18760 target-specific, hence it is in this file. */
18761
18762 static void
18763 aarch64_test_loading_full_dump ()
18764 {
18765 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
18766
18767 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
18768
18769 rtx_insn *insn_1 = get_insn_by_uid (1);
18770 ASSERT_EQ (NOTE, GET_CODE (insn_1));
18771
18772 rtx_insn *insn_15 = get_insn_by_uid (15);
18773 ASSERT_EQ (INSN, GET_CODE (insn_15));
18774 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
18775
18776 /* Verify crtl->return_rtx. */
18777 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
18778 ASSERT_EQ (0, REGNO (crtl->return_rtx));
18779 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
18780 }
18781
18782 /* Run all target-specific selftests. */
18783
18784 static void
18785 aarch64_run_selftests (void)
18786 {
18787 aarch64_test_loading_full_dump ();
18788 }
18789
18790 } // namespace selftest
18791
18792 #endif /* #if CHECKING_P */
18793
18794 #undef TARGET_STACK_PROTECT_GUARD
18795 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
18796
18797 #undef TARGET_ADDRESS_COST
18798 #define TARGET_ADDRESS_COST aarch64_address_cost
18799
18800 /* This hook determines whether unnamed bitfields affect the alignment
18801 of the containing structure. The hook returns true if the structure
18802 should inherit the alignment requirements of an unnamed bitfield's
18803 type. */
18804 #undef TARGET_ALIGN_ANON_BITFIELD
18805 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
18806
18807 #undef TARGET_ASM_ALIGNED_DI_OP
18808 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
18809
18810 #undef TARGET_ASM_ALIGNED_HI_OP
18811 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
18812
18813 #undef TARGET_ASM_ALIGNED_SI_OP
18814 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
18815
18816 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
18817 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
18818 hook_bool_const_tree_hwi_hwi_const_tree_true
18819
18820 #undef TARGET_ASM_FILE_START
18821 #define TARGET_ASM_FILE_START aarch64_start_file
18822
18823 #undef TARGET_ASM_OUTPUT_MI_THUNK
18824 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
18825
18826 #undef TARGET_ASM_SELECT_RTX_SECTION
18827 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
18828
18829 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
18830 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
18831
18832 #undef TARGET_BUILD_BUILTIN_VA_LIST
18833 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
18834
18835 #undef TARGET_CALLEE_COPIES
18836 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
18837
18838 #undef TARGET_CAN_ELIMINATE
18839 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
18840
18841 #undef TARGET_CAN_INLINE_P
18842 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
18843
18844 #undef TARGET_CANNOT_FORCE_CONST_MEM
18845 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
18846
18847 #undef TARGET_CASE_VALUES_THRESHOLD
18848 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
18849
18850 #undef TARGET_CONDITIONAL_REGISTER_USAGE
18851 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
18852
18853 /* Only the least significant bit is used for initialization guard
18854 variables. */
18855 #undef TARGET_CXX_GUARD_MASK_BIT
18856 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
18857
18858 #undef TARGET_C_MODE_FOR_SUFFIX
18859 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
18860
18861 #ifdef TARGET_BIG_ENDIAN_DEFAULT
18862 #undef TARGET_DEFAULT_TARGET_FLAGS
18863 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
18864 #endif
18865
18866 #undef TARGET_CLASS_MAX_NREGS
18867 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
18868
18869 #undef TARGET_BUILTIN_DECL
18870 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
18871
18872 #undef TARGET_BUILTIN_RECIPROCAL
18873 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
18874
18875 #undef TARGET_C_EXCESS_PRECISION
18876 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
18877
18878 #undef TARGET_EXPAND_BUILTIN
18879 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
18880
18881 #undef TARGET_EXPAND_BUILTIN_VA_START
18882 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
18883
18884 #undef TARGET_FOLD_BUILTIN
18885 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
18886
18887 #undef TARGET_FUNCTION_ARG
18888 #define TARGET_FUNCTION_ARG aarch64_function_arg
18889
18890 #undef TARGET_FUNCTION_ARG_ADVANCE
18891 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
18892
18893 #undef TARGET_FUNCTION_ARG_BOUNDARY
18894 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
18895
18896 #undef TARGET_FUNCTION_ARG_PADDING
18897 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
18898
18899 #undef TARGET_GET_RAW_RESULT_MODE
18900 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
18901 #undef TARGET_GET_RAW_ARG_MODE
18902 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
18903
18904 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
18905 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
18906
18907 #undef TARGET_FUNCTION_VALUE
18908 #define TARGET_FUNCTION_VALUE aarch64_function_value
18909
18910 #undef TARGET_FUNCTION_VALUE_REGNO_P
18911 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
18912
18913 #undef TARGET_GIMPLE_FOLD_BUILTIN
18914 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
18915
18916 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
18917 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
18918
18919 #undef TARGET_INIT_BUILTINS
18920 #define TARGET_INIT_BUILTINS aarch64_init_builtins
18921
18922 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
18923 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
18924 aarch64_ira_change_pseudo_allocno_class
18925
18926 #undef TARGET_LEGITIMATE_ADDRESS_P
18927 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
18928
18929 #undef TARGET_LEGITIMATE_CONSTANT_P
18930 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
18931
18932 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
18933 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
18934 aarch64_legitimize_address_displacement
18935
18936 #undef TARGET_LIBGCC_CMP_RETURN_MODE
18937 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
18938
18939 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
18940 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
18941 aarch64_libgcc_floating_mode_supported_p
18942
18943 #undef TARGET_MANGLE_TYPE
18944 #define TARGET_MANGLE_TYPE aarch64_mangle_type
18945
18946 #undef TARGET_MEMORY_MOVE_COST
18947 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
18948
18949 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
18950 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
18951
18952 #undef TARGET_MUST_PASS_IN_STACK
18953 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
18954
18955 /* This target hook should return true if accesses to volatile bitfields
18956 should use the narrowest mode possible. It should return false if these
18957 accesses should use the bitfield container type. */
18958 #undef TARGET_NARROW_VOLATILE_BITFIELD
18959 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
18960
18961 #undef TARGET_OPTION_OVERRIDE
18962 #define TARGET_OPTION_OVERRIDE aarch64_override_options
18963
18964 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
18965 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
18966 aarch64_override_options_after_change
18967
18968 #undef TARGET_OPTION_SAVE
18969 #define TARGET_OPTION_SAVE aarch64_option_save
18970
18971 #undef TARGET_OPTION_RESTORE
18972 #define TARGET_OPTION_RESTORE aarch64_option_restore
18973
18974 #undef TARGET_OPTION_PRINT
18975 #define TARGET_OPTION_PRINT aarch64_option_print
18976
18977 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
18978 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
18979
18980 #undef TARGET_SET_CURRENT_FUNCTION
18981 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
18982
18983 #undef TARGET_PASS_BY_REFERENCE
18984 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
18985
18986 #undef TARGET_PREFERRED_RELOAD_CLASS
18987 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
18988
18989 #undef TARGET_SCHED_REASSOCIATION_WIDTH
18990 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
18991
18992 #undef TARGET_PROMOTED_TYPE
18993 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
18994
18995 #undef TARGET_SECONDARY_RELOAD
18996 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
18997
18998 #undef TARGET_SHIFT_TRUNCATION_MASK
18999 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
19000
19001 #undef TARGET_SETUP_INCOMING_VARARGS
19002 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
19003
19004 #undef TARGET_STRUCT_VALUE_RTX
19005 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
19006
19007 #undef TARGET_REGISTER_MOVE_COST
19008 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
19009
19010 #undef TARGET_RETURN_IN_MEMORY
19011 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
19012
19013 #undef TARGET_RETURN_IN_MSB
19014 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
19015
19016 #undef TARGET_RTX_COSTS
19017 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
19018
19019 #undef TARGET_SCALAR_MODE_SUPPORTED_P
19020 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
19021
19022 #undef TARGET_SCHED_ISSUE_RATE
19023 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
19024
19025 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
19026 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
19027 aarch64_sched_first_cycle_multipass_dfa_lookahead
19028
19029 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
19030 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
19031 aarch64_first_cycle_multipass_dfa_lookahead_guard
19032
19033 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
19034 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
19035 aarch64_get_separate_components
19036
19037 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
19038 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
19039 aarch64_components_for_bb
19040
19041 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
19042 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
19043 aarch64_disqualify_components
19044
19045 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
19046 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
19047 aarch64_emit_prologue_components
19048
19049 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
19050 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
19051 aarch64_emit_epilogue_components
19052
19053 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
19054 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
19055 aarch64_set_handled_components
19056
19057 #undef TARGET_TRAMPOLINE_INIT
19058 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
19059
19060 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
19061 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
19062
19063 #undef TARGET_VECTOR_MODE_SUPPORTED_P
19064 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
19065
19066 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
19067 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
19068 aarch64_builtin_support_vector_misalignment
19069
19070 #undef TARGET_ARRAY_MODE
19071 #define TARGET_ARRAY_MODE aarch64_array_mode
19072
19073 #undef TARGET_ARRAY_MODE_SUPPORTED_P
19074 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
19075
19076 #undef TARGET_VECTORIZE_ADD_STMT_COST
19077 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
19078
19079 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
19080 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
19081 aarch64_builtin_vectorization_cost
19082
19083 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
19084 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
19085
19086 #undef TARGET_VECTORIZE_BUILTINS
19087 #define TARGET_VECTORIZE_BUILTINS
19088
19089 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
19090 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
19091 aarch64_builtin_vectorized_function
19092
19093 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
19094 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
19095 aarch64_autovectorize_vector_sizes
19096
19097 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
19098 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
19099 aarch64_atomic_assign_expand_fenv
19100
19101 /* Section anchor support. */
19102
19103 #undef TARGET_MIN_ANCHOR_OFFSET
19104 #define TARGET_MIN_ANCHOR_OFFSET -256
19105
19106 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
19107 byte offset; we can do much more for larger data types, but have no way
19108 to determine the size of the access. We assume accesses are aligned. */
19109 #undef TARGET_MAX_ANCHOR_OFFSET
19110 #define TARGET_MAX_ANCHOR_OFFSET 4095
19111
19112 #undef TARGET_VECTOR_ALIGNMENT
19113 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
19114
19115 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
19116 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
19117 aarch64_vectorize_preferred_vector_alignment
19118 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
19119 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
19120 aarch64_simd_vector_alignment_reachable
19121
19122 /* vec_perm support. */
19123
19124 #undef TARGET_VECTORIZE_VEC_PERM_CONST
19125 #define TARGET_VECTORIZE_VEC_PERM_CONST \
19126 aarch64_vectorize_vec_perm_const
19127
19128 #undef TARGET_VECTORIZE_GET_MASK_MODE
19129 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
19130 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
19131 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
19132 aarch64_empty_mask_is_expensive
19133 #undef TARGET_PREFERRED_ELSE_VALUE
19134 #define TARGET_PREFERRED_ELSE_VALUE \
19135 aarch64_preferred_else_value
19136
19137 #undef TARGET_INIT_LIBFUNCS
19138 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
19139
19140 #undef TARGET_FIXED_CONDITION_CODE_REGS
19141 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
19142
19143 #undef TARGET_FLAGS_REGNUM
19144 #define TARGET_FLAGS_REGNUM CC_REGNUM
19145
19146 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
19147 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
19148
19149 #undef TARGET_ASAN_SHADOW_OFFSET
19150 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
19151
19152 #undef TARGET_LEGITIMIZE_ADDRESS
19153 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
19154
19155 #undef TARGET_SCHED_CAN_SPECULATE_INSN
19156 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
19157
19158 #undef TARGET_CAN_USE_DOLOOP_P
19159 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
19160
19161 #undef TARGET_SCHED_ADJUST_PRIORITY
19162 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
19163
19164 #undef TARGET_SCHED_MACRO_FUSION_P
19165 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
19166
19167 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
19168 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
19169
19170 #undef TARGET_SCHED_FUSION_PRIORITY
19171 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
19172
19173 #undef TARGET_UNSPEC_MAY_TRAP_P
19174 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
19175
19176 #undef TARGET_USE_PSEUDO_PIC_REG
19177 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
19178
19179 #undef TARGET_PRINT_OPERAND
19180 #define TARGET_PRINT_OPERAND aarch64_print_operand
19181
19182 #undef TARGET_PRINT_OPERAND_ADDRESS
19183 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
19184
19185 #undef TARGET_OPTAB_SUPPORTED_P
19186 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
19187
19188 #undef TARGET_OMIT_STRUCT_RETURN_REG
19189 #define TARGET_OMIT_STRUCT_RETURN_REG true
19190
19191 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
19192 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
19193 aarch64_dwarf_poly_indeterminate_value
19194
19195 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
19196 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
19197 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
19198
19199 #undef TARGET_HARD_REGNO_NREGS
19200 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
19201 #undef TARGET_HARD_REGNO_MODE_OK
19202 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
19203
19204 #undef TARGET_MODES_TIEABLE_P
19205 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
19206
19207 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
19208 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
19209 aarch64_hard_regno_call_part_clobbered
19210
19211 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
19212 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
19213 aarch64_remove_extra_call_preserved_regs
19214
19215 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
19216 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
19217 aarch64_return_call_with_max_clobbers
19218
19219 #undef TARGET_CONSTANT_ALIGNMENT
19220 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
19221
19222 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
19223 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
19224 aarch64_stack_clash_protection_alloca_probe_range
19225
19226 #undef TARGET_COMPUTE_PRESSURE_CLASSES
19227 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
19228
19229 #undef TARGET_CAN_CHANGE_MODE_CLASS
19230 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
19231
19232 #undef TARGET_SELECT_EARLY_REMAT_MODES
19233 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
19234
19235 #undef TARGET_SPECULATION_SAFE_VALUE
19236 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
19237
19238 #undef TARGET_ESTIMATED_POLY_VALUE
19239 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
19240
19241 #undef TARGET_ATTRIBUTE_TABLE
19242 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
19243
19244 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
19245 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
19246 aarch64_simd_clone_compute_vecsize_and_simdlen
19247
19248 #undef TARGET_SIMD_CLONE_ADJUST
19249 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
19250
19251 #undef TARGET_SIMD_CLONE_USABLE
19252 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
19253
19254 #undef TARGET_COMP_TYPE_ATTRIBUTES
19255 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
19256
19257 #undef TARGET_GET_MULTILIB_ABI_NAME
19258 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
19259
19260 #if CHECKING_P
19261 #undef TARGET_RUN_TARGET_SELFTESTS
19262 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
19263 #endif /* #if CHECKING_P */
19264
19265 struct gcc_target targetm = TARGET_INITIALIZER;
19266
19267 #include "gt-aarch64.h"