1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95
96 /* The mode of the elements. */
97 scalar_mode elt_mode;
98
99 /* The value of each element if all elements are the same, or the
100 first value if the constant is a series. */
101 rtx value;
102
103 /* The value of the step if the constant is a series, null otherwise. */
104 rtx step;
105
106 /* The instruction to use to move the immediate into a vector. */
107 insn_type insn;
108
109 /* The kind of shift modifier to use, and the number of bits to shift.
110 This is (LSL, 0) if no shift is needed. */
111 modifier_type modifier;
112 unsigned int shift;
113 };
114
115 /* Construct a floating-point immediate in which each element has mode
116 ELT_MODE_IN and value VALUE_IN. */
117 inline simd_immediate_info
118 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
119 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
120 modifier (LSL), shift (0)
121 {}
122
123 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
124 and value VALUE_IN. The other parameters are as for the structure
125 fields. */
126 inline simd_immediate_info
127 ::simd_immediate_info (scalar_int_mode elt_mode_in,
128 unsigned HOST_WIDE_INT value_in,
129 insn_type insn_in, modifier_type modifier_in,
130 unsigned int shift_in)
131 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
132 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
133 {}
134
135 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
136 and where element I is equal to VALUE_IN + I * STEP_IN. */
137 inline simd_immediate_info
138 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
139 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
140 modifier (LSL), shift (0)
141 {}
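/* Illustrative uses (not part of the upstream source): a vector whose
   QImode elements all equal 0x2a would be described by
   simd_immediate_info (QImode, 0x2a), i.e. value 0x2a, no step, a MOV
   insn and an (LSL, 0) modifier, while an SVE series constant such as
   { 1, 3, 5, ... } would use the (scalar_mode, rtx, rtx) constructor
   with a first value of 1 and a step of 2.  */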
142
143 /* The current code model. */
144 enum aarch64_code_model aarch64_cmodel;
145
146 /* The number of 64-bit elements in an SVE vector. */
147 poly_uint16 aarch64_sve_vg;
148
149 #ifdef HAVE_AS_TLS
150 #undef TARGET_HAVE_TLS
151 #define TARGET_HAVE_TLS 1
152 #endif
153
154 static bool aarch64_composite_type_p (const_tree, machine_mode);
155 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
156 const_tree,
157 machine_mode *, int *,
158 bool *);
159 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
160 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
161 static void aarch64_override_options_after_change (void);
162 static bool aarch64_vector_mode_supported_p (machine_mode);
163 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
164 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
165 const_tree type,
166 int misalignment,
167 bool is_packed);
168 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
169 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
170 aarch64_addr_query_type);
171 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
172
173 /* Major revision number of the ARM Architecture implemented by the target. */
174 unsigned aarch64_architecture_version;
175
176 /* The processor for which instructions should be scheduled. */
177 enum aarch64_processor aarch64_tune = cortexa53;
178
179 /* Mask to specify which instruction scheduling options should be used. */
180 unsigned long aarch64_tune_flags = 0;
181
182 /* Global flag for PC relative loads. */
183 bool aarch64_pcrelative_literal_loads;
184
185 /* Global flag for whether frame pointer is enabled. */
186 bool aarch64_use_frame_pointer;
187
188 #define BRANCH_PROTECT_STR_MAX 255
189 char *accepted_branch_protection_string = NULL;
190
191 static enum aarch64_parse_opt_result
192 aarch64_parse_branch_protection (const char*, char**);
193
194 /* Support for command line parsing of boolean flags in the tuning
195 structures. */
196 struct aarch64_flag_desc
197 {
198 const char* name;
199 unsigned int flag;
200 };
201
202 #define AARCH64_FUSION_PAIR(name, internal_name) \
203 { name, AARCH64_FUSE_##internal_name },
204 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
205 {
206 { "none", AARCH64_FUSE_NOTHING },
207 #include "aarch64-fusion-pairs.def"
208 { "all", AARCH64_FUSE_ALL },
209 { NULL, AARCH64_FUSE_NOTHING }
210 };
211
212 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
213 { name, AARCH64_EXTRA_TUNE_##internal_name },
214 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
215 {
216 { "none", AARCH64_EXTRA_TUNE_NONE },
217 #include "aarch64-tuning-flags.def"
218 { "all", AARCH64_EXTRA_TUNE_ALL },
219 { NULL, AARCH64_EXTRA_TUNE_NONE }
220 };
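/* For illustration (the exact strings live in the .def files): an entry
   such as AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK) in
   aarch64-fusion-pairs.def expands via the macro above into
   { "mov+movk", AARCH64_FUSE_MOV_MOVK }, so each table maps the
   user-visible option names onto internal flags, bracketed by the
   explicit "none"/"all" entries and a NULL terminator.  */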
221
222 /* Tuning parameters. */
223
224 static const struct cpu_addrcost_table generic_addrcost_table =
225 {
226 {
227 1, /* hi */
228 0, /* si */
229 0, /* di */
230 1, /* ti */
231 },
232 0, /* pre_modify */
233 0, /* post_modify */
234 0, /* register_offset */
235 0, /* register_sextend */
236 0, /* register_zextend */
237 0 /* imm_offset */
238 };
239
240 static const struct cpu_addrcost_table exynosm1_addrcost_table =
241 {
242 {
243 0, /* hi */
244 0, /* si */
245 0, /* di */
246 2, /* ti */
247 },
248 0, /* pre_modify */
249 0, /* post_modify */
250 1, /* register_offset */
251 1, /* register_sextend */
252 2, /* register_zextend */
253 0, /* imm_offset */
254 };
255
256 static const struct cpu_addrcost_table xgene1_addrcost_table =
257 {
258 {
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
263 },
264 1, /* pre_modify */
265 1, /* post_modify */
266 0, /* register_offset */
267 1, /* register_sextend */
268 1, /* register_zextend */
269 0, /* imm_offset */
270 };
271
272 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
273 {
274 {
275 1, /* hi */
276 1, /* si */
277 1, /* di */
278 2, /* ti */
279 },
280 0, /* pre_modify */
281 0, /* post_modify */
282 2, /* register_offset */
283 3, /* register_sextend */
284 3, /* register_zextend */
285 0, /* imm_offset */
286 };
287
288 static const struct cpu_addrcost_table tsv110_addrcost_table =
289 {
290 {
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
295 },
296 0, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
302 };
303
304 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
305 {
306 {
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
311 },
312 1, /* pre_modify */
313 1, /* post_modify */
314 3, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 2, /* imm_offset */
318 };
319
320 static const struct cpu_regmove_cost generic_regmove_cost =
321 {
322 1, /* GP2GP */
323 /* Avoid the use of slow int<->fp moves for spilling by setting
324 their cost higher than memmov_cost. */
325 5, /* GP2FP */
326 5, /* FP2GP */
327 2 /* FP2FP */
328 };
329
330 static const struct cpu_regmove_cost cortexa57_regmove_cost =
331 {
332 1, /* GP2GP */
333 /* Avoid the use of slow int<->fp moves for spilling by setting
334 their cost higher than memmov_cost. */
335 5, /* GP2FP */
336 5, /* FP2GP */
337 2 /* FP2FP */
338 };
339
340 static const struct cpu_regmove_cost cortexa53_regmove_cost =
341 {
342 1, /* GP2GP */
343 /* Avoid the use of slow int<->fp moves for spilling by setting
344 their cost higher than memmov_cost. */
345 5, /* GP2FP */
346 5, /* FP2GP */
347 2 /* FP2FP */
348 };
349
350 static const struct cpu_regmove_cost exynosm1_regmove_cost =
351 {
352 1, /* GP2GP */
353 /* Avoid the use of slow int<->fp moves for spilling by setting
354 their cost higher than memmov_cost (the actual costs are 4 and 9). */
355 9, /* GP2FP */
356 9, /* FP2GP */
357 1 /* FP2FP */
358 };
359
360 static const struct cpu_regmove_cost thunderx_regmove_cost =
361 {
362 2, /* GP2GP */
363 2, /* GP2FP */
364 6, /* FP2GP */
365 4 /* FP2FP */
366 };
367
368 static const struct cpu_regmove_cost xgene1_regmove_cost =
369 {
370 1, /* GP2GP */
371 /* Avoid the use of slow int<->fp moves for spilling by setting
372 their cost higher than memmov_cost. */
373 8, /* GP2FP */
374 8, /* FP2GP */
375 2 /* FP2FP */
376 };
377
378 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
379 {
380 2, /* GP2GP */
381 /* Avoid the use of int<->fp moves for spilling. */
382 6, /* GP2FP */
383 6, /* FP2GP */
384 4 /* FP2FP */
385 };
386
387 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
388 {
389 1, /* GP2GP */
390 /* Avoid the use of int<->fp moves for spilling. */
391 8, /* GP2FP */
392 8, /* FP2GP */
393 4 /* FP2FP */
394 };
395
396 static const struct cpu_regmove_cost tsv110_regmove_cost =
397 {
398 1, /* GP2GP */
399 /* Avoid the use of slow int<->fp moves for spilling by setting
400 their cost higher than memmov_cost. */
401 2, /* GP2FP */
402 3, /* FP2GP */
403 2 /* FP2FP */
404 };
405
406 /* Generic costs for vector insn classes. */
407 static const struct cpu_vector_cost generic_vector_cost =
408 {
409 1, /* scalar_int_stmt_cost */
410 1, /* scalar_fp_stmt_cost */
411 1, /* scalar_load_cost */
412 1, /* scalar_store_cost */
413 1, /* vec_int_stmt_cost */
414 1, /* vec_fp_stmt_cost */
415 2, /* vec_permute_cost */
416 1, /* vec_to_scalar_cost */
417 1, /* scalar_to_vec_cost */
418 1, /* vec_align_load_cost */
419 1, /* vec_unalign_load_cost */
420 1, /* vec_unalign_store_cost */
421 1, /* vec_store_cost */
422 3, /* cond_taken_branch_cost */
423 1 /* cond_not_taken_branch_cost */
424 };
425
426 /* QDF24XX costs for vector insn classes. */
427 static const struct cpu_vector_cost qdf24xx_vector_cost =
428 {
429 1, /* scalar_int_stmt_cost */
430 1, /* scalar_fp_stmt_cost */
431 1, /* scalar_load_cost */
432 1, /* scalar_store_cost */
433 1, /* vec_int_stmt_cost */
434 3, /* vec_fp_stmt_cost */
435 2, /* vec_permute_cost */
436 1, /* vec_to_scalar_cost */
437 1, /* scalar_to_vec_cost */
438 1, /* vec_align_load_cost */
439 1, /* vec_unalign_load_cost */
440 1, /* vec_unalign_store_cost */
441 1, /* vec_store_cost */
442 3, /* cond_taken_branch_cost */
443 1 /* cond_not_taken_branch_cost */
444 };
445
446 /* ThunderX costs for vector insn classes. */
447 static const struct cpu_vector_cost thunderx_vector_cost =
448 {
449 1, /* scalar_int_stmt_cost */
450 1, /* scalar_fp_stmt_cost */
451 3, /* scalar_load_cost */
452 1, /* scalar_store_cost */
453 4, /* vec_int_stmt_cost */
454 1, /* vec_fp_stmt_cost */
455 4, /* vec_permute_cost */
456 2, /* vec_to_scalar_cost */
457 2, /* scalar_to_vec_cost */
458 3, /* vec_align_load_cost */
459 5, /* vec_unalign_load_cost */
460 5, /* vec_unalign_store_cost */
461 1, /* vec_store_cost */
462 3, /* cond_taken_branch_cost */
463 3 /* cond_not_taken_branch_cost */
464 };
465
466 static const struct cpu_vector_cost tsv110_vector_cost =
467 {
468 1, /* scalar_int_stmt_cost */
469 1, /* scalar_fp_stmt_cost */
470 5, /* scalar_load_cost */
471 1, /* scalar_store_cost */
472 2, /* vec_int_stmt_cost */
473 2, /* vec_fp_stmt_cost */
474 2, /* vec_permute_cost */
475 3, /* vec_to_scalar_cost */
476 2, /* scalar_to_vec_cost */
477 5, /* vec_align_load_cost */
478 5, /* vec_unalign_load_cost */
479 1, /* vec_unalign_store_cost */
480 1, /* vec_store_cost */
481 1, /* cond_taken_branch_cost */
482 1 /* cond_not_taken_branch_cost */
483 };
484
485 /* Generic costs for vector insn classes. */
486 static const struct cpu_vector_cost cortexa57_vector_cost =
487 {
488 1, /* scalar_int_stmt_cost */
489 1, /* scalar_fp_stmt_cost */
490 4, /* scalar_load_cost */
491 1, /* scalar_store_cost */
492 2, /* vec_int_stmt_cost */
493 2, /* vec_fp_stmt_cost */
494 3, /* vec_permute_cost */
495 8, /* vec_to_scalar_cost */
496 8, /* scalar_to_vec_cost */
497 4, /* vec_align_load_cost */
498 4, /* vec_unalign_load_cost */
499 1, /* vec_unalign_store_cost */
500 1, /* vec_store_cost */
501 1, /* cond_taken_branch_cost */
502 1 /* cond_not_taken_branch_cost */
503 };
504
505 static const struct cpu_vector_cost exynosm1_vector_cost =
506 {
507 1, /* scalar_int_stmt_cost */
508 1, /* scalar_fp_stmt_cost */
509 5, /* scalar_load_cost */
510 1, /* scalar_store_cost */
511 3, /* vec_int_stmt_cost */
512 3, /* vec_fp_stmt_cost */
513 3, /* vec_permute_cost */
514 3, /* vec_to_scalar_cost */
515 3, /* scalar_to_vec_cost */
516 5, /* vec_align_load_cost */
517 5, /* vec_unalign_load_cost */
518 1, /* vec_unalign_store_cost */
519 1, /* vec_store_cost */
520 1, /* cond_taken_branch_cost */
521 1 /* cond_not_taken_branch_cost */
522 };
523
524 /* Generic costs for vector insn classes. */
525 static const struct cpu_vector_cost xgene1_vector_cost =
526 {
527 1, /* scalar_int_stmt_cost */
528 1, /* scalar_fp_stmt_cost */
529 5, /* scalar_load_cost */
530 1, /* scalar_store_cost */
531 2, /* vec_int_stmt_cost */
532 2, /* vec_fp_stmt_cost */
533 2, /* vec_permute_cost */
534 4, /* vec_to_scalar_cost */
535 4, /* scalar_to_vec_cost */
536 10, /* vec_align_load_cost */
537 10, /* vec_unalign_load_cost */
538 2, /* vec_unalign_store_cost */
539 2, /* vec_store_cost */
540 2, /* cond_taken_branch_cost */
541 1 /* cond_not_taken_branch_cost */
542 };
543
544 /* Costs for vector insn classes for Vulcan. */
545 static const struct cpu_vector_cost thunderx2t99_vector_cost =
546 {
547 1, /* scalar_int_stmt_cost */
548 6, /* scalar_fp_stmt_cost */
549 4, /* scalar_load_cost */
550 1, /* scalar_store_cost */
551 5, /* vec_int_stmt_cost */
552 6, /* vec_fp_stmt_cost */
553 3, /* vec_permute_cost */
554 6, /* vec_to_scalar_cost */
555 5, /* scalar_to_vec_cost */
556 8, /* vec_align_load_cost */
557 8, /* vec_unalign_load_cost */
558 4, /* vec_unalign_store_cost */
559 4, /* vec_store_cost */
560 2, /* cond_taken_branch_cost */
561 1 /* cond_not_taken_branch_cost */
562 };
563
564 /* Generic costs for branch instructions. */
565 static const struct cpu_branch_cost generic_branch_cost =
566 {
567 1, /* Predictable. */
568 3 /* Unpredictable. */
569 };
570
571 /* Generic approximation modes. */
572 static const cpu_approx_modes generic_approx_modes =
573 {
574 AARCH64_APPROX_NONE, /* division */
575 AARCH64_APPROX_NONE, /* sqrt */
576 AARCH64_APPROX_NONE /* recip_sqrt */
577 };
578
579 /* Approximation modes for Exynos M1. */
580 static const cpu_approx_modes exynosm1_approx_modes =
581 {
582 AARCH64_APPROX_NONE, /* division */
583 AARCH64_APPROX_ALL, /* sqrt */
584 AARCH64_APPROX_ALL /* recip_sqrt */
585 };
586
587 /* Approximation modes for X-Gene 1. */
588 static const cpu_approx_modes xgene1_approx_modes =
589 {
590 AARCH64_APPROX_NONE, /* division */
591 AARCH64_APPROX_NONE, /* sqrt */
592 AARCH64_APPROX_ALL /* recip_sqrt */
593 };
594
595 /* Generic prefetch settings (which disable prefetch). */
596 static const cpu_prefetch_tune generic_prefetch_tune =
597 {
598 0, /* num_slots */
599 -1, /* l1_cache_size */
600 -1, /* l1_cache_line_size */
601 -1, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 -1 /* default_opt_level */
605 };
606
607 static const cpu_prefetch_tune exynosm1_prefetch_tune =
608 {
609 0, /* num_slots */
610 -1, /* l1_cache_size */
611 64, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
616 };
617
618 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
619 {
620 4, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 512, /* l2_cache_size */
624 false, /* prefetch_dynamic_strides */
625 2048, /* minimum_stride */
626 3 /* default_opt_level */
627 };
628
629 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
630 {
631 8, /* num_slots */
632 32, /* l1_cache_size */
633 128, /* l1_cache_line_size */
634 16*1024, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 3 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune thunderx_prefetch_tune =
641 {
642 8, /* num_slots */
643 32, /* l1_cache_size */
644 128, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
652 {
653 8, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 256, /* l2_cache_size */
657 true, /* prefetch_dynamic_strides */
658 -1, /* minimum_stride */
659 -1 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune tsv110_prefetch_tune =
663 {
664 0, /* num_slots */
665 64, /* l1_cache_size */
666 64, /* l1_cache_line_size */
667 512, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 -1 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune xgene1_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 64, /* l1_cache_line_size */
678 256, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const struct tune_params generic_tunings =
685 {
686 &cortexa57_extra_costs,
687 &generic_addrcost_table,
688 &generic_regmove_cost,
689 &generic_vector_cost,
690 &generic_branch_cost,
691 &generic_approx_modes,
692 SVE_NOT_IMPLEMENTED, /* sve_width */
693 4, /* memmov_cost */
694 2, /* issue_rate */
695 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
696 "8", /* function_align. */
697 "4", /* jump_align. */
698 "8", /* loop_align. */
699 2, /* int_reassoc_width. */
700 4, /* fp_reassoc_width. */
701 1, /* vec_reassoc_width. */
702 2, /* min_div_recip_mul_sf. */
703 2, /* min_div_recip_mul_df. */
704 0, /* max_case_values. */
705 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
706 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
707 &generic_prefetch_tune
708 };
709
710 static const struct tune_params cortexa35_tunings =
711 {
712 &cortexa53_extra_costs,
713 &generic_addrcost_table,
714 &cortexa53_regmove_cost,
715 &generic_vector_cost,
716 &generic_branch_cost,
717 &generic_approx_modes,
718 SVE_NOT_IMPLEMENTED, /* sve_width */
719 4, /* memmov_cost */
720 1, /* issue_rate */
721 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
722 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
723 "16", /* function_align. */
724 "4", /* jump_align. */
725 "8", /* loop_align. */
726 2, /* int_reassoc_width. */
727 4, /* fp_reassoc_width. */
728 1, /* vec_reassoc_width. */
729 2, /* min_div_recip_mul_sf. */
730 2, /* min_div_recip_mul_df. */
731 0, /* max_case_values. */
732 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
733 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
734 &generic_prefetch_tune
735 };
736
737 static const struct tune_params cortexa53_tunings =
738 {
739 &cortexa53_extra_costs,
740 &generic_addrcost_table,
741 &cortexa53_regmove_cost,
742 &generic_vector_cost,
743 &generic_branch_cost,
744 &generic_approx_modes,
745 SVE_NOT_IMPLEMENTED, /* sve_width */
746 4, /* memmov_cost */
747 2, /* issue_rate */
748 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
749 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
750 "16", /* function_align. */
751 "4", /* jump_align. */
752 "8", /* loop_align. */
753 2, /* int_reassoc_width. */
754 4, /* fp_reassoc_width. */
755 1, /* vec_reassoc_width. */
756 2, /* min_div_recip_mul_sf. */
757 2, /* min_div_recip_mul_df. */
758 0, /* max_case_values. */
759 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
760 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
761 &generic_prefetch_tune
762 };
763
764 static const struct tune_params cortexa57_tunings =
765 {
766 &cortexa57_extra_costs,
767 &generic_addrcost_table,
768 &cortexa57_regmove_cost,
769 &cortexa57_vector_cost,
770 &generic_branch_cost,
771 &generic_approx_modes,
772 SVE_NOT_IMPLEMENTED, /* sve_width */
773 4, /* memmov_cost */
774 3, /* issue_rate */
775 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
776 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
777 "16", /* function_align. */
778 "4", /* jump_align. */
779 "8", /* loop_align. */
780 2, /* int_reassoc_width. */
781 4, /* fp_reassoc_width. */
782 1, /* vec_reassoc_width. */
783 2, /* min_div_recip_mul_sf. */
784 2, /* min_div_recip_mul_df. */
785 0, /* max_case_values. */
786 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
787 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
788 &generic_prefetch_tune
789 };
790
791 static const struct tune_params cortexa72_tunings =
792 {
793 &cortexa57_extra_costs,
794 &generic_addrcost_table,
795 &cortexa57_regmove_cost,
796 &cortexa57_vector_cost,
797 &generic_branch_cost,
798 &generic_approx_modes,
799 SVE_NOT_IMPLEMENTED, /* sve_width */
800 4, /* memmov_cost */
801 3, /* issue_rate */
802 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
803 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
804 "16", /* function_align. */
805 "4", /* jump_align. */
806 "8", /* loop_align. */
807 2, /* int_reassoc_width. */
808 4, /* fp_reassoc_width. */
809 1, /* vec_reassoc_width. */
810 2, /* min_div_recip_mul_sf. */
811 2, /* min_div_recip_mul_df. */
812 0, /* max_case_values. */
813 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
814 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
815 &generic_prefetch_tune
816 };
817
818 static const struct tune_params cortexa73_tunings =
819 {
820 &cortexa57_extra_costs,
821 &generic_addrcost_table,
822 &cortexa57_regmove_cost,
823 &cortexa57_vector_cost,
824 &generic_branch_cost,
825 &generic_approx_modes,
826 SVE_NOT_IMPLEMENTED, /* sve_width */
827 4, /* memmov_cost. */
828 2, /* issue_rate. */
829 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
830 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
831 "16", /* function_align. */
832 "4", /* jump_align. */
833 "8", /* loop_align. */
834 2, /* int_reassoc_width. */
835 4, /* fp_reassoc_width. */
836 1, /* vec_reassoc_width. */
837 2, /* min_div_recip_mul_sf. */
838 2, /* min_div_recip_mul_df. */
839 0, /* max_case_values. */
840 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
841 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
842 &generic_prefetch_tune
843 };
844
845
846
847 static const struct tune_params exynosm1_tunings =
848 {
849 &exynosm1_extra_costs,
850 &exynosm1_addrcost_table,
851 &exynosm1_regmove_cost,
852 &exynosm1_vector_cost,
853 &generic_branch_cost,
854 &exynosm1_approx_modes,
855 SVE_NOT_IMPLEMENTED, /* sve_width */
856 4, /* memmov_cost */
857 3, /* issue_rate */
858 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
859 "4", /* function_align. */
860 "4", /* jump_align. */
861 "4", /* loop_align. */
862 2, /* int_reassoc_width. */
863 4, /* fp_reassoc_width. */
864 1, /* vec_reassoc_width. */
865 2, /* min_div_recip_mul_sf. */
866 2, /* min_div_recip_mul_df. */
867 48, /* max_case_values. */
868 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
869 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
870 &exynosm1_prefetch_tune
871 };
872
873 static const struct tune_params thunderxt88_tunings =
874 {
875 &thunderx_extra_costs,
876 &generic_addrcost_table,
877 &thunderx_regmove_cost,
878 &thunderx_vector_cost,
879 &generic_branch_cost,
880 &generic_approx_modes,
881 SVE_NOT_IMPLEMENTED, /* sve_width */
882 6, /* memmov_cost */
883 2, /* issue_rate */
884 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
885 "8", /* function_align. */
886 "8", /* jump_align. */
887 "8", /* loop_align. */
888 2, /* int_reassoc_width. */
889 4, /* fp_reassoc_width. */
890 1, /* vec_reassoc_width. */
891 2, /* min_div_recip_mul_sf. */
892 2, /* min_div_recip_mul_df. */
893 0, /* max_case_values. */
894 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
895 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
896 &thunderxt88_prefetch_tune
897 };
898
899 static const struct tune_params thunderx_tunings =
900 {
901 &thunderx_extra_costs,
902 &generic_addrcost_table,
903 &thunderx_regmove_cost,
904 &thunderx_vector_cost,
905 &generic_branch_cost,
906 &generic_approx_modes,
907 SVE_NOT_IMPLEMENTED, /* sve_width */
908 6, /* memmov_cost */
909 2, /* issue_rate */
910 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
911 "8", /* function_align. */
912 "8", /* jump_align. */
913 "8", /* loop_align. */
914 2, /* int_reassoc_width. */
915 4, /* fp_reassoc_width. */
916 1, /* vec_reassoc_width. */
917 2, /* min_div_recip_mul_sf. */
918 2, /* min_div_recip_mul_df. */
919 0, /* max_case_values. */
920 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
921 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
922 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
923 &thunderx_prefetch_tune
924 };
925
926 static const struct tune_params tsv110_tunings =
927 {
928 &tsv110_extra_costs,
929 &tsv110_addrcost_table,
930 &tsv110_regmove_cost,
931 &tsv110_vector_cost,
932 &generic_branch_cost,
933 &generic_approx_modes,
934 SVE_NOT_IMPLEMENTED, /* sve_width */
935 4, /* memmov_cost */
936 4, /* issue_rate */
937 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
938 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
939 "16", /* function_align. */
940 "4", /* jump_align. */
941 "8", /* loop_align. */
942 2, /* int_reassoc_width. */
943 4, /* fp_reassoc_width. */
944 1, /* vec_reassoc_width. */
945 2, /* min_div_recip_mul_sf. */
946 2, /* min_div_recip_mul_df. */
947 0, /* max_case_values. */
948 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
949 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
950 &tsv110_prefetch_tune
951 };
952
953 static const struct tune_params xgene1_tunings =
954 {
955 &xgene1_extra_costs,
956 &xgene1_addrcost_table,
957 &xgene1_regmove_cost,
958 &xgene1_vector_cost,
959 &generic_branch_cost,
960 &xgene1_approx_modes,
961 SVE_NOT_IMPLEMENTED, /* sve_width */
962 6, /* memmov_cost */
963 4, /* issue_rate */
964 AARCH64_FUSE_NOTHING, /* fusible_ops */
965 "16", /* function_align. */
966 "16", /* jump_align. */
967 "16", /* loop_align. */
968 2, /* int_reassoc_width. */
969 4, /* fp_reassoc_width. */
970 1, /* vec_reassoc_width. */
971 2, /* min_div_recip_mul_sf. */
972 2, /* min_div_recip_mul_df. */
973 17, /* max_case_values. */
974 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
975 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
976 &xgene1_prefetch_tune
977 };
978
979 static const struct tune_params emag_tunings =
980 {
981 &xgene1_extra_costs,
982 &xgene1_addrcost_table,
983 &xgene1_regmove_cost,
984 &xgene1_vector_cost,
985 &generic_branch_cost,
986 &xgene1_approx_modes,
987 SVE_NOT_IMPLEMENTED,
988 6, /* memmov_cost */
989 4, /* issue_rate */
990 AARCH64_FUSE_NOTHING, /* fusible_ops */
991 "16", /* function_align. */
992 "16", /* jump_align. */
993 "16", /* loop_align. */
994 2, /* int_reassoc_width. */
995 4, /* fp_reassoc_width. */
996 1, /* vec_reassoc_width. */
997 2, /* min_div_recip_mul_sf. */
998 2, /* min_div_recip_mul_df. */
999 17, /* max_case_values. */
1000 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1001 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1002 &xgene1_prefetch_tune
1003 };
1004
1005 static const struct tune_params qdf24xx_tunings =
1006 {
1007 &qdf24xx_extra_costs,
1008 &qdf24xx_addrcost_table,
1009 &qdf24xx_regmove_cost,
1010 &qdf24xx_vector_cost,
1011 &generic_branch_cost,
1012 &generic_approx_modes,
1013 SVE_NOT_IMPLEMENTED, /* sve_width */
1014 4, /* memmov_cost */
1015 4, /* issue_rate */
1016 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1017 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1018 "16", /* function_align. */
1019 "8", /* jump_align. */
1020 "16", /* loop_align. */
1021 2, /* int_reassoc_width. */
1022 4, /* fp_reassoc_width. */
1023 1, /* vec_reassoc_width. */
1024 2, /* min_div_recip_mul_sf. */
1025 2, /* min_div_recip_mul_df. */
1026 0, /* max_case_values. */
1027 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1028 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1029 &qdf24xx_prefetch_tune
1030 };
1031
1032 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1033 for now. */
1034 static const struct tune_params saphira_tunings =
1035 {
1036 &generic_extra_costs,
1037 &generic_addrcost_table,
1038 &generic_regmove_cost,
1039 &generic_vector_cost,
1040 &generic_branch_cost,
1041 &generic_approx_modes,
1042 SVE_NOT_IMPLEMENTED, /* sve_width */
1043 4, /* memmov_cost */
1044 4, /* issue_rate */
1045 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1046 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1047 "16", /* function_align. */
1048 "8", /* jump_align. */
1049 "16", /* loop_align. */
1050 2, /* int_reassoc_width. */
1051 4, /* fp_reassoc_width. */
1052 1, /* vec_reassoc_width. */
1053 2, /* min_div_recip_mul_sf. */
1054 2, /* min_div_recip_mul_df. */
1055 0, /* max_case_values. */
1056 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1057 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1058 &generic_prefetch_tune
1059 };
1060
1061 static const struct tune_params thunderx2t99_tunings =
1062 {
1063 &thunderx2t99_extra_costs,
1064 &thunderx2t99_addrcost_table,
1065 &thunderx2t99_regmove_cost,
1066 &thunderx2t99_vector_cost,
1067 &generic_branch_cost,
1068 &generic_approx_modes,
1069 SVE_NOT_IMPLEMENTED, /* sve_width */
1070 4, /* memmov_cost. */
1071 4, /* issue_rate. */
1072 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1073 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1074 "16", /* function_align. */
1075 "8", /* jump_align. */
1076 "16", /* loop_align. */
1077 3, /* int_reassoc_width. */
1078 2, /* fp_reassoc_width. */
1079 2, /* vec_reassoc_width. */
1080 2, /* min_div_recip_mul_sf. */
1081 2, /* min_div_recip_mul_df. */
1082 0, /* max_case_values. */
1083 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1084 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1085 &thunderx2t99_prefetch_tune
1086 };
1087
1088 static const struct tune_params ares_tunings =
1089 {
1090 &cortexa57_extra_costs,
1091 &generic_addrcost_table,
1092 &generic_regmove_cost,
1093 &cortexa57_vector_cost,
1094 &generic_branch_cost,
1095 &generic_approx_modes,
1096 SVE_NOT_IMPLEMENTED, /* sve_width */
1097 4, /* memmov_cost */
1098 3, /* issue_rate */
1099 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1100 "32:16", /* function_align. */
1101 "32:16", /* jump_align. */
1102 "32:16", /* loop_align. */
1103 2, /* int_reassoc_width. */
1104 4, /* fp_reassoc_width. */
1105 2, /* vec_reassoc_width. */
1106 2, /* min_div_recip_mul_sf. */
1107 2, /* min_div_recip_mul_df. */
1108 0, /* max_case_values. */
1109 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1110 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1111 &generic_prefetch_tune
1112 };
1113
1114 /* Support for fine-grained override of the tuning structures. */
1115 struct aarch64_tuning_override_function
1116 {
1117 const char* name;
1118 void (*parse_override)(const char*, struct tune_params*);
1119 };
1120
1121 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1122 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1123 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1124
1125 static const struct aarch64_tuning_override_function
1126 aarch64_tuning_override_functions[] =
1127 {
1128 { "fuse", aarch64_parse_fuse_string },
1129 { "tune", aarch64_parse_tune_string },
1130 { "sve_width", aarch64_parse_sve_width_string },
1131 { NULL, NULL }
1132 };
1133
1134 /* A processor implementing AArch64. */
1135 struct processor
1136 {
1137 const char *const name;
1138 enum aarch64_processor ident;
1139 enum aarch64_processor sched_core;
1140 enum aarch64_arch arch;
1141 unsigned architecture_version;
1142 const unsigned long flags;
1143 const struct tune_params *const tune;
1144 };
1145
1146 /* Architectures implementing AArch64. */
1147 static const struct processor all_architectures[] =
1148 {
1149 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1150 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1151 #include "aarch64-arches.def"
1152 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1153 };
1154
1155 /* Processor cores implementing AArch64. */
1156 static const struct processor all_cores[] =
1157 {
1158 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1159 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1160 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1161 FLAGS, &COSTS##_tunings},
1162 #include "aarch64-cores.def"
1163 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1164 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1165 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1166 };
1167
1168
1169 /* Target specification. These are populated by the -march, -mtune, -mcpu
1170 handling code or by target attributes. */
1171 static const struct processor *selected_arch;
1172 static const struct processor *selected_cpu;
1173 static const struct processor *selected_tune;
1174
1175 /* The current tuning set. */
1176 struct tune_params aarch64_tune_params = generic_tunings;
1177
1178 /* Table of machine attributes. */
1179 static const struct attribute_spec aarch64_attribute_table[] =
1180 {
1181 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1182 affects_type_identity, handler, exclude } */
1183 { "aarch64_vector_pcs", 0, 0, false, true, true, false, NULL, NULL },
1184 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1185 };
1186
1187 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1188
1189 /* An ISA extension in the co-processor and main instruction set space. */
1190 struct aarch64_option_extension
1191 {
1192 const char *const name;
1193 const unsigned long flags_on;
1194 const unsigned long flags_off;
1195 };
1196
1197 typedef enum aarch64_cond_code
1198 {
1199 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1200 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1201 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1202 }
1203 aarch64_cc;
1204
1205 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
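/* Worked example: the condition codes above are laid out so that each
   code and its inverse differ only in bit 0, e.g.
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT (10 ^ 1)
   and AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE.  */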
1206
1207 struct aarch64_branch_protect_type
1208 {
1209 /* The type's name that the user passes to the branch-protection option
1210 string. */
1211 const char* name;
1212 /* Function to handle the protection type and set global variables.
1213 First argument is the string token corresponding with this type and the
1214 second argument is the next token in the option string.
1215 Return values:
1216 * AARCH64_PARSE_OK: Handling was successful.
1217 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the caller
1218 should print an error.
1219 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints its
1220 own error. */
1221 enum aarch64_parse_opt_result (*handler)(char*, char*);
1222 /* A list of types that can follow this type in the option string. */
1223 const aarch64_branch_protect_type* subtypes;
1224 unsigned int num_subtypes;
1225 };
1226
1227 static enum aarch64_parse_opt_result
1228 aarch64_handle_no_branch_protection (char* str, char* rest)
1229 {
1230 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1231 aarch64_enable_bti = 0;
1232 if (rest)
1233 {
1234 error ("unexpected %<%s%> after %<%s%>", rest, str);
1235 return AARCH64_PARSE_INVALID_FEATURE;
1236 }
1237 return AARCH64_PARSE_OK;
1238 }
1239
1240 static enum aarch64_parse_opt_result
1241 aarch64_handle_standard_branch_protection (char* str, char* rest)
1242 {
1243 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1244 aarch64_enable_bti = 1;
1245 if (rest)
1246 {
1247 error ("unexpected %<%s%> after %<%s%>", rest, str);
1248 return AARCH64_PARSE_INVALID_FEATURE;
1249 }
1250 return AARCH64_PARSE_OK;
1251 }
1252
1253 static enum aarch64_parse_opt_result
1254 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1255 char* rest ATTRIBUTE_UNUSED)
1256 {
1257 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1258 return AARCH64_PARSE_OK;
1259 }
1260
1261 static enum aarch64_parse_opt_result
1262 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1263 char* rest ATTRIBUTE_UNUSED)
1264 {
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1266 return AARCH64_PARSE_OK;
1267 }
1268
1269 static enum aarch64_parse_opt_result
1270 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1271 char* rest ATTRIBUTE_UNUSED)
1272 {
1273 aarch64_enable_bti = 1;
1274 return AARCH64_PARSE_OK;
1275 }
1276
1277 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1278 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1279 { NULL, NULL, NULL, 0 }
1280 };
1281
1282 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1283 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1284 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1285 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1286 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1287 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1288 { NULL, NULL, NULL, 0 }
1289 };
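/* Illustrative walk-through (not part of the upstream source): for
   -mbranch-protection=pac-ret+leaf the option parser matches "pac-ret"
   in aarch64_branch_protect_types, whose handler sets
   aarch64_ra_sign_scope to AARCH64_FUNCTION_NON_LEAF, and then matches
   the "leaf" subtype in aarch64_pac_ret_subtypes, which widens the
   scope to AARCH64_FUNCTION_ALL.  */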
1290
1291 /* The condition codes of the processor, and the inverse function. */
1292 static const char * const aarch64_condition_codes[] =
1293 {
1294 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1295 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1296 };
1297
1298 /* Generate code to enable conditional branches in functions over 1 MiB. */
1299 const char *
1300 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1301 const char * branch_format)
1302 {
1303 rtx_code_label * tmp_label = gen_label_rtx ();
1304 char label_buf[256];
1305 char buffer[128];
1306 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1307 CODE_LABEL_NUMBER (tmp_label));
1308 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1309 rtx dest_label = operands[pos_label];
1310 operands[pos_label] = tmp_label;
1311
1312 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1313 output_asm_insn (buffer, operands);
1314
1315 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1316 operands[pos_label] = dest_label;
1317 output_asm_insn (buffer, operands);
1318 return "";
1319 }
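/* For illustration (register names and label numbers are invented): if
   the caller passes an already-inverted conditional branch as
   BRANCH_FORMAT, the code above emits a sequence of the shape

	b.ne	.Lbcond4		// inverted condition, short range
	b	.Lfar_target		// unconditional, +/-128 MiB range
   .Lbcond4:

   so the original condition can still reach a destination beyond the
   +/-1 MiB range of a single conditional branch.  */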
1320
1321 void
1322 aarch64_err_no_fpadvsimd (machine_mode mode)
1323 {
1324 if (TARGET_GENERAL_REGS_ONLY)
1325 if (FLOAT_MODE_P (mode))
1326 error ("%qs is incompatible with the use of floating-point types",
1327 "-mgeneral-regs-only");
1328 else
1329 error ("%qs is incompatible with the use of vector types",
1330 "-mgeneral-regs-only");
1331 else
1332 if (FLOAT_MODE_P (mode))
1333 error ("%qs feature modifier is incompatible with the use of"
1334 " floating-point types", "+nofp");
1335 else
1336 error ("%qs feature modifier is incompatible with the use of"
1337 " vector types", "+nofp");
1338 }
1339
1340 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1341 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1342 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1343 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1344 and GENERAL_REGS is lower than the memory cost (in this case the best class
1345 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1346 cost results in bad allocations with many redundant int<->FP moves which
1347 are expensive on various cores.
1348 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1349 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1350 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1351 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1352 The result of this is that it is no longer inefficient to have a higher
1353 memory move cost than the register move cost.
1354 */
1355
1356 static reg_class_t
1357 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1358 reg_class_t best_class)
1359 {
1360 machine_mode mode;
1361
1362 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1363 || !reg_class_subset_p (FP_REGS, allocno_class))
1364 return allocno_class;
1365
1366 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1367 || !reg_class_subset_p (FP_REGS, best_class))
1368 return best_class;
1369
1370 mode = PSEUDO_REGNO_MODE (regno);
1371 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1372 }
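/* Concrete example (illustrative): for a DFmode pseudo whose allocno and
   best classes are both POINTER_AND_FP_REGS, the hook returns FP_REGS,
   while an SImode pseudo in the same situation gets GENERAL_REGS; if
   either incoming class already excludes GENERAL_REGS or FP_REGS, that
   class is returned unchanged.  */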
1373
1374 static unsigned int
1375 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1376 {
1377 if (GET_MODE_UNIT_SIZE (mode) == 4)
1378 return aarch64_tune_params.min_div_recip_mul_sf;
1379 return aarch64_tune_params.min_div_recip_mul_df;
1380 }
1381
1382 /* Return the reassociation width of treeop OPC with mode MODE. */
1383 static int
1384 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1385 {
1386 if (VECTOR_MODE_P (mode))
1387 return aarch64_tune_params.vec_reassoc_width;
1388 if (INTEGRAL_MODE_P (mode))
1389 return aarch64_tune_params.int_reassoc_width;
1390 /* Avoid reassociating floating point addition so we emit more FMAs. */
1391 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1392 return aarch64_tune_params.fp_reassoc_width;
1393 return 1;
1394 }
1395
1396 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1397 unsigned
1398 aarch64_dbx_register_number (unsigned regno)
1399 {
1400 if (GP_REGNUM_P (regno))
1401 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1402 else if (regno == SP_REGNUM)
1403 return AARCH64_DWARF_SP;
1404 else if (FP_REGNUM_P (regno))
1405 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1406 else if (PR_REGNUM_P (regno))
1407 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1408 else if (regno == VG_REGNUM)
1409 return AARCH64_DWARF_VG;
1410
1411 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1412 equivalent DWARF register. */
1413 return DWARF_FRAME_REGISTERS;
1414 }
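/* Example mappings, assuming the usual AARCH64_DWARF_* values
   (R0 == 0, SP == 31, V0 == 64, P0 == 48): x7 maps to 7, sp to 31,
   v2 to 66 and p1 to 49; registers with no DWARF equivalent (such as
   the condition flags) report DWARF_FRAME_REGISTERS instead.  */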
1415
1416 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1417 static bool
1418 aarch64_advsimd_struct_mode_p (machine_mode mode)
1419 {
1420 return (TARGET_SIMD
1421 && (mode == OImode || mode == CImode || mode == XImode));
1422 }
1423
1424 /* Return true if MODE is an SVE predicate mode. */
1425 static bool
1426 aarch64_sve_pred_mode_p (machine_mode mode)
1427 {
1428 return (TARGET_SVE
1429 && (mode == VNx16BImode
1430 || mode == VNx8BImode
1431 || mode == VNx4BImode
1432 || mode == VNx2BImode));
1433 }
1434
1435 /* Three mutually-exclusive flags describing a vector or predicate type. */
1436 const unsigned int VEC_ADVSIMD = 1;
1437 const unsigned int VEC_SVE_DATA = 2;
1438 const unsigned int VEC_SVE_PRED = 4;
1439 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1440 a structure of 2, 3 or 4 vectors. */
1441 const unsigned int VEC_STRUCT = 8;
1442 /* Useful combinations of the above. */
1443 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1444 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1445
1446 /* Return a set of flags describing the vector properties of mode MODE.
1447 Ignore modes that are not supported by the current target. */
1448 static unsigned int
1449 aarch64_classify_vector_mode (machine_mode mode)
1450 {
1451 if (aarch64_advsimd_struct_mode_p (mode))
1452 return VEC_ADVSIMD | VEC_STRUCT;
1453
1454 if (aarch64_sve_pred_mode_p (mode))
1455 return VEC_SVE_PRED;
1456
1457 scalar_mode inner = GET_MODE_INNER (mode);
1458 if (VECTOR_MODE_P (mode)
1459 && (inner == QImode
1460 || inner == HImode
1461 || inner == HFmode
1462 || inner == SImode
1463 || inner == SFmode
1464 || inner == DImode
1465 || inner == DFmode))
1466 {
1467 if (TARGET_SVE)
1468 {
1469 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1470 return VEC_SVE_DATA;
1471 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1472 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1473 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1474 return VEC_SVE_DATA | VEC_STRUCT;
1475 }
1476
1477 /* This includes V1DF but not V1DI (which doesn't exist). */
1478 if (TARGET_SIMD
1479 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1480 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1481 return VEC_ADVSIMD;
1482 }
1483
1484 return 0;
1485 }
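/* Illustrative classifications, assuming Advanced SIMD plus
   variable-length SVE: V4SImode (a 128-bit Advanced SIMD vector) gives
   VEC_ADVSIMD, OImode (a pair of such vectors) gives
   VEC_ADVSIMD | VEC_STRUCT, VNx4SImode gives VEC_SVE_DATA, VNx8SImode
   gives VEC_SVE_DATA | VEC_STRUCT and VNx4BImode gives VEC_SVE_PRED;
   anything unsupported classifies as 0.  */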
1486
1487 /* Return true if MODE is any of the data vector modes, including
1488 structure modes. */
1489 static bool
1490 aarch64_vector_data_mode_p (machine_mode mode)
1491 {
1492 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1493 }
1494
1495 /* Return true if MODE is an SVE data vector mode; either a single vector
1496 or a structure of vectors. */
1497 static bool
1498 aarch64_sve_data_mode_p (machine_mode mode)
1499 {
1500 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1501 }
1502
1503 /* Implement target hook TARGET_ARRAY_MODE. */
1504 static opt_machine_mode
1505 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1506 {
1507 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1508 && IN_RANGE (nelems, 2, 4))
1509 return mode_for_vector (GET_MODE_INNER (mode),
1510 GET_MODE_NUNITS (mode) * nelems);
1511
1512 return opt_machine_mode ();
1513 }
1514
1515 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1516 static bool
1517 aarch64_array_mode_supported_p (machine_mode mode,
1518 unsigned HOST_WIDE_INT nelems)
1519 {
1520 if (TARGET_SIMD
1521 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1522 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1523 && (nelems >= 2 && nelems <= 4))
1524 return true;
1525
1526 return false;
1527 }
1528
1529 /* Return the SVE predicate mode to use for elements that have
1530 ELEM_NBYTES bytes, if such a mode exists. */
1531
1532 opt_machine_mode
1533 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1534 {
1535 if (TARGET_SVE)
1536 {
1537 if (elem_nbytes == 1)
1538 return VNx16BImode;
1539 if (elem_nbytes == 2)
1540 return VNx8BImode;
1541 if (elem_nbytes == 4)
1542 return VNx4BImode;
1543 if (elem_nbytes == 8)
1544 return VNx2BImode;
1545 }
1546 return opt_machine_mode ();
1547 }
1548
1549 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1550
1551 static opt_machine_mode
1552 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1553 {
1554 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1555 {
1556 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1557 machine_mode pred_mode;
1558 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1559 return pred_mode;
1560 }
1561
1562 return default_get_mask_mode (nunits, nbytes);
1563 }
1564
1565 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1566 prefer to use the first arithmetic operand as the else value if
1567 the else value doesn't matter, since that exactly matches the SVE
1568 destructive merging form. For ternary operations we could either
1569 pick the first operand and use FMAD-like instructions or the last
1570 operand and use FMLA-like instructions; the latter seems more
1571 natural. */
1572
1573 static tree
1574 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1575 {
1576 return nops == 3 ? ops[2] : ops[0];
1577 }
1578
1579 /* Implement TARGET_HARD_REGNO_NREGS. */
1580
1581 static unsigned int
1582 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1583 {
1584 /* ??? Logically we should only need to provide a value when
1585 HARD_REGNO_MODE_OK says that the combination is valid,
1586 but at the moment we need to handle all modes. Just ignore
1587 any runtime parts for registers that can't store them. */
1588 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1589 switch (aarch64_regno_regclass (regno))
1590 {
1591 case FP_REGS:
1592 case FP_LO_REGS:
1593 if (aarch64_sve_data_mode_p (mode))
1594 return exact_div (GET_MODE_SIZE (mode),
1595 BYTES_PER_SVE_VECTOR).to_constant ();
1596 return CEIL (lowest_size, UNITS_PER_VREG);
1597 case PR_REGS:
1598 case PR_LO_REGS:
1599 case PR_HI_REGS:
1600 return 1;
1601 default:
1602 return CEIL (lowest_size, UNITS_PER_WORD);
1603 }
1604 gcc_unreachable ();
1605 }
1606
1607 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1608
1609 static bool
1610 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1611 {
1612 if (GET_MODE_CLASS (mode) == MODE_CC)
1613 return regno == CC_REGNUM;
1614
1615 if (regno == VG_REGNUM)
1616 /* This must have the same size as _Unwind_Word. */
1617 return mode == DImode;
1618
1619 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1620 if (vec_flags & VEC_SVE_PRED)
1621 return PR_REGNUM_P (regno);
1622
1623 if (PR_REGNUM_P (regno))
1624 return 0;
1625
1626 if (regno == SP_REGNUM)
1627 /* The purpose of comparing with ptr_mode is to support the
1628 global register variable associated with the stack pointer
1629 register via the syntax of asm ("wsp") in ILP32. */
1630 return mode == Pmode || mode == ptr_mode;
1631
1632 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1633 return mode == Pmode;
1634
1635 if (GP_REGNUM_P (regno))
1636 {
1637 if (known_le (GET_MODE_SIZE (mode), 8))
1638 return true;
1639 else if (known_le (GET_MODE_SIZE (mode), 16))
1640 return (regno & 1) == 0;
1641 }
1642 else if (FP_REGNUM_P (regno))
1643 {
1644 if (vec_flags & VEC_STRUCT)
1645 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1646 else
1647 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1648 }
1649
1650 return false;
1651 }
1652
1653 /* Return true if this is a definition of a vectorized simd function. */
1654
1655 static bool
1656 aarch64_simd_decl_p (tree fndecl)
1657 {
1658 tree fntype;
1659
1660 if (fndecl == NULL)
1661 return false;
1662 fntype = TREE_TYPE (fndecl);
1663 if (fntype == NULL)
1664 return false;
1665
1666 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1667 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1668 return true;
1669
1670 return false;
1671 }
1672
1673 /* Return the mode a register save/restore should use. DImode for integer
1674 registers, DFmode for FP registers in non-SIMD functions (they only save
1675 the bottom half of a 128 bit register), or TFmode for FP registers in
1676 SIMD functions. */
1677
1678 static machine_mode
1679 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1680 {
1681 return GP_REGNUM_P (regno)
1682 ? E_DImode
1683 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1684 }
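/* For example: x19 is always saved and restored as DImode; v10 is saved
   as DFmode by an ordinary function (only its low 64 bits are
   call-preserved) but as TFmode by a function using the vector PCS,
   which must preserve the full 128-bit register.  */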
1685
1686 /* Return true if the instruction is a call to a SIMD function, false
1687 if it is not a SIMD function or if we do not know anything about
1688 the function. */
1689
1690 static bool
1691 aarch64_simd_call_p (rtx_insn *insn)
1692 {
1693 rtx symbol;
1694 rtx call;
1695 tree fndecl;
1696
1697 gcc_assert (CALL_P (insn));
1698 call = get_call_rtx_from (insn);
1699 symbol = XEXP (XEXP (call, 0), 0);
1700 if (GET_CODE (symbol) != SYMBOL_REF)
1701 return false;
1702 fndecl = SYMBOL_REF_DECL (symbol);
1703 if (!fndecl)
1704 return false;
1705
1706 return aarch64_simd_decl_p (fndecl);
1707 }
1708
1709 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1710 a function that uses the SIMD ABI, take advantage of the extra
1711 call-preserved registers that the ABI provides. */
1712
1713 void
1714 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1715 HARD_REG_SET *return_set)
1716 {
1717 if (aarch64_simd_call_p (insn))
1718 {
1719 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1720 if (FP_SIMD_SAVED_REGNUM_P (regno))
1721 CLEAR_HARD_REG_BIT (*return_set, regno);
1722 }
1723 }
1724
1725 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1726 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1727 clobbers the top 64 bits when restoring the bottom 64 bits. */
1728
1729 static bool
1730 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1731 machine_mode mode)
1732 {
1733 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1734 return FP_REGNUM_P (regno)
1735 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1736 }
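/* Illustrative case: a V4SImode (128-bit) value in v9 is part-clobbered
   by an ordinary call, since only the low 64 bits of the register are
   call-preserved, but not by a call to a SIMD-ABI function, which
   preserves the full 128 bits; anything wider than 128 bits (e.g. an
   SVE data vector) is part-clobbered even by SIMD calls.  */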
1737
1738 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1739
1740 rtx_insn *
1741 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1742 {
1743 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1744
1745 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1746 return call_1;
1747 else
1748 return call_2;
1749 }
1750
1751 /* Implement REGMODE_NATURAL_SIZE. */
1752 poly_uint64
1753 aarch64_regmode_natural_size (machine_mode mode)
1754 {
1755 /* The natural size for SVE data modes is one SVE data vector,
1756 and similarly for predicates. We can't independently modify
1757 anything smaller than that. */
1758 /* ??? For now, only do this for variable-width SVE registers.
1759 Doing it for constant-sized registers breaks lower-subreg.c. */
1760 /* ??? And once that's fixed, we should probably have similar
1761 code for Advanced SIMD. */
1762 if (!aarch64_sve_vg.is_constant ())
1763 {
1764 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1765 if (vec_flags & VEC_SVE_PRED)
1766 return BYTES_PER_SVE_PRED;
1767 if (vec_flags & VEC_SVE_DATA)
1768 return BYTES_PER_SVE_VECTOR;
1769 }
1770 return UNITS_PER_WORD;
1771 }
1772
1773 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1774 machine_mode
1775 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1776 machine_mode mode)
1777 {
1778 /* The predicate mode determines which bits are significant and
1779 which are "don't care". Decreasing the number of lanes would
1780 lose data while increasing the number of lanes would make bits
1781 unnecessarily significant. */
1782 if (PR_REGNUM_P (regno))
1783 return mode;
1784 if (known_ge (GET_MODE_SIZE (mode), 4))
1785 return mode;
1786 else
1787 return SImode;
1788 }
1789
1790 /* Return true if I's bits are consecutive ones from the MSB. */
1791 bool
1792 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1793 {
1794 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1795 }
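
/* A minimal standalone sketch (illustrative, compiled separately from this
   file) of the test above: I's bits are consecutive ones from the MSB
   exactly when -I is a nonzero power of two.  The helper name and example
   values below are placeholders, not GCC interfaces.  */

#include <cstdint>
#include <cstdio>

static bool
high_bits_all_ones_p_sketch (int64_t i)
{
  /* Negate in unsigned arithmetic to avoid signed overflow.  */
  uint64_t neg = -(uint64_t) i;
  /* A nonzero power of two has exactly one bit set.  */
  return neg != 0 && (neg & (neg - 1)) == 0;
}

int
main ()
{
  int64_t ones_from_msb = (int64_t) 0xffffffffffff0000ull;
  printf ("%d\n", high_bits_all_ones_p_sketch (ones_from_msb));      /* 1 */
  printf ("%d\n", high_bits_all_ones_p_sketch (-1));                 /* 1 */
  printf ("%d\n", high_bits_all_ones_p_sketch (0x00ff000000000000)); /* 0 */
  return 0;
}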
1796
1797 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1798 that strcpy from constants will be faster. */
1799
1800 static HOST_WIDE_INT
1801 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1802 {
1803 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1804 return MAX (align, BITS_PER_WORD);
1805 return align;
1806 }
1807
1808 /* Return true if calls to DECL should be treated as
1809 long-calls (i.e. called via a register). */
1810 static bool
1811 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1812 {
1813 return false;
1814 }
1815
1816 /* Return true if calls to symbol-ref SYM should be treated as
1817 long-calls (i.e. called via a register). */
1818 bool
1819 aarch64_is_long_call_p (rtx sym)
1820 {
1821 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1822 }
1823
1824 /* Return true if calls to symbol-ref SYM should not go through
1825 plt stubs. */
1826
1827 bool
1828 aarch64_is_noplt_call_p (rtx sym)
1829 {
1830 const_tree decl = SYMBOL_REF_DECL (sym);
1831
1832 if (flag_pic
1833 && decl
1834 && (!flag_plt
1835 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1836 && !targetm.binds_local_p (decl))
1837 return true;
1838
1839 return false;
1840 }
1841
1842 /* Return true if the offsets to a zero/sign-extract operation
1843 represent an expression that matches an extend operation. The
1844 operands represent the parameters from
1845
1846 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1847 bool
1848 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1849 rtx extract_imm)
1850 {
1851 HOST_WIDE_INT mult_val, extract_val;
1852
1853 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1854 return false;
1855
1856 mult_val = INTVAL (mult_imm);
1857 extract_val = INTVAL (extract_imm);
1858
1859 if (extract_val > 8
1860 && extract_val < GET_MODE_BITSIZE (mode)
1861 && exact_log2 (extract_val & ~7) > 0
1862 && (extract_val & 7) <= 4
1863 && mult_val == (1 << (extract_val & 7)))
1864 return true;
1865
1866 return false;
1867 }
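
/* Worked example (illustrative): for MODE == DImode, MULT_IMM == 4 and
   EXTRACT_IMM == 34 the conditions above hold (34 & ~7 == 32 is a power of
   two, 34 & 7 == 2 <= 4, and 4 == 1 << 2).  Keeping the low 34 bits of
   (reg * 4) is the same as multiplying the low 32 bits of reg by 4, so the
   extract matches an extend of a 32-bit value shifted left by 2.  */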
1868
1869 /* Emit an insn that's a simple single-set. Both the operands must be
1870 known to be valid. */
1871 inline static rtx_insn *
1872 emit_set_insn (rtx x, rtx y)
1873 {
1874 return emit_insn (gen_rtx_SET (x, y));
1875 }
1876
1877 /* X and Y are two things to compare using CODE. Emit the compare insn and
1878 return the rtx for register 0 in the proper mode. */
1879 rtx
1880 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1881 {
1882 machine_mode mode = SELECT_CC_MODE (code, x, y);
1883 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1884
1885 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1886 return cc_reg;
1887 }
1888
1889 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1890
1891 static rtx
1892 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1893 machine_mode y_mode)
1894 {
1895 if (y_mode == E_QImode || y_mode == E_HImode)
1896 {
1897 if (CONST_INT_P (y))
1898 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1899 else
1900 {
1901 rtx t, cc_reg;
1902 machine_mode cc_mode;
1903
1904 t = gen_rtx_ZERO_EXTEND (SImode, y);
1905 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1906 cc_mode = CC_SWPmode;
1907 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1908 emit_set_insn (cc_reg, t);
1909 return cc_reg;
1910 }
1911 }
1912
1913 return aarch64_gen_compare_reg (code, x, y);
1914 }
1915
1916 /* Build the SYMBOL_REF for __tls_get_addr. */
1917
1918 static GTY(()) rtx tls_get_addr_libfunc;
1919
1920 rtx
1921 aarch64_tls_get_addr (void)
1922 {
1923 if (!tls_get_addr_libfunc)
1924 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1925 return tls_get_addr_libfunc;
1926 }
1927
1928 /* Return the TLS model to use for ADDR. */
1929
1930 static enum tls_model
1931 tls_symbolic_operand_type (rtx addr)
1932 {
1933 enum tls_model tls_kind = TLS_MODEL_NONE;
1934 if (GET_CODE (addr) == CONST)
1935 {
1936 poly_int64 addend;
1937 rtx sym = strip_offset (addr, &addend);
1938 if (GET_CODE (sym) == SYMBOL_REF)
1939 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1940 }
1941 else if (GET_CODE (addr) == SYMBOL_REF)
1942 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1943
1944 return tls_kind;
1945 }
1946
1947 /* We allow LO_SUMs in our legitimate addresses so that combine
1948 can take care of combining addresses where necessary, but for
1949 generation purposes we generate the address as:
1950 
1951 RTL Absolute
1952 tmp = hi (symbol_ref); adrp x1, foo
1953 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1954 nop
1955
1956 PIC TLS
1957 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1958 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1959 bl __tls_get_addr
1960 nop
1961
1962 Load TLS symbol, depending on TLS mechanism and TLS access model.
1963
1964 Global Dynamic - Traditional TLS:
1965 adrp tmp, :tlsgd:imm
1966 add dest, tmp, #:tlsgd_lo12:imm
1967 bl __tls_get_addr
1968
1969 Global Dynamic - TLS Descriptors:
1970 adrp dest, :tlsdesc:imm
1971 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1972 add dest, dest, #:tlsdesc_lo12:imm
1973 blr tmp
1974 mrs tp, tpidr_el0
1975 add dest, dest, tp
1976
1977 Initial Exec:
1978 mrs tp, tpidr_el0
1979 adrp tmp, :gottprel:imm
1980 ldr dest, [tmp, #:gottprel_lo12:imm]
1981 add dest, dest, tp
1982
1983 Local Exec:
1984 mrs tp, tpidr_el0
1985 add t0, tp, #:tprel_hi12:imm, lsl #12
1986 add t0, t0, #:tprel_lo12_nc:imm
1987 */
1988
1989 static void
1990 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1991 enum aarch64_symbol_type type)
1992 {
1993 switch (type)
1994 {
1995 case SYMBOL_SMALL_ABSOLUTE:
1996 {
1997 /* In ILP32, the mode of dest can be either SImode or DImode. */
1998 rtx tmp_reg = dest;
1999 machine_mode mode = GET_MODE (dest);
2000
2001 gcc_assert (mode == Pmode || mode == ptr_mode);
2002
2003 if (can_create_pseudo_p ())
2004 tmp_reg = gen_reg_rtx (mode);
2005
2006 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2007 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2008 return;
2009 }
2010
2011 case SYMBOL_TINY_ABSOLUTE:
2012 emit_insn (gen_rtx_SET (dest, imm));
2013 return;
2014
2015 case SYMBOL_SMALL_GOT_28K:
2016 {
2017 machine_mode mode = GET_MODE (dest);
2018 rtx gp_rtx = pic_offset_table_rtx;
2019 rtx insn;
2020 rtx mem;
2021
2022 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2023 here before RTL expansion. Tree IVOPTS will generate RTL patterns
2024 to decide rtx costs, in which case pic_offset_table_rtx is not
2025 initialized. In that case there is no need to generate the first
2026 adrp instruction, as the final cost for a global variable access
2027 is one instruction. */
2028 if (gp_rtx != NULL)
2029 {
2030 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since we
2031 use the page base as the GOT base, the first page may be wasted; in
2032 the worst case only 28K of space is left for the GOT).
2033 
2034 The generated instruction sequence for accessing a global variable
2035 is:
2036 
2037 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2038 
2039 Only one instruction is needed, but we must initialize
2040 pic_offset_table_rtx properly. We generate an initialization insn
2041 for every global access, and allow CSE to remove all redundant ones.
2042 
2043 The final instruction sequence will look like the following
2044 when multiple global variables are accessed.
2045
2046 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2047
2048 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2049 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2050 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2051 ... */
2052
2053 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2054 crtl->uses_pic_offset_table = 1;
2055 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2056
2057 if (mode != GET_MODE (gp_rtx))
2058 gp_rtx = gen_lowpart (mode, gp_rtx);
2059
2060 }
2061
2062 if (mode == ptr_mode)
2063 {
2064 if (mode == DImode)
2065 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2066 else
2067 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2068
2069 mem = XVECEXP (SET_SRC (insn), 0, 0);
2070 }
2071 else
2072 {
2073 gcc_assert (mode == Pmode);
2074
2075 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2076 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2077 }
2078
2079 /* The operand is expected to be a MEM. Whenever the related insn
2080 pattern is changed, the above code which calculates MEM should be
2081 updated. */
2082 gcc_assert (GET_CODE (mem) == MEM);
2083 MEM_READONLY_P (mem) = 1;
2084 MEM_NOTRAP_P (mem) = 1;
2085 emit_insn (insn);
2086 return;
2087 }
2088
2089 case SYMBOL_SMALL_GOT_4G:
2090 {
2091 /* In ILP32, the mode of dest can be either SImode or DImode,
2092 while the got entry is always of SImode size. The mode of
2093 dest depends on how dest is used: if dest is assigned to a
2094 pointer (e.g. in the memory), it has SImode; it may have
2095 DImode if dest is dereferenced to access the memory.
2096 This is why we have to handle three different ldr_got_small
2097 patterns here (two patterns for ILP32). */
2098
2099 rtx insn;
2100 rtx mem;
2101 rtx tmp_reg = dest;
2102 machine_mode mode = GET_MODE (dest);
2103
2104 if (can_create_pseudo_p ())
2105 tmp_reg = gen_reg_rtx (mode);
2106
2107 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2108 if (mode == ptr_mode)
2109 {
2110 if (mode == DImode)
2111 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2112 else
2113 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2114
2115 mem = XVECEXP (SET_SRC (insn), 0, 0);
2116 }
2117 else
2118 {
2119 gcc_assert (mode == Pmode);
2120
2121 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2122 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2123 }
2124
2125 gcc_assert (GET_CODE (mem) == MEM);
2126 MEM_READONLY_P (mem) = 1;
2127 MEM_NOTRAP_P (mem) = 1;
2128 emit_insn (insn);
2129 return;
2130 }
2131
2132 case SYMBOL_SMALL_TLSGD:
2133 {
2134 rtx_insn *insns;
2135 machine_mode mode = GET_MODE (dest);
2136 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2137
2138 start_sequence ();
2139 if (TARGET_ILP32)
2140 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2141 else
2142 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2143 insns = get_insns ();
2144 end_sequence ();
2145
2146 RTL_CONST_CALL_P (insns) = 1;
2147 emit_libcall_block (insns, dest, result, imm);
2148 return;
2149 }
2150
2151 case SYMBOL_SMALL_TLSDESC:
2152 {
2153 machine_mode mode = GET_MODE (dest);
2154 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2155 rtx tp;
2156
2157 gcc_assert (mode == Pmode || mode == ptr_mode);
2158
2159 /* In ILP32, the got entry is always of SImode size. Unlike
2160 small GOT, the dest is fixed at reg 0. */
2161 if (TARGET_ILP32)
2162 emit_insn (gen_tlsdesc_small_si (imm));
2163 else
2164 emit_insn (gen_tlsdesc_small_di (imm));
2165 tp = aarch64_load_tp (NULL);
2166
2167 if (mode != Pmode)
2168 tp = gen_lowpart (mode, tp);
2169
2170 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2171 if (REG_P (dest))
2172 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2173 return;
2174 }
2175
2176 case SYMBOL_SMALL_TLSIE:
2177 {
2178 /* In ILP32, the mode of dest can be either SImode or DImode,
2179 while the got entry is always of SImode size. The mode of
2180 dest depends on how dest is used: if dest is assigned to a
2181 pointer (e.g. in the memory), it has SImode; it may have
2182 DImode if dest is dereferenced to access the memory.
2183 This is why we have to handle three different tlsie_small
2184 patterns here (two patterns for ILP32). */
2185 machine_mode mode = GET_MODE (dest);
2186 rtx tmp_reg = gen_reg_rtx (mode);
2187 rtx tp = aarch64_load_tp (NULL);
2188
2189 if (mode == ptr_mode)
2190 {
2191 if (mode == DImode)
2192 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2193 else
2194 {
2195 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2196 tp = gen_lowpart (mode, tp);
2197 }
2198 }
2199 else
2200 {
2201 gcc_assert (mode == Pmode);
2202 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2203 }
2204
2205 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2206 if (REG_P (dest))
2207 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2208 return;
2209 }
2210
2211 case SYMBOL_TLSLE12:
2212 case SYMBOL_TLSLE24:
2213 case SYMBOL_TLSLE32:
2214 case SYMBOL_TLSLE48:
2215 {
2216 machine_mode mode = GET_MODE (dest);
2217 rtx tp = aarch64_load_tp (NULL);
2218
2219 if (mode != Pmode)
2220 tp = gen_lowpart (mode, tp);
2221
2222 switch (type)
2223 {
2224 case SYMBOL_TLSLE12:
2225 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2226 (dest, tp, imm));
2227 break;
2228 case SYMBOL_TLSLE24:
2229 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2230 (dest, tp, imm));
2231 break;
2232 case SYMBOL_TLSLE32:
2233 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2234 (dest, imm));
2235 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2236 (dest, dest, tp));
2237 break;
2238 case SYMBOL_TLSLE48:
2239 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2240 (dest, imm));
2241 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2242 (dest, dest, tp));
2243 break;
2244 default:
2245 gcc_unreachable ();
2246 }
2247
2248 if (REG_P (dest))
2249 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2250 return;
2251 }
2252
2253 case SYMBOL_TINY_GOT:
2254 emit_insn (gen_ldr_got_tiny (dest, imm));
2255 return;
2256
2257 case SYMBOL_TINY_TLSIE:
2258 {
2259 machine_mode mode = GET_MODE (dest);
2260 rtx tp = aarch64_load_tp (NULL);
2261
2262 if (mode == ptr_mode)
2263 {
2264 if (mode == DImode)
2265 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2266 else
2267 {
2268 tp = gen_lowpart (mode, tp);
2269 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2270 }
2271 }
2272 else
2273 {
2274 gcc_assert (mode == Pmode);
2275 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2276 }
2277
2278 if (REG_P (dest))
2279 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2280 return;
2281 }
2282
2283 default:
2284 gcc_unreachable ();
2285 }
2286 }
2287
2288 /* Emit a move from SRC to DEST. Assume that the move expanders can
2289 handle all moves if !can_create_pseudo_p (). The distinction is
2290 important because, unlike emit_move_insn, the move expanders know
2291 how to force Pmode objects into the constant pool even when the
2292 constant pool address is not itself legitimate. */
2293 static rtx
2294 aarch64_emit_move (rtx dest, rtx src)
2295 {
2296 return (can_create_pseudo_p ()
2297 ? emit_move_insn (dest, src)
2298 : emit_move_insn_1 (dest, src));
2299 }
2300
2301 /* Apply UNOPTAB to OP and store the result in DEST. */
2302
2303 static void
2304 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2305 {
2306 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2307 if (dest != tmp)
2308 emit_move_insn (dest, tmp);
2309 }
2310
2311 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2312
2313 static void
2314 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2315 {
2316 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2317 OPTAB_DIRECT);
2318 if (dest != tmp)
2319 emit_move_insn (dest, tmp);
2320 }
2321
2322 /* Split a 128-bit move operation into two 64-bit move operations,
2323 taking care to handle partial overlap of register to register
2324 copies. Special cases are needed when moving between GP regs and
2325 FP regs. SRC can be a register, constant or memory; DST a register
2326 or memory. If either operand is memory it must not have any side
2327 effects. */
2328 void
2329 aarch64_split_128bit_move (rtx dst, rtx src)
2330 {
2331 rtx dst_lo, dst_hi;
2332 rtx src_lo, src_hi;
2333
2334 machine_mode mode = GET_MODE (dst);
2335
2336 gcc_assert (mode == TImode || mode == TFmode);
2337 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2338 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2339
2340 if (REG_P (dst) && REG_P (src))
2341 {
2342 int src_regno = REGNO (src);
2343 int dst_regno = REGNO (dst);
2344
2345 /* Handle FP <-> GP regs. */
2346 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2347 {
2348 src_lo = gen_lowpart (word_mode, src);
2349 src_hi = gen_highpart (word_mode, src);
2350
2351 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2352 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2353 return;
2354 }
2355 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2356 {
2357 dst_lo = gen_lowpart (word_mode, dst);
2358 dst_hi = gen_highpart (word_mode, dst);
2359
2360 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2361 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2362 return;
2363 }
2364 }
2365
2366 dst_lo = gen_lowpart (word_mode, dst);
2367 dst_hi = gen_highpart (word_mode, dst);
2368 src_lo = gen_lowpart (word_mode, src);
2369 src_hi = gen_highpart_mode (word_mode, mode, src);
2370
2371 /* At most one pairing may overlap. */
2372 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2373 {
2374 aarch64_emit_move (dst_hi, src_hi);
2375 aarch64_emit_move (dst_lo, src_lo);
2376 }
2377 else
2378 {
2379 aarch64_emit_move (dst_lo, src_lo);
2380 aarch64_emit_move (dst_hi, src_hi);
2381 }
2382 }
2383
2384 bool
2385 aarch64_split_128bit_move_p (rtx dst, rtx src)
2386 {
2387 return (! REG_P (src)
2388 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2389 }
2390
2391 /* Split a complex SIMD combine. */
2392
2393 void
2394 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2395 {
2396 machine_mode src_mode = GET_MODE (src1);
2397 machine_mode dst_mode = GET_MODE (dst);
2398
2399 gcc_assert (VECTOR_MODE_P (dst_mode));
2400 gcc_assert (register_operand (dst, dst_mode)
2401 && register_operand (src1, src_mode)
2402 && register_operand (src2, src_mode));
2403
2404 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2405 return;
2406 }
2407
2408 /* Split a complex SIMD move. */
2409
2410 void
2411 aarch64_split_simd_move (rtx dst, rtx src)
2412 {
2413 machine_mode src_mode = GET_MODE (src);
2414 machine_mode dst_mode = GET_MODE (dst);
2415
2416 gcc_assert (VECTOR_MODE_P (dst_mode));
2417
2418 if (REG_P (dst) && REG_P (src))
2419 {
2420 gcc_assert (VECTOR_MODE_P (src_mode));
2421 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2422 }
2423 }
2424
2425 bool
2426 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2427 machine_mode ymode, rtx y)
2428 {
2429 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2430 gcc_assert (r != NULL);
2431 return rtx_equal_p (x, r);
2432 }
2433
2434
2435 static rtx
2436 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2437 {
2438 if (can_create_pseudo_p ())
2439 return force_reg (mode, value);
2440 else
2441 {
2442 gcc_assert (x);
2443 aarch64_emit_move (x, value);
2444 return x;
2445 }
2446 }
2447
2448 /* Return true if we can move VALUE into a register using a single
2449 CNT[BHWD] instruction. */
2450
2451 static bool
2452 aarch64_sve_cnt_immediate_p (poly_int64 value)
2453 {
2454 HOST_WIDE_INT factor = value.coeffs[0];
2455 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2456 return (value.coeffs[1] == factor
2457 && IN_RANGE (factor, 2, 16 * 16)
2458 && (factor & 1) == 0
2459 && factor <= 16 * (factor & -factor));
2460 }
2461
2462 /* Likewise for rtx X. */
2463
2464 bool
2465 aarch64_sve_cnt_immediate_p (rtx x)
2466 {
2467 poly_int64 value;
2468 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2469 }
2470
2471 /* Return the asm string for an instruction with a CNT-like vector size
2472 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2473 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2474 first part of the operands template (the part that comes before the
2475 vector size itself). FACTOR is the number of quadwords.
2476 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2477 If it is zero, we can use any element size. */
2478
2479 static char *
2480 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2481 unsigned int factor,
2482 unsigned int nelts_per_vq)
2483 {
2484 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2485
2486 if (nelts_per_vq == 0)
2487 /* There is some overlap in the ranges of the four CNT instructions.
2488 Here we always use the smallest possible element size, so that the
2489 multiplier is 1 wherever possible. */
2490 nelts_per_vq = factor & -factor;
2491 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2492 gcc_assert (IN_RANGE (shift, 1, 4));
2493 char suffix = "dwhb"[shift - 1];
2494
2495 factor >>= shift;
2496 unsigned int written;
2497 if (factor == 1)
2498 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2499 prefix, suffix, operands);
2500 else
2501 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2502 prefix, suffix, operands, factor);
2503 gcc_assert (written < sizeof (buffer));
2504 return buffer;
2505 }
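
/* A minimal standalone sketch (illustrative, compiled separately from this
   file) of the suffix and multiplier selection above for the
   NELTS_PER_VQ == 0 case, assuming FACTOR is already valid according to
   aarch64_sve_cnt_immediate_p.  The "cnt" mnemonic and x0 operand are
   placeholders for PREFIX and OPERANDS.  */

#include <cstdio>
#include <string>

static std::string
sve_cnt_asm_sketch (unsigned int factor)
{
  /* Use the smallest element count per 128-bit quadword that divides
     FACTOR, capped at 16 (bytes).  */
  unsigned int nelts_per_vq = factor & -factor;
  int shift = __builtin_ctz (nelts_per_vq);
  if (shift > 4)
    shift = 4;
  char suffix = "dwhb"[shift - 1];
  factor >>= shift;

  char buf[64];
  if (factor == 1)
    snprintf (buf, sizeof buf, "cnt%c\tx0", suffix);
  else
    snprintf (buf, sizeof buf, "cnt%c\tx0, all, mul #%u", suffix, factor);
  return buf;
}

int
main ()
{
  printf ("%s\n", sve_cnt_asm_sketch (2).c_str ());  /* cntd x0              */
  printf ("%s\n", sve_cnt_asm_sketch (6).c_str ());  /* cntd x0, all, mul #3 */
  printf ("%s\n", sve_cnt_asm_sketch (16).c_str ()); /* cntb x0              */
  return 0;
}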
2506
2507 /* Return the asm string for an instruction with a CNT-like vector size
2508 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2509 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2510 first part of the operands template (the part that comes before the
2511 vector size itself). X is the value of the vector size operand,
2512 as a polynomial integer rtx. */
2513
2514 char *
2515 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2516 rtx x)
2517 {
2518 poly_int64 value = rtx_to_poly_int64 (x);
2519 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2520 return aarch64_output_sve_cnt_immediate (prefix, operands,
2521 value.coeffs[1], 0);
2522 }
2523
2524 /* Return true if we can add VALUE to a register using a single ADDVL
2525 or ADDPL instruction. */
2526
2527 static bool
2528 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2529 {
2530 HOST_WIDE_INT factor = value.coeffs[0];
2531 if (factor == 0 || value.coeffs[1] != factor)
2532 return false;
2533 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2534 and a value of 16 is one vector width. */
2535 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2536 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2537 }
2538
2539 /* Likewise for rtx X. */
2540
2541 bool
2542 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2543 {
2544 poly_int64 value;
2545 return (poly_int_rtx_p (x, &value)
2546 && aarch64_sve_addvl_addpl_immediate_p (value));
2547 }
2548
2549 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2550 and storing the result in operand 0. */
2551
2552 char *
2553 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2554 {
2555 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2556 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2557 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2558
2559 /* Use INC or DEC if possible. */
2560 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2561 {
2562 if (aarch64_sve_cnt_immediate_p (offset_value))
2563 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2564 offset_value.coeffs[1], 0);
2565 if (aarch64_sve_cnt_immediate_p (-offset_value))
2566 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2567 -offset_value.coeffs[1], 0);
2568 }
2569
2570 int factor = offset_value.coeffs[1];
2571 if ((factor & 15) == 0)
2572 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2573 else
2574 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2575 return buffer;
2576 }
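
/* Worked examples (illustrative, with x0/x1 standing for operands 0 and 1):
   an offset of one full vector of bytes has FACTOR == 16 and gives
   "addvl x0, x1, #1"; an offset of one predicate of bytes (an eighth of a
   vector) has FACTOR == 2 and gives "addpl x0, x1, #1"; FACTOR == -32
   gives "addvl x0, x1, #-2".  */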
2577
2578 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2579 instruction. If it is, store the number of elements in each vector
2580 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2581 factor in *FACTOR_OUT (if nonnull). */
2582
2583 bool
2584 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2585 unsigned int *nelts_per_vq_out)
2586 {
2587 rtx elt;
2588 poly_int64 value;
2589
2590 if (!const_vec_duplicate_p (x, &elt)
2591 || !poly_int_rtx_p (elt, &value))
2592 return false;
2593
2594 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2595 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2596 /* There's no vector INCB. */
2597 return false;
2598
2599 HOST_WIDE_INT factor = value.coeffs[0];
2600 if (value.coeffs[1] != factor)
2601 return false;
2602
2603 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2604 if ((factor % nelts_per_vq) != 0
2605 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2606 return false;
2607
2608 if (factor_out)
2609 *factor_out = factor;
2610 if (nelts_per_vq_out)
2611 *nelts_per_vq_out = nelts_per_vq;
2612 return true;
2613 }
2614
2615 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2616 instruction. */
2617
2618 bool
2619 aarch64_sve_inc_dec_immediate_p (rtx x)
2620 {
2621 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2622 }
2623
2624 /* Return the asm template for an SVE vector INC or DEC instruction.
2625 OPERANDS gives the operands before the vector count and X is the
2626 value of the vector count operand itself. */
2627
2628 char *
2629 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2630 {
2631 int factor;
2632 unsigned int nelts_per_vq;
2633 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2634 gcc_unreachable ();
2635 if (factor < 0)
2636 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2637 nelts_per_vq);
2638 else
2639 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2640 nelts_per_vq);
2641 }
2642
2643 static int
2644 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2645 scalar_int_mode mode)
2646 {
2647 int i;
2648 unsigned HOST_WIDE_INT val, val2, mask;
2649 int one_match, zero_match;
2650 int num_insns;
2651
2652 val = INTVAL (imm);
2653
2654 if (aarch64_move_imm (val, mode))
2655 {
2656 if (generate)
2657 emit_insn (gen_rtx_SET (dest, imm));
2658 return 1;
2659 }
2660
2661 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2662 (with XXXX non-zero). In that case check to see if the move can be done in
2663 a smaller mode. */
2664 val2 = val & 0xffffffff;
2665 if (mode == DImode
2666 && aarch64_move_imm (val2, SImode)
2667 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2668 {
2669 if (generate)
2670 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2671
2672 /* Check if we have to emit a second instruction by checking to see
2673 if any of the upper 32 bits of the original DI mode value is set. */
2674 if (val == val2)
2675 return 1;
2676
2677 i = (val >> 48) ? 48 : 32;
2678
2679 if (generate)
2680 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2681 GEN_INT ((val >> i) & 0xffff)));
2682
2683 return 2;
2684 }
2685
2686 if ((val >> 32) == 0 || mode == SImode)
2687 {
2688 if (generate)
2689 {
2690 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2691 if (mode == SImode)
2692 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2693 GEN_INT ((val >> 16) & 0xffff)));
2694 else
2695 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2696 GEN_INT ((val >> 16) & 0xffff)));
2697 }
2698 return 2;
2699 }
2700
2701 /* Remaining cases are all for DImode. */
2702
2703 mask = 0xffff;
2704 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2705 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2706 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2707 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2708
2709 if (zero_match != 2 && one_match != 2)
2710 {
2711 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2712 For a 64-bit bitmask try whether changing 16 bits to all ones or
2713 zeroes creates a valid bitmask. To check any repeated bitmask,
2714 try using 16 bits from the other 32-bit half of val. */
2715
2716 for (i = 0; i < 64; i += 16, mask <<= 16)
2717 {
2718 val2 = val & ~mask;
2719 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2720 break;
2721 val2 = val | mask;
2722 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2723 break;
2724 val2 = val2 & ~mask;
2725 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2726 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2727 break;
2728 }
2729 if (i != 64)
2730 {
2731 if (generate)
2732 {
2733 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2734 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2735 GEN_INT ((val >> i) & 0xffff)));
2736 }
2737 return 2;
2738 }
2739 }
2740
2741 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2742 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2743 otherwise skip zero bits. */
2744
2745 num_insns = 1;
2746 mask = 0xffff;
2747 val2 = one_match > zero_match ? ~val : val;
2748 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2749
2750 if (generate)
2751 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2752 ? (val | ~(mask << i))
2753 : (val & (mask << i)))));
2754 for (i += 16; i < 64; i += 16)
2755 {
2756 if ((val2 & (mask << i)) == 0)
2757 continue;
2758 if (generate)
2759 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2760 GEN_INT ((val >> i) & 0xffff)));
2761 num_insns ++;
2762 }
2763
2764 return num_insns;
2765 }
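
/* A simplified standalone sketch (illustrative, compiled separately from
   this file) of the final fallback above: pick an all-zeros or all-ones
   16-bit background, emit one MOVZ/MOVN for the first differing chunk and
   one MOVK for each remaining differing chunk.  It deliberately ignores the
   bitmask-immediate and 32-bit shortcuts handled earlier in the function.  */

#include <cstdint>
#include <cstdio>

static int
mov_imm_insns_sketch (uint64_t val)
{
  int zero_chunks = 0, one_chunks = 0;
  for (int i = 0; i < 64; i += 16)
    {
      unsigned int chunk = (val >> i) & 0xffff;
      zero_chunks += (chunk == 0);
      one_chunks += (chunk == 0xffff);
    }

  /* The background is whichever of all-zeros (MOVZ) or all-ones (MOVN)
     covers more 16-bit chunks; every other chunk needs an insn.  */
  unsigned int background = one_chunks > zero_chunks ? 0xffff : 0;
  int insns = 0;
  for (int i = 0; i < 64; i += 16)
    if (((val >> i) & 0xffff) != background)
      insns++;
  return insns == 0 ? 1 : insns;
}

int
main ()
{
  printf ("%d\n", mov_imm_insns_sketch (0x1234));             /* 1: movz        */
  printf ("%d\n", mov_imm_insns_sketch (0xffffffffffff1234)); /* 1: movn        */
  printf ("%d\n", mov_imm_insns_sketch (0x12340000abcd0000)); /* 2: movz + movk */
  return 0;
}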
2766
2767 /* Return whether imm is a 128-bit immediate which is simple enough to
2768 expand inline. */
2769 bool
2770 aarch64_mov128_immediate (rtx imm)
2771 {
2772 if (GET_CODE (imm) == CONST_INT)
2773 return true;
2774
2775 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2776
2777 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2778 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2779
2780 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2781 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2782 }
2783
2784
2785 /* Return the number of temporary registers that aarch64_add_offset_1
2786 would need to add OFFSET to a register. */
2787
2788 static unsigned int
2789 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2790 {
2791 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2792 }
2793
2794 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2795 a non-polynomial OFFSET. MODE is the mode of the addition.
2796 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2797 be set and CFA adjustments added to the generated instructions.
2798
2799 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2800 temporary if register allocation is already complete. This temporary
2801 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2802 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2803 the immediate again.
2804
2805 Since this function may be used to adjust the stack pointer, we must
2806 ensure that it cannot cause transient stack deallocation (for example
2807 by first incrementing SP and then decrementing when adjusting by a
2808 large immediate). */
2809
2810 static void
2811 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2812 rtx src, HOST_WIDE_INT offset, rtx temp1,
2813 bool frame_related_p, bool emit_move_imm)
2814 {
2815 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2816 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2817
2818 HOST_WIDE_INT moffset = abs_hwi (offset);
2819 rtx_insn *insn;
2820
2821 if (!moffset)
2822 {
2823 if (!rtx_equal_p (dest, src))
2824 {
2825 insn = emit_insn (gen_rtx_SET (dest, src));
2826 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2827 }
2828 return;
2829 }
2830
2831 /* Single instruction adjustment. */
2832 if (aarch64_uimm12_shift (moffset))
2833 {
2834 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2835 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2836 return;
2837 }
2838
2839 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2840 and either:
2841
2842 a) the offset cannot be loaded by a 16-bit move or
2843 b) there is no spare register into which we can move it. */
2844 if (moffset < 0x1000000
2845 && ((!temp1 && !can_create_pseudo_p ())
2846 || !aarch64_move_imm (moffset, mode)))
2847 {
2848 HOST_WIDE_INT low_off = moffset & 0xfff;
2849
2850 low_off = offset < 0 ? -low_off : low_off;
2851 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2852 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2853 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2854 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2855 return;
2856 }
2857
2858 /* Emit a move immediate if required and an addition/subtraction. */
2859 if (emit_move_imm)
2860 {
2861 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2862 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2863 }
2864 insn = emit_insn (offset < 0
2865 ? gen_sub3_insn (dest, src, temp1)
2866 : gen_add3_insn (dest, src, temp1));
2867 if (frame_related_p)
2868 {
2869 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2870 rtx adj = plus_constant (mode, src, offset);
2871 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2872 }
2873 }
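
/* Worked examples (illustrative) of the three cases above:
     OFFSET == 0x123000:   add  dest, src, #0x123, lsl #12
     OFFSET == 0x123456:   add  dest, src, #0x456
                           add  dest, dest, #0x123, lsl #12
     OFFSET == 0x12345678: build the immediate in TEMP1 with the
                           move-immediate code, then
                           add  dest, src, temp1  */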
2874
2875 /* Return the number of temporary registers that aarch64_add_offset
2876 would need to move OFFSET into a register or add OFFSET to a register;
2877 ADD_P is true if we want the latter rather than the former. */
2878
2879 static unsigned int
2880 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2881 {
2882 /* This follows the same structure as aarch64_add_offset. */
2883 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2884 return 0;
2885
2886 unsigned int count = 0;
2887 HOST_WIDE_INT factor = offset.coeffs[1];
2888 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2889 poly_int64 poly_offset (factor, factor);
2890 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2891 /* Need one register for the ADDVL/ADDPL result. */
2892 count += 1;
2893 else if (factor != 0)
2894 {
2895 factor = abs (factor);
2896 if (factor > 16 * (factor & -factor))
2897 /* Need one register for the CNT result and one for the multiplication
2898 factor. If necessary, the second temporary can be reused for the
2899 constant part of the offset. */
2900 return 2;
2901 /* Need one register for the CNT result (which might then
2902 be shifted). */
2903 count += 1;
2904 }
2905 return count + aarch64_add_offset_1_temporaries (constant);
2906 }
2907
2908 /* If X can be represented as a poly_int64, return the number
2909 of temporaries that are required to add it to a register.
2910 Return -1 otherwise. */
2911
2912 int
2913 aarch64_add_offset_temporaries (rtx x)
2914 {
2915 poly_int64 offset;
2916 if (!poly_int_rtx_p (x, &offset))
2917 return -1;
2918 return aarch64_offset_temporaries (true, offset);
2919 }
2920
2921 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2922 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2923 be set and CFA adjustments added to the generated instructions.
2924
2925 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2926 temporary if register allocation is already complete. This temporary
2927 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2928 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2929 false to avoid emitting the immediate again.
2930
2931 TEMP2, if nonnull, is a second temporary register that doesn't
2932 overlap either DEST or SRC.
2933
2934 Since this function may be used to adjust the stack pointer, we must
2935 ensure that it cannot cause transient stack deallocation (for example
2936 by first incrementing SP and then decrementing when adjusting by a
2937 large immediate). */
2938
2939 static void
2940 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2941 poly_int64 offset, rtx temp1, rtx temp2,
2942 bool frame_related_p, bool emit_move_imm = true)
2943 {
2944 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2945 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2946 gcc_assert (temp1 == NULL_RTX
2947 || !frame_related_p
2948 || !reg_overlap_mentioned_p (temp1, dest));
2949 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2950
2951 /* Try using ADDVL or ADDPL to add the whole value. */
2952 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2953 {
2954 rtx offset_rtx = gen_int_mode (offset, mode);
2955 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2956 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2957 return;
2958 }
2959
2960 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2961 SVE vector register, over and above the minimum size of 128 bits.
2962 This is equivalent to half the value returned by CNTD with a
2963 vector shape of ALL. */
2964 HOST_WIDE_INT factor = offset.coeffs[1];
2965 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2966
2967 /* Try using ADDVL or ADDPL to add the VG-based part. */
2968 poly_int64 poly_offset (factor, factor);
2969 if (src != const0_rtx
2970 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2971 {
2972 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2973 if (frame_related_p)
2974 {
2975 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2976 RTX_FRAME_RELATED_P (insn) = true;
2977 src = dest;
2978 }
2979 else
2980 {
2981 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2982 src = aarch64_force_temporary (mode, temp1, addr);
2983 temp1 = temp2;
2984 temp2 = NULL_RTX;
2985 }
2986 }
2987 /* Otherwise use a CNT-based sequence. */
2988 else if (factor != 0)
2989 {
2990 /* Use a subtraction if we have a negative factor. */
2991 rtx_code code = PLUS;
2992 if (factor < 0)
2993 {
2994 factor = -factor;
2995 code = MINUS;
2996 }
2997
2998 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2999 into the multiplication. */
3000 rtx val;
3001 int shift = 0;
3002 if (factor & 1)
3003 /* Use a right shift by 1. */
3004 shift = -1;
3005 else
3006 factor /= 2;
3007 HOST_WIDE_INT low_bit = factor & -factor;
3008 if (factor <= 16 * low_bit)
3009 {
3010 if (factor > 16 * 8)
3011 {
3012 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3013 the value with the minimum multiplier and shift it into
3014 position. */
3015 int extra_shift = exact_log2 (low_bit);
3016 shift += extra_shift;
3017 factor >>= extra_shift;
3018 }
3019 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3020 }
3021 else
3022 {
3023 /* Use CNTD, then multiply it by FACTOR. */
3024 val = gen_int_mode (poly_int64 (2, 2), mode);
3025 val = aarch64_force_temporary (mode, temp1, val);
3026
3027 /* Go back to using a negative multiplication factor if we have
3028 no register from which to subtract. */
3029 if (code == MINUS && src == const0_rtx)
3030 {
3031 factor = -factor;
3032 code = PLUS;
3033 }
3034 rtx coeff1 = gen_int_mode (factor, mode);
3035 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3036 val = gen_rtx_MULT (mode, val, coeff1);
3037 }
3038
3039 if (shift > 0)
3040 {
3041 /* Multiply by 1 << SHIFT. */
3042 val = aarch64_force_temporary (mode, temp1, val);
3043 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3044 }
3045 else if (shift == -1)
3046 {
3047 /* Divide by 2. */
3048 val = aarch64_force_temporary (mode, temp1, val);
3049 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3050 }
3051
3052 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3053 if (src != const0_rtx)
3054 {
3055 val = aarch64_force_temporary (mode, temp1, val);
3056 val = gen_rtx_fmt_ee (code, mode, src, val);
3057 }
3058 else if (code == MINUS)
3059 {
3060 val = aarch64_force_temporary (mode, temp1, val);
3061 val = gen_rtx_NEG (mode, val);
3062 }
3063
3064 if (constant == 0 || frame_related_p)
3065 {
3066 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3067 if (frame_related_p)
3068 {
3069 RTX_FRAME_RELATED_P (insn) = true;
3070 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3071 gen_rtx_SET (dest, plus_constant (Pmode, src,
3072 poly_offset)));
3073 }
3074 src = dest;
3075 if (constant == 0)
3076 return;
3077 }
3078 else
3079 {
3080 src = aarch64_force_temporary (mode, temp1, val);
3081 temp1 = temp2;
3082 temp2 = NULL_RTX;
3083 }
3084
3085 emit_move_imm = true;
3086 }
3087
3088 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3089 frame_related_p, emit_move_imm);
3090 }
3091
3092 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3093 than a poly_int64. */
3094
3095 void
3096 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3097 rtx offset_rtx, rtx temp1, rtx temp2)
3098 {
3099 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3100 temp1, temp2, false);
3101 }
3102
3103 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3104 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3105 if TEMP1 already contains abs (DELTA). */
3106
3107 static inline void
3108 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3109 {
3110 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3111 temp1, temp2, true, emit_move_imm);
3112 }
3113
3114 /* Subtract DELTA from the stack pointer, marking the instructions
3115 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3116 if nonnull. */
3117
3118 static inline void
3119 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3120 bool emit_move_imm = true)
3121 {
3122 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3123 temp1, temp2, frame_related_p, emit_move_imm);
3124 }
3125
3126 /* Set DEST to (vec_series BASE STEP). */
3127
3128 static void
3129 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3130 {
3131 machine_mode mode = GET_MODE (dest);
3132 scalar_mode inner = GET_MODE_INNER (mode);
3133
3134 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3135 if (!aarch64_sve_index_immediate_p (base))
3136 base = force_reg (inner, base);
3137 if (!aarch64_sve_index_immediate_p (step))
3138 step = force_reg (inner, step);
3139
3140 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3141 }
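
/* For example (illustrative), a VNx4SI series with BASE == 0 and STEP == 1
   becomes a single INDEX instruction, "index z0.s, #0, #1"; bases or steps
   outside [-16, 15] are first forced into scalar registers.  */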
3142
3143 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3144 integer of mode SRC_MODE. Return true on success. */
3145
3146 static bool
3147 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3148 rtx src)
3149 {
3150 /* If the constant is smaller than 128 bits, we can do the move
3151 using a vector of SRC_MODEs. */
3152 if (src_mode != TImode)
3153 {
3154 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3155 GET_MODE_SIZE (src_mode));
3156 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3157 emit_move_insn (gen_lowpart (dup_mode, dest),
3158 gen_const_vec_duplicate (dup_mode, src));
3159 return true;
3160 }
3161
3162 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3163 src = force_const_mem (src_mode, src);
3164 if (!src)
3165 return false;
3166
3167 /* Make sure that the address is legitimate. */
3168 if (!aarch64_sve_ld1r_operand_p (src))
3169 {
3170 rtx addr = force_reg (Pmode, XEXP (src, 0));
3171 src = replace_equiv_address (src, addr);
3172 }
3173
3174 machine_mode mode = GET_MODE (dest);
3175 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3176 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3177 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3178 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3179 emit_insn (gen_rtx_SET (dest, src));
3180 return true;
3181 }
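
/* Worked example (illustrative, little-endian): a VNx16QI constant that
   repeats the bytes { 1, 2, 3, 4 } is passed here as the SImode integer
   0x04030201 and is moved as a VNx4SI duplicate of that value; a full
   128-bit (TImode) pattern instead goes through the LD1RQ path above.  */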
3182
3183 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3184 isn't a simple duplicate or series. */
3185
3186 static void
3187 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3188 {
3189 machine_mode mode = GET_MODE (src);
3190 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3191 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3192 gcc_assert (npatterns > 1);
3193
3194 if (nelts_per_pattern == 1)
3195 {
3196 /* The constant is a repeating sequence of at least two elements,
3197 where the repeating elements occupy no more than 128 bits.
3198 Get an integer representation of the replicated value. */
3199 scalar_int_mode int_mode;
3200 if (BYTES_BIG_ENDIAN)
3201 /* For now, always use LD1RQ to load the value on big-endian
3202 targets, since the handling of smaller integers includes a
3203 subreg that is semantically an element reverse. */
3204 int_mode = TImode;
3205 else
3206 {
3207 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3208 gcc_assert (int_bits <= 128);
3209 int_mode = int_mode_for_size (int_bits, 0).require ();
3210 }
3211 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3212 if (int_value
3213 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3214 return;
3215 }
3216
3217 /* Expand each pattern individually. */
3218 rtx_vector_builder builder;
3219 auto_vec<rtx, 16> vectors (npatterns);
3220 for (unsigned int i = 0; i < npatterns; ++i)
3221 {
3222 builder.new_vector (mode, 1, nelts_per_pattern);
3223 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3224 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3225 vectors.quick_push (force_reg (mode, builder.build ()));
3226 }
3227
3228 /* Use permutes to interleave the separate vectors. */
3229 while (npatterns > 1)
3230 {
3231 npatterns /= 2;
3232 for (unsigned int i = 0; i < npatterns; ++i)
3233 {
3234 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3235 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3236 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3237 vectors[i] = tmp;
3238 }
3239 }
3240 gcc_assert (vectors[0] == dest);
3241 }
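
/* Worked example (illustrative): for a constant with four patterns
   { a, b, c, d, a, b, c, d, ... } the code above first builds the four
   duplicates { a, a, ... }, { b, b, ... }, { c, c, ... } and { d, d, ... },
   then zips them pairwise:
     zip1  { a, a, ... }, { c, c, ... }  ->  { a, c, a, c, ... }
     zip1  { b, b, ... }, { d, d, ... }  ->  { b, d, b, d, ... }
   and finally zips those two results into DEST:
     zip1  { a, c, ... }, { b, d, ... }  ->  { a, b, c, d, a, b, c, d, ... }  */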
3242
3243 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3244 is a pattern that can be used to set DEST to a replicated scalar
3245 element. */
3246
3247 void
3248 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3249 rtx (*gen_vec_duplicate) (rtx, rtx))
3250 {
3251 machine_mode mode = GET_MODE (dest);
3252
3253 /* Check on what type of symbol it is. */
3254 scalar_int_mode int_mode;
3255 if ((GET_CODE (imm) == SYMBOL_REF
3256 || GET_CODE (imm) == LABEL_REF
3257 || GET_CODE (imm) == CONST
3258 || GET_CODE (imm) == CONST_POLY_INT)
3259 && is_a <scalar_int_mode> (mode, &int_mode))
3260 {
3261 rtx mem;
3262 poly_int64 offset;
3263 HOST_WIDE_INT const_offset;
3264 enum aarch64_symbol_type sty;
3265
3266 /* If we have (const (plus symbol offset)), separate out the offset
3267 before we start classifying the symbol. */
3268 rtx base = strip_offset (imm, &offset);
3269
3270 /* We must always add an offset involving VL separately, rather than
3271 folding it into the relocation. */
3272 if (!offset.is_constant (&const_offset))
3273 {
3274 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3275 emit_insn (gen_rtx_SET (dest, imm));
3276 else
3277 {
3278 /* Do arithmetic on 32-bit values if the result is smaller
3279 than that. */
3280 if (partial_subreg_p (int_mode, SImode))
3281 {
3282 /* It is invalid to do symbol calculations in modes
3283 narrower than SImode. */
3284 gcc_assert (base == const0_rtx);
3285 dest = gen_lowpart (SImode, dest);
3286 int_mode = SImode;
3287 }
3288 if (base != const0_rtx)
3289 {
3290 base = aarch64_force_temporary (int_mode, dest, base);
3291 aarch64_add_offset (int_mode, dest, base, offset,
3292 NULL_RTX, NULL_RTX, false);
3293 }
3294 else
3295 aarch64_add_offset (int_mode, dest, base, offset,
3296 dest, NULL_RTX, false);
3297 }
3298 return;
3299 }
3300
3301 sty = aarch64_classify_symbol (base, const_offset);
3302 switch (sty)
3303 {
3304 case SYMBOL_FORCE_TO_MEM:
3305 if (const_offset != 0
3306 && targetm.cannot_force_const_mem (int_mode, imm))
3307 {
3308 gcc_assert (can_create_pseudo_p ());
3309 base = aarch64_force_temporary (int_mode, dest, base);
3310 aarch64_add_offset (int_mode, dest, base, const_offset,
3311 NULL_RTX, NULL_RTX, false);
3312 return;
3313 }
3314
3315 mem = force_const_mem (ptr_mode, imm);
3316 gcc_assert (mem);
3317
3318 /* If we aren't generating PC relative literals, then
3319 we need to expand the literal pool access carefully.
3320 This is something that needs to be done in a number
3321 of places, so could well live as a separate function. */
3322 if (!aarch64_pcrelative_literal_loads)
3323 {
3324 gcc_assert (can_create_pseudo_p ());
3325 base = gen_reg_rtx (ptr_mode);
3326 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3327 if (ptr_mode != Pmode)
3328 base = convert_memory_address (Pmode, base);
3329 mem = gen_rtx_MEM (ptr_mode, base);
3330 }
3331
3332 if (int_mode != ptr_mode)
3333 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3334
3335 emit_insn (gen_rtx_SET (dest, mem));
3336
3337 return;
3338
3339 case SYMBOL_SMALL_TLSGD:
3340 case SYMBOL_SMALL_TLSDESC:
3341 case SYMBOL_SMALL_TLSIE:
3342 case SYMBOL_SMALL_GOT_28K:
3343 case SYMBOL_SMALL_GOT_4G:
3344 case SYMBOL_TINY_GOT:
3345 case SYMBOL_TINY_TLSIE:
3346 if (const_offset != 0)
3347 {
3348 gcc_assert (can_create_pseudo_p ());
3349 base = aarch64_force_temporary (int_mode, dest, base);
3350 aarch64_add_offset (int_mode, dest, base, const_offset,
3351 NULL_RTX, NULL_RTX, false);
3352 return;
3353 }
3354 /* FALLTHRU */
3355
3356 case SYMBOL_SMALL_ABSOLUTE:
3357 case SYMBOL_TINY_ABSOLUTE:
3358 case SYMBOL_TLSLE12:
3359 case SYMBOL_TLSLE24:
3360 case SYMBOL_TLSLE32:
3361 case SYMBOL_TLSLE48:
3362 aarch64_load_symref_appropriately (dest, imm, sty);
3363 return;
3364
3365 default:
3366 gcc_unreachable ();
3367 }
3368 }
3369
3370 if (!CONST_INT_P (imm))
3371 {
3372 rtx base, step, value;
3373 if (GET_CODE (imm) == HIGH
3374 || aarch64_simd_valid_immediate (imm, NULL))
3375 emit_insn (gen_rtx_SET (dest, imm));
3376 else if (const_vec_series_p (imm, &base, &step))
3377 aarch64_expand_vec_series (dest, base, step);
3378 else if (const_vec_duplicate_p (imm, &value))
3379 {
3380 /* If the constant is out of range of an SVE vector move,
3381 load it from memory if we can, otherwise move it into
3382 a register and use a DUP. */
3383 scalar_mode inner_mode = GET_MODE_INNER (mode);
3384 rtx op = force_const_mem (inner_mode, value);
3385 if (!op)
3386 op = force_reg (inner_mode, value);
3387 else if (!aarch64_sve_ld1r_operand_p (op))
3388 {
3389 rtx addr = force_reg (Pmode, XEXP (op, 0));
3390 op = replace_equiv_address (op, addr);
3391 }
3392 emit_insn (gen_vec_duplicate (dest, op));
3393 }
3394 else if (GET_CODE (imm) == CONST_VECTOR
3395 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3396 aarch64_expand_sve_const_vector (dest, imm);
3397 else
3398 {
3399 rtx mem = force_const_mem (mode, imm);
3400 gcc_assert (mem);
3401 emit_move_insn (dest, mem);
3402 }
3403
3404 return;
3405 }
3406
3407 aarch64_internal_mov_immediate (dest, imm, true,
3408 as_a <scalar_int_mode> (mode));
3409 }
3410
3411 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3412 that is known to contain PTRUE. */
3413
3414 void
3415 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3416 {
3417 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3418 gen_rtvec (2, pred, src),
3419 UNSPEC_MERGE_PTRUE)));
3420 }
3421
3422 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3423 operand is in memory. In this case we need to use the predicated LD1
3424 and ST1 instead of LDR and STR, both for correctness on big-endian
3425 targets and because LD1 and ST1 support a wider range of addressing modes.
3426 PRED_MODE is the mode of the predicate.
3427
3428 See the comment at the head of aarch64-sve.md for details about the
3429 big-endian handling. */
3430
3431 void
3432 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3433 {
3434 machine_mode mode = GET_MODE (dest);
3435 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3436 if (!register_operand (src, mode)
3437 && !register_operand (dest, mode))
3438 {
3439 rtx tmp = gen_reg_rtx (mode);
3440 if (MEM_P (src))
3441 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3442 else
3443 emit_move_insn (tmp, src);
3444 src = tmp;
3445 }
3446 aarch64_emit_sve_pred_move (dest, ptrue, src);
3447 }
3448
3449 /* Called only on big-endian targets. See whether an SVE vector move
3450 from SRC to DEST is effectively a REV[BHW] instruction, because at
3451 least one operand is a subreg of an SVE vector that has wider or
3452 narrower elements. Return true and emit the instruction if so.
3453
3454 For example:
3455
3456 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3457
3458 represents a VIEW_CONVERT between the following vectors, viewed
3459 in memory order:
3460
3461 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3462 R1: { [0], [1], [2], [3], ... }
3463
3464 The high part of lane X in R2 should therefore correspond to lane X*2
3465 of R1, but the register representations are:
3466
3467 msb lsb
3468 R2: ...... [1].high [1].low [0].high [0].low
3469 R1: ...... [3] [2] [1] [0]
3470
3471 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3472 We therefore need a reverse operation to swap the high and low values
3473 around.
3474
3475 This is purely an optimization. Without it we would spill the
3476 subreg operand to the stack in one mode and reload it in the
3477 other mode, which has the same effect as the REV. */
3478
3479 bool
3480 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3481 {
3482 gcc_assert (BYTES_BIG_ENDIAN);
3483 if (GET_CODE (dest) == SUBREG)
3484 dest = SUBREG_REG (dest);
3485 if (GET_CODE (src) == SUBREG)
3486 src = SUBREG_REG (src);
3487
3488 /* The optimization handles two single SVE REGs with different element
3489 sizes. */
3490 if (!REG_P (dest)
3491 || !REG_P (src)
3492 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3493 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3494 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3495 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3496 return false;
3497
3498 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3499 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3500 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3501 UNSPEC_REV_SUBREG);
3502 emit_insn (gen_rtx_SET (dest, unspec));
3503 return true;
3504 }
3505
3506 /* Return a copy of X with mode MODE, without changing its other
3507 attributes. Unlike gen_lowpart, this doesn't care whether the
3508 mode change is valid. */
3509
3510 static rtx
3511 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3512 {
3513 if (GET_MODE (x) == mode)
3514 return x;
3515
3516 x = shallow_copy_rtx (x);
3517 set_mode_and_regno (x, mode, REGNO (x));
3518 return x;
3519 }
3520
3521 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3522 operands. */
3523
3524 void
3525 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3526 {
3527 /* Decide which REV operation we need. The mode with narrower elements
3528 determines the mode of the operands and the mode with the wider
3529 elements determines the reverse width. */
3530 machine_mode mode_with_wider_elts = GET_MODE (dest);
3531 machine_mode mode_with_narrower_elts = GET_MODE (src);
3532 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3533 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3534 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3535
3536 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3537 unsigned int unspec;
3538 if (wider_bytes == 8)
3539 unspec = UNSPEC_REV64;
3540 else if (wider_bytes == 4)
3541 unspec = UNSPEC_REV32;
3542 else if (wider_bytes == 2)
3543 unspec = UNSPEC_REV16;
3544 else
3545 gcc_unreachable ();
3546 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3547
3548 /* Emit:
3549
3550 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3551 UNSPEC_MERGE_PTRUE))
3552
3553 with the appropriate modes. */
3554 ptrue = gen_lowpart (pred_mode, ptrue);
3555 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3556 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3557 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3558 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3559 UNSPEC_MERGE_PTRUE);
3560 emit_insn (gen_rtx_SET (dest, src));
3561 }
3562
3563 static bool
3564 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3565 tree exp ATTRIBUTE_UNUSED)
3566 {
3567 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3568 return false;
3569
3570 return true;
3571 }
3572
3573 /* Implement TARGET_PASS_BY_REFERENCE. */
3574
3575 static bool
3576 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3577 machine_mode mode,
3578 const_tree type,
3579 bool named ATTRIBUTE_UNUSED)
3580 {
3581 HOST_WIDE_INT size;
3582 machine_mode dummymode;
3583 int nregs;
3584
3585 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3586 if (mode == BLKmode && type)
3587 size = int_size_in_bytes (type);
3588 else
3589 /* No frontends can create types with variable-sized modes, so we
3590 shouldn't be asked to pass or return them. */
3591 size = GET_MODE_SIZE (mode).to_constant ();
3592
3593 /* Aggregates are passed by reference based on their size. */
3594 if (type && AGGREGATE_TYPE_P (type))
3595 {
3596 size = int_size_in_bytes (type);
3597 }
3598
3599 /* Variable-sized arguments are always passed by reference. */
3600 if (size < 0)
3601 return true;
3602
3603 /* Can this be a candidate to be passed in fp/simd register(s)? */
3604 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3605 &dummymode, &nregs,
3606 NULL))
3607 return false;
3608
3609 /* Arguments which are variable sized or larger than 2 registers are
3610 passed by reference unless they are a homogeneous floating-point
3611 aggregate. */
3612 return size > 2 * UNITS_PER_WORD;
3613 }
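
/* Worked example (illustrative, not from the original source): with
   UNITS_PER_WORD == 8, a plain struct { long a, b; } (16 bytes) is passed
   by value (in two general registers when available), while
   struct { long a, b, c; } (24 bytes) exceeds 2 * UNITS_PER_WORD and is
   passed by reference.  A struct { double a, b, c, d; } is a homogeneous
   floating-point aggregate, so it is passed by value in SIMD/FP registers
   despite being 32 bytes.  */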
3614
3615 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3616 static bool
3617 aarch64_return_in_msb (const_tree valtype)
3618 {
3619 machine_mode dummy_mode;
3620 int dummy_int;
3621
3622 /* Never happens in little-endian mode. */
3623 if (!BYTES_BIG_ENDIAN)
3624 return false;
3625
3626 /* Only composite types of 16 bytes or fewer can potentially be
3627 returned in registers. */
3628 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3629 || int_size_in_bytes (valtype) <= 0
3630 || int_size_in_bytes (valtype) > 16)
3631 return false;
3632
3633 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3634 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3635 is always passed/returned in the least significant bits of fp/simd
3636 register(s). */
3637 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3638 &dummy_mode, &dummy_int, NULL))
3639 return false;
3640
3641 return true;
3642 }
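
/* Illustrative example (not part of the original source): on a big-endian
   target a struct { char c; short s; } (4 bytes after padding) is a
   composite of at most 16 bytes and not an HFA/HVA, so it is returned in
   the most significant bits of the return register; on little-endian
   targets this hook always returns false and the value sits in the least
   significant bits.  */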
3643
3644 /* Implement TARGET_FUNCTION_VALUE.
3645 Define how to find the value returned by a function. */
3646
3647 static rtx
3648 aarch64_function_value (const_tree type, const_tree func,
3649 bool outgoing ATTRIBUTE_UNUSED)
3650 {
3651 machine_mode mode;
3652 int unsignedp;
3653 int count;
3654 machine_mode ag_mode;
3655
3656 mode = TYPE_MODE (type);
3657 if (INTEGRAL_TYPE_P (type))
3658 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3659
3660 if (aarch64_return_in_msb (type))
3661 {
3662 HOST_WIDE_INT size = int_size_in_bytes (type);
3663
3664 if (size % UNITS_PER_WORD != 0)
3665 {
3666 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3667 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3668 }
3669 }
3670
3671 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3672 &ag_mode, &count, NULL))
3673 {
3674 if (!aarch64_composite_type_p (type, mode))
3675 {
3676 gcc_assert (count == 1 && mode == ag_mode);
3677 return gen_rtx_REG (mode, V0_REGNUM);
3678 }
3679 else
3680 {
3681 int i;
3682 rtx par;
3683
3684 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3685 for (i = 0; i < count; i++)
3686 {
3687 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3688 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3689 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3690 XVECEXP (par, 0, i) = tmp;
3691 }
3692 return par;
3693 }
3694 }
3695 else
3696 return gen_rtx_REG (mode, R0_REGNUM);
3697 }
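
/* Illustrative example (not part of the original source): for a return
   type of struct { float x, y, z; }, which is a homogeneous floating-point
   aggregate, aarch64_vfp_is_call_or_return_candidate reports AG_MODE ==
   SFmode and COUNT == 3, so the code above builds a PARALLEL describing
   V0..V2 at byte offsets 0, 4 and 8.  A scalar integer return value simply
   comes back in R0.  */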
3698
3699 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
3700 Return true if REGNO is the number of a hard register in which the value
3701 of a called function may come back. */
3702
3703 static bool
3704 aarch64_function_value_regno_p (const unsigned int regno)
3705 {
3706 /* A maximum of 16 bytes can be returned in the general registers. Examples
3707 of 16-byte return values are 128-bit integers and small 16-byte
3708 structures (excluding homogeneous floating-point aggregates). */
3709 if (regno == R0_REGNUM || regno == R1_REGNUM)
3710 return true;
3711
3712 /* Up to four fp/simd registers can return a function value, e.g. a
3713 homogeneous floating-point aggregate having four members. */
3714 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3715 return TARGET_FLOAT;
3716
3717 return false;
3718 }
3719
3720 /* Implement TARGET_RETURN_IN_MEMORY.
3721
3722 If the type T of the result of a function is such that
3723 void func (T arg)
3724 would require that arg be passed as a value in a register (or set of
3725 registers) according to the parameter passing rules, then the result
3726 is returned in the same registers as would be used for such an
3727 argument. */
3728
3729 static bool
3730 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3731 {
3732 HOST_WIDE_INT size;
3733 machine_mode ag_mode;
3734 int count;
3735
3736 if (!AGGREGATE_TYPE_P (type)
3737 && TREE_CODE (type) != COMPLEX_TYPE
3738 && TREE_CODE (type) != VECTOR_TYPE)
3739 /* Simple scalar types are always returned in registers. */
3740 return false;
3741
3742 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3743 type,
3744 &ag_mode,
3745 &count,
3746 NULL))
3747 return false;
3748
3749 /* Types larger than 2 registers are returned in memory. */
3750 size = int_size_in_bytes (type);
3751 return (size < 0 || size > 2 * UNITS_PER_WORD);
3752 }
3753
3754 static bool
3755 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3756 const_tree type, int *nregs)
3757 {
3758 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3759 return aarch64_vfp_is_call_or_return_candidate (mode,
3760 type,
3761 &pcum->aapcs_vfp_rmode,
3762 nregs,
3763 NULL);
3764 }
3765
3766 /* Given MODE and TYPE of a function argument, return the alignment in
3767 bits. The idea is to suppress any stronger alignment requested by
3768 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3769 This is a helper function for local use only. */
3770
3771 static unsigned int
3772 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3773 {
3774 if (!type)
3775 return GET_MODE_ALIGNMENT (mode);
3776
3777 if (integer_zerop (TYPE_SIZE (type)))
3778 return 0;
3779
3780 gcc_assert (TYPE_MODE (type) == mode);
3781
3782 if (!AGGREGATE_TYPE_P (type))
3783 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3784
3785 if (TREE_CODE (type) == ARRAY_TYPE)
3786 return TYPE_ALIGN (TREE_TYPE (type));
3787
3788 unsigned int alignment = 0;
3789 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3790 if (TREE_CODE (field) == FIELD_DECL)
3791 alignment = std::max (alignment, DECL_ALIGN (field));
3792
3793 return alignment;
3794 }
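
/* Illustrative example (not part of the original source): given

     struct s { int x; } __attribute__ ((aligned (16)));

   the walk over FIELD_DECLs above sees only the int member, whose own
   alignment is 32 bits, so an argument of type struct s should get 32-bit
   alignment here; the 16-byte over-alignment requested on the aggregate
   itself is not propagated.  A member that is itself declared with
   aligned (16) would instead raise the result to 128 bits.  */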
3795
3796 /* Layout a function argument according to the AAPCS64 rules. The rule
3797 numbers refer to the rule numbers in the AAPCS64. */
3798
3799 static void
3800 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3801 const_tree type,
3802 bool named ATTRIBUTE_UNUSED)
3803 {
3804 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3805 int ncrn, nvrn, nregs;
3806 bool allocate_ncrn, allocate_nvrn;
3807 HOST_WIDE_INT size;
3808
3809 /* We need to do this once per argument. */
3810 if (pcum->aapcs_arg_processed)
3811 return;
3812
3813 pcum->aapcs_arg_processed = true;
3814
3815 /* Size in bytes, rounded up to the next multiple of 8 bytes. */
3816 if (type)
3817 size = int_size_in_bytes (type);
3818 else
3819 /* No frontends can create types with variable-sized modes, so we
3820 shouldn't be asked to pass or return them. */
3821 size = GET_MODE_SIZE (mode).to_constant ();
3822 size = ROUND_UP (size, UNITS_PER_WORD);
3823
3824 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3825 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3826 mode,
3827 type,
3828 &nregs);
3829
3830 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3831 The following code thus handles passing by SIMD/FP registers first. */
3832
3833 nvrn = pcum->aapcs_nvrn;
3834
3835 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3836 and homogeneous short-vector aggregates (HVA). */
3837 if (allocate_nvrn)
3838 {
3839 if (!TARGET_FLOAT)
3840 aarch64_err_no_fpadvsimd (mode);
3841
3842 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3843 {
3844 pcum->aapcs_nextnvrn = nvrn + nregs;
3845 if (!aarch64_composite_type_p (type, mode))
3846 {
3847 gcc_assert (nregs == 1);
3848 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3849 }
3850 else
3851 {
3852 rtx par;
3853 int i;
3854 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3855 for (i = 0; i < nregs; i++)
3856 {
3857 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3858 V0_REGNUM + nvrn + i);
3859 rtx offset = gen_int_mode
3860 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3861 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3862 XVECEXP (par, 0, i) = tmp;
3863 }
3864 pcum->aapcs_reg = par;
3865 }
3866 return;
3867 }
3868 else
3869 {
3870 /* C.3 NSRN is set to 8. */
3871 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3872 goto on_stack;
3873 }
3874 }
3875
3876 ncrn = pcum->aapcs_ncrn;
3877 nregs = size / UNITS_PER_WORD;
3878
3879 /* C6 - C9, though the sign and zero extension semantics are
3880 handled elsewhere. This is the case where the argument fits
3881 entirely in general registers. */
3882 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3883 {
3884
3885 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3886
3887 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
3888 rounded up to the next even number. */
3889 if (nregs == 2
3890 && ncrn % 2
3891 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3892 comparison is there because for > 16 * BITS_PER_UNIT
3893 alignment nregs should be > 2 and therefore it should be
3894 passed by reference rather than value. */
3895 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3896 {
3897 ++ncrn;
3898 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3899 }
3900
3901 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3902 A reg is still generated for it, but the caller should be smart
3903 enough not to use it. */
3904 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3905 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3906 else
3907 {
3908 rtx par;
3909 int i;
3910
3911 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3912 for (i = 0; i < nregs; i++)
3913 {
3914 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3915 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3916 GEN_INT (i * UNITS_PER_WORD));
3917 XVECEXP (par, 0, i) = tmp;
3918 }
3919 pcum->aapcs_reg = par;
3920 }
3921
3922 pcum->aapcs_nextncrn = ncrn + nregs;
3923 return;
3924 }
3925
3926 /* C.11 */
3927 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3928
3929 /* The argument is passed on stack; record the needed number of words for
3930 this argument and align the total size if necessary. */
3931 on_stack:
3932 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3933
3934 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3935 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3936 16 / UNITS_PER_WORD);
3937 return;
3938 }
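
/* Worked example of rule C.8 (illustrative, not from the original source):
   for a call such as f (int a, __int128 b), a takes W0, leaving NCRN == 1.
   b needs two registers and has 16-byte alignment, so NCRN is rounded up
   to 2 and b is passed in X2:X3, with X1 left unused.  Without the C.8
   adjustment b would straddle an odd/even register pair.  */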
3939
3940 /* Implement TARGET_FUNCTION_ARG. */
3941
3942 static rtx
3943 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3944 const_tree type, bool named)
3945 {
3946 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3947 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3948
3949 if (mode == VOIDmode)
3950 return NULL_RTX;
3951
3952 aarch64_layout_arg (pcum_v, mode, type, named);
3953 return pcum->aapcs_reg;
3954 }
3955
3956 void
3957 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3958 const_tree fntype ATTRIBUTE_UNUSED,
3959 rtx libname ATTRIBUTE_UNUSED,
3960 const_tree fndecl ATTRIBUTE_UNUSED,
3961 unsigned n_named ATTRIBUTE_UNUSED)
3962 {
3963 pcum->aapcs_ncrn = 0;
3964 pcum->aapcs_nvrn = 0;
3965 pcum->aapcs_nextncrn = 0;
3966 pcum->aapcs_nextnvrn = 0;
3967 pcum->pcs_variant = ARM_PCS_AAPCS64;
3968 pcum->aapcs_reg = NULL_RTX;
3969 pcum->aapcs_arg_processed = false;
3970 pcum->aapcs_stack_words = 0;
3971 pcum->aapcs_stack_size = 0;
3972
3973 if (!TARGET_FLOAT
3974 && fndecl && TREE_PUBLIC (fndecl)
3975 && fntype && fntype != error_mark_node)
3976 {
3977 const_tree type = TREE_TYPE (fntype);
3978 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3979 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3980 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3981 &mode, &nregs, NULL))
3982 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3983 }
3984 return;
3985 }
3986
3987 static void
3988 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3989 machine_mode mode,
3990 const_tree type,
3991 bool named)
3992 {
3993 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3994 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3995 {
3996 aarch64_layout_arg (pcum_v, mode, type, named);
3997 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3998 != (pcum->aapcs_stack_words != 0));
3999 pcum->aapcs_arg_processed = false;
4000 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4001 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4002 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4003 pcum->aapcs_stack_words = 0;
4004 pcum->aapcs_reg = NULL_RTX;
4005 }
4006 }
4007
4008 bool
4009 aarch64_function_arg_regno_p (unsigned regno)
4010 {
4011 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4012 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4013 }
4014
4015 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4016 PARM_BOUNDARY bits of alignment, but will be given anything up
4017 to STACK_BOUNDARY bits if the type requires it. This makes sure
4018 that both before and after the layout of each argument, the Next
4019 Stacked Argument Address (NSAA) will have a minimum alignment of
4020 8 bytes. */
4021
4022 static unsigned int
4023 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4024 {
4025 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
4026 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4027 }
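
/* Illustrative arithmetic (not part of the original source): with the
   AArch64 values PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128, a char
   argument (8-bit alignment) gets MIN (MAX (8, 64), 128) == 64 bits,
   while an over-aligned type reporting 256-bit alignment is clamped to
   MIN (MAX (256, 64), 128) == 128 bits.  */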
4028
4029 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4030
4031 static fixed_size_mode
4032 aarch64_get_reg_raw_mode (int regno)
4033 {
4034 if (TARGET_SVE && FP_REGNUM_P (regno))
4035 /* Don't use the SVE part of the register for __builtin_apply and
4036 __builtin_return. The SVE registers aren't used by the normal PCS,
4037 so using them there would be a waste of time. The PCS extensions
4038 for SVE types are fundamentally incompatible with the
4039 __builtin_return/__builtin_apply interface. */
4040 return as_a <fixed_size_mode> (V16QImode);
4041 return default_get_reg_raw_mode (regno);
4042 }
4043
4044 /* Implement TARGET_FUNCTION_ARG_PADDING.
4045
4046 Small aggregate types are placed in the lowest memory address.
4047
4048 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4049
4050 static pad_direction
4051 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4052 {
4053 /* On little-endian targets, the least significant byte of every stack
4054 argument is passed at the lowest byte address of the stack slot. */
4055 if (!BYTES_BIG_ENDIAN)
4056 return PAD_UPWARD;
4057
4058 /* Otherwise, integral, floating-point and pointer types are padded downward:
4059 the least significant byte of a stack argument is passed at the highest
4060 byte address of the stack slot. */
4061 if (type
4062 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4063 || POINTER_TYPE_P (type))
4064 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4065 return PAD_DOWNWARD;
4066
4067 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4068 return PAD_UPWARD;
4069 }
4070
4071 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4072
4073 It specifies padding for the last (possibly the only)
4074 element of a block move between registers and memory. Assuming
4075 the block is in memory, padding upward means that the last
4076 element is padded after its most significant byte, while with
4077 downward padding the last element is padded on its least
4078 significant byte side.
4079
4080 Small aggregates and small complex types are always padded
4081 upwards.
4082
4083 We don't need to worry about homogeneous floating-point or
4084 short-vector aggregates; their move is not affected by the
4085 padding direction determined here. Regardless of endianness,
4086 each element of such an aggregate is put in the least
4087 significant bits of a fp/simd register.
4088
4089 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4090 register has useful data, and return the opposite if the most
4091 significant byte does. */
4092
4093 bool
4094 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4095 bool first ATTRIBUTE_UNUSED)
4096 {
4097
4098 /* Small composite types are always padded upward. */
4099 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4100 {
4101 HOST_WIDE_INT size;
4102 if (type)
4103 size = int_size_in_bytes (type);
4104 else
4105 /* No frontends can create types with variable-sized modes, so we
4106 shouldn't be asked to pass or return them. */
4107 size = GET_MODE_SIZE (mode).to_constant ();
4108 if (size < 2 * UNITS_PER_WORD)
4109 return true;
4110 }
4111
4112 /* Otherwise, use the default padding. */
4113 return !BYTES_BIG_ENDIAN;
4114 }
4115
4116 static scalar_int_mode
4117 aarch64_libgcc_cmp_return_mode (void)
4118 {
4119 return SImode;
4120 }
4121
4122 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4123
4124 /* We use the 12-bit shifted immediate arithmetic instructions, so values
4125 must be a multiple of (1 << 12), i.e. 4096. */
4126 #define ARITH_FACTOR 4096
4127
4128 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4129 #error Cannot use simple address calculation for stack probing
4130 #endif
4131
4132 /* The pair of scratch registers used for stack probing. */
4133 #define PROBE_STACK_FIRST_REG R9_REGNUM
4134 #define PROBE_STACK_SECOND_REG R10_REGNUM
4135
4136 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4137 inclusive. These are offsets from the current stack pointer. */
4138
4139 static void
4140 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4141 {
4142 HOST_WIDE_INT size;
4143 if (!poly_size.is_constant (&size))
4144 {
4145 sorry ("stack probes for SVE frames");
4146 return;
4147 }
4148
4149 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4150
4151 /* See the same assertion on PROBE_INTERVAL above. */
4152 gcc_assert ((first % ARITH_FACTOR) == 0);
4153
4154 /* See if we have a constant small number of probes to generate. If so,
4155 that's the easy case. */
4156 if (size <= PROBE_INTERVAL)
4157 {
4158 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4159
4160 emit_set_insn (reg1,
4161 plus_constant (Pmode,
4162 stack_pointer_rtx, -(first + base)));
4163 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4164 }
4165
4166 /* The run-time loop is made up of 8 insns in the generic case while the
4167 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
4168 else if (size <= 4 * PROBE_INTERVAL)
4169 {
4170 HOST_WIDE_INT i, rem;
4171
4172 emit_set_insn (reg1,
4173 plus_constant (Pmode,
4174 stack_pointer_rtx,
4175 -(first + PROBE_INTERVAL)));
4176 emit_stack_probe (reg1);
4177
4178 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4179 it exceeds SIZE. If only two probes are needed, this will not
4180 generate any code. Then probe at FIRST + SIZE. */
4181 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4182 {
4183 emit_set_insn (reg1,
4184 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4185 emit_stack_probe (reg1);
4186 }
4187
4188 rem = size - (i - PROBE_INTERVAL);
4189 if (rem > 256)
4190 {
4191 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4192
4193 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4194 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4195 }
4196 else
4197 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4198 }
4199
4200 /* Otherwise, do the same as above, but in a loop. Note that we must be
4201 extra careful with variables wrapping around because we might be at
4202 the very top (or the very bottom) of the address space and we have
4203 to be able to handle this case properly; in particular, we use an
4204 equality test for the loop condition. */
4205 else
4206 {
4207 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4208
4209 /* Step 1: round SIZE to the previous multiple of the interval. */
4210
4211 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4212
4213
4214 /* Step 2: compute initial and final value of the loop counter. */
4215
4216 /* TEST_ADDR = SP + FIRST. */
4217 emit_set_insn (reg1,
4218 plus_constant (Pmode, stack_pointer_rtx, -first));
4219
4220 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4221 HOST_WIDE_INT adjustment = - (first + rounded_size);
4222 if (! aarch64_uimm12_shift (adjustment))
4223 {
4224 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4225 true, Pmode);
4226 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4227 }
4228 else
4229 emit_set_insn (reg2,
4230 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4231
4232 /* Step 3: the loop
4233
4234 do
4235 {
4236 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4237 probe at TEST_ADDR
4238 }
4239 while (TEST_ADDR != LAST_ADDR)
4240
4241 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4242 until it is equal to ROUNDED_SIZE. */
4243
4244 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4245
4246
4247 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4248 that SIZE is equal to ROUNDED_SIZE. */
4249
4250 if (size != rounded_size)
4251 {
4252 HOST_WIDE_INT rem = size - rounded_size;
4253
4254 if (rem > 256)
4255 {
4256 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4257
4258 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4259 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4260 }
4261 else
4262 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4263 }
4264 }
4265
4266 /* Make sure nothing is scheduled before we are done. */
4267 emit_insn (gen_blockage ());
4268 }
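
/* Worked example (illustrative only): assuming the default 4096-byte
   PROBE_INTERVAL, a request to probe FIRST..FIRST+10000 takes the
   "size <= 4 * PROBE_INTERVAL" path above: probes are emitted at
   FIRST + 4096 and FIRST + 8192, the 1808-byte residual is larger than
   256, so the scratch register (x9 here) is dropped by a further 4096
   and the final probe lands exactly at FIRST + 10000.  */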
4269
4270 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4271 absolute addresses. */
4272
4273 const char *
4274 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4275 {
4276 static int labelno = 0;
4277 char loop_lab[32];
4278 rtx xops[2];
4279
4280 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4281
4282 /* Loop. */
4283 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4284
4285 HOST_WIDE_INT stack_clash_probe_interval
4286 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4287
4288 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4289 xops[0] = reg1;
4290 HOST_WIDE_INT interval;
4291 if (flag_stack_clash_protection)
4292 interval = stack_clash_probe_interval;
4293 else
4294 interval = PROBE_INTERVAL;
4295
4296 gcc_assert (aarch64_uimm12_shift (interval));
4297 xops[1] = GEN_INT (interval);
4298
4299 output_asm_insn ("sub\t%0, %0, %1", xops);
4300
4301 /* If doing stack clash protection then we probe up by the ABI specified
4302 amount. We do this because we're dropping full pages at a time in the
4303 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4304 if (flag_stack_clash_protection)
4305 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4306 else
4307 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4308
4309 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4310 by this amount for each iteration. */
4311 output_asm_insn ("str\txzr, [%0, %1]", xops);
4312
4313 /* Test if TEST_ADDR == LAST_ADDR. */
4314 xops[1] = reg2;
4315 output_asm_insn ("cmp\t%0, %1", xops);
4316
4317 /* Branch. */
4318 fputs ("\tb.ne\t", asm_out_file);
4319 assemble_name_raw (asm_out_file, loop_lab);
4320 fputc ('\n', asm_out_file);
4321
4322 return "";
4323 }
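
/* For reference (illustrative, not a verbatim quote of compiler output):
   without stack-clash protection and with the default 4096-byte interval,
   the loop printed above comes out roughly as

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0

   where x9 and x10 are the probe scratch registers defined earlier.  */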
4324
4325 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4326 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4327 of GUARD_SIZE. When a probe is emitted it is done at most
4328 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4329 at most MIN_PROBE_THRESHOLD. By the end of this function
4330 BASE = BASE - ADJUSTMENT. */
4331
4332 const char *
4333 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4334 rtx min_probe_threshold, rtx guard_size)
4335 {
4336 /* This function is not allowed to use any instruction generation function
4337 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4338 so instead emit the code you want using output_asm_insn. */
4339 gcc_assert (flag_stack_clash_protection);
4340 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4341 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4342
4343 /* The minimum required allocation before the residual requires probing. */
4344 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4345
4346 /* Clamp the value down to the nearest value that can be used with a cmp. */
4347 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4348 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4349
4350 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4351 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4352
4353 static int labelno = 0;
4354 char loop_start_lab[32];
4355 char loop_end_lab[32];
4356 rtx xops[2];
4357
4358 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4359 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4360
4361 /* Emit loop start label. */
4362 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4363
4364 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4365 xops[0] = adjustment;
4366 xops[1] = probe_offset_value_rtx;
4367 output_asm_insn ("cmp\t%0, %1", xops);
4368
4369 /* Branch to end if not enough adjustment to probe. */
4370 fputs ("\tb.lt\t", asm_out_file);
4371 assemble_name_raw (asm_out_file, loop_end_lab);
4372 fputc ('\n', asm_out_file);
4373
4374 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4375 xops[0] = base;
4376 xops[1] = probe_offset_value_rtx;
4377 output_asm_insn ("sub\t%0, %0, %1", xops);
4378
4379 /* Probe at BASE. */
4380 xops[1] = const0_rtx;
4381 output_asm_insn ("str\txzr, [%0, %1]", xops);
4382
4383 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4384 xops[0] = adjustment;
4385 xops[1] = probe_offset_value_rtx;
4386 output_asm_insn ("sub\t%0, %0, %1", xops);
4387
4388 /* Branch to start if still more bytes to allocate. */
4389 fputs ("\tb\t", asm_out_file);
4390 assemble_name_raw (asm_out_file, loop_start_lab);
4391 fputc ('\n', asm_out_file);
4392
4393 /* No more probes needed; this is where we leave the loop. */
4394 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4395
4396 /* BASE = BASE - ADJUSTMENT. */
4397 xops[0] = base;
4398 xops[1] = adjustment;
4399 output_asm_insn ("sub\t%0, %0, %1", xops);
4400 return "";
4401 }
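
/* Schematic of the sequence printed above (illustrative only, with xN and
   xM standing for whatever registers hold BASE and ADJUSTMENT, and IMM for
   the clamped residual probe guard):

	.SVLPSPL0:
	cmp	xM, IMM
	b.lt	.SVLPEND0
	sub	xN, xN, IMM
	str	xzr, [xN, 0]
	sub	xM, xM, IMM
	b	.SVLPSPL0
	.SVLPEND0:
	sub	xN, xN, xM
   */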
4402
4403 /* Determine whether a frame chain needs to be generated. */
4404 static bool
4405 aarch64_needs_frame_chain (void)
4406 {
4407 /* Force a frame chain for EH returns so the return address is at FP+8. */
4408 if (frame_pointer_needed || crtl->calls_eh_return)
4409 return true;
4410
4411 /* A leaf function cannot have calls or write LR. */
4412 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4413
4414 /* Don't use a frame chain in leaf functions if leaf frame pointers
4415 are disabled. */
4416 if (flag_omit_leaf_frame_pointer && is_leaf)
4417 return false;
4418
4419 return aarch64_use_frame_pointer;
4420 }
4421
4422 /* Mark the registers that need to be saved by the callee and calculate
4423 the size of the callee-saved registers area and frame record (both FP
4424 and LR may be omitted). */
4425 static void
4426 aarch64_layout_frame (void)
4427 {
4428 HOST_WIDE_INT offset = 0;
4429 int regno, last_fp_reg = INVALID_REGNUM;
4430 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4431
4432 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4433
4434 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4435 the mid-end is doing. */
4436 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4437
4438 #define SLOT_NOT_REQUIRED (-2)
4439 #define SLOT_REQUIRED (-1)
4440
4441 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4442 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4443
4444 /* If this is a non-leaf simd function with calls we assume that
4445 at least one of those calls is to a non-simd function and thus
4446 we must save V8 to V23 in the prologue. */
4447
4448 if (simd_function && !crtl->is_leaf)
4449 {
4450 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4451 if (FP_SIMD_SAVED_REGNUM_P (regno))
4452 df_set_regs_ever_live (regno, true);
4453 }
4454
4455 /* First mark all the registers that really need to be saved... */
4456 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4457 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4458
4459 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4460 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4461
4462 /* ... that includes the eh data registers (if needed)... */
4463 if (crtl->calls_eh_return)
4464 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4465 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4466 = SLOT_REQUIRED;
4467
4468 /* ... and any callee saved register that dataflow says is live. */
4469 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4470 if (df_regs_ever_live_p (regno)
4471 && (regno == R30_REGNUM
4472 || !call_used_regs[regno]))
4473 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4474
4475 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4476 if (df_regs_ever_live_p (regno)
4477 && (!call_used_regs[regno]
4478 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4479 {
4480 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4481 last_fp_reg = regno;
4482 }
4483
4484 if (cfun->machine->frame.emit_frame_chain)
4485 {
4486 /* FP and LR are placed in the linkage record. */
4487 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4488 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4489 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4490 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4491 offset = 2 * UNITS_PER_WORD;
4492 }
4493
4494 /* With stack-clash, LR must be saved in non-leaf functions. */
4495 gcc_assert (crtl->is_leaf
4496 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4497 != SLOT_NOT_REQUIRED));
4498
4499 /* Now assign stack slots for them. */
4500 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4501 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4502 {
4503 cfun->machine->frame.reg_offset[regno] = offset;
4504 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4505 cfun->machine->frame.wb_candidate1 = regno;
4506 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4507 cfun->machine->frame.wb_candidate2 = regno;
4508 offset += UNITS_PER_WORD;
4509 }
4510
4511 HOST_WIDE_INT max_int_offset = offset;
4512 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4513 bool has_align_gap = offset != max_int_offset;
4514
4515 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4516 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4517 {
4518 /* If there is an alignment gap between integer and fp callee-saves,
4519 allocate the last fp register to it if possible. */
4520 if (regno == last_fp_reg
4521 && has_align_gap
4522 && !simd_function
4523 && (offset & 8) == 0)
4524 {
4525 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4526 break;
4527 }
4528
4529 cfun->machine->frame.reg_offset[regno] = offset;
4530 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4531 cfun->machine->frame.wb_candidate1 = regno;
4532 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4533 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4534 cfun->machine->frame.wb_candidate2 = regno;
4535 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4536 }
4537
4538 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4539
4540 cfun->machine->frame.saved_regs_size = offset;
4541
4542 HOST_WIDE_INT varargs_and_saved_regs_size
4543 = offset + cfun->machine->frame.saved_varargs_size;
4544
4545 cfun->machine->frame.hard_fp_offset
4546 = aligned_upper_bound (varargs_and_saved_regs_size
4547 + get_frame_size (),
4548 STACK_BOUNDARY / BITS_PER_UNIT);
4549
4550 /* Both these values are already aligned. */
4551 gcc_assert (multiple_p (crtl->outgoing_args_size,
4552 STACK_BOUNDARY / BITS_PER_UNIT));
4553 cfun->machine->frame.frame_size
4554 = (cfun->machine->frame.hard_fp_offset
4555 + crtl->outgoing_args_size);
4556
4557 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4558
4559 cfun->machine->frame.initial_adjust = 0;
4560 cfun->machine->frame.final_adjust = 0;
4561 cfun->machine->frame.callee_adjust = 0;
4562 cfun->machine->frame.callee_offset = 0;
4563
4564 HOST_WIDE_INT max_push_offset = 0;
4565 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4566 max_push_offset = 512;
4567 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4568 max_push_offset = 256;
4569
4570 HOST_WIDE_INT const_size, const_fp_offset;
4571 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4572 && const_size < max_push_offset
4573 && known_eq (crtl->outgoing_args_size, 0))
4574 {
4575 /* Simple, small frame with no outgoing arguments:
4576 stp reg1, reg2, [sp, -frame_size]!
4577 stp reg3, reg4, [sp, 16] */
4578 cfun->machine->frame.callee_adjust = const_size;
4579 }
4580 else if (known_lt (crtl->outgoing_args_size
4581 + cfun->machine->frame.saved_regs_size, 512)
4582 && !(cfun->calls_alloca
4583 && known_lt (cfun->machine->frame.hard_fp_offset,
4584 max_push_offset)))
4585 {
4586 /* Frame with small outgoing arguments:
4587 sub sp, sp, frame_size
4588 stp reg1, reg2, [sp, outgoing_args_size]
4589 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4590 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4591 cfun->machine->frame.callee_offset
4592 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4593 }
4594 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4595 && const_fp_offset < max_push_offset)
4596 {
4597 /* Frame with large outgoing arguments but a small local area:
4598 stp reg1, reg2, [sp, -hard_fp_offset]!
4599 stp reg3, reg4, [sp, 16]
4600 sub sp, sp, outgoing_args_size */
4601 cfun->machine->frame.callee_adjust = const_fp_offset;
4602 cfun->machine->frame.final_adjust
4603 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4604 }
4605 else
4606 {
4607 /* Frame with large local area and outgoing arguments using frame pointer:
4608 sub sp, sp, hard_fp_offset
4609 stp x29, x30, [sp, 0]
4610 add x29, sp, 0
4611 stp reg3, reg4, [sp, 16]
4612 sub sp, sp, outgoing_args_size */
4613 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4614 cfun->machine->frame.final_adjust
4615 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4616 }
4617
4618 cfun->machine->frame.laid_out = true;
4619 }
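
/* Worked example (illustrative, not from the original source): a function
   that needs a frame chain, saves only X29/X30, has 32 bytes of locals
   and no outgoing arguments ends up with saved_regs_size == 16,
   hard_fp_offset == 48 and frame_size == 48.  Since 48 < 512 and there
   are no outgoing arguments, the first case above applies and the whole
   frame is allocated by a single "stp x29, x30, [sp, -48]!".  */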
4620
4621 /* Return true if the register REGNO is saved on entry to
4622 the current function. */
4623
4624 static bool
4625 aarch64_register_saved_on_entry (int regno)
4626 {
4627 return cfun->machine->frame.reg_offset[regno] >= 0;
4628 }
4629
4630 /* Return the next register, from REGNO up to LIMIT, that the callee
4631 needs to save. */
4632
4633 static unsigned
4634 aarch64_next_callee_save (unsigned regno, unsigned limit)
4635 {
4636 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4637 regno ++;
4638 return regno;
4639 }
4640
4641 /* Push the register number REGNO of mode MODE to the stack with write-back
4642 adjusting the stack by ADJUSTMENT. */
4643
4644 static void
4645 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4646 HOST_WIDE_INT adjustment)
4647 {
4648 rtx base_rtx = stack_pointer_rtx;
4649 rtx insn, reg, mem;
4650
4651 reg = gen_rtx_REG (mode, regno);
4652 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4653 plus_constant (Pmode, base_rtx, -adjustment));
4654 mem = gen_frame_mem (mode, mem);
4655
4656 insn = emit_move_insn (mem, reg);
4657 RTX_FRAME_RELATED_P (insn) = 1;
4658 }
4659
4660 /* Generate and return an instruction to store the pair of registers
4661 REG and REG2 of mode MODE to location BASE with write-back adjusting
4662 the stack location BASE by ADJUSTMENT. */
4663
4664 static rtx
4665 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4666 HOST_WIDE_INT adjustment)
4667 {
4668 switch (mode)
4669 {
4670 case E_DImode:
4671 return gen_storewb_pairdi_di (base, base, reg, reg2,
4672 GEN_INT (-adjustment),
4673 GEN_INT (UNITS_PER_WORD - adjustment));
4674 case E_DFmode:
4675 return gen_storewb_pairdf_di (base, base, reg, reg2,
4676 GEN_INT (-adjustment),
4677 GEN_INT (UNITS_PER_WORD - adjustment));
4678 case E_TFmode:
4679 return gen_storewb_pairtf_di (base, base, reg, reg2,
4680 GEN_INT (-adjustment),
4681 GEN_INT (UNITS_PER_VREG - adjustment));
4682 default:
4683 gcc_unreachable ();
4684 }
4685 }
4686
4687 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4688 stack pointer by ADJUSTMENT. */
4689
4690 static void
4691 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4692 {
4693 rtx_insn *insn;
4694 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4695
4696 if (regno2 == INVALID_REGNUM)
4697 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4698
4699 rtx reg1 = gen_rtx_REG (mode, regno1);
4700 rtx reg2 = gen_rtx_REG (mode, regno2);
4701
4702 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4703 reg2, adjustment));
4704 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4705 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4706 RTX_FRAME_RELATED_P (insn) = 1;
4707 }
4708
4709 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4710 adjusting it by ADJUSTMENT afterwards. */
4711
4712 static rtx
4713 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4714 HOST_WIDE_INT adjustment)
4715 {
4716 switch (mode)
4717 {
4718 case E_DImode:
4719 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4720 GEN_INT (UNITS_PER_WORD));
4721 case E_DFmode:
4722 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4723 GEN_INT (UNITS_PER_WORD));
4724 case E_TFmode:
4725 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4726 GEN_INT (UNITS_PER_VREG));
4727 default:
4728 gcc_unreachable ();
4729 }
4730 }
4731
4732 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4733 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4734 into CFI_OPS. */
4735
4736 static void
4737 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4738 rtx *cfi_ops)
4739 {
4740 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4741 rtx reg1 = gen_rtx_REG (mode, regno1);
4742
4743 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4744
4745 if (regno2 == INVALID_REGNUM)
4746 {
4747 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4748 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4749 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4750 }
4751 else
4752 {
4753 rtx reg2 = gen_rtx_REG (mode, regno2);
4754 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4755 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4756 reg2, adjustment));
4757 }
4758 }
4759
4760 /* Generate and return a store pair instruction of mode MODE to store
4761 register REG1 to MEM1 and register REG2 to MEM2. */
4762
4763 static rtx
4764 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4765 rtx reg2)
4766 {
4767 switch (mode)
4768 {
4769 case E_DImode:
4770 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4771
4772 case E_DFmode:
4773 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4774
4775 case E_TFmode:
4776 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4777
4778 default:
4779 gcc_unreachable ();
4780 }
4781 }
4782
4783 /* Generate and return a load pair instruction of mode MODE to load register
4784 REG1 from MEM1 and register REG2 from MEM2. */
4785
4786 static rtx
4787 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4788 rtx mem2)
4789 {
4790 switch (mode)
4791 {
4792 case E_DImode:
4793 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4794
4795 case E_DFmode:
4796 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4797
4798 case E_TFmode:
4799 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4800
4801 default:
4802 gcc_unreachable ();
4803 }
4804 }
4805
4806 /* Return TRUE if return address signing should be enabled for the current
4807 function, otherwise return FALSE. */
4808
4809 bool
4810 aarch64_return_address_signing_enabled (void)
4811 {
4812 /* This function should only be called after the frame is laid out. */
4813 gcc_assert (cfun->machine->frame.laid_out);
4814
4815 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
4816 function if its LR is pushed onto the stack. */
4817 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4818 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4819 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4820 }
4821
4822 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4823 bool
4824 aarch64_bti_enabled (void)
4825 {
4826 return (aarch64_enable_bti == 1);
4827 }
4828
4829 /* Emit code to save the callee-saved registers from register number START
4830 to LIMIT to the stack at the location starting at offset START_OFFSET,
4831 skipping any write-back candidates if SKIP_WB is true. */
4832
4833 static void
4834 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4835 unsigned start, unsigned limit, bool skip_wb)
4836 {
4837 rtx_insn *insn;
4838 unsigned regno;
4839 unsigned regno2;
4840
4841 for (regno = aarch64_next_callee_save (start, limit);
4842 regno <= limit;
4843 regno = aarch64_next_callee_save (regno + 1, limit))
4844 {
4845 rtx reg, mem;
4846 poly_int64 offset;
4847 int offset_diff;
4848
4849 if (skip_wb
4850 && (regno == cfun->machine->frame.wb_candidate1
4851 || regno == cfun->machine->frame.wb_candidate2))
4852 continue;
4853
4854 if (cfun->machine->reg_is_wrapped_separately[regno])
4855 continue;
4856
4857 reg = gen_rtx_REG (mode, regno);
4858 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4859 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4860 offset));
4861
4862 regno2 = aarch64_next_callee_save (regno + 1, limit);
4863 offset_diff = cfun->machine->frame.reg_offset[regno2]
4864 - cfun->machine->frame.reg_offset[regno];
4865
4866 if (regno2 <= limit
4867 && !cfun->machine->reg_is_wrapped_separately[regno2]
4868 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4869 {
4870 rtx reg2 = gen_rtx_REG (mode, regno2);
4871 rtx mem2;
4872
4873 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4874 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4875 offset));
4876 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4877 reg2));
4878
4879 /* The first part of a frame-related parallel insn is
4880 always assumed to be relevant to the frame
4881 calculations; subsequent parts are only
4882 frame-related if explicitly marked. */
4883 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4884 regno = regno2;
4885 }
4886 else
4887 insn = emit_move_insn (mem, reg);
4888
4889 RTX_FRAME_RELATED_P (insn) = 1;
4890 }
4891 }
4892
4893 /* Emit code to restore the callee registers of mode MODE from register
4894 number START up to and including LIMIT. Restore from the stack offset
4895 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4896 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4897
4898 static void
4899 aarch64_restore_callee_saves (machine_mode mode,
4900 poly_int64 start_offset, unsigned start,
4901 unsigned limit, bool skip_wb, rtx *cfi_ops)
4902 {
4903 rtx base_rtx = stack_pointer_rtx;
4904 unsigned regno;
4905 unsigned regno2;
4906 poly_int64 offset;
4907
4908 for (regno = aarch64_next_callee_save (start, limit);
4909 regno <= limit;
4910 regno = aarch64_next_callee_save (regno + 1, limit))
4911 {
4912 if (cfun->machine->reg_is_wrapped_separately[regno])
4913 continue;
4914
4915 rtx reg, mem;
4916 int offset_diff;
4917
4918 if (skip_wb
4919 && (regno == cfun->machine->frame.wb_candidate1
4920 || regno == cfun->machine->frame.wb_candidate2))
4921 continue;
4922
4923 reg = gen_rtx_REG (mode, regno);
4924 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4925 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4926
4927 regno2 = aarch64_next_callee_save (regno + 1, limit);
4928 offset_diff = cfun->machine->frame.reg_offset[regno2]
4929 - cfun->machine->frame.reg_offset[regno];
4930
4931 if (regno2 <= limit
4932 && !cfun->machine->reg_is_wrapped_separately[regno2]
4933 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4934 {
4935 rtx reg2 = gen_rtx_REG (mode, regno2);
4936 rtx mem2;
4937
4938 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4939 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4940 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4941
4942 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4943 regno = regno2;
4944 }
4945 else
4946 emit_move_insn (reg, mem);
4947 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4948 }
4949 }
4950
4951 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4952 of MODE. */
4953
4954 static inline bool
4955 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4956 {
4957 HOST_WIDE_INT multiple;
4958 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4959 && IN_RANGE (multiple, -8, 7));
4960 }
4961
4962 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4963 of MODE. */
4964
4965 static inline bool
4966 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4967 {
4968 HOST_WIDE_INT multiple;
4969 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4970 && IN_RANGE (multiple, 0, 63));
4971 }
4972
4973 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4974 of MODE. */
4975
4976 bool
4977 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4978 {
4979 HOST_WIDE_INT multiple;
4980 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4981 && IN_RANGE (multiple, -64, 63));
4982 }
4983
4984 /* Return true if OFFSET is a signed 9-bit value. */
4985
4986 bool
4987 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4988 poly_int64 offset)
4989 {
4990 HOST_WIDE_INT const_offset;
4991 return (offset.is_constant (&const_offset)
4992 && IN_RANGE (const_offset, -256, 255));
4993 }
4994
4995 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4996 of MODE. */
4997
4998 static inline bool
4999 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5000 {
5001 HOST_WIDE_INT multiple;
5002 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5003 && IN_RANGE (multiple, -256, 255));
5004 }
5005
5006 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5007 of MODE. */
5008
5009 static inline bool
5010 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5011 {
5012 HOST_WIDE_INT multiple;
5013 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5014 && IN_RANGE (multiple, 0, 4095));
5015 }
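
/* Worked examples for the predicates above (illustrative only), taking
   MODE == DImode so that the element size is 8 bytes:
   - offset_4bit_signed_scaled_p accepts multiples of 8 from -64 to +56;
   - offset_6bit_unsigned_scaled_p accepts multiples of 8 from 0 to 504;
   - aarch64_offset_7bit_signed_scaled_p accepts multiples of 8 from
     -512 to +504;
   - aarch64_offset_9bit_signed_unscaled_p accepts any byte offset in
     [-256, 255];
   - offset_12bit_unsigned_scaled_p accepts multiples of 8 from 0 to
     32760, matching the unsigned scaled addressing form of LDR/STR.  */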
5016
5017 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5018
5019 static sbitmap
5020 aarch64_get_separate_components (void)
5021 {
5022 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5023 bitmap_clear (components);
5024
5025 /* The registers we need saved to the frame. */
5026 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5027 if (aarch64_register_saved_on_entry (regno))
5028 {
5029 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5030 if (!frame_pointer_needed)
5031 offset += cfun->machine->frame.frame_size
5032 - cfun->machine->frame.hard_fp_offset;
5033 /* Check that we can access the stack slot of the register with one
5034 direct load with no adjustments needed. */
5035 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5036 bitmap_set_bit (components, regno);
5037 }
5038
5039 /* Don't mess with the hard frame pointer. */
5040 if (frame_pointer_needed)
5041 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5042
5043 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5044 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5045 /* If registers have been chosen to be stored/restored with
5046 writeback don't interfere with them to avoid having to output explicit
5047 stack adjustment instructions. */
5048 if (reg2 != INVALID_REGNUM)
5049 bitmap_clear_bit (components, reg2);
5050 if (reg1 != INVALID_REGNUM)
5051 bitmap_clear_bit (components, reg1);
5052
5053 bitmap_clear_bit (components, LR_REGNUM);
5054 bitmap_clear_bit (components, SP_REGNUM);
5055
5056 return components;
5057 }
5058
5059 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5060
5061 static sbitmap
5062 aarch64_components_for_bb (basic_block bb)
5063 {
5064 bitmap in = DF_LIVE_IN (bb);
5065 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5066 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5067 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5068
5069 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5070 bitmap_clear (components);
5071
5072 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5073 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5074 if ((!call_used_regs[regno]
5075 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5076 && (bitmap_bit_p (in, regno)
5077 || bitmap_bit_p (gen, regno)
5078 || bitmap_bit_p (kill, regno)))
5079 {
5080 unsigned regno2, offset, offset2;
5081 bitmap_set_bit (components, regno);
5082
5083 /* If there is a callee-save at an adjacent offset, add it as well,
5084 to increase the use of LDP/STP. */
5085 offset = cfun->machine->frame.reg_offset[regno];
5086 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5087
5088 if (regno2 <= LAST_SAVED_REGNUM)
5089 {
5090 offset2 = cfun->machine->frame.reg_offset[regno2];
5091 if ((offset & ~8) == (offset2 & ~8))
5092 bitmap_set_bit (components, regno2);
5093 }
5094 }
5095
5096 return components;
5097 }
5098
5099 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5100 Nothing to do for aarch64. */
5101
5102 static void
5103 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5104 {
5105 }
5106
5107 /* Return the next set bit in BMP from START onwards. Return the total number
5108 of bits in BMP if no set bit is found at or after START. */
5109
5110 static unsigned int
5111 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5112 {
5113 unsigned int nbits = SBITMAP_SIZE (bmp);
5114 if (start == nbits)
5115 return start;
5116
5117 gcc_assert (start < nbits);
5118 for (unsigned int i = start; i < nbits; i++)
5119 if (bitmap_bit_p (bmp, i))
5120 return i;
5121
5122 return nbits;
5123 }
5124
5125 /* Do the work for aarch64_emit_prologue_components and
5126 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5127 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5128 for these components or the epilogue sequence. That is, it determines
5129 whether we should emit stores or loads and what kind of CFA notes to attach
5130 to the insns. Otherwise the logic for the two sequences is very
5131 similar. */
5132
5133 static void
5134 aarch64_process_components (sbitmap components, bool prologue_p)
5135 {
5136 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5137 ? HARD_FRAME_POINTER_REGNUM
5138 : STACK_POINTER_REGNUM);
5139
5140 unsigned last_regno = SBITMAP_SIZE (components);
5141 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5142 rtx_insn *insn = NULL;
5143
5144 while (regno != last_regno)
5145 {
5146 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5147 so DFmode for the vector registers is enough. For simd functions
5148 we want to save the low 128 bits. */
5149 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5150
5151 rtx reg = gen_rtx_REG (mode, regno);
5152 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5153 if (!frame_pointer_needed)
5154 offset += cfun->machine->frame.frame_size
5155 - cfun->machine->frame.hard_fp_offset;
5156 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5157 rtx mem = gen_frame_mem (mode, addr);
5158
5159 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5160 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5161 /* No more registers to handle after REGNO.
5162 Emit a single save/restore and exit. */
5163 if (regno2 == last_regno)
5164 {
5165 insn = emit_insn (set);
5166 RTX_FRAME_RELATED_P (insn) = 1;
5167 if (prologue_p)
5168 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5169 else
5170 add_reg_note (insn, REG_CFA_RESTORE, reg);
5171 break;
5172 }
5173
5174 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5175 /* The next register is not of the same class or its offset is not
5176 mergeable with the current one into a pair. */
5177 if (!satisfies_constraint_Ump (mem)
5178 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5179 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5180 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5181 GET_MODE_SIZE (mode)))
5182 {
5183 insn = emit_insn (set);
5184 RTX_FRAME_RELATED_P (insn) = 1;
5185 if (prologue_p)
5186 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5187 else
5188 add_reg_note (insn, REG_CFA_RESTORE, reg);
5189
5190 regno = regno2;
5191 continue;
5192 }
5193
5194 /* REGNO2 can be saved/restored in a pair with REGNO. */
5195 rtx reg2 = gen_rtx_REG (mode, regno2);
5196 if (!frame_pointer_needed)
5197 offset2 += cfun->machine->frame.frame_size
5198 - cfun->machine->frame.hard_fp_offset;
5199 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5200 rtx mem2 = gen_frame_mem (mode, addr2);
5201 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5202 : gen_rtx_SET (reg2, mem2);
5203
5204 if (prologue_p)
5205 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5206 else
5207 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5208
5209 RTX_FRAME_RELATED_P (insn) = 1;
5210 if (prologue_p)
5211 {
5212 add_reg_note (insn, REG_CFA_OFFSET, set);
5213 add_reg_note (insn, REG_CFA_OFFSET, set2);
5214 }
5215 else
5216 {
5217 add_reg_note (insn, REG_CFA_RESTORE, reg);
5218 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5219 }
5220
5221 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5222 }
5223 }
5224
5225 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5226
5227 static void
5228 aarch64_emit_prologue_components (sbitmap components)
5229 {
5230 aarch64_process_components (components, true);
5231 }
5232
5233 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5234
5235 static void
5236 aarch64_emit_epilogue_components (sbitmap components)
5237 {
5238 aarch64_process_components (components, false);
5239 }
5240
5241 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5242
5243 static void
5244 aarch64_set_handled_components (sbitmap components)
5245 {
5246 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5247 if (bitmap_bit_p (components, regno))
5248 cfun->machine->reg_is_wrapped_separately[regno] = true;
5249 }
5250
5251 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
5252 determine the probe offset for alloca. */
5253
5254 static HOST_WIDE_INT
5255 aarch64_stack_clash_protection_alloca_probe_range (void)
5256 {
5257 return STACK_CLASH_CALLER_GUARD;
5258 }
5259
5260
5261 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5262 registers. If POLY_SIZE is not large enough to require a probe this function
5263 will only adjust the stack. When allocating the stack space,
5264 FRAME_RELATED_P indicates whether the allocation is frame related.
5265 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5266 arguments. If we are, we ensure that any allocation larger than the
5267 ABI-defined buffer is probed, so that the invariant of having a 1KB
5268 buffer is maintained.
5269
5270 We emit barriers after each stack adjustment to prevent optimizations from
5271 breaking the invariant that we never drop the stack more than a page. This
5272 invariant is needed to make it easier to correctly handle asynchronous
5273 events: if we were to allow the stack to be dropped by more than a page
5274 and then issue multiple probes, and a signal arrived somewhere in between,
5275 then the signal handler wouldn't know the state of the stack and could make no
5276 assumptions about which pages have been probed. */
5277
5278 static void
5279 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5280 poly_int64 poly_size,
5281 bool frame_related_p,
5282 bool final_adjustment_p)
5283 {
5284 HOST_WIDE_INT guard_size
5285 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5286 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5287 /* When doing the final adjustment for the outgoing argument size we can't
5288 assume that LR was saved at position 0. So subtract its offset from the
5289 ABI safe buffer so that we don't accidentally allow an adjustment that
5290 would result in an allocation larger than the ABI buffer without
5291 probing. */
5292 HOST_WIDE_INT min_probe_threshold
5293 = final_adjustment_p
5294 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5295 : guard_size - guard_used_by_caller;
5296
5297 poly_int64 frame_size = cfun->machine->frame.frame_size;
5298
5299 /* We should always have a positive probe threshold. */
5300 gcc_assert (min_probe_threshold > 0);
5301
5302 if (flag_stack_clash_protection && !final_adjustment_p)
5303 {
5304 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5305 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5306
5307 if (known_eq (frame_size, 0))
5308 {
5309 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5310 }
5311 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5312 && known_lt (final_adjust, guard_used_by_caller))
5313 {
5314 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5315 }
5316 }
5317
5318 /* If SIZE is not large enough to require probing, just adjust the stack and
5319 exit. */
5320 if (known_lt (poly_size, min_probe_threshold)
5321 || !flag_stack_clash_protection)
5322 {
5323 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5324 return;
5325 }
5326
5327 HOST_WIDE_INT size;
5328 /* Handle the SVE non-constant case first. */
5329 if (!poly_size.is_constant (&size))
5330 {
5331 if (dump_file)
5332 {
5333 fprintf (dump_file, "Stack clash SVE prologue: ");
5334 print_dec (poly_size, dump_file);
5335 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5336 }
5337
5338 /* First calculate the number of bytes we're actually spilling. */
5339 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5340 poly_size, temp1, temp2, false, true);
5341
5342 rtx_insn *insn = get_last_insn ();
5343
5344 if (frame_related_p)
5345 {
5346 /* This is done to provide unwinding information for the stack
5347 adjustments we're about to do; however, to prevent the optimizers
5348 from removing the R11 move and leaving the CFA note (which would be
5349 very wrong), we tie the old and new stack pointers together.
5350 The tie expands to nothing, but the optimizers will not touch
5351 the instruction. */
5352 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5353 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5354 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5355
5356 /* We want the CFA independent of the stack pointer for the
5357 duration of the loop. */
5358 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5359 RTX_FRAME_RELATED_P (insn) = 1;
5360 }
5361
5362 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5363 rtx guard_const = gen_int_mode (guard_size, Pmode);
5364
5365 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5366 stack_pointer_rtx, temp1,
5367 probe_const, guard_const));
5368
5369 /* Now reset the CFA register if needed. */
5370 if (frame_related_p)
5371 {
5372 add_reg_note (insn, REG_CFA_DEF_CFA,
5373 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5374 gen_int_mode (poly_size, Pmode)));
5375 RTX_FRAME_RELATED_P (insn) = 1;
5376 }
5377
5378 return;
5379 }
5380
5381 if (dump_file)
5382 fprintf (dump_file,
5383 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5384 " bytes, probing will be required.\n", size);
5385
5386 /* Round size down to a multiple of guard_size, and calculate the
5387 residual as the difference between the original size and the rounded
5388 size. */
5389 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5390 HOST_WIDE_INT residual = size - rounded_size;
5391
5392 /* We can handle a small number of allocations/probes inline. Otherwise
5393 punt to a loop. */
5394 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5395 {
5396 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5397 {
5398 aarch64_sub_sp (NULL, temp2, guard_size, true);
5399 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5400 guard_used_by_caller));
5401 emit_insn (gen_blockage ());
5402 }
5403 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5404 }
5405 else
5406 {
5407 /* Compute the ending address. */
5408 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5409 temp1, NULL, false, true);
5410 rtx_insn *insn = get_last_insn ();
5411
5412 /* For the initial allocation, we don't have a frame pointer
5413 set up, so we always need CFI notes. If we're doing the
5414 final allocation, then we may have a frame pointer, in which
5415 case it is the CFA, otherwise we need CFI notes.
5416
5417 We can determine which allocation we are doing by looking at
5418 the value of FRAME_RELATED_P since the final allocations are not
5419 frame related. */
5420 if (frame_related_p)
5421 {
5422 /* We want the CFA independent of the stack pointer for the
5423 duration of the loop. */
5424 add_reg_note (insn, REG_CFA_DEF_CFA,
5425 plus_constant (Pmode, temp1, rounded_size));
5426 RTX_FRAME_RELATED_P (insn) = 1;
5427 }
5428
5429 /* This allocates and probes the stack. Note that this re-uses some of
5430 the existing Ada stack protection code. However, we are guaranteed not
5431 to enter the non-loop or residual branches of that code.
5432
5433 The non-loop part won't be entered because if our allocation amount
5434 doesn't require a loop, the case above would handle it.
5435
5436 The residual amount won't be entered because TEMP1 is a multiple of
5437 the allocation size. The residual will always be 0. As such, the only
5438 part we are actually using from that code is the loop setup. The
5439 actual probing is done in aarch64_output_probe_stack_range. */
5440 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5441 stack_pointer_rtx, temp1));
5442
5443 /* Now reset the CFA register if needed. */
5444 if (frame_related_p)
5445 {
5446 add_reg_note (insn, REG_CFA_DEF_CFA,
5447 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5448 RTX_FRAME_RELATED_P (insn) = 1;
5449 }
5450
5451 emit_insn (gen_blockage ());
5452 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5453 }
5454
5455 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5456 be probed. This maintains the requirement that each page is probed at
5457 least once. For initial probing we probe only if the allocation is
5458 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5459 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5460 GUARD_SIZE. This ensures that any allocation large enough to trigger a
5461 probe here gets at least one, and that any allocation too small for this
5462 code to emit anything will already have had its page probed by the
5463 saving of FP/LR, either by this function or by any callees. If
5464 we don't have any callees then we won't have more stack adjustments and so
5465 are still safe. */
5466 if (residual)
5467 {
5468 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5469 /* If we're doing final adjustments, and we've done any full page
5470 allocations then any residual needs to be probed. */
5471 if (final_adjustment_p && rounded_size != 0)
5472 min_probe_threshold = 0;
5473 /* If doing a small final adjustment, we always probe at offset 0.
5474 This is done to avoid issues when LR is not at position 0 or when
5475 the final adjustment is smaller than the probing offset. */
5476 else if (final_adjustment_p && rounded_size == 0)
5477 residual_probe_offset = 0;
5478
5479 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5480 if (residual >= min_probe_threshold)
5481 {
5482 if (dump_file)
5483 fprintf (dump_file,
5484 "Stack clash AArch64 prologue residuals: "
5485 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5486 "\n", residual);
5487
5488 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5489 residual_probe_offset));
5490 emit_insn (gen_blockage ());
5491 }
5492 }
5493 }
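
/* A worked example of the splitting logic above, written as a standalone
   sketch with illustrative names (it is not part of the GCC code). Assuming
   the default 64KB guard and the 1KB caller buffer, a hypothetical 200KB
   constant allocation is split into three full pages, each probed after its
   adjustment, plus an 8KB residual that needs no extra probe because it is
   below guard_size - guard_used_by_caller (64512 bytes). Whether the three
   pages are probed inline or via the loop depends on the unroll limit
   (STACK_CLASH_MAX_UNROLL_PAGES). */
#include <stdint.h>

struct probe_plan
{
  int64_t rounded_size; /* bytes covered by the per-page probes */
  int64_t residual;     /* leftover bytes after the last full page */
};

static struct probe_plan
plan_probes (int64_t size, int64_t guard_size)
{
  struct probe_plan p;
  p.rounded_size = size - (size % guard_size); /* ROUND_DOWN (size, guard_size) */
  p.residual = size - p.rounded_size;
  return p;
}

/* plan_probes (200 * 1024, 64 * 1024) gives rounded_size == 196608
   (three probed pages) and residual == 8192. */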
5494
5495 /* Return 1 if the register is used by the epilogue. We need to say the
5496 return register is used, but only after epilogue generation is complete.
5497 Note that in the case of sibcalls, the values "used by the epilogue" are
5498 considered live at the start of the called function.
5499
5500 For SIMD functions we need to return 1 for FP registers that are saved and
5501 restored by a function but are not zero in call_used_regs. If we do not do
5502 this, optimizations may remove the restore of the register. */
5503
5504 int
5505 aarch64_epilogue_uses (int regno)
5506 {
5507 if (epilogue_completed)
5508 {
5509 if (regno == LR_REGNUM)
5510 return 1;
5511 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5512 return 1;
5513 }
5514 return 0;
5515 }
5516
5517 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5518 is saved at BASE + OFFSET. */
5519
5520 static void
5521 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5522 rtx base, poly_int64 offset)
5523 {
5524 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5525 add_reg_note (insn, REG_CFA_EXPRESSION,
5526 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5527 }
5528
5529 /* AArch64 stack frames generated by this compiler look like:
5530
5531 +-------------------------------+
5532 | |
5533 | incoming stack arguments |
5534 | |
5535 +-------------------------------+
5536 | | <-- incoming stack pointer (aligned)
5537 | callee-allocated save area |
5538 | for register varargs |
5539 | |
5540 +-------------------------------+
5541 | local variables | <-- frame_pointer_rtx
5542 | |
5543 +-------------------------------+
5544 | padding | \
5545 +-------------------------------+ |
5546 | callee-saved registers | | frame.saved_regs_size
5547 +-------------------------------+ |
5548 | LR' | |
5549 +-------------------------------+ |
5550 | FP' | / <- hard_frame_pointer_rtx (aligned)
5551 +-------------------------------+
5552 | dynamic allocation |
5553 +-------------------------------+
5554 | padding |
5555 +-------------------------------+
5556 | outgoing stack arguments | <-- arg_pointer
5557 | |
5558 +-------------------------------+
5559 | | <-- stack_pointer_rtx (aligned)
5560
5561 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5562 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5563 unchanged.
5564
5565 By default for stack-clash we assume the guard is at least 64KB, but this
5566 value is configurable to either 4KB or 64KB. We also force the guard size to
5567 be the same as the probing interval and both values are kept in sync.
5568
5569 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5570 on the guard size) of stack space without probing.
5571
5572 When probing is needed, we emit a probe at the start of the prologue
5573 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5574
5575 We have to track how much space has been allocated; the only stores
5576 to the stack that we track as implicit probes are the FP/LR stores.
5577
5578 For outgoing arguments we probe if the size is larger than 1KB, such that
5579 the ABI specified buffer is maintained for the next callee.
5580
5581 The following registers are reserved during frame layout and should not be
5582 used for any other purpose:
5583
5584 - r11: Used by stack clash protection when SVE is enabled.
5585 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
5586 - r14 and r15: Used for speculation tracking.
5587 - r16(IP0), r17(IP1): Used by indirect tailcalls.
5588 - r30(LR), r29(FP): Used by standard frame layout.
5589
5590 These registers must be avoided in frame layout related code unless the
5591 explicit intention is to interact with one of the features listed above. */
5592
5593 /* Generate the prologue instructions for entry into a function.
5594 Establish the stack frame by decreasing the stack pointer with a
5595 properly calculated size and, if necessary, create a frame record
5596 filled with the values of LR and previous frame pointer. The
5597 current FP is also set up if it is in use. */
5598
5599 void
5600 aarch64_expand_prologue (void)
5601 {
5602 poly_int64 frame_size = cfun->machine->frame.frame_size;
5603 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5604 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5605 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5606 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5607 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5608 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5609 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5610 rtx_insn *insn;
5611
5612 /* Sign return address for functions. */
5613 if (aarch64_return_address_signing_enabled ())
5614 {
5615 insn = emit_insn (gen_pacisp ());
5616 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5617 RTX_FRAME_RELATED_P (insn) = 1;
5618 }
5619
5620 if (flag_stack_usage_info)
5621 current_function_static_stack_size = constant_lower_bound (frame_size);
5622
5623 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5624 {
5625 if (crtl->is_leaf && !cfun->calls_alloca)
5626 {
5627 if (maybe_gt (frame_size, PROBE_INTERVAL)
5628 && maybe_gt (frame_size, get_stack_check_protect ()))
5629 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5630 (frame_size
5631 - get_stack_check_protect ()));
5632 }
5633 else if (maybe_gt (frame_size, 0))
5634 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5635 }
5636
5637 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5638 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5639
5640 /* In theory we should never have both an initial adjustment
5641 and a callee save adjustment. Verify that is the case since the
5642 code below does not handle it for -fstack-clash-protection. */
5643 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5644
5645 /* Will only probe if the initial adjustment is larger than the guard
5646 less the amount of the guard reserved for use by the caller's
5647 outgoing args. */
5648 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5649 true, false);
5650
5651 if (callee_adjust != 0)
5652 aarch64_push_regs (reg1, reg2, callee_adjust);
5653
5654 if (emit_frame_chain)
5655 {
5656 poly_int64 reg_offset = callee_adjust;
5657 if (callee_adjust == 0)
5658 {
5659 reg1 = R29_REGNUM;
5660 reg2 = R30_REGNUM;
5661 reg_offset = callee_offset;
5662 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5663 }
5664 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5665 stack_pointer_rtx, callee_offset,
5666 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5667 if (frame_pointer_needed && !frame_size.is_constant ())
5668 {
5669 /* Variable-sized frames need to describe the save slot
5670 address using DW_CFA_expression rather than DW_CFA_offset.
5671 This means that, without taking further action, the
5672 locations of the registers that we've already saved would
5673 remain based on the stack pointer even after we redefine
5674 the CFA based on the frame pointer. We therefore need new
5675 DW_CFA_expressions to re-express the save slots with addresses
5676 based on the frame pointer. */
5677 rtx_insn *insn = get_last_insn ();
5678 gcc_assert (RTX_FRAME_RELATED_P (insn));
5679
5680 /* Add an explicit CFA definition if this was previously
5681 implicit. */
5682 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5683 {
5684 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5685 callee_offset);
5686 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5687 gen_rtx_SET (hard_frame_pointer_rtx, src));
5688 }
5689
5690 /* Change the save slot expressions for the registers that
5691 we've already saved. */
5692 reg_offset -= callee_offset;
5693 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5694 reg_offset + UNITS_PER_WORD);
5695 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5696 reg_offset);
5697 }
5698 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5699 }
5700
5701 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5702 callee_adjust != 0 || emit_frame_chain);
5703 if (aarch64_simd_decl_p (cfun->decl))
5704 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5705 callee_adjust != 0 || emit_frame_chain);
5706 else
5707 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5708 callee_adjust != 0 || emit_frame_chain);
5709
5710 /* We may need to probe the final adjustment if it is larger than the guard
5711 that is assumed by the callee. */
5712 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5713 !frame_pointer_needed, true);
5714 }
5715
5716 /* Return TRUE if we can use a simple_return insn.
5717
5718 This function checks whether the callee-saved stack is empty, which
5719 means no restore actions are needed. The pro_and_epilogue pass will use
5720 this to check whether the shrink-wrapping optimization is feasible. */
5721
5722 bool
5723 aarch64_use_return_insn_p (void)
5724 {
5725 if (!reload_completed)
5726 return false;
5727
5728 if (crtl->profile)
5729 return false;
5730
5731 return known_eq (cfun->machine->frame.frame_size, 0);
5732 }
5733
5734 /* Return false for non-leaf SIMD functions in order to avoid
5735 shrink-wrapping them, since shrink-wrapping would lose the necessary
5736 saves and restores of FP registers. */
5737
5738 bool
5739 aarch64_use_simple_return_insn_p (void)
5740 {
5741 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5742 return false;
5743
5744 return true;
5745 }
5746
5747 /* Generate the epilogue instructions for returning from a function.
5748 This is almost exactly the reverse of the prologue sequence, except
5749 that we need to insert barriers to avoid scheduling loads that read
5750 from a deallocated stack, and we optimize the unwind records by
5751 emitting them all together if possible. */
5752 void
5753 aarch64_expand_epilogue (bool for_sibcall)
5754 {
5755 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5756 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5757 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5758 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5759 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5760 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5761 rtx cfi_ops = NULL;
5762 rtx_insn *insn;
5763 /* A stack clash protection prologue may not have left EP0_REGNUM or
5764 EP1_REGNUM in a usable state. The same is true for allocations
5765 with an SVE component, since we then need both temporary registers
5766 for each allocation. For stack clash we are in a usable state if
5767 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5768 HOST_WIDE_INT guard_size
5769 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5770 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5771
5772 /* We can re-use the registers when the allocation amount is smaller than
5773 guard_size - guard_used_by_caller because we won't be doing any probes
5774 then. In such situations the register should remain live with the correct
5775 value. */
5776 bool can_inherit_p = (initial_adjust.is_constant ()
5777 && final_adjust.is_constant ())
5778 && (!flag_stack_clash_protection
5779 || known_lt (initial_adjust,
5780 guard_size - guard_used_by_caller));
5781
5782 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
5783 bool need_barrier_p
5784 = maybe_ne (get_frame_size ()
5785 + cfun->machine->frame.saved_varargs_size, 0);
5786
5787 /* Emit a barrier to prevent loads from a deallocated stack. */
5788 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5789 || cfun->calls_alloca
5790 || crtl->calls_eh_return)
5791 {
5792 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5793 need_barrier_p = false;
5794 }
5795
5796 /* Restore the stack pointer from the frame pointer if it may not
5797 be the same as the stack pointer. */
5798 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5799 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5800 if (frame_pointer_needed
5801 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5802 /* If writeback is used when restoring callee-saves, the CFA
5803 is restored on the instruction doing the writeback. */
5804 aarch64_add_offset (Pmode, stack_pointer_rtx,
5805 hard_frame_pointer_rtx, -callee_offset,
5806 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5807 else
5808 /* The case where we need to re-use the register here is very rare, so
5809 avoid the complicated condition and just always emit a move if the
5810 immediate doesn't fit. */
5811 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5812
5813 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5814 callee_adjust != 0, &cfi_ops);
5815 if (aarch64_simd_decl_p (cfun->decl))
5816 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5817 callee_adjust != 0, &cfi_ops);
5818 else
5819 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5820 callee_adjust != 0, &cfi_ops);
5821
5822 if (need_barrier_p)
5823 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5824
5825 if (callee_adjust != 0)
5826 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5827
5828 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5829 {
5830 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5831 insn = get_last_insn ();
5832 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5833 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5834 RTX_FRAME_RELATED_P (insn) = 1;
5835 cfi_ops = NULL;
5836 }
5837
5838 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
5839 restrict the emit_move optimization to leaf functions. */
5840 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5841 (!can_inherit_p || !crtl->is_leaf
5842 || df_regs_ever_live_p (EP0_REGNUM)));
5843
5844 if (cfi_ops)
5845 {
5846 /* Emit delayed restores and reset the CFA to be SP. */
5847 insn = get_last_insn ();
5848 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5849 REG_NOTES (insn) = cfi_ops;
5850 RTX_FRAME_RELATED_P (insn) = 1;
5851 }
5852
5853 /* We prefer to emit the combined return/authenticate instruction RETAA,
5854 however there are three cases in which we must instead emit an explicit
5855 authentication instruction.
5856
5857 1) Sibcalls don't return in a normal way, so if we're about to call one
5858 we must authenticate.
5859
5860 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5861 generating code for !TARGET_ARMV8_3 we can't use it and must
5862 explicitly authenticate.
5863
5864 3) On an eh_return path we make extra stack adjustments to update the
5865 canonical frame address to be the exception handler's CFA. We want
5866 to authenticate using the CFA of the function which calls eh_return.
5867 */
5868 if (aarch64_return_address_signing_enabled ()
5869 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5870 {
5871 insn = emit_insn (gen_autisp ());
5872 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5873 RTX_FRAME_RELATED_P (insn) = 1;
5874 }
5875
5876 /* Stack adjustment for exception handler. */
5877 if (crtl->calls_eh_return)
5878 {
5879 /* We need to unwind the stack by the offset computed by
5880 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5881 to be SP; letting the CFA move during this adjustment
5882 is just as correct as retaining the CFA from the body
5883 of the function. Therefore, do nothing special. */
5884 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5885 }
5886
5887 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5888 if (!for_sibcall)
5889 emit_jump_insn (ret_rtx);
5890 }
5891
5892 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5893 normally or return to a previous frame after unwinding.
5894
5895 An EH return uses a single shared return sequence. The epilogue is
5896 exactly like a normal epilogue except that it has an extra input
5897 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5898 that must be applied after the frame has been destroyed. An extra label
5899 is inserted before the epilogue which initializes this register to zero,
5900 and this is the entry point for a normal return.
5901
5902 An actual EH return updates the return address, initializes the stack
5903 adjustment and jumps directly into the epilogue (bypassing the zeroing
5904 of the adjustment). Since the return address is typically saved on the
5905 stack when a function makes a call, the saved LR must be updated outside
5906 the epilogue.
5907
5908 This poses problems as the store is generated well before the epilogue,
5909 so the offset of LR is not known yet. Also optimizations will remove the
5910 store as it appears dead, even after the epilogue is generated (as the
5911 base or offset for loading LR is different in many cases).
5912
5913 To avoid these problems this implementation forces the frame pointer
5914 in eh_return functions so that the location of LR is fixed and known early.
5915 It also marks the store volatile, so no optimization is permitted to
5916 remove the store. */
5917 rtx
5918 aarch64_eh_return_handler_rtx (void)
5919 {
5920 rtx tmp = gen_frame_mem (Pmode,
5921 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5922
5923 /* Mark the store volatile, so no optimization is permitted to remove it. */
5924 MEM_VOLATILE_P (tmp) = true;
5925 return tmp;
5926 }
5927
5928 /* Output code to add DELTA to the first argument, and then jump
5929 to FUNCTION. Used for C++ multiple inheritance. */
5930 static void
5931 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5932 HOST_WIDE_INT delta,
5933 HOST_WIDE_INT vcall_offset,
5934 tree function)
5935 {
5936 /* The this pointer is always in x0. Note that this differs from
5937 Arm, where the this pointer may be bumped to r1 if r0 is required
5938 to return a pointer to an aggregate. On AArch64 a result value
5939 pointer will be in x8. */
5940 int this_regno = R0_REGNUM;
5941 rtx this_rtx, temp0, temp1, addr, funexp;
5942 rtx_insn *insn;
5943
5944 reload_completed = 1;
5945 emit_note (NOTE_INSN_PROLOGUE_END);
5946
5947 this_rtx = gen_rtx_REG (Pmode, this_regno);
5948 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
5949 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
5950
5951 if (vcall_offset == 0)
5952 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5953 else
5954 {
5955 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5956
5957 addr = this_rtx;
5958 if (delta != 0)
5959 {
5960 if (delta >= -256 && delta < 256)
5961 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5962 plus_constant (Pmode, this_rtx, delta));
5963 else
5964 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5965 temp1, temp0, false);
5966 }
5967
5968 if (Pmode == ptr_mode)
5969 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5970 else
5971 aarch64_emit_move (temp0,
5972 gen_rtx_ZERO_EXTEND (Pmode,
5973 gen_rtx_MEM (ptr_mode, addr)));
5974
5975 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5976 addr = plus_constant (Pmode, temp0, vcall_offset);
5977 else
5978 {
5979 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5980 Pmode);
5981 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5982 }
5983
5984 if (Pmode == ptr_mode)
5985 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
5986 else
5987 aarch64_emit_move (temp1,
5988 gen_rtx_SIGN_EXTEND (Pmode,
5989 gen_rtx_MEM (ptr_mode, addr)));
5990
5991 emit_insn (gen_add2_insn (this_rtx, temp1));
5992 }
5993
5994 /* Generate a tail call to the target function. */
5995 if (!TREE_USED (function))
5996 {
5997 assemble_external (function);
5998 TREE_USED (function) = 1;
5999 }
6000 funexp = XEXP (DECL_RTL (function), 0);
6001 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6002 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6003 SIBLING_CALL_P (insn) = 1;
6004
6005 insn = get_insns ();
6006 shorten_branches (insn);
6007 final_start_function (insn, file, 1);
6008 final (insn, file, 1);
6009 final_end_function ();
6010
6011 /* Stop pretending to be a post-reload pass. */
6012 reload_completed = 0;
6013 }
6014
6015 static bool
6016 aarch64_tls_referenced_p (rtx x)
6017 {
6018 if (!TARGET_HAVE_TLS)
6019 return false;
6020 subrtx_iterator::array_type array;
6021 FOR_EACH_SUBRTX (iter, array, x, ALL)
6022 {
6023 const_rtx x = *iter;
6024 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6025 return true;
6026 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6027 TLS offsets, not real symbol references. */
6028 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6029 iter.skip_subrtxes ();
6030 }
6031 return false;
6032 }
6033
6034
6035 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6036 a left shift of 0 or 12 bits. */
6037 bool
6038 aarch64_uimm12_shift (HOST_WIDE_INT val)
6039 {
6040 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6041 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6042 );
6043 }
6044
6045 /* Return the largest value no greater than VAL that can be encoded as a
6046 12-bit unsigned immediate with a left shift of 0 or 12 bits. */
6047 static HOST_WIDE_INT
6048 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6049 {
6050 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6051 handle correctly. */
6052 gcc_assert ((val & 0xffffff) == val);
6053
6054 if (((val & 0xfff) << 0) == val)
6055 return val;
6056
6057 return val & (0xfff << 12);
6058 }
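
/* A minimal standalone sketch of the two helpers above, using plain uint64_t
   instead of HOST_WIDE_INT; the names are made up for illustration. An
   add/sub immediate is a 12-bit value optionally shifted left by 12, so
   0xfff and 0xfff000 pass the test while 0x1001 does not, and clamping
   0x123456 yields 0x123000. */
#include <stdint.h>
#include <stdbool.h>

static bool
uimm12_shift_p (uint64_t val)
{
  /* All set bits fit either in bits [11:0] or in bits [23:12]. */
  return (val & 0xfffULL) == val || (val & (0xfffULL << 12)) == val;
}

static uint64_t
clamp_to_uimm12_shift (uint64_t val)
{
  /* VAL is assumed to fit in 24 bits; keep it if the low 12 bits are the
     whole value, otherwise round down to a 12-bit value shifted by 12. */
  return (val & 0xfffULL) == val ? val : (val & (0xfffULL << 12));
}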
6059
6060 /* Return true if val is an immediate that can be loaded into a
6061 register by a MOVZ instruction. */
6062 static bool
6063 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6064 {
6065 if (GET_MODE_SIZE (mode) > 4)
6066 {
6067 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6068 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6069 return 1;
6070 }
6071 else
6072 {
6073 /* Ignore sign extension. */
6074 val &= (HOST_WIDE_INT) 0xffffffff;
6075 }
6076 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6077 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6078 }
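
/* Illustrative sketch of the 64-bit MOVZ test above with plain types; the
   helper name is an assumption made for illustration. A MOVZ immediate is a
   single 16-bit chunk placed at bit 0, 16, 32 or 48, so 0x12340000 and
   0xabcd000000000000 qualify while 0x12345678 does not. */
#include <stdint.h>
#include <stdbool.h>

static bool
movz_imm64_p (uint64_t val)
{
  for (int shift = 0; shift < 64; shift += 16)
    if ((val & (0xffffULL << shift)) == val)
      return true; /* all set bits lie in one 16-bit chunk */
  return false;
}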
6079
6080 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6081 64-bit (DImode) integer. */
6082
6083 static unsigned HOST_WIDE_INT
6084 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6085 {
6086 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6087 while (size < 64)
6088 {
6089 val &= (HOST_WIDE_INT_1U << size) - 1;
6090 val |= val << size;
6091 size *= 2;
6092 }
6093 return val;
6094 }
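
/* A worked example of the replication above, assuming an 8-bit inner mode:
   starting from 0xab, each iteration doubles the pattern width, giving
   0xab -> 0xabab -> 0xabababab -> 0xabababababababab, after which the value
   can be checked with the 64-bit bitmask logic below. A standalone sketch
   with an illustrative name: */
#include <stdint.h>

static uint64_t
replicate_to_64 (uint64_t val, unsigned int width)
{
  /* WIDTH is the precision of the inner mode, e.g. 8 for QImode. */
  while (width < 64)
    {
      val &= (UINT64_C (1) << width) - 1; /* keep only the current pattern */
      val |= val << width;                /* duplicate it */
      width *= 2;
    }
  return val;
}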
6095
6096 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6097
6098 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6099 {
6100 0x0000000100000001ull,
6101 0x0001000100010001ull,
6102 0x0101010101010101ull,
6103 0x1111111111111111ull,
6104 0x5555555555555555ull,
6105 };
6106
6107
6108 /* Return true if val is a valid bitmask immediate. */
6109
6110 bool
6111 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6112 {
6113 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6114 int bits;
6115
6116 /* Check for a single sequence of one bits and return quickly if so.
6117 The special cases of all ones and all zeroes return false.
6118 val = aarch64_replicate_bitmask_imm (val_in, mode);
6119 tmp = val + (val & -val);
6120
6121 if (tmp == (tmp & -tmp))
6122 return (val + 1) > 1;
6123
6124 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6125 if (mode == SImode)
6126 val = (val << 32) | (val & 0xffffffff);
6127
6128 /* Invert if the immediate doesn't start with a zero bit - this means we
6129 only need to search for sequences of one bits. */
6130 if (val & 1)
6131 val = ~val;
6132
6133 /* Find the first set bit and set tmp to val with the first sequence of one
6134 bits removed. Return success if there is a single sequence of ones. */
6135 first_one = val & -val;
6136 tmp = val & (val + first_one);
6137
6138 if (tmp == 0)
6139 return true;
6140
6141 /* Find the next set bit and compute the difference in bit position. */
6142 next_one = tmp & -tmp;
6143 bits = clz_hwi (first_one) - clz_hwi (next_one);
6144 mask = val ^ tmp;
6145
6146 /* Check the bit position difference is a power of 2, and that the first
6147 sequence of one bits fits within 'bits' bits. */
6148 if ((mask >> bits) != 0 || bits != (bits & -bits))
6149 return false;
6150
6151 /* Check the sequence of one bits is repeated 64/bits times. */
6152 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6153 }
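
/* Two illustrations of the test above. First, the fast path: adding the
   lowest set bit to VAL clears its lowest run of ones, so a single
   contiguous run leaves either zero or a power of two behind. A minimal
   standalone sketch of just that fast path, with an illustrative name: */
#include <stdint.h>
#include <stdbool.h>

static bool
single_run_of_ones_p (uint64_t val)
{
  /* 0x0000ffff00000000 -> true; 0x00ff00ff00ff00ff -> false (it has more
     than one run and is handled by the repetition check above). */
  uint64_t tmp = val + (val & -val);
  return val != 0 && ~val != 0 && (tmp & (tmp - 1)) == 0;
}

/* Second, a trace of the repetition check for 0x00ff00ff00ff00ff: the value
   is inverted to 0xff00ff00ff00ff00, the first run of ones starts at bit 8
   and the next at bit 24, so BITS is 16 and MASK is 0xff00; multiplying
   0xff00 by 0x0001000100010001 reproduces the inverted value, so the
   immediate is accepted as a repeating 16-bit element with 8 ones. */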
6154
6155 /* Create a mask of ones covering the lowest to the highest bit set in VAL_IN.
6156 Assumed precondition: VAL_IN is not zero. */
6157
6158 unsigned HOST_WIDE_INT
6159 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6160 {
6161 int lowest_bit_set = ctz_hwi (val_in);
6162 int highest_bit_set = floor_log2 (val_in);
6163 gcc_assert (val_in != 0);
6164
6165 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6166 (HOST_WIDE_INT_1U << lowest_bit_set));
6167 }
6168
6169 /* Create a constant in which the bits outside the lowest-to-highest set-bit
6170 range of VAL_IN are set to 1. */
6171
6172 unsigned HOST_WIDE_INT
6173 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6174 {
6175 return val_in | ~aarch64_and_split_imm1 (val_in);
6176 }
6177
6178 /* Return true if an AND with immediate VAL_IN can be split into two AND instructions with bitmask immediates. */
6179
6180 bool
6181 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6182 {
6183 scalar_int_mode int_mode;
6184 if (!is_a <scalar_int_mode> (mode, &int_mode))
6185 return false;
6186
6187 if (aarch64_bitmask_imm (val_in, int_mode))
6188 return false;
6189
6190 if (aarch64_move_imm (val_in, int_mode))
6191 return false;
6192
6193 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6194
6195 return aarch64_bitmask_imm (imm2, int_mode);
6196 }
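
/* A worked example of the split above, with values chosen for illustration
   (x0 stands for any destination register): VAL_IN = 0x0000ff00ff000000 is
   neither a bitmask immediate (its two runs of ones do not form a repeating
   pattern) nor a MOV immediate. Its set bits span bits 24 to 47, so
   aarch64_and_split_imm1 produces 0x0000ffffff000000 (a single run, a valid
   bitmask immediate) and aarch64_and_split_imm2 produces 0xffffff00ffffffff
   (all ones except bits 32-39, also a valid bitmask immediate). The two
   masks intersect in exactly VAL_IN, so the AND can be expanded as two AND
   instructions:

     and x0, x0, #0x0000ffffff000000
     and x0, x0, #0xffffff00ffffffff  */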
6197
6198 /* Return true if val is an immediate that can be loaded into a
6199 register in a single instruction. */
6200 bool
6201 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6202 {
6203 scalar_int_mode int_mode;
6204 if (!is_a <scalar_int_mode> (mode, &int_mode))
6205 return false;
6206
6207 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6208 return 1;
6209 return aarch64_bitmask_imm (val, int_mode);
6210 }
6211
6212 static bool
6213 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6214 {
6215 rtx base, offset;
6216
6217 if (GET_CODE (x) == HIGH)
6218 return true;
6219
6220 /* There's no way to calculate VL-based values using relocations. */
6221 subrtx_iterator::array_type array;
6222 FOR_EACH_SUBRTX (iter, array, x, ALL)
6223 if (GET_CODE (*iter) == CONST_POLY_INT)
6224 return true;
6225
6226 split_const (x, &base, &offset);
6227 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6228 {
6229 if (aarch64_classify_symbol (base, INTVAL (offset))
6230 != SYMBOL_FORCE_TO_MEM)
6231 return true;
6232 else
6233 /* Avoid generating a 64-bit relocation in ILP32; leave
6234 to aarch64_expand_mov_immediate to handle it properly. */
6235 return mode != ptr_mode;
6236 }
6237
6238 return aarch64_tls_referenced_p (x);
6239 }
6240
6241 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6242 The expansion for a table switch is quite expensive due to the number
6243 of instructions, the table lookup and the hard-to-predict indirect jump.
6244 When optimizing for speed with -O3 enabled, use the per-core tuning if
6245 set, otherwise use tables for > 16 cases as a tradeoff between size and
6246 performance. When optimizing for size, use the default setting. */
6247
6248 static unsigned int
6249 aarch64_case_values_threshold (void)
6250 {
6251 /* Use the specified limit for the number of cases before using jump
6252 tables at higher optimization levels. */
6253 if (optimize > 2
6254 && selected_cpu->tune->max_case_values != 0)
6255 return selected_cpu->tune->max_case_values;
6256 else
6257 return optimize_size ? default_case_values_threshold () : 17;
6258 }
6259
6260 /* Return true if register REGNO is a valid index register.
6261 STRICT_P is true if REG_OK_STRICT is in effect. */
6262
6263 bool
6264 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6265 {
6266 if (!HARD_REGISTER_NUM_P (regno))
6267 {
6268 if (!strict_p)
6269 return true;
6270
6271 if (!reg_renumber)
6272 return false;
6273
6274 regno = reg_renumber[regno];
6275 }
6276 return GP_REGNUM_P (regno);
6277 }
6278
6279 /* Return true if register REGNO is a valid base register for mode MODE.
6280 STRICT_P is true if REG_OK_STRICT is in effect. */
6281
6282 bool
6283 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6284 {
6285 if (!HARD_REGISTER_NUM_P (regno))
6286 {
6287 if (!strict_p)
6288 return true;
6289
6290 if (!reg_renumber)
6291 return false;
6292
6293 regno = reg_renumber[regno];
6294 }
6295
6296 /* The fake registers will be eliminated to either the stack or
6297 hard frame pointer, both of which are usually valid base registers.
6298 Reload deals with the cases where the eliminated form isn't valid. */
6299 return (GP_REGNUM_P (regno)
6300 || regno == SP_REGNUM
6301 || regno == FRAME_POINTER_REGNUM
6302 || regno == ARG_POINTER_REGNUM);
6303 }
6304
6305 /* Return true if X is a valid base register for mode MODE.
6306 STRICT_P is true if REG_OK_STRICT is in effect. */
6307
6308 static bool
6309 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6310 {
6311 if (!strict_p
6312 && GET_CODE (x) == SUBREG
6313 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6314 x = SUBREG_REG (x);
6315
6316 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6317 }
6318
6319 /* Return true if the address offset X is a valid index. If it is, fill in INFO
6320 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6321
6322 static bool
6323 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6324 machine_mode mode, bool strict_p)
6325 {
6326 enum aarch64_address_type type;
6327 rtx index;
6328 int shift;
6329
6330 /* (reg:P) */
6331 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6332 && GET_MODE (x) == Pmode)
6333 {
6334 type = ADDRESS_REG_REG;
6335 index = x;
6336 shift = 0;
6337 }
6338 /* (sign_extend:DI (reg:SI)) */
6339 else if ((GET_CODE (x) == SIGN_EXTEND
6340 || GET_CODE (x) == ZERO_EXTEND)
6341 && GET_MODE (x) == DImode
6342 && GET_MODE (XEXP (x, 0)) == SImode)
6343 {
6344 type = (GET_CODE (x) == SIGN_EXTEND)
6345 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6346 index = XEXP (x, 0);
6347 shift = 0;
6348 }
6349 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6350 else if (GET_CODE (x) == MULT
6351 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6352 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6353 && GET_MODE (XEXP (x, 0)) == DImode
6354 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6355 && CONST_INT_P (XEXP (x, 1)))
6356 {
6357 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6358 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6359 index = XEXP (XEXP (x, 0), 0);
6360 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6361 }
6362 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6363 else if (GET_CODE (x) == ASHIFT
6364 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6365 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6366 && GET_MODE (XEXP (x, 0)) == DImode
6367 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6368 && CONST_INT_P (XEXP (x, 1)))
6369 {
6370 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6371 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6372 index = XEXP (XEXP (x, 0), 0);
6373 shift = INTVAL (XEXP (x, 1));
6374 }
6375 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6376 else if ((GET_CODE (x) == SIGN_EXTRACT
6377 || GET_CODE (x) == ZERO_EXTRACT)
6378 && GET_MODE (x) == DImode
6379 && GET_CODE (XEXP (x, 0)) == MULT
6380 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6381 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6382 {
6383 type = (GET_CODE (x) == SIGN_EXTRACT)
6384 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6385 index = XEXP (XEXP (x, 0), 0);
6386 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6387 if (INTVAL (XEXP (x, 1)) != 32 + shift
6388 || INTVAL (XEXP (x, 2)) != 0)
6389 shift = -1;
6390 }
6391 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6392 (const_int 0xffffffff<<shift)) */
6393 else if (GET_CODE (x) == AND
6394 && GET_MODE (x) == DImode
6395 && GET_CODE (XEXP (x, 0)) == MULT
6396 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6397 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6398 && CONST_INT_P (XEXP (x, 1)))
6399 {
6400 type = ADDRESS_REG_UXTW;
6401 index = XEXP (XEXP (x, 0), 0);
6402 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6403 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6404 shift = -1;
6405 }
6406 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6407 else if ((GET_CODE (x) == SIGN_EXTRACT
6408 || GET_CODE (x) == ZERO_EXTRACT)
6409 && GET_MODE (x) == DImode
6410 && GET_CODE (XEXP (x, 0)) == ASHIFT
6411 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6412 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6413 {
6414 type = (GET_CODE (x) == SIGN_EXTRACT)
6415 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6416 index = XEXP (XEXP (x, 0), 0);
6417 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6418 if (INTVAL (XEXP (x, 1)) != 32 + shift
6419 || INTVAL (XEXP (x, 2)) != 0)
6420 shift = -1;
6421 }
6422 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6423 (const_int 0xffffffff<<shift)) */
6424 else if (GET_CODE (x) == AND
6425 && GET_MODE (x) == DImode
6426 && GET_CODE (XEXP (x, 0)) == ASHIFT
6427 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6428 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6429 && CONST_INT_P (XEXP (x, 1)))
6430 {
6431 type = ADDRESS_REG_UXTW;
6432 index = XEXP (XEXP (x, 0), 0);
6433 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6434 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6435 shift = -1;
6436 }
6437 /* (mult:P (reg:P) (const_int scale)) */
6438 else if (GET_CODE (x) == MULT
6439 && GET_MODE (x) == Pmode
6440 && GET_MODE (XEXP (x, 0)) == Pmode
6441 && CONST_INT_P (XEXP (x, 1)))
6442 {
6443 type = ADDRESS_REG_REG;
6444 index = XEXP (x, 0);
6445 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6446 }
6447 /* (ashift:P (reg:P) (const_int shift)) */
6448 else if (GET_CODE (x) == ASHIFT
6449 && GET_MODE (x) == Pmode
6450 && GET_MODE (XEXP (x, 0)) == Pmode
6451 && CONST_INT_P (XEXP (x, 1)))
6452 {
6453 type = ADDRESS_REG_REG;
6454 index = XEXP (x, 0);
6455 shift = INTVAL (XEXP (x, 1));
6456 }
6457 else
6458 return false;
6459
6460 if (!strict_p
6461 && GET_CODE (index) == SUBREG
6462 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6463 index = SUBREG_REG (index);
6464
6465 if (aarch64_sve_data_mode_p (mode))
6466 {
6467 if (type != ADDRESS_REG_REG
6468 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6469 return false;
6470 }
6471 else
6472 {
6473 if (shift != 0
6474 && !(IN_RANGE (shift, 1, 3)
6475 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6476 return false;
6477 }
6478
6479 if (REG_P (index)
6480 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6481 {
6482 info->type = type;
6483 info->offset = index;
6484 info->shift = shift;
6485 return true;
6486 }
6487
6488 return false;
6489 }
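
/* An illustrative case of the classification above (register numbers are
   arbitrary): for a DImode access, an index of the form
     (ashift:DI (reg:DI x1) (const_int 3))
   matches the ASHIFT arm and yields ADDRESS_REG_REG with a shift of 3,
   i.e. the [Xn, Xm, LSL #3] addressing mode, accepted because 1 << 3
   equals GET_MODE_SIZE (DImode). The same index with a shift of 2 would
   be rejected, since the scale must match the access size. */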
6490
6491 /* Return true if MODE is one of the modes for which we
6492 support LDP/STP operations. */
6493
6494 static bool
6495 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6496 {
6497 return mode == SImode || mode == DImode
6498 || mode == SFmode || mode == DFmode
6499 || (aarch64_vector_mode_supported_p (mode)
6500 && (known_eq (GET_MODE_SIZE (mode), 8)
6501 || (known_eq (GET_MODE_SIZE (mode), 16)
6502 && (aarch64_tune_params.extra_tuning_flags
6503 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6504 }
6505
6506 /* Return true if REGNO is a virtual pointer register, or an eliminable
6507 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6508 include stack_pointer or hard_frame_pointer. */
6509 static bool
6510 virt_or_elim_regno_p (unsigned regno)
6511 {
6512 return ((regno >= FIRST_VIRTUAL_REGISTER
6513 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6514 || regno == FRAME_POINTER_REGNUM
6515 || regno == ARG_POINTER_REGNUM);
6516 }
6517
6518 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6519 If it is, fill in INFO appropriately. STRICT_P is true if
6520 REG_OK_STRICT is in effect. */
6521
6522 bool
6523 aarch64_classify_address (struct aarch64_address_info *info,
6524 rtx x, machine_mode mode, bool strict_p,
6525 aarch64_addr_query_type type)
6526 {
6527 enum rtx_code code = GET_CODE (x);
6528 rtx op0, op1;
6529 poly_int64 offset;
6530
6531 HOST_WIDE_INT const_size;
6532
6533 /* On BE, we use load/store pair for all large int mode load/stores.
6534 TI/TFmode may also use a load/store pair. */
6535 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6536 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6537 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6538 || type == ADDR_QUERY_LDP_STP_N
6539 || mode == TImode
6540 || mode == TFmode
6541 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6542
6543 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
6544 corresponds to the actual size of the memory being loaded/stored, while the
6545 mode used for the corresponding addressing is half that size. */
6546 if (type == ADDR_QUERY_LDP_STP_N
6547 && known_eq (GET_MODE_SIZE (mode), 16))
6548 mode = DFmode;
6549
6550 bool allow_reg_index_p = (!load_store_pair_p
6551 && (known_lt (GET_MODE_SIZE (mode), 16)
6552 || vec_flags == VEC_ADVSIMD
6553 || vec_flags == VEC_SVE_DATA));
6554
6555 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6556 [Rn, #offset, MUL VL]. */
6557 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6558 && (code != REG && code != PLUS))
6559 return false;
6560
6561 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6562 REG addressing. */
6563 if (advsimd_struct_p
6564 && !BYTES_BIG_ENDIAN
6565 && (code != POST_INC && code != REG))
6566 return false;
6567
6568 gcc_checking_assert (GET_MODE (x) == VOIDmode
6569 || SCALAR_INT_MODE_P (GET_MODE (x)));
6570
6571 switch (code)
6572 {
6573 case REG:
6574 case SUBREG:
6575 info->type = ADDRESS_REG_IMM;
6576 info->base = x;
6577 info->offset = const0_rtx;
6578 info->const_offset = 0;
6579 return aarch64_base_register_rtx_p (x, strict_p);
6580
6581 case PLUS:
6582 op0 = XEXP (x, 0);
6583 op1 = XEXP (x, 1);
6584
6585 if (! strict_p
6586 && REG_P (op0)
6587 && virt_or_elim_regno_p (REGNO (op0))
6588 && poly_int_rtx_p (op1, &offset))
6589 {
6590 info->type = ADDRESS_REG_IMM;
6591 info->base = op0;
6592 info->offset = op1;
6593 info->const_offset = offset;
6594
6595 return true;
6596 }
6597
6598 if (maybe_ne (GET_MODE_SIZE (mode), 0)
6599 && aarch64_base_register_rtx_p (op0, strict_p)
6600 && poly_int_rtx_p (op1, &offset))
6601 {
6602 info->type = ADDRESS_REG_IMM;
6603 info->base = op0;
6604 info->offset = op1;
6605 info->const_offset = offset;
6606
6607 /* TImode and TFmode values are allowed in both pairs of X
6608 registers and individual Q registers. The available
6609 address modes are:
6610 X,X: 7-bit signed scaled offset
6611 Q: 9-bit signed offset
6612 We conservatively require an offset representable in either mode.
6613 When performing the check for pairs of X registers i.e. LDP/STP
6614 pass down DImode since that is the natural size of the LDP/STP
6615 instruction memory accesses. */
6616 if (mode == TImode || mode == TFmode)
6617 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6618 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6619 || offset_12bit_unsigned_scaled_p (mode, offset)));
6620
6621 /* A 7-bit offset check because OImode will emit an ldp/stp
6622 instruction (only big-endian will get here).
6623 For ldp/stp instructions, the offset is scaled for the size of a
6624 single element of the pair. */
6625 if (mode == OImode)
6626 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6627
6628 /* Three 9/12-bit offset checks because CImode will emit three
6629 ldr/str instructions (only big-endian will get here). */
6630 if (mode == CImode)
6631 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6632 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6633 offset + 32)
6634 || offset_12bit_unsigned_scaled_p (V16QImode,
6635 offset + 32)));
6636
6637 /* Two 7-bit offset checks because XImode will emit two ldp/stp
6638 instructions (only big-endian will get here). */
6639 if (mode == XImode)
6640 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6641 && aarch64_offset_7bit_signed_scaled_p (TImode,
6642 offset + 32));
6643
6644 /* Make "m" use the LD1 offset range for SVE data modes, so
6645 that pre-RTL optimizers like ivopts will work to that
6646 instead of the wider LDR/STR range. */
6647 if (vec_flags == VEC_SVE_DATA)
6648 return (type == ADDR_QUERY_M
6649 ? offset_4bit_signed_scaled_p (mode, offset)
6650 : offset_9bit_signed_scaled_p (mode, offset));
6651
6652 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6653 {
6654 poly_int64 end_offset = (offset
6655 + GET_MODE_SIZE (mode)
6656 - BYTES_PER_SVE_VECTOR);
6657 return (type == ADDR_QUERY_M
6658 ? offset_4bit_signed_scaled_p (mode, offset)
6659 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6660 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6661 end_offset)));
6662 }
6663
6664 if (vec_flags == VEC_SVE_PRED)
6665 return offset_9bit_signed_scaled_p (mode, offset);
6666
6667 if (load_store_pair_p)
6668 return ((known_eq (GET_MODE_SIZE (mode), 4)
6669 || known_eq (GET_MODE_SIZE (mode), 8)
6670 || known_eq (GET_MODE_SIZE (mode), 16))
6671 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6672 else
6673 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6674 || offset_12bit_unsigned_scaled_p (mode, offset));
6675 }
6676
6677 if (allow_reg_index_p)
6678 {
6679 /* Look for base + (scaled/extended) index register. */
6680 if (aarch64_base_register_rtx_p (op0, strict_p)
6681 && aarch64_classify_index (info, op1, mode, strict_p))
6682 {
6683 info->base = op0;
6684 return true;
6685 }
6686 if (aarch64_base_register_rtx_p (op1, strict_p)
6687 && aarch64_classify_index (info, op0, mode, strict_p))
6688 {
6689 info->base = op1;
6690 return true;
6691 }
6692 }
6693
6694 return false;
6695
6696 case POST_INC:
6697 case POST_DEC:
6698 case PRE_INC:
6699 case PRE_DEC:
6700 info->type = ADDRESS_REG_WB;
6701 info->base = XEXP (x, 0);
6702 info->offset = NULL_RTX;
6703 return aarch64_base_register_rtx_p (info->base, strict_p);
6704
6705 case POST_MODIFY:
6706 case PRE_MODIFY:
6707 info->type = ADDRESS_REG_WB;
6708 info->base = XEXP (x, 0);
6709 if (GET_CODE (XEXP (x, 1)) == PLUS
6710 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6711 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6712 && aarch64_base_register_rtx_p (info->base, strict_p))
6713 {
6714 info->offset = XEXP (XEXP (x, 1), 1);
6715 info->const_offset = offset;
6716
6717 /* TImode and TFmode values are allowed in both pairs of X
6718 registers and individual Q registers. The available
6719 address modes are:
6720 X,X: 7-bit signed scaled offset
6721 Q: 9-bit signed offset
6722 We conservatively require an offset representable in either mode.
6723 */
6724 if (mode == TImode || mode == TFmode)
6725 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6726 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6727
6728 if (load_store_pair_p)
6729 return ((known_eq (GET_MODE_SIZE (mode), 4)
6730 || known_eq (GET_MODE_SIZE (mode), 8)
6731 || known_eq (GET_MODE_SIZE (mode), 16))
6732 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6733 else
6734 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6735 }
6736 return false;
6737
6738 case CONST:
6739 case SYMBOL_REF:
6740 case LABEL_REF:
6741 /* load literal: pc-relative constant pool entry. Only supported
6742 for SI mode or larger. */
6743 info->type = ADDRESS_SYMBOLIC;
6744
6745 if (!load_store_pair_p
6746 && GET_MODE_SIZE (mode).is_constant (&const_size)
6747 && const_size >= 4)
6748 {
6749 rtx sym, addend;
6750
6751 split_const (x, &sym, &addend);
6752 return ((GET_CODE (sym) == LABEL_REF
6753 || (GET_CODE (sym) == SYMBOL_REF
6754 && CONSTANT_POOL_ADDRESS_P (sym)
6755 && aarch64_pcrelative_literal_loads)));
6756 }
6757 return false;
6758
6759 case LO_SUM:
6760 info->type = ADDRESS_LO_SUM;
6761 info->base = XEXP (x, 0);
6762 info->offset = XEXP (x, 1);
6763 if (allow_reg_index_p
6764 && aarch64_base_register_rtx_p (info->base, strict_p))
6765 {
6766 rtx sym, offs;
6767 split_const (info->offset, &sym, &offs);
6768 if (GET_CODE (sym) == SYMBOL_REF
6769 && (aarch64_classify_symbol (sym, INTVAL (offs))
6770 == SYMBOL_SMALL_ABSOLUTE))
6771 {
6772 /* The symbol and offset must be aligned to the access size. */
6773 unsigned int align;
6774
6775 if (CONSTANT_POOL_ADDRESS_P (sym))
6776 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6777 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6778 {
6779 tree exp = SYMBOL_REF_DECL (sym);
6780 align = TYPE_ALIGN (TREE_TYPE (exp));
6781 align = aarch64_constant_alignment (exp, align);
6782 }
6783 else if (SYMBOL_REF_DECL (sym))
6784 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6785 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6786 && SYMBOL_REF_BLOCK (sym) != NULL)
6787 align = SYMBOL_REF_BLOCK (sym)->alignment;
6788 else
6789 align = BITS_PER_UNIT;
6790
6791 poly_int64 ref_size = GET_MODE_SIZE (mode);
6792 if (known_eq (ref_size, 0))
6793 ref_size = GET_MODE_SIZE (DImode);
6794
6795 return (multiple_p (INTVAL (offs), ref_size)
6796 && multiple_p (align / BITS_PER_UNIT, ref_size));
6797 }
6798 }
6799 return false;
6800
6801 default:
6802 return false;
6803 }
6804 }
6805
6806 /* Return true if the address X is valid for a PRFM instruction.
6807 STRICT_P is true if we should do strict checking with
6808 aarch64_classify_address. */
6809
6810 bool
6811 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6812 {
6813 struct aarch64_address_info addr;
6814
6815 /* PRFM accepts the same addresses as DImode... */
6816 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6817 if (!res)
6818 return false;
6819
6820 /* ... except writeback forms. */
6821 return addr.type != ADDRESS_REG_WB;
6822 }
6823
6824 bool
6825 aarch64_symbolic_address_p (rtx x)
6826 {
6827 rtx offset;
6828
6829 split_const (x, &x, &offset);
6830 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6831 }
6832
6833 /* Classify the base of symbolic expression X. */
6834
6835 enum aarch64_symbol_type
6836 aarch64_classify_symbolic_expression (rtx x)
6837 {
6838 rtx offset;
6839
6840 split_const (x, &x, &offset);
6841 return aarch64_classify_symbol (x, INTVAL (offset));
6842 }
6843
6844
6845 /* Return TRUE if X is a legitimate address for accessing memory in
6846 mode MODE. */
6847 static bool
6848 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6849 {
6850 struct aarch64_address_info addr;
6851
6852 return aarch64_classify_address (&addr, x, mode, strict_p);
6853 }
6854
6855 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6856 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6857 bool
6858 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6859 aarch64_addr_query_type type)
6860 {
6861 struct aarch64_address_info addr;
6862
6863 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6864 }
6865
6866 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6867
6868 static bool
6869 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6870 poly_int64 orig_offset,
6871 machine_mode mode)
6872 {
6873 HOST_WIDE_INT size;
6874 if (GET_MODE_SIZE (mode).is_constant (&size))
6875 {
6876 HOST_WIDE_INT const_offset, second_offset;
6877
6878 /* A general SVE offset is A * VQ + B. Remove the A component from
6879 coefficient 0 in order to get the constant B. */
6880 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6881
6882	      /* Split an out-of-range address displacement into a base and
6883		 offset.  Use a 4KB range for 1- and 2-byte accesses and a 16KB
6884		 range otherwise, to increase opportunities for sharing the base
6885		 address between accesses of different sizes.  Unaligned accesses
6886		 use the signed 9-bit range; TImode/TFmode use the intersection of
6887		 the signed scaled 7-bit and signed 9-bit offsets.  */
6888 if (mode == TImode || mode == TFmode)
6889 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6890 else if ((const_offset & (size - 1)) != 0)
6891 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6892 else
6893 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6894
6895 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6896 return false;
6897
6898 /* Split the offset into second_offset and the rest. */
6899 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6900 *offset2 = gen_int_mode (second_offset, Pmode);
6901 return true;
6902 }
6903 else
6904 {
6905 /* Get the mode we should use as the basis of the range. For structure
6906 modes this is the mode of one vector. */
6907 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6908 machine_mode step_mode
6909 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6910
6911 /* Get the "mul vl" multiplier we'd like to use. */
6912 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6913 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6914 if (vec_flags & VEC_SVE_DATA)
6915 /* LDR supports a 9-bit range, but the move patterns for
6916 structure modes require all vectors to be in range of the
6917 same base. The simplest way of accomodating that while still
6918 promoting reuse of anchor points between different modes is
6919 to use an 8-bit range unconditionally. */
6920 vnum = ((vnum + 128) & 255) - 128;
6921 else
6922 /* Predicates are only handled singly, so we might as well use
6923 the full range. */
6924 vnum = ((vnum + 256) & 511) - 256;
6925 if (vnum == 0)
6926 return false;
6927
6928 /* Convert the "mul vl" multiplier into a byte offset. */
6929 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6930 if (known_eq (second_offset, orig_offset))
6931 return false;
6932
6933 /* Split the offset into second_offset and the rest. */
6934 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6935 *offset2 = gen_int_mode (second_offset, Pmode);
6936 return true;
6937 }
6938 }
6939
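/* Editor's note: a minimal standalone sketch (not part of GCC) of the
   constant-size offset splitting performed above, assuming a 16-byte
   TImode-style access, which uses the intersection of the scaled 7-bit
   and signed 9-bit ranges.  The residual kept on the access must fit the
   addressing mode; the anchor is the remainder.  */
#include <stdio.h>

int
main (void)
{
  long long const_offset = 0x12345;

  /* Mirror of ((const_offset + 0x100) & 0x1f8) - 0x100: a signed,
     8-byte-aligned value in [-256, 248].  */
  long long second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
  long long anchor = const_offset - second_offset;

  printf ("offset %#llx = anchor %#llx + residual %lld\n",
	  (unsigned long long) const_offset,
	  (unsigned long long) anchor, second_offset);
  return 0;
}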
6940 /* Return the binary representation of floating point constant VALUE in INTVAL.
6941 If the value cannot be converted, return false without setting INTVAL.
6942    The conversion is done in the mode of VALUE.  */
6943 bool
6944 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6945 {
6946
6947 /* We make a general exception for 0. */
6948 if (aarch64_float_const_zero_rtx_p (value))
6949 {
6950 *intval = 0;
6951 return true;
6952 }
6953
6954 scalar_float_mode mode;
6955 if (GET_CODE (value) != CONST_DOUBLE
6956 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6957 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6958 /* Only support up to DF mode. */
6959 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6960 return false;
6961
6962 unsigned HOST_WIDE_INT ival = 0;
6963
6964 long res[2];
6965 real_to_target (res,
6966 CONST_DOUBLE_REAL_VALUE (value),
6967 REAL_MODE_FORMAT (mode));
6968
6969 if (mode == DFmode)
6970 {
6971 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6972 ival = zext_hwi (res[order], 32);
6973 ival |= (zext_hwi (res[1 - order], 32) << 32);
6974 }
6975 else
6976 ival = zext_hwi (res[0], 32);
6977
6978 *intval = ival;
6979 return true;
6980 }
6981
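/* Editor's note: a standalone illustration (not GCC code) of what
   reinterpreting a DFmode constant as an integer means: the IEEE-754
   bit pattern of the double is copied verbatim into a 64-bit value,
   e.g. 1.0 becomes 0x3ff0000000000000.  */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  double d = 1.0;
  uint64_t bits;
  memcpy (&bits, &d, sizeof bits);	/* Same bits, different type.  */
  printf ("%f -> %#018llx\n", d, (unsigned long long) bits);
  return 0;
}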
6982 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6983 single MOV(+MOVK) followed by an FMOV. */
6984 bool
6985 aarch64_float_const_rtx_p (rtx x)
6986 {
6987 machine_mode mode = GET_MODE (x);
6988 if (mode == VOIDmode)
6989 return false;
6990
6991	  /* Determine whether it's cheaper to write float constants as
6992	     mov/movk pairs than as ldr/adrp pairs.  */
6993 unsigned HOST_WIDE_INT ival;
6994
6995 if (GET_CODE (x) == CONST_DOUBLE
6996 && SCALAR_FLOAT_MODE_P (mode)
6997 && aarch64_reinterpret_float_as_int (x, &ival))
6998 {
6999 scalar_int_mode imode = (mode == HFmode
7000 ? SImode
7001 : int_mode_for_mode (mode).require ());
7002 int num_instr = aarch64_internal_mov_immediate
7003 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7004 return num_instr < 3;
7005 }
7006
7007 return false;
7008 }
7009
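/* Editor's note: a rough standalone approximation (not GCC's
   aarch64_internal_mov_immediate, which also knows about MOVN and
   bitmask immediates) of the "fewer than three integer instructions"
   test above: one MOVZ plus one MOVK per extra non-zero 16-bit chunk.  */
#include <stdio.h>

static int
approx_mov_movk_count (unsigned long long val)
{
  int chunks = 0;
  for (int shift = 0; shift < 64; shift += 16)
    if ((val >> shift) & 0xffff)
      chunks++;
  return chunks ? chunks : 1;	/* MOVZ with #0 is still one insn.  */
}

int
main (void)
{
  /* The bits of 1.0 (0x3ff0000000000000) have a single non-zero 16-bit
     chunk, so one MOVZ plus an FMOV can beat an ADRP/LDR literal load.  */
  printf ("%d\n", approx_mov_movk_count (0x3ff0000000000000ULL));
  return 0;
}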
7010 /* Return TRUE if rtx X is the immediate constant 0.0.  */
7011 bool
7012 aarch64_float_const_zero_rtx_p (rtx x)
7013 {
7014 if (GET_MODE (x) == VOIDmode)
7015 return false;
7016
7017 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7018 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7019 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7020 }
7021
7022 /* Return TRUE if rtx X is an immediate constant that fits in a single
7023 MOVI immediate operation. */
7024 bool
7025 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7026 {
7027 if (!TARGET_SIMD)
7028 return false;
7029
7030 machine_mode vmode;
7031 scalar_int_mode imode;
7032 unsigned HOST_WIDE_INT ival;
7033
7034 if (GET_CODE (x) == CONST_DOUBLE
7035 && SCALAR_FLOAT_MODE_P (mode))
7036 {
7037 if (!aarch64_reinterpret_float_as_int (x, &ival))
7038 return false;
7039
7040 /* We make a general exception for 0. */
7041 if (aarch64_float_const_zero_rtx_p (x))
7042 return true;
7043
7044 imode = int_mode_for_mode (mode).require ();
7045 }
7046 else if (GET_CODE (x) == CONST_INT
7047 && is_a <scalar_int_mode> (mode, &imode))
7048 ival = INTVAL (x);
7049 else
7050 return false;
7051
7052	  /* Use a 64-bit mode for everything except DI/DF mode, where we use
7053	     a 128-bit vector mode.  */
7054 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7055
7056 vmode = aarch64_simd_container_mode (imode, width);
7057 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7058
7059 return aarch64_simd_valid_immediate (v_op, NULL);
7060 }
7061
7062
7063 /* Return the fixed registers used for condition codes. */
7064
7065 static bool
7066 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7067 {
7068 *p1 = CC_REGNUM;
7069 *p2 = INVALID_REGNUM;
7070 return true;
7071 }
7072
7073 /* This function is used by the call expanders of the machine description.
7074 RESULT is the register in which the result is returned. It's NULL for
7075 "call" and "sibcall".
7076 MEM is the location of the function call.
7077    SIBCALL indicates whether this function call is a normal call or a sibling
7078    call; a different pattern is generated accordingly.  */
7079
7080 void
7081 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7082 {
7083 rtx call, callee, tmp;
7084 rtvec vec;
7085 machine_mode mode;
7086
7087 gcc_assert (MEM_P (mem));
7088 callee = XEXP (mem, 0);
7089 mode = GET_MODE (callee);
7090 gcc_assert (mode == Pmode);
7091
7092 /* Decide if we should generate indirect calls by loading the
7093 address of the callee into a register before performing
7094 the branch-and-link. */
7095 if (SYMBOL_REF_P (callee)
7096 ? (aarch64_is_long_call_p (callee)
7097 || aarch64_is_noplt_call_p (callee))
7098 : !REG_P (callee))
7099 XEXP (mem, 0) = force_reg (mode, callee);
7100
7101 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7102
7103 if (result != NULL_RTX)
7104 call = gen_rtx_SET (result, call);
7105
7106 if (sibcall)
7107 tmp = ret_rtx;
7108 else
7109 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7110
7111 vec = gen_rtvec (2, call, tmp);
7112 call = gen_rtx_PARALLEL (VOIDmode, vec);
7113
7114 aarch64_emit_call_insn (call);
7115 }
7116
7117 /* Emit call insn with PAT and do aarch64-specific handling. */
7118
7119 void
7120 aarch64_emit_call_insn (rtx pat)
7121 {
7122 rtx insn = emit_call_insn (pat);
7123
7124 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7125 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7126 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7127 }
7128
7129 machine_mode
7130 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7131 {
7132 machine_mode mode_x = GET_MODE (x);
7133 rtx_code code_x = GET_CODE (x);
7134
7135	  /* Floating-point compares return CCFP if they must not raise an
7136	     exception on a NaN (EQ/NE, ORDERED/UNORDERED, UN*), and CCFPE otherwise.  */
7137 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7138 {
7139 switch (code)
7140 {
7141 case EQ:
7142 case NE:
7143 case UNORDERED:
7144 case ORDERED:
7145 case UNLT:
7146 case UNLE:
7147 case UNGT:
7148 case UNGE:
7149 case UNEQ:
7150 return CCFPmode;
7151
7152 case LT:
7153 case LE:
7154 case GT:
7155 case GE:
7156 case LTGT:
7157 return CCFPEmode;
7158
7159 default:
7160 gcc_unreachable ();
7161 }
7162 }
7163
7164 /* Equality comparisons of short modes against zero can be performed
7165 using the TST instruction with the appropriate bitmask. */
7166 if (y == const0_rtx && REG_P (x)
7167 && (code == EQ || code == NE)
7168 && (mode_x == HImode || mode_x == QImode))
7169 return CC_NZmode;
7170
7171 /* Similarly, comparisons of zero_extends from shorter modes can
7172 be performed using an ANDS with an immediate mask. */
7173 if (y == const0_rtx && code_x == ZERO_EXTEND
7174 && (mode_x == SImode || mode_x == DImode)
7175 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7176 && (code == EQ || code == NE))
7177 return CC_NZmode;
7178
7179 if ((mode_x == SImode || mode_x == DImode)
7180 && y == const0_rtx
7181 && (code == EQ || code == NE || code == LT || code == GE)
7182 && (code_x == PLUS || code_x == MINUS || code_x == AND
7183 || code_x == NEG
7184 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7185 && CONST_INT_P (XEXP (x, 2)))))
7186 return CC_NZmode;
7187
7188 /* A compare with a shifted operand. Because of canonicalization,
7189 the comparison will have to be swapped when we emit the assembly
7190 code. */
7191 if ((mode_x == SImode || mode_x == DImode)
7192 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7193 && (code_x == ASHIFT || code_x == ASHIFTRT
7194 || code_x == LSHIFTRT
7195 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7196 return CC_SWPmode;
7197
7198 /* Similarly for a negated operand, but we can only do this for
7199 equalities. */
7200 if ((mode_x == SImode || mode_x == DImode)
7201 && (REG_P (y) || GET_CODE (y) == SUBREG)
7202 && (code == EQ || code == NE)
7203 && code_x == NEG)
7204 return CC_Zmode;
7205
7206 /* A test for unsigned overflow from an addition. */
7207 if ((mode_x == DImode || mode_x == TImode)
7208 && (code == LTU || code == GEU)
7209 && code_x == PLUS
7210 && rtx_equal_p (XEXP (x, 0), y))
7211 return CC_Cmode;
7212
7213 /* A test for unsigned overflow from an add with carry. */
7214 if ((mode_x == DImode || mode_x == TImode)
7215 && (code == LTU || code == GEU)
7216 && code_x == PLUS
7217 && CONST_SCALAR_INT_P (y)
7218 && (rtx_mode_t (y, mode_x)
7219 == (wi::shwi (1, mode_x)
7220 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7221 return CC_ADCmode;
7222
7223 /* A test for signed overflow. */
7224 if ((mode_x == DImode || mode_x == TImode)
7225 && code == NE
7226 && code_x == PLUS
7227 && GET_CODE (y) == SIGN_EXTEND)
7228 return CC_Vmode;
7229
7230 /* For everything else, return CCmode. */
7231 return CCmode;
7232 }
7233
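/* Editor's note: an example C function (not part of GCC) of the shape
   that the CC_Cmode case above targets: an unsigned-overflow check of
   the form "sum < operand".  Whether the comparison actually reuses the
   carry flag set by the addition depends on the optimizer, but this is
   the pattern the mode exists for.  */
unsigned long long
saturating_add (unsigned long long x, unsigned long long y)
{
  unsigned long long sum = x + y;
  return sum < x ? ~0ULL : sum;	/* sum < x iff the addition wrapped.  */
}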
7234 static int
7235 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7236
7237 int
7238 aarch64_get_condition_code (rtx x)
7239 {
7240 machine_mode mode = GET_MODE (XEXP (x, 0));
7241 enum rtx_code comp_code = GET_CODE (x);
7242
7243 if (GET_MODE_CLASS (mode) != MODE_CC)
7244 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7245 return aarch64_get_condition_code_1 (mode, comp_code);
7246 }
7247
7248 static int
7249 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7250 {
7251 switch (mode)
7252 {
7253 case E_CCFPmode:
7254 case E_CCFPEmode:
7255 switch (comp_code)
7256 {
7257 case GE: return AARCH64_GE;
7258 case GT: return AARCH64_GT;
7259 case LE: return AARCH64_LS;
7260 case LT: return AARCH64_MI;
7261 case NE: return AARCH64_NE;
7262 case EQ: return AARCH64_EQ;
7263 case ORDERED: return AARCH64_VC;
7264 case UNORDERED: return AARCH64_VS;
7265 case UNLT: return AARCH64_LT;
7266 case UNLE: return AARCH64_LE;
7267 case UNGT: return AARCH64_HI;
7268 case UNGE: return AARCH64_PL;
7269 default: return -1;
7270 }
7271 break;
7272
7273 case E_CCmode:
7274 switch (comp_code)
7275 {
7276 case NE: return AARCH64_NE;
7277 case EQ: return AARCH64_EQ;
7278 case GE: return AARCH64_GE;
7279 case GT: return AARCH64_GT;
7280 case LE: return AARCH64_LE;
7281 case LT: return AARCH64_LT;
7282 case GEU: return AARCH64_CS;
7283 case GTU: return AARCH64_HI;
7284 case LEU: return AARCH64_LS;
7285 case LTU: return AARCH64_CC;
7286 default: return -1;
7287 }
7288 break;
7289
7290 case E_CC_SWPmode:
7291 switch (comp_code)
7292 {
7293 case NE: return AARCH64_NE;
7294 case EQ: return AARCH64_EQ;
7295 case GE: return AARCH64_LE;
7296 case GT: return AARCH64_LT;
7297 case LE: return AARCH64_GE;
7298 case LT: return AARCH64_GT;
7299 case GEU: return AARCH64_LS;
7300 case GTU: return AARCH64_CC;
7301 case LEU: return AARCH64_CS;
7302 case LTU: return AARCH64_HI;
7303 default: return -1;
7304 }
7305 break;
7306
7307 case E_CC_NZmode:
7308 switch (comp_code)
7309 {
7310 case NE: return AARCH64_NE;
7311 case EQ: return AARCH64_EQ;
7312 case GE: return AARCH64_PL;
7313 case LT: return AARCH64_MI;
7314 default: return -1;
7315 }
7316 break;
7317
7318 case E_CC_Zmode:
7319 switch (comp_code)
7320 {
7321 case NE: return AARCH64_NE;
7322 case EQ: return AARCH64_EQ;
7323 default: return -1;
7324 }
7325 break;
7326
7327 case E_CC_Cmode:
7328 switch (comp_code)
7329 {
7330 case LTU: return AARCH64_CS;
7331 case GEU: return AARCH64_CC;
7332 default: return -1;
7333 }
7334 break;
7335
7336 case E_CC_ADCmode:
7337 switch (comp_code)
7338 {
7339 case GEU: return AARCH64_CS;
7340 case LTU: return AARCH64_CC;
7341 default: return -1;
7342 }
7343 break;
7344
7345 case E_CC_Vmode:
7346 switch (comp_code)
7347 {
7348 case NE: return AARCH64_VS;
7349 case EQ: return AARCH64_VC;
7350 default: return -1;
7351 }
7352 break;
7353
7354 default:
7355 return -1;
7356 }
7357
7358 return -1;
7359 }
7360
7361 bool
7362 aarch64_const_vec_all_same_in_range_p (rtx x,
7363 HOST_WIDE_INT minval,
7364 HOST_WIDE_INT maxval)
7365 {
7366 rtx elt;
7367 return (const_vec_duplicate_p (x, &elt)
7368 && CONST_INT_P (elt)
7369 && IN_RANGE (INTVAL (elt), minval, maxval));
7370 }
7371
7372 bool
7373 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7374 {
7375 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7376 }
7377
7378 /* Return true if VEC is a constant in which every element is in the range
7379 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7380
7381 static bool
7382 aarch64_const_vec_all_in_range_p (rtx vec,
7383 HOST_WIDE_INT minval,
7384 HOST_WIDE_INT maxval)
7385 {
7386 if (GET_CODE (vec) != CONST_VECTOR
7387 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7388 return false;
7389
7390 int nunits;
7391 if (!CONST_VECTOR_STEPPED_P (vec))
7392 nunits = const_vector_encoded_nelts (vec);
7393 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7394 return false;
7395
7396 for (int i = 0; i < nunits; i++)
7397 {
7398 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7399 if (!CONST_INT_P (vec_elem)
7400 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7401 return false;
7402 }
7403 return true;
7404 }
7405
7406 /* N Z C V. */
7407 #define AARCH64_CC_V 1
7408 #define AARCH64_CC_C (1 << 1)
7409 #define AARCH64_CC_Z (1 << 2)
7410 #define AARCH64_CC_N (1 << 3)
7411
7412 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7413 static const int aarch64_nzcv_codes[] =
7414 {
7415 0, /* EQ, Z == 1. */
7416 AARCH64_CC_Z, /* NE, Z == 0. */
7417 0, /* CS, C == 1. */
7418 AARCH64_CC_C, /* CC, C == 0. */
7419 0, /* MI, N == 1. */
7420 AARCH64_CC_N, /* PL, N == 0. */
7421 0, /* VS, V == 1. */
7422 AARCH64_CC_V, /* VC, V == 0. */
7423   0,		/* HI, C == 1 && Z == 0.  */
7424 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7425 AARCH64_CC_V, /* GE, N == V. */
7426 0, /* LT, N != V. */
7427 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7428 0, /* LE, !(Z == 0 && N == V). */
7429 0, /* AL, Any. */
7430 0 /* NV, Any. */
7431 };
7432
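/* Editor's note: a standalone sketch (not GCC code) of the NZCV
   encoding used above, with N=8, Z=4, C=2, V=1.  As the table comments
   note, GE holds when N == V; its entry (AARCH64_CC_V) sets V but not
   N, i.e. a flag state in which GE does not hold.  */
#include <stdio.h>

#define CC_V 1
#define CC_C (1 << 1)
#define CC_Z (1 << 2)
#define CC_N (1 << 3)

int
main (void)
{
  int nzcv = CC_V;			/* The table entry for GE.  */
  int n = (nzcv & CC_N) != 0;
  int v = (nzcv & CC_V) != 0;
  printf ("GE (N == V) under nzcv=%d: %s\n", nzcv, n == v ? "true" : "false");
  return 0;
}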
7433 /* Print floating-point vector immediate operand X to F, negating it
7434 first if NEGATE is true. Return true on success, false if it isn't
7435 a constant we can handle. */
7436
7437 static bool
7438 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7439 {
7440 rtx elt;
7441
7442 if (!const_vec_duplicate_p (x, &elt))
7443 return false;
7444
7445 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7446 if (negate)
7447 r = real_value_negate (&r);
7448
7449 /* We only handle the SVE single-bit immediates here. */
7450 if (real_equal (&r, &dconst0))
7451 asm_fprintf (f, "0.0");
7452 else if (real_equal (&r, &dconst1))
7453 asm_fprintf (f, "1.0");
7454 else if (real_equal (&r, &dconsthalf))
7455 asm_fprintf (f, "0.5");
7456 else
7457 return false;
7458
7459 return true;
7460 }
7461
7462 /* Return the equivalent letter for size. */
7463 static char
7464 sizetochar (int size)
7465 {
7466 switch (size)
7467 {
7468 case 64: return 'd';
7469 case 32: return 's';
7470 case 16: return 'h';
7471 case 8 : return 'b';
7472 default: gcc_unreachable ();
7473 }
7474 }
7475
7476 /* Print operand X to file F in a target specific manner according to CODE.
7477 The acceptable formatting commands given by CODE are:
7478 'c': An integer or symbol address without a preceding #
7479 sign.
7480 'C': Take the duplicated element in a vector constant
7481 and print it in hex.
7482 'D': Take the duplicated element in a vector constant
7483 and print it as an unsigned integer, in decimal.
7484 'e': Print the sign/zero-extend size as a character 8->b,
7485 16->h, 32->w.
7486 'p': Prints N such that 2^N == X (X must be power of 2 and
7487 const int).
7488 'P': Print the number of non-zero bits in X (a const_int).
7489 'H': Print the higher numbered register of a pair (TImode)
7490 of regs.
7491 'm': Print a condition (eq, ne, etc).
7492 'M': Same as 'm', but invert condition.
7493 'N': Take the duplicated element in a vector constant
7494 and print the negative of it in decimal.
7495 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7496 'S/T/U/V': Print a FP/SIMD register name for a register list.
7497 The register printed is the FP/SIMD register name
7498 of X + 0/1/2/3 for S/T/U/V.
7499 'R': Print a scalar FP/SIMD register name + 1.
7500 'X': Print bottom 16 bits of integer constant in hex.
7501 'w/x': Print a general register name or the zero register
7502 (32-bit or 64-bit).
7503    '0':		Print a normal operand; if it's a general register,
7504 then we assume DImode.
7505 'k': Print NZCV for conditional compare instructions.
7506 'A': Output address constant representing the first
7507 argument of X, specifying a relocation offset
7508 if appropriate.
7509 'L': Output constant address specified by X
7510 with a relocation offset if appropriate.
7511 'G': Prints address of X, specifying a PC relative
7512 relocation mode if appropriate.
7513 'y': Output address of LDP or STP - this is used for
7514 some LDP/STPs which don't use a PARALLEL in their
7515 pattern (so the mode needs to be adjusted).
7516 'z': Output address of a typical LDP or STP. */
7517
7518 static void
7519 aarch64_print_operand (FILE *f, rtx x, int code)
7520 {
7521 rtx elt;
7522 switch (code)
7523 {
7524 case 'c':
7525 switch (GET_CODE (x))
7526 {
7527 case CONST_INT:
7528 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7529 break;
7530
7531 case SYMBOL_REF:
7532 output_addr_const (f, x);
7533 break;
7534
7535 case CONST:
7536 if (GET_CODE (XEXP (x, 0)) == PLUS
7537 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7538 {
7539 output_addr_const (f, x);
7540 break;
7541 }
7542 /* Fall through. */
7543
7544 default:
7545 output_operand_lossage ("unsupported operand for code '%c'", code);
7546 }
7547 break;
7548
7549 case 'e':
7550 {
7551 int n;
7552
7553 if (!CONST_INT_P (x)
7554 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7555 {
7556 output_operand_lossage ("invalid operand for '%%%c'", code);
7557 return;
7558 }
7559
7560 switch (n)
7561 {
7562 case 3:
7563 fputc ('b', f);
7564 break;
7565 case 4:
7566 fputc ('h', f);
7567 break;
7568 case 5:
7569 fputc ('w', f);
7570 break;
7571 default:
7572 output_operand_lossage ("invalid operand for '%%%c'", code);
7573 return;
7574 }
7575 }
7576 break;
7577
7578 case 'p':
7579 {
7580 int n;
7581
7582 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7583 {
7584 output_operand_lossage ("invalid operand for '%%%c'", code);
7585 return;
7586 }
7587
7588 asm_fprintf (f, "%d", n);
7589 }
7590 break;
7591
7592 case 'P':
7593 if (!CONST_INT_P (x))
7594 {
7595 output_operand_lossage ("invalid operand for '%%%c'", code);
7596 return;
7597 }
7598
7599 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7600 break;
7601
7602 case 'H':
7603 if (x == const0_rtx)
7604 {
7605 asm_fprintf (f, "xzr");
7606 break;
7607 }
7608
7609 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7610 {
7611 output_operand_lossage ("invalid operand for '%%%c'", code);
7612 return;
7613 }
7614
7615 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7616 break;
7617
7618 case 'M':
7619 case 'm':
7620 {
7621 int cond_code;
7622 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7623 if (x == const_true_rtx)
7624 {
7625 if (code == 'M')
7626 fputs ("nv", f);
7627 return;
7628 }
7629
7630 if (!COMPARISON_P (x))
7631 {
7632 output_operand_lossage ("invalid operand for '%%%c'", code);
7633 return;
7634 }
7635
7636 cond_code = aarch64_get_condition_code (x);
7637 gcc_assert (cond_code >= 0);
7638 if (code == 'M')
7639 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7640 fputs (aarch64_condition_codes[cond_code], f);
7641 }
7642 break;
7643
7644 case 'N':
7645 if (!const_vec_duplicate_p (x, &elt))
7646 {
7647 output_operand_lossage ("invalid vector constant");
7648 return;
7649 }
7650
7651 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7652 asm_fprintf (f, "%wd", -INTVAL (elt));
7653 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7654 && aarch64_print_vector_float_operand (f, x, true))
7655 ;
7656 else
7657 {
7658 output_operand_lossage ("invalid vector constant");
7659 return;
7660 }
7661 break;
7662
7663 case 'b':
7664 case 'h':
7665 case 's':
7666 case 'd':
7667 case 'q':
7668 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7669 {
7670 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7671 return;
7672 }
7673 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7674 break;
7675
7676 case 'S':
7677 case 'T':
7678 case 'U':
7679 case 'V':
7680 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7681 {
7682 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7683 return;
7684 }
7685 asm_fprintf (f, "%c%d",
7686 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7687 REGNO (x) - V0_REGNUM + (code - 'S'));
7688 break;
7689
7690 case 'R':
7691 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7692 {
7693 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7694 return;
7695 }
7696 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7697 break;
7698
7699 case 'X':
7700 if (!CONST_INT_P (x))
7701 {
7702 output_operand_lossage ("invalid operand for '%%%c'", code);
7703 return;
7704 }
7705 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7706 break;
7707
7708 case 'C':
7709 {
7710 /* Print a replicated constant in hex. */
7711 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7712 {
7713 output_operand_lossage ("invalid operand for '%%%c'", code);
7714 return;
7715 }
7716 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7717 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7718 }
7719 break;
7720
7721 case 'D':
7722 {
7723 /* Print a replicated constant in decimal, treating it as
7724 unsigned. */
7725 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7726 {
7727 output_operand_lossage ("invalid operand for '%%%c'", code);
7728 return;
7729 }
7730 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7731 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7732 }
7733 break;
7734
7735 case 'w':
7736 case 'x':
7737 if (x == const0_rtx
7738 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7739 {
7740 asm_fprintf (f, "%czr", code);
7741 break;
7742 }
7743
7744 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7745 {
7746 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7747 break;
7748 }
7749
7750 if (REG_P (x) && REGNO (x) == SP_REGNUM)
7751 {
7752 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7753 break;
7754 }
7755
7756 /* Fall through */
7757
7758 case 0:
7759 if (x == NULL)
7760 {
7761 output_operand_lossage ("missing operand");
7762 return;
7763 }
7764
7765 switch (GET_CODE (x))
7766 {
7767 case REG:
7768 if (aarch64_sve_data_mode_p (GET_MODE (x)))
7769 {
7770 if (REG_NREGS (x) == 1)
7771 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7772 else
7773 {
7774 char suffix
7775 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7776 asm_fprintf (f, "{z%d.%c - z%d.%c}",
7777 REGNO (x) - V0_REGNUM, suffix,
7778 END_REGNO (x) - V0_REGNUM - 1, suffix);
7779 }
7780 }
7781 else
7782 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7783 break;
7784
7785 case MEM:
7786 output_address (GET_MODE (x), XEXP (x, 0));
7787 break;
7788
7789 case LABEL_REF:
7790 case SYMBOL_REF:
7791 output_addr_const (asm_out_file, x);
7792 break;
7793
7794 case CONST_INT:
7795 asm_fprintf (f, "%wd", INTVAL (x));
7796 break;
7797
7798 case CONST:
7799 if (!VECTOR_MODE_P (GET_MODE (x)))
7800 {
7801 output_addr_const (asm_out_file, x);
7802 break;
7803 }
7804 /* fall through */
7805
7806 case CONST_VECTOR:
7807 if (!const_vec_duplicate_p (x, &elt))
7808 {
7809 output_operand_lossage ("invalid vector constant");
7810 return;
7811 }
7812
7813 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7814 asm_fprintf (f, "%wd", INTVAL (elt));
7815 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7816 && aarch64_print_vector_float_operand (f, x, false))
7817 ;
7818 else
7819 {
7820 output_operand_lossage ("invalid vector constant");
7821 return;
7822 }
7823 break;
7824
7825 case CONST_DOUBLE:
7826 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7827 be getting CONST_DOUBLEs holding integers. */
7828 gcc_assert (GET_MODE (x) != VOIDmode);
7829 if (aarch64_float_const_zero_rtx_p (x))
7830 {
7831 fputc ('0', f);
7832 break;
7833 }
7834 else if (aarch64_float_const_representable_p (x))
7835 {
7836 #define buf_size 20
7837 char float_buf[buf_size] = {'\0'};
7838 real_to_decimal_for_mode (float_buf,
7839 CONST_DOUBLE_REAL_VALUE (x),
7840 buf_size, buf_size,
7841 1, GET_MODE (x));
7842 asm_fprintf (asm_out_file, "%s", float_buf);
7843 break;
7844 #undef buf_size
7845 }
7846 output_operand_lossage ("invalid constant");
7847 return;
7848 default:
7849 output_operand_lossage ("invalid operand");
7850 return;
7851 }
7852 break;
7853
7854 case 'A':
7855 if (GET_CODE (x) == HIGH)
7856 x = XEXP (x, 0);
7857
7858 switch (aarch64_classify_symbolic_expression (x))
7859 {
7860 case SYMBOL_SMALL_GOT_4G:
7861 asm_fprintf (asm_out_file, ":got:");
7862 break;
7863
7864 case SYMBOL_SMALL_TLSGD:
7865 asm_fprintf (asm_out_file, ":tlsgd:");
7866 break;
7867
7868 case SYMBOL_SMALL_TLSDESC:
7869 asm_fprintf (asm_out_file, ":tlsdesc:");
7870 break;
7871
7872 case SYMBOL_SMALL_TLSIE:
7873 asm_fprintf (asm_out_file, ":gottprel:");
7874 break;
7875
7876 case SYMBOL_TLSLE24:
7877 asm_fprintf (asm_out_file, ":tprel:");
7878 break;
7879
7880 case SYMBOL_TINY_GOT:
7881 gcc_unreachable ();
7882 break;
7883
7884 default:
7885 break;
7886 }
7887 output_addr_const (asm_out_file, x);
7888 break;
7889
7890 case 'L':
7891 switch (aarch64_classify_symbolic_expression (x))
7892 {
7893 case SYMBOL_SMALL_GOT_4G:
7894 asm_fprintf (asm_out_file, ":lo12:");
7895 break;
7896
7897 case SYMBOL_SMALL_TLSGD:
7898 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7899 break;
7900
7901 case SYMBOL_SMALL_TLSDESC:
7902 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7903 break;
7904
7905 case SYMBOL_SMALL_TLSIE:
7906 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7907 break;
7908
7909 case SYMBOL_TLSLE12:
7910 asm_fprintf (asm_out_file, ":tprel_lo12:");
7911 break;
7912
7913 case SYMBOL_TLSLE24:
7914 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7915 break;
7916
7917 case SYMBOL_TINY_GOT:
7918 asm_fprintf (asm_out_file, ":got:");
7919 break;
7920
7921 case SYMBOL_TINY_TLSIE:
7922 asm_fprintf (asm_out_file, ":gottprel:");
7923 break;
7924
7925 default:
7926 break;
7927 }
7928 output_addr_const (asm_out_file, x);
7929 break;
7930
7931 case 'G':
7932 switch (aarch64_classify_symbolic_expression (x))
7933 {
7934 case SYMBOL_TLSLE24:
7935 asm_fprintf (asm_out_file, ":tprel_hi12:");
7936 break;
7937 default:
7938 break;
7939 }
7940 output_addr_const (asm_out_file, x);
7941 break;
7942
7943 case 'k':
7944 {
7945 HOST_WIDE_INT cond_code;
7946
7947 if (!CONST_INT_P (x))
7948 {
7949 output_operand_lossage ("invalid operand for '%%%c'", code);
7950 return;
7951 }
7952
7953 cond_code = INTVAL (x);
7954 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7955 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7956 }
7957 break;
7958
7959 case 'y':
7960 case 'z':
7961 {
7962 machine_mode mode = GET_MODE (x);
7963
7964 if (GET_CODE (x) != MEM
7965 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7966 {
7967 output_operand_lossage ("invalid operand for '%%%c'", code);
7968 return;
7969 }
7970
7971 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
7972 code == 'y'
7973 ? ADDR_QUERY_LDP_STP_N
7974 : ADDR_QUERY_LDP_STP))
7975 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7976 }
7977 break;
7978
7979 default:
7980 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7981 return;
7982 }
7983 }
7984
7985 /* Print address 'x' of a memory access with mode 'mode'.
7986    'type' is the context required by aarch64_classify_address: for example,
7987    ADDR_QUERY_LDP_STP for an LDP/STP operand, or ADDR_QUERY_ANY otherwise.  */
7988 static bool
7989 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7990 aarch64_addr_query_type type)
7991 {
7992 struct aarch64_address_info addr;
7993 unsigned int size;
7994
7995 /* Check all addresses are Pmode - including ILP32. */
7996 if (GET_MODE (x) != Pmode
7997 && (!CONST_INT_P (x)
7998 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
7999 {
8000 output_operand_lossage ("invalid address mode");
8001 return false;
8002 }
8003
8004 if (aarch64_classify_address (&addr, x, mode, true, type))
8005 switch (addr.type)
8006 {
8007 case ADDRESS_REG_IMM:
8008 if (known_eq (addr.const_offset, 0))
8009 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8010 else if (aarch64_sve_data_mode_p (mode))
8011 {
8012 HOST_WIDE_INT vnum
8013 = exact_div (addr.const_offset,
8014 BYTES_PER_SVE_VECTOR).to_constant ();
8015 asm_fprintf (f, "[%s, #%wd, mul vl]",
8016 reg_names[REGNO (addr.base)], vnum);
8017 }
8018 else if (aarch64_sve_pred_mode_p (mode))
8019 {
8020 HOST_WIDE_INT vnum
8021 = exact_div (addr.const_offset,
8022 BYTES_PER_SVE_PRED).to_constant ();
8023 asm_fprintf (f, "[%s, #%wd, mul vl]",
8024 reg_names[REGNO (addr.base)], vnum);
8025 }
8026 else
8027 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8028 INTVAL (addr.offset));
8029 return true;
8030
8031 case ADDRESS_REG_REG:
8032 if (addr.shift == 0)
8033 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8034 reg_names [REGNO (addr.offset)]);
8035 else
8036 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8037 reg_names [REGNO (addr.offset)], addr.shift);
8038 return true;
8039
8040 case ADDRESS_REG_UXTW:
8041 if (addr.shift == 0)
8042 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8043 REGNO (addr.offset) - R0_REGNUM);
8044 else
8045 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8046 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8047 return true;
8048
8049 case ADDRESS_REG_SXTW:
8050 if (addr.shift == 0)
8051 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8052 REGNO (addr.offset) - R0_REGNUM);
8053 else
8054 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8055 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8056 return true;
8057
8058 case ADDRESS_REG_WB:
8059 /* Writeback is only supported for fixed-width modes. */
8060 size = GET_MODE_SIZE (mode).to_constant ();
8061 switch (GET_CODE (x))
8062 {
8063 case PRE_INC:
8064 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8065 return true;
8066 case POST_INC:
8067 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8068 return true;
8069 case PRE_DEC:
8070 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8071 return true;
8072 case POST_DEC:
8073 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8074 return true;
8075 case PRE_MODIFY:
8076 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8077 INTVAL (addr.offset));
8078 return true;
8079 case POST_MODIFY:
8080 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8081 INTVAL (addr.offset));
8082 return true;
8083 default:
8084 break;
8085 }
8086 break;
8087
8088 case ADDRESS_LO_SUM:
8089 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8090 output_addr_const (f, addr.offset);
8091 asm_fprintf (f, "]");
8092 return true;
8093
8094 case ADDRESS_SYMBOLIC:
8095 output_addr_const (f, x);
8096 return true;
8097 }
8098
8099 return false;
8100 }
8101
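/* Editor's note: a standalone illustration (not GCC code) of the
   "mul vl" form printed above.  The offset of an SVE vector access is
   expressed as a multiple of the vector length; assuming, purely for
   the example, a 256-bit (32-byte) vector length and the base register
   x0, a byte offset of 64 would print as "[x0, #2, mul vl]".  */
#include <stdio.h>

int
main (void)
{
  int bytes_per_vector = 32;	/* Hypothetical VL; the real VL is not fixed.  */
  int const_offset = 64;
  int vnum = const_offset / bytes_per_vector;
  printf ("[x0, #%d, mul vl]\n", vnum);
  return 0;
}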
8102 /* Print address 'x' of a memory access with mode 'mode'. */
8103 static void
8104 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8105 {
8106 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8107 output_addr_const (f, x);
8108 }
8109
8110 bool
8111 aarch64_label_mentioned_p (rtx x)
8112 {
8113 const char *fmt;
8114 int i;
8115
8116 if (GET_CODE (x) == LABEL_REF)
8117 return true;
8118
8119 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8120 referencing instruction, but they are constant offsets, not
8121 symbols. */
8122 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8123 return false;
8124
8125 fmt = GET_RTX_FORMAT (GET_CODE (x));
8126 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8127 {
8128 if (fmt[i] == 'E')
8129 {
8130 int j;
8131
8132 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8133 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8134 return 1;
8135 }
8136 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8137 return 1;
8138 }
8139
8140 return 0;
8141 }
8142
8143 /* Implement REGNO_REG_CLASS. */
8144
8145 enum reg_class
8146 aarch64_regno_regclass (unsigned regno)
8147 {
8148 if (GP_REGNUM_P (regno))
8149 return GENERAL_REGS;
8150
8151 if (regno == SP_REGNUM)
8152 return STACK_REG;
8153
8154 if (regno == FRAME_POINTER_REGNUM
8155 || regno == ARG_POINTER_REGNUM)
8156 return POINTER_REGS;
8157
8158 if (FP_REGNUM_P (regno))
8159 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
8160
8161 if (PR_REGNUM_P (regno))
8162 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8163
8164 return NO_REGS;
8165 }
8166
8167 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8168 If OFFSET is out of range, return an offset of an anchor point
8169 that is in range. Return 0 otherwise. */
8170
8171 static HOST_WIDE_INT
8172 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8173 machine_mode mode)
8174 {
8175 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8176 if (size > 16)
8177 return (offset + 0x400) & ~0x7f0;
8178
8179 /* For offsets that aren't a multiple of the access size, the limit is
8180 -256...255. */
8181 if (offset & (size - 1))
8182 {
8183 /* BLKmode typically uses LDP of X-registers. */
8184 if (mode == BLKmode)
8185 return (offset + 512) & ~0x3ff;
8186 return (offset + 0x100) & ~0x1ff;
8187 }
8188
8189 /* Small negative offsets are supported. */
8190 if (IN_RANGE (offset, -256, 0))
8191 return 0;
8192
8193 if (mode == TImode || mode == TFmode)
8194 return (offset + 0x100) & ~0x1ff;
8195
8196   /* Use the unsigned 12-bit offset, scaled by the access size.  */
8197 return offset & (~0xfff * size);
8198 }
8199
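/* Editor's note: a standalone example (not GCC code) of the anchor
   selection above for an aligned 4-byte access.  The anchor keeps the
   residual inside the unsigned 12-bit immediate scaled by the access
   size, so nearby word accesses can share one anchor register.  */
#include <stdio.h>

int
main (void)
{
  long long offset = 0x10008;		/* Aligned 4-byte access.  */
  long long size = 4;
  long long anchor = offset & (~0xfffLL * size);   /* As in the code above.  */
  printf ("offset %#llx -> anchor %#llx + %lld\n",
	  (unsigned long long) offset, (unsigned long long) anchor,
	  offset - anchor);
  return 0;
}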
8200 static rtx
8201 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8202 {
8203 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8204 where mask is selected by alignment and size of the offset.
8205 We try to pick as large a range for the offset as possible to
8206 maximize the chance of a CSE. However, for aligned addresses
8207 we limit the range to 4k so that structures with different sized
8208 elements are likely to use the same base. We need to be careful
8209 not to split a CONST for some forms of address expression, otherwise
8210 it will generate sub-optimal code. */
8211
8212 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8213 {
8214 rtx base = XEXP (x, 0);
8215 rtx offset_rtx = XEXP (x, 1);
8216 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8217
8218 if (GET_CODE (base) == PLUS)
8219 {
8220 rtx op0 = XEXP (base, 0);
8221 rtx op1 = XEXP (base, 1);
8222
8223 /* Force any scaling into a temp for CSE. */
8224 op0 = force_reg (Pmode, op0);
8225 op1 = force_reg (Pmode, op1);
8226
8227 /* Let the pointer register be in op0. */
8228 if (REG_POINTER (op1))
8229 std::swap (op0, op1);
8230
8231 /* If the pointer is virtual or frame related, then we know that
8232 virtual register instantiation or register elimination is going
8233 to apply a second constant. We want the two constants folded
8234 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8235 if (virt_or_elim_regno_p (REGNO (op0)))
8236 {
8237 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8238 NULL_RTX, true, OPTAB_DIRECT);
8239 return gen_rtx_PLUS (Pmode, base, op1);
8240 }
8241
8242 /* Otherwise, in order to encourage CSE (and thence loop strength
8243 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8244 base = expand_binop (Pmode, add_optab, op0, op1,
8245 NULL_RTX, true, OPTAB_DIRECT);
8246 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8247 }
8248
8249 HOST_WIDE_INT size;
8250 if (GET_MODE_SIZE (mode).is_constant (&size))
8251 {
8252 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8253 mode);
8254 if (base_offset != 0)
8255 {
8256 base = plus_constant (Pmode, base, base_offset);
8257 base = force_operand (base, NULL_RTX);
8258 return plus_constant (Pmode, base, offset - base_offset);
8259 }
8260 }
8261 }
8262
8263 return x;
8264 }
8265
8266 static reg_class_t
8267 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8268 reg_class_t rclass,
8269 machine_mode mode,
8270 secondary_reload_info *sri)
8271 {
8272 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8273 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8274 comment at the head of aarch64-sve.md for more details about the
8275 big-endian handling. */
8276 if (BYTES_BIG_ENDIAN
8277 && reg_class_subset_p (rclass, FP_REGS)
8278 && !((REG_P (x) && HARD_REGISTER_P (x))
8279 || aarch64_simd_valid_immediate (x, NULL))
8280 && aarch64_sve_data_mode_p (mode))
8281 {
8282 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8283 return NO_REGS;
8284 }
8285
8286 /* If we have to disable direct literal pool loads and stores because the
8287 function is too big, then we need a scratch register. */
8288 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8289 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8290 || targetm.vector_mode_supported_p (GET_MODE (x)))
8291 && !aarch64_pcrelative_literal_loads)
8292 {
8293 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8294 return NO_REGS;
8295 }
8296
8297 /* Without the TARGET_SIMD instructions we cannot move a Q register
8298 to a Q register directly. We need a scratch. */
8299 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8300 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8301 && reg_class_subset_p (rclass, FP_REGS))
8302 {
8303 sri->icode = code_for_aarch64_reload_mov (mode);
8304 return NO_REGS;
8305 }
8306
8307   /* A TFmode or TImode memory access should be handled via an FP register
8308 because AArch64 has richer addressing modes for LDR/STR instructions
8309 than LDP/STP instructions. */
8310 if (TARGET_FLOAT && rclass == GENERAL_REGS
8311 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8312 return FP_REGS;
8313
8314 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8315 return GENERAL_REGS;
8316
8317 return NO_REGS;
8318 }
8319
8320 static bool
8321 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8322 {
8323 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8324
8325 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8326 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8327 if (frame_pointer_needed)
8328 return to == HARD_FRAME_POINTER_REGNUM;
8329 return true;
8330 }
8331
8332 poly_int64
8333 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8334 {
8335 if (to == HARD_FRAME_POINTER_REGNUM)
8336 {
8337 if (from == ARG_POINTER_REGNUM)
8338 return cfun->machine->frame.hard_fp_offset;
8339
8340 if (from == FRAME_POINTER_REGNUM)
8341 return cfun->machine->frame.hard_fp_offset
8342 - cfun->machine->frame.locals_offset;
8343 }
8344
8345 if (to == STACK_POINTER_REGNUM)
8346 {
8347 if (from == FRAME_POINTER_REGNUM)
8348 return cfun->machine->frame.frame_size
8349 - cfun->machine->frame.locals_offset;
8350 }
8351
8352 return cfun->machine->frame.frame_size;
8353 }
8354
8355 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8356 previous frame. */
8357
8358 rtx
8359 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8360 {
8361 if (count != 0)
8362 return const0_rtx;
8363 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8364 }
8365
8366
8367 static void
8368 aarch64_asm_trampoline_template (FILE *f)
8369 {
8370 int offset1 = 16;
8371 int offset2 = 20;
8372
8373 if (aarch64_bti_enabled ())
8374 {
8375 asm_fprintf (f, "\thint\t34 // bti c\n");
8376 offset1 -= 4;
8377 offset2 -= 4;
8378 }
8379
8380 if (TARGET_ILP32)
8381 {
8382 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8383 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8384 offset1);
8385 }
8386 else
8387 {
8388 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8389 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8390 offset2);
8391 }
8392 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8393
8394   /* The trampoline needs an extra padding instruction.  If BTI is
8395      enabled, the padding instruction is replaced by the BTI instruction at
8396 the beginning. */
8397 if (!aarch64_bti_enabled ())
8398 assemble_aligned_integer (4, const0_rtx);
8399
8400 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8401 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8402 }
8403
8404 static void
8405 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8406 {
8407 rtx fnaddr, mem, a_tramp;
8408 const int tramp_code_sz = 16;
8409
8410   /* We don't need to copy the trailing D-words; we fill those in below.  */
8411 emit_block_move (m_tramp, assemble_trampoline_template (),
8412 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8413 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8414 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8415 if (GET_MODE (fnaddr) != ptr_mode)
8416 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8417 emit_move_insn (mem, fnaddr);
8418
8419 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8420 emit_move_insn (mem, chain_value);
8421
8422 /* XXX We should really define a "clear_cache" pattern and use
8423 gen_clear_cache(). */
8424 a_tramp = XEXP (m_tramp, 0);
8425 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8426 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8427 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8428 ptr_mode);
8429 }
8430
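/* Editor's note: a hedged sketch (not a GCC definition) of the
   trampoline layout implied by the two functions above for LP64:
   16 bytes of code (two PC-relative loads, a branch and a padding or
   BTI word) followed by the target function address and the static
   chain value that aarch64_trampoline_init writes.  */
#include <stdint.h>
#include <stdio.h>

struct aarch64_trampoline_sketch	/* Hypothetical name.  */
{
  uint32_t code[4];	/* ldr IP1, .+16; ldr chain reg, .+20; br IP1; pad.  */
  uint64_t func_addr;	/* Loaded by the first ldr.  */
  uint64_t static_chain;	/* Loaded by the second ldr.  */
};

int
main (void)
{
  printf ("%zu bytes\n", sizeof (struct aarch64_trampoline_sketch));  /* 32.  */
  return 0;
}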
8431 static unsigned char
8432 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8433 {
8434 /* ??? Logically we should only need to provide a value when
8435 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8436 can hold MODE, but at the moment we need to handle all modes.
8437 Just ignore any runtime parts for registers that can't store them. */
8438 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8439 unsigned int nregs;
8440 switch (regclass)
8441 {
8442 case TAILCALL_ADDR_REGS:
8443 case POINTER_REGS:
8444 case GENERAL_REGS:
8445 case ALL_REGS:
8446 case POINTER_AND_FP_REGS:
8447 case FP_REGS:
8448 case FP_LO_REGS:
8449 if (aarch64_sve_data_mode_p (mode)
8450 && constant_multiple_p (GET_MODE_SIZE (mode),
8451 BYTES_PER_SVE_VECTOR, &nregs))
8452 return nregs;
8453 return (aarch64_vector_data_mode_p (mode)
8454 ? CEIL (lowest_size, UNITS_PER_VREG)
8455 : CEIL (lowest_size, UNITS_PER_WORD));
8456 case STACK_REG:
8457 case PR_REGS:
8458 case PR_LO_REGS:
8459 case PR_HI_REGS:
8460 return 1;
8461
8462 case NO_REGS:
8463 return 0;
8464
8465 default:
8466 break;
8467 }
8468 gcc_unreachable ();
8469 }
8470
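/* Editor's note: a standalone illustration (not GCC code) of the
   register-count arithmetic above for fixed-width modes: the mode size
   is divided, rounding up, by the 16-byte Advanced SIMD register size
   for vector modes or by the 8-byte word size otherwise.  */
#include <stdio.h>

#define CEIL(a, b) (((a) + (b) - 1) / (b))

int
main (void)
{
  printf ("TImode in GENERAL_REGS: %d regs\n", CEIL (16, 8));		/* 2.  */
  printf ("OImode (2 vectors) in FP_REGS: %d regs\n", CEIL (32, 16));	/* 2.  */
  return 0;
}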
8471 static reg_class_t
8472 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8473 {
8474 if (regclass == POINTER_REGS)
8475 return GENERAL_REGS;
8476
8477 if (regclass == STACK_REG)
8478 {
8479 if (REG_P(x)
8480 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8481 return regclass;
8482
8483 return NO_REGS;
8484 }
8485
8486   /* Register elimination can result in a request for
8487      SP+constant->FP_REGS.  We cannot support such operations, which
8488      use SP as the source and an FP_REG as the destination, so reject
8489      them right away.  */
8490 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8491 {
8492 rtx lhs = XEXP (x, 0);
8493
8494 /* Look through a possible SUBREG introduced by ILP32. */
8495 if (GET_CODE (lhs) == SUBREG)
8496 lhs = SUBREG_REG (lhs);
8497
8498 gcc_assert (REG_P (lhs));
8499 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8500 POINTER_REGS));
8501 return NO_REGS;
8502 }
8503
8504 return regclass;
8505 }
8506
8507 void
8508 aarch64_asm_output_labelref (FILE* f, const char *name)
8509 {
8510 asm_fprintf (f, "%U%s", name);
8511 }
8512
8513 static void
8514 aarch64_elf_asm_constructor (rtx symbol, int priority)
8515 {
8516 if (priority == DEFAULT_INIT_PRIORITY)
8517 default_ctor_section_asm_out_constructor (symbol, priority);
8518 else
8519 {
8520 section *s;
8521       /* While priority is known to be in the range [0, 65535], so 18 bytes
8522	  would be enough, the compiler might not know that.  To avoid a
8523	  -Wformat-truncation false positive, use a larger size.  */
8524 char buf[23];
8525 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8526 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8527 switch_to_section (s);
8528 assemble_align (POINTER_SIZE);
8529 assemble_aligned_integer (POINTER_BYTES, symbol);
8530 }
8531 }
8532
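/* Editor's note: a standalone demonstration (not GCC code) of the
   section name constructed above: a constructor with priority 1000 is
   placed in ".init_array.01000", and the linker script orders these
   sections by priority.  */
#include <stdio.h>

int
main (void)
{
  char buf[23];
  snprintf (buf, sizeof (buf), ".init_array.%.5u", 1000);
  puts (buf);	/* Prints ".init_array.01000".  */
  return 0;
}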
8533 static void
8534 aarch64_elf_asm_destructor (rtx symbol, int priority)
8535 {
8536 if (priority == DEFAULT_INIT_PRIORITY)
8537 default_dtor_section_asm_out_destructor (symbol, priority);
8538 else
8539 {
8540 section *s;
8541       /* While priority is known to be in the range [0, 65535], so 18 bytes
8542	  would be enough, the compiler might not know that.  To avoid a
8543	  -Wformat-truncation false positive, use a larger size.  */
8544 char buf[23];
8545 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8546 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8547 switch_to_section (s);
8548 assemble_align (POINTER_SIZE);
8549 assemble_aligned_integer (POINTER_BYTES, symbol);
8550 }
8551 }
8552
8553 const char*
8554 aarch64_output_casesi (rtx *operands)
8555 {
8556 char buf[100];
8557 char label[100];
8558 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8559 int index;
8560 static const char *const patterns[4][2] =
8561 {
8562 {
8563 "ldrb\t%w3, [%0,%w1,uxtw]",
8564 "add\t%3, %4, %w3, sxtb #2"
8565 },
8566 {
8567 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8568 "add\t%3, %4, %w3, sxth #2"
8569 },
8570 {
8571 "ldr\t%w3, [%0,%w1,uxtw #2]",
8572 "add\t%3, %4, %w3, sxtw #2"
8573 },
8574 /* We assume that DImode is only generated when not optimizing and
8575 that we don't really need 64-bit address offsets. That would
8576 imply an object file with 8GB of code in a single function! */
8577 {
8578 "ldr\t%w3, [%0,%w1,uxtw #2]",
8579 "add\t%3, %4, %w3, sxtw #2"
8580 }
8581 };
8582
8583 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8584
8585 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8586 index = exact_log2 (GET_MODE_SIZE (mode));
8587
8588 gcc_assert (index >= 0 && index <= 3);
8589
8590   /* Need to implement table size reduction, by changing the code below.  */
8591 output_asm_insn (patterns[index][0], operands);
8592 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8593 snprintf (buf, sizeof (buf),
8594 "adr\t%%4, %s", targetm.strip_name_encoding (label));
8595 output_asm_insn (buf, operands);
8596 output_asm_insn (patterns[index][1], operands);
8597 output_asm_insn ("br\t%3", operands);
8598 assemble_label (asm_out_file, label);
8599 return "";
8600 }
8601
8602
8603 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8604 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8605 operator. */
8606
8607 int
8608 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8609 {
8610 if (shift >= 0 && shift <= 3)
8611 {
8612 int size;
8613 for (size = 8; size <= 32; size *= 2)
8614 {
8615 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8616 if (mask == bits << shift)
8617 return size;
8618 }
8619 }
8620 return 0;
8621 }
8622
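/* Editor's note: a standalone example (not GCC code) of the mask shapes
   that aarch64_uxt_size above recognises: SIZE set bits shifted left by
   the scaling amount, e.g. 0xff << 1 for an operand extended with UXTB
   and scaled by 2.  */
#include <stdio.h>

int
main (void)
{
  int shift = 1;
  unsigned long long mask = 0x1fe;	/* 0xff << 1.  */
  for (int size = 8; size <= 32; size *= 2)
    if (mask == ((1ULL << size) - 1) << shift)
      printf ("mask %#llx is a %d-bit extend scaled by %d\n",
	      mask, size, 1 << shift);
  return 0;
}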
8623 /* Constant pools are per-function only when PC-relative literal
8624    loads are enabled or we are using the large memory
8625    model.  */
8626
8627 static inline bool
8628 aarch64_can_use_per_function_literal_pools_p (void)
8629 {
8630 return (aarch64_pcrelative_literal_loads
8631 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8632 }
8633
8634 static bool
8635 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8636 {
8637 /* We can't use blocks for constants when we're using a per-function
8638 constant pool. */
8639 return !aarch64_can_use_per_function_literal_pools_p ();
8640 }
8641
8642 /* Select appropriate section for constants depending
8643 on where we place literal pools. */
8644
8645 static section *
8646 aarch64_select_rtx_section (machine_mode mode,
8647 rtx x,
8648 unsigned HOST_WIDE_INT align)
8649 {
8650 if (aarch64_can_use_per_function_literal_pools_p ())
8651 return function_section (current_function_decl);
8652
8653 return default_elf_select_rtx_section (mode, x, align);
8654 }
8655
8656 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8657 void
8658 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8659 HOST_WIDE_INT offset)
8660 {
8661 /* When using per-function literal pools, we must ensure that any code
8662 section is aligned to the minimal instruction length, lest we get
8663 errors from the assembler re "unaligned instructions". */
8664 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8665 ASM_OUTPUT_ALIGN (f, 2);
8666 }
8667
8668 /* Costs. */
8669
8670 /* Helper function for rtx cost calculation. Strip a shift expression
8671 from X. Returns the inner operand if successful, or the original
8672 expression on failure. */
8673 static rtx
8674 aarch64_strip_shift (rtx x)
8675 {
8676 rtx op = x;
8677
8678 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8679 we can convert both to ROR during final output. */
8680 if ((GET_CODE (op) == ASHIFT
8681 || GET_CODE (op) == ASHIFTRT
8682 || GET_CODE (op) == LSHIFTRT
8683 || GET_CODE (op) == ROTATERT
8684 || GET_CODE (op) == ROTATE)
8685 && CONST_INT_P (XEXP (op, 1)))
8686 return XEXP (op, 0);
8687
8688 if (GET_CODE (op) == MULT
8689 && CONST_INT_P (XEXP (op, 1))
8690 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8691 return XEXP (op, 0);
8692
8693 return x;
8694 }
8695
8696 /* Helper function for rtx cost calculation. Strip an extend
8697 expression from X. Returns the inner operand if successful, or the
8698 original expression on failure. We deal with a number of possible
8699 canonicalization variations here. If STRIP_SHIFT is true, then
8700 we can strip off a shift also. */
8701 static rtx
8702 aarch64_strip_extend (rtx x, bool strip_shift)
8703 {
8704 scalar_int_mode mode;
8705 rtx op = x;
8706
8707 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8708 return op;
8709
8710 /* Zero and sign extraction of a widened value. */
8711 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8712 && XEXP (op, 2) == const0_rtx
8713 && GET_CODE (XEXP (op, 0)) == MULT
8714 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8715 XEXP (op, 1)))
8716 return XEXP (XEXP (op, 0), 0);
8717
8718 /* It can also be represented (for zero-extend) as an AND with an
8719 immediate. */
8720 if (GET_CODE (op) == AND
8721 && GET_CODE (XEXP (op, 0)) == MULT
8722 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8723 && CONST_INT_P (XEXP (op, 1))
8724 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8725 INTVAL (XEXP (op, 1))) != 0)
8726 return XEXP (XEXP (op, 0), 0);
8727
8728 /* Now handle extended register, as this may also have an optional
8729 left shift by 1..4. */
8730 if (strip_shift
8731 && GET_CODE (op) == ASHIFT
8732 && CONST_INT_P (XEXP (op, 1))
8733 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8734 op = XEXP (op, 0);
8735
8736 if (GET_CODE (op) == ZERO_EXTEND
8737 || GET_CODE (op) == SIGN_EXTEND)
8738 op = XEXP (op, 0);
8739
8740 if (op != x)
8741 return op;
8742
8743 return x;
8744 }
8745
8746 /* Return true iff CODE is a shift supported in combination
8747 with arithmetic instructions. */
8748
8749 static bool
8750 aarch64_shift_p (enum rtx_code code)
8751 {
8752 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8753 }
8754
8755
8756 /* Return true iff X is a cheap shift without a sign extend. */
8757
8758 static bool
8759 aarch64_cheap_mult_shift_p (rtx x)
8760 {
8761 rtx op0, op1;
8762
8763 op0 = XEXP (x, 0);
8764 op1 = XEXP (x, 1);
8765
8766 if (!(aarch64_tune_params.extra_tuning_flags
8767 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8768 return false;
8769
8770 if (GET_CODE (op0) == SIGN_EXTEND)
8771 return false;
8772
8773 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8774 && UINTVAL (op1) <= 4)
8775 return true;
8776
8777 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8778 return false;
8779
8780 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8781
8782 if (l2 > 0 && l2 <= 4)
8783 return true;
8784
8785 return false;
8786 }
8787
8788 /* Helper function for rtx cost calculation. Calculate the cost of
8789 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8790 Return the calculated cost of the expression, recursing manually in to
8791 operands where needed. */
8792
8793 static int
8794 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8795 {
8796 rtx op0, op1;
8797 const struct cpu_cost_table *extra_cost
8798 = aarch64_tune_params.insn_extra_cost;
8799 int cost = 0;
8800 bool compound_p = (outer == PLUS || outer == MINUS);
8801 machine_mode mode = GET_MODE (x);
8802
8803 gcc_checking_assert (code == MULT);
8804
8805 op0 = XEXP (x, 0);
8806 op1 = XEXP (x, 1);
8807
8808 if (VECTOR_MODE_P (mode))
8809 mode = GET_MODE_INNER (mode);
8810
8811 /* Integer multiply/fma. */
8812 if (GET_MODE_CLASS (mode) == MODE_INT)
8813 {
8814 /* The multiply will be canonicalized as a shift; cost it as such. */
8815 if (aarch64_shift_p (GET_CODE (x))
8816 || (CONST_INT_P (op1)
8817 && exact_log2 (INTVAL (op1)) > 0))
8818 {
8819 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8820 || GET_CODE (op0) == SIGN_EXTEND;
8821 if (speed)
8822 {
8823 if (compound_p)
8824 {
8825 /* If the shift is considered cheap,
8826 then don't add any cost. */
8827 if (aarch64_cheap_mult_shift_p (x))
8828 ;
8829 else if (REG_P (op1))
8830 /* ARITH + shift-by-register. */
8831 cost += extra_cost->alu.arith_shift_reg;
8832 else if (is_extend)
8833 /* ARITH + extended register. We don't have a cost field
8834 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8835 cost += extra_cost->alu.extend_arith;
8836 else
8837 /* ARITH + shift-by-immediate. */
8838 cost += extra_cost->alu.arith_shift;
8839 }
8840 else
8841 /* LSL (immediate). */
8842 cost += extra_cost->alu.shift;
8843
8844 }
8845 /* Strip extends as we will have costed them in the case above. */
8846 if (is_extend)
8847 op0 = aarch64_strip_extend (op0, true);
8848
8849 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8850
8851 return cost;
8852 }
8853
8854 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8855 compound and let the below cases handle it. After all, MNEG is a
8856 special-case alias of MSUB. */
8857 if (GET_CODE (op0) == NEG)
8858 {
8859 op0 = XEXP (op0, 0);
8860 compound_p = true;
8861 }
8862
8863 /* Integer multiplies or FMAs have zero/sign extending variants. */
8864 if ((GET_CODE (op0) == ZERO_EXTEND
8865 && GET_CODE (op1) == ZERO_EXTEND)
8866 || (GET_CODE (op0) == SIGN_EXTEND
8867 && GET_CODE (op1) == SIGN_EXTEND))
8868 {
8869 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8870 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8871
8872 if (speed)
8873 {
8874 if (compound_p)
8875 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8876 cost += extra_cost->mult[0].extend_add;
8877 else
8878 /* MUL/SMULL/UMULL. */
8879 cost += extra_cost->mult[0].extend;
8880 }
8881
8882 return cost;
8883 }
8884
8885 /* This is either an integer multiply or a MADD. In both cases
8886 we want to recurse and cost the operands. */
8887 cost += rtx_cost (op0, mode, MULT, 0, speed);
8888 cost += rtx_cost (op1, mode, MULT, 1, speed);
8889
8890 if (speed)
8891 {
8892 if (compound_p)
8893 /* MADD/MSUB. */
8894 cost += extra_cost->mult[mode == DImode].add;
8895 else
8896 /* MUL. */
8897 cost += extra_cost->mult[mode == DImode].simple;
8898 }
8899
8900 return cost;
8901 }
8902 else
8903 {
8904 if (speed)
8905 {
8906 /* Floating-point FMA/FMUL can also support negations of the
8907 operands, unless the rounding mode is upward or downward in
8908 which case FNMUL differs from FMUL with operand negation. */
8909 bool neg0 = GET_CODE (op0) == NEG;
8910 bool neg1 = GET_CODE (op1) == NEG;
8911 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8912 {
8913 if (neg0)
8914 op0 = XEXP (op0, 0);
8915 if (neg1)
8916 op1 = XEXP (op1, 0);
8917 }
8918
8919 if (compound_p)
8920 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8921 cost += extra_cost->fp[mode == DFmode].fma;
8922 else
8923 /* FMUL/FNMUL. */
8924 cost += extra_cost->fp[mode == DFmode].mult;
8925 }
8926
8927 cost += rtx_cost (op0, mode, MULT, 0, speed);
8928 cost += rtx_cost (op1, mode, MULT, 1, speed);
8929 return cost;
8930 }
8931 }
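/* As a worked example of the compound handling above: for
     (plus:DI (mult:DI (reg a) (reg b)) (reg c))
   the PLUS case of aarch64_rtx_costs passes the inner MULT here with
   OUTER == PLUS, so compound_p is true and the whole expression is
   costed as a single MADD (extra_cost->mult[1].add) plus the recursive
   cost of the multiply operands, rather than as a separate MUL and ADD.  */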
8932
8933 static int
8934 aarch64_address_cost (rtx x,
8935 machine_mode mode,
8936 addr_space_t as ATTRIBUTE_UNUSED,
8937 bool speed)
8938 {
8939 enum rtx_code c = GET_CODE (x);
8940 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8941 struct aarch64_address_info info;
8942 int cost = 0;
8943 info.shift = 0;
8944
8945 if (!aarch64_classify_address (&info, x, mode, false))
8946 {
8947 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8948 {
8949 /* This is a CONST or SYMBOL ref which will be split
8950 in a different way depending on the code model in use.
8951 Cost it through the generic infrastructure. */
8952 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8953 /* Divide through by the cost of one instruction to
8954 bring it to the same units as the address costs. */
8955 cost_symbol_ref /= COSTS_N_INSNS (1);
8956 /* The cost is then the cost of preparing the address,
8957 followed by an immediate (possibly 0) offset. */
8958 return cost_symbol_ref + addr_cost->imm_offset;
8959 }
8960 else
8961 {
8962 /* This is most likely a jump table from a case
8963 statement. */
8964 return addr_cost->register_offset;
8965 }
8966 }
8967
8968 switch (info.type)
8969 {
8970 case ADDRESS_LO_SUM:
8971 case ADDRESS_SYMBOLIC:
8972 case ADDRESS_REG_IMM:
8973 cost += addr_cost->imm_offset;
8974 break;
8975
8976 case ADDRESS_REG_WB:
8977 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8978 cost += addr_cost->pre_modify;
8979 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8980 cost += addr_cost->post_modify;
8981 else
8982 gcc_unreachable ();
8983
8984 break;
8985
8986 case ADDRESS_REG_REG:
8987 cost += addr_cost->register_offset;
8988 break;
8989
8990 case ADDRESS_REG_SXTW:
8991 cost += addr_cost->register_sextend;
8992 break;
8993
8994 case ADDRESS_REG_UXTW:
8995 cost += addr_cost->register_zextend;
8996 break;
8997
8998 default:
8999 gcc_unreachable ();
9000 }
9001
9002
9003 if (info.shift > 0)
9004 {
9005 /* For the sake of calculating the cost of the shifted register
9006 component, we can treat same sized modes in the same way. */
9007 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9008 cost += addr_cost->addr_scale_costs.hi;
9009 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9010 cost += addr_cost->addr_scale_costs.si;
9011 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9012 cost += addr_cost->addr_scale_costs.di;
9013 else
9014 /* We can't tell, or this is a 128-bit vector. */
9015 cost += addr_cost->addr_scale_costs.ti;
9016 }
9017
9018 return cost;
9019 }
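/* As a rough illustration of the above: a base-plus-immediate address
   such as [x0, #16] classifies as ADDRESS_REG_IMM and costs imm_offset,
   whereas a scaled register offset such as [x0, x1, lsl #3] used for a
   DImode access classifies as ADDRESS_REG_REG with a non-zero shift and
   costs register_offset plus addr_scale_costs.di.  */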
9020
9021 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9022 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9023 to be taken. */
9024
9025 int
9026 aarch64_branch_cost (bool speed_p, bool predictable_p)
9027 {
9028 /* When optimizing for speed, use the cost of unpredictable branches. */
9029 const struct cpu_branch_cost *branch_costs =
9030 aarch64_tune_params.branch_costs;
9031
9032 if (!speed_p || predictable_p)
9033 return branch_costs->predictable;
9034 else
9035 return branch_costs->unpredictable;
9036 }
9037
9038 /* Return true if the RTX X in mode MODE is a zero or sign extract
9039 usable in an ADD or SUB (extended register) instruction. */
9040 static bool
9041 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9042 {
9043 /* Catch add with a sign extract.
9044 This is add_<optab><mode>_multp2. */
9045 if (GET_CODE (x) == SIGN_EXTRACT
9046 || GET_CODE (x) == ZERO_EXTRACT)
9047 {
9048 rtx op0 = XEXP (x, 0);
9049 rtx op1 = XEXP (x, 1);
9050 rtx op2 = XEXP (x, 2);
9051
9052 if (GET_CODE (op0) == MULT
9053 && CONST_INT_P (op1)
9054 && op2 == const0_rtx
9055 && CONST_INT_P (XEXP (op0, 1))
9056 && aarch64_is_extend_from_extract (mode,
9057 XEXP (op0, 1),
9058 op1))
9059 {
9060 return true;
9061 }
9062 }
9063 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9064 No shift. */
9065 else if (GET_CODE (x) == SIGN_EXTEND
9066 || GET_CODE (x) == ZERO_EXTEND)
9067 return REG_P (XEXP (x, 0));
9068
9069 return false;
9070 }
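/* For example, (zero_extend:DI (reg:SI w1)) used as the extended operand
   of a DImode PLUS or MINUS is accepted via the ZERO_EXTEND/SIGN_EXTEND
   arm above, allowing the whole expression to be costed as a single
   ADD/SUB (extended register), e.g. ADD x0, x2, w1, uxtw.  */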
9071
9072 static bool
9073 aarch64_frint_unspec_p (unsigned int u)
9074 {
9075 switch (u)
9076 {
9077 case UNSPEC_FRINTZ:
9078 case UNSPEC_FRINTP:
9079 case UNSPEC_FRINTM:
9080 case UNSPEC_FRINTA:
9081 case UNSPEC_FRINTN:
9082 case UNSPEC_FRINTX:
9083 case UNSPEC_FRINTI:
9084 return true;
9085
9086 default:
9087 return false;
9088 }
9089 }
9090
9091 /* Return true iff X is an rtx that will match an extr instruction
9092 i.e. as described in the *extr<mode>5_insn family of patterns.
9093 OP0 and OP1 will be set to the operands of the shifts involved
9094 on success and will be NULL_RTX otherwise. */
9095
9096 static bool
9097 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9098 {
9099 rtx op0, op1;
9100 scalar_int_mode mode;
9101 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9102 return false;
9103
9104 *res_op0 = NULL_RTX;
9105 *res_op1 = NULL_RTX;
9106
9107 if (GET_CODE (x) != IOR)
9108 return false;
9109
9110 op0 = XEXP (x, 0);
9111 op1 = XEXP (x, 1);
9112
9113 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9114 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9115 {
9116 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9117 if (GET_CODE (op1) == ASHIFT)
9118 std::swap (op0, op1);
9119
9120 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9121 return false;
9122
9123 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9124 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9125
9126 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9127 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9128 {
9129 *res_op0 = XEXP (op0, 0);
9130 *res_op1 = XEXP (op1, 0);
9131 return true;
9132 }
9133 }
9134
9135 return false;
9136 }
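/* For instance, in DImode
     (ior:DI (ashift:DI (reg x) (const_int 48))
	     (lshiftrt:DI (reg y) (const_int 16)))
   matches: the shift amounts sum to 64, so *RES_OP0 is set to x and
   *RES_OP1 to y, i.e. an EXTR of x and y with an lsb of 16.  */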
9137
9138 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9139 storing it in *COST. Result is true if the total cost of the operation
9140 has now been calculated. */
9141 static bool
9142 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9143 {
9144 rtx inner;
9145 rtx comparator;
9146 enum rtx_code cmpcode;
9147
9148 if (COMPARISON_P (op0))
9149 {
9150 inner = XEXP (op0, 0);
9151 comparator = XEXP (op0, 1);
9152 cmpcode = GET_CODE (op0);
9153 }
9154 else
9155 {
9156 inner = op0;
9157 comparator = const0_rtx;
9158 cmpcode = NE;
9159 }
9160
9161 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9162 {
9163 /* Conditional branch. */
9164 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9165 return true;
9166 else
9167 {
9168 if (cmpcode == NE || cmpcode == EQ)
9169 {
9170 if (comparator == const0_rtx)
9171 {
9172 /* TBZ/TBNZ/CBZ/CBNZ. */
9173 if (GET_CODE (inner) == ZERO_EXTRACT)
9174 /* TBZ/TBNZ. */
9175 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9176 ZERO_EXTRACT, 0, speed);
9177 else
9178 /* CBZ/CBNZ. */
9179 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9180
9181 return true;
9182 }
9183 }
9184 else if (cmpcode == LT || cmpcode == GE)
9185 {
9186 /* TBZ/TBNZ. */
9187 if (comparator == const0_rtx)
9188 return true;
9189 }
9190 }
9191 }
9192 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9193 {
9194 /* CCMP. */
9195 if (GET_CODE (op1) == COMPARE)
9196 {
9197 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9198 if (XEXP (op1, 1) == const0_rtx)
9199 *cost += 1;
9200 if (speed)
9201 {
9202 machine_mode mode = GET_MODE (XEXP (op1, 0));
9203 const struct cpu_cost_table *extra_cost
9204 = aarch64_tune_params.insn_extra_cost;
9205
9206 if (GET_MODE_CLASS (mode) == MODE_INT)
9207 *cost += extra_cost->alu.arith;
9208 else
9209 *cost += extra_cost->fp[mode == DFmode].compare;
9210 }
9211 return true;
9212 }
9213
9214 /* It's a conditional operation based on the status flags,
9215 so it must be some flavor of CSEL. */
9216
9217 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9218 if (GET_CODE (op1) == NEG
9219 || GET_CODE (op1) == NOT
9220 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9221 op1 = XEXP (op1, 0);
9222 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9223 {
9224 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9225 op1 = XEXP (op1, 0);
9226 op2 = XEXP (op2, 0);
9227 }
9228
9229 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9230 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9231 return true;
9232 }
9233
9234 /* We don't know what this is, cost all operands. */
9235 return false;
9236 }
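/* For instance, a conditional branch of the form
     (if_then_else (ne (reg:DI x) (const_int 0)) (label_ref L) (pc))
   reaches the CBZ/CBNZ path above and only the cost of X itself is
   added, while the same comparison against a ZERO_EXTRACT is costed
   as a TBZ/TBNZ.  */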
9237
9238 /* Check whether X is a bitfield operation of the form shift + extend that
9239 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9240 operand to which the bitfield operation is applied. Otherwise return
9241 NULL_RTX. */
9242
9243 static rtx
9244 aarch64_extend_bitfield_pattern_p (rtx x)
9245 {
9246 rtx_code outer_code = GET_CODE (x);
9247 machine_mode outer_mode = GET_MODE (x);
9248
9249 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9250 && outer_mode != SImode && outer_mode != DImode)
9251 return NULL_RTX;
9252
9253 rtx inner = XEXP (x, 0);
9254 rtx_code inner_code = GET_CODE (inner);
9255 machine_mode inner_mode = GET_MODE (inner);
9256 rtx op = NULL_RTX;
9257
9258 switch (inner_code)
9259 {
9260 case ASHIFT:
9261 if (CONST_INT_P (XEXP (inner, 1))
9262 && (inner_mode == QImode || inner_mode == HImode))
9263 op = XEXP (inner, 0);
9264 break;
9265 case LSHIFTRT:
9266 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9267 && (inner_mode == QImode || inner_mode == HImode))
9268 op = XEXP (inner, 0);
9269 break;
9270 case ASHIFTRT:
9271 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9272 && (inner_mode == QImode || inner_mode == HImode))
9273 op = XEXP (inner, 0);
9274 break;
9275 default:
9276 break;
9277 }
9278
9279 return op;
9280 }
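/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3)))
   is recognised and (reg:HI x) is returned, since the combination maps
   onto a single UBFX Wd, Wn, #3, #13; a SIGN_EXTEND wrapped around the
   same LSHIFTRT is rejected because only ASHIFTRT pairs with sign
   extension here.  */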
9281
9282 /* Return true if the mask and a shift amount from an RTX of the form
9283 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9284 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
9285
9286 bool
9287 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9288 rtx shft_amnt)
9289 {
9290 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9291 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9292 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9293 && (INTVAL (mask)
9294 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9295 }
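/* A concrete example of the test above, for SImode: MASK 0xff0 with
   SHFT_AMNT 4 is accepted, since 4 < 32, (0xff0 >> 4) + 1 == 0x100 is a
   power of two and the low four bits of the mask are clear, so the
   combination maps onto UBFIZ Wd, Wn, #4, #8.  A mask of 0xff4 with the
   same shift is rejected because bit 2 overlaps the shifted-out range.  */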
9296
9297 /* Calculate the cost of calculating X, storing it in *COST. Result
9298 is true if the total cost of the operation has now been calculated. */
9299 static bool
9300 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9301 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9302 {
9303 rtx op0, op1, op2;
9304 const struct cpu_cost_table *extra_cost
9305 = aarch64_tune_params.insn_extra_cost;
9306 int code = GET_CODE (x);
9307 scalar_int_mode int_mode;
9308
9309 /* By default, assume that everything has equivalent cost to the
9310 cheapest instruction. Any additional costs are applied as a delta
9311 above this default. */
9312 *cost = COSTS_N_INSNS (1);
9313
9314 switch (code)
9315 {
9316 case SET:
9317 /* The cost depends entirely on the operands to SET. */
9318 *cost = 0;
9319 op0 = SET_DEST (x);
9320 op1 = SET_SRC (x);
9321
9322 switch (GET_CODE (op0))
9323 {
9324 case MEM:
9325 if (speed)
9326 {
9327 rtx address = XEXP (op0, 0);
9328 if (VECTOR_MODE_P (mode))
9329 *cost += extra_cost->ldst.storev;
9330 else if (GET_MODE_CLASS (mode) == MODE_INT)
9331 *cost += extra_cost->ldst.store;
9332 else if (mode == SFmode)
9333 *cost += extra_cost->ldst.storef;
9334 else if (mode == DFmode)
9335 *cost += extra_cost->ldst.stored;
9336
9337 *cost +=
9338 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9339 0, speed));
9340 }
9341
9342 *cost += rtx_cost (op1, mode, SET, 1, speed);
9343 return true;
9344
9345 case SUBREG:
9346 if (! REG_P (SUBREG_REG (op0)))
9347 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9348
9349 /* Fall through. */
9350 case REG:
9351 /* The cost is one per vector-register copied. */
9352 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9353 {
9354 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9355 *cost = COSTS_N_INSNS (nregs);
9356 }
9357 /* const0_rtx is in general free, but we will use an
9358 instruction to set a register to 0. */
9359 else if (REG_P (op1) || op1 == const0_rtx)
9360 {
9361 /* The cost is 1 per register copied. */
9362 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9363 *cost = COSTS_N_INSNS (nregs);
9364 }
9365 else
9366 /* Cost is just the cost of the RHS of the set. */
9367 *cost += rtx_cost (op1, mode, SET, 1, speed);
9368 return true;
9369
9370 case ZERO_EXTRACT:
9371 case SIGN_EXTRACT:
9372 /* Bit-field insertion. Strip any redundant widening of
9373 the RHS to meet the width of the target. */
9374 if (GET_CODE (op1) == SUBREG)
9375 op1 = SUBREG_REG (op1);
9376 if ((GET_CODE (op1) == ZERO_EXTEND
9377 || GET_CODE (op1) == SIGN_EXTEND)
9378 && CONST_INT_P (XEXP (op0, 1))
9379 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9380 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9381 op1 = XEXP (op1, 0);
9382
9383 if (CONST_INT_P (op1))
9384 {
9385 /* MOV immediate is assumed to always be cheap. */
9386 *cost = COSTS_N_INSNS (1);
9387 }
9388 else
9389 {
9390 /* BFM. */
9391 if (speed)
9392 *cost += extra_cost->alu.bfi;
9393 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9394 }
9395
9396 return true;
9397
9398 default:
9399 /* We can't make sense of this, assume default cost. */
9400 *cost = COSTS_N_INSNS (1);
9401 return false;
9402 }
9403 return false;
9404
9405 case CONST_INT:
9406 /* If an instruction can incorporate a constant within the
9407 instruction, the instruction's expression avoids calling
9408 rtx_cost() on the constant. If rtx_cost() is called on a
9409 constant, then it is usually because the constant must be
9410 moved into a register by one or more instructions.
9411
9412 The exception is constant 0, which can be expressed
9413 as XZR/WZR and is therefore free. Even then, a plain
9414 (set (reg) (const0_rtx)) must cost the move, but we catch
9415 that when we cost the SET, so we don't need to consider
9416 it here. */
9417 if (x == const0_rtx)
9418 *cost = 0;
9419 else
9420 {
9421 /* To an approximation, building any other constant is
9422 proportionally expensive to the number of instructions
9423 required to build that constant. This is true whether we
9424 are compiling for SPEED or otherwise. */
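  /* As a rough example, a DImode constant such as 0x123456789abc
     needs a MOVZ plus two MOVKs, so it is costed as COSTS_N_INSNS (3),
     whereas a 16-bit or bitmask immediate is a single instruction.  */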
9425 if (!is_a <scalar_int_mode> (mode, &int_mode))
9426 int_mode = word_mode;
9427 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9428 (NULL_RTX, x, false, int_mode));
9429 }
9430 return true;
9431
9432 case CONST_DOUBLE:
9433
9434 /* First determine number of instructions to do the move
9435 as an integer constant. */
9436 if (!aarch64_float_const_representable_p (x)
9437 && !aarch64_can_const_movi_rtx_p (x, mode)
9438 && aarch64_float_const_rtx_p (x))
9439 {
9440 unsigned HOST_WIDE_INT ival;
9441 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9442 gcc_assert (succeed);
9443
9444 scalar_int_mode imode = (mode == HFmode
9445 ? SImode
9446 : int_mode_for_mode (mode).require ());
9447 int ncost = aarch64_internal_mov_immediate
9448 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9449 *cost += COSTS_N_INSNS (ncost);
9450 return true;
9451 }
9452
9453 if (speed)
9454 {
9455 /* mov[df,sf]_aarch64. */
9456 if (aarch64_float_const_representable_p (x))
9457 /* FMOV (scalar immediate). */
9458 *cost += extra_cost->fp[mode == DFmode].fpconst;
9459 else if (!aarch64_float_const_zero_rtx_p (x))
9460 {
9461 /* This will be a load from memory. */
9462 if (mode == DFmode)
9463 *cost += extra_cost->ldst.loadd;
9464 else
9465 *cost += extra_cost->ldst.loadf;
9466 }
9467 else
9468 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9469 or MOV v0.s[0], wzr - neither of which are modeled by the
9470 cost tables. Just use the default cost. */
9471 {
9472 }
9473 }
9474
9475 return true;
9476
9477 case MEM:
9478 if (speed)
9479 {
9480 /* For loads we want the base cost of a load, plus an
9481 approximation for the additional cost of the addressing
9482 mode. */
9483 rtx address = XEXP (x, 0);
9484 if (VECTOR_MODE_P (mode))
9485 *cost += extra_cost->ldst.loadv;
9486 else if (GET_MODE_CLASS (mode) == MODE_INT)
9487 *cost += extra_cost->ldst.load;
9488 else if (mode == SFmode)
9489 *cost += extra_cost->ldst.loadf;
9490 else if (mode == DFmode)
9491 *cost += extra_cost->ldst.loadd;
9492
9493 *cost +=
9494 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9495 0, speed));
9496 }
9497
9498 return true;
9499
9500 case NEG:
9501 op0 = XEXP (x, 0);
9502
9503 if (VECTOR_MODE_P (mode))
9504 {
9505 if (speed)
9506 {
9507 /* FNEG. */
9508 *cost += extra_cost->vect.alu;
9509 }
9510 return false;
9511 }
9512
9513 if (GET_MODE_CLASS (mode) == MODE_INT)
9514 {
9515 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9516 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9517 {
9518 /* CSETM. */
9519 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9520 return true;
9521 }
9522
9523 /* Cost this as SUB wzr, X. */
9524 op0 = CONST0_RTX (mode);
9525 op1 = XEXP (x, 0);
9526 goto cost_minus;
9527 }
9528
9529 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9530 {
9531 /* Support (neg(fma...)) as a single instruction only if
9532 sign of zeros is unimportant. This matches the decision
9533 making in aarch64.md. */
9534 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9535 {
9536 /* FNMADD. */
9537 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9538 return true;
9539 }
9540 if (GET_CODE (op0) == MULT)
9541 {
9542 /* FNMUL. */
9543 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9544 return true;
9545 }
9546 if (speed)
9547 /* FNEG. */
9548 *cost += extra_cost->fp[mode == DFmode].neg;
9549 return false;
9550 }
9551
9552 return false;
9553
9554 case CLRSB:
9555 case CLZ:
9556 if (speed)
9557 {
9558 if (VECTOR_MODE_P (mode))
9559 *cost += extra_cost->vect.alu;
9560 else
9561 *cost += extra_cost->alu.clz;
9562 }
9563
9564 return false;
9565
9566 case COMPARE:
9567 op0 = XEXP (x, 0);
9568 op1 = XEXP (x, 1);
9569
9570 if (op1 == const0_rtx
9571 && GET_CODE (op0) == AND)
9572 {
9573 x = op0;
9574 mode = GET_MODE (op0);
9575 goto cost_logic;
9576 }
9577
9578 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9579 {
9580 /* TODO: A write to the CC flags possibly costs extra; this
9581 needs encoding in the cost tables. */
9582
9583 mode = GET_MODE (op0);
9584 /* ANDS. */
9585 if (GET_CODE (op0) == AND)
9586 {
9587 x = op0;
9588 goto cost_logic;
9589 }
9590
9591 if (GET_CODE (op0) == PLUS)
9592 {
9593 /* ADDS (and CMN alias). */
9594 x = op0;
9595 goto cost_plus;
9596 }
9597
9598 if (GET_CODE (op0) == MINUS)
9599 {
9600 /* SUBS. */
9601 x = op0;
9602 goto cost_minus;
9603 }
9604
9605 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9606 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9607 && CONST_INT_P (XEXP (op0, 2)))
9608 {
9609 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9610 Handle it here directly rather than going to cost_logic
9611 since we know the immediate generated for the TST is valid
9612 so we can avoid creating an intermediate rtx for it only
9613 for costing purposes. */
9614 if (speed)
9615 *cost += extra_cost->alu.logical;
9616
9617 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9618 ZERO_EXTRACT, 0, speed);
9619 return true;
9620 }
9621
9622 if (GET_CODE (op1) == NEG)
9623 {
9624 /* CMN. */
9625 if (speed)
9626 *cost += extra_cost->alu.arith;
9627
9628 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9629 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9630 return true;
9631 }
9632
9633 /* CMP.
9634
9635 Compare can freely swap the order of operands, and
9636 canonicalization puts the more complex operation first.
9637 But the integer MINUS logic expects the shift/extend
9638 operation in op1. */
9639 if (! (REG_P (op0)
9640 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9641 {
9642 op0 = XEXP (x, 1);
9643 op1 = XEXP (x, 0);
9644 }
9645 goto cost_minus;
9646 }
9647
9648 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9649 {
9650 /* FCMP. */
9651 if (speed)
9652 *cost += extra_cost->fp[mode == DFmode].compare;
9653
9654 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9655 {
9656 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9657 /* FCMP supports constant 0.0 for no extra cost. */
9658 return true;
9659 }
9660 return false;
9661 }
9662
9663 if (VECTOR_MODE_P (mode))
9664 {
9665 /* Vector compare. */
9666 if (speed)
9667 *cost += extra_cost->vect.alu;
9668
9669 if (aarch64_float_const_zero_rtx_p (op1))
9670 {
9671 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9672 cost. */
9673 return true;
9674 }
9675 return false;
9676 }
9677 return false;
9678
9679 case MINUS:
9680 {
9681 op0 = XEXP (x, 0);
9682 op1 = XEXP (x, 1);
9683
9684 cost_minus:
9685 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9686
9687 /* Detect valid immediates. */
9688 if ((GET_MODE_CLASS (mode) == MODE_INT
9689 || (GET_MODE_CLASS (mode) == MODE_CC
9690 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9691 && CONST_INT_P (op1)
9692 && aarch64_uimm12_shift (INTVAL (op1)))
9693 {
9694 if (speed)
9695 /* SUB(S) (immediate). */
9696 *cost += extra_cost->alu.arith;
9697 return true;
9698 }
9699
9700 /* Look for SUB (extended register). */
9701 if (is_a <scalar_int_mode> (mode, &int_mode)
9702 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9703 {
9704 if (speed)
9705 *cost += extra_cost->alu.extend_arith;
9706
9707 op1 = aarch64_strip_extend (op1, true);
9708 *cost += rtx_cost (op1, VOIDmode,
9709 (enum rtx_code) GET_CODE (op1), 0, speed);
9710 return true;
9711 }
9712
9713 rtx new_op1 = aarch64_strip_extend (op1, false);
9714
9715 /* Cost this as an FMA-alike operation. */
9716 if ((GET_CODE (new_op1) == MULT
9717 || aarch64_shift_p (GET_CODE (new_op1)))
9718 && code != COMPARE)
9719 {
9720 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9721 (enum rtx_code) code,
9722 speed);
9723 return true;
9724 }
9725
9726 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9727
9728 if (speed)
9729 {
9730 if (VECTOR_MODE_P (mode))
9731 {
9732 /* Vector SUB. */
9733 *cost += extra_cost->vect.alu;
9734 }
9735 else if (GET_MODE_CLASS (mode) == MODE_INT)
9736 {
9737 /* SUB(S). */
9738 *cost += extra_cost->alu.arith;
9739 }
9740 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9741 {
9742 /* FSUB. */
9743 *cost += extra_cost->fp[mode == DFmode].addsub;
9744 }
9745 }
9746 return true;
9747 }
9748
9749 case PLUS:
9750 {
9751 rtx new_op0;
9752
9753 op0 = XEXP (x, 0);
9754 op1 = XEXP (x, 1);
9755
9756 cost_plus:
9757 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9758 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9759 {
9760 /* CSINC. */
9761 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9762 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9763 return true;
9764 }
9765
9766 if (GET_MODE_CLASS (mode) == MODE_INT
9767 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9768 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9769 {
9770 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9771
9772 if (speed)
9773 /* ADD (immediate). */
9774 *cost += extra_cost->alu.arith;
9775 return true;
9776 }
9777
9778 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9779
9780 /* Look for ADD (extended register). */
9781 if (is_a <scalar_int_mode> (mode, &int_mode)
9782 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9783 {
9784 if (speed)
9785 *cost += extra_cost->alu.extend_arith;
9786
9787 op0 = aarch64_strip_extend (op0, true);
9788 *cost += rtx_cost (op0, VOIDmode,
9789 (enum rtx_code) GET_CODE (op0), 0, speed);
9790 return true;
9791 }
9792
9793 /* Strip any extend, leave shifts behind as we will
9794 cost them through mult_cost. */
9795 new_op0 = aarch64_strip_extend (op0, false);
9796
9797 if (GET_CODE (new_op0) == MULT
9798 || aarch64_shift_p (GET_CODE (new_op0)))
9799 {
9800 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
9801 speed);
9802 return true;
9803 }
9804
9805 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
9806
9807 if (speed)
9808 {
9809 if (VECTOR_MODE_P (mode))
9810 {
9811 /* Vector ADD. */
9812 *cost += extra_cost->vect.alu;
9813 }
9814 else if (GET_MODE_CLASS (mode) == MODE_INT)
9815 {
9816 /* ADD. */
9817 *cost += extra_cost->alu.arith;
9818 }
9819 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9820 {
9821 /* FADD. */
9822 *cost += extra_cost->fp[mode == DFmode].addsub;
9823 }
9824 }
9825 return true;
9826 }
9827
9828 case BSWAP:
9829 *cost = COSTS_N_INSNS (1);
9830
9831 if (speed)
9832 {
9833 if (VECTOR_MODE_P (mode))
9834 *cost += extra_cost->vect.alu;
9835 else
9836 *cost += extra_cost->alu.rev;
9837 }
9838 return false;
9839
9840 case IOR:
9841 if (aarch_rev16_p (x))
9842 {
9843 *cost = COSTS_N_INSNS (1);
9844
9845 if (speed)
9846 {
9847 if (VECTOR_MODE_P (mode))
9848 *cost += extra_cost->vect.alu;
9849 else
9850 *cost += extra_cost->alu.rev;
9851 }
9852 return true;
9853 }
9854
9855 if (aarch64_extr_rtx_p (x, &op0, &op1))
9856 {
9857 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9858 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9859 if (speed)
9860 *cost += extra_cost->alu.shift;
9861
9862 return true;
9863 }
9864 /* Fall through. */
9865 case XOR:
9866 case AND:
9867 cost_logic:
9868 op0 = XEXP (x, 0);
9869 op1 = XEXP (x, 1);
9870
9871 if (VECTOR_MODE_P (mode))
9872 {
9873 if (speed)
9874 *cost += extra_cost->vect.alu;
9875 return true;
9876 }
9877
9878 if (code == AND
9879 && GET_CODE (op0) == MULT
9880 && CONST_INT_P (XEXP (op0, 1))
9881 && CONST_INT_P (op1)
9882 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9883 INTVAL (op1)) != 0)
9884 {
9885 /* This is a UBFM/SBFM. */
9886 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9887 if (speed)
9888 *cost += extra_cost->alu.bfx;
9889 return true;
9890 }
9891
9892 if (is_int_mode (mode, &int_mode))
9893 {
9894 if (CONST_INT_P (op1))
9895 {
9896 /* We have a mask + shift version of a UBFIZ
9897 i.e. the *andim_ashift<mode>_bfiz pattern. */
9898 if (GET_CODE (op0) == ASHIFT
9899 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9900 XEXP (op0, 1)))
9901 {
9902 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9903 (enum rtx_code) code, 0, speed);
9904 if (speed)
9905 *cost += extra_cost->alu.bfx;
9906
9907 return true;
9908 }
9909 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9910 {
9911 /* We possibly get the immediate for free; this is not
9912 modelled. */
9913 *cost += rtx_cost (op0, int_mode,
9914 (enum rtx_code) code, 0, speed);
9915 if (speed)
9916 *cost += extra_cost->alu.logical;
9917
9918 return true;
9919 }
9920 }
9921 else
9922 {
9923 rtx new_op0 = op0;
9924
9925 /* Handle ORN, EON, or BIC. */
9926 if (GET_CODE (op0) == NOT)
9927 op0 = XEXP (op0, 0);
9928
9929 new_op0 = aarch64_strip_shift (op0);
9930
9931 /* If we had a shift on op0 then this is a logical-shift-
9932 by-register/immediate operation. Otherwise, this is just
9933 a logical operation. */
9934 if (speed)
9935 {
9936 if (new_op0 != op0)
9937 {
9938 /* Shift by immediate. */
9939 if (CONST_INT_P (XEXP (op0, 1)))
9940 *cost += extra_cost->alu.log_shift;
9941 else
9942 *cost += extra_cost->alu.log_shift_reg;
9943 }
9944 else
9945 *cost += extra_cost->alu.logical;
9946 }
9947
9948 /* In both cases we want to cost both operands. */
9949 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9950 0, speed);
9951 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9952 1, speed);
9953
9954 return true;
9955 }
9956 }
9957 return false;
9958
9959 case NOT:
9960 x = XEXP (x, 0);
9961 op0 = aarch64_strip_shift (x);
9962
9963 if (VECTOR_MODE_P (mode))
9964 {
9965 /* Vector NOT. */
9966 *cost += extra_cost->vect.alu;
9967 return false;
9968 }
9969
9970 /* MVN-shifted-reg. */
9971 if (op0 != x)
9972 {
9973 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9974
9975 if (speed)
9976 *cost += extra_cost->alu.log_shift;
9977
9978 return true;
9979 }
9980 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9981 Handle the second form here taking care that 'a' in the above can
9982 be a shift. */
9983 else if (GET_CODE (op0) == XOR)
9984 {
9985 rtx newop0 = XEXP (op0, 0);
9986 rtx newop1 = XEXP (op0, 1);
9987 rtx op0_stripped = aarch64_strip_shift (newop0);
9988
9989 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9990 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9991
9992 if (speed)
9993 {
9994 if (op0_stripped != newop0)
9995 *cost += extra_cost->alu.log_shift;
9996 else
9997 *cost += extra_cost->alu.logical;
9998 }
9999
10000 return true;
10001 }
10002 /* MVN. */
10003 if (speed)
10004 *cost += extra_cost->alu.logical;
10005
10006 return false;
10007
10008 case ZERO_EXTEND:
10009
10010 op0 = XEXP (x, 0);
10011 /* If a value is written in SI mode, then zero extended to DI
10012 mode, the operation will in general be free as a write to
10013 a 'w' register implicitly zeroes the upper bits of an 'x'
10014 register. However, if this is
10015
10016 (set (reg) (zero_extend (reg)))
10017
10018 we must cost the explicit register move. */
10019 if (mode == DImode
10020 && GET_MODE (op0) == SImode
10021 && outer == SET)
10022 {
10023 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10024
10025 /* If OP_COST is non-zero, then the cost of the zero extend
10026 is effectively the cost of the inner operation. Otherwise
10027 we have a MOV instruction and we take the cost from the MOV
10028 itself. This is true independently of whether we are
10029 optimizing for space or time. */
10030 if (op_cost)
10031 *cost = op_cost;
10032
10033 return true;
10034 }
10035 else if (MEM_P (op0))
10036 {
10037 /* All loads can zero extend to any size for free. */
10038 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10039 return true;
10040 }
10041
10042 op0 = aarch64_extend_bitfield_pattern_p (x);
10043 if (op0)
10044 {
10045 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10046 if (speed)
10047 *cost += extra_cost->alu.bfx;
10048 return true;
10049 }
10050
10051 if (speed)
10052 {
10053 if (VECTOR_MODE_P (mode))
10054 {
10055 /* UMOV. */
10056 *cost += extra_cost->vect.alu;
10057 }
10058 else
10059 {
10060 /* We generate an AND instead of UXTB/UXTH. */
10061 *cost += extra_cost->alu.logical;
10062 }
10063 }
10064 return false;
10065
10066 case SIGN_EXTEND:
10067 if (MEM_P (XEXP (x, 0)))
10068 {
10069 /* LDRSH. */
10070 if (speed)
10071 {
10072 rtx address = XEXP (XEXP (x, 0), 0);
10073 *cost += extra_cost->ldst.load_sign_extend;
10074
10075 *cost +=
10076 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10077 0, speed));
10078 }
10079 return true;
10080 }
10081
10082 op0 = aarch64_extend_bitfield_pattern_p (x);
10083 if (op0)
10084 {
10085 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10086 if (speed)
10087 *cost += extra_cost->alu.bfx;
10088 return true;
10089 }
10090
10091 if (speed)
10092 {
10093 if (VECTOR_MODE_P (mode))
10094 *cost += extra_cost->vect.alu;
10095 else
10096 *cost += extra_cost->alu.extend;
10097 }
10098 return false;
10099
10100 case ASHIFT:
10101 op0 = XEXP (x, 0);
10102 op1 = XEXP (x, 1);
10103
10104 if (CONST_INT_P (op1))
10105 {
10106 if (speed)
10107 {
10108 if (VECTOR_MODE_P (mode))
10109 {
10110 /* Vector shift (immediate). */
10111 *cost += extra_cost->vect.alu;
10112 }
10113 else
10114 {
10115 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10116 aliases. */
10117 *cost += extra_cost->alu.shift;
10118 }
10119 }
10120
10121 /* We can incorporate zero/sign extend for free. */
10122 if (GET_CODE (op0) == ZERO_EXTEND
10123 || GET_CODE (op0) == SIGN_EXTEND)
10124 op0 = XEXP (op0, 0);
10125
10126 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10127 return true;
10128 }
10129 else
10130 {
10131 if (VECTOR_MODE_P (mode))
10132 {
10133 if (speed)
10134 /* Vector shift (register). */
10135 *cost += extra_cost->vect.alu;
10136 }
10137 else
10138 {
10139 if (speed)
10140 /* LSLV. */
10141 *cost += extra_cost->alu.shift_reg;
10142
10143 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10144 && CONST_INT_P (XEXP (op1, 1))
10145 && known_eq (INTVAL (XEXP (op1, 1)),
10146 GET_MODE_BITSIZE (mode) - 1))
10147 {
10148 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10149 /* We already demanded XEXP (op1, 0) to be REG_P, so
10150 don't recurse into it. */
10151 return true;
10152 }
10153 }
10154 return false; /* All arguments need to be in registers. */
10155 }
10156
10157 case ROTATE:
10158 case ROTATERT:
10159 case LSHIFTRT:
10160 case ASHIFTRT:
10161 op0 = XEXP (x, 0);
10162 op1 = XEXP (x, 1);
10163
10164 if (CONST_INT_P (op1))
10165 {
10166 /* ASR (immediate) and friends. */
10167 if (speed)
10168 {
10169 if (VECTOR_MODE_P (mode))
10170 *cost += extra_cost->vect.alu;
10171 else
10172 *cost += extra_cost->alu.shift;
10173 }
10174
10175 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10176 return true;
10177 }
10178 else
10179 {
10180 if (VECTOR_MODE_P (mode))
10181 {
10182 if (speed)
10183 /* Vector shift (register). */
10184 *cost += extra_cost->vect.alu;
10185 }
10186 else
10187 {
10188 if (speed)
10189 /* ASR (register) and friends. */
10190 *cost += extra_cost->alu.shift_reg;
10191
10192 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10193 && CONST_INT_P (XEXP (op1, 1))
10194 && known_eq (INTVAL (XEXP (op1, 1)),
10195 GET_MODE_BITSIZE (mode) - 1))
10196 {
10197 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10198 /* We already demanded XEXP (op1, 0) to be REG_P, so
10199 don't recurse into it. */
10200 return true;
10201 }
10202 }
10203 return false; /* All arguments need to be in registers. */
10204 }
10205
10206 case SYMBOL_REF:
10207
10208 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10209 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10210 {
10211 /* LDR. */
10212 if (speed)
10213 *cost += extra_cost->ldst.load;
10214 }
10215 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10216 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10217 {
10218 /* ADRP, followed by ADD. */
10219 *cost += COSTS_N_INSNS (1);
10220 if (speed)
10221 *cost += 2 * extra_cost->alu.arith;
10222 }
10223 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10224 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10225 {
10226 /* ADR. */
10227 if (speed)
10228 *cost += extra_cost->alu.arith;
10229 }
10230
10231 if (flag_pic)
10232 {
10233 /* One extra load instruction, after accessing the GOT. */
10234 *cost += COSTS_N_INSNS (1);
10235 if (speed)
10236 *cost += extra_cost->ldst.load;
10237 }
10238 return true;
10239
10240 case HIGH:
10241 case LO_SUM:
10242 /* ADRP/ADD (immediate). */
10243 if (speed)
10244 *cost += extra_cost->alu.arith;
10245 return true;
10246
10247 case ZERO_EXTRACT:
10248 case SIGN_EXTRACT:
10249 /* UBFX/SBFX. */
10250 if (speed)
10251 {
10252 if (VECTOR_MODE_P (mode))
10253 *cost += extra_cost->vect.alu;
10254 else
10255 *cost += extra_cost->alu.bfx;
10256 }
10257
10258 /* We can trust that the immediates used will be correct (there
10259 are no by-register forms), so we need only cost op0. */
10260 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10261 return true;
10262
10263 case MULT:
10264 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10265 /* aarch64_rtx_mult_cost always handles recursion to its
10266 operands. */
10267 return true;
10268
10269 case MOD:
10270 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10271 ANDs and a CSNEG. Assume here that the CSNEG costs the same as
10272 an unconditional negate. This case should only ever be reached through
10273 the set_smod_pow2_cheap check in expmed.c. */
10274 if (CONST_INT_P (XEXP (x, 1))
10275 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10276 && (mode == SImode || mode == DImode))
10277 {
10278 /* We expand to 4 instructions. Reset the baseline. */
10279 *cost = COSTS_N_INSNS (4);
10280
10281 if (speed)
10282 *cost += 2 * extra_cost->alu.logical
10283 + 2 * extra_cost->alu.arith;
10284
10285 return true;
10286 }
10287
10288 /* Fall-through. */
10289 case UMOD:
10290 if (speed)
10291 {
10292 /* Slightly prefer UMOD over SMOD. */
10293 if (VECTOR_MODE_P (mode))
10294 *cost += extra_cost->vect.alu;
10295 else if (GET_MODE_CLASS (mode) == MODE_INT)
10296 *cost += (extra_cost->mult[mode == DImode].add
10297 + extra_cost->mult[mode == DImode].idiv
10298 + (code == MOD ? 1 : 0));
10299 }
10300 return false; /* All arguments need to be in registers. */
10301
10302 case DIV:
10303 case UDIV:
10304 case SQRT:
10305 if (speed)
10306 {
10307 if (VECTOR_MODE_P (mode))
10308 *cost += extra_cost->vect.alu;
10309 else if (GET_MODE_CLASS (mode) == MODE_INT)
10310 /* There is no integer SQRT, so only DIV and UDIV can get
10311 here. */
10312 *cost += (extra_cost->mult[mode == DImode].idiv
10313 /* Slightly prefer UDIV over SDIV. */
10314 + (code == DIV ? 1 : 0));
10315 else
10316 *cost += extra_cost->fp[mode == DFmode].div;
10317 }
10318 return false; /* All arguments need to be in registers. */
10319
10320 case IF_THEN_ELSE:
10321 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10322 XEXP (x, 2), cost, speed);
10323
10324 case EQ:
10325 case NE:
10326 case GT:
10327 case GTU:
10328 case LT:
10329 case LTU:
10330 case GE:
10331 case GEU:
10332 case LE:
10333 case LEU:
10334
10335 return false; /* All arguments must be in registers. */
10336
10337 case FMA:
10338 op0 = XEXP (x, 0);
10339 op1 = XEXP (x, 1);
10340 op2 = XEXP (x, 2);
10341
10342 if (speed)
10343 {
10344 if (VECTOR_MODE_P (mode))
10345 *cost += extra_cost->vect.alu;
10346 else
10347 *cost += extra_cost->fp[mode == DFmode].fma;
10348 }
10349
10350 /* FMSUB, FNMADD, and FNMSUB are free. */
10351 if (GET_CODE (op0) == NEG)
10352 op0 = XEXP (op0, 0);
10353
10354 if (GET_CODE (op2) == NEG)
10355 op2 = XEXP (op2, 0);
10356
10357 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10358 and the by-element operand as operand 0. */
10359 if (GET_CODE (op1) == NEG)
10360 op1 = XEXP (op1, 0);
10361
10362 /* Catch vector-by-element operations. The by-element operand can
10363 either be (vec_duplicate (vec_select (x))) or just
10364 (vec_select (x)), depending on whether we are multiplying by
10365 a vector or a scalar.
10366
10367 Canonicalization is not very good in these cases, FMA4 will put the
10368 by-element operand as operand 0, FNMA4 will have it as operand 1. */
10369 if (GET_CODE (op0) == VEC_DUPLICATE)
10370 op0 = XEXP (op0, 0);
10371 else if (GET_CODE (op1) == VEC_DUPLICATE)
10372 op1 = XEXP (op1, 0);
10373
10374 if (GET_CODE (op0) == VEC_SELECT)
10375 op0 = XEXP (op0, 0);
10376 else if (GET_CODE (op1) == VEC_SELECT)
10377 op1 = XEXP (op1, 0);
10378
10379 /* If the remaining parameters are not registers,
10380 get the cost to put them into registers. */
10381 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10382 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10383 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10384 return true;
10385
10386 case FLOAT:
10387 case UNSIGNED_FLOAT:
10388 if (speed)
10389 *cost += extra_cost->fp[mode == DFmode].fromint;
10390 return false;
10391
10392 case FLOAT_EXTEND:
10393 if (speed)
10394 {
10395 if (VECTOR_MODE_P (mode))
10396 {
10397 /* Vector widening conversion. */
10398 *cost += extra_cost->vect.alu;
10399 }
10400 else
10401 *cost += extra_cost->fp[mode == DFmode].widen;
10402 }
10403 return false;
10404
10405 case FLOAT_TRUNCATE:
10406 if (speed)
10407 {
10408 if (VECTOR_MODE_P (mode))
10409 {
10410 /* Vector narrowing conversion. */
10411 *cost += extra_cost->vect.alu;
10412 }
10413 else
10414 *cost += extra_cost->fp[mode == DFmode].narrow;
10415 }
10416 return false;
10417
10418 case FIX:
10419 case UNSIGNED_FIX:
10420 x = XEXP (x, 0);
10421 /* Strip the rounding part. They will all be implemented
10422 by the fcvt* family of instructions anyway. */
10423 if (GET_CODE (x) == UNSPEC)
10424 {
10425 unsigned int uns_code = XINT (x, 1);
10426
10427 if (uns_code == UNSPEC_FRINTA
10428 || uns_code == UNSPEC_FRINTM
10429 || uns_code == UNSPEC_FRINTN
10430 || uns_code == UNSPEC_FRINTP
10431 || uns_code == UNSPEC_FRINTZ)
10432 x = XVECEXP (x, 0, 0);
10433 }
10434
10435 if (speed)
10436 {
10437 if (VECTOR_MODE_P (mode))
10438 *cost += extra_cost->vect.alu;
10439 else
10440 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10441 }
10442
10443 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10444 fixed-point fcvt. */
10445 if (GET_CODE (x) == MULT
10446 && ((VECTOR_MODE_P (mode)
10447 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10448 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10449 {
10450 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10451 0, speed);
10452 return true;
10453 }
10454
10455 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10456 return true;
10457
10458 case ABS:
10459 if (VECTOR_MODE_P (mode))
10460 {
10461 /* ABS (vector). */
10462 if (speed)
10463 *cost += extra_cost->vect.alu;
10464 }
10465 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10466 {
10467 op0 = XEXP (x, 0);
10468
10469 /* FABD, which is analogous to FADD. */
10470 if (GET_CODE (op0) == MINUS)
10471 {
10472 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10473 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10474 if (speed)
10475 *cost += extra_cost->fp[mode == DFmode].addsub;
10476
10477 return true;
10478 }
10479 /* Simple FABS is analogous to FNEG. */
10480 if (speed)
10481 *cost += extra_cost->fp[mode == DFmode].neg;
10482 }
10483 else
10484 {
10485 /* Integer ABS will either be split to
10486 two arithmetic instructions, or will be an ABS
10487 (scalar), which we don't model. */
10488 *cost = COSTS_N_INSNS (2);
10489 if (speed)
10490 *cost += 2 * extra_cost->alu.arith;
10491 }
10492 return false;
10493
10494 case SMAX:
10495 case SMIN:
10496 if (speed)
10497 {
10498 if (VECTOR_MODE_P (mode))
10499 *cost += extra_cost->vect.alu;
10500 else
10501 {
10502 /* FMAXNM/FMINNM/FMAX/FMIN.
10503 TODO: This may not be accurate for all implementations, but
10504 we do not model this in the cost tables. */
10505 *cost += extra_cost->fp[mode == DFmode].addsub;
10506 }
10507 }
10508 return false;
10509
10510 case UNSPEC:
10511 /* The floating point round to integer frint* instructions. */
10512 if (aarch64_frint_unspec_p (XINT (x, 1)))
10513 {
10514 if (speed)
10515 *cost += extra_cost->fp[mode == DFmode].roundint;
10516
10517 return false;
10518 }
10519
10520 if (XINT (x, 1) == UNSPEC_RBIT)
10521 {
10522 if (speed)
10523 *cost += extra_cost->alu.rev;
10524
10525 return false;
10526 }
10527 break;
10528
10529 case TRUNCATE:
10530
10531 /* Decompose <su>muldi3_highpart. */
10532 if (/* (truncate:DI */
10533 mode == DImode
10534 /* (lshiftrt:TI */
10535 && GET_MODE (XEXP (x, 0)) == TImode
10536 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10537 /* (mult:TI */
10538 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10539 /* (ANY_EXTEND:TI (reg:DI))
10540 (ANY_EXTEND:TI (reg:DI))) */
10541 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10542 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10543 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10544 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10545 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10546 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10547 /* (const_int 64) */
10548 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10549 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10550 {
10551 /* UMULH/SMULH. */
10552 if (speed)
10553 *cost += extra_cost->mult[mode == DImode].extend;
10554 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10555 mode, MULT, 0, speed);
10556 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10557 mode, MULT, 1, speed);
10558 return true;
10559 }
10560
10561 /* Fall through. */
10562 default:
10563 break;
10564 }
10565
10566 if (dump_file
10567 && flag_aarch64_verbose_cost)
10568 fprintf (dump_file,
10569 "\nFailed to cost RTX. Assuming default cost.\n");
10570
10571 return true;
10572 }
10573
10574 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10575 calculated for X. This cost is stored in *COST. Returns true
10576 if the total cost of X was calculated. */
10577 static bool
10578 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10579 int param, int *cost, bool speed)
10580 {
10581 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10582
10583 if (dump_file
10584 && flag_aarch64_verbose_cost)
10585 {
10586 print_rtl_single (dump_file, x);
10587 fprintf (dump_file, "\n%s cost: %d (%s)\n",
10588 speed ? "Hot" : "Cold",
10589 *cost, result ? "final" : "partial");
10590 }
10591
10592 return result;
10593 }
10594
10595 static int
10596 aarch64_register_move_cost (machine_mode mode,
10597 reg_class_t from_i, reg_class_t to_i)
10598 {
10599 enum reg_class from = (enum reg_class) from_i;
10600 enum reg_class to = (enum reg_class) to_i;
10601 const struct cpu_regmove_cost *regmove_cost
10602 = aarch64_tune_params.regmove_cost;
10603
10604 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10605 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10606 to = GENERAL_REGS;
10607
10608 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10609 from = GENERAL_REGS;
10610
10611 /* Moving between GPR and stack cost is the same as GP2GP. */
10612 if ((from == GENERAL_REGS && to == STACK_REG)
10613 || (to == GENERAL_REGS && from == STACK_REG))
10614 return regmove_cost->GP2GP;
10615
10616 /* To/From the stack register, we move via the gprs. */
10617 if (to == STACK_REG || from == STACK_REG)
10618 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10619 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10620
10621 if (known_eq (GET_MODE_SIZE (mode), 16))
10622 {
10623 /* 128-bit operations on general registers require 2 instructions. */
10624 if (from == GENERAL_REGS && to == GENERAL_REGS)
10625 return regmove_cost->GP2GP * 2;
10626 else if (from == GENERAL_REGS)
10627 return regmove_cost->GP2FP * 2;
10628 else if (to == GENERAL_REGS)
10629 return regmove_cost->FP2GP * 2;
10630
10631 /* When AdvSIMD instructions are disabled it is not possible to move
10632 a 128-bit value directly between Q registers. This is handled in
10633 secondary reload. A general register is used as a scratch to move
10634 the upper DI value and the lower DI value is moved directly,
10635 hence the cost is the sum of three moves. */
10636 if (! TARGET_SIMD)
10637 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10638
10639 return regmove_cost->FP2FP;
10640 }
10641
10642 if (from == GENERAL_REGS && to == GENERAL_REGS)
10643 return regmove_cost->GP2GP;
10644 else if (from == GENERAL_REGS)
10645 return regmove_cost->GP2FP;
10646 else if (to == GENERAL_REGS)
10647 return regmove_cost->FP2GP;
10648
10649 return regmove_cost->FP2FP;
10650 }
10651
10652 static int
10653 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10654 reg_class_t rclass ATTRIBUTE_UNUSED,
10655 bool in ATTRIBUTE_UNUSED)
10656 {
10657 return aarch64_tune_params.memmov_cost;
10658 }
10659
10660 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10661 to optimize 1.0/sqrt. */
10662
10663 static bool
10664 use_rsqrt_p (machine_mode mode)
10665 {
10666 return (!flag_trapping_math
10667 && flag_unsafe_math_optimizations
10668 && ((aarch64_tune_params.approx_modes->recip_sqrt
10669 & AARCH64_APPROX_MODE (mode))
10670 || flag_mrecip_low_precision_sqrt));
10671 }
10672
10673 /* Function to decide when to use the approximate reciprocal square root
10674 builtin. */
10675
10676 static tree
10677 aarch64_builtin_reciprocal (tree fndecl)
10678 {
10679 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10680
10681 if (!use_rsqrt_p (mode))
10682 return NULL_TREE;
10683 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10684 }
10685
10686 /* Emit instruction sequence to compute either the approximate square root
10687 or its approximate reciprocal, depending on the flag RECP, and return
10688 whether the sequence was emitted or not. */
10689
10690 bool
10691 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10692 {
10693 machine_mode mode = GET_MODE (dst);
10694
10695 if (GET_MODE_INNER (mode) == HFmode)
10696 {
10697 gcc_assert (!recp);
10698 return false;
10699 }
10700
10701 if (!recp)
10702 {
10703 if (!(flag_mlow_precision_sqrt
10704 || (aarch64_tune_params.approx_modes->sqrt
10705 & AARCH64_APPROX_MODE (mode))))
10706 return false;
10707
10708 if (flag_finite_math_only
10709 || flag_trapping_math
10710 || !flag_unsafe_math_optimizations
10711 || optimize_function_for_size_p (cfun))
10712 return false;
10713 }
10714 else
10715 /* Caller assumes we cannot fail. */
10716 gcc_assert (use_rsqrt_p (mode));
10717
10718 machine_mode mmsk = mode_for_int_vector (mode).require ();
10719 rtx xmsk = gen_reg_rtx (mmsk);
10720 if (!recp)
10721 /* When calculating the approximate square root, compare the
10722 argument with 0.0 and create a mask. */
10723 emit_insn (gen_rtx_SET (xmsk,
10724 gen_rtx_NEG (mmsk,
10725 gen_rtx_EQ (mmsk, src,
10726 CONST0_RTX (mode)))));
10727
10728 /* Estimate the approximate reciprocal square root. */
10729 rtx xdst = gen_reg_rtx (mode);
10730 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10731
10732 /* Iterate over the series twice for SF and thrice for DF. */
10733 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10734
10735 /* Optionally iterate over the series once less for faster performance
10736 while sacrificing some accuracy. */
10737 if ((recp && flag_mrecip_low_precision_sqrt)
10738 || (!recp && flag_mlow_precision_sqrt))
10739 iterations--;
10740
10741 /* Iterate over the series to calculate the approximate reciprocal square
10742 root. */
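  /* Each FRSQRTS step below computes (3 - a * x * x) / 2, so combined with
     the following multiply this is the Newton-Raphson iteration
     x <- x * (3 - a * x * x) / 2 for 1/sqrt(a), roughly doubling the
     number of correct bits per pass; the last multiply is folded into
     the finalization at the end of the function.  */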
10743 rtx x1 = gen_reg_rtx (mode);
10744 while (iterations--)
10745 {
10746 rtx x2 = gen_reg_rtx (mode);
10747 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10748
10749 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10750
10751 if (iterations > 0)
10752 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10753 }
10754
10755 if (!recp)
10756 {
10757 /* Qualify the approximate reciprocal square root when the argument is
10758 0.0 by squashing the intermediary result to 0.0. */
10759 rtx xtmp = gen_reg_rtx (mmsk);
10760 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10761 gen_rtx_SUBREG (mmsk, xdst, 0)));
10762 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10763
10764 /* Calculate the approximate square root. */
10765 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10766 }
10767
10768 /* Finalize the approximation. */
10769 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10770
10771 return true;
10772 }
10773
10774 /* Emit the instruction sequence to compute the approximation for the division
10775 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10776
10777 bool
10778 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10779 {
10780 machine_mode mode = GET_MODE (quo);
10781
10782 if (GET_MODE_INNER (mode) == HFmode)
10783 return false;
10784
10785 bool use_approx_division_p = (flag_mlow_precision_div
10786 || (aarch64_tune_params.approx_modes->division
10787 & AARCH64_APPROX_MODE (mode)));
10788
10789 if (!flag_finite_math_only
10790 || flag_trapping_math
10791 || !flag_unsafe_math_optimizations
10792 || optimize_function_for_size_p (cfun)
10793 || !use_approx_division_p)
10794 return false;
10795
10796 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10797 return false;
10798
10799 /* Estimate the approximate reciprocal. */
10800 rtx xrcp = gen_reg_rtx (mode);
10801 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
10802
10803 /* Iterate over the series twice for SF and thrice for DF. */
10804 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10805
10806 /* Optionally iterate over the series once less for faster performance,
10807 while sacrificing some accuracy. */
10808 if (flag_mlow_precision_div)
10809 iterations--;
10810
10811 /* Iterate over the series to calculate the approximate reciprocal. */
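  /* Each FRECPS step below computes 2 - d * x, so combined with the
     following multiply this is the Newton-Raphson iteration
     x <- x * (2 - d * x) for 1/d; the last multiply is folded into the
     finalization at the end of the function.  */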
10812 rtx xtmp = gen_reg_rtx (mode);
10813 while (iterations--)
10814 {
10815 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
10816
10817 if (iterations > 0)
10818 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10819 }
10820
10821 if (num != CONST1_RTX (mode))
10822 {
10823 /* As the approximate reciprocal of DEN is already calculated, only
10824 calculate the approximate division when NUM is not 1.0. */
10825 rtx xnum = force_reg (mode, num);
10826 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10827 }
10828
10829 /* Finalize the approximation. */
10830 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10831 return true;
10832 }
10833
10834 /* Return the number of instructions that can be issued per cycle. */
10835 static int
10836 aarch64_sched_issue_rate (void)
10837 {
10838 return aarch64_tune_params.issue_rate;
10839 }
10840
10841 static int
10842 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10843 {
10844 int issue_rate = aarch64_sched_issue_rate ();
10845
10846 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10847 }
10848
10849
10850 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10851 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10852 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10853
10854 static int
10855 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10856 int ready_index)
10857 {
10858 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10859 }
10860
10861
10862 /* Vectorizer cost model target hooks. */
10863
10864 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10865 static int
10866 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10867 tree vectype,
10868 int misalign ATTRIBUTE_UNUSED)
10869 {
10870 unsigned elements;
10871 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10872 bool fp = false;
10873
10874 if (vectype != NULL)
10875 fp = FLOAT_TYPE_P (vectype);
10876
10877 switch (type_of_cost)
10878 {
10879 case scalar_stmt:
10880 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10881
10882 case scalar_load:
10883 return costs->scalar_load_cost;
10884
10885 case scalar_store:
10886 return costs->scalar_store_cost;
10887
10888 case vector_stmt:
10889 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10890
10891 case vector_load:
10892 return costs->vec_align_load_cost;
10893
10894 case vector_store:
10895 return costs->vec_store_cost;
10896
10897 case vec_to_scalar:
10898 return costs->vec_to_scalar_cost;
10899
10900 case scalar_to_vec:
10901 return costs->scalar_to_vec_cost;
10902
10903 case unaligned_load:
10904 case vector_gather_load:
10905 return costs->vec_unalign_load_cost;
10906
10907 case unaligned_store:
10908 case vector_scatter_store:
10909 return costs->vec_unalign_store_cost;
10910
10911 case cond_branch_taken:
10912 return costs->cond_taken_branch_cost;
10913
10914 case cond_branch_not_taken:
10915 return costs->cond_not_taken_branch_cost;
10916
10917 case vec_perm:
10918 return costs->vec_permute_cost;
10919
10920 case vec_promote_demote:
10921 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10922
10923 case vec_construct:
10924 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10925 return elements / 2 + 1;
10926
10927 default:
10928 gcc_unreachable ();
10929 }
10930 }
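
/* For example, with the formula above a vec_construct of a vector with an
   estimated four elements is costed at 4 / 2 + 1 = 3, i.e. roughly half a
   statement per element plus one.  */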
10931
10932 /* Implement targetm.vectorize.add_stmt_cost. */
10933 static unsigned
10934 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10935 struct _stmt_vec_info *stmt_info, int misalign,
10936 enum vect_cost_model_location where)
10937 {
10938 unsigned *cost = (unsigned *) data;
10939 unsigned retval = 0;
10940
10941 if (flag_vect_cost_model)
10942 {
10943 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10944 int stmt_cost =
10945 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10946
10947 /* Statements in an inner loop relative to the loop being
10948 vectorized are weighted more heavily. The value here is
10949 arbitrary and could potentially be improved with analysis. */
10950 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10951 count *= 50; /* FIXME */
10952
10953 retval = (unsigned) (count * stmt_cost);
10954 cost[where] += retval;
10955 }
10956
10957 return retval;
10958 }
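
/* For example, a vector_load statement that sits in a loop nested inside
   the loop being vectorized and is added with COUNT == 2 contributes
   2 * 50 * vec_align_load_cost to the vect_body bucket, whereas the same
   statement directly in the vectorized loop contributes only
   2 * vec_align_load_cost.  */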
10959
10960 static void initialize_aarch64_code_model (struct gcc_options *);
10961
10962 /* Parse the TO_PARSE string and put the architecture struct that it
10963 selects into RES and the architectural features into ISA_FLAGS.
10964 Return an aarch64_parse_opt_result describing the parse result.
10965 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
10966 When the TO_PARSE string contains an invalid extension,
10967 a copy of the string is created and stored to INVALID_EXTENSION. */
10968
10969 static enum aarch64_parse_opt_result
10970 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10971 unsigned long *isa_flags, std::string *invalid_extension)
10972 {
10973 const char *ext;
10974 const struct processor *arch;
10975 size_t len;
10976
10977 ext = strchr (to_parse, '+');
10978
10979 if (ext != NULL)
10980 len = ext - to_parse;
10981 else
10982 len = strlen (to_parse);
10983
10984 if (len == 0)
10985 return AARCH64_PARSE_MISSING_ARG;
10986
10987
10988 /* Loop through the list of supported ARCHes to find a match. */
10989 for (arch = all_architectures; arch->name != NULL; arch++)
10990 {
10991 if (strlen (arch->name) == len
10992 && strncmp (arch->name, to_parse, len) == 0)
10993 {
10994 unsigned long isa_temp = arch->flags;
10995
10996 if (ext != NULL)
10997 {
10998 /* TO_PARSE string contains at least one extension. */
10999 enum aarch64_parse_opt_result ext_res
11000 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11001
11002 if (ext_res != AARCH64_PARSE_OK)
11003 return ext_res;
11004 }
11005 /* Extension parsing was successful. Confirm the result
11006 arch and ISA flags. */
11007 *res = arch;
11008 *isa_flags = isa_temp;
11009 return AARCH64_PARSE_OK;
11010 }
11011 }
11012
11013 /* ARCH name not found in list. */
11014 return AARCH64_PARSE_INVALID_ARG;
11015 }
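
/* Usage sketch, assuming "armv8.2-a" and "crypto" are entries in the
   architecture and extension tables: parsing "armv8.2-a+crypto" splits at
   the first '+', matches "armv8.2-a" against all_architectures and hands
   "+crypto" to aarch64_parse_extension, e.g.

     const struct processor *arch;
     unsigned long flags;
     std::string bad_ext;
     enum aarch64_parse_opt_result r
       = aarch64_parse_arch ("armv8.2-a+crypto", &arch, &flags, &bad_ext);

   On AARCH64_PARSE_OK, ARCH and FLAGS describe the selected architecture
   and its ISA bits; on failure they are left untouched, and for an
   invalid extension BAD_EXT holds the offending name.  */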
11016
11017 /* Parse the TO_PARSE string and put the result tuning in RES and the
11018 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11019 describing the parse result. If there is an error parsing, RES and
11020 ISA_FLAGS are left unchanged.
11021 When the TO_PARSE string contains an invalid extension,
11022 a copy of the string is created and stored to INVALID_EXTENSION. */
11023
11024 static enum aarch64_parse_opt_result
11025 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11026 unsigned long *isa_flags, std::string *invalid_extension)
11027 {
11028 const char *ext;
11029 const struct processor *cpu;
11030 size_t len;
11031
11032 ext = strchr (to_parse, '+');
11033
11034 if (ext != NULL)
11035 len = ext - to_parse;
11036 else
11037 len = strlen (to_parse);
11038
11039 if (len == 0)
11040 return AARCH64_PARSE_MISSING_ARG;
11041
11042
11043 /* Loop through the list of supported CPUs to find a match. */
11044 for (cpu = all_cores; cpu->name != NULL; cpu++)
11045 {
11046 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11047 {
11048 unsigned long isa_temp = cpu->flags;
11049
11050
11051 if (ext != NULL)
11052 {
11053 /* TO_PARSE string contains at least one extension. */
11054 enum aarch64_parse_opt_result ext_res
11055 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11056
11057 if (ext_res != AARCH64_PARSE_OK)
11058 return ext_res;
11059 }
11060 /* Extension parsing was successful. Confirm the result
11061 cpu and ISA flags. */
11062 *res = cpu;
11063 *isa_flags = isa_temp;
11064 return AARCH64_PARSE_OK;
11065 }
11066 }
11067
11068 /* CPU name not found in list. */
11069 return AARCH64_PARSE_INVALID_ARG;
11070 }
11071
11072 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11073 Return an aarch64_parse_opt_result describing the parse result.
11074 If the parsing fails the RES does not change. */
11075
11076 static enum aarch64_parse_opt_result
11077 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11078 {
11079 const struct processor *cpu;
11080
11081 /* Loop through the list of supported CPUs to find a match. */
11082 for (cpu = all_cores; cpu->name != NULL; cpu++)
11083 {
11084 if (strcmp (cpu->name, to_parse) == 0)
11085 {
11086 *res = cpu;
11087 return AARCH64_PARSE_OK;
11088 }
11089 }
11090
11091 /* CPU name not found in list. */
11092 return AARCH64_PARSE_INVALID_ARG;
11093 }
11094
11095 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11096 described in FLAG. If it is, return the index bit for that fusion type.
11097 If not, error (printing OPTION_NAME) and return zero. */
11098
11099 static unsigned int
11100 aarch64_parse_one_option_token (const char *token,
11101 size_t length,
11102 const struct aarch64_flag_desc *flag,
11103 const char *option_name)
11104 {
11105 for (; flag->name != NULL; flag++)
11106 {
11107 if (length == strlen (flag->name)
11108 && !strncmp (flag->name, token, length))
11109 return flag->flag;
11110 }
11111
11112 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
11113 return 0;
11114 }
11115
11116 /* Parse OPTION which is a comma-separated list of flags to enable.
11117 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11118 default state we inherit from the CPU tuning structures. OPTION_NAME
11119 gives the top-level option we are parsing in the -moverride string,
11120 for use in error messages. */
11121
11122 static unsigned int
11123 aarch64_parse_boolean_options (const char *option,
11124 const struct aarch64_flag_desc *flags,
11125 unsigned int initial_state,
11126 const char *option_name)
11127 {
11128 const char separator = '.';
11129 const char* specs = option;
11130 const char* ntoken = option;
11131 unsigned int found_flags = initial_state;
11132
11133 while ((ntoken = strchr (specs, separator)))
11134 {
11135 size_t token_length = ntoken - specs;
11136 unsigned token_ops = aarch64_parse_one_option_token (specs,
11137 token_length,
11138 flags,
11139 option_name);
11140 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11141 in the token stream, reset the supported operations. So:
11142
11143 adrp+add.cmp+branch.none.adrp+add
11144
11145 would have the result of turning on only adrp+add fusion. */
11146 if (!token_ops)
11147 found_flags = 0;
11148
11149 found_flags |= token_ops;
11150 specs = ++ntoken;
11151 }
11152
11153 /* The string ended with a trailing separator, which is ill-formed. */
11154 if (!(*specs))
11155 {
11156 error ("%s string ill-formed", option_name);
11157 return 0;
11158 }
11159
11160 /* We still have one more token to parse. */
11161 size_t token_length = strlen (specs);
11162 unsigned token_ops = aarch64_parse_one_option_token (specs,
11163 token_length,
11164 flags,
11165 option_name);
11166 if (!token_ops)
11167 found_flags = 0;
11168
11169 found_flags |= token_ops;
11170 return found_flags;
11171 }
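
/* Illustrative example: given the fusible pair table, the string

     "adrp+add.cmp+branch.none.adrp+add"

   accumulates the adrp+add and cmp+branch bits, is reset to zero when
   "none" (whose flag value is zero) is parsed, and finally returns with
   only the adrp+add bit set, exactly as described in the comment inside
   the loop above.  */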
11172
11173 /* Support for overriding instruction fusion. */
11174
11175 static void
11176 aarch64_parse_fuse_string (const char *fuse_string,
11177 struct tune_params *tune)
11178 {
11179 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11180 aarch64_fusible_pairs,
11181 tune->fusible_ops,
11182 "fuse=");
11183 }
11184
11185 /* Support for overriding other tuning flags. */
11186
11187 static void
11188 aarch64_parse_tune_string (const char *tune_string,
11189 struct tune_params *tune)
11190 {
11191 tune->extra_tuning_flags
11192 = aarch64_parse_boolean_options (tune_string,
11193 aarch64_tuning_flags,
11194 tune->extra_tuning_flags,
11195 "tune=");
11196 }
11197
11198 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11199 Accept the valid SVE vector widths allowed by
11200 aarch64_sve_vector_bits_enum and use it to override sve_width
11201 in TUNE. */
11202
11203 static void
11204 aarch64_parse_sve_width_string (const char *tune_string,
11205 struct tune_params *tune)
11206 {
11207 int width = -1;
11208
11209 int n = sscanf (tune_string, "%d", &width);
11210 if (n == EOF)
11211 {
11212 error ("invalid format for sve_width");
11213 return;
11214 }
11215 switch (width)
11216 {
11217 case SVE_128:
11218 case SVE_256:
11219 case SVE_512:
11220 case SVE_1024:
11221 case SVE_2048:
11222 break;
11223 default:
11224 error ("invalid sve_width value: %d", width);
11225 }
11226 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11227 }
11228
11229 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
11230 we understand. If it is, extract the option string and hand it off to
11231 the appropriate function. */
11232
11233 void
11234 aarch64_parse_one_override_token (const char* token,
11235 size_t length,
11236 struct tune_params *tune)
11237 {
11238 const struct aarch64_tuning_override_function *fn
11239 = aarch64_tuning_override_functions;
11240
11241 const char *option_part = strchr (token, '=');
11242 if (!option_part)
11243 {
11244 error ("tuning string missing in option (%s)", token);
11245 return;
11246 }
11247
11248 /* Get the length of the option name. */
11249 length = option_part - token;
11250 /* Skip the '=' to get to the option string. */
11251 option_part++;
11252
11253 for (; fn->name != NULL; fn++)
11254 {
11255 if (!strncmp (fn->name, token, length))
11256 {
11257 fn->parse_override (option_part, tune);
11258 return;
11259 }
11260 }
11261
11262 error ("unknown tuning option (%s)", token);
11263 return;
11264 }
11265
11266 /* Validate the TLS size, clamping it to what the code model allows. */
11267
11268 static void
11269 initialize_aarch64_tls_size (struct gcc_options *opts)
11270 {
11271 if (aarch64_tls_size == 0)
11272 aarch64_tls_size = 24;
11273
11274 switch (opts->x_aarch64_cmodel_var)
11275 {
11276 case AARCH64_CMODEL_TINY:
11277 /* Both the default and the maximum TLS size allowed under tiny are 1M,
11278 which needs two instructions to address, so we clamp the size to 24. */
11279 if (aarch64_tls_size > 24)
11280 aarch64_tls_size = 24;
11281 break;
11282 case AARCH64_CMODEL_SMALL:
11283 /* The maximum TLS size allowed under small is 4G. */
11284 if (aarch64_tls_size > 32)
11285 aarch64_tls_size = 32;
11286 break;
11287 case AARCH64_CMODEL_LARGE:
11288 /* The maximum TLS size allowed under large is 16E.
11289 FIXME: 16E should be 64-bit; we only support a 48-bit offset now. */
11290 if (aarch64_tls_size > 48)
11291 aarch64_tls_size = 48;
11292 break;
11293 default:
11294 gcc_unreachable ();
11295 }
11296
11297 return;
11298 }
11299
11300 /* Parse STRING looking for options in the format:
11301 string :: option:string
11302 option :: name=substring
11303 name :: {a-z}
11304 substring :: defined by option. */
11305
11306 static void
11307 aarch64_parse_override_string (const char* input_string,
11308 struct tune_params* tune)
11309 {
11310 const char separator = ':';
11311 size_t string_length = strlen (input_string) + 1;
11312 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11313 char *string = string_root;
11314 strncpy (string, input_string, string_length);
11315 string[string_length - 1] = '\0';
11316
11317 char* ntoken = string;
11318
11319 while ((ntoken = strchr (string, separator)))
11320 {
11321 size_t token_length = ntoken - string;
11322 /* Make this substring look like a string. */
11323 *ntoken = '\0';
11324 aarch64_parse_one_override_token (string, token_length, tune);
11325 string = ++ntoken;
11326 }
11327
11328 /* One last option to parse. */
11329 aarch64_parse_one_override_token (string, strlen (string), tune);
11330 free (string_root);
11331 }
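
/* Putting the pieces together, an override string such as

     -moverride=fuse=adrp+add.cmp+branch:sve_width=256

   (illustrative values) is first split on ':' into
   "fuse=adrp+add.cmp+branch" and "sve_width=256";
   aarch64_parse_one_override_token then splits each token at '=' and
   dispatches to aarch64_parse_fuse_string and
   aarch64_parse_sve_width_string respectively, the accepted names coming
   from aarch64_tuning_override_functions.  */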
11332
11333
11334 static void
11335 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11336 {
11337 if (accepted_branch_protection_string)
11338 {
11339 opts->x_aarch64_branch_protection_string
11340 = xstrdup (accepted_branch_protection_string);
11341 }
11342
11343 /* PR 70044: We have to be careful about being called multiple times for the
11344 same function. This means all changes should be repeatable. */
11345
11346 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11347 Disable the frame pointer flag so the mid-end will not use a frame
11348 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11349 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11350 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11351 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11352 if (opts->x_flag_omit_frame_pointer == 0)
11353 opts->x_flag_omit_frame_pointer = 2;
11354
11355 /* If not optimizing for size, set the default
11356 alignment to what the target wants. */
11357 if (!opts->x_optimize_size)
11358 {
11359 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11360 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11361 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11362 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11363 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11364 opts->x_str_align_functions = aarch64_tune_params.function_align;
11365 }
11366
11367 /* We default to no pc-relative literal loads. */
11368
11369 aarch64_pcrelative_literal_loads = false;
11370
11371 /* If -mpc-relative-literal-loads is set on the command line, this
11372 implies that the user asked for PC relative literal loads. */
11373 if (opts->x_pcrelative_literal_loads == 1)
11374 aarch64_pcrelative_literal_loads = true;
11375
11376 /* In the tiny memory model it makes no sense to disallow PC relative
11377 literal pool loads. */
11378 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11379 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11380 aarch64_pcrelative_literal_loads = true;
11381
11382 /* When enabling the lower precision Newton series for the square root, also
11383 enable it for the reciprocal square root, since the latter is an
11384 intermediate step for the former. */
11385 if (flag_mlow_precision_sqrt)
11386 flag_mrecip_low_precision_sqrt = true;
11387 }
11388
11389 /* 'Unpack' the internal tuning structs and update the options
11390 in OPTS. The caller must have set up selected_tune and selected_arch
11391 as all the other target-specific codegen decisions are
11392 derived from them. */
11393
11394 void
11395 aarch64_override_options_internal (struct gcc_options *opts)
11396 {
11397 aarch64_tune_flags = selected_tune->flags;
11398 aarch64_tune = selected_tune->sched_core;
11399 /* Make a copy of the tuning parameters attached to the core, which
11400 we may later overwrite. */
11401 aarch64_tune_params = *(selected_tune->tune);
11402 aarch64_architecture_version = selected_arch->architecture_version;
11403
11404 if (opts->x_aarch64_override_tune_string)
11405 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11406 &aarch64_tune_params);
11407
11408 /* This target defaults to strict volatile bitfields. */
11409 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11410 opts->x_flag_strict_volatile_bitfields = 1;
11411
11412 if (aarch64_stack_protector_guard == SSP_GLOBAL
11413 && opts->x_aarch64_stack_protector_guard_offset_str)
11414 {
11415 error ("incompatible options -mstack-protector-guard=global and "
11416 "-mstack-protector-guard-offset=%qs",
11417 aarch64_stack_protector_guard_offset_str);
11418 }
11419
11420 if (aarch64_stack_protector_guard == SSP_SYSREG
11421 && !(opts->x_aarch64_stack_protector_guard_offset_str
11422 && opts->x_aarch64_stack_protector_guard_reg_str))
11423 {
11424 error ("both -mstack-protector-guard-offset and "
11425 "-mstack-protector-guard-reg must be used "
11426 "with -mstack-protector-guard=sysreg");
11427 }
11428
11429 if (opts->x_aarch64_stack_protector_guard_reg_str)
11430 {
11431 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11432 error ("specify a system register with a small string length");
11433 }
11434
11435 if (opts->x_aarch64_stack_protector_guard_offset_str)
11436 {
11437 char *end;
11438 const char *str = aarch64_stack_protector_guard_offset_str;
11439 errno = 0;
11440 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11441 if (!*str || *end || errno)
11442 error ("%qs is not a valid offset in %qs", str,
11443 "-mstack-protector-guard-offset=");
11444 aarch64_stack_protector_guard_offset = offs;
11445 }
11446
11447 initialize_aarch64_code_model (opts);
11448 initialize_aarch64_tls_size (opts);
11449
11450 int queue_depth = 0;
11451 switch (aarch64_tune_params.autoprefetcher_model)
11452 {
11453 case tune_params::AUTOPREFETCHER_OFF:
11454 queue_depth = -1;
11455 break;
11456 case tune_params::AUTOPREFETCHER_WEAK:
11457 queue_depth = 0;
11458 break;
11459 case tune_params::AUTOPREFETCHER_STRONG:
11460 queue_depth = max_insn_queue_index + 1;
11461 break;
11462 default:
11463 gcc_unreachable ();
11464 }
11465
11466 /* We don't mind passing in global_options_set here as we don't use
11467 the *options_set structs anyway. */
11468 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11469 queue_depth,
11470 opts->x_param_values,
11471 global_options_set.x_param_values);
11472
11473 /* Set up parameters to be used in prefetching algorithm. Do not
11474 override the defaults unless we are tuning for a core we have
11475 researched values for. */
11476 if (aarch64_tune_params.prefetch->num_slots > 0)
11477 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11478 aarch64_tune_params.prefetch->num_slots,
11479 opts->x_param_values,
11480 global_options_set.x_param_values);
11481 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11482 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11483 aarch64_tune_params.prefetch->l1_cache_size,
11484 opts->x_param_values,
11485 global_options_set.x_param_values);
11486 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11487 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11488 aarch64_tune_params.prefetch->l1_cache_line_size,
11489 opts->x_param_values,
11490 global_options_set.x_param_values);
11491 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11492 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11493 aarch64_tune_params.prefetch->l2_cache_size,
11494 opts->x_param_values,
11495 global_options_set.x_param_values);
11496 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11497 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11498 0,
11499 opts->x_param_values,
11500 global_options_set.x_param_values);
11501 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11502 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11503 aarch64_tune_params.prefetch->minimum_stride,
11504 opts->x_param_values,
11505 global_options_set.x_param_values);
11506
11507 /* Use the alternative scheduling-pressure algorithm by default. */
11508 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11509 opts->x_param_values,
11510 global_options_set.x_param_values);
11511
11512 /* If the user hasn't changed it via configure then set the default to 64 KB
11513 for the backend. */
11514 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11515 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11516 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11517 opts->x_param_values,
11518 global_options_set.x_param_values);
11519
11520 /* Validate the guard size. */
11521 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11522
11523 /* Enforce that the probing interval is the same as the guard size so the
11524 mid-end does the right thing. */
11525 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11526 guard_size,
11527 opts->x_param_values,
11528 global_options_set.x_param_values);
11529
11530 /* The maybe_set calls won't update the value if the user has explicitly set
11531 one. Which means we need to validate that probing interval and guard size
11532 are equal. */
11533 int probe_interval
11534 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11535 if (guard_size != probe_interval)
11536 error ("stack clash guard size '%d' must be equal to probing interval "
11537 "'%d'", guard_size, probe_interval);
11538
11539 /* Enable software prefetching at the optimization level specified by the
11540 tuning structures for CPUs that have prefetch tuning data, unless the
11541 user set the flag explicitly or we are optimizing for size. */
11542 if (opts->x_flag_prefetch_loop_arrays < 0
11543 && !opts->x_optimize_size
11544 && aarch64_tune_params.prefetch->default_opt_level >= 0
11545 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11546 opts->x_flag_prefetch_loop_arrays = 1;
11547
11548 if (opts->x_aarch64_arch_string == NULL)
11549 opts->x_aarch64_arch_string = selected_arch->name;
11550 if (opts->x_aarch64_cpu_string == NULL)
11551 opts->x_aarch64_cpu_string = selected_cpu->name;
11552 if (opts->x_aarch64_tune_string == NULL)
11553 opts->x_aarch64_tune_string = selected_tune->name;
11554
11555 aarch64_override_options_after_change_1 (opts);
11556 }
11557
11558 /* Print a hint with a suggestion for a core or architecture name that
11559 most closely resembles what the user passed in STR. ARCH is true if
11560 the user is asking for an architecture name. ARCH is false if the user
11561 is asking for a core name. */
11562
11563 static void
11564 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11565 {
11566 auto_vec<const char *> candidates;
11567 const struct processor *entry = arch ? all_architectures : all_cores;
11568 for (; entry->name != NULL; entry++)
11569 candidates.safe_push (entry->name);
11570
11571 #ifdef HAVE_LOCAL_CPU_DETECT
11572 /* Add also "native" as possible value. */
11573 if (arch)
11574 candidates.safe_push ("native");
11575 #endif
11576
11577 char *s;
11578 const char *hint = candidates_list_and_hint (str, s, candidates);
11579 if (hint)
11580 inform (input_location, "valid arguments are: %s;"
11581 " did you mean %qs?", s, hint);
11582 else
11583 inform (input_location, "valid arguments are: %s", s);
11584
11585 XDELETEVEC (s);
11586 }
11587
11588 /* Print a hint with a suggestion for a core name that most closely resembles
11589 what the user passed in STR. */
11590
11591 inline static void
11592 aarch64_print_hint_for_core (const char *str)
11593 {
11594 aarch64_print_hint_for_core_or_arch (str, false);
11595 }
11596
11597 /* Print a hint with a suggestion for an architecture name that most closely
11598 resembles what the user passed in STR. */
11599
11600 inline static void
11601 aarch64_print_hint_for_arch (const char *str)
11602 {
11603 aarch64_print_hint_for_core_or_arch (str, true);
11604 }
11605
11606
11607 /* Print a hint with a suggestion for an extension name
11608 that most closely resembles what the user passed in STR. */
11609
11610 void
11611 aarch64_print_hint_for_extensions (const std::string &str)
11612 {
11613 auto_vec<const char *> candidates;
11614 aarch64_get_all_extension_candidates (&candidates);
11615 char *s;
11616 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11617 if (hint)
11618 inform (input_location, "valid arguments are: %s;"
11619 " did you mean %qs?", s, hint);
11620 else
11621 inform (input_location, "valid arguments are: %s", s);
11622
11623 XDELETEVEC (s);
11624 }
11625
11626 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11627 specified in STR and throw errors if appropriate. Put the results if
11628 they are valid in RES and ISA_FLAGS. Return whether the option is
11629 valid. */
11630
11631 static bool
11632 aarch64_validate_mcpu (const char *str, const struct processor **res,
11633 unsigned long *isa_flags)
11634 {
11635 std::string invalid_extension;
11636 enum aarch64_parse_opt_result parse_res
11637 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11638
11639 if (parse_res == AARCH64_PARSE_OK)
11640 return true;
11641
11642 switch (parse_res)
11643 {
11644 case AARCH64_PARSE_MISSING_ARG:
11645 error ("missing cpu name in %<-mcpu=%s%>", str);
11646 break;
11647 case AARCH64_PARSE_INVALID_ARG:
11648 error ("unknown value %qs for -mcpu", str);
11649 aarch64_print_hint_for_core (str);
11650 break;
11651 case AARCH64_PARSE_INVALID_FEATURE:
11652 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11653 invalid_extension.c_str (), str);
11654 aarch64_print_hint_for_extensions (invalid_extension);
11655 break;
11656 default:
11657 gcc_unreachable ();
11658 }
11659
11660 return false;
11661 }
11662
11663 /* Parses CONST_STR for branch protection features specified in
11664 aarch64_branch_protect_types, and sets any global variables required.
11665 Returns the parsing result and copies the last processed token from
11666 CONST_STR into LAST_STR so that it can be used for error reporting. */
11667
11668 static enum
11669 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
11670 char** last_str)
11671 {
11672 char *str_root = xstrdup (const_str);
11673 char* token_save = NULL;
11674 char *str = strtok_r (str_root, "+", &token_save);
11675 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11676 if (!str)
11677 res = AARCH64_PARSE_MISSING_ARG;
11678 else
11679 {
11680 char *next_str = strtok_r (NULL, "+", &token_save);
11681 /* Reset the branch protection features to their defaults. */
11682 aarch64_handle_no_branch_protection (NULL, NULL);
11683
11684 while (str && res == AARCH64_PARSE_OK)
11685 {
11686 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11687 bool found = false;
11688 /* Search for this type. */
11689 while (type && type->name && !found && res == AARCH64_PARSE_OK)
11690 {
11691 if (strcmp (str, type->name) == 0)
11692 {
11693 found = true;
11694 res = type->handler (str, next_str);
11695 str = next_str;
11696 next_str = strtok_r (NULL, "+", &token_save);
11697 }
11698 else
11699 type++;
11700 }
11701 if (found && res == AARCH64_PARSE_OK)
11702 {
11703 bool found_subtype = true;
11704 /* Loop through each token until we find one that isn't a
11705 subtype. */
11706 while (found_subtype)
11707 {
11708 found_subtype = false;
11709 const aarch64_branch_protect_type *subtype = type->subtypes;
11710 /* Search for the subtype. */
11711 while (str && subtype && subtype->name && !found_subtype
11712 && res == AARCH64_PARSE_OK)
11713 {
11714 if (strcmp (str, subtype->name) == 0)
11715 {
11716 found_subtype = true;
11717 res = subtype->handler (str, next_str);
11718 str = next_str;
11719 next_str = strtok_r (NULL, "+", &token_save);
11720 }
11721 else
11722 subtype++;
11723 }
11724 }
11725 }
11726 else if (!found)
11727 res = AARCH64_PARSE_INVALID_ARG;
11728 }
11729 }
11730 /* Copy the last processed token into the argument to pass it back.
11731 Used by option and attribute validation to print the offending token. */
11732 if (last_str)
11733 {
11734 if (str) strcpy (*last_str, str);
11735 else *last_str = NULL;
11736 }
11737 if (res == AARCH64_PARSE_OK)
11738 {
11739 /* If needed, alloc the accepted string then copy in const_str.
11740 Used by aarch64_override_options_after_change_1. */
11741 if (!accepted_branch_protection_string)
11742 accepted_branch_protection_string = (char *) xmalloc (
11743 BRANCH_PROTECT_STR_MAX
11744 + 1);
11745 strncpy (accepted_branch_protection_string, const_str,
11746 BRANCH_PROTECT_STR_MAX + 1);
11747 /* Forcibly null-terminate. */
11748 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11749 }
11750 return res;
11751 }
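
/* Illustrative walk-through, assuming the usual -mbranch-protection
   values: for "pac-ret+leaf" the outer loop matches the "pac-ret" entry
   in aarch64_branch_protect_types and runs its handler, then the inner
   loop matches "leaf" against that entry's subtypes and runs the subtype
   handler; a lone "none" or "standard" is handled entirely by its
   top-level handler.  The last token processed is what gets copied into
   LAST_STR for diagnostics.  */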
11752
11753 static bool
11754 aarch64_validate_mbranch_protection (const char *const_str)
11755 {
11756 char *str = (char *) xmalloc (strlen (const_str) + 1);
11757 enum aarch64_parse_opt_result res =
11758 aarch64_parse_branch_protection (const_str, &str);
11759 if (res == AARCH64_PARSE_INVALID_ARG)
11760 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
11761 else if (res == AARCH64_PARSE_MISSING_ARG)
11762 error ("missing argument for %<-mbranch-protection=%>");
11763 free (str);
11764 return res == AARCH64_PARSE_OK;
11765 }
11766
11767 /* Validate a command-line -march option. Parse the arch and extensions
11768 (if any) specified in STR and throw errors if appropriate. Put the
11769 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11770 option is valid. */
11771
11772 static bool
11773 aarch64_validate_march (const char *str, const struct processor **res,
11774 unsigned long *isa_flags)
11775 {
11776 std::string invalid_extension;
11777 enum aarch64_parse_opt_result parse_res
11778 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11779
11780 if (parse_res == AARCH64_PARSE_OK)
11781 return true;
11782
11783 switch (parse_res)
11784 {
11785 case AARCH64_PARSE_MISSING_ARG:
11786 error ("missing arch name in %<-march=%s%>", str);
11787 break;
11788 case AARCH64_PARSE_INVALID_ARG:
11789 error ("unknown value %qs for -march", str);
11790 aarch64_print_hint_for_arch (str);
11791 break;
11792 case AARCH64_PARSE_INVALID_FEATURE:
11793 error ("invalid feature modifier %qs in %<-march=%s%>",
11794 invalid_extension.c_str (), str);
11795 aarch64_print_hint_for_extensions (invalid_extension);
11796 break;
11797 default:
11798 gcc_unreachable ();
11799 }
11800
11801 return false;
11802 }
11803
11804 /* Validate a command-line -mtune option. Parse the cpu
11805 specified in STR and throw errors if appropriate. Put the
11806 result, if it is valid, in RES. Return whether the option is
11807 valid. */
11808
11809 static bool
11810 aarch64_validate_mtune (const char *str, const struct processor **res)
11811 {
11812 enum aarch64_parse_opt_result parse_res
11813 = aarch64_parse_tune (str, res);
11814
11815 if (parse_res == AARCH64_PARSE_OK)
11816 return true;
11817
11818 switch (parse_res)
11819 {
11820 case AARCH64_PARSE_MISSING_ARG:
11821 error ("missing cpu name in %<-mtune=%s%>", str);
11822 break;
11823 case AARCH64_PARSE_INVALID_ARG:
11824 error ("unknown value %qs for -mtune", str);
11825 aarch64_print_hint_for_core (str);
11826 break;
11827 default:
11828 gcc_unreachable ();
11829 }
11830 return false;
11831 }
11832
11833 /* Return the CPU corresponding to the enum CPU.
11834 If it doesn't specify a cpu, return the default. */
11835
11836 static const struct processor *
11837 aarch64_get_tune_cpu (enum aarch64_processor cpu)
11838 {
11839 if (cpu != aarch64_none)
11840 return &all_cores[cpu];
11841
11842 /* The & 0x3f is to extract the bottom 6 bits that encode the
11843 default cpu as selected by the --with-cpu GCC configure option
11844 in config.gcc.
11845 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11846 flags mechanism should be reworked to make it more sane. */
11847 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11848 }
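
/* As used here and in aarch64_override_options below, TARGET_CPU_DEFAULT
   packs the configure-time default as:

     bits [5:0]      index into all_cores of the default CPU
                     (extracted with the & 0x3f above)
     remaining bits  the default ISA flags
                     (extracted with TARGET_CPU_DEFAULT >> 6)  */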
11849
11850 /* Return the architecture corresponding to the enum ARCH.
11851 If it doesn't specify a valid architecture, return the default. */
11852
11853 static const struct processor *
11854 aarch64_get_arch (enum aarch64_arch arch)
11855 {
11856 if (arch != aarch64_no_arch)
11857 return &all_architectures[arch];
11858
11859 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11860
11861 return &all_architectures[cpu->arch];
11862 }
11863
11864 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
11865
11866 static poly_uint16
11867 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
11868 {
11869 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11870 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11871 deciding which .md file patterns to use and when deciding whether
11872 something is a legitimate address or constant. */
11873 if (value == SVE_SCALABLE || value == SVE_128)
11874 return poly_uint16 (2, 2);
11875 else
11876 return (int) value / 64;
11877 }
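
/* Worked examples: -msve-vector-bits=256 yields 256 / 64 = 4, i.e. four
   64-bit granules per vector, while SVE_SCALABLE (and, per the comment
   above, SVE_128) yields the indeterminate poly_uint16 (2, 2), i.e.
   2 + 2 * x granules where x is the number of 128-bit blocks by which the
   runtime vector length exceeds the 128-bit minimum.  */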
11878
11879 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
11880 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
11881 tuning structs. In particular it must set selected_tune and
11882 aarch64_isa_flags that define the available ISA features and tuning
11883 decisions. It must also set selected_arch as this will be used to
11884 output the .arch asm tags for each function. */
11885
11886 static void
11887 aarch64_override_options (void)
11888 {
11889 unsigned long cpu_isa = 0;
11890 unsigned long arch_isa = 0;
11891 aarch64_isa_flags = 0;
11892
11893 bool valid_cpu = true;
11894 bool valid_tune = true;
11895 bool valid_arch = true;
11896
11897 selected_cpu = NULL;
11898 selected_arch = NULL;
11899 selected_tune = NULL;
11900
11901 if (aarch64_branch_protection_string)
11902 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
11903
11904 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
11905 If either of -march or -mtune is given, they override their
11906 respective component of -mcpu. */
11907 if (aarch64_cpu_string)
11908 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
11909 &cpu_isa);
11910
11911 if (aarch64_arch_string)
11912 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
11913 &arch_isa);
11914
11915 if (aarch64_tune_string)
11916 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
11917
11918 #ifdef SUBTARGET_OVERRIDE_OPTIONS
11919 SUBTARGET_OVERRIDE_OPTIONS;
11920 #endif
11921
11922 /* If the user did not specify a processor, choose the default
11923 one for them. This will be the CPU set during configuration using
11924 --with-cpu, otherwise it is "generic". */
11925 if (!selected_cpu)
11926 {
11927 if (selected_arch)
11928 {
11929 selected_cpu = &all_cores[selected_arch->ident];
11930 aarch64_isa_flags = arch_isa;
11931 explicit_arch = selected_arch->arch;
11932 }
11933 else
11934 {
11935 /* Get default configure-time CPU. */
11936 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
11937 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
11938 }
11939
11940 if (selected_tune)
11941 explicit_tune_core = selected_tune->ident;
11942 }
11943 /* If both -mcpu and -march are specified check that they are architecturally
11944 compatible, warn if they're not and prefer the -march ISA flags. */
11945 else if (selected_arch)
11946 {
11947 if (selected_arch->arch != selected_cpu->arch)
11948 {
11949 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
11950 all_architectures[selected_cpu->arch].name,
11951 selected_arch->name);
11952 }
11953 aarch64_isa_flags = arch_isa;
11954 explicit_arch = selected_arch->arch;
11955 explicit_tune_core = selected_tune ? selected_tune->ident
11956 : selected_cpu->ident;
11957 }
11958 else
11959 {
11960 /* -mcpu but no -march. */
11961 aarch64_isa_flags = cpu_isa;
11962 explicit_tune_core = selected_tune ? selected_tune->ident
11963 : selected_cpu->ident;
11964 gcc_assert (selected_cpu);
11965 selected_arch = &all_architectures[selected_cpu->arch];
11966 explicit_arch = selected_arch->arch;
11967 }
11968
11969 /* Set the arch as well, as we will need it when outputting
11970 the .arch directive in assembly. */
11971 if (!selected_arch)
11972 {
11973 gcc_assert (selected_cpu);
11974 selected_arch = &all_architectures[selected_cpu->arch];
11975 }
11976
11977 if (!selected_tune)
11978 selected_tune = selected_cpu;
11979
11980 if (aarch64_enable_bti == 2)
11981 {
11982 #ifdef TARGET_ENABLE_BTI
11983 aarch64_enable_bti = 1;
11984 #else
11985 aarch64_enable_bti = 0;
11986 #endif
11987 }
11988
11989 /* Return address signing is currently not supported for ILP32 targets. For
11990 LP64 targets use the configured option in the absence of a command-line
11991 option for -mbranch-protection. */
11992 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
11993 {
11994 #ifdef TARGET_ENABLE_PAC_RET
11995 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
11996 aarch64_ra_sign_key = AARCH64_KEY_A;
11997 #else
11998 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
11999 #endif
12000 }
12001
12002 #ifndef HAVE_AS_MABI_OPTION
12003 /* The compiler may have been configured with 2.23.* binutils, which does
12004 not have support for ILP32. */
12005 if (TARGET_ILP32)
12006 error ("assembler does not support -mabi=ilp32");
12007 #endif
12008
12009 /* Convert -msve-vector-bits to a VG count. */
12010 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12011
12012 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12013 sorry ("return address signing is only supported for -mabi=lp64");
12014
12015 /* Make sure we properly set up the explicit options. */
12016 if ((aarch64_cpu_string && valid_cpu)
12017 || (aarch64_tune_string && valid_tune))
12018 gcc_assert (explicit_tune_core != aarch64_none);
12019
12020 if ((aarch64_cpu_string && valid_cpu)
12021 || (aarch64_arch_string && valid_arch))
12022 gcc_assert (explicit_arch != aarch64_no_arch);
12023
12024 /* The pass to insert speculation tracking runs before
12025 shrink-wrapping and the latter does not know how to update the
12026 tracking status. So disable it in this case. */
12027 if (aarch64_track_speculation)
12028 flag_shrink_wrap = 0;
12029
12030 aarch64_override_options_internal (&global_options);
12031
12032 /* Save these options as the default ones in case we push and pop them later
12033 while processing functions with potential target attributes. */
12034 target_option_default_node = target_option_current_node
12035 = build_target_option_node (&global_options);
12036 }
12037
12038 /* Implement targetm.override_options_after_change. */
12039
12040 static void
12041 aarch64_override_options_after_change (void)
12042 {
12043 aarch64_override_options_after_change_1 (&global_options);
12044 }
12045
12046 static struct machine_function *
12047 aarch64_init_machine_status (void)
12048 {
12049 struct machine_function *machine;
12050 machine = ggc_cleared_alloc<machine_function> ();
12051 return machine;
12052 }
12053
12054 void
12055 aarch64_init_expanders (void)
12056 {
12057 init_machine_status = aarch64_init_machine_status;
12058 }
12059
12060 /* Choose the code model to use, taking the PIC options into account. */
12061 static void
12062 initialize_aarch64_code_model (struct gcc_options *opts)
12063 {
12064 if (opts->x_flag_pic)
12065 {
12066 switch (opts->x_aarch64_cmodel_var)
12067 {
12068 case AARCH64_CMODEL_TINY:
12069 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12070 break;
12071 case AARCH64_CMODEL_SMALL:
12072 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12073 aarch64_cmodel = (flag_pic == 2
12074 ? AARCH64_CMODEL_SMALL_PIC
12075 : AARCH64_CMODEL_SMALL_SPIC);
12076 #else
12077 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12078 #endif
12079 break;
12080 case AARCH64_CMODEL_LARGE:
12081 sorry ("code model %qs with -f%s", "large",
12082 opts->x_flag_pic > 1 ? "PIC" : "pic");
12083 break;
12084 default:
12085 gcc_unreachable ();
12086 }
12087 }
12088 else
12089 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12090 }
12091
12092 /* Implement TARGET_OPTION_SAVE. */
12093
12094 static void
12095 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12096 {
12097 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12098 ptr->x_aarch64_branch_protection_string
12099 = opts->x_aarch64_branch_protection_string;
12100 }
12101
12102 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12103 using the information saved in PTR. */
12104
12105 static void
12106 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12107 {
12108 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12109 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12110 opts->x_explicit_arch = ptr->x_explicit_arch;
12111 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12112 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12113 opts->x_aarch64_branch_protection_string
12114 = ptr->x_aarch64_branch_protection_string;
12115 if (opts->x_aarch64_branch_protection_string)
12116 {
12117 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12118 NULL);
12119 }
12120
12121 aarch64_override_options_internal (opts);
12122 }
12123
12124 /* Implement TARGET_OPTION_PRINT. */
12125
12126 static void
12127 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12128 {
12129 const struct processor *cpu
12130 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12131 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
12132 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12133 std::string extension
12134 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12135
12136 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12137 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12138 arch->name, extension.c_str ());
12139 }
12140
12141 static GTY(()) tree aarch64_previous_fndecl;
12142
12143 void
12144 aarch64_reset_previous_fndecl (void)
12145 {
12146 aarch64_previous_fndecl = NULL;
12147 }
12148
12149 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12150 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12151 make sure optab availability predicates are recomputed when necessary. */
12152
12153 void
12154 aarch64_save_restore_target_globals (tree new_tree)
12155 {
12156 if (TREE_TARGET_GLOBALS (new_tree))
12157 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12158 else if (new_tree == target_option_default_node)
12159 restore_target_globals (&default_target_globals);
12160 else
12161 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12162 }
12163
12164 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12165 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12166 of the function, if such exists. This function may be called multiple
12167 times on a single function so use aarch64_previous_fndecl to avoid
12168 setting up identical state. */
12169
12170 static void
12171 aarch64_set_current_function (tree fndecl)
12172 {
12173 if (!fndecl || fndecl == aarch64_previous_fndecl)
12174 return;
12175
12176 tree old_tree = (aarch64_previous_fndecl
12177 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12178 : NULL_TREE);
12179
12180 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12181
12182 /* If current function has no attributes but the previous one did,
12183 use the default node. */
12184 if (!new_tree && old_tree)
12185 new_tree = target_option_default_node;
12186
12187 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12188 the default have been handled by aarch64_save_restore_target_globals from
12189 aarch64_pragma_target_parse. */
12190 if (old_tree == new_tree)
12191 return;
12192
12193 aarch64_previous_fndecl = fndecl;
12194
12195 /* First set the target options. */
12196 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12197
12198 aarch64_save_restore_target_globals (new_tree);
12199 }
12200
12201 /* Enum describing the various ways we can handle attributes.
12202 In many cases we can reuse the generic option handling machinery. */
12203
12204 enum aarch64_attr_opt_type
12205 {
12206 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12207 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12208 aarch64_attr_enum, /* Attribute sets an enum variable. */
12209 aarch64_attr_custom /* Attribute requires a custom handling function. */
12210 };
12211
12212 /* All the information needed to handle a target attribute.
12213 NAME is the name of the attribute.
12214 ATTR_TYPE specifies the type of behavior of the attribute as described
12215 in the definition of enum aarch64_attr_opt_type.
12216 ALLOW_NEG is true if the attribute supports a "no-" form.
12217 HANDLER is the function that takes the attribute string as an argument.
12218 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12219 OPT_NUM is the enum specifying the option that the attribute modifies.
12220 This is needed for attributes that mirror the behavior of a command-line
12221 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12222 aarch64_attr_enum. */
12223
12224 struct aarch64_attribute_info
12225 {
12226 const char *name;
12227 enum aarch64_attr_opt_type attr_type;
12228 bool allow_neg;
12229 bool (*handler) (const char *);
12230 enum opt_code opt_num;
12231 };
12232
12233 /* Handle the ARCH_STR argument to the arch= target attribute. */
12234
12235 static bool
12236 aarch64_handle_attr_arch (const char *str)
12237 {
12238 const struct processor *tmp_arch = NULL;
12239 std::string invalid_extension;
12240 enum aarch64_parse_opt_result parse_res
12241 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12242
12243 if (parse_res == AARCH64_PARSE_OK)
12244 {
12245 gcc_assert (tmp_arch);
12246 selected_arch = tmp_arch;
12247 explicit_arch = selected_arch->arch;
12248 return true;
12249 }
12250
12251 switch (parse_res)
12252 {
12253 case AARCH64_PARSE_MISSING_ARG:
12254 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12255 break;
12256 case AARCH64_PARSE_INVALID_ARG:
12257 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12258 aarch64_print_hint_for_arch (str);
12259 break;
12260 case AARCH64_PARSE_INVALID_FEATURE:
12261 error ("invalid feature modifier %s of value (\"%s\") in "
12262 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12263 aarch64_print_hint_for_extensions (invalid_extension);
12264 break;
12265 default:
12266 gcc_unreachable ();
12267 }
12268
12269 return false;
12270 }
12271
12272 /* Handle the argument CPU_STR to the cpu= target attribute. */
12273
12274 static bool
12275 aarch64_handle_attr_cpu (const char *str)
12276 {
12277 const struct processor *tmp_cpu = NULL;
12278 std::string invalid_extension;
12279 enum aarch64_parse_opt_result parse_res
12280 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12281
12282 if (parse_res == AARCH64_PARSE_OK)
12283 {
12284 gcc_assert (tmp_cpu);
12285 selected_tune = tmp_cpu;
12286 explicit_tune_core = selected_tune->ident;
12287
12288 selected_arch = &all_architectures[tmp_cpu->arch];
12289 explicit_arch = selected_arch->arch;
12290 return true;
12291 }
12292
12293 switch (parse_res)
12294 {
12295 case AARCH64_PARSE_MISSING_ARG:
12296 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12297 break;
12298 case AARCH64_PARSE_INVALID_ARG:
12299 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12300 aarch64_print_hint_for_core (str);
12301 break;
12302 case AARCH64_PARSE_INVALID_FEATURE:
12303 error ("invalid feature modifier %s of value (\"%s\") in "
12304 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12305 aarch64_print_hint_for_extensions (invalid_extension);
12306 break;
12307 default:
12308 gcc_unreachable ();
12309 }
12310
12311 return false;
12312 }
12313
12314 /* Handle the argument STR to the branch-protection= attribute. */
12315
12316 static bool
12317 aarch64_handle_attr_branch_protection (const char* str)
12318 {
12319 char *err_str = (char *) xmalloc (strlen (str) + 1);
12320 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12321 &err_str);
12322 bool success = false;
12323 switch (res)
12324 {
12325 case AARCH64_PARSE_MISSING_ARG:
12326 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12327 " attribute");
12328 break;
12329 case AARCH64_PARSE_INVALID_ARG:
12330 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12331 "=\")%> pragma or attribute", err_str);
12332 break;
12333 case AARCH64_PARSE_OK:
12334 success = true;
12335 /* Fall through. */
12336 case AARCH64_PARSE_INVALID_FEATURE:
12337 break;
12338 default:
12339 gcc_unreachable ();
12340 }
12341 free (err_str);
12342 return success;
12343 }
12344
12345 /* Handle the argument STR to the tune= target attribute. */
12346
12347 static bool
12348 aarch64_handle_attr_tune (const char *str)
12349 {
12350 const struct processor *tmp_tune = NULL;
12351 enum aarch64_parse_opt_result parse_res
12352 = aarch64_parse_tune (str, &tmp_tune);
12353
12354 if (parse_res == AARCH64_PARSE_OK)
12355 {
12356 gcc_assert (tmp_tune);
12357 selected_tune = tmp_tune;
12358 explicit_tune_core = selected_tune->ident;
12359 return true;
12360 }
12361
12362 switch (parse_res)
12363 {
12364 case AARCH64_PARSE_INVALID_ARG:
12365 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12366 aarch64_print_hint_for_core (str);
12367 break;
12368 default:
12369 gcc_unreachable ();
12370 }
12371
12372 return false;
12373 }
12374
12375 /* Parse an architecture extensions target attribute string specified in STR.
12376 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12377 if successful. Update aarch64_isa_flags to reflect the ISA features
12378 modified. */
12379
12380 static bool
12381 aarch64_handle_attr_isa_flags (char *str)
12382 {
12383 enum aarch64_parse_opt_result parse_res;
12384 unsigned long isa_flags = aarch64_isa_flags;
12385
12386 /* We allow "+nothing" in the beginning to clear out all architectural
12387 features if the user wants to handpick specific features. */
12388 if (strncmp ("+nothing", str, 8) == 0)
12389 {
12390 isa_flags = 0;
12391 str += 8;
12392 }
12393
12394 std::string invalid_extension;
12395 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12396
12397 if (parse_res == AARCH64_PARSE_OK)
12398 {
12399 aarch64_isa_flags = isa_flags;
12400 return true;
12401 }
12402
12403 switch (parse_res)
12404 {
12405 case AARCH64_PARSE_MISSING_ARG:
12406 error ("missing value in %<target()%> pragma or attribute");
12407 break;
12408
12409 case AARCH64_PARSE_INVALID_FEATURE:
12410 error ("invalid feature modifier %s of value (\"%s\") in "
12411 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12412 break;
12413
12414 default:
12415 gcc_unreachable ();
12416 }
12417
12418 return false;
12419 }
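
/* Illustrative examples of strings this handles: "+nothing+fp" clears
   every ISA bit and then re-enables only the FP extension, while "+crc"
   adds the CRC extension on top of the current aarch64_isa_flags.  An
   unknown name produces the AARCH64_PARSE_INVALID_FEATURE diagnostic
   above, with the offending token reported via INVALID_EXTENSION.  */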
12420
12421 /* The target attributes that we support. On top of these we also support just
12422 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12423 handled explicitly in aarch64_process_one_target_attr. */
12424
12425 static const struct aarch64_attribute_info aarch64_attributes[] =
12426 {
12427 { "general-regs-only", aarch64_attr_mask, false, NULL,
12428 OPT_mgeneral_regs_only },
12429 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12430 OPT_mfix_cortex_a53_835769 },
12431 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12432 OPT_mfix_cortex_a53_843419 },
12433 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12434 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12435 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12436 OPT_momit_leaf_frame_pointer },
12437 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12438 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12439 OPT_march_ },
12440 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12441 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12442 OPT_mtune_ },
12443 { "branch-protection", aarch64_attr_custom, false,
12444 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12445 { "sign-return-address", aarch64_attr_enum, false, NULL,
12446 OPT_msign_return_address_ },
12447 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12448 };
12449
12450 /* Parse ARG_STR which contains the definition of one target attribute.
12451 Show appropriate errors if any or return true if the attribute is valid. */
12452
12453 static bool
12454 aarch64_process_one_target_attr (char *arg_str)
12455 {
12456 bool invert = false;
12457
12458 size_t len = strlen (arg_str);
12459
12460 if (len == 0)
12461 {
12462 error ("malformed %<target()%> pragma or attribute");
12463 return false;
12464 }
12465
12466 char *str_to_check = (char *) alloca (len + 1);
12467 strcpy (str_to_check, arg_str);
12468
12469 /* Skip leading whitespace. */
12470 while (*str_to_check == ' ' || *str_to_check == '\t')
12471 str_to_check++;
12472
12473 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12474 It is easier to detect and handle it explicitly here rather than going
12475 through the machinery for the rest of the target attributes in this
12476 function. */
12477 if (*str_to_check == '+')
12478 return aarch64_handle_attr_isa_flags (str_to_check);
12479
12480 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12481 {
12482 invert = true;
12483 str_to_check += 3;
12484 }
12485 char *arg = strchr (str_to_check, '=');
12486
12487 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12488 and point ARG to "foo". */
12489 if (arg)
12490 {
12491 *arg = '\0';
12492 arg++;
12493 }
12494 const struct aarch64_attribute_info *p_attr;
12495 bool found = false;
12496 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12497 {
12498 /* If the names don't match up, or the user has given an argument
12499 to an attribute that doesn't accept one, or didn't give an argument
12500 to an attribute that expects one, fail to match. */
12501 if (strcmp (str_to_check, p_attr->name) != 0)
12502 continue;
12503
12504 found = true;
12505 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12506 || p_attr->attr_type == aarch64_attr_enum;
12507
12508 if (attr_need_arg_p ^ (arg != NULL))
12509 {
12510 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12511 return false;
12512 }
12513
12514 /* If the name matches but the attribute does not allow "no-" versions
12515 then we can't match. */
12516 if (invert && !p_attr->allow_neg)
12517 {
12518 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12519 return false;
12520 }
12521
12522 switch (p_attr->attr_type)
12523 {
12524 /* Has a custom handler registered.
12525 For example, cpu=, arch=, tune=. */
12526 case aarch64_attr_custom:
12527 gcc_assert (p_attr->handler);
12528 if (!p_attr->handler (arg))
12529 return false;
12530 break;
12531
12532 /* Either set or unset a boolean option. */
12533 case aarch64_attr_bool:
12534 {
12535 struct cl_decoded_option decoded;
12536
12537 generate_option (p_attr->opt_num, NULL, !invert,
12538 CL_TARGET, &decoded);
12539 aarch64_handle_option (&global_options, &global_options_set,
12540 &decoded, input_location);
12541 break;
12542 }
12543 /* Set or unset a bit in the target_flags. aarch64_handle_option
12544 should know what mask to apply given the option number. */
12545 case aarch64_attr_mask:
12546 {
12547 struct cl_decoded_option decoded;
12548 /* We only need to specify the option number.
12549 aarch64_handle_option will know which mask to apply. */
12550 decoded.opt_index = p_attr->opt_num;
12551 decoded.value = !invert;
12552 aarch64_handle_option (&global_options, &global_options_set,
12553 &decoded, input_location);
12554 break;
12555 }
12556 /* Use the option setting machinery to set an option to an enum. */
12557 case aarch64_attr_enum:
12558 {
12559 gcc_assert (arg);
12560 bool valid;
12561 int value;
12562 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12563 &value, CL_TARGET);
12564 if (valid)
12565 {
12566 set_option (&global_options, NULL, p_attr->opt_num, value,
12567 NULL, DK_UNSPECIFIED, input_location,
12568 global_dc);
12569 }
12570 else
12571 {
12572 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12573 }
12574 break;
12575 }
12576 default:
12577 gcc_unreachable ();
12578 }
12579 }
12580
12581 /* If we reached here we either have found an attribute and validated
12582 it or didn't match any. If we matched an attribute but its arguments
12583 were malformed we will have returned false already. */
12584 return found;
12585 }
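
/* Illustrative sketch, not part of the original sources: the "+..." form
   short-circuited at the top of aarch64_process_one_target_attr is the
   per-function ISA-extension syntax. Assuming GCC configured for this
   backend, user code along these lines is routed through
   aarch64_handle_attr_isa_flags; the function name crc_byte is
   hypothetical. */

#include <arm_acle.h>

__attribute__ ((target ("+crc")))
static unsigned int
crc_byte (unsigned int crc, unsigned char value)
{
  /* __crc32b comes from arm_acle.h and needs the CRC extension, which the
     attribute above enables for this function only.  */
  return __crc32b (crc, value);
}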
12586
12587 /* Count how many times the character C appears in
12588 NULL-terminated string STR. */
12589
12590 static unsigned int
12591 num_occurences_in_str (char c, char *str)
12592 {
12593 unsigned int res = 0;
12594 while (*str != '\0')
12595 {
12596 if (*str == c)
12597 res++;
12598
12599 str++;
12600 }
12601
12602 return res;
12603 }
12604
12605 /* Parse the tree in ARGS that contains the target attribute information
12606 and update the global target options space. */
12607
12608 bool
12609 aarch64_process_target_attr (tree args)
12610 {
12611 if (TREE_CODE (args) == TREE_LIST)
12612 {
12613 do
12614 {
12615 tree head = TREE_VALUE (args);
12616 if (head)
12617 {
12618 if (!aarch64_process_target_attr (head))
12619 return false;
12620 }
12621 args = TREE_CHAIN (args);
12622 } while (args);
12623
12624 return true;
12625 }
12626
12627 if (TREE_CODE (args) != STRING_CST)
12628 {
12629 error ("attribute %<target%> argument not a string");
12630 return false;
12631 }
12632
12633 size_t len = strlen (TREE_STRING_POINTER (args));
12634 char *str_to_check = (char *) alloca (len + 1);
12635 strcpy (str_to_check, TREE_STRING_POINTER (args));
12636
12637 if (len == 0)
12638 {
12639 error ("malformed %<target()%> pragma or attribute");
12640 return false;
12641 }
12642
12643 /* Used to catch empty entries between commas, i.e.
12644 attribute ((target ("attr1,,attr2"))). */
12645 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12646
12647 /* Handle multiple target attributes separated by ','. */
12648 char *token = strtok_r (str_to_check, ",", &str_to_check);
12649
12650 unsigned int num_attrs = 0;
12651 while (token)
12652 {
12653 num_attrs++;
12654 if (!aarch64_process_one_target_attr (token))
12655 {
12656 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12657 return false;
12658 }
12659
12660 token = strtok_r (NULL, ",", &str_to_check);
12661 }
12662
12663 if (num_attrs != num_commas + 1)
12664 {
12665 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12666 return false;
12667 }
12668
12669 return true;
12670 }
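
/* Illustrative sketch, not part of the original sources: comma-separated
   attribute strings as split by aarch64_process_target_attr above, in both
   the attribute and the #pragma GCC target forms. Function names are
   hypothetical; the attribute names come from the aarch64_attributes table
   above. */

__attribute__ ((target ("arch=armv8.2-a,strict-align")))
static int
load_strict (int *p)
{
  return *p;
}

#pragma GCC push_options
#pragma GCC target ("tune=cortex-a72,no-omit-leaf-frame-pointer")
static int
leaf_example (int x)
{
  return x + 1;
}
#pragma GCC pop_options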
12671
12672 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12673 process attribute ((target ("..."))). */
12674
12675 static bool
12676 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12677 {
12678 struct cl_target_option cur_target;
12679 bool ret;
12680 tree old_optimize;
12681 tree new_target, new_optimize;
12682 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12683
12684 /* If what we're processing is the current pragma string then the
12685 target option node is already stored in target_option_current_node
12686 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12687 having to re-parse the string. This is especially useful to keep
12688 arm_neon.h compile times down since that header contains a lot
12689 of intrinsics enclosed in pragmas. */
12690 if (!existing_target && args == current_target_pragma)
12691 {
12692 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12693 return true;
12694 }
12695 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12696
12697 old_optimize = build_optimization_node (&global_options);
12698 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12699
12700 /* If the function changed the optimization levels as well as setting
12701 target options, start with the optimizations specified. */
12702 if (func_optimize && func_optimize != old_optimize)
12703 cl_optimization_restore (&global_options,
12704 TREE_OPTIMIZATION (func_optimize));
12705
12706 /* Save the current target options to restore at the end. */
12707 cl_target_option_save (&cur_target, &global_options);
12708
12709 /* If fndecl already has some target attributes applied to it, unpack
12710 them so that we add this attribute on top of them, rather than
12711 overwriting them. */
12712 if (existing_target)
12713 {
12714 struct cl_target_option *existing_options
12715 = TREE_TARGET_OPTION (existing_target);
12716
12717 if (existing_options)
12718 cl_target_option_restore (&global_options, existing_options);
12719 }
12720 else
12721 cl_target_option_restore (&global_options,
12722 TREE_TARGET_OPTION (target_option_current_node));
12723
12724 ret = aarch64_process_target_attr (args);
12725
12726 /* Set up any additional state. */
12727 if (ret)
12728 {
12729 aarch64_override_options_internal (&global_options);
12730 /* Initialize SIMD builtins if we haven't already.
12731 Set current_target_pragma to NULL for the duration so that
12732 the builtin initialization code doesn't try to tag the functions
12733 being built with the attributes specified by any current pragma, thus
12734 going into an infinite recursion. */
12735 if (TARGET_SIMD)
12736 {
12737 tree saved_current_target_pragma = current_target_pragma;
12738 current_target_pragma = NULL;
12739 aarch64_init_simd_builtins ();
12740 current_target_pragma = saved_current_target_pragma;
12741 }
12742 new_target = build_target_option_node (&global_options);
12743 }
12744 else
12745 new_target = NULL;
12746
12747 new_optimize = build_optimization_node (&global_options);
12748
12749 if (fndecl && ret)
12750 {
12751 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12752
12753 if (old_optimize != new_optimize)
12754 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12755 }
12756
12757 cl_target_option_restore (&global_options, &cur_target);
12758
12759 if (old_optimize != new_optimize)
12760 cl_optimization_restore (&global_options,
12761 TREE_OPTIMIZATION (old_optimize));
12762 return ret;
12763 }
12764
12765 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
12766 tri-bool options (yes, no, don't care) and the default value is
12767 DEF, determine whether to reject inlining. */
12768
12769 static bool
12770 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12771 int dont_care, int def)
12772 {
12773 /* If the callee doesn't care, always allow inlining. */
12774 if (callee == dont_care)
12775 return true;
12776
12777 /* If the caller doesn't care, always allow inlining. */
12778 if (caller == dont_care)
12779 return true;
12780
12781 /* Otherwise, allow inlining if either the callee and caller values
12782 agree, or if the callee is using the default value. */
12783 return (callee == caller || callee == def);
12784 }
12785
12786 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12787 to inline CALLEE into CALLER based on target-specific info.
12788 Make sure that the caller and callee have compatible architectural
12789 features. Then go through the other possible target attributes
12790 and see if they can block inlining. Try not to reject always_inline
12791 callees unless they are incompatible architecturally. */
12792
12793 static bool
12794 aarch64_can_inline_p (tree caller, tree callee)
12795 {
12796 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
12797 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
12798
12799 struct cl_target_option *caller_opts
12800 = TREE_TARGET_OPTION (caller_tree ? caller_tree
12801 : target_option_default_node);
12802
12803 struct cl_target_option *callee_opts
12804 = TREE_TARGET_OPTION (callee_tree ? callee_tree
12805 : target_option_default_node);
12806
12807 /* Callee's ISA flags should be a subset of the caller's. */
12808 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
12809 != callee_opts->x_aarch64_isa_flags)
12810 return false;
12811
12812 /* Allow non-strict-align functions to be inlined into strict-align
12813 ones, but not the other way around. */
12814 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
12815 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
12816 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
12817 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
12818 return false;
12819
12820 bool always_inline = lookup_attribute ("always_inline",
12821 DECL_ATTRIBUTES (callee));
12822
12823 /* If the architectural features match up and the callee is always_inline
12824 then the other attributes don't matter. */
12825 if (always_inline)
12826 return true;
12827
12828 if (caller_opts->x_aarch64_cmodel_var
12829 != callee_opts->x_aarch64_cmodel_var)
12830 return false;
12831
12832 if (caller_opts->x_aarch64_tls_dialect
12833 != callee_opts->x_aarch64_tls_dialect)
12834 return false;
12835
12836 /* Honour explicit requests to workaround errata. */
12837 if (!aarch64_tribools_ok_for_inlining_p (
12838 caller_opts->x_aarch64_fix_a53_err835769,
12839 callee_opts->x_aarch64_fix_a53_err835769,
12840 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
12841 return false;
12842
12843 if (!aarch64_tribools_ok_for_inlining_p (
12844 caller_opts->x_aarch64_fix_a53_err843419,
12845 callee_opts->x_aarch64_fix_a53_err843419,
12846 2, TARGET_FIX_ERR_A53_843419))
12847 return false;
12848
12849 /* If the user explicitly specified -momit-leaf-frame-pointer for the
12850 caller and callee and they don't match up, reject inlining. */
12851 if (!aarch64_tribools_ok_for_inlining_p (
12852 caller_opts->x_flag_omit_leaf_frame_pointer,
12853 callee_opts->x_flag_omit_leaf_frame_pointer,
12854 2, 1))
12855 return false;
12856
12857 /* If the callee has specific tuning overrides, respect them. */
12858 if (callee_opts->x_aarch64_override_tune_string != NULL
12859 && caller_opts->x_aarch64_override_tune_string == NULL)
12860 return false;
12861
12862 /* If the user specified tuning override strings for the
12863 caller and callee and they don't match up, reject inlining.
12864 We just do a string compare here, we don't analyze the meaning
12865 of the string, as it would be too costly for little gain. */
12866 if (callee_opts->x_aarch64_override_tune_string
12867 && caller_opts->x_aarch64_override_tune_string
12868 && (strcmp (callee_opts->x_aarch64_override_tune_string,
12869 caller_opts->x_aarch64_override_tune_string) != 0))
12870 return false;
12871
12872 return true;
12873 }
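
/* Illustrative sketch, not part of the original sources: the ISA-subset rule
   enforced above. When the caller is built without SVE, its ISA flags do not
   cover the callee's, so the call is not inlined even though the callee is
   marked inline; always_inline would not help, because the mismatch is
   architectural. Function names are hypothetical. */

__attribute__ ((target ("+sve")))
static inline int
callee_sve (int x)
{
  return x + 1;
}

static int
caller_base (int x)
{
  return callee_sve (x);   /* Rejected by aarch64_can_inline_p.  */
}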
12874
12875 /* Return true if SYMBOL_REF X binds locally. */
12876
12877 static bool
12878 aarch64_symbol_binds_local_p (const_rtx x)
12879 {
12880 return (SYMBOL_REF_DECL (x)
12881 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
12882 : SYMBOL_REF_LOCAL_P (x));
12883 }
12884
12885 /* Return true if SYMBOL_REF X is thread local. */
12886 static bool
12887 aarch64_tls_symbol_p (rtx x)
12888 {
12889 if (! TARGET_HAVE_TLS)
12890 return false;
12891
12892 if (GET_CODE (x) != SYMBOL_REF)
12893 return false;
12894
12895 return SYMBOL_REF_TLS_MODEL (x) != 0;
12896 }
12897
12898 /* Classify a TLS symbol into one of the TLS kinds. */
12899 enum aarch64_symbol_type
12900 aarch64_classify_tls_symbol (rtx x)
12901 {
12902 enum tls_model tls_kind = tls_symbolic_operand_type (x);
12903
12904 switch (tls_kind)
12905 {
12906 case TLS_MODEL_GLOBAL_DYNAMIC:
12907 case TLS_MODEL_LOCAL_DYNAMIC:
12908 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
12909
12910 case TLS_MODEL_INITIAL_EXEC:
12911 switch (aarch64_cmodel)
12912 {
12913 case AARCH64_CMODEL_TINY:
12914 case AARCH64_CMODEL_TINY_PIC:
12915 return SYMBOL_TINY_TLSIE;
12916 default:
12917 return SYMBOL_SMALL_TLSIE;
12918 }
12919
12920 case TLS_MODEL_LOCAL_EXEC:
12921 if (aarch64_tls_size == 12)
12922 return SYMBOL_TLSLE12;
12923 else if (aarch64_tls_size == 24)
12924 return SYMBOL_TLSLE24;
12925 else if (aarch64_tls_size == 32)
12926 return SYMBOL_TLSLE32;
12927 else if (aarch64_tls_size == 48)
12928 return SYMBOL_TLSLE48;
12929 else
12930 gcc_unreachable ();
12931
12932 case TLS_MODEL_EMULATED:
12933 case TLS_MODEL_NONE:
12934 return SYMBOL_FORCE_TO_MEM;
12935
12936 default:
12937 gcc_unreachable ();
12938 }
12939 }
12940
12941 /* Return the correct method for accessing X + OFFSET, where X is either
12942 a SYMBOL_REF or LABEL_REF. */
12943
12944 enum aarch64_symbol_type
12945 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
12946 {
12947 if (GET_CODE (x) == LABEL_REF)
12948 {
12949 switch (aarch64_cmodel)
12950 {
12951 case AARCH64_CMODEL_LARGE:
12952 return SYMBOL_FORCE_TO_MEM;
12953
12954 case AARCH64_CMODEL_TINY_PIC:
12955 case AARCH64_CMODEL_TINY:
12956 return SYMBOL_TINY_ABSOLUTE;
12957
12958 case AARCH64_CMODEL_SMALL_SPIC:
12959 case AARCH64_CMODEL_SMALL_PIC:
12960 case AARCH64_CMODEL_SMALL:
12961 return SYMBOL_SMALL_ABSOLUTE;
12962
12963 default:
12964 gcc_unreachable ();
12965 }
12966 }
12967
12968 if (GET_CODE (x) == SYMBOL_REF)
12969 {
12970 if (aarch64_tls_symbol_p (x))
12971 return aarch64_classify_tls_symbol (x);
12972
12973 switch (aarch64_cmodel)
12974 {
12975 case AARCH64_CMODEL_TINY:
12976 /* When we retrieve symbol + offset address, we have to make sure
12977 the offset does not cause overflow of the final address. But
12978 we have no way of knowing the address of symbol at compile time
12979 so we can't accurately say if the distance between the PC and
12980 symbol + offset is outside the addressable range of +/-1M in the
12981 TINY code model. So we rely on images not being greater than
12982 1M and cap the offset at 1M; anything beyond 1M will have to
12983 be loaded using an alternative mechanism. Furthermore, if the
12984 symbol is a weak reference to something that isn't known to
12985 resolve to a symbol in this module, then force to memory. */
12986 if ((SYMBOL_REF_WEAK (x)
12987 && !aarch64_symbol_binds_local_p (x))
12988 || !IN_RANGE (offset, -1048575, 1048575))
12989 return SYMBOL_FORCE_TO_MEM;
12990 return SYMBOL_TINY_ABSOLUTE;
12991
12992 case AARCH64_CMODEL_SMALL:
12993 /* Same reasoning as the tiny code model, but the offset cap here is
12994 4G. */
12995 if ((SYMBOL_REF_WEAK (x)
12996 && !aarch64_symbol_binds_local_p (x))
12997 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
12998 HOST_WIDE_INT_C (4294967264)))
12999 return SYMBOL_FORCE_TO_MEM;
13000 return SYMBOL_SMALL_ABSOLUTE;
13001
13002 case AARCH64_CMODEL_TINY_PIC:
13003 if (!aarch64_symbol_binds_local_p (x))
13004 return SYMBOL_TINY_GOT;
13005 return SYMBOL_TINY_ABSOLUTE;
13006
13007 case AARCH64_CMODEL_SMALL_SPIC:
13008 case AARCH64_CMODEL_SMALL_PIC:
13009 if (!aarch64_symbol_binds_local_p (x))
13010 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13011 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13012 return SYMBOL_SMALL_ABSOLUTE;
13013
13014 case AARCH64_CMODEL_LARGE:
13015 /* This is alright even in PIC code as the constant
13016 pool reference is always PC relative and within
13017 the same translation unit. */
13018 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13019 return SYMBOL_SMALL_ABSOLUTE;
13020 else
13021 return SYMBOL_FORCE_TO_MEM;
13022
13023 default:
13024 gcc_unreachable ();
13025 }
13026 }
13027
13028 /* By default push everything into the constant pool. */
13029 return SYMBOL_FORCE_TO_MEM;
13030 }
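
/* Illustrative sketch, not part of the original sources: declarations whose
   references the routine above classifies differently in a non-PIC,
   small-code-model compilation. The variable names are hypothetical. */

extern int ordinary_var;                     /* SYMBOL_SMALL_ABSOLUTE.       */
extern int weak_var __attribute__ ((weak));  /* May resolve outside this
                                                module, so references are
                                                SYMBOL_FORCE_TO_MEM.         */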
13031
13032 bool
13033 aarch64_constant_address_p (rtx x)
13034 {
13035 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13036 }
13037
13038 bool
13039 aarch64_legitimate_pic_operand_p (rtx x)
13040 {
13041 if (GET_CODE (x) == SYMBOL_REF
13042 || (GET_CODE (x) == CONST
13043 && GET_CODE (XEXP (x, 0)) == PLUS
13044 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13045 return false;
13046
13047 return true;
13048 }
13049
13050 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13051 that should be rematerialized rather than spilled. */
13052
13053 static bool
13054 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13055 {
13056 /* Support CSE and rematerialization of common constants. */
13057 if (CONST_INT_P (x)
13058 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13059 || GET_CODE (x) == CONST_VECTOR)
13060 return true;
13061
13062 /* Do not allow vector struct mode constants for Advanced SIMD.
13063 We could support 0 and -1 easily, but they need support in
13064 aarch64-simd.md. */
13065 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13066 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13067 return false;
13068
13069 /* Only accept variable-length vector constants if they can be
13070 handled directly.
13071
13072 ??? It would be possible to handle rematerialization of other
13073 constants via secondary reloads. */
13074 if (vec_flags & VEC_ANY_SVE)
13075 return aarch64_simd_valid_immediate (x, NULL);
13076
13077 if (GET_CODE (x) == HIGH)
13078 x = XEXP (x, 0);
13079
13080 /* Accept polynomial constants that can be calculated by using the
13081 destination of a move as the sole temporary. Constants that
13082 require a second temporary cannot be rematerialized (they can't be
13083 forced to memory and also aren't legitimate constants). */
13084 poly_int64 offset;
13085 if (poly_int_rtx_p (x, &offset))
13086 return aarch64_offset_temporaries (false, offset) <= 1;
13087
13088 /* If an offset is being added to something else, we need to allow the
13089 base to be moved into the destination register, meaning that there
13090 are no free temporaries for the offset. */
13091 x = strip_offset (x, &offset);
13092 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13093 return false;
13094
13095 /* Do not allow const (plus (anchor_symbol, const_int)). */
13096 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13097 return false;
13098
13099 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13100 so spilling them is better than rematerialization. */
13101 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13102 return true;
13103
13104 /* Label references are always constant. */
13105 if (GET_CODE (x) == LABEL_REF)
13106 return true;
13107
13108 return false;
13109 }
13110
13111 rtx
13112 aarch64_load_tp (rtx target)
13113 {
13114 if (!target
13115 || GET_MODE (target) != Pmode
13116 || !register_operand (target, Pmode))
13117 target = gen_reg_rtx (Pmode);
13118
13119 /* Can return in any reg. */
13120 emit_insn (gen_aarch64_load_tp_hard (target));
13121 return target;
13122 }
13123
13124 /* On AAPCS systems, this is the "struct __va_list". */
13125 static GTY(()) tree va_list_type;
13126
13127 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13128 Return the type to use as __builtin_va_list.
13129
13130 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13131
13132 struct __va_list
13133 {
13134 void *__stack;
13135 void *__gr_top;
13136 void *__vr_top;
13137 int __gr_offs;
13138 int __vr_offs;
13139 }; */
13140
13141 static tree
13142 aarch64_build_builtin_va_list (void)
13143 {
13144 tree va_list_name;
13145 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13146
13147 /* Create the type. */
13148 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13149 /* Give it the required name. */
13150 va_list_name = build_decl (BUILTINS_LOCATION,
13151 TYPE_DECL,
13152 get_identifier ("__va_list"),
13153 va_list_type);
13154 DECL_ARTIFICIAL (va_list_name) = 1;
13155 TYPE_NAME (va_list_type) = va_list_name;
13156 TYPE_STUB_DECL (va_list_type) = va_list_name;
13157
13158 /* Create the fields. */
13159 f_stack = build_decl (BUILTINS_LOCATION,
13160 FIELD_DECL, get_identifier ("__stack"),
13161 ptr_type_node);
13162 f_grtop = build_decl (BUILTINS_LOCATION,
13163 FIELD_DECL, get_identifier ("__gr_top"),
13164 ptr_type_node);
13165 f_vrtop = build_decl (BUILTINS_LOCATION,
13166 FIELD_DECL, get_identifier ("__vr_top"),
13167 ptr_type_node);
13168 f_groff = build_decl (BUILTINS_LOCATION,
13169 FIELD_DECL, get_identifier ("__gr_offs"),
13170 integer_type_node);
13171 f_vroff = build_decl (BUILTINS_LOCATION,
13172 FIELD_DECL, get_identifier ("__vr_offs"),
13173 integer_type_node);
13174
13175 /* Tell the tree-stdarg pass about our internal offset fields.
13176 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13177 purposes, to identify whether the code is updating the va_list internal
13178 offset fields in an irregular way. */
13179 va_list_gpr_counter_field = f_groff;
13180 va_list_fpr_counter_field = f_vroff;
13181
13182 DECL_ARTIFICIAL (f_stack) = 1;
13183 DECL_ARTIFICIAL (f_grtop) = 1;
13184 DECL_ARTIFICIAL (f_vrtop) = 1;
13185 DECL_ARTIFICIAL (f_groff) = 1;
13186 DECL_ARTIFICIAL (f_vroff) = 1;
13187
13188 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13189 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13190 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13191 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13192 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13193
13194 TYPE_FIELDS (va_list_type) = f_stack;
13195 DECL_CHAIN (f_stack) = f_grtop;
13196 DECL_CHAIN (f_grtop) = f_vrtop;
13197 DECL_CHAIN (f_vrtop) = f_groff;
13198 DECL_CHAIN (f_groff) = f_vroff;
13199
13200 /* Compute its layout. */
13201 layout_type (va_list_type);
13202
13203 return va_list_type;
13204 }
13205
13206 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13207 static void
13208 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13209 {
13210 const CUMULATIVE_ARGS *cum;
13211 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13212 tree stack, grtop, vrtop, groff, vroff;
13213 tree t;
13214 int gr_save_area_size = cfun->va_list_gpr_size;
13215 int vr_save_area_size = cfun->va_list_fpr_size;
13216 int vr_offset;
13217
13218 cum = &crtl->args.info;
13219 if (cfun->va_list_gpr_size)
13220 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13221 cfun->va_list_gpr_size);
13222 if (cfun->va_list_fpr_size)
13223 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13224 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13225
13226 if (!TARGET_FLOAT)
13227 {
13228 gcc_assert (cum->aapcs_nvrn == 0);
13229 vr_save_area_size = 0;
13230 }
13231
13232 f_stack = TYPE_FIELDS (va_list_type_node);
13233 f_grtop = DECL_CHAIN (f_stack);
13234 f_vrtop = DECL_CHAIN (f_grtop);
13235 f_groff = DECL_CHAIN (f_vrtop);
13236 f_vroff = DECL_CHAIN (f_groff);
13237
13238 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13239 NULL_TREE);
13240 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13241 NULL_TREE);
13242 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13243 NULL_TREE);
13244 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13245 NULL_TREE);
13246 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13247 NULL_TREE);
13248
13249 /* Emit code to initialize STACK, which points to the next varargs stack
13250 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13251 by named arguments. STACK is 8-byte aligned. */
13252 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13253 if (cum->aapcs_stack_size > 0)
13254 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13255 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13256 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13257
13258 /* Emit code to initialize GRTOP, the top of the GR save area.
13259 virtual_incoming_args_rtx should have been 16 byte aligned. */
13260 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13261 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13262 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13263
13264 /* Emit code to initialize VRTOP, the top of the VR save area.
13265 This address is gr_save_area_bytes below GRTOP, rounded
13266 down to the next 16-byte boundary. */
13267 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13268 vr_offset = ROUND_UP (gr_save_area_size,
13269 STACK_BOUNDARY / BITS_PER_UNIT);
13270
13271 if (vr_offset)
13272 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13273 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13274 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13275
13276 /* Emit code to initialize GROFF, the offset from GRTOP of the
13277 next GPR argument. */
13278 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13279 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13280 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13281
13282 /* Likewise emit code to initialize VROFF, the offset from VRTOP
13283 of the next VR argument. */
13284 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13285 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13286 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13287 }
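
/* Illustrative sketch, not part of the original sources: a variadic function
   whose va_start expands to the field initialisation above. With the
   hypothetical signature below, one general register (w0) and one vector
   register (d0) are consumed by named arguments, so, in the absence of
   tree-stdarg shrinking, __gr_offs starts at -(7 * 8) and __vr_offs at
   -(7 * 16). */

#include <stdarg.h>

static double
sum_doubles (int count, double first, ...)
{
  va_list ap;
  double total = first;
  int i;

  va_start (ap, first);     /* Initialises __stack, __gr_top, __vr_top,
                               __gr_offs and __vr_offs as emitted above.  */
  for (i = 1; i < count; i++)
    total += va_arg (ap, double);
  va_end (ap);
  return total;
}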
13288
13289 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13290
13291 static tree
13292 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13293 gimple_seq *post_p ATTRIBUTE_UNUSED)
13294 {
13295 tree addr;
13296 bool indirect_p;
13297 bool is_ha; /* is HFA or HVA. */
13298 bool dw_align; /* double-word align. */
13299 machine_mode ag_mode = VOIDmode;
13300 int nregs;
13301 machine_mode mode;
13302
13303 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13304 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13305 HOST_WIDE_INT size, rsize, adjust, align;
13306 tree t, u, cond1, cond2;
13307
13308 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13309 if (indirect_p)
13310 type = build_pointer_type (type);
13311
13312 mode = TYPE_MODE (type);
13313
13314 f_stack = TYPE_FIELDS (va_list_type_node);
13315 f_grtop = DECL_CHAIN (f_stack);
13316 f_vrtop = DECL_CHAIN (f_grtop);
13317 f_groff = DECL_CHAIN (f_vrtop);
13318 f_vroff = DECL_CHAIN (f_groff);
13319
13320 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13321 f_stack, NULL_TREE);
13322 size = int_size_in_bytes (type);
13323 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
13324
13325 dw_align = false;
13326 adjust = 0;
13327 if (aarch64_vfp_is_call_or_return_candidate (mode,
13328 type,
13329 &ag_mode,
13330 &nregs,
13331 &is_ha))
13332 {
13333 /* No frontends can create types with variable-sized modes, so we
13334 shouldn't be asked to pass or return them. */
13335 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13336
13337 /* TYPE passed in fp/simd registers. */
13338 if (!TARGET_FLOAT)
13339 aarch64_err_no_fpadvsimd (mode);
13340
13341 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13342 unshare_expr (valist), f_vrtop, NULL_TREE);
13343 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13344 unshare_expr (valist), f_vroff, NULL_TREE);
13345
13346 rsize = nregs * UNITS_PER_VREG;
13347
13348 if (is_ha)
13349 {
13350 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13351 adjust = UNITS_PER_VREG - ag_size;
13352 }
13353 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13354 && size < UNITS_PER_VREG)
13355 {
13356 adjust = UNITS_PER_VREG - size;
13357 }
13358 }
13359 else
13360 {
13361 /* TYPE passed in general registers. */
13362 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13363 unshare_expr (valist), f_grtop, NULL_TREE);
13364 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13365 unshare_expr (valist), f_groff, NULL_TREE);
13366 rsize = ROUND_UP (size, UNITS_PER_WORD);
13367 nregs = rsize / UNITS_PER_WORD;
13368
13369 if (align > 8)
13370 dw_align = true;
13371
13372 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13373 && size < UNITS_PER_WORD)
13374 {
13375 adjust = UNITS_PER_WORD - size;
13376 }
13377 }
13378
13379 /* Get a local temporary for the field value. */
13380 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13381
13382 /* Emit code to branch if off >= 0. */
13383 t = build2 (GE_EXPR, boolean_type_node, off,
13384 build_int_cst (TREE_TYPE (off), 0));
13385 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13386
13387 if (dw_align)
13388 {
13389 /* Emit: offs = (offs + 15) & -16. */
13390 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13391 build_int_cst (TREE_TYPE (off), 15));
13392 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13393 build_int_cst (TREE_TYPE (off), -16));
13394 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13395 }
13396 else
13397 roundup = NULL;
13398
13399 /* Update ap.__[g|v]r_offs */
13400 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13401 build_int_cst (TREE_TYPE (off), rsize));
13402 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13403
13404 /* String up. */
13405 if (roundup)
13406 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13407
13408 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13409 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13410 build_int_cst (TREE_TYPE (f_off), 0));
13411 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13412
13413 /* String up: make sure the assignment happens before the use. */
13414 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13415 COND_EXPR_ELSE (cond1) = t;
13416
13417 /* Prepare the trees handling the argument that is passed on the stack;
13418 the top-level node will be stored in ON_STACK. */
13419 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13420 if (align > 8)
13421 {
13422 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13423 t = fold_build_pointer_plus_hwi (arg, 15);
13424 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13425 build_int_cst (TREE_TYPE (t), -16));
13426 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13427 }
13428 else
13429 roundup = NULL;
13430 /* Advance ap.__stack */
13431 t = fold_build_pointer_plus_hwi (arg, size + 7);
13432 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13433 build_int_cst (TREE_TYPE (t), -8));
13434 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13435 /* String up roundup and advance. */
13436 if (roundup)
13437 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13438 /* String up with arg */
13439 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13440 /* Big-endianness related address adjustment. */
13441 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13442 && size < UNITS_PER_WORD)
13443 {
13444 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13445 size_int (UNITS_PER_WORD - size));
13446 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13447 }
13448
13449 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13450 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13451
13452 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13453 t = off;
13454 if (adjust)
13455 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13456 build_int_cst (TREE_TYPE (off), adjust));
13457
13458 t = fold_convert (sizetype, t);
13459 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13460
13461 if (is_ha)
13462 {
13463 /* type ha; // treat as "struct {ftype field[n];}"
13464 ... [computing offs]
13465 for (i = 0; i < nregs; ++i, offs += 16)
13466 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13467 return ha; */
13468 int i;
13469 tree tmp_ha, field_t, field_ptr_t;
13470
13471 /* Declare a local variable. */
13472 tmp_ha = create_tmp_var_raw (type, "ha");
13473 gimple_add_tmp_var (tmp_ha);
13474
13475 /* Establish the base type. */
13476 switch (ag_mode)
13477 {
13478 case E_SFmode:
13479 field_t = float_type_node;
13480 field_ptr_t = float_ptr_type_node;
13481 break;
13482 case E_DFmode:
13483 field_t = double_type_node;
13484 field_ptr_t = double_ptr_type_node;
13485 break;
13486 case E_TFmode:
13487 field_t = long_double_type_node;
13488 field_ptr_t = long_double_ptr_type_node;
13489 break;
13490 case E_HFmode:
13491 field_t = aarch64_fp16_type_node;
13492 field_ptr_t = aarch64_fp16_ptr_type_node;
13493 break;
13494 case E_V2SImode:
13495 case E_V4SImode:
13496 {
13497 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13498 field_t = build_vector_type_for_mode (innertype, ag_mode);
13499 field_ptr_t = build_pointer_type (field_t);
13500 }
13501 break;
13502 default:
13503 gcc_assert (0);
13504 }
13505
13506 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
13507 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13508 addr = t;
13509 t = fold_convert (field_ptr_t, addr);
13510 t = build2 (MODIFY_EXPR, field_t,
13511 build1 (INDIRECT_REF, field_t, tmp_ha),
13512 build1 (INDIRECT_REF, field_t, t));
13513
13514 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13515 for (i = 1; i < nregs; ++i)
13516 {
13517 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13518 u = fold_convert (field_ptr_t, addr);
13519 u = build2 (MODIFY_EXPR, field_t,
13520 build2 (MEM_REF, field_t, tmp_ha,
13521 build_int_cst (field_ptr_t,
13522 (i *
13523 int_size_in_bytes (field_t)))),
13524 build1 (INDIRECT_REF, field_t, u));
13525 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13526 }
13527
13528 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13529 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13530 }
13531
13532 COND_EXPR_ELSE (cond2) = t;
13533 addr = fold_convert (build_pointer_type (type), cond1);
13534 addr = build_va_arg_indirect_ref (addr);
13535
13536 if (indirect_p)
13537 addr = build_va_arg_indirect_ref (addr);
13538
13539 return addr;
13540 }
13541
13542 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13543
13544 static void
13545 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13546 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13547 int no_rtl)
13548 {
13549 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13550 CUMULATIVE_ARGS local_cum;
13551 int gr_saved = cfun->va_list_gpr_size;
13552 int vr_saved = cfun->va_list_fpr_size;
13553
13554 /* The caller has advanced CUM up to, but not beyond, the last named
13555 argument. Advance a local copy of CUM past the last "real" named
13556 argument, to find out how many registers are left over. */
13557 local_cum = *cum;
13558 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
13559
13560 /* Find out how many registers we need to save.
13561 Honor the tree-stdarg analysis results. */
13562 if (cfun->va_list_gpr_size)
13563 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13564 cfun->va_list_gpr_size / UNITS_PER_WORD);
13565 if (cfun->va_list_fpr_size)
13566 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13567 cfun->va_list_fpr_size / UNITS_PER_VREG);
13568
13569 if (!TARGET_FLOAT)
13570 {
13571 gcc_assert (local_cum.aapcs_nvrn == 0);
13572 vr_saved = 0;
13573 }
13574
13575 if (!no_rtl)
13576 {
13577 if (gr_saved > 0)
13578 {
13579 rtx ptr, mem;
13580
13581 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13582 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13583 - gr_saved * UNITS_PER_WORD);
13584 mem = gen_frame_mem (BLKmode, ptr);
13585 set_mem_alias_set (mem, get_varargs_alias_set ());
13586
13587 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13588 mem, gr_saved);
13589 }
13590 if (vr_saved > 0)
13591 {
13592 /* We can't use move_block_from_reg, because it will use
13593 the wrong mode, storing D regs only. */
13594 machine_mode mode = TImode;
13595 int off, i, vr_start;
13596
13597 /* Set OFF to the offset from virtual_incoming_args_rtx of
13598 the first vector register. The VR save area lies below
13599 the GR one, and is aligned to 16 bytes. */
13600 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13601 STACK_BOUNDARY / BITS_PER_UNIT);
13602 off -= vr_saved * UNITS_PER_VREG;
13603
13604 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13605 for (i = 0; i < vr_saved; ++i)
13606 {
13607 rtx ptr, mem;
13608
13609 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13610 mem = gen_frame_mem (mode, ptr);
13611 set_mem_alias_set (mem, get_varargs_alias_set ());
13612 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13613 off += UNITS_PER_VREG;
13614 }
13615 }
13616 }
13617
13618 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13619 any complication of having crtl->args.pretend_args_size changed. */
13620 cfun->machine->frame.saved_varargs_size
13621 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13622 STACK_BOUNDARY / BITS_PER_UNIT)
13623 + vr_saved * UNITS_PER_VREG);
13624 }
13625
13626 static void
13627 aarch64_conditional_register_usage (void)
13628 {
13629 int i;
13630 if (!TARGET_FLOAT)
13631 {
13632 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13633 {
13634 fixed_regs[i] = 1;
13635 call_used_regs[i] = 1;
13636 }
13637 }
13638 if (!TARGET_SVE)
13639 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13640 {
13641 fixed_regs[i] = 1;
13642 call_used_regs[i] = 1;
13643 }
13644
13645 /* When tracking speculation, we need a couple of call-clobbered registers
13646 to track the speculation state. It would be nice to just use
13647 IP0 and IP1, but currently there are numerous places that just
13648 assume these registers are free for other uses (e.g. pointer
13649 authentication). */
13650 if (aarch64_track_speculation)
13651 {
13652 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13653 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13654 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13655 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13656 }
13657 }
13658
13659 /* Walk down the type tree of TYPE counting consecutive base elements.
13660 If *MODEP is VOIDmode, then set it to the first valid floating point
13661 type. If a non-floating point type is found, or if a floating point
13662 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13663 otherwise return the count in the sub-tree. */
13664 static int
13665 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13666 {
13667 machine_mode mode;
13668 HOST_WIDE_INT size;
13669
13670 switch (TREE_CODE (type))
13671 {
13672 case REAL_TYPE:
13673 mode = TYPE_MODE (type);
13674 if (mode != DFmode && mode != SFmode
13675 && mode != TFmode && mode != HFmode)
13676 return -1;
13677
13678 if (*modep == VOIDmode)
13679 *modep = mode;
13680
13681 if (*modep == mode)
13682 return 1;
13683
13684 break;
13685
13686 case COMPLEX_TYPE:
13687 mode = TYPE_MODE (TREE_TYPE (type));
13688 if (mode != DFmode && mode != SFmode
13689 && mode != TFmode && mode != HFmode)
13690 return -1;
13691
13692 if (*modep == VOIDmode)
13693 *modep = mode;
13694
13695 if (*modep == mode)
13696 return 2;
13697
13698 break;
13699
13700 case VECTOR_TYPE:
13701 /* Use V2SImode and V4SImode as representatives of all 64-bit
13702 and 128-bit vector types. */
13703 size = int_size_in_bytes (type);
13704 switch (size)
13705 {
13706 case 8:
13707 mode = V2SImode;
13708 break;
13709 case 16:
13710 mode = V4SImode;
13711 break;
13712 default:
13713 return -1;
13714 }
13715
13716 if (*modep == VOIDmode)
13717 *modep = mode;
13718
13719 /* Vector modes are considered to be opaque: two vectors are
13720 equivalent for the purposes of being homogeneous aggregates
13721 if they are the same size. */
13722 if (*modep == mode)
13723 return 1;
13724
13725 break;
13726
13727 case ARRAY_TYPE:
13728 {
13729 int count;
13730 tree index = TYPE_DOMAIN (type);
13731
13732 /* Can't handle incomplete types nor sizes that are not
13733 fixed. */
13734 if (!COMPLETE_TYPE_P (type)
13735 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13736 return -1;
13737
13738 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13739 if (count == -1
13740 || !index
13741 || !TYPE_MAX_VALUE (index)
13742 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13743 || !TYPE_MIN_VALUE (index)
13744 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13745 || count < 0)
13746 return -1;
13747
13748 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13749 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13750
13751 /* There must be no padding. */
13752 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13753 count * GET_MODE_BITSIZE (*modep)))
13754 return -1;
13755
13756 return count;
13757 }
13758
13759 case RECORD_TYPE:
13760 {
13761 int count = 0;
13762 int sub_count;
13763 tree field;
13764
13765 /* Can't handle incomplete types nor sizes that are not
13766 fixed. */
13767 if (!COMPLETE_TYPE_P (type)
13768 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13769 return -1;
13770
13771 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13772 {
13773 if (TREE_CODE (field) != FIELD_DECL)
13774 continue;
13775
13776 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13777 if (sub_count < 0)
13778 return -1;
13779 count += sub_count;
13780 }
13781
13782 /* There must be no padding. */
13783 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13784 count * GET_MODE_BITSIZE (*modep)))
13785 return -1;
13786
13787 return count;
13788 }
13789
13790 case UNION_TYPE:
13791 case QUAL_UNION_TYPE:
13792 {
13793 /* These aren't very interesting except in a degenerate case. */
13794 int count = 0;
13795 int sub_count;
13796 tree field;
13797
13798 /* Can't handle incomplete types nor sizes that are not
13799 fixed. */
13800 if (!COMPLETE_TYPE_P (type)
13801 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13802 return -1;
13803
13804 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13805 {
13806 if (TREE_CODE (field) != FIELD_DECL)
13807 continue;
13808
13809 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13810 if (sub_count < 0)
13811 return -1;
13812 count = count > sub_count ? count : sub_count;
13813 }
13814
13815 /* There must be no padding. */
13816 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13817 count * GET_MODE_BITSIZE (*modep)))
13818 return -1;
13819
13820 return count;
13821 }
13822
13823 default:
13824 break;
13825 }
13826
13827 return -1;
13828 }
13829
13830 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13831 type as described in AAPCS64 \S 4.1.2.
13832
13833 See the comment above aarch64_composite_type_p for the notes on MODE. */
13834
13835 static bool
13836 aarch64_short_vector_p (const_tree type,
13837 machine_mode mode)
13838 {
13839 poly_int64 size = -1;
13840
13841 if (type && TREE_CODE (type) == VECTOR_TYPE)
13842 size = int_size_in_bytes (type);
13843 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
13844 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
13845 size = GET_MODE_SIZE (mode);
13846
13847 return known_eq (size, 8) || known_eq (size, 16);
13848 }
13849
13850 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13851 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
13852 array types. The C99 floating-point complex types are also considered
13853 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
13854 types, which are GCC extensions and out of the scope of AAPCS64, are
13855 treated as composite types here as well.
13856
13857 Note that MODE itself is not sufficient in determining whether a type
13858 is such a composite type or not. This is because
13859 stor-layout.c:compute_record_mode may have already changed the MODE
13860 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
13861 structure with only one field may have its MODE set to the mode of the
13862 field. Also an integer mode whose size matches the size of the
13863 RECORD_TYPE type may be used to substitute the original mode
13864 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
13865 solely relied on. */
13866
13867 static bool
13868 aarch64_composite_type_p (const_tree type,
13869 machine_mode mode)
13870 {
13871 if (aarch64_short_vector_p (type, mode))
13872 return false;
13873
13874 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
13875 return true;
13876
13877 if (mode == BLKmode
13878 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
13879 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
13880 return true;
13881
13882 return false;
13883 }
13884
13885 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13886 shall be passed or returned in simd/fp register(s) (providing these
13887 parameter passing registers are available).
13888
13889 Upon successful return, *COUNT returns the number of needed registers,
13890 *BASE_MODE returns the mode of the individual register and, when IS_HA
13891 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
13892 floating-point aggregate or a homogeneous short-vector aggregate. */
13893
13894 static bool
13895 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
13896 const_tree type,
13897 machine_mode *base_mode,
13898 int *count,
13899 bool *is_ha)
13900 {
13901 machine_mode new_mode = VOIDmode;
13902 bool composite_p = aarch64_composite_type_p (type, mode);
13903
13904 if (is_ha != NULL) *is_ha = false;
13905
13906 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
13907 || aarch64_short_vector_p (type, mode))
13908 {
13909 *count = 1;
13910 new_mode = mode;
13911 }
13912 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
13913 {
13914 if (is_ha != NULL) *is_ha = true;
13915 *count = 2;
13916 new_mode = GET_MODE_INNER (mode);
13917 }
13918 else if (type && composite_p)
13919 {
13920 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
13921
13922 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
13923 {
13924 if (is_ha != NULL) *is_ha = true;
13925 *count = ag_count;
13926 }
13927 else
13928 return false;
13929 }
13930 else
13931 return false;
13932
13933 *base_mode = new_mode;
13934 return true;
13935 }
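
/* Illustrative sketch, not part of the original sources: example types and
   how the candidate check above classifies them. The type names are
   hypothetical; int32x4_t comes from arm_neon.h. */

#include <arm_neon.h>

struct hfa3   { double x, y, z; };      /* HFA: three DFmode registers.     */
struct hva2   { int32x4_t a, b; };      /* HVA: two 128-bit vector regs.    */
/* _Complex double is treated as a two-element homogeneous aggregate.       */
struct mixed  { float f; int i; };      /* Mixed element types: rejected.   */
struct toobig { double d[5]; };         /* Five elements > HA_MAX_NUM_FLDS:
                                           rejected.                        */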
13936
13937 /* Implement TARGET_STRUCT_VALUE_RTX. */
13938
13939 static rtx
13940 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
13941 int incoming ATTRIBUTE_UNUSED)
13942 {
13943 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
13944 }
13945
13946 /* Implements target hook vector_mode_supported_p. */
13947 static bool
13948 aarch64_vector_mode_supported_p (machine_mode mode)
13949 {
13950 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13951 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
13952 }
13953
13954 /* Return appropriate SIMD container
13955 for MODE within a vector of WIDTH bits. */
13956 static machine_mode
13957 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
13958 {
13959 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
13960 switch (mode)
13961 {
13962 case E_DFmode:
13963 return VNx2DFmode;
13964 case E_SFmode:
13965 return VNx4SFmode;
13966 case E_HFmode:
13967 return VNx8HFmode;
13968 case E_DImode:
13969 return VNx2DImode;
13970 case E_SImode:
13971 return VNx4SImode;
13972 case E_HImode:
13973 return VNx8HImode;
13974 case E_QImode:
13975 return VNx16QImode;
13976 default:
13977 return word_mode;
13978 }
13979
13980 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
13981 if (TARGET_SIMD)
13982 {
13983 if (known_eq (width, 128))
13984 switch (mode)
13985 {
13986 case E_DFmode:
13987 return V2DFmode;
13988 case E_SFmode:
13989 return V4SFmode;
13990 case E_HFmode:
13991 return V8HFmode;
13992 case E_SImode:
13993 return V4SImode;
13994 case E_HImode:
13995 return V8HImode;
13996 case E_QImode:
13997 return V16QImode;
13998 case E_DImode:
13999 return V2DImode;
14000 default:
14001 break;
14002 }
14003 else
14004 switch (mode)
14005 {
14006 case E_SFmode:
14007 return V2SFmode;
14008 case E_HFmode:
14009 return V4HFmode;
14010 case E_SImode:
14011 return V2SImode;
14012 case E_HImode:
14013 return V4HImode;
14014 case E_QImode:
14015 return V8QImode;
14016 default:
14017 break;
14018 }
14019 }
14020 return word_mode;
14021 }
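
/* Illustrative sketch, not part of the original sources: GNU vector typedefs
   whose modes match the fixed-width containers returned above (the typedef
   names are hypothetical). With TARGET_SVE the function instead returns the
   variable-length VNx* modes, which have no fixed-size C-level equivalent. */

typedef float     v4sf __attribute__ ((vector_size (16)));  /* V4SFmode.  */
typedef float     v2sf __attribute__ ((vector_size (8)));   /* V2SFmode.  */
typedef short     v8hi __attribute__ ((vector_size (16)));  /* V8HImode.  */
typedef long long v2di __attribute__ ((vector_size (16)));  /* V2DImode.  */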
14022
14023 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14024 static machine_mode
14025 aarch64_preferred_simd_mode (scalar_mode mode)
14026 {
14027 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14028 return aarch64_simd_container_mode (mode, bits);
14029 }
14030
14031 /* Return a list of possible vector sizes for the vectorizer
14032 to iterate over. */
14033 static void
14034 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
14035 {
14036 if (TARGET_SVE)
14037 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14038 sizes->safe_push (16);
14039 sizes->safe_push (8);
14040 }
14041
14042 /* Implement TARGET_MANGLE_TYPE. */
14043
14044 static const char *
14045 aarch64_mangle_type (const_tree type)
14046 {
14047 /* The AArch64 ABI documents say that "__va_list" has to be
14048 mangled as if it is in the "std" namespace. */
14049 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14050 return "St9__va_list";
14051
14052 /* Half-precision float. */
14053 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14054 return "Dh";
14055
14056 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14057 builtin types. */
14058 if (TYPE_NAME (type) != NULL)
14059 return aarch64_mangle_builtin_type (type);
14060
14061 /* Use the default mangling. */
14062 return NULL;
14063 }
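
/* Illustrative sketch, not part of the original sources: the effect of the
   special manglings above, assuming the declarations below are compiled as
   C++. Function names are hypothetical; results can be checked with
   c++filt on the object file. */

void f (__fp16 x);              /* __fp16 mangles as "Dh":  _Z1fDh.          */
void g (__builtin_va_list ap);  /* va_list mangles as "St9__va_list":
                                   _Z1gSt9__va_list.                         */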
14064
14065 /* Find the first rtx_insn before insn that will generate an assembly
14066 instruction. */
14067
14068 static rtx_insn *
14069 aarch64_prev_real_insn (rtx_insn *insn)
14070 {
14071 if (!insn)
14072 return NULL;
14073
14074 do
14075 {
14076 insn = prev_real_insn (insn);
14077 }
14078 while (insn && recog_memoized (insn) < 0);
14079
14080 return insn;
14081 }
14082
14083 static bool
14084 is_madd_op (enum attr_type t1)
14085 {
14086 unsigned int i;
14087 /* A number of these may be AArch32 only. */
14088 enum attr_type mlatypes[] = {
14089 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14090 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14091 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14092 };
14093
14094 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14095 {
14096 if (t1 == mlatypes[i])
14097 return true;
14098 }
14099
14100 return false;
14101 }
14102
14103 /* Check if there is a register dependency between a load and the insn
14104 for which we hold recog_data. */
14105
14106 static bool
14107 dep_between_memop_and_curr (rtx memop)
14108 {
14109 rtx load_reg;
14110 int opno;
14111
14112 gcc_assert (GET_CODE (memop) == SET);
14113
14114 if (!REG_P (SET_DEST (memop)))
14115 return false;
14116
14117 load_reg = SET_DEST (memop);
14118 for (opno = 1; opno < recog_data.n_operands; opno++)
14119 {
14120 rtx operand = recog_data.operand[opno];
14121 if (REG_P (operand)
14122 && reg_overlap_mentioned_p (load_reg, operand))
14123 return true;
14124
14125 }
14126 return false;
14127 }
14128
14129
14130 /* When working around the Cortex-A53 erratum 835769,
14131 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14132 instruction and has a preceding memory instruction such that a NOP
14133 should be inserted between them. */
14134
14135 bool
14136 aarch64_madd_needs_nop (rtx_insn* insn)
14137 {
14138 enum attr_type attr_type;
14139 rtx_insn *prev;
14140 rtx body;
14141
14142 if (!TARGET_FIX_ERR_A53_835769)
14143 return false;
14144
14145 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14146 return false;
14147
14148 attr_type = get_attr_type (insn);
14149 if (!is_madd_op (attr_type))
14150 return false;
14151
14152 prev = aarch64_prev_real_insn (insn);
14153 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14154 Restore recog state to INSN to avoid state corruption. */
14155 extract_constrain_insn_cached (insn);
14156
14157 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14158 return false;
14159
14160 body = single_set (prev);
14161
14162 /* If the previous insn is a memory op and there is no dependency between
14163 it and the DImode madd, emit a NOP between them. If body is NULL then we
14164 have a complex memory operation, probably a load/store pair.
14165 Be conservative for now and emit a NOP. */
14166 if (GET_MODE (recog_data.operand[0]) == DImode
14167 && (!body || !dep_between_memop_and_curr (body)))
14168 return true;
14169
14170 return false;
14171
14172 }
14173
14174
14175 /* Implement FINAL_PRESCAN_INSN. */
14176
14177 void
14178 aarch64_final_prescan_insn (rtx_insn *insn)
14179 {
14180 if (aarch64_madd_needs_nop (insn))
14181 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14182 }
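
/* Illustrative sketch, not part of the original sources: source that can
   trigger the workaround above. With -mfix-cortex-a53-835769, if the backend
   happens to schedule the load of *p immediately before the 64-bit
   multiply-accumulate, FINAL_PRESCAN_INSN prints a "nop" between the two
   instructions. The function name is hypothetical and the exact schedule
   depends on the optimisation level. */

static long
madd_after_load (long *p, long a, long b)
{
  long acc = *p;
  return acc + a * b;
}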
14183
14184
14185 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14186 instruction. */
14187
14188 bool
14189 aarch64_sve_index_immediate_p (rtx base_or_step)
14190 {
14191 return (CONST_INT_P (base_or_step)
14192 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14193 }
14194
14195 /* Return true if X is a valid immediate for the SVE ADD and SUB
14196 instructions. Negate X first if NEGATE_P is true. */
14197
14198 bool
14199 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14200 {
14201 rtx elt;
14202
14203 if (!const_vec_duplicate_p (x, &elt)
14204 || !CONST_INT_P (elt))
14205 return false;
14206
14207 HOST_WIDE_INT val = INTVAL (elt);
14208 if (negate_p)
14209 val = -val;
14210 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14211
14212 if (val & 0xff)
14213 return IN_RANGE (val, 0, 0xff);
14214 return IN_RANGE (val, 0, 0xff00);
14215 }
14216
14217 /* Return true if X is a valid immediate operand for an SVE logical
14218 instruction such as AND. */
14219
14220 bool
14221 aarch64_sve_bitmask_immediate_p (rtx x)
14222 {
14223 rtx elt;
14224
14225 return (const_vec_duplicate_p (x, &elt)
14226 && CONST_INT_P (elt)
14227 && aarch64_bitmask_imm (INTVAL (elt),
14228 GET_MODE_INNER (GET_MODE (x))));
14229 }
14230
14231 /* Return true if X is a valid immediate for the SVE DUP and CPY
14232 instructions. */
14233
14234 bool
14235 aarch64_sve_dup_immediate_p (rtx x)
14236 {
14237 rtx elt;
14238
14239 if (!const_vec_duplicate_p (x, &elt)
14240 || !CONST_INT_P (elt))
14241 return false;
14242
14243 HOST_WIDE_INT val = INTVAL (elt);
14244 if (val & 0xff)
14245 return IN_RANGE (val, -0x80, 0x7f);
14246 return IN_RANGE (val, -0x8000, 0x7f00);
14247 }
14248
14249 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14250 SIGNED_P says whether the operand is signed rather than unsigned. */
14251
14252 bool
14253 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14254 {
14255 rtx elt;
14256
14257 return (const_vec_duplicate_p (x, &elt)
14258 && CONST_INT_P (elt)
14259 && (signed_p
14260 ? IN_RANGE (INTVAL (elt), -16, 15)
14261 : IN_RANGE (INTVAL (elt), 0, 127)));
14262 }
14263
14264 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14265 instruction. Negate X first if NEGATE_P is true. */
14266
14267 bool
14268 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14269 {
14270 rtx elt;
14271 REAL_VALUE_TYPE r;
14272
14273 if (!const_vec_duplicate_p (x, &elt)
14274 || GET_CODE (elt) != CONST_DOUBLE)
14275 return false;
14276
14277 r = *CONST_DOUBLE_REAL_VALUE (elt);
14278
14279 if (negate_p)
14280 r = real_value_negate (&r);
14281
14282 if (real_equal (&r, &dconst1))
14283 return true;
14284 if (real_equal (&r, &dconsthalf))
14285 return true;
14286 return false;
14287 }
14288
14289 /* Return true if X is a valid immediate operand for an SVE FMUL
14290 instruction. */
14291
14292 bool
14293 aarch64_sve_float_mul_immediate_p (rtx x)
14294 {
14295 rtx elt;
14296
14297 /* GCC will never generate a multiply with an immediate of 2, so there is no
14298 point testing for it (even though it is a valid constant). */
14299 return (const_vec_duplicate_p (x, &elt)
14300 && GET_CODE (elt) == CONST_DOUBLE
14301 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14302 }
14303
14304 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14305 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14306 is nonnull, use it to describe valid immediates. */
14307 static bool
14308 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14309 simd_immediate_info *info,
14310 enum simd_immediate_check which,
14311 simd_immediate_info::insn_type insn)
14312 {
14313 /* Try a 4-byte immediate with LSL. */
14314 for (unsigned int shift = 0; shift < 32; shift += 8)
14315 if ((val32 & (0xff << shift)) == val32)
14316 {
14317 if (info)
14318 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14319 simd_immediate_info::LSL, shift);
14320 return true;
14321 }
14322
14323 /* Try a 2-byte immediate with LSL. */
14324 unsigned int imm16 = val32 & 0xffff;
14325 if (imm16 == (val32 >> 16))
14326 for (unsigned int shift = 0; shift < 16; shift += 8)
14327 if ((imm16 & (0xff << shift)) == imm16)
14328 {
14329 if (info)
14330 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14331 simd_immediate_info::LSL, shift);
14332 return true;
14333 }
14334
14335 /* Try a 4-byte immediate with MSL, except for cases that MVN
14336 can handle. */
14337 if (which == AARCH64_CHECK_MOV)
14338 for (unsigned int shift = 8; shift < 24; shift += 8)
14339 {
14340 unsigned int low = (1 << shift) - 1;
14341 if (((val32 & (0xff << shift)) | low) == val32)
14342 {
14343 if (info)
14344 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14345 simd_immediate_info::MSL, shift);
14346 return true;
14347 }
14348 }
14349
14350 return false;
14351 }
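
/* Worked examples for the cases above (illustrative only):
   - val32 == 0x00ab0000 matches the 4-byte LSL case with value 0xab and
     shift 16.
   - val32 == 0x4a004a00 repeats a 16-bit pattern and matches the 2-byte
     LSL case with value 0x4a and shift 8.
   - val32 == 0x0012ffff matches the MSL case (value 0x12, shift 16), since
     all bits below the shifted byte are ones.  */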
14352
14353 /* Return true if replicating VAL64 is a valid immediate for the
14354 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14355 use it to describe valid immediates. */
14356 static bool
14357 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14358 simd_immediate_info *info,
14359 enum simd_immediate_check which)
14360 {
14361 unsigned int val32 = val64 & 0xffffffff;
14362 unsigned int val16 = val64 & 0xffff;
14363 unsigned int val8 = val64 & 0xff;
14364
14365 if (val32 == (val64 >> 32))
14366 {
14367 if ((which & AARCH64_CHECK_ORR) != 0
14368 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14369 simd_immediate_info::MOV))
14370 return true;
14371
14372 if ((which & AARCH64_CHECK_BIC) != 0
14373 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14374 simd_immediate_info::MVN))
14375 return true;
14376
14377 /* Try using a replicated byte. */
14378 if (which == AARCH64_CHECK_MOV
14379 && val16 == (val32 >> 16)
14380 && val8 == (val16 >> 8))
14381 {
14382 if (info)
14383 *info = simd_immediate_info (QImode, val8);
14384 return true;
14385 }
14386 }
14387
14388 /* Try using a bit-to-bytemask. */
14389 if (which == AARCH64_CHECK_MOV)
14390 {
14391 unsigned int i;
14392 for (i = 0; i < 64; i += 8)
14393 {
14394 unsigned char byte = (val64 >> i) & 0xff;
14395 if (byte != 0 && byte != 0xff)
14396 break;
14397 }
14398 if (i == 64)
14399 {
14400 if (info)
14401 *info = simd_immediate_info (DImode, val64);
14402 return true;
14403 }
14404 }
14405 return false;
14406 }
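
/* Illustrative examples for the checks above (under AARCH64_CHECK_MOV):
   0x2929292929292929 is handled by the replicated-byte case (QImode value
   0x29), while a value such as 0xff0000ff00ffff00, in which every byte is
   either 0x00 or 0xff, is handled by the bit-to-bytemask case as a DImode
   immediate.  */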
14407
14408 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14409 instruction. If INFO is nonnull, use it to describe valid immediates. */
14410
14411 static bool
14412 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14413 simd_immediate_info *info)
14414 {
14415 scalar_int_mode mode = DImode;
14416 unsigned int val32 = val64 & 0xffffffff;
14417 if (val32 == (val64 >> 32))
14418 {
14419 mode = SImode;
14420 unsigned int val16 = val32 & 0xffff;
14421 if (val16 == (val32 >> 16))
14422 {
14423 mode = HImode;
14424 unsigned int val8 = val16 & 0xff;
14425 if (val8 == (val16 >> 8))
14426 mode = QImode;
14427 }
14428 }
14429 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14430 if (IN_RANGE (val, -0x80, 0x7f))
14431 {
14432 /* DUP with no shift. */
14433 if (info)
14434 *info = simd_immediate_info (mode, val);
14435 return true;
14436 }
14437 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14438 {
14439 /* DUP with LSL #8. */
14440 if (info)
14441 *info = simd_immediate_info (mode, val);
14442 return true;
14443 }
14444 if (aarch64_bitmask_imm (val64, mode))
14445 {
14446 /* DUPM. */
14447 if (info)
14448 *info = simd_immediate_info (mode, val);
14449 return true;
14450 }
14451 return false;
14452 }
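
/* Illustrative walk-through of the narrowing above: 0x0101010101010101
   narrows all the way to QImode with value 1 and is handled as a plain DUP;
   0xff00ff00ff00ff00 narrows to HImode with value -256 and is handled by
   the DUP-with-LSL #8 case; 0x00ff00ff00ff00ff narrows to HImode with value
   255, fails both DUP ranges, and should instead be accepted by the DUPM
   (bitmask immediate) case.  */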
14453
14454 /* Return true if OP is a valid SIMD immediate for the operation
14455 described by WHICH. If INFO is nonnull, use it to describe valid
14456 immediates. */
14457 bool
14458 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14459 enum simd_immediate_check which)
14460 {
14461 machine_mode mode = GET_MODE (op);
14462 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14463 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14464 return false;
14465
14466 scalar_mode elt_mode = GET_MODE_INNER (mode);
14467 rtx base, step;
14468 unsigned int n_elts;
14469 if (GET_CODE (op) == CONST_VECTOR
14470 && CONST_VECTOR_DUPLICATE_P (op))
14471 n_elts = CONST_VECTOR_NPATTERNS (op);
14472 else if ((vec_flags & VEC_SVE_DATA)
14473 && const_vec_series_p (op, &base, &step))
14474 {
14475 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14476 if (!aarch64_sve_index_immediate_p (base)
14477 || !aarch64_sve_index_immediate_p (step))
14478 return false;
14479
14480 if (info)
14481 *info = simd_immediate_info (elt_mode, base, step);
14482 return true;
14483 }
14484 else if (GET_CODE (op) == CONST_VECTOR
14485 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14486 /* N_ELTS set above. */;
14487 else
14488 return false;
14489
14490 /* Handle PFALSE and PTRUE. */
14491 if (vec_flags & VEC_SVE_PRED)
14492 return (op == CONST0_RTX (mode)
14493 || op == CONSTM1_RTX (mode));
14494
14495 scalar_float_mode elt_float_mode;
14496 if (n_elts == 1
14497 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14498 {
14499 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14500 if (aarch64_float_const_zero_rtx_p (elt)
14501 || aarch64_float_const_representable_p (elt))
14502 {
14503 if (info)
14504 *info = simd_immediate_info (elt_float_mode, elt);
14505 return true;
14506 }
14507 }
14508
14509 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14510 if (elt_size > 8)
14511 return false;
14512
14513 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14514
14515 /* Expand the vector constant out into a byte vector, with the least
14516 significant byte of the register first. */
14517 auto_vec<unsigned char, 16> bytes;
14518 bytes.reserve (n_elts * elt_size);
14519 for (unsigned int i = 0; i < n_elts; i++)
14520 {
14521 /* The vector is provided in gcc endian-neutral fashion.
14522 For aarch64_be Advanced SIMD, it must be laid out in the vector
14523 register in reverse order. */
14524 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14525 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14526
14527 if (elt_mode != elt_int_mode)
14528 elt = gen_lowpart (elt_int_mode, elt);
14529
14530 if (!CONST_INT_P (elt))
14531 return false;
14532
14533 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14534 for (unsigned int byte = 0; byte < elt_size; byte++)
14535 {
14536 bytes.quick_push (elt_val & 0xff);
14537 elt_val >>= BITS_PER_UNIT;
14538 }
14539 }
14540
14541 /* The immediate must repeat every eight bytes. */
14542 unsigned int nbytes = bytes.length ();
14543 for (unsigned i = 8; i < nbytes; ++i)
14544 if (bytes[i] != bytes[i - 8])
14545 return false;
14546
14547 /* Get the repeating 8-byte value as an integer. No endian correction
14548 is needed here because bytes is already in lsb-first order. */
14549 unsigned HOST_WIDE_INT val64 = 0;
14550 for (unsigned int i = 0; i < 8; i++)
14551 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14552 << (i * BITS_PER_UNIT));
14553
14554 if (vec_flags & VEC_SVE_DATA)
14555 return aarch64_sve_valid_immediate (val64, info);
14556 else
14557 return aarch64_advsimd_valid_immediate (val64, info, which);
14558 }
14559
14560 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14561 has a step that is a valid immediate for an SVE INDEX instruction.
14562 Return the step if so, otherwise return null. */
14563 rtx
14564 aarch64_check_zero_based_sve_index_immediate (rtx x)
14565 {
14566 rtx base, step;
14567 if (const_vec_series_p (x, &base, &step)
14568 && base == const0_rtx
14569 && aarch64_sve_index_immediate_p (step))
14570 return step;
14571 return NULL_RTX;
14572 }
14573
14574 /* Check if immediate shift constants are within range. */
14575 bool
14576 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14577 {
14578 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14579 if (left)
14580 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14581 else
14582 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14583 }
14584
14585 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14586 operation of width WIDTH at bit position POS. */
14587
14588 rtx
14589 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14590 {
14591 gcc_assert (CONST_INT_P (width));
14592 gcc_assert (CONST_INT_P (pos));
14593
14594 unsigned HOST_WIDE_INT mask
14595 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14596 return GEN_INT (mask << UINTVAL (pos));
14597 }
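
/* For example, WIDTH == 8 and POS == 16 give ((1 << 8) - 1) << 16,
   i.e. the mask 0xff0000.  */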
14598
14599 bool
14600 aarch64_mov_operand_p (rtx x, machine_mode mode)
14601 {
14602 if (GET_CODE (x) == HIGH
14603 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14604 return true;
14605
14606 if (CONST_INT_P (x))
14607 return true;
14608
14609 if (VECTOR_MODE_P (GET_MODE (x)))
14610 return aarch64_simd_valid_immediate (x, NULL);
14611
14612 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14613 return true;
14614
14615 if (aarch64_sve_cnt_immediate_p (x))
14616 return true;
14617
14618 return aarch64_classify_symbolic_expression (x)
14619 == SYMBOL_TINY_ABSOLUTE;
14620 }
14621
14622 /* Return a const_int vector of VAL. */
14623 rtx
14624 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14625 {
14626 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14627 return gen_const_vec_duplicate (mode, c);
14628 }
14629
14630 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14631
14632 bool
14633 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14634 {
14635 machine_mode vmode;
14636
14637 vmode = aarch64_simd_container_mode (mode, 64);
14638 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14639 return aarch64_simd_valid_immediate (op_v, NULL);
14640 }
14641
14642 /* Construct and return a PARALLEL RTX vector with elements numbering the
14643 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14644 the vector - from the perspective of the architecture. This does not
14645 line up with GCC's perspective on lane numbers, so we end up with
14646 different masks depending on our target endian-ness. The diagram
14647 below may help. We must draw the distinction when building masks
14648 which select one half of the vector. An instruction selecting
14649 architectural low-lanes for a big-endian target must be described using
14650 a mask selecting GCC high-lanes.
14651
14652 Big-Endian Little-Endian
14653
14654 GCC 0 1 2 3 3 2 1 0
14655 | x | x | x | x | | x | x | x | x |
14656 Architecture 3 2 1 0 3 2 1 0
14657
14658 Low Mask: { 2, 3 } { 0, 1 }
14659 High Mask: { 0, 1 } { 2, 3 }
14660
14661 MODE Is the mode of the vector and NUNITS is the number of units in it. */
14662
14663 rtx
14664 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14665 {
14666 rtvec v = rtvec_alloc (nunits / 2);
14667 int high_base = nunits / 2;
14668 int low_base = 0;
14669 int base;
14670 rtx t1;
14671 int i;
14672
14673 if (BYTES_BIG_ENDIAN)
14674 base = high ? low_base : high_base;
14675 else
14676 base = high ? high_base : low_base;
14677
14678 for (i = 0; i < nunits / 2; i++)
14679 RTVEC_ELT (v, i) = GEN_INT (base + i);
14680
14681 t1 = gen_rtx_PARALLEL (mode, v);
14682 return t1;
14683 }
14684
14685 /* Check OP for validity as a PARALLEL RTX vector with elements
14686 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14687 from the perspective of the architecture. See the diagram above
14688 aarch64_simd_vect_par_cnst_half for more details. */
14689
14690 bool
14691 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14692 bool high)
14693 {
14694 int nelts;
14695 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14696 return false;
14697
14698 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14699 HOST_WIDE_INT count_op = XVECLEN (op, 0);
14700 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14701 int i = 0;
14702
14703 if (count_op != count_ideal)
14704 return false;
14705
14706 for (i = 0; i < count_ideal; i++)
14707 {
14708 rtx elt_op = XVECEXP (op, 0, i);
14709 rtx elt_ideal = XVECEXP (ideal, 0, i);
14710
14711 if (!CONST_INT_P (elt_op)
14712 || INTVAL (elt_ideal) != INTVAL (elt_op))
14713 return false;
14714 }
14715 return true;
14716 }
14717
14718 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14719 HIGH (exclusive). */
14720 void
14721 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14722 const_tree exp)
14723 {
14724 HOST_WIDE_INT lane;
14725 gcc_assert (CONST_INT_P (operand));
14726 lane = INTVAL (operand);
14727
14728 if (lane < low || lane >= high)
14729 {
14730 if (exp)
14731 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14732 else
14733 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14734 }
14735 }
14736
14737 /* Perform endian correction on lane number N, which indexes a vector
14738 of mode MODE, and return the result as an SImode rtx. */
14739
14740 rtx
14741 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14742 {
14743 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14744 }
14745
14746 /* Return TRUE if OP is a MEM with a valid vector addressing mode. */
14747
14748 bool
14749 aarch64_simd_mem_operand_p (rtx op)
14750 {
14751 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14752 || REG_P (XEXP (op, 0)));
14753 }
14754
14755 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
14756
14757 bool
14758 aarch64_sve_ld1r_operand_p (rtx op)
14759 {
14760 struct aarch64_address_info addr;
14761 scalar_mode mode;
14762
14763 return (MEM_P (op)
14764 && is_a <scalar_mode> (GET_MODE (op), &mode)
14765 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14766 && addr.type == ADDRESS_REG_IMM
14767 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14768 }
14769
14770 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14771 The conditions for STR are the same. */
14772 bool
14773 aarch64_sve_ldr_operand_p (rtx op)
14774 {
14775 struct aarch64_address_info addr;
14776
14777 return (MEM_P (op)
14778 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14779 false, ADDR_QUERY_ANY)
14780 && addr.type == ADDRESS_REG_IMM);
14781 }
14782
14783 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14784 We need to be able to access the individual pieces, so the range
14785 is different from LD[234] and ST[234]. */
14786 bool
14787 aarch64_sve_struct_memory_operand_p (rtx op)
14788 {
14789 if (!MEM_P (op))
14790 return false;
14791
14792 machine_mode mode = GET_MODE (op);
14793 struct aarch64_address_info addr;
14794 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
14795 ADDR_QUERY_ANY)
14796 || addr.type != ADDRESS_REG_IMM)
14797 return false;
14798
14799 poly_int64 first = addr.const_offset;
14800 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
14801 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
14802 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
14803 }
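
/* As an illustration of the range check above (assuming
   offset_4bit_signed_scaled_p accepts offsets of -8 to +7 vectors): for a
   2-vector structure mode, both the first and the last vector must be
   addressable, so the starting offset is limited to -8 to +6 vectors rather
   than the full -8 to +7.  */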
14804
14805 /* Emit a register copy from operand to operand, taking care not to
14806 early-clobber source registers in the process.
14807
14808 COUNT is the number of components into which the copy needs to be
14809 decomposed. */
14810 void
14811 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
14812 unsigned int count)
14813 {
14814 unsigned int i;
14815 int rdest = REGNO (operands[0]);
14816 int rsrc = REGNO (operands[1]);
14817
14818 if (!reg_overlap_mentioned_p (operands[0], operands[1])
14819 || rdest < rsrc)
14820 for (i = 0; i < count; i++)
14821 emit_move_insn (gen_rtx_REG (mode, rdest + i),
14822 gen_rtx_REG (mode, rsrc + i));
14823 else
14824 for (i = 0; i < count; i++)
14825 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
14826 gen_rtx_REG (mode, rsrc + count - i - 1));
14827 }
14828
14829 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14830 one of the VSTRUCT modes: OI, CI, or XI. */
14831 int
14832 aarch64_simd_attr_length_rglist (machine_mode mode)
14833 {
14834 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
14835 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
14836 }
14837
14838 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
14839 alignment of a vector to 128 bits. SVE predicates have an alignment of
14840 16 bits. */
14841 static HOST_WIDE_INT
14842 aarch64_simd_vector_alignment (const_tree type)
14843 {
14844 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14845 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14846 be set for non-predicate vectors of booleans. Modes are the most
14847 direct way we have of identifying real SVE predicate types. */
14848 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
14849 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
14850 return MIN (align, 128);
14851 }
14852
14853 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
14854 static poly_uint64
14855 aarch64_vectorize_preferred_vector_alignment (const_tree type)
14856 {
14857 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
14858 {
14859 /* If the length of the vector is fixed, try to align to that length,
14860 otherwise don't try to align at all. */
14861 HOST_WIDE_INT result;
14862 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
14863 result = TYPE_ALIGN (TREE_TYPE (type));
14864 return result;
14865 }
14866 return TYPE_ALIGN (type);
14867 }
14868
14869 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
14870 static bool
14871 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
14872 {
14873 if (is_packed)
14874 return false;
14875
14876 /* For fixed-length vectors, check that the vectorizer will aim for
14877 full-vector alignment. This isn't true for generic GCC vectors
14878 that are wider than the ABI maximum of 128 bits. */
14879 poly_uint64 preferred_alignment =
14880 aarch64_vectorize_preferred_vector_alignment (type);
14881 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
14882 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
14883 preferred_alignment))
14884 return false;
14885
14886 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
14887 return true;
14888 }
14889
14890 /* Return true if the vector misalignment factor is supported by the
14891 target. */
14892 static bool
14893 aarch64_builtin_support_vector_misalignment (machine_mode mode,
14894 const_tree type, int misalignment,
14895 bool is_packed)
14896 {
14897 if (TARGET_SIMD && STRICT_ALIGNMENT)
14898 {
14899 /* Return false if the movmisalign pattern is not supported for this mode. */
14900 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
14901 return false;
14902
14903 /* Misalignment factor is unknown at compile time. */
14904 if (misalignment == -1)
14905 return false;
14906 }
14907 return default_builtin_support_vector_misalignment (mode, type, misalignment,
14908 is_packed);
14909 }
14910
14911 /* If VALS is a vector constant that can be loaded into a register
14912 using DUP, generate instructions to do so and return an RTX to
14913 assign to the register. Otherwise return NULL_RTX. */
14914 static rtx
14915 aarch64_simd_dup_constant (rtx vals)
14916 {
14917 machine_mode mode = GET_MODE (vals);
14918 machine_mode inner_mode = GET_MODE_INNER (mode);
14919 rtx x;
14920
14921 if (!const_vec_duplicate_p (vals, &x))
14922 return NULL_RTX;
14923
14924 /* We can load this constant by using DUP and a constant in a
14925 single ARM register. This will be cheaper than a vector
14926 load. */
14927 x = copy_to_mode_reg (inner_mode, x);
14928 return gen_vec_duplicate (mode, x);
14929 }
14930
14931
14932 /* Generate code to load VALS, which is a PARALLEL containing only
14933 constants (for vec_init) or CONST_VECTOR, efficiently into a
14934 register. Returns an RTX to copy into the register, or NULL_RTX
14935 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
14936 static rtx
14937 aarch64_simd_make_constant (rtx vals)
14938 {
14939 machine_mode mode = GET_MODE (vals);
14940 rtx const_dup;
14941 rtx const_vec = NULL_RTX;
14942 int n_const = 0;
14943 int i;
14944
14945 if (GET_CODE (vals) == CONST_VECTOR)
14946 const_vec = vals;
14947 else if (GET_CODE (vals) == PARALLEL)
14948 {
14949 /* A CONST_VECTOR must contain only CONST_INTs and
14950 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
14951 Only store valid constants in a CONST_VECTOR. */
14952 int n_elts = XVECLEN (vals, 0);
14953 for (i = 0; i < n_elts; ++i)
14954 {
14955 rtx x = XVECEXP (vals, 0, i);
14956 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14957 n_const++;
14958 }
14959 if (n_const == n_elts)
14960 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
14961 }
14962 else
14963 gcc_unreachable ();
14964
14965 if (const_vec != NULL_RTX
14966 && aarch64_simd_valid_immediate (const_vec, NULL))
14967 /* Load using MOVI/MVNI. */
14968 return const_vec;
14969 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
14970 /* Loaded using DUP. */
14971 return const_dup;
14972 else if (const_vec != NULL_RTX)
14973 /* Load from constant pool. We cannot take advantage of single-cycle
14974 LD1 because we need a PC-relative addressing mode. */
14975 return const_vec;
14976 else
14977 /* A PARALLEL containing something not valid inside CONST_VECTOR.
14978 We cannot construct an initializer. */
14979 return NULL_RTX;
14980 }
14981
14982 /* Expand a vector initialisation sequence, such that TARGET is
14983 initialised to contain VALS. */
14984
14985 void
14986 aarch64_expand_vector_init (rtx target, rtx vals)
14987 {
14988 machine_mode mode = GET_MODE (target);
14989 scalar_mode inner_mode = GET_MODE_INNER (mode);
14990 /* The number of vector elements. */
14991 int n_elts = XVECLEN (vals, 0);
14992 /* The number of vector elements which are not constant. */
14993 int n_var = 0;
14994 rtx any_const = NULL_RTX;
14995 /* The first element of vals. */
14996 rtx v0 = XVECEXP (vals, 0, 0);
14997 bool all_same = true;
14998
14999 /* Count the number of variable elements to initialise. */
15000 for (int i = 0; i < n_elts; ++i)
15001 {
15002 rtx x = XVECEXP (vals, 0, i);
15003 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15004 ++n_var;
15005 else
15006 any_const = x;
15007
15008 all_same &= rtx_equal_p (x, v0);
15009 }
15010
15011 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15012 how best to handle this. */
15013 if (n_var == 0)
15014 {
15015 rtx constant = aarch64_simd_make_constant (vals);
15016 if (constant != NULL_RTX)
15017 {
15018 emit_move_insn (target, constant);
15019 return;
15020 }
15021 }
15022
15023 /* Splat a single non-constant element if we can. */
15024 if (all_same)
15025 {
15026 rtx x = copy_to_mode_reg (inner_mode, v0);
15027 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15028 return;
15029 }
15030
15031 enum insn_code icode = optab_handler (vec_set_optab, mode);
15032 gcc_assert (icode != CODE_FOR_nothing);
15033
15034 /* If there are only variable elements, try to optimize
15035 the insertion using dup for the most common element
15036 followed by insertions. */
15037
15038 /* The algorithm will fill matches[*][0] with the earliest matching element,
15039 and matches[X][1] with the count of duplicate elements (if X is the
15040 earliest element which has duplicates). */
15041
15042 if (n_var == n_elts && n_elts <= 16)
15043 {
15044 int matches[16][2] = {0};
15045 for (int i = 0; i < n_elts; i++)
15046 {
15047 for (int j = 0; j <= i; j++)
15048 {
15049 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15050 {
15051 matches[i][0] = j;
15052 matches[j][1]++;
15053 break;
15054 }
15055 }
15056 }
15057 int maxelement = 0;
15058 int maxv = 0;
15059 for (int i = 0; i < n_elts; i++)
15060 if (matches[i][1] > maxv)
15061 {
15062 maxelement = i;
15063 maxv = matches[i][1];
15064 }
15065
15066 /* Create a duplicate of the most common element, unless all elements
15067 are equally useless to us, in which case just immediately set the
15068 vector register using the first element. */
15069
15070 if (maxv == 1)
15071 {
15072 /* For vectors of two 64-bit elements, we can do even better. */
15073 if (n_elts == 2
15074 && (inner_mode == E_DImode
15075 || inner_mode == E_DFmode))
15077 {
15078 rtx x0 = XVECEXP (vals, 0, 0);
15079 rtx x1 = XVECEXP (vals, 0, 1);
15080 /* Combine can pick up this case, but handling it directly
15081 here leaves clearer RTL.
15082
15083 This is load_pair_lanes<mode>, and also gives us a clean-up
15084 for store_pair_lanes<mode>. */
15085 if (memory_operand (x0, inner_mode)
15086 && memory_operand (x1, inner_mode)
15087 && !STRICT_ALIGNMENT
15088 && rtx_equal_p (XEXP (x1, 0),
15089 plus_constant (Pmode,
15090 XEXP (x0, 0),
15091 GET_MODE_SIZE (inner_mode))))
15092 {
15093 rtx t;
15094 if (inner_mode == DFmode)
15095 t = gen_load_pair_lanesdf (target, x0, x1);
15096 else
15097 t = gen_load_pair_lanesdi (target, x0, x1);
15098 emit_insn (t);
15099 return;
15100 }
15101 }
15102 /* The subreg-move sequence below will move into lane zero of the
15103 vector register. For big-endian we want that position to hold
15104 the last element of VALS. */
15105 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15106 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15107 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15108 }
15109 else
15110 {
15111 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15112 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15113 }
15114
15115 /* Insert the rest. */
15116 for (int i = 0; i < n_elts; i++)
15117 {
15118 rtx x = XVECEXP (vals, 0, i);
15119 if (matches[i][0] == maxelement)
15120 continue;
15121 x = copy_to_mode_reg (inner_mode, x);
15122 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15123 }
15124 return;
15125 }
15126
15127 /* Initialise a vector which is part-variable. We want to first try
15128 to build those lanes which are constant in the most efficient way we
15129 can. */
15130 if (n_var != n_elts)
15131 {
15132 rtx copy = copy_rtx (vals);
15133
15134 /* Load constant part of vector. We really don't care what goes into the
15135 parts we will overwrite, but we're more likely to be able to load the
15136 constant efficiently if it has fewer, larger, repeating parts
15137 (see aarch64_simd_valid_immediate). */
15138 for (int i = 0; i < n_elts; i++)
15139 {
15140 rtx x = XVECEXP (vals, 0, i);
15141 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15142 continue;
15143 rtx subst = any_const;
15144 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15145 {
15146 /* Look in the copied vector, as more elements are const. */
15147 rtx test = XVECEXP (copy, 0, i ^ bit);
15148 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15149 {
15150 subst = test;
15151 break;
15152 }
15153 }
15154 XVECEXP (copy, 0, i) = subst;
15155 }
15156 aarch64_expand_vector_init (target, copy);
15157 }
15158
15159 /* Insert the variable lanes directly. */
15160 for (int i = 0; i < n_elts; i++)
15161 {
15162 rtx x = XVECEXP (vals, 0, i);
15163 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15164 continue;
15165 x = copy_to_mode_reg (inner_mode, x);
15166 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15167 }
15168 }
15169
15170 static unsigned HOST_WIDE_INT
15171 aarch64_shift_truncation_mask (machine_mode mode)
15172 {
15173 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15174 return 0;
15175 return GET_MODE_UNIT_BITSIZE (mode) - 1;
15176 }
15177
15178 /* Select a format to encode pointers in exception handling data. */
15179 int
15180 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15181 {
15182 int type;
15183 switch (aarch64_cmodel)
15184 {
15185 case AARCH64_CMODEL_TINY:
15186 case AARCH64_CMODEL_TINY_PIC:
15187 case AARCH64_CMODEL_SMALL:
15188 case AARCH64_CMODEL_SMALL_PIC:
15189 case AARCH64_CMODEL_SMALL_SPIC:
15190 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15191 for everything. */
15192 type = DW_EH_PE_sdata4;
15193 break;
15194 default:
15195 /* No assumptions here. 8-byte relocs required. */
15196 type = DW_EH_PE_sdata8;
15197 break;
15198 }
15199 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15200 }
15201
15202 /* The last .arch and .tune assembly strings that we printed. */
15203 static std::string aarch64_last_printed_arch_string;
15204 static std::string aarch64_last_printed_tune_string;
15205
15206 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15207 by the function fndecl. */
15208
15209 void
15210 aarch64_declare_function_name (FILE *stream, const char* name,
15211 tree fndecl)
15212 {
15213 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15214
15215 struct cl_target_option *targ_options;
15216 if (target_parts)
15217 targ_options = TREE_TARGET_OPTION (target_parts);
15218 else
15219 targ_options = TREE_TARGET_OPTION (target_option_current_node);
15220 gcc_assert (targ_options);
15221
15222 const struct processor *this_arch
15223 = aarch64_get_arch (targ_options->x_explicit_arch);
15224
15225 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
15226 std::string extension
15227 = aarch64_get_extension_string_for_isa_flags (isa_flags,
15228 this_arch->flags);
15229 /* Only update the assembler .arch string if it is distinct from the last
15230 such string we printed. */
15231 std::string to_print = this_arch->name + extension;
15232 if (to_print != aarch64_last_printed_arch_string)
15233 {
15234 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15235 aarch64_last_printed_arch_string = to_print;
15236 }
15237
15238 /* Print the cpu name we're tuning for in the comments; it might be
15239 useful to readers of the generated asm. Do it only when it changes
15240 from function to function and verbose assembly is requested. */
15241 const struct processor *this_tune
15242 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15243
15244 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15245 {
15246 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15247 this_tune->name);
15248 aarch64_last_printed_tune_string = this_tune->name;
15249 }
15250
15251 /* Don't forget the type directive for ELF. */
15252 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15253 ASM_OUTPUT_LABEL (stream, name);
15254 }
15255
15256 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15257
15258 static void
15259 aarch64_start_file (void)
15260 {
15261 struct cl_target_option *default_options
15262 = TREE_TARGET_OPTION (target_option_default_node);
15263
15264 const struct processor *default_arch
15265 = aarch64_get_arch (default_options->x_explicit_arch);
15266 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
15267 std::string extension
15268 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15269 default_arch->flags);
15270
15271 aarch64_last_printed_arch_string = default_arch->name + extension;
15272 aarch64_last_printed_tune_string = "";
15273 asm_fprintf (asm_out_file, "\t.arch %s\n",
15274 aarch64_last_printed_arch_string.c_str ());
15275
15276 default_file_start ();
15277 }
15278
15279 /* Emit load exclusive. */
15280
15281 static void
15282 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15283 rtx mem, rtx model_rtx)
15284 {
15285 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15286 }
15287
15288 /* Emit store exclusive. */
15289
15290 static void
15291 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15292 rtx rval, rtx mem, rtx model_rtx)
15293 {
15294 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
15295 }
15296
15297 /* Emit INSN as a jump and mark it as unlikely to be taken. */
15298
15299 static void
15300 aarch64_emit_unlikely_jump (rtx insn)
15301 {
15302 rtx_insn *jump = emit_jump_insn (insn);
15303 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15304 }
15305
15306 /* Expand a compare and swap pattern. */
15307
15308 void
15309 aarch64_expand_compare_and_swap (rtx operands[])
15310 {
15311 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15312 machine_mode mode, r_mode;
15313
15314 bval = operands[0];
15315 rval = operands[1];
15316 mem = operands[2];
15317 oldval = operands[3];
15318 newval = operands[4];
15319 is_weak = operands[5];
15320 mod_s = operands[6];
15321 mod_f = operands[7];
15322 mode = GET_MODE (mem);
15323
15324 /* Normally the succ memory model must be stronger than fail, but in the
15325 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15326 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15327 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15328 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15329 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15330
15331 r_mode = mode;
15332 if (mode == QImode || mode == HImode)
15333 {
15334 r_mode = SImode;
15335 rval = gen_reg_rtx (r_mode);
15336 }
15337
15338 if (TARGET_LSE)
15339 {
15340 /* The CAS insn requires oldval and rval overlap, but we need to
15341 have a copy of oldval saved across the operation to tell if
15342 the operation is successful. */
15343 if (reg_overlap_mentioned_p (rval, oldval))
15344 rval = copy_to_mode_reg (r_mode, oldval);
15345 else
15346 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15347
15348 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15349 newval, mod_s));
15350 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15351 }
15352 else
15353 {
15354 /* The oldval predicate varies by mode. Test it and force to reg. */
15355 insn_code code = code_for_aarch64_compare_and_swap (mode);
15356 if (!insn_data[code].operand[2].predicate (oldval, mode))
15357 oldval = force_reg (mode, oldval);
15358
15359 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15360 is_weak, mod_s, mod_f));
15361 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15362 }
15363
15364 if (r_mode != mode)
15365 rval = gen_lowpart (mode, rval);
15366 emit_move_insn (operands[1], rval);
15367
15368 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15369 emit_insn (gen_rtx_SET (bval, x));
15370 }
15371
15372 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15373 sequence implementing an atomic operation. */
15374
15375 static void
15376 aarch64_emit_post_barrier (enum memmodel model)
15377 {
15378 const enum memmodel base_model = memmodel_base (model);
15379
15380 if (is_mm_sync (model)
15381 && (base_model == MEMMODEL_ACQUIRE
15382 || base_model == MEMMODEL_ACQ_REL
15383 || base_model == MEMMODEL_SEQ_CST))
15384 {
15385 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15386 }
15387 }
15388
15389 /* Split a compare and swap pattern. */
15390
15391 void
15392 aarch64_split_compare_and_swap (rtx operands[])
15393 {
15394 rtx rval, mem, oldval, newval, scratch;
15395 machine_mode mode;
15396 bool is_weak;
15397 rtx_code_label *label1, *label2;
15398 rtx x, cond;
15399 enum memmodel model;
15400 rtx model_rtx;
15401
15402 rval = operands[0];
15403 mem = operands[1];
15404 oldval = operands[2];
15405 newval = operands[3];
15406 is_weak = (operands[4] != const0_rtx);
15407 model_rtx = operands[5];
15408 scratch = operands[7];
15409 mode = GET_MODE (mem);
15410 model = memmodel_from_int (INTVAL (model_rtx));
15411
15412 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15413 loop:
15414 .label1:
15415 LD[A]XR rval, [mem]
15416 CBNZ rval, .label2
15417 ST[L]XR scratch, newval, [mem]
15418 CBNZ scratch, .label1
15419 .label2:
15420 CMP rval, 0. */
15421 bool strong_zero_p = !is_weak && oldval == const0_rtx;
15422
15423 label1 = NULL;
15424 if (!is_weak)
15425 {
15426 label1 = gen_label_rtx ();
15427 emit_label (label1);
15428 }
15429 label2 = gen_label_rtx ();
15430
15431 /* The initial load can be relaxed for a __sync operation since a final
15432 barrier will be emitted to stop code hoisting. */
15433 if (is_mm_sync (model))
15434 aarch64_emit_load_exclusive (mode, rval, mem,
15435 GEN_INT (MEMMODEL_RELAXED));
15436 else
15437 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15438
15439 if (strong_zero_p)
15440 {
15441 if (aarch64_track_speculation)
15442 {
15443 /* Emit an explicit compare instruction, so that we can correctly
15444 track the condition codes. */
15445 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
15446 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15447 }
15448 else
15449 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15450
15451 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15452 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15453 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15454 }
15455 else
15456 {
15457 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15458 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15459 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15460 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15461 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15462 }
15463
15464 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
15465
15466 if (!is_weak)
15467 {
15468 if (aarch64_track_speculation)
15469 {
15470 /* Emit an explicit compare instruction, so that we can correctly
15471 track the condition codes. */
15472 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
15473 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15474 }
15475 else
15476 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
15477
15478 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15479 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
15480 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15481 }
15482 else
15483 {
15484 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15485 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
15486 emit_insn (gen_rtx_SET (cond, x));
15487 }
15488
15489 emit_label (label2);
15490 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
15491 to set the condition flags. If this is not used it will be removed by
15492 later passes. */
15493 if (strong_zero_p)
15494 {
15495 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15496 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
15497 emit_insn (gen_rtx_SET (cond, x));
15498 }
15499 /* Emit any final barrier needed for a __sync operation. */
15500 if (is_mm_sync (model))
15501 aarch64_emit_post_barrier (model);
15502 }
15503
15504 /* Split an atomic operation. */
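/* In outline, the split sequence is a load-exclusive/store-exclusive retry
   loop (the register names below are purely illustrative; a final barrier
   may also be emitted for __sync operations):
     .label:
	LD[A]XR	old, [mem]
	<op>	new, old, value
	ST[L]XR	status, new, [mem]
	CBNZ	status, .label  */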
15505
15506 void
15507 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
15508 rtx value, rtx model_rtx, rtx cond)
15509 {
15510 machine_mode mode = GET_MODE (mem);
15511 machine_mode wmode = (mode == DImode ? DImode : SImode);
15512 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
15513 const bool is_sync = is_mm_sync (model);
15514 rtx_code_label *label;
15515 rtx x;
15516
15517 /* Split the atomic operation into a sequence. */
15518 label = gen_label_rtx ();
15519 emit_label (label);
15520
15521 if (new_out)
15522 new_out = gen_lowpart (wmode, new_out);
15523 if (old_out)
15524 old_out = gen_lowpart (wmode, old_out);
15525 else
15526 old_out = new_out;
15527 value = simplify_gen_subreg (wmode, value, mode, 0);
15528
15529 /* The initial load can be relaxed for a __sync operation since a final
15530 barrier will be emitted to stop code hoisting. */
15531 if (is_sync)
15532 aarch64_emit_load_exclusive (mode, old_out, mem,
15533 GEN_INT (MEMMODEL_RELAXED));
15534 else
15535 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
15536
15537 switch (code)
15538 {
15539 case SET:
15540 new_out = value;
15541 break;
15542
15543 case NOT:
15544 x = gen_rtx_AND (wmode, old_out, value);
15545 emit_insn (gen_rtx_SET (new_out, x));
15546 x = gen_rtx_NOT (wmode, new_out);
15547 emit_insn (gen_rtx_SET (new_out, x));
15548 break;
15549
15550 case MINUS:
15551 if (CONST_INT_P (value))
15552 {
15553 value = GEN_INT (-INTVAL (value));
15554 code = PLUS;
15555 }
15556 /* Fall through. */
15557
15558 default:
15559 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
15560 emit_insn (gen_rtx_SET (new_out, x));
15561 break;
15562 }
15563
15564 aarch64_emit_store_exclusive (mode, cond, mem,
15565 gen_lowpart (mode, new_out), model_rtx);
15566
15567 if (aarch64_track_speculation)
15568 {
15569 /* Emit an explicit compare instruction, so that we can correctly
15570 track the condition codes. */
15571 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
15572 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15573 }
15574 else
15575 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15576
15577 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15578 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
15579 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15580
15581 /* Emit any final barrier needed for a __sync operation. */
15582 if (is_sync)
15583 aarch64_emit_post_barrier (model);
15584 }
15585
15586 static void
15587 aarch64_init_libfuncs (void)
15588 {
15589 /* Half-precision float operations. The compiler handles all operations
15590 with NULL libfuncs by converting to SFmode. */
15591
15592 /* Conversions. */
15593 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
15594 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
15595
15596 /* Arithmetic. */
15597 set_optab_libfunc (add_optab, HFmode, NULL);
15598 set_optab_libfunc (sdiv_optab, HFmode, NULL);
15599 set_optab_libfunc (smul_optab, HFmode, NULL);
15600 set_optab_libfunc (neg_optab, HFmode, NULL);
15601 set_optab_libfunc (sub_optab, HFmode, NULL);
15602
15603 /* Comparisons. */
15604 set_optab_libfunc (eq_optab, HFmode, NULL);
15605 set_optab_libfunc (ne_optab, HFmode, NULL);
15606 set_optab_libfunc (lt_optab, HFmode, NULL);
15607 set_optab_libfunc (le_optab, HFmode, NULL);
15608 set_optab_libfunc (ge_optab, HFmode, NULL);
15609 set_optab_libfunc (gt_optab, HFmode, NULL);
15610 set_optab_libfunc (unord_optab, HFmode, NULL);
15611 }
15612
15613 /* Target hook for c_mode_for_suffix. */
15614 static machine_mode
15615 aarch64_c_mode_for_suffix (char suffix)
15616 {
15617 if (suffix == 'q')
15618 return TFmode;
15619
15620 return VOIDmode;
15621 }
15622
15623 /* We can only represent floating point constants which will fit in
15624 "quarter-precision" values. These values are characterised by
15625 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
15626 by:
15627
15628 (-1)^s * (n/16) * 2^r
15629
15630 Where:
15631 's' is the sign bit.
15632 'n' is an integer in the range 16 <= n <= 31.
15633 'r' is an integer in the range -3 <= r <= 4. */
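
/* For example, 0.5 is representable as (+1) * (16/16) * 2^-1 and 31.0 as
   (+1) * (31/16) * 2^4, so the representable magnitudes run from 0.125
   (n = 16, r = -3) up to 31.0 (n = 31, r = 4).  */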
15634
15635 /* Return true iff X can be represented by a quarter-precision
15636 floating point immediate operand. Note, we cannot represent 0.0. */
15637 bool
15638 aarch64_float_const_representable_p (rtx x)
15639 {
15640 /* This represents our current view of how many bits
15641 make up the mantissa. */
15642 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
15643 int exponent;
15644 unsigned HOST_WIDE_INT mantissa, mask;
15645 REAL_VALUE_TYPE r, m;
15646 bool fail;
15647
15648 if (!CONST_DOUBLE_P (x))
15649 return false;
15650
15651 if (GET_MODE (x) == VOIDmode
15652 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
15653 return false;
15654
15655 r = *CONST_DOUBLE_REAL_VALUE (x);
15656
15657 /* We cannot represent infinities, NaNs or +/-zero. We won't
15658 know if we have +zero until we analyse the mantissa, but we
15659 can reject the other invalid values. */
15660 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
15661 || REAL_VALUE_MINUS_ZERO (r))
15662 return false;
15663
15664 /* Extract exponent. */
15665 r = real_value_abs (&r);
15666 exponent = REAL_EXP (&r);
15667
15668 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
15669 highest (sign) bit, with a fixed binary point at bit point_pos.
15670 m1 holds the low part of the mantissa, m2 the high part.
15671 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
15672 bits for the mantissa, this can fail (low bits will be lost). */
15673 real_ldexp (&m, &r, point_pos - exponent);
15674 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
15675
15676 /* If the low part of the mantissa has bits set we cannot represent
15677 the value. */
15678 if (w.ulow () != 0)
15679 return false;
15680 /* We have rejected the lower HOST_WIDE_INT, so update our
15681 understanding of how many bits lie in the mantissa and
15682 look only at the high HOST_WIDE_INT. */
15683 mantissa = w.elt (1);
15684 point_pos -= HOST_BITS_PER_WIDE_INT;
15685
15686 /* We can only represent values with a mantissa of the form 1.xxxx. */
15687 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
15688 if ((mantissa & mask) != 0)
15689 return false;
15690
15691 /* Having filtered unrepresentable values, we may now remove all
15692 but the highest 5 bits. */
15693 mantissa >>= point_pos - 5;
15694
15695 /* We cannot represent the value 0.0, so reject it. This is handled
15696 elsewhere. */
15697 if (mantissa == 0)
15698 return false;
15699
15700 /* Then, as bit 4 is always set, we can mask it off, leaving
15701 the mantissa in the range [0, 15]. */
15702 mantissa &= ~(1 << 4);
15703 gcc_assert (mantissa <= 15);
15704
15705 /* GCC internally does not use IEEE754-like encoding (where normalized
15706 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
15707 Our mantissa values are shifted 4 places to the left relative to
15708 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
15709 by 5 places to correct for GCC's representation. */
15710 exponent = 5 - exponent;
15711
15712 return (exponent >= 0 && exponent <= 7);
15713 }
15714
15715 /* Return the asm string for moving the AdvSIMD immediate CONST_VECTOR into a
15716 vector register of WIDTH bits, using a MOVI/MVNI, ORR or BIC immediate as
15717 selected by WHICH. */
15718 char*
15719 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
15720 enum simd_immediate_check which)
15721 {
15722 bool is_valid;
15723 static char templ[40];
15724 const char *mnemonic;
15725 const char *shift_op;
15726 unsigned int lane_count = 0;
15727 char element_char;
15728
15729 struct simd_immediate_info info;
15730
15731 /* This will return true to show const_vector is legal for use as either
15732 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
15733 It will also update INFO to show how the immediate should be generated.
15734 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
15735 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
15736 gcc_assert (is_valid);
15737
15738 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15739 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
15740
15741 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15742 {
15743 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15744 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15745 move immediate path. */
15746 if (aarch64_float_const_zero_rtx_p (info.value))
15747 info.value = GEN_INT (0);
15748 else
15749 {
15750 const unsigned int buf_size = 20;
15751 char float_buf[buf_size] = {'\0'};
15752 real_to_decimal_for_mode (float_buf,
15753 CONST_DOUBLE_REAL_VALUE (info.value),
15754 buf_size, buf_size, 1, info.elt_mode);
15755
15756 if (lane_count == 1)
15757 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15758 else
15759 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15760 lane_count, element_char, float_buf);
15761 return templ;
15762 }
15763 }
15764
15765 gcc_assert (CONST_INT_P (info.value));
15766
15767 if (which == AARCH64_CHECK_MOV)
15768 {
15769 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15770 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15771 if (lane_count == 1)
15772 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15773 mnemonic, UINTVAL (info.value));
15774 else if (info.shift)
15775 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15776 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15777 element_char, UINTVAL (info.value), shift_op, info.shift);
15778 else
15779 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15780 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15781 element_char, UINTVAL (info.value));
15782 }
15783 else
15784 {
15785 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15786 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15787 if (info.shift)
15788 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15789 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15790 element_char, UINTVAL (info.value), "lsl", info.shift);
15791 else
15792 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15793 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15794 element_char, UINTVAL (info.value));
15795 }
15796 return templ;
15797 }
15798
15799 char*
15800 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15801 {
15802
15803 /* If a floating point number was passed and we desire to use it in an
15804 integer mode do the conversion to integer. */
15805 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15806 {
15807 unsigned HOST_WIDE_INT ival;
15808 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15809 gcc_unreachable ();
15810 immediate = gen_int_mode (ival, mode);
15811 }
15812
15813 machine_mode vmode;
15814 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
15815 a 128-bit vector mode. */
15816 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15817
15818 vmode = aarch64_simd_container_mode (mode, width);
15819 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15820 return aarch64_output_simd_mov_immediate (v_op, width);
15821 }
15822
15823 /* Return the output string to use for moving immediate CONST_VECTOR
15824 into an SVE register. */
15825
15826 char *
15827 aarch64_output_sve_mov_immediate (rtx const_vector)
15828 {
15829 static char templ[40];
15830 struct simd_immediate_info info;
15831 char element_char;
15832
15833 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15834 gcc_assert (is_valid);
15835
15836 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15837
15838 if (info.step)
15839 {
15840 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15841 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15842 element_char, INTVAL (info.value), INTVAL (info.step));
15843 return templ;
15844 }
15845
15846 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15847 {
15848 if (aarch64_float_const_zero_rtx_p (info.value))
15849 info.value = GEN_INT (0);
15850 else
15851 {
15852 const int buf_size = 20;
15853 char float_buf[buf_size] = {};
15854 real_to_decimal_for_mode (float_buf,
15855 CONST_DOUBLE_REAL_VALUE (info.value),
15856 buf_size, buf_size, 1, info.elt_mode);
15857
15858 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15859 element_char, float_buf);
15860 return templ;
15861 }
15862 }
15863
15864 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15865 element_char, INTVAL (info.value));
15866 return templ;
15867 }
15868
15869 /* Return the asm format for a PTRUE instruction whose destination has
15870 mode MODE. SUFFIX is the element size suffix. */
15871
15872 char *
15873 aarch64_output_ptrue (machine_mode mode, char suffix)
15874 {
15875 unsigned int nunits;
15876 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15877 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15878 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15879 else
15880 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15881 return buf;
15882 }
15883
15884 /* Split a move of op[1] and op[2] into the low and high halves of op[0]. */
15885
15886 void
15887 aarch64_split_combinev16qi (rtx operands[3])
15888 {
15889 unsigned int dest = REGNO (operands[0]);
15890 unsigned int src1 = REGNO (operands[1]);
15891 unsigned int src2 = REGNO (operands[2]);
15892 machine_mode halfmode = GET_MODE (operands[1]);
15893 unsigned int halfregs = REG_NREGS (operands[1]);
15894 rtx destlo, desthi;
15895
15896 gcc_assert (halfmode == V16QImode);
15897
15898 if (src1 == dest && src2 == dest + halfregs)
15899 {
15900 /* No-op move. Can't split to nothing; emit something. */
15901 emit_note (NOTE_INSN_DELETED);
15902 return;
15903 }
15904
15905 /* Preserve register attributes for variable tracking. */
15906 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15907 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15908 GET_MODE_SIZE (halfmode));
15909
15910 /* Special case of reversed high/low parts. */
15911 if (reg_overlap_mentioned_p (operands[2], destlo)
15912 && reg_overlap_mentioned_p (operands[1], desthi))
15913 {
15914 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15915 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15916 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15917 }
15918 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15919 {
15920 /* Try to avoid unnecessary moves if part of the result
15921 is in the right place already. */
15922 if (src1 != dest)
15923 emit_move_insn (destlo, operands[1]);
15924 if (src2 != dest + halfregs)
15925 emit_move_insn (desthi, operands[2]);
15926 }
15927 else
15928 {
15929 if (src2 != dest + halfregs)
15930 emit_move_insn (desthi, operands[2]);
15931 if (src1 != dest)
15932 emit_move_insn (destlo, operands[1]);
15933 }
15934 }
15935
15936 /* vec_perm support. */
15937
15938 struct expand_vec_perm_d
15939 {
15940 rtx target, op0, op1;
15941 vec_perm_indices perm;
15942 machine_mode vmode;
15943 unsigned int vec_flags;
15944 bool one_vector_p;
15945 bool testing_p;
15946 };
15947
15948 /* Generate a variable permutation. */
15949
15950 static void
15951 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15952 {
15953 machine_mode vmode = GET_MODE (target);
15954 bool one_vector_p = rtx_equal_p (op0, op1);
15955
15956 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15957 gcc_checking_assert (GET_MODE (op0) == vmode);
15958 gcc_checking_assert (GET_MODE (op1) == vmode);
15959 gcc_checking_assert (GET_MODE (sel) == vmode);
15960 gcc_checking_assert (TARGET_SIMD);
15961
15962 if (one_vector_p)
15963 {
15964 if (vmode == V8QImode)
15965 {
15966 /* Expand the argument to a V16QI mode by duplicating it. */
15967 rtx pair = gen_reg_rtx (V16QImode);
15968 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15969 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15970 }
15971 else
15972 {
15973 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15974 }
15975 }
15976 else
15977 {
15978 rtx pair;
15979
15980 if (vmode == V8QImode)
15981 {
15982 pair = gen_reg_rtx (V16QImode);
15983 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15984 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15985 }
15986 else
15987 {
15988 pair = gen_reg_rtx (OImode);
15989 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15990 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15991 }
15992 }
15993 }
15994
15995 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15996 NELT is the number of elements in the vector. */
15997
15998 void
15999 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16000 unsigned int nelt)
16001 {
16002 machine_mode vmode = GET_MODE (target);
16003 bool one_vector_p = rtx_equal_p (op0, op1);
16004 rtx mask;
16005
16006 /* The TBL instruction does not use a modulo index, so we must take care
16007 of that ourselves. */
16008 mask = aarch64_simd_gen_const_vector_dup (vmode,
16009 one_vector_p ? nelt - 1 : 2 * nelt - 1);
16010 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16011
16012 /* For big-endian, we also need to reverse the index within the vector
16013 (but not which vector). */
16014 if (BYTES_BIG_ENDIAN)
16015 {
16016 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16017 if (!one_vector_p)
16018 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16019 sel = expand_simple_binop (vmode, XOR, sel, mask,
16020 NULL, 0, OPTAB_LIB_WIDEN);
16021 }
16022 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
16023 }
16024
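/* A worked example of the masking above, with values chosen purely for
   illustration.  For a two-vector V16QImode permute (nelt == 16) the
   selector is ANDed with 2 * 16 - 1 == 31, so every index wraps into the
   0..31 range that a two-register TBL can address.  On big-endian each
   index is additionally XORed with 15:

     index 3  (element 3 of the first vector)  -> 3 ^ 15  == 12
     index 19 (element 3 of the second vector) -> 19 ^ 15 == 28 == 16 + 12

   i.e. the element index is reversed within its vector while bit 4,
   which selects the vector, is left untouched.  */
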
16025 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
16026
16027 static void
16028 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
16029 {
16030 emit_insn (gen_rtx_SET (target,
16031 gen_rtx_UNSPEC (GET_MODE (target),
16032 gen_rtvec (2, op0, op1), code)));
16033 }
16034
16035 /* Expand an SVE vec_perm with the given operands. */
16036
16037 void
16038 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
16039 {
16040 machine_mode data_mode = GET_MODE (target);
16041 machine_mode sel_mode = GET_MODE (sel);
16042 /* Enforced by the pattern condition. */
16043 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
16044
16045 /* Note: vec_perm indices are supposed to wrap when they go beyond the
16046 size of the two value vectors, i.e. the upper bits of the indices
16047 are effectively ignored. SVE TBL instead produces 0 for any
16048 out-of-range indices, so we need to modulo all the vec_perm indices
16049 to ensure they are all in range. */
16050 rtx sel_reg = force_reg (sel_mode, sel);
16051
16052 /* Check if SEL only references the first values vector. */
16053 if (GET_CODE (sel) == CONST_VECTOR
16054 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
16055 {
16056 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
16057 return;
16058 }
16059
16060 /* Check if the two values vectors are the same. */
16061 if (rtx_equal_p (op0, op1))
16062 {
16063 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
16064 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16065 NULL, 0, OPTAB_DIRECT);
16066 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
16067 return;
16068 }
16069
16070 /* Run a TBL on each value vector and combine the results. */
16071
16072 rtx res0 = gen_reg_rtx (data_mode);
16073 rtx res1 = gen_reg_rtx (data_mode);
16074 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
16075 if (GET_CODE (sel) != CONST_VECTOR
16076 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
16077 {
16078 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
16079 2 * nunits - 1);
16080 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16081 NULL, 0, OPTAB_DIRECT);
16082 }
16083 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
16084 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
16085 NULL, 0, OPTAB_DIRECT);
16086 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
16087 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
16088 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
16089 else
16090 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
16091 }
16092
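/* A small example of the two-TBL expansion above, using nunits == 4 and
   illustrative selector values.  For sel == { 1, 6, 3, 5 }:

     AND with 2 * 4 - 1:   sel     == { 1, 6, 3, 5 }
     TBL op0, sel:         res0    == { op0[1], 0, op0[3], 0 }
     ADD sel, -4:          sel_sub == { -3, 2, -1, 1 }
     TBL op1, sel_sub:     res1    == { 0, op1[2], 0, op1[1] }
     ORR res0, res1:       target  == { op0[1], op1[2], op0[3], op1[1] }

   Indices that are out of range (or, after the subtraction, negative and
   hence very large when read as unsigned) produce zero, so the final OR
   merges the two partial results without interference.  */
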
16093 /* Recognize patterns suitable for the TRN instructions. */
16094 static bool
16095 aarch64_evpc_trn (struct expand_vec_perm_d *d)
16096 {
16097 HOST_WIDE_INT odd;
16098 poly_uint64 nelt = d->perm.length ();
16099 rtx out, in0, in1, x;
16100 machine_mode vmode = d->vmode;
16101
16102 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16103 return false;
16104
16105 /* Note that these are little-endian tests.
16106 We correct for big-endian later. */
16107 if (!d->perm[0].is_constant (&odd)
16108 || (odd != 0 && odd != 1)
16109 || !d->perm.series_p (0, 2, odd, 2)
16110 || !d->perm.series_p (1, 2, nelt + odd, 2))
16111 return false;
16112
16113 /* Success! */
16114 if (d->testing_p)
16115 return true;
16116
16117 in0 = d->op0;
16118 in1 = d->op1;
16119 /* We don't need a big-endian lane correction for SVE; see the comment
16120 at the head of aarch64-sve.md for details. */
16121 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16122 {
16123 x = in0, in0 = in1, in1 = x;
16124 odd = !odd;
16125 }
16126 out = d->target;
16127
16128 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16129 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16130 return true;
16131 }
16132
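/* For illustration, the (little-endian) index patterns this matches for
   a two-input V4SImode permute (nelt == 4) are:

     { 0, 4, 2, 6 }  ->  TRN1   (odd == 0)
     { 1, 5, 3, 7 }  ->  TRN2   (odd == 1)

   i.e. the even-numbered (respectively odd-numbered) elements of the two
   inputs, interleaved.  */
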
16133 /* Recognize patterns suitable for the UZP instructions. */
16134 static bool
16135 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16136 {
16137 HOST_WIDE_INT odd;
16138 rtx out, in0, in1, x;
16139 machine_mode vmode = d->vmode;
16140
16141 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16142 return false;
16143
16144 /* Note that these are little-endian tests.
16145 We correct for big-endian later. */
16146 if (!d->perm[0].is_constant (&odd)
16147 || (odd != 0 && odd != 1)
16148 || !d->perm.series_p (0, 1, odd, 2))
16149 return false;
16150
16151 /* Success! */
16152 if (d->testing_p)
16153 return true;
16154
16155 in0 = d->op0;
16156 in1 = d->op1;
16157 /* We don't need a big-endian lane correction for SVE; see the comment
16158 at the head of aarch64-sve.md for details. */
16159 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16160 {
16161 x = in0, in0 = in1, in1 = x;
16162 odd = !odd;
16163 }
16164 out = d->target;
16165
16166 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16167 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16168 return true;
16169 }
16170
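/* For illustration, the (little-endian) index patterns this matches for
   a two-input V4SImode permute (nelt == 4) are:

     { 0, 2, 4, 6 }  ->  UZP1   (odd == 0)
     { 1, 3, 5, 7 }  ->  UZP2   (odd == 1)

   i.e. the concatenation of the even-numbered (respectively
   odd-numbered) elements of the two inputs.  */
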
16171 /* Recognize patterns suitable for the ZIP instructions. */
16172 static bool
16173 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16174 {
16175 unsigned int high;
16176 poly_uint64 nelt = d->perm.length ();
16177 rtx out, in0, in1, x;
16178 machine_mode vmode = d->vmode;
16179
16180 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16181 return false;
16182
16183 /* Note that these are little-endian tests.
16184 We correct for big-endian later. */
16185 poly_uint64 first = d->perm[0];
16186 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16187 || !d->perm.series_p (0, 2, first, 1)
16188 || !d->perm.series_p (1, 2, first + nelt, 1))
16189 return false;
16190 high = maybe_ne (first, 0U);
16191
16192 /* Success! */
16193 if (d->testing_p)
16194 return true;
16195
16196 in0 = d->op0;
16197 in1 = d->op1;
16198 /* We don't need a big-endian lane correction for SVE; see the comment
16199 at the head of aarch64-sve.md for details. */
16200 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16201 {
16202 x = in0, in0 = in1, in1 = x;
16203 high = !high;
16204 }
16205 out = d->target;
16206
16207 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16208 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16209 return true;
16210 }
16211
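/* For illustration, the (little-endian) index patterns this matches for
   a two-input V4SImode permute (nelt == 4) are:

     { 0, 4, 1, 5 }  ->  ZIP1   (first == 0)
     { 2, 6, 3, 7 }  ->  ZIP2   (first == nelt / 2)

   i.e. the low (respectively high) halves of the two inputs, interleaved
   element by element.  */
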
16212 /* Recognize patterns for the EXT insn. */
16213
16214 static bool
16215 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16216 {
16217 HOST_WIDE_INT location;
16218 rtx offset;
16219
16220 /* The first element always refers to the first vector.
16221 Check if the extracted indices are increasing by one. */
16222 if (d->vec_flags == VEC_SVE_PRED
16223 || !d->perm[0].is_constant (&location)
16224 || !d->perm.series_p (0, 1, location, 1))
16225 return false;
16226
16227 /* Success! */
16228 if (d->testing_p)
16229 return true;
16230
16231 /* The case where (location == 0) is a no-op for both big- and little-endian,
16232 and is removed by the mid-end at optimization levels -O1 and higher.
16233
16234 We don't need a big-endian lane correction for SVE; see the comment
16235 at the head of aarch64-sve.md for details. */
16236 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16237 {
16238 /* After setup, we want the high elements of the first vector (stored
16239 at the LSB end of the register), and the low elements of the second
16240 vector (stored at the MSB end of the register). So swap. */
16241 std::swap (d->op0, d->op1);
16242 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16243 to_constant () is safe since this is restricted to Advanced SIMD
16244 vectors. */
16245 location = d->perm.length ().to_constant () - location;
16246 }
16247
16248 offset = GEN_INT (location);
16249 emit_set_insn (d->target,
16250 gen_rtx_UNSPEC (d->vmode,
16251 gen_rtvec (3, d->op0, d->op1, offset),
16252 UNSPEC_EXT));
16253 return true;
16254 }
16255
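/* For illustration, a two-input V4SImode permute with indices

     { 1, 2, 3, 4 }

   is matched here with location == 1: on little-endian it becomes a
   single EXT that takes the top three elements of the first vector
   followed by the bottom element of the second.  On big-endian Advanced
   SIMD the operands are swapped and location becomes 4 - 1 == 3, as in
   the code above.  */
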
16256 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16257 within each 64-bit, 32-bit or 16-bit granule. */
16258
16259 static bool
16260 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16261 {
16262 HOST_WIDE_INT diff;
16263 unsigned int i, size, unspec;
16264 machine_mode pred_mode;
16265
16266 if (d->vec_flags == VEC_SVE_PRED
16267 || !d->one_vector_p
16268 || !d->perm[0].is_constant (&diff))
16269 return false;
16270
16271 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16272 if (size == 8)
16273 {
16274 unspec = UNSPEC_REV64;
16275 pred_mode = VNx2BImode;
16276 }
16277 else if (size == 4)
16278 {
16279 unspec = UNSPEC_REV32;
16280 pred_mode = VNx4BImode;
16281 }
16282 else if (size == 2)
16283 {
16284 unspec = UNSPEC_REV16;
16285 pred_mode = VNx8BImode;
16286 }
16287 else
16288 return false;
16289
16290 unsigned int step = diff + 1;
16291 for (i = 0; i < step; ++i)
16292 if (!d->perm.series_p (i, step, diff - i, step))
16293 return false;
16294
16295 /* Success! */
16296 if (d->testing_p)
16297 return true;
16298
16299 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16300 if (d->vec_flags == VEC_SVE_DATA)
16301 {
16302 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16303 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16304 UNSPEC_MERGE_PTRUE);
16305 }
16306 emit_set_insn (d->target, src);
16307 return true;
16308 }
16309
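/* For illustration, single-input index patterns this matches include
   (values chosen for exposition):

     V8HImode  { 1, 0, 3, 2, 5, 4, 7, 6 }  ->  REV32  (size == 4)
     V4SImode  { 1, 0, 3, 2 }              ->  REV64  (size == 8)
     V16QImode { 1, 0, 3, 2, ..., 15, 14 } ->  REV16  (size == 2)

   i.e. the elements are reversed within each 16-, 32- or 64-bit granule
   while the granules themselves stay in place.  */
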
16310 /* Recognize patterns for the REV insn, which reverses elements within
16311 a full vector. */
16312
16313 static bool
16314 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16315 {
16316 poly_uint64 nelt = d->perm.length ();
16317
16318 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16319 return false;
16320
16321 if (!d->perm.series_p (0, 1, nelt - 1, -1))
16322 return false;
16323
16324 /* Success! */
16325 if (d->testing_p)
16326 return true;
16327
16328 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16329 emit_set_insn (d->target, src);
16330 return true;
16331 }
16332
16333 static bool
16334 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16335 {
16336 rtx out = d->target;
16337 rtx in0;
16338 HOST_WIDE_INT elt;
16339 machine_mode vmode = d->vmode;
16340 rtx lane;
16341
16342 if (d->vec_flags == VEC_SVE_PRED
16343 || d->perm.encoding ().encoded_nelts () != 1
16344 || !d->perm[0].is_constant (&elt))
16345 return false;
16346
16347 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16348 return false;
16349
16350 /* Success! */
16351 if (d->testing_p)
16352 return true;
16353
16354 /* The generic preparation in aarch64_expand_vec_perm_const_1
16355 swaps the operand order and the permute indices if it finds
16356 d->perm[0] to be in the second operand. Thus, we can always
16357 use d->op0 and need not do any extra arithmetic to get the
16358 correct lane number. */
16359 in0 = d->op0;
16360 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
16361
16362 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16363 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16364 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16365 return true;
16366 }
16367
16368 static bool
16369 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16370 {
16371 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16372 machine_mode vmode = d->vmode;
16373
16374 /* Make sure that the indices are constant. */
16375 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16376 for (unsigned int i = 0; i < encoded_nelts; ++i)
16377 if (!d->perm[i].is_constant ())
16378 return false;
16379
16380 if (d->testing_p)
16381 return true;
16382
16383 /* Generic code will try constant permutation twice: once with the
16384 original mode and again with the elements lowered to QImode.
16385 So wait, and don't do the selector expansion ourselves. */
16386 if (vmode != V8QImode && vmode != V16QImode)
16387 return false;
16388
16389 /* to_constant is safe since this routine is specific to Advanced SIMD
16390 vectors. */
16391 unsigned int nelt = d->perm.length ().to_constant ();
16392 for (unsigned int i = 0; i < nelt; ++i)
16393 /* If big-endian and two vectors, we end up with a weird mixed-endian
16394 mode on NEON. Reverse the index within each word but not the word
16395 itself. to_constant is safe because we checked is_constant above. */
16396 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16397 ? d->perm[i].to_constant () ^ (nelt - 1)
16398 : d->perm[i].to_constant ());
16399
16400 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16401 sel = force_reg (vmode, sel);
16402
16403 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16404 return true;
16405 }
16406
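/* As a small illustration of the big-endian correction above: for a
   two-input V16QImode permute, a selector element of 2 is rewritten as
   2 ^ 15 == 13 and an element of 17 as 17 ^ 15 == 30, so the byte index
   is reversed within each input register while the bit that chooses
   between the two registers is preserved.  */
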
16407 /* Try to implement D using an SVE TBL instruction. */
16408
16409 static bool
16410 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16411 {
16412 unsigned HOST_WIDE_INT nelt;
16413
16414 /* Permuting two variable-length vectors could overflow the
16415 index range. */
16416 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16417 return false;
16418
16419 if (d->testing_p)
16420 return true;
16421
16422 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16423 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16424 if (d->one_vector_p)
16425 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16426 else
16427 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16428 return true;
16429 }
16430
16431 static bool
16432 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16433 {
16434 /* The pattern matching functions above are written to look for a small
16435 number to begin the sequence (0, 1, N/2). If we begin with an index
16436 from the second operand, we can swap the operands. */
16437 poly_int64 nelt = d->perm.length ();
16438 if (known_ge (d->perm[0], nelt))
16439 {
16440 d->perm.rotate_inputs (1);
16441 std::swap (d->op0, d->op1);
16442 }
16443
16444 if ((d->vec_flags == VEC_ADVSIMD
16445 || d->vec_flags == VEC_SVE_DATA
16446 || d->vec_flags == VEC_SVE_PRED)
16447 && known_gt (nelt, 1))
16448 {
16449 if (aarch64_evpc_rev_local (d))
16450 return true;
16451 else if (aarch64_evpc_rev_global (d))
16452 return true;
16453 else if (aarch64_evpc_ext (d))
16454 return true;
16455 else if (aarch64_evpc_dup (d))
16456 return true;
16457 else if (aarch64_evpc_zip (d))
16458 return true;
16459 else if (aarch64_evpc_uzp (d))
16460 return true;
16461 else if (aarch64_evpc_trn (d))
16462 return true;
16463 if (d->vec_flags == VEC_SVE_DATA)
16464 return aarch64_evpc_sve_tbl (d);
16465 else if (d->vec_flags == VEC_ADVSIMD)
16466 return aarch64_evpc_tbl (d);
16467 }
16468 return false;
16469 }
16470
16471 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
16472
16473 static bool
16474 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
16475 rtx op1, const vec_perm_indices &sel)
16476 {
16477 struct expand_vec_perm_d d;
16478
16479 /* Check whether the mask can be applied to a single vector. */
16480 if (sel.ninputs () == 1
16481 || (op0 && rtx_equal_p (op0, op1)))
16482 d.one_vector_p = true;
16483 else if (sel.all_from_input_p (0))
16484 {
16485 d.one_vector_p = true;
16486 op1 = op0;
16487 }
16488 else if (sel.all_from_input_p (1))
16489 {
16490 d.one_vector_p = true;
16491 op0 = op1;
16492 }
16493 else
16494 d.one_vector_p = false;
16495
16496 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
16497 sel.nelts_per_input ());
16498 d.vmode = vmode;
16499 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
16500 d.target = target;
16501 d.op0 = op0;
16502 d.op1 = op1;
16503 d.testing_p = !target;
16504
16505 if (!d.testing_p)
16506 return aarch64_expand_vec_perm_const_1 (&d);
16507
16508 rtx_insn *last = get_last_insn ();
16509 bool ret = aarch64_expand_vec_perm_const_1 (&d);
16510 gcc_assert (last == get_last_insn ());
16511
16512 return ret;
16513 }
16514
16515 /* Generate a byte permute mask for a register of mode MODE,
16516 which has NUNITS units. */
16517
16518 rtx
16519 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
16520 {
16521 /* We have to reverse each vector because we don't have
16522 a permuted load that can reverse-load according to ABI rules. */
16523 rtx mask;
16524 rtvec v = rtvec_alloc (16);
16525 unsigned int i, j;
16526 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
16527
16528 gcc_assert (BYTES_BIG_ENDIAN);
16529 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
16530
16531 for (i = 0; i < nunits; i++)
16532 for (j = 0; j < usize; j++)
16533 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
16534 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
16535 return force_reg (V16QImode, mask);
16536 }
16537
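/* For illustration, for V8HImode (nunits == 8, unit size == 2) the
   generated V16QImode mask is

     { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }

   i.e. the bytes within each 16-bit element are reversed while the
   elements themselves stay in place.  */
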
16538 /* Return true if X is a valid second operand for the SVE instruction
16539 that implements integer comparison OP_CODE. */
16540
16541 static bool
16542 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
16543 {
16544 if (register_operand (x, VOIDmode))
16545 return true;
16546
16547 switch (op_code)
16548 {
16549 case LTU:
16550 case LEU:
16551 case GEU:
16552 case GTU:
16553 return aarch64_sve_cmp_immediate_p (x, false);
16554 case LT:
16555 case LE:
16556 case GE:
16557 case GT:
16558 case NE:
16559 case EQ:
16560 return aarch64_sve_cmp_immediate_p (x, true);
16561 default:
16562 gcc_unreachable ();
16563 }
16564 }
16565
16566 /* Use predicated SVE instructions to implement the equivalent of:
16567
16568 (set TARGET OP)
16569
16570 given that PTRUE is an all-true predicate of the appropriate mode. */
16571
16572 static void
16573 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
16574 {
16575 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16576 gen_rtvec (2, ptrue, op),
16577 UNSPEC_MERGE_PTRUE);
16578 rtx_insn *insn = emit_set_insn (target, unspec);
16579 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16580 }
16581
16582 /* Likewise, but also clobber the condition codes. */
16583
16584 static void
16585 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
16586 {
16587 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16588 gen_rtvec (2, ptrue, op),
16589 UNSPEC_MERGE_PTRUE);
16590 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
16591 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16592 }
16593
16594 /* Return the UNSPEC_COND_* code for comparison CODE. */
16595
16596 static unsigned int
16597 aarch64_unspec_cond_code (rtx_code code)
16598 {
16599 switch (code)
16600 {
16601 case NE:
16602 return UNSPEC_COND_NE;
16603 case EQ:
16604 return UNSPEC_COND_EQ;
16605 case LT:
16606 return UNSPEC_COND_LT;
16607 case GT:
16608 return UNSPEC_COND_GT;
16609 case LE:
16610 return UNSPEC_COND_LE;
16611 case GE:
16612 return UNSPEC_COND_GE;
16613 default:
16614 gcc_unreachable ();
16615 }
16616 }
16617
16618 /* Emit:
16619
16620 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
16621
16622 where <X> is the operation associated with comparison CODE. This form
16623 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
16624 semantics, such as when PRED might not be all-true and when comparing
16625 inactive lanes could have side effects. */
16626
16627 static void
16628 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
16629 rtx pred, rtx op0, rtx op1)
16630 {
16631 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
16632 gen_rtvec (3, pred, op0, op1),
16633 aarch64_unspec_cond_code (code));
16634 emit_set_insn (target, unspec);
16635 }
16636
16637 /* Expand an SVE integer comparison using the SVE equivalent of:
16638
16639 (set TARGET (CODE OP0 OP1)). */
16640
16641 void
16642 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
16643 {
16644 machine_mode pred_mode = GET_MODE (target);
16645 machine_mode data_mode = GET_MODE (op0);
16646
16647 if (!aarch64_sve_cmp_operand_p (code, op1))
16648 op1 = force_reg (data_mode, op1);
16649
16650 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16651 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16652 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
16653 }
16654
16655 /* Emit the SVE equivalent of:
16656
16657 (set TMP1 (CODE1 OP0 OP1))
16658 (set TMP2 (CODE2 OP0 OP1))
16659 (set TARGET (ior:PRED_MODE TMP1 TMP2))
16660
16661 PTRUE is an all-true predicate with the same mode as TARGET. */
16662
16663 static void
16664 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
16665 rtx ptrue, rtx op0, rtx op1)
16666 {
16667 machine_mode pred_mode = GET_MODE (ptrue);
16668 rtx tmp1 = gen_reg_rtx (pred_mode);
16669 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
16670 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
16671 rtx tmp2 = gen_reg_rtx (pred_mode);
16672 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
16673 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
16674 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
16675 }
16676
16677 /* Emit the SVE equivalent of:
16678
16679 (set TMP (CODE OP0 OP1))
16680 (set TARGET (not TMP))
16681
16682 PTRUE is an all-true predicate with the same mode as TARGET. */
16683
16684 static void
16685 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
16686 rtx op0, rtx op1)
16687 {
16688 machine_mode pred_mode = GET_MODE (ptrue);
16689 rtx tmp = gen_reg_rtx (pred_mode);
16690 aarch64_emit_sve_ptrue_op (tmp, ptrue,
16691 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
16692 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16693 }
16694
16695 /* Expand an SVE floating-point comparison using the SVE equivalent of:
16696
16697 (set TARGET (CODE OP0 OP1))
16698
16699 If CAN_INVERT_P is true, the caller can also handle inverted results;
16700 return true if the result is in fact inverted. */
16701
16702 bool
16703 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
16704 rtx op0, rtx op1, bool can_invert_p)
16705 {
16706 machine_mode pred_mode = GET_MODE (target);
16707 machine_mode data_mode = GET_MODE (op0);
16708
16709 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16710 switch (code)
16711 {
16712 case UNORDERED:
16713 /* UNORDERED has no immediate form. */
16714 op1 = force_reg (data_mode, op1);
16715 /* fall through */
16716 case LT:
16717 case LE:
16718 case GT:
16719 case GE:
16720 case EQ:
16721 case NE:
16722 {
16723 /* There is native support for the comparison. */
16724 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16725 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16726 return false;
16727 }
16728
16729 case LTGT:
16730 /* This is a trapping operation (LT or GT). */
16731 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
16732 return false;
16733
16734 case UNEQ:
16735 if (!flag_trapping_math)
16736 {
16737 /* This would trap for signaling NaNs. */
16738 op1 = force_reg (data_mode, op1);
16739 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
16740 return false;
16741 }
16742 /* fall through */
16743 case UNLT:
16744 case UNLE:
16745 case UNGT:
16746 case UNGE:
16747 if (flag_trapping_math)
16748 {
16749 /* Work out which elements are ordered. */
16750 rtx ordered = gen_reg_rtx (pred_mode);
16751 op1 = force_reg (data_mode, op1);
16752 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16753
16754 /* Test the opposite condition for the ordered elements,
16755 then invert the result. */
16756 if (code == UNEQ)
16757 code = NE;
16758 else
16759 code = reverse_condition_maybe_unordered (code);
16760 if (can_invert_p)
16761 {
16762 aarch64_emit_sve_predicated_cond (target, code,
16763 ordered, op0, op1);
16764 return true;
16765 }
16766 rtx tmp = gen_reg_rtx (pred_mode);
16767 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16768 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16769 return false;
16770 }
16771 break;
16772
16773 case ORDERED:
16774 /* ORDERED has no immediate form. */
16775 op1 = force_reg (data_mode, op1);
16776 break;
16777
16778 default:
16779 gcc_unreachable ();
16780 }
16781
16782 /* There is native support for the inverse comparison. */
16783 code = reverse_condition_maybe_unordered (code);
16784 if (can_invert_p)
16785 {
16786 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16787 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16788 return true;
16789 }
16790 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16791 return false;
16792 }
16793
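/* As a worked example of the trapping-math path above (illustrative
   only): for UNLT, the code first computes an "ordered" predicate as the
   inverse of UNORDERED, then tests the reversed condition GE under that
   predicate, and finally inverts the result (or asks the caller to, when
   CAN_INVERT_P).  This yields

     UNLT (x, y) == NOT (ordered (x, y) && x >= y)

   which is true exactly when x and y are unordered or x < y, and the
   predicated GE comparison never executes on unordered (potentially
   trapping) lanes.  */
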
16794 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16795 of the data being selected and CMP_MODE is the mode of the values being
16796 compared. */
16797
16798 void
16799 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16800 rtx *ops)
16801 {
16802 machine_mode pred_mode
16803 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16804 GET_MODE_SIZE (cmp_mode)).require ();
16805 rtx pred = gen_reg_rtx (pred_mode);
16806 if (FLOAT_MODE_P (cmp_mode))
16807 {
16808 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16809 ops[4], ops[5], true))
16810 std::swap (ops[1], ops[2]);
16811 }
16812 else
16813 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16814
16815 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16816 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16817 }
16818
16819 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16820 true. However, due to issues with register allocation it is preferable
16821 to avoid tying integer scalar and FP scalar modes. Executing integer
16822 operations in general registers is better than treating them as scalar
16823 vector operations. This reduces latency and avoids redundant int<->FP
16824 moves. So tie modes if they are either of the same class, or vector modes
16825 with other vector modes, vector structs or any scalar mode. */
16826
16827 static bool
16828 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16829 {
16830 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16831 return true;
16832
16833 /* We specifically want to allow elements of "structure" modes to
16834 be tieable to the structure. This more general condition allows
16835 other rarer situations too. The reason we don't extend this to
16836 predicate modes is that there are no predicate structure modes
16837 nor any specific instructions for extracting part of a predicate
16838 register. */
16839 if (aarch64_vector_data_mode_p (mode1)
16840 && aarch64_vector_data_mode_p (mode2))
16841 return true;
16842
16843 /* Also allow any scalar modes with vectors. */
16844 if (aarch64_vector_mode_supported_p (mode1)
16845 || aarch64_vector_mode_supported_p (mode2))
16846 return true;
16847
16848 return false;
16849 }
16850
16851 /* Return a new RTX holding the result of moving POINTER forward by
16852 AMOUNT bytes. */
16853
16854 static rtx
16855 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16856 {
16857 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16858
16859 return adjust_automodify_address (pointer, GET_MODE (pointer),
16860 next, amount);
16861 }
16862
16863 /* Return a new RTX holding the result of moving POINTER forward by the
16864 size of the mode it points to. */
16865
16866 static rtx
16867 aarch64_progress_pointer (rtx pointer)
16868 {
16869 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16870 }
16871
16872 /* Copy one block of size MODE from SRC to DST, then advance SRC and DST
16873 by the size of MODE. */
16874
16875 static void
16876 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16877 machine_mode mode)
16878 {
16879 rtx reg = gen_reg_rtx (mode);
16880
16881 /* "Cast" the pointers to the correct mode. */
16882 *src = adjust_address (*src, mode, 0);
16883 *dst = adjust_address (*dst, mode, 0);
16884 /* Emit the memcpy. */
16885 emit_move_insn (reg, *src);
16886 emit_move_insn (*dst, reg);
16887 /* Move the pointers forward. */
16888 *src = aarch64_progress_pointer (*src);
16889 *dst = aarch64_progress_pointer (*dst);
16890 }
16891
16892 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16893 we succeed, otherwise return false. */
16894
16895 bool
16896 aarch64_expand_movmem (rtx *operands)
16897 {
16898 int n, mode_bits;
16899 rtx dst = operands[0];
16900 rtx src = operands[1];
16901 rtx base;
16902 machine_mode cur_mode = BLKmode, next_mode;
16903 bool speed_p = !optimize_function_for_size_p (cfun);
16904
16905 /* When optimizing for size, give a better estimate of the length of a
16906 memcpy call, but use the default otherwise. Moves larger than 8 bytes
16907 will always require an even number of instructions, and each operation
16908 requires both a load and a store, so divide the maximum number by 2. */
16909 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16910
16911 /* We can't do anything smart if the amount to copy is not constant. */
16912 if (!CONST_INT_P (operands[2]))
16913 return false;
16914
16915 n = INTVAL (operands[2]);
16916
16917 /* Try to keep the number of instructions low. For all cases we will do at
16918 most two moves for the residual amount, since we'll always overlap the
16919 remainder. */
16920 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16921 return false;
16922
16923 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16924 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16925
16926 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16927 src = adjust_automodify_address (src, VOIDmode, base, 0);
16928
16929 /* Convert n to bits to make the rest of the code simpler. */
16930 n = n * BITS_PER_UNIT;
16931
16932 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
16933 larger than TImode, but we should not use them for loads/stores here. */
16934 const int copy_limit = GET_MODE_BITSIZE (TImode);
16935
16936 while (n > 0)
16937 {
16938 /* Find the largest mode in which to do the copy without over-reading
16939 or over-writing. */
16940 opt_scalar_int_mode mode_iter;
16941 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16942 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
16943 cur_mode = mode_iter.require ();
16944
16945 gcc_assert (cur_mode != BLKmode);
16946
16947 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16948 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16949
16950 n -= mode_bits;
16951
16952 /* Do certain trailing copies as overlapping if it's going to be
16953 cheaper, i.e. fewer instructions. For instance, for a 15-byte
16954 copy it is more efficient to do two overlapping 8-byte copies than
16955 8 + 6 + 1. */
16956 if (n > 0 && n <= 8 * BITS_PER_UNIT)
16957 {
16958 next_mode = smallest_mode_for_size (n, MODE_INT);
16959 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
16960 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
16961 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
16962 n = n_bits;
16963 }
16964 }
16965
16966 return true;
16967 }
16968
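/* As an illustration of the copy loop above (sizes chosen for
   exposition): a 23-byte copy is expanded as one TImode (16-byte)
   load/store at offset 0 followed by one DImode (8-byte) load/store at
   offset 15, the two copies overlapping by a single byte.  That is two
   load/store pairs instead of the 16 + 4 + 2 + 1 sequence a
   non-overlapping decomposition would need.  */
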
16969 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16970 SImode stores. Handle the case when the constant has identical
16971 bottom and top halves. This is beneficial when the two stores can be
16972 merged into an STP and we avoid synthesising potentially expensive
16973 immediates twice. Return true if such a split is possible. */
16974
16975 bool
16976 aarch64_split_dimode_const_store (rtx dst, rtx src)
16977 {
16978 rtx lo = gen_lowpart (SImode, src);
16979 rtx hi = gen_highpart_mode (SImode, DImode, src);
16980
16981 bool size_p = optimize_function_for_size_p (cfun);
16982
16983 if (!rtx_equal_p (lo, hi))
16984 return false;
16985
16986 unsigned int orig_cost
16987 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16988 unsigned int lo_cost
16989 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16990
16991 /* We want to transform:
16992 MOV x1, 49370
16993 MOVK x1, 0x140, lsl 16
16994 MOVK x1, 0xc0da, lsl 32
16995 MOVK x1, 0x140, lsl 48
16996 STR x1, [x0]
16997 into:
16998 MOV w1, 49370
16999 MOVK w1, 0x140, lsl 16
17000 STP w1, w1, [x0]
17001 So we want to perform this only when we save two instructions
17002 or more. When optimizing for size, however, accept any code size
17003 savings we can. */
17004 if (size_p && orig_cost <= lo_cost)
17005 return false;
17006
17007 if (!size_p
17008 && (orig_cost <= lo_cost + 1))
17009 return false;
17010
17011 rtx mem_lo = adjust_address (dst, SImode, 0);
17012 if (!aarch64_mem_pair_operand (mem_lo, SImode))
17013 return false;
17014
17015 rtx tmp_reg = gen_reg_rtx (SImode);
17016 aarch64_expand_mov_immediate (tmp_reg, lo);
17017 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17018 /* Don't emit an explicit store pair as this may not always be profitable.
17019 Let the sched-fusion logic decide whether to merge them. */
17020 emit_move_insn (mem_lo, tmp_reg);
17021 emit_move_insn (mem_hi, tmp_reg);
17022
17023 return true;
17024 }
17025
17026 /* Generate RTL for a conditional branch with rtx comparison CODE in
17027 mode CC_MODE. The destination of the unlikely conditional branch
17028 is LABEL_REF. */
17029
17030 void
17031 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
17032 rtx label_ref)
17033 {
17034 rtx x;
17035 x = gen_rtx_fmt_ee (code, VOIDmode,
17036 gen_rtx_REG (cc_mode, CC_REGNUM),
17037 const0_rtx);
17038
17039 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17040 gen_rtx_LABEL_REF (VOIDmode, label_ref),
17041 pc_rtx);
17042 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17043 }
17044
17045 /* Generate DImode scratch registers for 128-bit (TImode) addition.
17046
17047 OP1 represents the TImode destination operand 1
17048 OP2 represents the TImode destination operand 2
17049 LOW_DEST represents the low half (DImode) of TImode operand 0
17050 LOW_IN1 represents the low half (DImode) of TImode operand 1
17051 LOW_IN2 represents the low half (DImode) of TImode operand 2
17052 HIGH_DEST represents the high half (DImode) of TImode operand 0
17053 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17054 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17055
17056 void
17057 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17058 rtx *low_in1, rtx *low_in2,
17059 rtx *high_dest, rtx *high_in1,
17060 rtx *high_in2)
17061 {
17062 *low_dest = gen_reg_rtx (DImode);
17063 *low_in1 = gen_lowpart (DImode, op1);
17064 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17065 subreg_lowpart_offset (DImode, TImode));
17066 *high_dest = gen_reg_rtx (DImode);
17067 *high_in1 = gen_highpart (DImode, op1);
17068 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17069 subreg_highpart_offset (DImode, TImode));
17070 }
17071
17072 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
17073
17074 This function differs from 'aarch64_addti_scratch_regs' in that
17075 OP1 can be an immediate constant (zero). We must call
17076 subreg_highpart_offset with DImode and TImode arguments, otherwise
17077 VOIDmode will be used for the const_int, which generates an internal
17078 error from subreg_size_highpart_offset, which does not expect a size of zero.
17079
17080 OP1 represents the TImode destination operand 1
17081 OP2 represents the TImode destination operand 2
17082 LOW_DEST represents the low half (DImode) of TImode operand 0
17083 LOW_IN1 represents the low half (DImode) of TImode operand 1
17084 LOW_IN2 represents the low half (DImode) of TImode operand 2
17085 HIGH_DEST represents the high half (DImode) of TImode operand 0
17086 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17087 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17088
17089
17090 void
17091 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17092 rtx *low_in1, rtx *low_in2,
17093 rtx *high_dest, rtx *high_in1,
17094 rtx *high_in2)
17095 {
17096 *low_dest = gen_reg_rtx (DImode);
17097 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
17098 subreg_lowpart_offset (DImode, TImode));
17099
17100 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17101 subreg_lowpart_offset (DImode, TImode));
17102 *high_dest = gen_reg_rtx (DImode);
17103
17104 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
17105 subreg_highpart_offset (DImode, TImode));
17106 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17107 subreg_highpart_offset (DImode, TImode));
17108 }
17109
17110 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
17111
17112 OP0 represents the TImode destination operand 0
17113 LOW_DEST represents the low half (DImode) of TImode operand 0
17114 LOW_IN1 represents the low half (DImode) of TImode operand 1
17115 LOW_IN2 represents the low half (DImode) of TImode operand 2
17116 HIGH_DEST represents the high half (DImode) of TImode operand 0
17117 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17118 HIGH_IN2 represents the high half (DImode) of TImode operand 2
17119 UNSIGNED_P is true if the operation is being performed on unsigned
17120 values. */
17121 void
17122 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17123 rtx low_in2, rtx high_dest, rtx high_in1,
17124 rtx high_in2, bool unsigned_p)
17125 {
17126 if (low_in2 == const0_rtx)
17127 {
17128 low_dest = low_in1;
17129 high_in2 = force_reg (DImode, high_in2);
17130 if (unsigned_p)
17131 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17132 else
17133 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17134 }
17135 else
17136 {
17137 if (CONST_INT_P (low_in2))
17138 {
17139 high_in2 = force_reg (DImode, high_in2);
17140 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17141 GEN_INT (-INTVAL (low_in2))));
17142 }
17143 else
17144 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17145
17146 if (unsigned_p)
17147 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17148 else
17149 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17150 }
17151
17152 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17153 emit_move_insn (gen_highpart (DImode, op0), high_dest);
17154
17155 }
17156
17157 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
17158
17159 static unsigned HOST_WIDE_INT
17160 aarch64_asan_shadow_offset (void)
17161 {
17162 return (HOST_WIDE_INT_1 << 36);
17163 }
17164
17165 static rtx
17166 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17167 int code, tree treeop0, tree treeop1)
17168 {
17169 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17170 rtx op0, op1;
17171 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17172 insn_code icode;
17173 struct expand_operand ops[4];
17174
17175 start_sequence ();
17176 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17177
17178 op_mode = GET_MODE (op0);
17179 if (op_mode == VOIDmode)
17180 op_mode = GET_MODE (op1);
17181
17182 switch (op_mode)
17183 {
17184 case E_QImode:
17185 case E_HImode:
17186 case E_SImode:
17187 cmp_mode = SImode;
17188 icode = CODE_FOR_cmpsi;
17189 break;
17190
17191 case E_DImode:
17192 cmp_mode = DImode;
17193 icode = CODE_FOR_cmpdi;
17194 break;
17195
17196 case E_SFmode:
17197 cmp_mode = SFmode;
17198 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17199 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17200 break;
17201
17202 case E_DFmode:
17203 cmp_mode = DFmode;
17204 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17205 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17206 break;
17207
17208 default:
17209 end_sequence ();
17210 return NULL_RTX;
17211 }
17212
17213 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17214 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17215 if (!op0 || !op1)
17216 {
17217 end_sequence ();
17218 return NULL_RTX;
17219 }
17220 *prep_seq = get_insns ();
17221 end_sequence ();
17222
17223 create_fixed_operand (&ops[0], op0);
17224 create_fixed_operand (&ops[1], op1);
17225
17226 start_sequence ();
17227 if (!maybe_expand_insn (icode, 2, ops))
17228 {
17229 end_sequence ();
17230 return NULL_RTX;
17231 }
17232 *gen_seq = get_insns ();
17233 end_sequence ();
17234
17235 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17236 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17237 }
17238
17239 static rtx
17240 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17241 int cmp_code, tree treeop0, tree treeop1, int bit_code)
17242 {
17243 rtx op0, op1, target;
17244 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17245 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17246 insn_code icode;
17247 struct expand_operand ops[6];
17248 int aarch64_cond;
17249
17250 push_to_sequence (*prep_seq);
17251 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17252
17253 op_mode = GET_MODE (op0);
17254 if (op_mode == VOIDmode)
17255 op_mode = GET_MODE (op1);
17256
17257 switch (op_mode)
17258 {
17259 case E_QImode:
17260 case E_HImode:
17261 case E_SImode:
17262 cmp_mode = SImode;
17263 icode = CODE_FOR_ccmpsi;
17264 break;
17265
17266 case E_DImode:
17267 cmp_mode = DImode;
17268 icode = CODE_FOR_ccmpdi;
17269 break;
17270
17271 case E_SFmode:
17272 cmp_mode = SFmode;
17273 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17274 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17275 break;
17276
17277 case E_DFmode:
17278 cmp_mode = DFmode;
17279 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17280 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17281 break;
17282
17283 default:
17284 end_sequence ();
17285 return NULL_RTX;
17286 }
17287
17288 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17289 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17290 if (!op0 || !op1)
17291 {
17292 end_sequence ();
17293 return NULL_RTX;
17294 }
17295 *prep_seq = get_insns ();
17296 end_sequence ();
17297
17298 target = gen_rtx_REG (cc_mode, CC_REGNUM);
17299 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17300
17301 if (bit_code != AND)
17302 {
17303 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17304 GET_MODE (XEXP (prev, 0))),
17305 VOIDmode, XEXP (prev, 0), const0_rtx);
17306 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17307 }
17308
17309 create_fixed_operand (&ops[0], XEXP (prev, 0));
17310 create_fixed_operand (&ops[1], target);
17311 create_fixed_operand (&ops[2], op0);
17312 create_fixed_operand (&ops[3], op1);
17313 create_fixed_operand (&ops[4], prev);
17314 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17315
17316 push_to_sequence (*gen_seq);
17317 if (!maybe_expand_insn (icode, 6, ops))
17318 {
17319 end_sequence ();
17320 return NULL_RTX;
17321 }
17322
17323 *gen_seq = get_insns ();
17324 end_sequence ();
17325
17326 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17327 }
17328
17329 #undef TARGET_GEN_CCMP_FIRST
17330 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17331
17332 #undef TARGET_GEN_CCMP_NEXT
17333 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17334
17335 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
17336 instruction fusion of some sort. */
17337
17338 static bool
17339 aarch64_macro_fusion_p (void)
17340 {
17341 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17342 }
17343
17344
17345 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17346 should be kept together during scheduling. */
17347
17348 static bool
17349 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17350 {
17351 rtx set_dest;
17352 rtx prev_set = single_set (prev);
17353 rtx curr_set = single_set (curr);
17354 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
17355 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17356
17357 if (!aarch64_macro_fusion_p ())
17358 return false;
17359
17360 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17361 {
17362 /* We are trying to match:
17363 prev (mov) == (set (reg r0) (const_int imm16))
17364 curr (movk) == (set (zero_extract (reg r0)
17365 (const_int 16)
17366 (const_int 16))
17367 (const_int imm16_1)) */
17368
17369 set_dest = SET_DEST (curr_set);
17370
17371 if (GET_CODE (set_dest) == ZERO_EXTRACT
17372 && CONST_INT_P (SET_SRC (curr_set))
17373 && CONST_INT_P (SET_SRC (prev_set))
17374 && CONST_INT_P (XEXP (set_dest, 2))
17375 && INTVAL (XEXP (set_dest, 2)) == 16
17376 && REG_P (XEXP (set_dest, 0))
17377 && REG_P (SET_DEST (prev_set))
17378 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17379 {
17380 return true;
17381 }
17382 }
17383
17384 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17385 {
17386
17387 /* We're trying to match:
17388 prev (adrp) == (set (reg r1)
17389 (high (symbol_ref ("SYM"))))
17390 curr (add) == (set (reg r0)
17391 (lo_sum (reg r1)
17392 (symbol_ref ("SYM"))))
17393 Note that r0 need not necessarily be the same as r1, especially
17394 during pre-regalloc scheduling. */
17395
17396 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17397 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17398 {
17399 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17400 && REG_P (XEXP (SET_SRC (curr_set), 0))
17401 && REGNO (XEXP (SET_SRC (curr_set), 0))
17402 == REGNO (SET_DEST (prev_set))
17403 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17404 XEXP (SET_SRC (curr_set), 1)))
17405 return true;
17406 }
17407 }
17408
17409 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17410 {
17411
17412 /* We're trying to match:
17413 prev (movk) == (set (zero_extract (reg r0)
17414 (const_int 16)
17415 (const_int 32))
17416 (const_int imm16_1))
17417 curr (movk) == (set (zero_extract (reg r0)
17418 (const_int 16)
17419 (const_int 48))
17420 (const_int imm16_2)) */
17421
17422 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17423 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17424 && REG_P (XEXP (SET_DEST (prev_set), 0))
17425 && REG_P (XEXP (SET_DEST (curr_set), 0))
17426 && REGNO (XEXP (SET_DEST (prev_set), 0))
17427 == REGNO (XEXP (SET_DEST (curr_set), 0))
17428 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17429 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17430 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17431 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17432 && CONST_INT_P (SET_SRC (prev_set))
17433 && CONST_INT_P (SET_SRC (curr_set)))
17434 return true;
17435
17436 }
17437 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17438 {
17439 /* We're trying to match:
17440 prev (adrp) == (set (reg r0)
17441 (high (symbol_ref ("SYM"))))
17442 curr (ldr) == (set (reg r1)
17443 (mem (lo_sum (reg r0)
17444 (symbol_ref ("SYM")))))
17445 or
17446 curr (ldr) == (set (reg r1)
17447 (zero_extend (mem
17448 (lo_sum (reg r0)
17449 (symbol_ref ("SYM")))))) */
17450 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17451 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17452 {
17453 rtx curr_src = SET_SRC (curr_set);
17454
17455 if (GET_CODE (curr_src) == ZERO_EXTEND)
17456 curr_src = XEXP (curr_src, 0);
17457
17458 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17459 && REG_P (XEXP (XEXP (curr_src, 0), 0))
17460 && REGNO (XEXP (XEXP (curr_src, 0), 0))
17461 == REGNO (SET_DEST (prev_set))
17462 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
17463 XEXP (SET_SRC (prev_set), 0)))
17464 return true;
17465 }
17466 }
17467
17468 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
17469 && aarch_crypto_can_dual_issue (prev, curr))
17470 return true;
17471
17472 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
17473 && any_condjump_p (curr))
17474 {
17475 unsigned int condreg1, condreg2;
17476 rtx cc_reg_1;
17477 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
17478 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
17479
17480 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
17481 && prev
17482 && modified_in_p (cc_reg_1, prev))
17483 {
17484 enum attr_type prev_type = get_attr_type (prev);
17485
17486 /* FIXME: this misses some instructions that ThunderX considers simple
17487 arithmetic; simple shifts are missed here as well. */
17488 if (prev_type == TYPE_ALUS_SREG
17489 || prev_type == TYPE_ALUS_IMM
17490 || prev_type == TYPE_LOGICS_REG
17491 || prev_type == TYPE_LOGICS_IMM)
17492 return true;
17493 }
17494 }
17495
17496 if (prev_set
17497 && curr_set
17498 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
17499 && any_condjump_p (curr))
17500 {
17501 /* We're trying to match:
17502 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17503 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
17504 (const_int 0))
17505 (label_ref ("SYM"))
17506 (pc)) */
17507 if (SET_DEST (curr_set) == (pc_rtx)
17508 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
17509 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
17510 && REG_P (SET_DEST (prev_set))
17511 && REGNO (SET_DEST (prev_set))
17512 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
17513 {
17514 /* Fuse ALU operations followed by a conditional branch instruction. */
17515 switch (get_attr_type (prev))
17516 {
17517 case TYPE_ALU_IMM:
17518 case TYPE_ALU_SREG:
17519 case TYPE_ADC_REG:
17520 case TYPE_ADC_IMM:
17521 case TYPE_ADCS_REG:
17522 case TYPE_ADCS_IMM:
17523 case TYPE_LOGIC_REG:
17524 case TYPE_LOGIC_IMM:
17525 case TYPE_CSEL:
17526 case TYPE_ADR:
17527 case TYPE_MOV_IMM:
17528 case TYPE_SHIFT_REG:
17529 case TYPE_SHIFT_IMM:
17530 case TYPE_BFM:
17531 case TYPE_RBIT:
17532 case TYPE_REV:
17533 case TYPE_EXTEND:
17534 return true;
17535
17536 default:;
17537 }
17538 }
17539 }
17540
17541 return false;
17542 }
17543
17544 /* Return true iff the instruction fusion described by OP is enabled. */
17545
17546 bool
17547 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
17548 {
17549 return (aarch64_tune_params.fusible_ops & op) != 0;
17550 }
17551
17552 /* If MEM is in the form [base+offset], extract the two parts of the
17553 address and store them in BASE and OFFSET; otherwise return false
17554 after clearing BASE and OFFSET. */
17555
17556 bool
17557 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
17558 {
17559 rtx addr;
17560
17561 gcc_assert (MEM_P (mem));
17562
17563 addr = XEXP (mem, 0);
17564
17565 if (REG_P (addr))
17566 {
17567 *base = addr;
17568 *offset = const0_rtx;
17569 return true;
17570 }
17571
17572 if (GET_CODE (addr) == PLUS
17573 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
17574 {
17575 *base = XEXP (addr, 0);
17576 *offset = XEXP (addr, 1);
17577 return true;
17578 }
17579
17580 *base = NULL_RTX;
17581 *offset = NULL_RTX;
17582
17583 return false;
17584 }
17585
17586 /* Types for scheduling fusion. */
17587 enum sched_fusion_type
17588 {
17589 SCHED_FUSION_NONE = 0,
17590 SCHED_FUSION_LD_SIGN_EXTEND,
17591 SCHED_FUSION_LD_ZERO_EXTEND,
17592 SCHED_FUSION_LD,
17593 SCHED_FUSION_ST,
17594 SCHED_FUSION_NUM
17595 };
17596
17597 /* If INSN is a load or store with an address of the form [base+offset],
17598 extract the two parts into BASE and OFFSET. Return the scheduling
17599 fusion type of this INSN. */
17600
17601 static enum sched_fusion_type
17602 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
17603 {
17604 rtx x, dest, src;
17605 enum sched_fusion_type fusion = SCHED_FUSION_LD;
17606
17607 gcc_assert (INSN_P (insn));
17608 x = PATTERN (insn);
17609 if (GET_CODE (x) != SET)
17610 return SCHED_FUSION_NONE;
17611
17612 src = SET_SRC (x);
17613 dest = SET_DEST (x);
17614
17615 machine_mode dest_mode = GET_MODE (dest);
17616
17617 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
17618 return SCHED_FUSION_NONE;
17619
17620 if (GET_CODE (src) == SIGN_EXTEND)
17621 {
17622 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
17623 src = XEXP (src, 0);
17624 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17625 return SCHED_FUSION_NONE;
17626 }
17627 else if (GET_CODE (src) == ZERO_EXTEND)
17628 {
17629 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
17630 src = XEXP (src, 0);
17631 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17632 return SCHED_FUSION_NONE;
17633 }
17634
17635 if (GET_CODE (src) == MEM && REG_P (dest))
17636 extract_base_offset_in_addr (src, base, offset);
17637 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
17638 {
17639 fusion = SCHED_FUSION_ST;
17640 extract_base_offset_in_addr (dest, base, offset);
17641 }
17642 else
17643 return SCHED_FUSION_NONE;
17644
17645 if (*base == NULL_RTX || *offset == NULL_RTX)
17646 fusion = SCHED_FUSION_NONE;
17647
17648 return fusion;
17649 }
17650
17651 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
17652
17653 Currently we only support fusing ldr and str instructions, so FUSION_PRI
17654 and PRI are only calculated for these instructions. For other instructions,
17655 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
17656 types of instruction fusion can be added by returning different priorities.
17657
17658 It's important that irrelevant instructions get the largest FUSION_PRI. */
17659
17660 static void
17661 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
17662 int *fusion_pri, int *pri)
17663 {
17664 int tmp, off_val;
17665 rtx base, offset;
17666 enum sched_fusion_type fusion;
17667
17668 gcc_assert (INSN_P (insn));
17669
17670 tmp = max_pri - 1;
17671 fusion = fusion_load_store (insn, &base, &offset);
17672 if (fusion == SCHED_FUSION_NONE)
17673 {
17674 *pri = tmp;
17675 *fusion_pri = tmp;
17676 return;
17677 }
17678
17679 /* Set FUSION_PRI according to fusion type and base register. */
17680 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
17681
17682 /* Calculate PRI. */
17683 tmp /= 2;
17684
17685 /* INSN with smaller offset goes first. */
17686 off_val = (int)(INTVAL (offset));
17687 if (off_val >= 0)
17688 tmp -= (off_val & 0xfffff);
17689 else
17690 tmp += ((- off_val) & 0xfffff);
17691
17692 *pri = tmp;
17693 return;
17694 }
17695
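/* As an illustration (registers and offsets chosen for exposition): two
   loads ldr w0, [x1, 8] and ldr w2, [x1, 16] receive the same
   FUSION_PRI, because they have the same fusion type and the same base
   register, while the load with the smaller offset gets the larger PRI
   and is therefore scheduled first, so that the two end up adjacent and
   can later be combined into an LDP.  */
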
17696 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
17697 Adjust priority of sha1h instructions so they are scheduled before
17698 other SHA1 instructions. */
17699
17700 static int
17701 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
17702 {
17703 rtx x = PATTERN (insn);
17704
17705 if (GET_CODE (x) == SET)
17706 {
17707 x = SET_SRC (x);
17708
17709 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
17710 return priority + 10;
17711 }
17712
17713 return priority;
17714 }
17715
17716 /* Given OPERANDS of consecutive load/store, check if we can merge
17717 them into ldp/stp. LOAD is true if they are load instructions.
17718 MODE is the mode of memory operands. */
17719
17720 bool
17721 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
17722 machine_mode mode)
17723 {
17724 HOST_WIDE_INT offval_1, offval_2, msize;
17725 enum reg_class rclass_1, rclass_2;
17726 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
17727
17728 if (load)
17729 {
17730 mem_1 = operands[1];
17731 mem_2 = operands[3];
17732 reg_1 = operands[0];
17733 reg_2 = operands[2];
17734 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
17735 if (REGNO (reg_1) == REGNO (reg_2))
17736 return false;
17737 }
17738 else
17739 {
17740 mem_1 = operands[0];
17741 mem_2 = operands[2];
17742 reg_1 = operands[1];
17743 reg_2 = operands[3];
17744 }
17745
17746 /* The mems cannot be volatile. */
17747 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
17748 return false;
17749
17750 /* If we have SImode and slow unaligned ldp,
17751 check that the alignment is at least 8 bytes. */
17752 if (mode == SImode
17753 && (aarch64_tune_params.extra_tuning_flags
17754 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17755 && !optimize_size
17756 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17757 return false;
17758
17759 /* Check if the addresses are in the form of [base+offset]. */
17760 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17761 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17762 return false;
17763 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17764 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17765 return false;
17766
17767 /* Check if the bases are the same. */
17768 if (!rtx_equal_p (base_1, base_2))
17769 return false;
17770
17771 /* The operands must be of the same size. */
17772 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
17773 GET_MODE_SIZE (GET_MODE (mem_2))));
17774
17775 offval_1 = INTVAL (offset_1);
17776 offval_2 = INTVAL (offset_2);
17777 /* We should only be trying this for fixed-sized modes. There is no
17778 SVE LDP/STP instruction. */
17779 msize = GET_MODE_SIZE (mode).to_constant ();
17780 /* Check if the offsets are consecutive. */
17781 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
17782 return false;
17783
17784 /* Check if the addresses are clobbered by load. */
17785 if (load)
17786 {
17787 if (reg_mentioned_p (reg_1, mem_1))
17788 return false;
17789
17790 /* In increasing order, the last load can clobber the address. */
17791 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
17792 return false;
17793 }
17794
17795 /* One of the memory accesses must be a mempair operand.
17796 If it is not the first one, they need to be swapped by the
17797 peephole. */
17798 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
17799 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
17800 return false;
17801
17802 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17803 rclass_1 = FP_REGS;
17804 else
17805 rclass_1 = GENERAL_REGS;
17806
17807 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17808 rclass_2 = FP_REGS;
17809 else
17810 rclass_2 = GENERAL_REGS;
17811
17812 /* Check if the registers are of the same class. */
17813 if (rclass_1 != rclass_2)
17814 return false;
17815
17816 return true;
17817 }
17818
17819 /* Given OPERANDS of consecutive load/store that can be merged,
17820 swap them if they are not in ascending order. */
17821 void
17822 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
17823 {
17824 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
17825 HOST_WIDE_INT offval_1, offval_2;
17826
17827 if (load)
17828 {
17829 mem_1 = operands[1];
17830 mem_2 = operands[3];
17831 }
17832 else
17833 {
17834 mem_1 = operands[0];
17835 mem_2 = operands[2];
17836 }
17837
17838 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17839 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17840
17841 offval_1 = INTVAL (offset_1);
17842 offval_2 = INTVAL (offset_2);
17843
17844 if (offval_1 > offval_2)
17845 {
17846 /* Irrespective of whether this is a load or a store,
17847 we do the same swap. */
17848 std::swap (operands[0], operands[2]);
17849 std::swap (operands[1], operands[3]);
17850 }
17851 }
17852
17853 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17854 comparison between the two. */
17855 int
17856 aarch64_host_wide_int_compare (const void *x, const void *y)
17857 {
17858 return wi::cmps (* ((const HOST_WIDE_INT *) x),
17859 * ((const HOST_WIDE_INT *) y));
17860 }
17861
17862 /* Taking X and Y to be pairs of RTX, each pair consisting of a MEM rtx
17863 and a REG rtx, compare the offsets extracted from the addresses of
17864 the two MEMs.
17865
17866 Return:
17867
17868 1 iff offset (X) > offset (Y)
17869 0 iff offset (X) == offset (Y)
17870 -1 iff offset (X) < offset (Y) */
17871 int
17872 aarch64_ldrstr_offset_compare (const void *x, const void *y)
17873 {
17874 const rtx * operands_1 = (const rtx *) x;
17875 const rtx * operands_2 = (const rtx *) y;
17876 rtx mem_1, mem_2, base, offset_1, offset_2;
17877
17878 if (MEM_P (operands_1[0]))
17879 mem_1 = operands_1[0];
17880 else
17881 mem_1 = operands_1[1];
17882
17883 if (MEM_P (operands_2[0]))
17884 mem_2 = operands_2[0];
17885 else
17886 mem_2 = operands_2[1];
17887
17888 /* Extract the offsets. */
17889 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17890 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17891
17892 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17893
17894 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17895 }
17896
17897 /* Given OPERANDS of consecutive load/store, check if we can merge
17898 them into ldp/stp by adjusting the offset. LOAD is true if they
17899 are load instructions. MODE is the mode of memory operands.
17900
17901 Given below consecutive stores:
17902
17903 str w1, [xb, 0x100]
17904 str w1, [xb, 0x104]
17905 str w1, [xb, 0x108]
17906 str w1, [xb, 0x10c]
17907
17908 Though the offsets are out of the range supported by stp, we can
17909 still pair them after adjusting the offset, like:
17910
17911 add scratch, xb, 0x100
17912 stp w1, w1, [scratch]
17913 stp w1, w1, [scratch, 0x8]
17914
17915 The peephole patterns detecting this opportunity should guarantee
17916 the scratch register is available. */
17917
17918 bool
17919 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17920 scalar_mode mode)
17921 {
17922 const int num_insns = 4;
17923 enum reg_class rclass;
17924 HOST_WIDE_INT offvals[num_insns], msize;
17925 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
17926
17927 if (load)
17928 {
17929 for (int i = 0; i < num_insns; i++)
17930 {
17931 reg[i] = operands[2 * i];
17932 mem[i] = operands[2 * i + 1];
17933
17934 gcc_assert (REG_P (reg[i]));
17935 }
17936
17937 /* Do not attempt to merge the loads if the loads clobber each other. */
17938 for (int i = 0; i < 8; i += 2)
17939 for (int j = i + 2; j < 8; j += 2)
17940 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17941 return false;
17942 }
17943 else
17944 for (int i = 0; i < num_insns; i++)
17945 {
17946 mem[i] = operands[2 * i];
17947 reg[i] = operands[2 * i + 1];
17948 }
17949
17950 /* Skip if memory operand is by itself valid for ldp/stp. */
17951 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
17952 return false;
17953
17954 for (int i = 0; i < num_insns; i++)
17955 {
17956 /* The mems cannot be volatile. */
17957 if (MEM_VOLATILE_P (mem[i]))
17958 return false;
17959
17960 /* Check if the addresses are in the form of [base+offset]. */
17961 extract_base_offset_in_addr (mem[i], base + i, offset + i);
17962 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
17963 return false;
17964 }
17965
17966 /* Check if the registers are of the same class. */
17967 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
17968 ? FP_REGS : GENERAL_REGS;
17969
17970 for (int i = 1; i < num_insns; i++)
17971 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
17972 {
17973 if (rclass != FP_REGS)
17974 return false;
17975 }
17976 else
17977 {
17978 if (rclass != GENERAL_REGS)
17979 return false;
17980 }
17981
17982 /* Only the last register in the order in which they occur
17983 may be clobbered by the load. */
17984 if (rclass == GENERAL_REGS && load)
17985 for (int i = 0; i < num_insns - 1; i++)
17986 if (reg_mentioned_p (reg[i], mem[i]))
17987 return false;
17988
17989 /* Check if the bases are the same. */
17990 for (int i = 0; i < num_insns - 1; i++)
17991 if (!rtx_equal_p (base[i], base[i + 1]))
17992 return false;
17993
17994 for (int i = 0; i < num_insns; i++)
17995 offvals[i] = INTVAL (offset[i]);
17996
17997 msize = GET_MODE_SIZE (mode);
17998
17999 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18000 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18001 aarch64_host_wide_int_compare);
18002
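/* After sorting, the accesses must form two adjacent pairs ({0,1} and
{2,3}), with the offsets in each pair differing by exactly MSIZE.  */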
18003 if (!(offvals[1] == offvals[0] + msize
18004 && offvals[3] == offvals[2] + msize))
18005 return false;
18006
18007 /* Check that the offsets are within range of each other. The ldp/stp
18008 instructions have a signed 7-bit scaled immediate offset, hence the 0x80 bound. */
18009 if (offvals[2] - offvals[0] >= msize * 0x80)
18010 return false;
18011
18012 /* The offsets must be aligned with respect to each other. */
18013 if (offvals[0] % msize != offvals[2] % msize)
18014 return false;
18015
18016 /* If we have SImode and slow unaligned ldp,
18017 check that the alignment is at least 8 bytes. */
18018 if (mode == SImode
18019 && (aarch64_tune_params.extra_tuning_flags
18020 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18021 && !optimize_size
18022 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18023 return false;
18024
18025 return true;
18026 }
18027
18028 /* Given OPERANDS of consecutive load/store, this function pairs them
18029 into LDP/STP after adjusting the offset. It depends on the fact
18030 that the operands can be sorted so the offsets are correct for STP.
18031 MODE is the mode of memory operands. CODE is the rtl operator
18032 which should be applied to all memory operands; it is SIGN_EXTEND,
18033 ZERO_EXTEND or UNKNOWN. */
18034
18035 bool
18036 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
18037 scalar_mode mode, RTX_CODE code)
18038 {
18039 rtx base, offset_1, offset_3, t1, t2;
18040 rtx mem_1, mem_2, mem_3, mem_4;
18041 rtx temp_operands[8];
18042 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
18043 stp_off_upper_limit, stp_off_lower_limit, msize;
18044
18045 /* We make changes on a copy as we may still bail out. */
18046 for (int i = 0; i < 8; i ++)
18047 temp_operands[i] = operands[i];
18048
18049 /* Sort the operands. */
18050 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
18051
18052 if (load)
18053 {
18054 mem_1 = temp_operands[1];
18055 mem_2 = temp_operands[3];
18056 mem_3 = temp_operands[5];
18057 mem_4 = temp_operands[7];
18058 }
18059 else
18060 {
18061 mem_1 = temp_operands[0];
18062 mem_2 = temp_operands[2];
18063 mem_3 = temp_operands[4];
18064 mem_4 = temp_operands[6];
18065 gcc_assert (code == UNKNOWN);
18066 }
18067
18068 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18069 extract_base_offset_in_addr (mem_3, &base, &offset_3);
18070 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
18071 && offset_3 != NULL_RTX);
18072
18073 /* Adjust offset so it can fit in LDP/STP instruction. */
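/* The scaled immediate field is 7 bits signed, so the reachable byte
offsets are [-0x40 * msize, 0x3f * msize].  */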
18074 msize = GET_MODE_SIZE (mode);
18075 stp_off_upper_limit = msize * (0x40 - 1);
18076 stp_off_lower_limit = - msize * 0x40;
18077
18078 off_val_1 = INTVAL (offset_1);
18079 off_val_3 = INTVAL (offset_3);
18080
18081 /* The base offset is optimally half way between the two STP/LDP offsets. */
18082 if (msize <= 4)
18083 base_off = (off_val_1 + off_val_3) / 2;
18084 else
18085 /* However, due to issues with negative LDP/STP offset generation for
18086 larger modes (DF, DI and vector modes), we must not use negative
18087 addresses smaller than 9 signed unadjusted bits can store. This
18088 provides the most range in this case. */
18089 base_off = off_val_1;
18090
18091 /* Adjust the base so that it is aligned with the addresses but still
18092 optimal. */
18093 if (base_off % msize != off_val_1 % msize)
18094 /* Fix the offset, bearing in mind we want to make it bigger not
18095 smaller. */
18096 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18097 else if (msize <= 4)
18098 /* The negative range of LDP/STP is one larger than the positive range. */
18099 base_off += msize;
18100
18101 /* Check if base offset is too big or too small. We can attempt to resolve
18102 this issue by setting it to the maximum value and seeing if the offsets
18103 still fit. */
18104 if (base_off >= 0x1000)
18105 {
18106 base_off = 0x1000 - 1;
18107 /* We must still make sure that the base offset is aligned with respect
18108 to the address. But it may not be made any bigger. */
18109 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18110 }
18111
18112 /* Likewise for the case where the base is too small. */
18113 if (base_off <= -0x1000)
18114 {
18115 base_off = -0x1000 + 1;
18116 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18117 }
18118
18119 /* Offset of the first STP/LDP. */
18120 new_off_1 = off_val_1 - base_off;
18121
18122 /* Offset of the second STP/LDP. */
18123 new_off_3 = off_val_3 - base_off;
18124
18125 /* The offsets must be within the range of the LDP/STP instructions. */
18126 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18127 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18128 return false;
18129
18130 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18131 new_off_1), true);
18132 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18133 new_off_1 + msize), true);
18134 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18135 new_off_3), true);
18136 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18137 new_off_3 + msize), true);
18138
18139 if (!aarch64_mem_pair_operand (mem_1, mode)
18140 || !aarch64_mem_pair_operand (mem_3, mode))
18141 return false;
18142
18143 if (code == ZERO_EXTEND)
18144 {
18145 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18146 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18147 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18148 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18149 }
18150 else if (code == SIGN_EXTEND)
18151 {
18152 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18153 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18154 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18155 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18156 }
18157
18158 if (load)
18159 {
18160 operands[0] = temp_operands[0];
18161 operands[1] = mem_1;
18162 operands[2] = temp_operands[2];
18163 operands[3] = mem_2;
18164 operands[4] = temp_operands[4];
18165 operands[5] = mem_3;
18166 operands[6] = temp_operands[6];
18167 operands[7] = mem_4;
18168 }
18169 else
18170 {
18171 operands[0] = mem_1;
18172 operands[1] = temp_operands[1];
18173 operands[2] = mem_2;
18174 operands[3] = temp_operands[3];
18175 operands[4] = mem_3;
18176 operands[5] = temp_operands[5];
18177 operands[6] = mem_4;
18178 operands[7] = temp_operands[7];
18179 }
18180
18181 /* Emit adjusting instruction. */
18182 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18183 /* Emit ldp/stp instructions. */
18184 t1 = gen_rtx_SET (operands[0], operands[1]);
18185 t2 = gen_rtx_SET (operands[2], operands[3]);
18186 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18187 t1 = gen_rtx_SET (operands[4], operands[5]);
18188 t2 = gen_rtx_SET (operands[6], operands[7]);
18189 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18190 return true;
18191 }
18192
18193 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
18194 it isn't worth branching around empty masked ops (including masked
18195 stores). */
18196
18197 static bool
18198 aarch64_empty_mask_is_expensive (unsigned)
18199 {
18200 return false;
18201 }
18202
18203 /* Return true if a pseudo register should be created and used to hold
18204 the GOT address for PIC code. */
18205
18206 bool
18207 aarch64_use_pseudo_pic_reg (void)
18208 {
18209 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18210 }
18211
18212 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18213
18214 static int
18215 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18216 {
18217 switch (XINT (x, 1))
18218 {
18219 case UNSPEC_GOTSMALLPIC:
18220 case UNSPEC_GOTSMALLPIC28K:
18221 case UNSPEC_GOTTINYPIC:
18222 return 0;
18223 default:
18224 break;
18225 }
18226
18227 return default_unspec_may_trap_p (x, flags);
18228 }
18229
18230
18231 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18232 return the log2 of that value. Otherwise return -1. */
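/* For example, 4.0 yields 2, while 0.5, 3.0 and -2.0 all yield -1.  */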
18233
18234 int
18235 aarch64_fpconst_pow_of_2 (rtx x)
18236 {
18237 const REAL_VALUE_TYPE *r;
18238
18239 if (!CONST_DOUBLE_P (x))
18240 return -1;
18241
18242 r = CONST_DOUBLE_REAL_VALUE (x);
18243
18244 if (REAL_VALUE_NEGATIVE (*r)
18245 || REAL_VALUE_ISNAN (*r)
18246 || REAL_VALUE_ISINF (*r)
18247 || !real_isinteger (r, DFmode))
18248 return -1;
18249
18250 return exact_log2 (real_to_integer (r));
18251 }
18252
18253 /* If X is a vector of equal CONST_DOUBLE values and that value is
18254 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18255
18256 int
18257 aarch64_vec_fpconst_pow_of_2 (rtx x)
18258 {
18259 int nelts;
18260 if (GET_CODE (x) != CONST_VECTOR
18261 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18262 return -1;
18263
18264 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18265 return -1;
18266
18267 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18268 if (firstval <= 0)
18269 return -1;
18270
18271 for (int i = 1; i < nelts; i++)
18272 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18273 return -1;
18274
18275 return firstval;
18276 }
18277
18278 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18279 to float.
18280
18281 __fp16 always promotes through this hook.
18282 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18283 through the generic excess precision logic rather than here. */
18284
18285 static tree
18286 aarch64_promoted_type (const_tree t)
18287 {
18288 if (SCALAR_FLOAT_TYPE_P (t)
18289 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18290 return float_type_node;
18291
18292 return NULL_TREE;
18293 }
18294
18295 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18296
18297 static bool
18298 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18299 optimization_type opt_type)
18300 {
18301 switch (op)
18302 {
18303 case rsqrt_optab:
18304 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18305
18306 default:
18307 return true;
18308 }
18309 }
18310
18311 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18312
18313 static unsigned int
18314 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18315 int *offset)
18316 {
18317 /* Polynomial invariant 1 == (VG / 2) - 1. */
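/* For example, with 256-bit SVE vectors VG (the number of 64-bit granules
in a vector) is 4, so the indeterminate evaluates to 4 / 2 - 1 = 1,
i.e. one 128-bit chunk beyond the minimum vector length.  */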
18318 gcc_assert (i == 1);
18319 *factor = 2;
18320 *offset = 1;
18321 return AARCH64_DWARF_VG;
18322 }
18323
18324 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
18325 if MODE is HFmode, and punt to the generic implementation otherwise. */
18326
18327 static bool
18328 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18329 {
18330 return (mode == HFmode
18331 ? true
18332 : default_libgcc_floating_mode_supported_p (mode));
18333 }
18334
18335 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18336 if MODE is HFmode, and punt to the generic implementation otherwise. */
18337
18338 static bool
18339 aarch64_scalar_mode_supported_p (scalar_mode mode)
18340 {
18341 return (mode == HFmode
18342 ? true
18343 : default_scalar_mode_supported_p (mode));
18344 }
18345
18346 /* Set the value of FLT_EVAL_METHOD.
18347 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18348
18349 0: evaluate all operations and constants, whose semantic type has at
18350 most the range and precision of type float, to the range and
18351 precision of float; evaluate all other operations and constants to
18352 the range and precision of the semantic type;
18353
18354 N, where _FloatN is a supported interchange floating type
18355 evaluate all operations and constants, whose semantic type has at
18356 most the range and precision of _FloatN type, to the range and
18357 precision of the _FloatN type; evaluate all other operations and
18358 constants to the range and precision of the semantic type;
18359
18360 If we have the ARMv8.2-A extensions then we support _Float16 in native
18361 precision, so we should set this to 16. Otherwise, we support the type,
18362 but want to evaluate expressions in float precision, so set this to
18363 0. */
18364
18365 static enum flt_eval_method
18366 aarch64_excess_precision (enum excess_precision_type type)
18367 {
18368 switch (type)
18369 {
18370 case EXCESS_PRECISION_TYPE_FAST:
18371 case EXCESS_PRECISION_TYPE_STANDARD:
18372 /* We can calculate either in 16-bit range and precision or
18373 32-bit range and precision. Make that decision based on whether
18374 we have native support for the ARMv8.2-A 16-bit floating-point
18375 instructions or not. */
18376 return (TARGET_FP_F16INST
18377 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18378 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18379 case EXCESS_PRECISION_TYPE_IMPLICIT:
18380 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18381 default:
18382 gcc_unreachable ();
18383 }
18384 return FLT_EVAL_METHOD_UNPREDICTABLE;
18385 }
18386
18387 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18388 scheduled for speculative execution. Reject the long-running division
18389 and square-root instructions. */
18390
18391 static bool
18392 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18393 {
18394 switch (get_attr_type (insn))
18395 {
18396 case TYPE_SDIV:
18397 case TYPE_UDIV:
18398 case TYPE_FDIVS:
18399 case TYPE_FDIVD:
18400 case TYPE_FSQRTS:
18401 case TYPE_FSQRTD:
18402 case TYPE_NEON_FP_SQRT_S:
18403 case TYPE_NEON_FP_SQRT_D:
18404 case TYPE_NEON_FP_SQRT_S_Q:
18405 case TYPE_NEON_FP_SQRT_D_Q:
18406 case TYPE_NEON_FP_DIV_S:
18407 case TYPE_NEON_FP_DIV_D:
18408 case TYPE_NEON_FP_DIV_S_Q:
18409 case TYPE_NEON_FP_DIV_D_Q:
18410 return false;
18411 default:
18412 return true;
18413 }
18414 }
18415
18416 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18417
18418 static int
18419 aarch64_compute_pressure_classes (reg_class *classes)
18420 {
18421 int i = 0;
18422 classes[i++] = GENERAL_REGS;
18423 classes[i++] = FP_REGS;
18424 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18425 registers need to go in PR_LO_REGS at some point during their
18426 lifetime. Splitting it into two halves has the effect of making
18427 all predicates count against PR_LO_REGS, so that we try whenever
18428 possible to restrict the number of live predicates to 8. This
18429 greatly reduces the amount of spilling in certain loops. */
18430 classes[i++] = PR_LO_REGS;
18431 classes[i++] = PR_HI_REGS;
18432 return i;
18433 }
18434
18435 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18436
18437 static bool
18438 aarch64_can_change_mode_class (machine_mode from,
18439 machine_mode to, reg_class_t)
18440 {
18441 if (BYTES_BIG_ENDIAN)
18442 {
18443 bool from_sve_p = aarch64_sve_data_mode_p (from);
18444 bool to_sve_p = aarch64_sve_data_mode_p (to);
18445
18446 /* Don't allow changes between SVE data modes and non-SVE modes.
18447 See the comment at the head of aarch64-sve.md for details. */
18448 if (from_sve_p != to_sve_p)
18449 return false;
18450
18451 /* Don't allow changes in element size: lane 0 of the new vector
18452 would not then be lane 0 of the old vector. See the comment
18453 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18454 description.
18455
18456 In the worst case, this forces a register to be spilled in
18457 one mode and reloaded in the other, which handles the
18458 endianness correctly. */
18459 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18460 return false;
18461 }
18462 return true;
18463 }
18464
18465 /* Implement TARGET_EARLY_REMAT_MODES. */
18466
18467 static void
18468 aarch64_select_early_remat_modes (sbitmap modes)
18469 {
18470 /* SVE values are not normally live across a call, so it should be
18471 worth doing early rematerialization even in VL-specific mode. */
18472 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18473 {
18474 machine_mode mode = (machine_mode) i;
18475 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18476 if (vec_flags & VEC_ANY_SVE)
18477 bitmap_set_bit (modes, i);
18478 }
18479 }
18480
18481 /* Override the default target speculation_safe_value. */
18482 static rtx
18483 aarch64_speculation_safe_value (machine_mode mode,
18484 rtx result, rtx val, rtx failval)
18485 {
18486 /* Maybe we should warn if falling back to hard barriers. They are
18487 likely to be noticeably more expensive than the alternative below. */
18488 if (!aarch64_track_speculation)
18489 return default_speculation_safe_value (mode, result, val, failval);
18490
18491 if (!REG_P (val))
18492 val = copy_to_mode_reg (mode, val);
18493
18494 if (!aarch64_reg_or_zero (failval, mode))
18495 failval = copy_to_mode_reg (mode, failval);
18496
18497 emit_insn (gen_despeculate_copy (mode, result, val, failval));
18498 return result;
18499 }
18500
18501 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18502 Look into the tuning structure for an estimate.
18503 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18504 Advanced SIMD 128 bits. */
18505
18506 static HOST_WIDE_INT
18507 aarch64_estimated_poly_value (poly_int64 val)
18508 {
18509 enum aarch64_sve_vector_bits_enum width_source
18510 = aarch64_tune_params.sve_width;
18511
18512 /* If the tuning structure gives no fixed estimate, use the default. */
18513 if (width_source == SVE_SCALABLE)
18514 return default_estimated_poly_value (val);
18515
18516 HOST_WIDE_INT over_128 = width_source - 128;
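/* For example, with sve_width == 256 a VNx16QI size of 16 + 16x bytes is
estimated as 16 + 16 * (256 - 128) / 128 = 32 bytes.  */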
18517 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
18518 }
18519
18520
18521 /* Return true for types that could be supported as SIMD return or
18522 argument types. */
18523
18524 static bool
18525 supported_simd_type (tree t)
18526 {
18527 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
18528 {
18529 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
18530 return s == 1 || s == 2 || s == 4 || s == 8;
18531 }
18532 return false;
18533 }
18534
18535 /* Return true for types that currently are supported as SIMD return
18536 or argument types. */
18537
18538 static bool
18539 currently_supported_simd_type (tree t, tree b)
18540 {
18541 if (COMPLEX_FLOAT_TYPE_P (t))
18542 return false;
18543
18544 if (TYPE_SIZE (t) != TYPE_SIZE (b))
18545 return false;
18546
18547 return supported_simd_type (t);
18548 }
18549
18550 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
18551
18552 static int
18553 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
18554 struct cgraph_simd_clone *clonei,
18555 tree base_type, int num)
18556 {
18557 tree t, ret_type, arg_type;
18558 unsigned int elt_bits, vec_bits, count;
18559
18560 if (!TARGET_SIMD)
18561 return 0;
18562
18563 if (clonei->simdlen
18564 && (clonei->simdlen < 2
18565 || clonei->simdlen > 1024
18566 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
18567 {
18568 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18569 "unsupported simdlen %d", clonei->simdlen);
18570 return 0;
18571 }
18572
18573 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
18574 if (TREE_CODE (ret_type) != VOID_TYPE
18575 && !currently_supported_simd_type (ret_type, base_type))
18576 {
18577 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
18578 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18579 "GCC does not currently support mixed size types "
18580 "for %<simd%> functions");
18581 else if (supported_simd_type (ret_type))
18582 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18583 "GCC does not currently support return type %qT "
18584 "for %<simd%> functions", ret_type);
18585 else
18586 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18587 "unsupported return type %qT for %<simd%> functions",
18588 ret_type);
18589 return 0;
18590 }
18591
18592 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
18593 {
18594 arg_type = TREE_TYPE (t);
18595
18596 if (!currently_supported_simd_type (arg_type, base_type))
18597 {
18598 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
18599 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18600 "GCC does not currently support mixed size types "
18601 "for %<simd%> functions");
18602 else
18603 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18604 "GCC does not currently support argument type %qT "
18605 "for %<simd%> functions", arg_type);
18606 return 0;
18607 }
18608 }
18609
18610 clonei->vecsize_mangle = 'n';
18611 clonei->mask_mode = VOIDmode;
18612 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
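/* With no simdlen clause, advertise two clones (count == 2): a 64-bit
variant for NUM == 0 and a 128-bit variant for NUM == 1, e.g. simdlen 2
and 4 for a 32-bit base type.  */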
18613 if (clonei->simdlen == 0)
18614 {
18615 count = 2;
18616 vec_bits = (num == 0 ? 64 : 128);
18617 clonei->simdlen = vec_bits / elt_bits;
18618 }
18619 else
18620 {
18621 count = 1;
18622 vec_bits = clonei->simdlen * elt_bits;
18623 if (vec_bits != 64 && vec_bits != 128)
18624 {
18625 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18626 "GCC does not currently support simdlen %d for type %qT",
18627 clonei->simdlen, base_type);
18628 return 0;
18629 }
18630 }
18631 clonei->vecsize_int = vec_bits;
18632 clonei->vecsize_float = vec_bits;
18633 return count;
18634 }
18635
18636 /* Implement TARGET_SIMD_CLONE_ADJUST. */
18637
18638 static void
18639 aarch64_simd_clone_adjust (struct cgraph_node *node)
18640 {
18641 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
18642 use the correct ABI. */
18643
18644 tree t = TREE_TYPE (node->decl);
18645 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
18646 TYPE_ATTRIBUTES (t));
18647 }
18648
18649 /* Implement TARGET_SIMD_CLONE_USABLE. */
18650
18651 static int
18652 aarch64_simd_clone_usable (struct cgraph_node *node)
18653 {
18654 switch (node->simdclone->vecsize_mangle)
18655 {
18656 case 'n':
18657 if (!TARGET_SIMD)
18658 return -1;
18659 return 0;
18660 default:
18661 gcc_unreachable ();
18662 }
18663 }
18664
18665 /* Target-specific selftests. */
18666
18667 #if CHECKING_P
18668
18669 namespace selftest {
18670
18671 /* Selftest for the RTL loader.
18672 Verify that the RTL loader copes with a dump from
18673 print_rtx_function. This is essentially just a test that class
18674 function_reader can handle a real dump, but it also verifies
18675 that lookup_reg_by_dump_name correctly handles hard regs.
18676 The presence of hard reg names in the dump means that the test is
18677 target-specific, hence it is in this file. */
18678
18679 static void
18680 aarch64_test_loading_full_dump ()
18681 {
18682 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
18683
18684 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
18685
18686 rtx_insn *insn_1 = get_insn_by_uid (1);
18687 ASSERT_EQ (NOTE, GET_CODE (insn_1));
18688
18689 rtx_insn *insn_15 = get_insn_by_uid (15);
18690 ASSERT_EQ (INSN, GET_CODE (insn_15));
18691 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
18692
18693 /* Verify crtl->return_rtx. */
18694 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
18695 ASSERT_EQ (0, REGNO (crtl->return_rtx));
18696 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
18697 }
18698
18699 /* Run all target-specific selftests. */
18700
18701 static void
18702 aarch64_run_selftests (void)
18703 {
18704 aarch64_test_loading_full_dump ();
18705 }
18706
18707 } // namespace selftest
18708
18709 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
18710 global variable based guard use the default else
18711 return a null tree. */
18712 static tree
18713 aarch64_stack_protect_guard (void)
18714 {
18715 if (aarch64_stack_protector_guard == SSP_GLOBAL)
18716 return default_stack_protect_guard ();
18717
18718 return NULL_TREE;
18719 }
18720
18721
18722 #endif /* #if CHECKING_P */
18723
18724 #undef TARGET_STACK_PROTECT_GUARD
18725 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
18726
18727 #undef TARGET_ADDRESS_COST
18728 #define TARGET_ADDRESS_COST aarch64_address_cost
18729
18730 /* This hook determines whether unnamed bitfields affect the alignment
18731 of the containing structure. The hook returns true if the structure
18732 should inherit the alignment requirements of an unnamed bitfield's
18733 type. */
18734 #undef TARGET_ALIGN_ANON_BITFIELD
18735 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
18736
18737 #undef TARGET_ASM_ALIGNED_DI_OP
18738 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
18739
18740 #undef TARGET_ASM_ALIGNED_HI_OP
18741 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
18742
18743 #undef TARGET_ASM_ALIGNED_SI_OP
18744 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
18745
18746 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
18747 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
18748 hook_bool_const_tree_hwi_hwi_const_tree_true
18749
18750 #undef TARGET_ASM_FILE_START
18751 #define TARGET_ASM_FILE_START aarch64_start_file
18752
18753 #undef TARGET_ASM_OUTPUT_MI_THUNK
18754 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
18755
18756 #undef TARGET_ASM_SELECT_RTX_SECTION
18757 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
18758
18759 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
18760 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
18761
18762 #undef TARGET_BUILD_BUILTIN_VA_LIST
18763 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
18764
18765 #undef TARGET_CALLEE_COPIES
18766 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
18767
18768 #undef TARGET_CAN_ELIMINATE
18769 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
18770
18771 #undef TARGET_CAN_INLINE_P
18772 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
18773
18774 #undef TARGET_CANNOT_FORCE_CONST_MEM
18775 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
18776
18777 #undef TARGET_CASE_VALUES_THRESHOLD
18778 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
18779
18780 #undef TARGET_CONDITIONAL_REGISTER_USAGE
18781 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
18782
18783 /* Only the least significant bit is used for initialization guard
18784 variables. */
18785 #undef TARGET_CXX_GUARD_MASK_BIT
18786 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
18787
18788 #undef TARGET_C_MODE_FOR_SUFFIX
18789 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
18790
18791 #ifdef TARGET_BIG_ENDIAN_DEFAULT
18792 #undef TARGET_DEFAULT_TARGET_FLAGS
18793 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
18794 #endif
18795
18796 #undef TARGET_CLASS_MAX_NREGS
18797 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
18798
18799 #undef TARGET_BUILTIN_DECL
18800 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
18801
18802 #undef TARGET_BUILTIN_RECIPROCAL
18803 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
18804
18805 #undef TARGET_C_EXCESS_PRECISION
18806 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
18807
18808 #undef TARGET_EXPAND_BUILTIN
18809 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
18810
18811 #undef TARGET_EXPAND_BUILTIN_VA_START
18812 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
18813
18814 #undef TARGET_FOLD_BUILTIN
18815 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
18816
18817 #undef TARGET_FUNCTION_ARG
18818 #define TARGET_FUNCTION_ARG aarch64_function_arg
18819
18820 #undef TARGET_FUNCTION_ARG_ADVANCE
18821 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
18822
18823 #undef TARGET_FUNCTION_ARG_BOUNDARY
18824 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
18825
18826 #undef TARGET_FUNCTION_ARG_PADDING
18827 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
18828
18829 #undef TARGET_GET_RAW_RESULT_MODE
18830 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
18831 #undef TARGET_GET_RAW_ARG_MODE
18832 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
18833
18834 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
18835 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
18836
18837 #undef TARGET_FUNCTION_VALUE
18838 #define TARGET_FUNCTION_VALUE aarch64_function_value
18839
18840 #undef TARGET_FUNCTION_VALUE_REGNO_P
18841 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
18842
18843 #undef TARGET_GIMPLE_FOLD_BUILTIN
18844 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
18845
18846 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
18847 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
18848
18849 #undef TARGET_INIT_BUILTINS
18850 #define TARGET_INIT_BUILTINS aarch64_init_builtins
18851
18852 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
18853 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
18854 aarch64_ira_change_pseudo_allocno_class
18855
18856 #undef TARGET_LEGITIMATE_ADDRESS_P
18857 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
18858
18859 #undef TARGET_LEGITIMATE_CONSTANT_P
18860 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
18861
18862 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
18863 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
18864 aarch64_legitimize_address_displacement
18865
18866 #undef TARGET_LIBGCC_CMP_RETURN_MODE
18867 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
18868
18869 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
18870 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
18871 aarch64_libgcc_floating_mode_supported_p
18872
18873 #undef TARGET_MANGLE_TYPE
18874 #define TARGET_MANGLE_TYPE aarch64_mangle_type
18875
18876 #undef TARGET_MEMORY_MOVE_COST
18877 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
18878
18879 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
18880 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
18881
18882 #undef TARGET_MUST_PASS_IN_STACK
18883 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
18884
18885 /* This target hook should return true if accesses to volatile bitfields
18886 should use the narrowest mode possible. It should return false if these
18887 accesses should use the bitfield container type. */
18888 #undef TARGET_NARROW_VOLATILE_BITFIELD
18889 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
18890
18891 #undef TARGET_OPTION_OVERRIDE
18892 #define TARGET_OPTION_OVERRIDE aarch64_override_options
18893
18894 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
18895 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
18896 aarch64_override_options_after_change
18897
18898 #undef TARGET_OPTION_SAVE
18899 #define TARGET_OPTION_SAVE aarch64_option_save
18900
18901 #undef TARGET_OPTION_RESTORE
18902 #define TARGET_OPTION_RESTORE aarch64_option_restore
18903
18904 #undef TARGET_OPTION_PRINT
18905 #define TARGET_OPTION_PRINT aarch64_option_print
18906
18907 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
18908 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
18909
18910 #undef TARGET_SET_CURRENT_FUNCTION
18911 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
18912
18913 #undef TARGET_PASS_BY_REFERENCE
18914 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
18915
18916 #undef TARGET_PREFERRED_RELOAD_CLASS
18917 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
18918
18919 #undef TARGET_SCHED_REASSOCIATION_WIDTH
18920 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
18921
18922 #undef TARGET_PROMOTED_TYPE
18923 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
18924
18925 #undef TARGET_SECONDARY_RELOAD
18926 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
18927
18928 #undef TARGET_SHIFT_TRUNCATION_MASK
18929 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
18930
18931 #undef TARGET_SETUP_INCOMING_VARARGS
18932 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
18933
18934 #undef TARGET_STRUCT_VALUE_RTX
18935 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
18936
18937 #undef TARGET_REGISTER_MOVE_COST
18938 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
18939
18940 #undef TARGET_RETURN_IN_MEMORY
18941 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
18942
18943 #undef TARGET_RETURN_IN_MSB
18944 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
18945
18946 #undef TARGET_RTX_COSTS
18947 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
18948
18949 #undef TARGET_SCALAR_MODE_SUPPORTED_P
18950 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
18951
18952 #undef TARGET_SCHED_ISSUE_RATE
18953 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
18954
18955 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
18956 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
18957 aarch64_sched_first_cycle_multipass_dfa_lookahead
18958
18959 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
18960 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
18961 aarch64_first_cycle_multipass_dfa_lookahead_guard
18962
18963 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
18964 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
18965 aarch64_get_separate_components
18966
18967 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
18968 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
18969 aarch64_components_for_bb
18970
18971 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
18972 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
18973 aarch64_disqualify_components
18974
18975 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
18976 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
18977 aarch64_emit_prologue_components
18978
18979 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
18980 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
18981 aarch64_emit_epilogue_components
18982
18983 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
18984 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
18985 aarch64_set_handled_components
18986
18987 #undef TARGET_TRAMPOLINE_INIT
18988 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
18989
18990 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
18991 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
18992
18993 #undef TARGET_VECTOR_MODE_SUPPORTED_P
18994 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
18995
18996 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
18997 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
18998 aarch64_builtin_support_vector_misalignment
18999
19000 #undef TARGET_ARRAY_MODE
19001 #define TARGET_ARRAY_MODE aarch64_array_mode
19002
19003 #undef TARGET_ARRAY_MODE_SUPPORTED_P
19004 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
19005
19006 #undef TARGET_VECTORIZE_ADD_STMT_COST
19007 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
19008
19009 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
19010 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
19011 aarch64_builtin_vectorization_cost
19012
19013 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
19014 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
19015
19016 #undef TARGET_VECTORIZE_BUILTINS
19017 #define TARGET_VECTORIZE_BUILTINS
19018
19019 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
19020 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
19021 aarch64_builtin_vectorized_function
19022
19023 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
19024 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
19025 aarch64_autovectorize_vector_sizes
19026
19027 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
19028 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
19029 aarch64_atomic_assign_expand_fenv
19030
19031 /* Section anchor support. */
19032
19033 #undef TARGET_MIN_ANCHOR_OFFSET
19034 #define TARGET_MIN_ANCHOR_OFFSET -256
19035
19036 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
19037 byte offset; we can do much more for larger data types, but have no way
19038 to determine the size of the access. We assume accesses are aligned. */
19039 #undef TARGET_MAX_ANCHOR_OFFSET
19040 #define TARGET_MAX_ANCHOR_OFFSET 4095
19041
19042 #undef TARGET_VECTOR_ALIGNMENT
19043 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
19044
19045 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
19046 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
19047 aarch64_vectorize_preferred_vector_alignment
19048 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
19049 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
19050 aarch64_simd_vector_alignment_reachable
19051
19052 /* vec_perm support. */
19053
19054 #undef TARGET_VECTORIZE_VEC_PERM_CONST
19055 #define TARGET_VECTORIZE_VEC_PERM_CONST \
19056 aarch64_vectorize_vec_perm_const
19057
19058 #undef TARGET_VECTORIZE_GET_MASK_MODE
19059 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
19060 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
19061 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
19062 aarch64_empty_mask_is_expensive
19063 #undef TARGET_PREFERRED_ELSE_VALUE
19064 #define TARGET_PREFERRED_ELSE_VALUE \
19065 aarch64_preferred_else_value
19066
19067 #undef TARGET_INIT_LIBFUNCS
19068 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
19069
19070 #undef TARGET_FIXED_CONDITION_CODE_REGS
19071 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
19072
19073 #undef TARGET_FLAGS_REGNUM
19074 #define TARGET_FLAGS_REGNUM CC_REGNUM
19075
19076 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
19077 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
19078
19079 #undef TARGET_ASAN_SHADOW_OFFSET
19080 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
19081
19082 #undef TARGET_LEGITIMIZE_ADDRESS
19083 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
19084
19085 #undef TARGET_SCHED_CAN_SPECULATE_INSN
19086 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
19087
19088 #undef TARGET_CAN_USE_DOLOOP_P
19089 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
19090
19091 #undef TARGET_SCHED_ADJUST_PRIORITY
19092 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
19093
19094 #undef TARGET_SCHED_MACRO_FUSION_P
19095 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
19096
19097 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
19098 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
19099
19100 #undef TARGET_SCHED_FUSION_PRIORITY
19101 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
19102
19103 #undef TARGET_UNSPEC_MAY_TRAP_P
19104 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
19105
19106 #undef TARGET_USE_PSEUDO_PIC_REG
19107 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
19108
19109 #undef TARGET_PRINT_OPERAND
19110 #define TARGET_PRINT_OPERAND aarch64_print_operand
19111
19112 #undef TARGET_PRINT_OPERAND_ADDRESS
19113 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
19114
19115 #undef TARGET_OPTAB_SUPPORTED_P
19116 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
19117
19118 #undef TARGET_OMIT_STRUCT_RETURN_REG
19119 #define TARGET_OMIT_STRUCT_RETURN_REG true
19120
19121 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
19122 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
19123 aarch64_dwarf_poly_indeterminate_value
19124
19125 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
19126 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
19127 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
19128
19129 #undef TARGET_HARD_REGNO_NREGS
19130 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
19131 #undef TARGET_HARD_REGNO_MODE_OK
19132 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
19133
19134 #undef TARGET_MODES_TIEABLE_P
19135 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
19136
19137 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
19138 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
19139 aarch64_hard_regno_call_part_clobbered
19140
19141 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
19142 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
19143 aarch64_remove_extra_call_preserved_regs
19144
19145 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
19146 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
19147 aarch64_return_call_with_max_clobbers
19148
19149 #undef TARGET_CONSTANT_ALIGNMENT
19150 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
19151
19152 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
19153 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
19154 aarch64_stack_clash_protection_alloca_probe_range
19155
19156 #undef TARGET_COMPUTE_PRESSURE_CLASSES
19157 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
19158
19159 #undef TARGET_CAN_CHANGE_MODE_CLASS
19160 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
19161
19162 #undef TARGET_SELECT_EARLY_REMAT_MODES
19163 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
19164
19165 #undef TARGET_SPECULATION_SAFE_VALUE
19166 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
19167
19168 #undef TARGET_ESTIMATED_POLY_VALUE
19169 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
19170
19171 #undef TARGET_ATTRIBUTE_TABLE
19172 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
19173
19174 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
19175 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
19176 aarch64_simd_clone_compute_vecsize_and_simdlen
19177
19178 #undef TARGET_SIMD_CLONE_ADJUST
19179 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
19180
19181 #undef TARGET_SIMD_CLONE_USABLE
19182 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
19183
19184 #if CHECKING_P
19185 #undef TARGET_RUN_TARGET_SELFTESTS
19186 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
19187 #endif /* #if CHECKING_P */
19188
19189 struct gcc_target targetm = TARGET_INITIALIZER;
19190
19191 #include "gt-aarch64.h"